diff --git a/Cargo.toml b/Cargo.toml index be6b051..8e28b0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,20 +1,23 @@ [package] name = "gia" -version = "0.2.4" +version = "0.2.5" edition = "2021" description = "A tool for set theoretic operations of genomic intervals" license = "MIT" homepage = "https://noamteyssier.github.io/gia" repository = "https://github.com/noamteyssier/gia" -categories = [ "science", "command-line-utilities" ] -keywords = [ "genomics", "bioinformatics", "bed", "set", "interval" ] +categories = ["science", "command-line-utilities"] +keywords = ["genomics", "bioinformatics", "bed", "set", "interval"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] anyhow = "1.0.72" # bedrs = { version = "0.1.11", features = ["serde", "rayon"] } -bedrs = { git = "https://github.com/noamteyssier/bedrs", branch="bedrs-2.0", features = ["serde", "rayon"] } +bedrs = { git = "https://github.com/noamteyssier/bedrs", branch = "bedrs-2.0", features = [ + "serde", + "rayon", +] } bstr = "1.6.0" clap = { version = "4.3.19", features = ["derive"] } csv = "1.2.2" @@ -26,8 +29,10 @@ memmap2 = "0.7.1" rand = "0.8.5" rand_chacha = "0.3.1" serde = { version = "1.0.183", features = ["derive"] } -niffler = {version = "2.5.0", default-features = false, features = ["gz"]} -gzp = { version="0.11.3", features=["deflate_rust"], default-features = false } +niffler = { version = "2.5.0", default-features = false, features = ["gz"] } +gzp = { version = "0.11.3", features = [ + "deflate_rust", +], default-features = false } rayon = "1.8.0" flate2 = "1.0.28" diff --git a/src/cli.rs b/src/cli.rs index 910645d..a18a407 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -456,4 +456,37 @@ pub enum Command { #[clap(short, long)] unmerged: bool, }, + + /// Finds all the overlapping intervals in Set B after adding a window around all + /// intervals in Set A + Window { + /// Input BED file to subtract from (default=stdin) + #[clap(short, long)] + a: Option, + + /// Secondary BED file to subtract with + #[clap(short, long)] + b: String, + + /// Output BED file to write to (default=stdout) + #[clap(short, long)] + output: Option, + + /// windows intervals on both sides by the same provided amount + #[clap(short = 'w', long, required_unless_present_any(["left", "right"]), conflicts_with_all(&["left", "right"]))] + both: Option, + + /// windows intervals on the left side by the provided amount + #[clap(short, long, required_unless_present_any(["both", "right"]))] + left: Option, + + /// windows intervals on the right side by the provided amount + #[clap(short, long, required_unless_present_any(["both", "left"]))] + right: Option, + + /// Only report the intervals in the query that do not overlap with the target + /// (i.e. the inverse of the intersection) + #[clap(short = 'v', long)] + inverse: bool, + }, } diff --git a/src/commands/get_fasta.rs b/src/commands/get_fasta.rs index 2b7693f..cbf7f84 100644 --- a/src/commands/get_fasta.rs +++ b/src/commands/get_fasta.rs @@ -24,17 +24,17 @@ fn get_fasta_bed3( let record: NamedBed3 = byterecord.deserialize(None)?; match fasta.query_buffer(record.chr(), record.start(), record.end()) { Ok(buffer) => { - write!( + writeln!( output, - ">{}:{}-{}\n", + ">{}:{}-{}", record.chr(), record.start(), record.end() )?; for subseq in buffer.split_str("\n") { - output.write(subseq)?; + output.write_all(subseq)?; } - output.write(b"\n")?; + output.write_all(b"\n")?; } Err(_) => continue, } @@ -52,9 +52,9 @@ fn get_fasta_bed6( let record: NamedBed6 = byterecord.deserialize(None)?; match fasta.query_buffer(record.chr(), record.start(), record.end()) { Ok(buffer) => { - write!( + writeln!( output, - ">{}:{}-{}::{}::{}::{}\n", + ">{}:{}-{}::{}::{}::{}", record.chr(), record.start(), record.end(), @@ -63,9 +63,9 @@ fn get_fasta_bed6( record.strand().unwrap_or_default(), )?; for subseq in buffer.split_str("\n") { - output.write(subseq)?; + output.write_all(subseq)?; } - output.write(b"\n")?; + output.write_all(b"\n")?; } Err(_) => continue, } @@ -83,9 +83,9 @@ fn get_fasta_bed12( let record: NamedBed12 = byterecord.deserialize(None)?; match fasta.query_buffer(record.chr(), record.start(), record.end()) { Ok(buffer) => { - write!( + writeln!( output, - ">{}:{}-{}::{}::{}::{}::{}::{}::{}::{}::{}::{}\n", + ">{}:{}-{}::{}::{}::{}::{}::{}::{}::{}::{}::{}", record.chr(), record.start(), record.end(), @@ -100,9 +100,9 @@ fn get_fasta_bed12( record.block_starts(), )?; for subseq in buffer.split_str("\n") { - output.write(subseq)?; + output.write_all(subseq)?; } - output.write(b"\n")?; + output.write_all(b"\n")?; } Err(_) => continue, } diff --git a/src/commands/mod.rs b/src/commands/mod.rs index d8d2b4d..a4e49a4 100644 --- a/src/commands/mod.rs +++ b/src/commands/mod.rs @@ -11,6 +11,7 @@ mod sample; mod shift; mod sort; mod subtract; +mod window; pub use closest::closest; pub use complement::complement; pub use extend::extend; @@ -24,3 +25,4 @@ pub use sample::sample; pub use shift::shift; pub use sort::sort; pub use subtract::subtract; +pub use window::window; diff --git a/src/commands/window.rs b/src/commands/window.rs new file mode 100644 index 0000000..dda9b44 --- /dev/null +++ b/src/commands/window.rs @@ -0,0 +1,230 @@ +use std::io::Write; + +use anyhow::{bail, Result}; +use bedrs::{traits::IntervalBounds, IntervalContainer}; +use serde::Serialize; + +use crate::{ + io::{ + match_output, write_pairs_iter_with, write_records_iter_with, BedReader, WriteNamedIter, + WriteNamedIterImpl, + }, + types::{InputFormat, IntervalPair, Rename, Renamer, Translater}, + utils::sort_pairs, +}; + +fn windowed_set_overlaps<'a, Ia, Ib, Na, Nb, W>( + set_a: &'a mut IntervalContainer, + set_b: &'a mut IntervalContainer, + translater: Option<&'a Translater>, + left: usize, + right: usize, + inverse: bool, + output: W, +) -> Result<()> +where + Ia: IntervalBounds + Serialize + Copy, + Ib: IntervalBounds + Serialize + Copy, + Na: IntervalBounds<&'a str, usize> + Serialize, + Nb: IntervalBounds<&'a str, usize> + Serialize, + W: Write, + WriteNamedIterImpl: WriteNamedIter + WriteNamedIter, + Renamer: Rename<'a, Ia, Na> + Rename<'a, Ib, Nb>, +{ + sort_pairs(set_a, set_b, false); + if inverse { + let iv_iter = set_a + .iter() + .map(|iv| { + let mut w_iv = *iv; + w_iv.extend_left(&left); + w_iv.extend_right(&right, None); + (iv, w_iv) + }) + .filter(|(_iv, w_iv)| { + let overlaps = set_b.find_iter_sorted_unchecked(w_iv).count(); + overlaps == 0 + }) + .map(|(iv, _w_iv)| *iv); + write_records_iter_with(iv_iter, output, translater)?; + } else { + let windows_iter = set_a.iter().map(|iv| { + let mut w_iv = *iv; + w_iv.extend_left(&left); + w_iv.extend_right(&right, None); + (iv, w_iv) + }); + let pairs_iter = windows_iter.flat_map(|(iv, w_iv)| { + let overlaps = set_b.find_iter_sorted_owned_unchecked(w_iv); + overlaps.map(|ov| IntervalPair::new(*iv, *ov, translater)) + }); + write_pairs_iter_with(pairs_iter, output, translater)?; + } + Ok(()) +} + +fn dispatch_window( + reader_a: BedReader, + reader_b: BedReader, + both: Option, + left: Option, + right: Option, + inverse: bool, + output: W, +) -> Result<()> { + if reader_a.is_named() != reader_b.is_named() { + bail!("Input files must both be named or both be unnamed"); + } + let mut translater = if reader_a.is_named() { + Some(Translater::new()) + } else { + None + }; + let (left, right) = if let Some(b) = both { + (b, b) + } else { + (left.unwrap_or(0), right.unwrap_or(0)) + }; + match reader_a.input_format() { + InputFormat::Bed3 => { + let mut set_a = reader_a.bed3_set_with(translater.as_mut())?; + match reader_b.input_format() { + InputFormat::Bed3 => { + let mut set_b = reader_b.bed3_set_with(translater.as_mut())?; + windowed_set_overlaps( + &mut set_a, + &mut set_b, + translater.as_ref(), + left, + right, + inverse, + output, + ) + } + InputFormat::Bed6 => { + let mut set_b = reader_b.bed6_set_with(translater.as_mut())?; + windowed_set_overlaps( + &mut set_a, + &mut set_b, + translater.as_ref(), + left, + right, + inverse, + output, + ) + } + InputFormat::Bed12 => { + let mut set_b = reader_b.bed12_set_with(translater.as_mut())?; + windowed_set_overlaps( + &mut set_a, + &mut set_b, + translater.as_ref(), + left, + right, + inverse, + output, + ) + } + } + } + InputFormat::Bed6 => { + let mut set_a = reader_a.bed6_set_with(translater.as_mut())?; + match reader_b.input_format() { + InputFormat::Bed3 => { + let mut set_b = reader_b.bed3_set_with(translater.as_mut())?; + windowed_set_overlaps( + &mut set_a, + &mut set_b, + translater.as_ref(), + left, + right, + inverse, + output, + ) + } + InputFormat::Bed6 => { + let mut set_b = reader_b.bed6_set_with(translater.as_mut())?; + windowed_set_overlaps( + &mut set_a, + &mut set_b, + translater.as_ref(), + left, + right, + inverse, + output, + ) + } + InputFormat::Bed12 => { + let mut set_b = reader_b.bed12_set_with(translater.as_mut())?; + windowed_set_overlaps( + &mut set_a, + &mut set_b, + translater.as_ref(), + left, + right, + inverse, + output, + ) + } + } + } + InputFormat::Bed12 => { + let mut set_a = reader_a.bed12_set_with(translater.as_mut())?; + match reader_b.input_format() { + InputFormat::Bed3 => { + let mut set_b = reader_b.bed3_set_with(translater.as_mut())?; + windowed_set_overlaps( + &mut set_a, + &mut set_b, + translater.as_ref(), + left, + right, + inverse, + output, + ) + } + InputFormat::Bed6 => { + let mut set_b = reader_b.bed6_set_with(translater.as_mut())?; + windowed_set_overlaps( + &mut set_a, + &mut set_b, + translater.as_ref(), + left, + right, + inverse, + output, + ) + } + InputFormat::Bed12 => { + let mut set_b = reader_b.bed12_set_with(translater.as_mut())?; + windowed_set_overlaps( + &mut set_a, + &mut set_b, + translater.as_ref(), + left, + right, + inverse, + output, + ) + } + } + } + } +} + +pub fn window( + path_a: Option, + path_b: String, + output: Option, + both: Option, + left: Option, + right: Option, + inverse: bool, + compression_threads: usize, + compression_level: u32, +) -> Result<()> { + let bed_a = BedReader::from_path(path_a, None, None)?; + let bed_b = BedReader::from_path(Some(path_b), None, None)?; + let output_handle = match_output(output, compression_threads, compression_level)?; + dispatch_window(bed_a, bed_b, both, left, right, inverse, output_handle) +} diff --git a/src/main.rs b/src/main.rs index 9ef071b..c5c4e88 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,7 +9,7 @@ use clap::Parser; use cli::{Cli, Command}; use commands::{ closest, complement, extend, flank, get_fasta, intersect, merge, name_map, random, sample, - shift, sort, subtract, + shift, sort, subtract, window, }; fn main() -> Result<()> { @@ -254,6 +254,25 @@ fn main() -> Result<()> { cli.compression_level, )?; } + Command::Window { + a, + b, + output, + both, + left, + right, + inverse, + } => window( + a, + b, + output, + both, + left, + right, + inverse, + cli.compression_threads, + cli.compression_level, + )?, } Ok(()) } diff --git a/tests/datasets/windows/windows_a.bed b/tests/datasets/windows/windows_a.bed new file mode 100644 index 0000000..abc19c9 --- /dev/null +++ b/tests/datasets/windows/windows_a.bed @@ -0,0 +1,10 @@ +1 72 222 +1 257 407 +1 268 418 +1 467 617 +1 819 969 +2 174 324 +2 587 737 +3 395 545 +3 554 704 +3 653 803 diff --git a/tests/datasets/windows/windows_a.bed12 b/tests/datasets/windows/windows_a.bed12 new file mode 100644 index 0000000..97b9f53 --- /dev/null +++ b/tests/datasets/windows/windows_a.bed12 @@ -0,0 +1,10 @@ +1 72 222 0 0.0 + 0 0 0 0 0 0 +1 257 407 0 0.0 + 0 0 0 0 0 0 +1 268 418 0 0.0 + 0 0 0 0 0 0 +1 467 617 0 0.0 + 0 0 0 0 0 0 +1 819 969 0 0.0 + 0 0 0 0 0 0 +2 174 324 0 0.0 + 0 0 0 0 0 0 +2 587 737 0 0.0 + 0 0 0 0 0 0 +3 395 545 0 0.0 + 0 0 0 0 0 0 +3 554 704 0 0.0 + 0 0 0 0 0 0 +3 653 803 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/windows/windows_a.bed6 b/tests/datasets/windows/windows_a.bed6 new file mode 100644 index 0000000..802646e --- /dev/null +++ b/tests/datasets/windows/windows_a.bed6 @@ -0,0 +1,10 @@ +1 72 222 0 0.0 + +1 257 407 0 0.0 + +1 268 418 0 0.0 + +1 467 617 0 0.0 + +1 819 969 0 0.0 + +2 174 324 0 0.0 + +2 587 737 0 0.0 + +3 395 545 0 0.0 + +3 554 704 0 0.0 + +3 653 803 0 0.0 + diff --git a/tests/datasets/windows/windows_b.bed b/tests/datasets/windows/windows_b.bed new file mode 100644 index 0000000..7c20624 --- /dev/null +++ b/tests/datasets/windows/windows_b.bed @@ -0,0 +1,10 @@ +1 55 205 +1 69 219 +1 93 243 +1 156 306 +1 603 753 +1 837 987 +2 39 189 +2 71 221 +2 672 822 +3 138 288 diff --git a/tests/datasets/windows/windows_b.bed12 b/tests/datasets/windows/windows_b.bed12 new file mode 100644 index 0000000..515e704 --- /dev/null +++ b/tests/datasets/windows/windows_b.bed12 @@ -0,0 +1,10 @@ +1 55 205 0 0.0 + 0 0 0 0 0 0 +1 69 219 0 0.0 + 0 0 0 0 0 0 +1 93 243 0 0.0 + 0 0 0 0 0 0 +1 156 306 0 0.0 + 0 0 0 0 0 0 +1 603 753 0 0.0 + 0 0 0 0 0 0 +1 837 987 0 0.0 + 0 0 0 0 0 0 +2 39 189 0 0.0 + 0 0 0 0 0 0 +2 71 221 0 0.0 + 0 0 0 0 0 0 +2 672 822 0 0.0 + 0 0 0 0 0 0 +3 138 288 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/windows/windows_b.bed6 b/tests/datasets/windows/windows_b.bed6 new file mode 100644 index 0000000..c8611d5 --- /dev/null +++ b/tests/datasets/windows/windows_b.bed6 @@ -0,0 +1,10 @@ +1 55 205 0 0.0 + +1 69 219 0 0.0 + +1 93 243 0 0.0 + +1 156 306 0 0.0 + +1 603 753 0 0.0 + +1 837 987 0 0.0 + +2 39 189 0 0.0 + +2 71 221 0 0.0 + +2 672 822 0 0.0 + +3 138 288 0 0.0 + diff --git a/tests/windows.rs b/tests/windows.rs new file mode 100644 index 0000000..abace72 --- /dev/null +++ b/tests/windows.rs @@ -0,0 +1,212 @@ +#[cfg(test)] +mod testing { + use anyhow::Result; + use assert_cmd::prelude::*; + use std::process::Command; + + #[test] + fn test_windows() -> Result<()> { + let a = "tests/datasets/windows/windows_a.bed"; + let b = "tests/datasets/windows/windows_b.bed"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("window") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .arg("-w") + .arg("100") + .output()?; + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 18); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 6); + Ok(()) + } + + #[test] + fn test_windows_inv_bed3() -> Result<()> { + let a = "tests/datasets/windows/windows_a.bed"; + let b = "tests/datasets/windows/windows_b.bed"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("window") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .arg("-w") + .arg("100") + .arg("-v") + .output()?; + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 3); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 3); + Ok(()) + } + + #[test] + fn test_windows_inv_bed6() -> Result<()> { + let a = "tests/datasets/windows/windows_a.bed6"; + let b = "tests/datasets/windows/windows_b.bed"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("window") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .arg("-w") + .arg("100") + .arg("-v") + .output()?; + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 3); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 6); + Ok(()) + } + + #[test] + fn test_windows_inv_bed12() -> Result<()> { + let a = "tests/datasets/windows/windows_a.bed12"; + let b = "tests/datasets/windows/windows_b.bed"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("window") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .arg("-w") + .arg("100") + .arg("-v") + .output()?; + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 3); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 12); + Ok(()) + } + + #[test] + fn test_windows_bed3_bed3() -> Result<()> { + let a = "tests/datasets/windows/windows_a.bed"; + let b = "tests/datasets/windows/windows_b.bed"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("window") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .arg("-w") + .arg("0") + .output()?; + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 11); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 6); + Ok(()) + } + + #[test] + fn test_windows_bed3_bed6() -> Result<()> { + let a = "tests/datasets/windows/windows_a.bed"; + let b = "tests/datasets/windows/windows_b.bed6"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("window") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .arg("-w") + .arg("0") + .output()?; + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 11); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 9); + Ok(()) + } + + #[test] + fn test_windows_bed3_bed12() -> Result<()> { + let a = "tests/datasets/windows/windows_a.bed"; + let b = "tests/datasets/windows/windows_b.bed12"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("window") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .arg("-w") + .arg("0") + .output()?; + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 11); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 15); + Ok(()) + } +}