Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

77 create windowed overlaps #78

Merged
merged 6 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
[package]
name = "gia"
version = "0.2.4"
version = "0.2.5"
edition = "2021"
description = "A tool for set theoretic operations of genomic intervals"
license = "MIT"
homepage = "https://noamteyssier.github.io/gia"
repository = "https://github.com/noamteyssier/gia"
categories = [ "science", "command-line-utilities" ]
keywords = [ "genomics", "bioinformatics", "bed", "set", "interval" ]
categories = ["science", "command-line-utilities"]
keywords = ["genomics", "bioinformatics", "bed", "set", "interval"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0.72"
# bedrs = { version = "0.1.11", features = ["serde", "rayon"] }
bedrs = { git = "https://github.com/noamteyssier/bedrs", branch="bedrs-2.0", features = ["serde", "rayon"] }
bedrs = { git = "https://github.com/noamteyssier/bedrs", branch = "bedrs-2.0", features = [
"serde",
"rayon",
] }
bstr = "1.6.0"
clap = { version = "4.3.19", features = ["derive"] }
csv = "1.2.2"
Expand All @@ -26,8 +29,10 @@ memmap2 = "0.7.1"
rand = "0.8.5"
rand_chacha = "0.3.1"
serde = { version = "1.0.183", features = ["derive"] }
niffler = {version = "2.5.0", default-features = false, features = ["gz"]}
gzp = { version="0.11.3", features=["deflate_rust"], default-features = false }
niffler = { version = "2.5.0", default-features = false, features = ["gz"] }
gzp = { version = "0.11.3", features = [
"deflate_rust",
], default-features = false }
rayon = "1.8.0"
flate2 = "1.0.28"

Expand Down
33 changes: 33 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -456,4 +456,37 @@ pub enum Command {
#[clap(short, long)]
unmerged: bool,
},

/// Finds all the overlapping intervals in Set B after adding a window around all
/// intervals in Set A
Window {
/// Input BED file to subtract from (default=stdin)
#[clap(short, long)]
a: Option<String>,

/// Secondary BED file to subtract with
#[clap(short, long)]
b: String,

/// Output BED file to write to (default=stdout)
#[clap(short, long)]
output: Option<String>,

/// windows intervals on both sides by the same provided amount
#[clap(short = 'w', long, required_unless_present_any(["left", "right"]), conflicts_with_all(&["left", "right"]))]
both: Option<usize>,

/// windows intervals on the left side by the provided amount
#[clap(short, long, required_unless_present_any(["both", "right"]))]
left: Option<usize>,

/// windows intervals on the right side by the provided amount
#[clap(short, long, required_unless_present_any(["both", "left"]))]
right: Option<usize>,

/// Only report the intervals in the query that do not overlap with the target
/// (i.e. the inverse of the intersection)
#[clap(short = 'v', long)]
inverse: bool,
},
}
24 changes: 12 additions & 12 deletions src/commands/get_fasta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,17 @@ fn get_fasta_bed3<R: Read, W: Write>(
let record: NamedBed3 = byterecord.deserialize(None)?;
match fasta.query_buffer(record.chr(), record.start(), record.end()) {
Ok(buffer) => {
write!(
writeln!(
output,
">{}:{}-{}\n",
">{}:{}-{}",
record.chr(),
record.start(),
record.end()
)?;
for subseq in buffer.split_str("\n") {
output.write(subseq)?;
output.write_all(subseq)?;
}
output.write(b"\n")?;
output.write_all(b"\n")?;
}
Err(_) => continue,
}
Expand All @@ -52,9 +52,9 @@ fn get_fasta_bed6<R: Read, W: Write>(
let record: NamedBed6 = byterecord.deserialize(None)?;
match fasta.query_buffer(record.chr(), record.start(), record.end()) {
Ok(buffer) => {
write!(
writeln!(
output,
">{}:{}-{}::{}::{}::{}\n",
">{}:{}-{}::{}::{}::{}",
record.chr(),
record.start(),
record.end(),
Expand All @@ -63,9 +63,9 @@ fn get_fasta_bed6<R: Read, W: Write>(
record.strand().unwrap_or_default(),
)?;
for subseq in buffer.split_str("\n") {
output.write(subseq)?;
output.write_all(subseq)?;
}
output.write(b"\n")?;
output.write_all(b"\n")?;
}
Err(_) => continue,
}
Expand All @@ -83,9 +83,9 @@ fn get_fasta_bed12<R: Read, W: Write>(
let record: NamedBed12 = byterecord.deserialize(None)?;
match fasta.query_buffer(record.chr(), record.start(), record.end()) {
Ok(buffer) => {
write!(
writeln!(
output,
">{}:{}-{}::{}::{}::{}::{}::{}::{}::{}::{}::{}\n",
">{}:{}-{}::{}::{}::{}::{}::{}::{}::{}::{}::{}",
record.chr(),
record.start(),
record.end(),
Expand All @@ -100,9 +100,9 @@ fn get_fasta_bed12<R: Read, W: Write>(
record.block_starts(),
)?;
for subseq in buffer.split_str("\n") {
output.write(subseq)?;
output.write_all(subseq)?;
}
output.write(b"\n")?;
output.write_all(b"\n")?;
}
Err(_) => continue,
}
Expand Down
2 changes: 2 additions & 0 deletions src/commands/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ mod sample;
mod shift;
mod sort;
mod subtract;
mod window;
pub use closest::closest;
pub use complement::complement;
pub use extend::extend;
Expand All @@ -24,3 +25,4 @@ pub use sample::sample;
pub use shift::shift;
pub use sort::sort;
pub use subtract::subtract;
pub use window::window;
230 changes: 230 additions & 0 deletions src/commands/window.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
use std::io::Write;

use anyhow::{bail, Result};
use bedrs::{traits::IntervalBounds, IntervalContainer};
use serde::Serialize;

use crate::{
io::{
match_output, write_pairs_iter_with, write_records_iter_with, BedReader, WriteNamedIter,
WriteNamedIterImpl,
},
types::{InputFormat, IntervalPair, Rename, Renamer, Translater},
utils::sort_pairs,
};

fn windowed_set_overlaps<'a, Ia, Ib, Na, Nb, W>(
set_a: &'a mut IntervalContainer<Ia, usize, usize>,
set_b: &'a mut IntervalContainer<Ib, usize, usize>,
translater: Option<&'a Translater>,
left: usize,
right: usize,
inverse: bool,
output: W,
) -> Result<()>
where
Ia: IntervalBounds<usize, usize> + Serialize + Copy,
Ib: IntervalBounds<usize, usize> + Serialize + Copy,
Na: IntervalBounds<&'a str, usize> + Serialize,
Nb: IntervalBounds<&'a str, usize> + Serialize,
W: Write,
WriteNamedIterImpl: WriteNamedIter<Ia> + WriteNamedIter<Ib>,
Renamer: Rename<'a, Ia, Na> + Rename<'a, Ib, Nb>,
{
sort_pairs(set_a, set_b, false);
if inverse {
let iv_iter = set_a
.iter()
.map(|iv| {
let mut w_iv = *iv;
w_iv.extend_left(&left);
w_iv.extend_right(&right, None);
(iv, w_iv)
})
.filter(|(_iv, w_iv)| {
let overlaps = set_b.find_iter_sorted_unchecked(w_iv).count();
overlaps == 0
})
.map(|(iv, _w_iv)| *iv);
write_records_iter_with(iv_iter, output, translater)?;
} else {
let windows_iter = set_a.iter().map(|iv| {
let mut w_iv = *iv;
w_iv.extend_left(&left);
w_iv.extend_right(&right, None);
(iv, w_iv)
});
let pairs_iter = windows_iter.flat_map(|(iv, w_iv)| {
let overlaps = set_b.find_iter_sorted_owned_unchecked(w_iv);
overlaps.map(|ov| IntervalPair::new(*iv, *ov, translater))
});
write_pairs_iter_with(pairs_iter, output, translater)?;
}
Ok(())
}

fn dispatch_window<W: Write>(
reader_a: BedReader,
reader_b: BedReader,
both: Option<usize>,
left: Option<usize>,
right: Option<usize>,
inverse: bool,
output: W,
) -> Result<()> {
if reader_a.is_named() != reader_b.is_named() {
bail!("Input files must both be named or both be unnamed");
}
let mut translater = if reader_a.is_named() {
Some(Translater::new())
} else {
None
};
let (left, right) = if let Some(b) = both {
(b, b)
} else {
(left.unwrap_or(0), right.unwrap_or(0))
};
match reader_a.input_format() {
InputFormat::Bed3 => {
let mut set_a = reader_a.bed3_set_with(translater.as_mut())?;
match reader_b.input_format() {
InputFormat::Bed3 => {
let mut set_b = reader_b.bed3_set_with(translater.as_mut())?;
windowed_set_overlaps(
&mut set_a,
&mut set_b,
translater.as_ref(),
left,
right,
inverse,
output,
)
}
InputFormat::Bed6 => {
let mut set_b = reader_b.bed6_set_with(translater.as_mut())?;
windowed_set_overlaps(
&mut set_a,
&mut set_b,
translater.as_ref(),
left,
right,
inverse,
output,
)
}
InputFormat::Bed12 => {
let mut set_b = reader_b.bed12_set_with(translater.as_mut())?;
windowed_set_overlaps(
&mut set_a,
&mut set_b,
translater.as_ref(),
left,
right,
inverse,
output,
)
}
}
}
InputFormat::Bed6 => {
let mut set_a = reader_a.bed6_set_with(translater.as_mut())?;
match reader_b.input_format() {
InputFormat::Bed3 => {
let mut set_b = reader_b.bed3_set_with(translater.as_mut())?;
windowed_set_overlaps(
&mut set_a,
&mut set_b,
translater.as_ref(),
left,
right,
inverse,
output,
)
}
InputFormat::Bed6 => {
let mut set_b = reader_b.bed6_set_with(translater.as_mut())?;
windowed_set_overlaps(
&mut set_a,
&mut set_b,
translater.as_ref(),
left,
right,
inverse,
output,
)
}
InputFormat::Bed12 => {
let mut set_b = reader_b.bed12_set_with(translater.as_mut())?;
windowed_set_overlaps(
&mut set_a,
&mut set_b,
translater.as_ref(),
left,
right,
inverse,
output,
)
}
}
}
InputFormat::Bed12 => {
let mut set_a = reader_a.bed12_set_with(translater.as_mut())?;
match reader_b.input_format() {
InputFormat::Bed3 => {
let mut set_b = reader_b.bed3_set_with(translater.as_mut())?;
windowed_set_overlaps(
&mut set_a,
&mut set_b,
translater.as_ref(),
left,
right,
inverse,
output,
)
}
InputFormat::Bed6 => {
let mut set_b = reader_b.bed6_set_with(translater.as_mut())?;
windowed_set_overlaps(
&mut set_a,
&mut set_b,
translater.as_ref(),
left,
right,
inverse,
output,
)
}
InputFormat::Bed12 => {
let mut set_b = reader_b.bed12_set_with(translater.as_mut())?;
windowed_set_overlaps(
&mut set_a,
&mut set_b,
translater.as_ref(),
left,
right,
inverse,
output,
)
}
}
}
}
}

pub fn window(
path_a: Option<String>,
path_b: String,
output: Option<String>,
both: Option<usize>,
left: Option<usize>,
right: Option<usize>,
inverse: bool,
compression_threads: usize,
compression_level: u32,
) -> Result<()> {
let bed_a = BedReader::from_path(path_a, None, None)?;
let bed_b = BedReader::from_path(Some(path_b), None, None)?;
let output_handle = match_output(output, compression_threads, compression_level)?;
dispatch_window(bed_a, bed_b, both, left, right, inverse, output_handle)
}
Loading
Loading