Skip to content

Commit

Permalink
Merge pull request #80 from noamteyssier/79-build-naive-coverage-subc…
Browse files Browse the repository at this point in the history
…ommand

79 build naive coverage subcommand
  • Loading branch information
noamteyssier authored Feb 6, 2024
2 parents 896a2a8 + ff8140b commit 267dfc4
Show file tree
Hide file tree
Showing 17 changed files with 572 additions and 20 deletions.
24 changes: 12 additions & 12 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "gia"
version = "0.2.5"
version = "0.2.6"
edition = "2021"
description = "A tool for set theoretic operations of genomic intervals"
license = "MIT"
Expand All @@ -12,33 +12,33 @@ keywords = ["genomics", "bioinformatics", "bed", "set", "interval"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0.72"
anyhow = "1.0.79"
# bedrs = { version = "0.1.11", features = ["serde", "rayon"] }
bedrs = { git = "https://github.com/noamteyssier/bedrs", branch = "bedrs-2.0", features = [
"serde",
"rayon",
] }
bstr = "1.6.0"
clap = { version = "4.3.19", features = ["derive"] }
csv = "1.2.2"
dashmap = { version = "5.5.0", features = ["serde"] }
bstr = "1.9.0"
clap = { version = "4.4.18", features = ["derive"] }
csv = "1.3.0"
dashmap = { version = "5.5.3", features = ["serde"] }
faiquery = "0.1.3"
hashbrown = "0.14.0"
hashbrown = "0.14.3"
human-sort = "0.2.2"
memmap2 = "0.7.1"
memmap2 = "0.9.4"
rand = "0.8.5"
rand_chacha = "0.3.1"
serde = { version = "1.0.183", features = ["derive"] }
serde = { version = "1.0.196", features = ["derive"] }
niffler = { version = "2.5.0", default-features = false, features = ["gz"] }
gzp = { version = "0.11.3", features = [
"deflate_rust",
], default-features = false }
rayon = "1.8.0"
rayon = "1.8.1"
flate2 = "1.0.28"

[dev-dependencies]
assert_cmd = "2.0.11"
predicates = "3.0.3"
assert_cmd = "2.0.13"
predicates = "3.1.0"

[profile.release]
lto = true
42 changes: 42 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,48 @@ pub enum Command {
stream: bool,
},

/// Calculates the coverage of intervals in Set A by intervals in Set B
Coverage {
/// Input BED file to intersect (default=stdin)
#[clap(short, long)]
a: Option<String>,

/// Secondary BED file to intersect
#[clap(short, long)]
b: String,

/// Output BED file to write to (default=stdout)
#[clap(short, long)]
output: Option<String>,

/// Minimum fraction of a's interval that must be covered by b's interval
#[clap(short = 'f', long)]
fraction_query: Option<f64>,

/// Minimum fraction of b's interval that must be covered by a's interval
#[clap(short = 'F', long)]
fraction_target: Option<f64>,

/// Require that the fraction provided with `-f` is reciprocal to both
/// query and target
#[clap(
short,
long,
requires = "fraction_query",
conflicts_with = "fraction_target"
)]
reciprocal: bool,

/// Requires that either fraction provided with `-f` or `-F` is met
#[clap(short, long, requires_all=&["fraction_query", "fraction_target"], conflicts_with = "reciprocal")]
either: bool,

/// Assert that the intervals are presorted in BOTH files (unexpected behavior if they are
/// not)
#[clap(short, long)]
sorted: bool,
},

/// Extends the intervals of a BED file
///
/// The extension is either done on both sides at once
Expand Down
189 changes: 189 additions & 0 deletions src/commands/coverage.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
use std::io::Write;

use anyhow::{bail, Result};
use bedrs::{traits::IntervalBounds, types::QueryMethod, IntervalContainer};
use serde::Serialize;

use crate::{
io::{match_output, write_depth_iter_with, BedReader},
types::{InputFormat, IntervalDepth, Rename, Renamer, Translater},
utils::{assign_query_method, sort_pairs},
};

fn run_coverage<'a, Ia, Ib, Na, W>(
set_a: &'a mut IntervalContainer<Ia, usize, usize>,
set_b: &'a mut IntervalContainer<Ib, usize, usize>,
translater: Option<&'a Translater>,
query_method: QueryMethod<usize>,
presorted: bool,
output_handle: W,
) -> Result<()>
where
Ia: IntervalBounds<usize, usize> + Copy + Serialize,
Ib: IntervalBounds<usize, usize> + Copy + Serialize,
Na: IntervalBounds<&'a str, usize> + Serialize,
W: Write,
Renamer: Rename<'a, Ia, Na>,
{
sort_pairs(set_a, set_b, presorted);
let depth_iter = set_a.records().iter().map(|iv| {
let n_overlaps = set_b
.find_iter_sorted_method_unchecked(iv, query_method)
.expect("Error in finding overlaps")
.count();
IntervalDepth::new(*iv, n_overlaps, translater)
});
write_depth_iter_with(depth_iter, output_handle, translater)
}

fn dispatch_coverage<W: Write>(
reader_a: BedReader,
reader_b: BedReader,
output: W,
query_method: QueryMethod<usize>,
presorted: bool,
) -> Result<()> {
if reader_a.is_named() != reader_b.is_named() {
bail!("Input files must both be named or both be unnamed");
}
let mut translater = if reader_a.is_named() {
Some(Translater::new())
} else {
None
};
match reader_a.input_format() {
InputFormat::Bed3 => {
let mut set_a = reader_a.bed3_set_with(translater.as_mut())?;
match reader_b.input_format() {
InputFormat::Bed3 => {
let mut set_b = reader_b.bed3_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed6 => {
let mut set_b = reader_b.bed6_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed12 => {
let mut set_b = reader_b.bed12_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
}
}
InputFormat::Bed6 => {
let mut set_a = reader_a.bed6_set_with(translater.as_mut())?;
match reader_b.input_format() {
InputFormat::Bed3 => {
let mut set_b = reader_b.bed3_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed6 => {
let mut set_b = reader_b.bed6_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed12 => {
let mut set_b = reader_b.bed12_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
}
}
InputFormat::Bed12 => {
let mut set_a = reader_a.bed12_set_with(translater.as_mut())?;
match reader_b.input_format() {
InputFormat::Bed3 => {
let mut set_b = reader_b.bed3_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed6 => {
let mut set_b = reader_b.bed6_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed12 => {
let mut set_b = reader_b.bed12_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
}
}
}
}

pub fn coverage(
a: Option<String>,
b: String,
output: Option<String>,
fraction_query: Option<f64>,
fraction_target: Option<f64>,
reciprocal: bool,
either: bool,
presorted: bool,
compression_threads: usize,
compression_level: u32,
) -> Result<()> {
let bed_a = BedReader::from_path(a, None, None)?;
let bed_b = BedReader::from_path(Some(b), None, None)?;
let query_method = assign_query_method(fraction_query, fraction_target, reciprocal, either);
let output_handle = match_output(output, compression_threads, compression_level)?;
dispatch_coverage(bed_a, bed_b, output_handle, query_method, presorted)
}
2 changes: 2 additions & 0 deletions src/commands/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
mod closest;
mod complement;
mod coverage;
mod extend;
mod flank;
mod get_fasta;
Expand All @@ -14,6 +15,7 @@ mod subtract;
mod window;
pub use closest::closest;
pub use complement::complement;
pub use coverage::coverage;
pub use extend::extend;
pub use flank::flank;
pub use get_fasta::get_fasta;
Expand Down
6 changes: 3 additions & 3 deletions src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ pub use read::{
};
use serde::{Deserialize, Serialize};
pub use write::{
build_writer, write_3col_iter_with, write_named_pairs_iter, write_named_records_iter_dashmap,
write_pairs_iter, write_pairs_iter_with, write_records_iter, write_records_iter_with,
WriteIter, WriteIterImpl, WriteNamedIter, WriteNamedIterImpl,
build_writer, write_3col_iter_with, write_depth_iter_with, write_named_pairs_iter,
write_named_records_iter_dashmap, write_pairs_iter, write_pairs_iter_with, write_records_iter,
write_records_iter_with, WriteIter, WriteIterImpl, WriteNamedIter, WriteNamedIterImpl,
};

#[derive(Deserialize, Serialize)]
Expand Down
4 changes: 2 additions & 2 deletions src/io/write/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ pub use iter::{
WriteNamedIterImpl,
};
pub use utils::{
build_writer, write_named_pairs_iter, write_named_records_iter_dashmap, write_pairs_iter,
write_pairs_iter_with, write_records_iter,
build_writer, write_depth_iter_with, write_named_pairs_iter, write_named_records_iter_dashmap,
write_pairs_iter, write_pairs_iter_with, write_records_iter,
};
55 changes: 54 additions & 1 deletion src/io/write/utils.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use crate::types::{IntervalPair, NumericBed3, Rename, Renamer, StreamTranslater, Translater};
use crate::types::{
IntervalDepth, IntervalPair, NumericBed3, Rename, Renamer, StreamTranslater, Translater,
};
use anyhow::Result;
use bedrs::{traits::IntervalBounds, Coordinates};
use serde::Serialize;
Expand All @@ -25,6 +27,57 @@ where
Ok(())
}

pub fn write_depth_iter_with<'a, W, I, N, It>(
records: It,
writer: W,
translater: Option<&Translater>,
) -> Result<()>
where
I: IntervalBounds<usize, usize> + Serialize,
N: IntervalBounds<&'a str, usize> + Serialize,
W: Write,
It: Iterator<Item = IntervalDepth<'a, I, N>>,
Renamer: Rename<'a, I, N>,
{
if translater.is_some() {
write_named_depth_iter(records, writer)
} else {
write_depth_iter(records, writer)
}
}

fn write_depth_iter<'a, W, I, N, It>(records: It, writer: W) -> Result<()>
where
I: IntervalBounds<usize, usize> + Serialize,
N: IntervalBounds<&'a str, usize> + Serialize,
W: Write,
It: Iterator<Item = IntervalDepth<'a, I, N>>,
Renamer: Rename<'a, I, N>,
{
let mut wtr = build_writer(writer);
for record in records {
wtr.serialize(record.get_tuple())?;
}
wtr.flush()?;
Ok(())
}

pub fn write_named_depth_iter<'a, W, I, N, It>(records: It, writer: W) -> Result<()>
where
I: IntervalBounds<usize, usize> + Serialize,
N: IntervalBounds<&'a str, usize> + Serialize,
W: Write,
It: Iterator<Item = IntervalDepth<'a, I, N>>,
Renamer: Rename<'a, I, N>,
{
let mut wtr = build_writer(writer);
for record in records {
wtr.serialize(record.get_named_tuple())?;
}
wtr.flush()?;
Ok(())
}

pub fn write_pairs_iter_with<'a, W, Ia, Ib, Na, Nb, It>(
records: It,
writer: W,
Expand Down
Loading

0 comments on commit 267dfc4

Please sign in to comment.