Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

79 build naive coverage subcommand #80

Merged
merged 5 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "gia"
version = "0.2.5"
version = "0.2.6"
edition = "2021"
description = "A tool for set theoretic operations of genomic intervals"
license = "MIT"
Expand All @@ -12,33 +12,33 @@ keywords = ["genomics", "bioinformatics", "bed", "set", "interval"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0.72"
anyhow = "1.0.79"
# bedrs = { version = "0.1.11", features = ["serde", "rayon"] }
bedrs = { git = "https://github.com/noamteyssier/bedrs", branch = "bedrs-2.0", features = [
"serde",
"rayon",
] }
bstr = "1.6.0"
clap = { version = "4.3.19", features = ["derive"] }
csv = "1.2.2"
dashmap = { version = "5.5.0", features = ["serde"] }
bstr = "1.9.0"
clap = { version = "4.4.18", features = ["derive"] }
csv = "1.3.0"
dashmap = { version = "5.5.3", features = ["serde"] }
faiquery = "0.1.3"
hashbrown = "0.14.0"
hashbrown = "0.14.3"
human-sort = "0.2.2"
memmap2 = "0.7.1"
memmap2 = "0.9.4"
rand = "0.8.5"
rand_chacha = "0.3.1"
serde = { version = "1.0.183", features = ["derive"] }
serde = { version = "1.0.196", features = ["derive"] }
niffler = { version = "2.5.0", default-features = false, features = ["gz"] }
gzp = { version = "0.11.3", features = [
"deflate_rust",
], default-features = false }
rayon = "1.8.0"
rayon = "1.8.1"
flate2 = "1.0.28"

[dev-dependencies]
assert_cmd = "2.0.11"
predicates = "3.0.3"
assert_cmd = "2.0.13"
predicates = "3.1.0"

[profile.release]
lto = true
42 changes: 42 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,48 @@ pub enum Command {
stream: bool,
},

/// Calculates the coverage of intervals in Set A by intervals in Set B
Coverage {
/// Input BED file to intersect (default=stdin)
#[clap(short, long)]
a: Option<String>,

/// Secondary BED file to intersect
#[clap(short, long)]
b: String,

/// Output BED file to write to (default=stdout)
#[clap(short, long)]
output: Option<String>,

/// Minimum fraction of a's interval that must be covered by b's interval
#[clap(short = 'f', long)]
fraction_query: Option<f64>,

/// Minimum fraction of b's interval that must be covered by a's interval
#[clap(short = 'F', long)]
fraction_target: Option<f64>,

/// Require that the fraction provided with `-f` is reciprocal to both
/// query and target
#[clap(
short,
long,
requires = "fraction_query",
conflicts_with = "fraction_target"
)]
reciprocal: bool,

/// Requires that either fraction provided with `-f` or `-F` is met
#[clap(short, long, requires_all=&["fraction_query", "fraction_target"], conflicts_with = "reciprocal")]
either: bool,

/// Assert that the intervals are presorted in BOTH files (unexpected behavior if they are
/// not)
#[clap(short, long)]
sorted: bool,
},

/// Extends the intervals of a BED file
///
/// The extension is either done on both sides at once
Expand Down
189 changes: 189 additions & 0 deletions src/commands/coverage.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
use std::io::Write;

use anyhow::{bail, Result};
use bedrs::{traits::IntervalBounds, types::QueryMethod, IntervalContainer};
use serde::Serialize;

use crate::{
io::{match_output, write_depth_iter_with, BedReader},
types::{InputFormat, IntervalDepth, Rename, Renamer, Translater},
utils::{assign_query_method, sort_pairs},
};

fn run_coverage<'a, Ia, Ib, Na, W>(
set_a: &'a mut IntervalContainer<Ia, usize, usize>,
set_b: &'a mut IntervalContainer<Ib, usize, usize>,
translater: Option<&'a Translater>,
query_method: QueryMethod<usize>,
presorted: bool,
output_handle: W,
) -> Result<()>
where
Ia: IntervalBounds<usize, usize> + Copy + Serialize,
Ib: IntervalBounds<usize, usize> + Copy + Serialize,
Na: IntervalBounds<&'a str, usize> + Serialize,
W: Write,
Renamer: Rename<'a, Ia, Na>,
{
sort_pairs(set_a, set_b, presorted);
let depth_iter = set_a.records().iter().map(|iv| {
let n_overlaps = set_b
.find_iter_sorted_method_unchecked(iv, query_method)
.expect("Error in finding overlaps")
.count();
IntervalDepth::new(*iv, n_overlaps, translater)
});
write_depth_iter_with(depth_iter, output_handle, translater)
}

fn dispatch_coverage<W: Write>(
reader_a: BedReader,
reader_b: BedReader,
output: W,
query_method: QueryMethod<usize>,
presorted: bool,
) -> Result<()> {
if reader_a.is_named() != reader_b.is_named() {
bail!("Input files must both be named or both be unnamed");
}
let mut translater = if reader_a.is_named() {
Some(Translater::new())
} else {
None
};
match reader_a.input_format() {
InputFormat::Bed3 => {
let mut set_a = reader_a.bed3_set_with(translater.as_mut())?;
match reader_b.input_format() {
InputFormat::Bed3 => {
let mut set_b = reader_b.bed3_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed6 => {
let mut set_b = reader_b.bed6_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed12 => {
let mut set_b = reader_b.bed12_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
}
}
InputFormat::Bed6 => {
let mut set_a = reader_a.bed6_set_with(translater.as_mut())?;
match reader_b.input_format() {
InputFormat::Bed3 => {
let mut set_b = reader_b.bed3_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed6 => {
let mut set_b = reader_b.bed6_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed12 => {
let mut set_b = reader_b.bed12_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
}
}
InputFormat::Bed12 => {
let mut set_a = reader_a.bed12_set_with(translater.as_mut())?;
match reader_b.input_format() {
InputFormat::Bed3 => {
let mut set_b = reader_b.bed3_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed6 => {
let mut set_b = reader_b.bed6_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
InputFormat::Bed12 => {
let mut set_b = reader_b.bed12_set_with(translater.as_mut())?;
run_coverage(
&mut set_a,
&mut set_b,
translater.as_ref(),
query_method,
presorted,
output,
)
}
}
}
}
}

pub fn coverage(
a: Option<String>,
b: String,
output: Option<String>,
fraction_query: Option<f64>,
fraction_target: Option<f64>,
reciprocal: bool,
either: bool,
presorted: bool,
compression_threads: usize,
compression_level: u32,
) -> Result<()> {
let bed_a = BedReader::from_path(a, None, None)?;
let bed_b = BedReader::from_path(Some(b), None, None)?;
let query_method = assign_query_method(fraction_query, fraction_target, reciprocal, either);
let output_handle = match_output(output, compression_threads, compression_level)?;
dispatch_coverage(bed_a, bed_b, output_handle, query_method, presorted)
}
2 changes: 2 additions & 0 deletions src/commands/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
mod closest;
mod complement;
mod coverage;
mod extend;
mod flank;
mod get_fasta;
Expand All @@ -14,6 +15,7 @@ mod subtract;
mod window;
pub use closest::closest;
pub use complement::complement;
pub use coverage::coverage;
pub use extend::extend;
pub use flank::flank;
pub use get_fasta::get_fasta;
Expand Down
6 changes: 3 additions & 3 deletions src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ pub use read::{
};
use serde::{Deserialize, Serialize};
pub use write::{
build_writer, write_3col_iter_with, write_named_pairs_iter, write_named_records_iter_dashmap,
write_pairs_iter, write_pairs_iter_with, write_records_iter, write_records_iter_with,
WriteIter, WriteIterImpl, WriteNamedIter, WriteNamedIterImpl,
build_writer, write_3col_iter_with, write_depth_iter_with, write_named_pairs_iter,
write_named_records_iter_dashmap, write_pairs_iter, write_pairs_iter_with, write_records_iter,
write_records_iter_with, WriteIter, WriteIterImpl, WriteNamedIter, WriteNamedIterImpl,
};

#[derive(Deserialize, Serialize)]
Expand Down
4 changes: 2 additions & 2 deletions src/io/write/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ pub use iter::{
WriteNamedIterImpl,
};
pub use utils::{
build_writer, write_named_pairs_iter, write_named_records_iter_dashmap, write_pairs_iter,
write_pairs_iter_with, write_records_iter,
build_writer, write_depth_iter_with, write_named_pairs_iter, write_named_records_iter_dashmap,
write_pairs_iter, write_pairs_iter_with, write_records_iter,
};
55 changes: 54 additions & 1 deletion src/io/write/utils.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use crate::types::{IntervalPair, NumericBed3, Rename, Renamer, StreamTranslater, Translater};
use crate::types::{
IntervalDepth, IntervalPair, NumericBed3, Rename, Renamer, StreamTranslater, Translater,
};
use anyhow::Result;
use bedrs::{traits::IntervalBounds, Coordinates};
use serde::Serialize;
Expand All @@ -25,6 +27,57 @@ where
Ok(())
}

pub fn write_depth_iter_with<'a, W, I, N, It>(
records: It,
writer: W,
translater: Option<&Translater>,
) -> Result<()>
where
I: IntervalBounds<usize, usize> + Serialize,
N: IntervalBounds<&'a str, usize> + Serialize,
W: Write,
It: Iterator<Item = IntervalDepth<'a, I, N>>,
Renamer: Rename<'a, I, N>,
{
if translater.is_some() {
write_named_depth_iter(records, writer)
} else {
write_depth_iter(records, writer)
}
}

fn write_depth_iter<'a, W, I, N, It>(records: It, writer: W) -> Result<()>
where
I: IntervalBounds<usize, usize> + Serialize,
N: IntervalBounds<&'a str, usize> + Serialize,
W: Write,
It: Iterator<Item = IntervalDepth<'a, I, N>>,
Renamer: Rename<'a, I, N>,
{
let mut wtr = build_writer(writer);
for record in records {
wtr.serialize(record.get_tuple())?;
}
wtr.flush()?;
Ok(())
}

pub fn write_named_depth_iter<'a, W, I, N, It>(records: It, writer: W) -> Result<()>
where
I: IntervalBounds<usize, usize> + Serialize,
N: IntervalBounds<&'a str, usize> + Serialize,
W: Write,
It: Iterator<Item = IntervalDepth<'a, I, N>>,
Renamer: Rename<'a, I, N>,
{
let mut wtr = build_writer(writer);
for record in records {
wtr.serialize(record.get_named_tuple())?;
}
wtr.flush()?;
Ok(())
}

pub fn write_pairs_iter_with<'a, W, Ia, Ib, Na, Nb, It>(
records: It,
writer: W,
Expand Down
Loading
Loading