Skip to content

Commit

Permalink
feat: added invert as an output predicate to bam filter, bit slow tha…
Browse files Browse the repository at this point in the history
…n bedtools so should compare whether noodles or htslib is faster for writing
  • Loading branch information
noamteyssier committed Apr 5, 2024
1 parent 0f9bb0c commit 6106000
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 16 deletions.
11 changes: 11 additions & 0 deletions src/cli/bam/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,15 @@ pub struct FilterArgs {
pub struct FilterParams {
#[clap(flatten)]
pub overlap_predicates: OverlapPredicates,

#[clap(flatten)]
pub output_predicates: OutputPredicates,
}

#[derive(Parser, Debug)]
#[clap(next_help_heading = "Output Predicates")]
pub struct OutputPredicates {
/// Only return the records from a that DON'T overlap with b
#[clap(short = 'v', long)]
pub invert: bool,
}
78 changes: 62 additions & 16 deletions src/commands/bam/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use crate::{
types::{InputFormat, NumericBed3, SplitTranslater},
};

use super::utils::{parse_chr_name, parse_endpoints};
use anyhow::Result;
use bedrs::{traits::IntervalBounds, types::QueryMethod, IntervalContainer};
use noodles::bam::io::Writer as BamWriter;
Expand All @@ -16,8 +17,6 @@ use noodles::{
use serde::Serialize;
use std::io::{Read, Write};

use super::utils::{parse_chr_name, parse_endpoints};

fn temp_bed3(
record: &BamRecord,
header: &Header,
Expand All @@ -34,6 +33,58 @@ fn temp_bed3(
Ok(Some(NumericBed3::new(chr_idx, start, end)))
}

fn run_inverted_overlap<I, W>(
record: &BamRecord,
header: &Header,
set: &IntervalContainer<I, usize, usize>,
translater: &SplitTranslater,
query_method: QueryMethod<usize>,
wtr: &mut BamWriter<W>,
) -> Result<()>
where
I: IntervalBounds<usize, usize> + Copy + Serialize,
W: Write,
WriteNamedIterImpl: WriteNamedIter<I>,
{
if let Some(bed) = temp_bed3(record, header, translater)? {
let no_overlaps = set
.find_iter_sorted_method_unchecked(&bed, query_method)?
.next()
.is_none();
if no_overlaps {
wtr.write_record(header, record)?;
}
} else {
wtr.write_record(header, record)?;
}
Ok(())
}

fn run_overlap<I, W>(
record: &BamRecord,
header: &Header,
set: &IntervalContainer<I, usize, usize>,
translater: &SplitTranslater,
query_method: QueryMethod<usize>,
wtr: &mut BamWriter<W>,
) -> Result<()>
where
I: IntervalBounds<usize, usize> + Copy + Serialize,
W: Write,
WriteNamedIterImpl: WriteNamedIter<I>,
{
if let Some(bed) = temp_bed3(record, header, translater)? {
let any_overlaps = set
.find_iter_sorted_method_unchecked(&bed, query_method)?
.next()
.is_some();
if any_overlaps {
wtr.write_record(header, record)?;
}
}
Ok(())
}

fn run_filter<I, R, W>(
mut bam: Reader<R>,
header: Header,
Expand Down Expand Up @@ -61,20 +112,15 @@ where
// Initialize the overlap query method
let query_method: QueryMethod<usize> = params.overlap_predicates.into();

// Iterate over the BAM records
for record in bam.records() {
let record = record?;
let bed_record = if let Some(bed) = temp_bed3(&record, &header, translater)? {
bed
} else {
continue;
};
let any_overlaps = set
.find_iter_sorted_method_unchecked(&bed_record, query_method)?
.next()
.is_some();
if any_overlaps {
wtr.write_record(&header, &record)?;
if params.output_predicates.invert {
for record in bam.records() {
let record = record?;
run_inverted_overlap(&record, &header, &set, translater, query_method, &mut wtr)?;
}
} else {
for record in bam.records() {
let record = record?;
run_overlap(&record, &header, &set, translater, query_method, &mut wtr)?;
}
}
Ok(())
Expand Down

0 comments on commit 6106000

Please sign in to comment.