Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

99 add bam centric methods as separate subcommand #101

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "gia"
version = "0.2.15"
version = "0.2.16"
edition = "2021"
description = "A tool for set theoretic operations of genomic intervals"
license = "MIT"
Expand Down Expand Up @@ -34,6 +34,7 @@ gzp = { version = "0.11.3", features = [
], default-features = false }
rayon = "1.10.0"
flate2 = "1.0.28"
rust-htslib = "0.46.0"

[dev-dependencies]
assert_cmd = "2.0.14"
Expand Down
11 changes: 11 additions & 0 deletions src/cli/bam/commands.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
use super::{ConvertArgs, FilterArgs};
use clap::Parser;

// Subcommands grouped under the BAM-centric command; each variant carries the
// argument struct of one operation.
#[derive(Debug, Parser)]
pub enum BamCommand {
    /// Convert BAM to different formats
    Convert(ConvertArgs),

    /// Filter BAM records based on overlap criteria to other regions
    Filter(FilterArgs),
}
43 changes: 43 additions & 0 deletions src/cli/bam/convert.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
use crate::cli::SingleInputBam;
use clap::{Parser, ValueEnum};

// Arguments of the BAM `convert` subcommand: the BAM input options and the
// conversion parameters, both flattened into a single argument parser.
#[derive(Debug, Parser)]
pub struct ConvertArgs {
    // Source of the BAM records.
    #[clap(flatten)]
    pub input: SingleInputBam,

    // Conversion selector and its per-format option groups.
    #[clap(flatten)]
    pub params: ConvertParams,
}

// Conversion selector plus the option groups of every supported target format.
#[derive(Debug, Parser)]
#[clap(next_help_heading = "Parameters")]
pub struct ConvertParams {
    // Target format of the conversion; the CLI default value is "bed".
    #[clap(short, long, default_value = "bed")]
    pub conv: BamConversionType,

    // Options that apply to BED output.
    #[clap(flatten)]
    pub bed: BedConversionParams,

    // Options that apply to FASTQ output.
    #[clap(flatten)]
    pub fastq: FastqConversionParams,
}

// Option group for BED conversion.
#[derive(Debug, Parser)]
#[clap(next_help_heading = "BED Conversion Options")]
pub struct BedConversionParams {
    /// Include CIGAR string in BED output
    #[clap(short = 'C', long)]
    pub cigar: bool,
}

// Option group for FASTQ conversion; it currently declares no options.
#[derive(Debug, Parser)]
#[clap(next_help_heading = "FASTQ Conversion Options")]
pub struct FastqConversionParams {}

// Output formats selectable for BAM conversion; `Bed` is the `Default` value.
#[derive(Clone, Debug, Default, ValueEnum)]
pub enum BamConversionType {
    #[default]
    Bed,
    Fastq,
}
33 changes: 33 additions & 0 deletions src/cli/bam/filter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
use crate::cli::{BamOutput, MixedInput, OverlapPredicates};

use clap::Parser;

// Arguments of the BAM `filter` subcommand: the mixed BAM/BED inputs, the
// filtering parameters and the BAM output options, all flattened into one parser.
#[derive(Debug, Parser)]
pub struct FilterArgs {
    // BAM file to filter and the BED file it is compared against.
    #[clap(flatten)]
    pub inputs: MixedInput,

    // Overlap and output predicates.
    #[clap(flatten)]
    pub params: FilterParams,

    // Destination, format and thread count used for writing.
    #[clap(flatten)]
    pub output: BamOutput,
}

// Parameter groups of the `filter` subcommand.
#[derive(Debug, Parser)]
#[clap(next_help_heading = "Parameters")]
pub struct FilterParams {
    // Shared overlap predicate options (declared elsewhere in the crate).
    #[clap(flatten)]
    pub overlap_predicates: OverlapPredicates,

    // Options that select which records are returned.
    #[clap(flatten)]
    pub output_predicates: OutputPredicates,
}

// Options that select which records the filter returns.
#[derive(Debug, Parser)]
#[clap(next_help_heading = "Output Predicates")]
pub struct OutputPredicates {
    /// Only return the records from a that DON'T overlap with b
    #[clap(short = 'v', long)]
    pub invert: bool,
}
7 changes: 7 additions & 0 deletions src/cli/bam/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
mod commands;
mod convert;
mod filter;

pub use commands::BamCommand;
pub use convert::{BamConversionType, ConvertArgs, ConvertParams};
pub use filter::{FilterArgs, FilterParams};
10 changes: 7 additions & 3 deletions src/cli/commands.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
use super::{
ClosestArgs, ClusterArgs, ComplementArgs, CoverageArgs, ExtendArgs, FlankArgs, GetFastaArgs,
IntersectArgs, MergeArgs, RandomArgs, SampleArgs, SegmentArgs, ShiftArgs, SortArgs,
SpacingArgs, SubtractArgs, UnionBedGraphArgs, WindowArgs,
bam::BamCommand, ClosestArgs, ClusterArgs, ComplementArgs, CoverageArgs, ExtendArgs, FlankArgs,
GetFastaArgs, IntersectArgs, MergeArgs, RandomArgs, SampleArgs, SegmentArgs, ShiftArgs,
SortArgs, SpacingArgs, SubtractArgs, UnionBedGraphArgs, WindowArgs,
};
use clap::Subcommand;

#[derive(Subcommand)]
pub enum Command {
/// BAM-centric commands
#[clap(subcommand)]
Bam(BamCommand),

/// Finds the closest interval in a secondary BED file for all intervals in a primary BED file
Closest(ClosestArgs),

Expand Down
25 changes: 25 additions & 0 deletions src/cli/inputs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,31 @@ impl SingleInput {
}
}

// Input options for commands that read exactly one BAM file.
#[derive(Clone, Debug, Parser)]
#[clap(next_help_heading = "Single BAM Input Options")]
pub struct SingleInputBam {
    /// Input BAM file to process (default=stdin)
    #[clap(short, long)]
    pub input: Option<String>,
}

// Input options for commands that take one BAM file (`-a`, optional) and one
// BED file (`-b`, required).
#[derive(Clone, Debug, Parser)]
#[clap(next_help_heading = "Mixed BAM/Bed Dual Input")]
pub struct MixedInput {
    /// Input BAM file to process (default=stdin)
    #[clap(short = 'a', long)]
    pub bam: Option<String>,

    /// Input BED file to process
    #[clap(short = 'b', long)]
    pub bed: String,
}
impl MixedInput {
    /// Opens a [`BedReader`] on the BED file given by `self.bed`.
    ///
    /// # Errors
    ///
    /// Propagates whatever error `BedReader::from_path` returns.
    pub fn get_reader_bed(&self) -> Result<BedReader> {
        let bed_path = Some(self.bed.clone());
        // BED input is always read as string-based when it is paired with BAM files.
        let field_format = Some(FieldFormat::StringBased);
        BedReader::from_path(bed_path, None, field_format)
    }
}

#[derive(Parser, Debug, Clone)]
#[clap(next_help_heading = "Dual Input Options")]
pub struct DualInput {
Expand Down
5 changes: 3 additions & 2 deletions src/cli/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod bam;
mod closest;
mod cluster;
mod commands;
Expand Down Expand Up @@ -31,10 +32,10 @@ pub use extend::ExtendArgs;
pub use flank::FlankArgs;
pub use get_fasta::GetFastaArgs;
pub use growth::Growth;
pub use inputs::{DualInput, MultiInput, SingleInput};
pub use inputs::{DualInput, MixedInput, MultiInput, SingleInput, SingleInputBam};
pub use intersect::{IntersectArgs, IntersectParams, OutputMethod};
pub use merge::{MergeArgs, MergeParams};
pub use outputs::Output;
pub use outputs::{BamOutput, Output};
pub use overlap_predicates::OverlapPredicates;
pub use random::{RandomArgs, RandomParams};
pub use sample::{SampleArgs, SampleParams};
Expand Down
51 changes: 46 additions & 5 deletions src/cli/outputs.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
use std::io::Write;

use crate::io::{match_bam_output, match_output};
use anyhow::Result;
use clap::Parser;

use crate::io::match_output;
use clap::{Parser, ValueEnum};
use rust_htslib::bam::{Format, HeaderView, Writer as BamWriter};
use std::io::Write;

#[derive(Parser, Debug, Clone)]
#[clap(next_help_heading = "Output Options")]
Expand All @@ -29,3 +28,45 @@ impl Output {
)
}
}

// Output options for commands that write alignment records: destination path,
// container format and number of writer threads.
#[derive(Clone, Debug, Parser)]
#[clap(next_help_heading = "BAM Output Options")]
pub struct BamOutput {
    /// Output BAM file to write to (default=stdout)
    #[clap(short, long)]
    pub output: Option<String>,

    /// Output Format to write to (default=BAM)
    #[clap(short = 'O', long, default_value = "bam")]
    pub format: WrapHtsFormat,

    /// Threads to use when writing BAM files
    #[clap(short = 't', long, default_value = "1")]
    pub threads: usize,
}
impl BamOutput {
    /// Builds a [`BamWriter`] from these output options.
    ///
    /// The optional output path, the given `header`, the selected format and
    /// the thread count are forwarded to `match_bam_output`.
    ///
    /// # Errors
    ///
    /// Propagates whatever error `match_bam_output` returns.
    pub fn get_writer(&self, header: &HeaderView) -> Result<BamWriter> {
        let destination = self.output.clone();
        // `WrapHtsFormat` is `Copy`, so converting it does not move out of `self`.
        let hts_format = self.format.into();
        match_bam_output(destination, header, hts_format, self.threads)
    }
}

#[derive(Parser, Debug, Clone, ValueEnum, Copy)]
pub enum WrapHtsFormat {
Bam,
Sam,
Cram,
}
impl From<WrapHtsFormat> for Format {
    /// Maps each CLI format choice onto the `Format` variant of the same name.
    fn from(format: WrapHtsFormat) -> Self {
        match format {
            WrapHtsFormat::Sam => Self::Sam,
            WrapHtsFormat::Bam => Self::Bam,
            WrapHtsFormat::Cram => Self::Cram,
        }
    }
}
60 changes: 60 additions & 0 deletions src/commands/bam/convert/bed.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
use crate::cli::bam::ConvertParams;
use crate::commands::bam::utils::{
get_strand, parse_chr_name, parse_endpoints, parse_mapping_quality, parse_query_name,
};
use crate::io::build_writer;

use anyhow::Result;
use rust_htslib::bam::{HeaderView, Read, Reader as BamReader, Record};
use std::io::{stdout, Write};
use std::str::from_utf8;

/// Writes one BAM record to `wtr` as a single delimited row.
///
/// The row holds, in order: reference name, start, end, query name, mapping
/// quality and strand. When `params.bed.cigar` is set, the record's CIGAR
/// string is appended as a seventh column.
///
/// # Errors
///
/// Returns an error if any of the parsing helpers fails, if the reference or
/// query name is not valid UTF-8, or if the row cannot be serialized.
fn format_print_record<W: Write>(
    record: &Record,
    header: &HeaderView,
    params: &ConvertParams,
    wtr: &mut csv::Writer<W>,
) -> Result<()> {
    let chr_name = parse_chr_name(record, header)?;
    let (start, end) = parse_endpoints(record)?;
    let qname = parse_query_name(record)?;
    let mapq = parse_mapping_quality(record);
    let strand = get_strand(record);

    // Decode the byte-oriented names once so both row layouts share them.
    let chr = from_utf8(chr_name)?;
    let name = from_utf8(&qname)?;

    if params.bed.cigar {
        let cigar = record.cigar().to_string();
        wtr.serialize((chr, start, end, name, mapq, strand, cigar))?;
    } else {
        wtr.serialize((chr, start, end, name, mapq, strand))?;
    }
    Ok(())
}

/// Reads every record from `bam` and writes it to stdout as one row each.
///
/// A single `Record` buffer is reused for all reads. The header is cloned up
/// front because reading borrows the reader mutably.
///
/// # Errors
///
/// Returns the first read, formatting or flush error encountered.
pub fn convert_bed(mut bam: BamReader, params: ConvertParams) -> Result<()> {
    let header = bam.header().clone();
    let mut wtr = build_writer(stdout());
    let mut record = Record::new();
    loop {
        // `read` yields `None` once the input is exhausted.
        match bam.read(&mut record) {
            Some(status) => status?,
            None => break,
        }
        format_print_record(&record, &header, &params, &mut wtr)?;
    }
    wtr.flush()?;
    Ok(())
}
22 changes: 22 additions & 0 deletions src/commands/bam/convert/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
mod bed;
pub use bed::convert_bed;

use crate::cli::bam::{BamConversionType, ConvertArgs, ConvertParams};
use crate::io::match_bam_input;

use anyhow::{bail, Result};
use rust_htslib::bam::Reader as BamReader;

/// Runs the conversion selected by `params.conv` on the given BAM reader.
///
/// # Errors
///
/// Propagates errors from the BED conversion, and always fails for FASTQ,
/// which is not implemented.
fn dispatch_conversion(bam: BamReader, params: ConvertParams) -> Result<()> {
    match params.conv {
        BamConversionType::Bed => convert_bed(bam, params),
        // Matched explicitly rather than with `_` so that adding a new
        // conversion type is a compile error here instead of silently
        // reporting the FASTQ message.
        BamConversionType::Fastq => bail!(
            "FASTQ conversion is not implemented yet - but checkout samtools fastq for a solution"
        ),
    }
}

pub fn convert(args: ConvertArgs) -> Result<()> {
let bam = match_bam_input(args.input.input)?;
dispatch_conversion(bam, args.params)
}
Loading
Loading