Skip to content

Commit

Permalink
Merge pull request #84 from noamteyssier/83-add-bed4-as-an-input-format
Browse files Browse the repository at this point in the history
83 add bed4 as an input format
  • Loading branch information
noamteyssier authored Feb 8, 2024
2 parents 3f2279c + 65e4830 commit bbc1489
Show file tree
Hide file tree
Showing 22 changed files with 319 additions and 84 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "gia"
version = "0.2.7"
version = "0.2.9"
edition = "2021"
description = "A tool for set theoretic operations of genomic intervals"
license = "MIT"
Expand Down
32 changes: 31 additions & 1 deletion src/commands/get_fasta.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::{
cli::GetFastaArgs,
io::build_reader,
types::{InputFormat, NamedBed12, NamedBed3, NamedBed6},
types::{InputFormat, NamedBed12, NamedBed3, NamedBed4, NamedBed6},
};
use anyhow::Result;
use bedrs::Coordinates;
Expand Down Expand Up @@ -43,6 +43,35 @@ fn get_fasta_bed3<R: Read, W: Write>(
Ok(())
}

fn get_fasta_bed4<R: Read, W: Write>(
csv_reader: &mut csv::Reader<R>,
byterecord: &mut ByteRecord,
fasta: IndexedFasta,
mut output: W,
) -> Result<()> {
while csv_reader.read_byte_record(byterecord)? {
let record: NamedBed4 = byterecord.deserialize(None)?;
match fasta.query_buffer(record.chr(), record.start(), record.end()) {
Ok(buffer) => {
writeln!(
output,
">{}:{}-{}::{}",
record.chr(),
record.start(),
record.end(),
record.name(),
)?;
for subseq in buffer.split_str("\n") {
output.write_all(subseq)?;
}
output.write_all(b"\n")?;
}
Err(_) => continue,
}
}
Ok(())
}

fn get_fasta_bed6<R: Read, W: Write>(
csv_reader: &mut csv::Reader<R>,
byterecord: &mut ByteRecord,
Expand Down Expand Up @@ -121,6 +150,7 @@ pub fn get_fasta(args: GetFastaArgs) -> Result<()> {
let mut byterecord = ByteRecord::new();
match format {
InputFormat::Bed3 => get_fasta_bed3(&mut csv_reader, &mut byterecord, fasta, writer),
InputFormat::Bed4 => get_fasta_bed4(&mut csv_reader, &mut byterecord, fasta, writer),
InputFormat::Bed6 => get_fasta_bed6(&mut csv_reader, &mut byterecord, fasta, writer),
InputFormat::Bed12 => get_fasta_bed12(&mut csv_reader, &mut byterecord, fasta, writer),
}
Expand Down
10 changes: 7 additions & 3 deletions src/commands/merge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::{
build_reader, iter_unnamed, write_3col_iter_with, write_records_iter, BedReader,
WriteNamedIter, WriteNamedIterImpl,
},
types::{InputFormat, NumericBed3, Translater},
types::{InputFormat, NumericBed12, NumericBed3, NumericBed4, NumericBed6, Translater},
};
use anyhow::Result;
use bedrs::{traits::IntervalBounds, IntervalContainer, MergeIter};
Expand Down Expand Up @@ -56,12 +56,16 @@ fn merge_streamed_by_format<W: Write>(bed_reader: BedReader, writer: W) -> Resul
let record_iter: Box<dyn Iterator<Item = NumericBed3>> = iter_unnamed(&mut csv_reader);
merge_streamed(record_iter, writer)
}
InputFormat::Bed4 => {
let record_iter: Box<dyn Iterator<Item = NumericBed4>> = iter_unnamed(&mut csv_reader);
merge_streamed(record_iter, writer)
}
InputFormat::Bed6 => {
let record_iter: Box<dyn Iterator<Item = NumericBed3>> = iter_unnamed(&mut csv_reader);
let record_iter: Box<dyn Iterator<Item = NumericBed6>> = iter_unnamed(&mut csv_reader);
merge_streamed(record_iter, writer)
}
InputFormat::Bed12 => {
let record_iter: Box<dyn Iterator<Item = NumericBed3>> = iter_unnamed(&mut csv_reader);
let record_iter: Box<dyn Iterator<Item = NumericBed12>> = iter_unnamed(&mut csv_reader);
merge_streamed(record_iter, writer)
}
}
Expand Down
36 changes: 35 additions & 1 deletion src/commands/random.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::{
cli::RandomArgs,
io::{match_input, write_records_iter_with},
types::{Genome, InputFormat, NumericBed12, NumericBed3, NumericBed6, Translater},
types::{Genome, InputFormat, NumericBed12, NumericBed3, NumericBed4, NumericBed6, Translater},
};
use anyhow::Result;
use bedrs::Strand;
Expand Down Expand Up @@ -58,6 +58,39 @@ pub fn random_bed3<W: Write>(args: RandomArgs, writer: W) -> Result<()> {
write_records_iter_with(interval_gen, writer, genome.translater())
}

pub fn random_bed4<W: Write>(args: RandomArgs, writer: W) -> Result<()> {
let mut rng_intervals = args.build_rng();
let mut rng_chr = args.build_rng();
let mut translater = Translater::new();
let genome_sizes = build_chr_size(
args.n_chr,
args.max_chr_len,
args.genome,
args.named,
&mut translater,
)?;

let interval_gen = (0..args.n_intervals)
// draw a random chromosome
.map(|_| genome_sizes.sample_chr(&mut rng_chr))
// draw a random end position in the chromosome
.map(|c| {
let y = rng_intervals.gen_range(args.l_intervals..=genome_sizes.chr_size_unchecked(c));
(c, y)
})
// calculate the start position
.map(|(c, y)| {
let x = y - args.l_intervals;
(c, x, y)
})
// build the interval
.map(|(c, x, y)| NumericBed4::new(c, x, y, 0));

write_records_iter_with(interval_gen, writer, genome_sizes.translater())?;

Ok(())
}

pub fn random_bed6<W: Write>(args: RandomArgs, writer: W) -> Result<()> {
let mut rng_intervals = args.build_rng();
let mut rng_chr = args.build_rng();
Expand Down Expand Up @@ -153,6 +186,7 @@ pub fn random(args: RandomArgs) -> Result<()> {
let writer = args.output.get_writer()?;
match args.format {
InputFormat::Bed3 => random_bed3(args, writer),
InputFormat::Bed4 => random_bed4(args, writer),
InputFormat::Bed6 => random_bed6(args, writer),
InputFormat::Bed12 => random_bed12(args, writer),
}
Expand Down
4 changes: 4 additions & 0 deletions src/commands/sample.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ pub fn sample(args: SampleArgs) -> Result<()> {
let (mut set, translater) = reader.bed3_set()?;
sample_from_set(&mut set, translater.as_ref(), args.params, writer)
}
InputFormat::Bed4 => {
let (mut set, translater) = reader.bed4_set()?;
sample_from_set(&mut set, translater.as_ref(), args.params, writer)
}
InputFormat::Bed6 => {
let (mut set, translater) = reader.bed6_set()?;
sample_from_set(&mut set, translater.as_ref(), args.params, writer)
Expand Down
12 changes: 12 additions & 0 deletions src/dispatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ macro_rules! dispatch_single {
let (set, translater) = $reader.bed3_set()?;
$func(set, translater, $params, $writer)
}
InputFormat::Bed4 => {
let (set, translater) = $reader.bed4_set()?;
$func(set, translater, $params, $writer)
}
InputFormat::Bed6 => {
let (set, translater) = $reader.bed6_set()?;
$func(set, translater, $params, $writer)
Expand Down Expand Up @@ -39,6 +43,10 @@ macro_rules! dispatch_to_lhs {
let set_a = $reader_a.bed3_set_with($translater.as_mut())?;
$crate::dispatch_to_rhs!(set_a, $reader_b, $translater, $writer, $params, $func)
}
InputFormat::Bed4 => {
let set_a = $reader_a.bed4_set_with($translater.as_mut())?;
$crate::dispatch_to_rhs!(set_a, $reader_b, $translater, $writer, $params, $func)
}
InputFormat::Bed6 => {
let set_a = $reader_a.bed6_set_with($translater.as_mut())?;
$crate::dispatch_to_rhs!(set_a, $reader_b, $translater, $writer, $params, $func)
Expand All @@ -61,6 +69,10 @@ macro_rules! dispatch_to_rhs {
let set_b = $reader_b.bed3_set_with($translater.as_mut())?;
$func($set_a, set_b, $translater.as_ref(), $params, $writer)
}
InputFormat::Bed4 => {
let set_b = $reader_b.bed4_set_with($translater.as_mut())?;
$func($set_a, set_b, $translater.as_ref(), $params, $writer)
}
InputFormat::Bed6 => {
let set_b = $reader_b.bed6_set_with($translater.as_mut())?;
$func($set_a, set_b, $translater.as_ref(), $params, $writer)
Expand Down
72 changes: 72 additions & 0 deletions src/io/read/bed4.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
use super::build_reader;
use crate::types::{Bed4Set, NamedBed4, NumericBed4, Translater};
use anyhow::{bail, Result};
use bedrs::Coordinates;
use csv::ByteRecord;
use std::io::Read;

pub fn read_bed4_set<R: Read>(reader: R, named: bool) -> Result<(Bed4Set, Option<Translater>)> {
if named {
let (set, translater) = read_bed4_set_named(reader)?;
Ok((set, Some(translater)))
} else {
let set = read_bed4_set_unnamed(reader)?;
Ok((set, None))
}
}

pub fn read_bed4_set_with<R: Read>(
reader: R,
translater: Option<&mut Translater>,
) -> Result<Bed4Set> {
if let Some(translater) = translater {
convert_bed4_set(reader, translater)
} else {
read_bed4_set_unnamed(reader)
}
}

fn read_bed4_set_unnamed<R: Read>(reader: R) -> Result<Bed4Set> {
let mut reader = build_reader(reader);
let set = reader
.deserialize()
.map(|record| {
let record: NumericBed4 = match record {
Ok(record) => record,
Err(e) => {
bail!("Could not build bed record:\n\nIf your BED has non-integer chromosome names try rerunning with the `-N` flag:\n\nERROR: {}", e)
}
};
Ok(record)
})
.collect::<Result<Bed4Set>>()?;
Ok(set)
}

/// Reads a single file into a GenomicIntervalSet and a Translater
fn read_bed4_set_named<R: Read>(reader: R) -> Result<(Bed4Set, Translater)> {
let mut translater = Translater::new();
let set = convert_bed4_set(reader, &mut translater)?;
Ok((set, translater))
}

/// Convert a CSV reader into a GenomicIntervalSet
///
/// It uses an externally initialized name map and index map to keep track of
/// chromosome names and indices. This is useful for reading multiple files
/// and keeping track of the same chromosome names and indices.
fn convert_bed4_set<R: Read>(reader: R, translater: &mut Translater) -> Result<Bed4Set> {
let mut reader = build_reader(reader);
let mut raw_record = ByteRecord::new();
let mut set = Bed4Set::empty();
while reader.read_byte_record(&mut raw_record)? {
let record: NamedBed4 = raw_record.deserialize(None)?;
translater.add_name(record.chr());
translater.add_name(record.name());
let chr_int = translater.get_idx(record.chr()).unwrap();
let name_int = translater.get_idx(record.name()).unwrap();
let interval = NumericBed4::new(chr_int, record.start(), record.end(), name_int);
set.insert(interval);
}
Ok(set)
}
17 changes: 14 additions & 3 deletions src/io/read/bed_reader.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use super::{
read_bed12_set, read_bed12_set_with, read_bed3_set, read_bed3_set_with, read_bed6_set,
read_bed6_set_with,
read_bed12_set, read_bed12_set_with, read_bed3_set, read_bed3_set_with, read_bed4_set,
read_bed4_set_with, read_bed6_set, read_bed6_set_with,
};
use crate::types::{Bed12Set, Bed3Set, Bed6Set, FieldFormat, InputFormat, Translater};
use crate::types::{Bed12Set, Bed3Set, Bed4Set, Bed6Set, FieldFormat, InputFormat, Translater};
use anyhow::Result;
use flate2::read::MultiGzDecoder;
use gzp::BgzfSyncReader;
Expand Down Expand Up @@ -105,6 +105,12 @@ impl BedReader {
read_bed3_set(self.reader(), is_named)
}

/// Returns a Bed4Set from the reader with an Option<Translater>
pub fn bed4_set(self) -> Result<(Bed4Set, Option<Translater>)> {
let is_named = self.is_named();
read_bed4_set(self.reader(), is_named)
}

/// Returns a Bed6Set from the reader with an Option<Translater>
pub fn bed6_set(self) -> Result<(Bed6Set, Option<Translater>)> {
let is_named = self.is_named();
Expand All @@ -122,6 +128,11 @@ impl BedReader {
read_bed3_set_with(self.reader(), translater)
}

/// Returns a Bed4Set from the reader
pub fn bed4_set_with(self, translater: Option<&mut Translater>) -> Result<Bed4Set> {
read_bed4_set_with(self.reader(), translater)
}

/// Returns a Bed6Set from the reader
pub fn bed6_set_with(self, translater: Option<&mut Translater>) -> Result<Bed6Set> {
read_bed6_set_with(self.reader(), translater)
Expand Down
10 changes: 2 additions & 8 deletions src/io/read/mod.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,14 @@
pub mod bed12;
pub mod bed3;
pub mod bed4;
pub mod bed6;
pub mod bed_reader;
pub mod iter;
pub mod utils;
pub use bed12::{read_bed12_set, read_bed12_set_with};
pub use bed3::{read_bed3_set, read_bed3_set_with};
pub use bed4::{read_bed4_set, read_bed4_set_with};
pub use bed6::{read_bed6_set, read_bed6_set_with};
pub use bed_reader::BedReader;
pub use iter::iter_unnamed;
pub use utils::build_reader;

use crate::types::{Bed3Set, Bed6Set};

#[allow(dead_code)]
pub enum SetFormat {
Bed3(Bed3Set),
Bed6(Bed6Set),
}
36 changes: 35 additions & 1 deletion src/io/write/iter.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::build_writer;
use crate::types::{NumericBed12, NumericBed3, NumericBed6, Translate};
use crate::types::{NumericBed12, NumericBed3, NumericBed4, NumericBed6, Translate};
use anyhow::Result;
use bedrs::Coordinates;
use serde::Serialize;
Expand Down Expand Up @@ -144,6 +144,40 @@ impl<'a> WriteNamedIter<&'a NumericBed3> for WriteNamedIterImpl {
Ok(())
}
}
impl WriteNamedIter<NumericBed4> for WriteNamedIterImpl {
fn write_named_iter<W: Write, It: Iterator<Item = NumericBed4>, Tr: Translate>(
writer: W,
iterator: It,
translater: &Tr,
) -> Result<()> {
let mut wtr = build_writer(writer);
for interval in iterator {
let chr = translater.get_name(*interval.chr()).unwrap();
let name = translater.get_name(*interval.name()).unwrap();
let named_interval = (chr, interval.start(), interval.end(), name);
wtr.serialize(named_interval)?;
}
wtr.flush()?;
Ok(())
}
}
impl<'a> WriteNamedIter<&'a NumericBed4> for WriteNamedIterImpl {
fn write_named_iter<W: Write, It: Iterator<Item = &'a NumericBed4>, Tr: Translate>(
writer: W,
iterator: It,
translater: &Tr,
) -> Result<()> {
let mut wtr = build_writer(writer);
for interval in iterator {
let chr = translater.get_name(*interval.chr()).unwrap();
let name = translater.get_name(*interval.name()).unwrap();
let named_interval = (chr, interval.start(), interval.end(), name);
wtr.serialize(named_interval)?;
}
wtr.flush()?;
Ok(())
}
}
impl WriteNamedIter<NumericBed6> for WriteNamedIterImpl {
fn write_named_iter<W: Write, It: Iterator<Item = NumericBed6>, Tr: Translate>(
writer: W,
Expand Down
4 changes: 3 additions & 1 deletion src/types/formats/in_formats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use std::{io::BufReader, str::from_utf8};
pub enum InputFormat {
#[default]
Bed3,
Bed4,
Bed6,
Bed12,
}
Expand All @@ -28,6 +29,7 @@ impl InputFormat {
let num_fields = first.split(|b| *b == b'\t').count();
match num_fields {
3 => Ok(InputFormat::Bed3),
4 => Ok(InputFormat::Bed4),
6 => Ok(InputFormat::Bed6),
12 => Ok(InputFormat::Bed12),
_ => bail!(
Expand Down Expand Up @@ -68,7 +70,7 @@ impl FieldFormat {
Ok(FieldFormat::IntegerBased)
}
}
InputFormat::Bed6 | InputFormat::Bed12 => {
InputFormat::Bed4 | InputFormat::Bed6 | InputFormat::Bed12 => {
let chr = from_utf8(fields[0])?;
let name = from_utf8(fields[3])?;
if chr.parse::<u32>().is_err() || name.parse::<u32>().is_err() {
Expand Down
Loading

0 comments on commit bbc1489

Please sign in to comment.