Merge pull request #46 from cauliyang/dev
feat: Add Fasta module and Fasta2Parquet CLI
cauliyang authored Jan 7, 2025
2 parents 468b8c3 + e222039 commit 401ced5
Showing 22 changed files with 146 additions and 76 deletions.
14 changes: 7 additions & 7 deletions Cargo.toml
@@ -12,7 +12,7 @@ repository = "https://github.com/cauliyang/DeepBioP"
license = "Apache-2.0"

[workspace.dependencies]
pyo3 = { version = "0.21.2", features = [
pyo3 = { version = "0.23.3", features = [
"abi3-py39",
"extension-module",
"anyhow",
@@ -23,7 +23,7 @@ anyhow = "1.0"
walkdir = { version = "2.5" }
rayon = { version = "1.10" }
log = "0.4"
pyo3-log = "0.11"
pyo3-log = "0.12.1"
noodles = { version = "0.87.0", features = [
"bgzf",
"core",
@@ -38,22 +38,22 @@ bio = "2.0"
needletail = "0.6"

ahash = "0.8.11"
numpy = "0.21"
ndarray = { version = "0.15", features = ["serde", "rayon"] }
numpy = "0.23"
ndarray = { version = "0.16", features = ["serde", "rayon"] }
num-traits = { version = "0.2" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
rand = "0.8"
rand_distr = "0.4"
bitvec = "1.0"
itertools = "0.13.0"
itertools = "0.14.0"
derive_builder = "0.20"
lexical = "7.0"
bstr = "1.11.3"
lazy_static = "1.5.0"
tempfile = "3.15"
parquet = "52.2.0"
arrow = "52.2"
parquet = "54.0"
arrow = "54.0"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.8.1" }
colored = "2.2"
textwrap = "0.16"
4 changes: 3 additions & 1 deletion crates/deepbiop-bam/src/python.rs
@@ -13,13 +13,15 @@ use pyo3_stub_gen::derive::*;
/// Calculate the number of chimeric reads in a BAM file.
#[gen_stub_pyfunction(module = "deepbiop.bam")]
#[pyfunction]
+#[pyo3(signature = (bam, threads=None))]
fn count_chimeric_reads_for_path(bam: PathBuf, threads: Option<usize>) -> Result<usize> {
chimeric::count_chimeric_reads_for_path(bam, threads)
}

/// Calculate the number of chimeric reads in multiple BAM files.
#[gen_stub_pyfunction(module = "deepbiop.bam")]
#[pyfunction]
+#[pyo3(signature = (bams, threads=None))]
fn count_chimeric_reads_for_paths(
bams: Vec<PathBuf>,
threads: Option<usize>,
@@ -38,7 +40,7 @@ fn left_right_soft_clip(cigar_string: &str) -> Result<(usize, usize)> {
// register bam sub module
pub fn register_bam_module(parent_module: &Bound<'_, PyModule>) -> PyResult<()> {
let sub_module_name = "bam";
-let child_module = PyModule::new_bound(parent_module.py(), sub_module_name)?;
+let child_module = PyModule::new(parent_module.py(), sub_module_name)?;

child_module.add_function(wrap_pyfunction!(left_right_soft_clip, &child_module)?)?;
child_module.add_function(wrap_pyfunction!(
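
The signature attributes and the `PyModule::new` call above come from the pyo3 bump declared in Cargo.toml (0.21.2 to 0.23.3): the `*_bound` constructors lost their suffix, and functions with optional arguments now carry an explicit `#[pyo3(signature = ...)]`. A minimal, self-contained sketch of that 0.23 style, written for illustration rather than taken from the repository:

use pyo3::prelude::*;

/// Toy function with an optional argument; the explicit signature gives the
/// Python side a `None` default, matching the attributes added above.
#[pyfunction]
#[pyo3(signature = (path, threads=None))]
fn describe(path: String, threads: Option<usize>) -> String {
    format!("path={path}, threads={}", threads.unwrap_or(1))
}

/// Register a submodule the way the diff does for `deepbiop.bam`:
/// in pyo3 0.23, `PyModule::new` returns a `Bound<'_, PyModule>` directly.
fn register_demo_module(parent_module: &Bound<'_, PyModule>) -> PyResult<()> {
    let child_module = PyModule::new(parent_module.py(), "demo")?;
    child_module.add_function(wrap_pyfunction!(describe, &child_module)?)?;
    parent_module.add_submodule(&child_module)
}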
3 changes: 2 additions & 1 deletion crates/deepbiop-cli/Cargo.toml
@@ -15,6 +15,7 @@ noodles = { workspace = true }
deepbiop-fq = { workspace = true }
deepbiop-bam = { workspace = true }
deepbiop-utils = { workspace = true }
+deepbiop-fa = { workspace = true }

rayon = { workspace = true }
log = { workspace = true }
@@ -23,7 +24,7 @@ ahash = { workspace = true }

clap = { version = "4.5", features = ["derive"] }
clap_complete = "4.5"
clap-verbosity-flag = "2.2"
clap-verbosity-flag = "3.0"
ctrlc = "3.4"
human-panic = "2.0"
env_logger = "0.11.6"
2 changes: 2 additions & 0 deletions crates/deepbiop-cli/src/cli.rs
@@ -6,6 +6,8 @@ pub mod fa2fq;
pub use fa2fq::*;
pub mod chimeric_count;
pub use chimeric_count::*;
+pub mod fa2parquet;
+pub use fa2parquet::*;

use anyhow::Result;

65 changes: 65 additions & 0 deletions crates/deepbiop-cli/src/cli/fa2parquet.rs
@@ -0,0 +1,65 @@
use anyhow::Result;
use clap::Parser;
use log::warn;

use std::path::PathBuf;

use super::set_up_threads;
use deepbiop_fa as fa;
use fa::encode::Encoder;

use deepbiop_utils as utils;

#[derive(Debug, Parser)]
pub struct FaToParquet {
/// path to the fa file
#[arg(value_name = "fa")]
fa: PathBuf,

/// if convert the fa file to parquet by chunk or not
#[arg(long)]
chunk: bool,

/// chunk size
#[arg(long, default_value = "1000000")]
chunk_size: usize,

/// result path
#[arg(long, value_name = "result")]
output: Option<PathBuf>,

/// threads number
#[arg(short, long, default_value = "2")]
threads: Option<usize>,
}

impl FaToParquet {
pub fn run(&self) -> Result<()> {
set_up_threads(self.threads)?;
let option = fa::encode::FaEncoderOptionBuilder::default()
.bases(fa::encode::BASES.to_vec())
.build()?;
let mut fa_encoder = fa::encode::ParquetEncoderBuilder::default()
.option(option)
.build()?;

if self.chunk {
fa_encoder.encode_chunk(&self.fa, self.chunk_size, false)?;
return Ok(());
}

let (record_batch, schema) = fa_encoder.encode(&self.fa)?;
// result file is fq_path with .parquet extension
let parquet_path = if let Some(path) = &self.output {
if path.with_extension("parquet").exists() {
warn!("{} already exists, overwriting", path.display());
}
path.with_extension("parquet")
} else {
self.fa.with_extension("parquet")
};
utils::io::write_parquet(parquet_path, record_batch, schema)?;

Ok(())
}
}
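
For orientation, here is a small self-contained sketch of how the clap attributes above map onto a command line. The `FaToParquetArgs` mirror struct and the file name are hypothetical; the real arguments are parsed through the `FaToParquet` struct above once it is wired into `cli::Commands` in main.rs below.

use clap::Parser;
use std::path::PathBuf;

/// Standalone mirror of the argument struct above, for illustration only.
#[derive(Debug, Parser)]
struct FaToParquetArgs {
    /// path to the fa file
    #[arg(value_name = "fa")]
    fa: PathBuf,

    /// convert the fa file to parquet chunk by chunk
    #[arg(long)]
    chunk: bool,

    /// chunk size
    #[arg(long, default_value = "1000000")]
    chunk_size: usize,

    /// result path
    #[arg(long, value_name = "result")]
    output: Option<PathBuf>,

    /// threads number
    #[arg(short, long, default_value = "2")]
    threads: Option<usize>,
}

fn main() {
    // A positional FASTA path plus --chunk, --chunk-size, --output and
    // -t/--threads; "fa2parquet" is only a placeholder argv[0].
    let args = FaToParquetArgs::parse_from([
        "fa2parquet", "reads.fa", "--chunk", "--chunk-size", "500000", "--threads", "4",
    ]);
    assert!(args.chunk);
    assert_eq!(args.chunk_size, 500_000);
    // With no --output given, FaToParquet::run falls back to <input>.parquet.
    assert_eq!(args.output, None);
}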
8 changes: 8 additions & 0 deletions crates/deepbiop-cli/src/main.rs
@@ -39,6 +39,9 @@ pub enum Commands

/// Fastq to fasta conversion.
FaToFq(cli::FaToFq),

+/// Fastq to parquet conversion.
+FaToParquet(cli::FaToParquet),
}

impl Display for Commands {
@@ -48,6 +51,7 @@ impl Display for Commands {
Commands::BamToFq(_) => write!(f, "bam2fq"),
Commands::FqToFa(_) => write!(f, "fq2fa"),
Commands::FaToFq(_) => write!(f, "fa2fq"),
+Commands::FaToParquet(_) => write!(f, "fa2parquet"),
}
}
}
@@ -100,6 +104,10 @@ fn main() -> Result<()> {
fa2fq.run().unwrap();
}

+Some(Commands::FaToParquet(fa2parquet)) => {
+fa2parquet.run().unwrap();
+}

None => {
println!("No command provided!");
}
20 changes: 3 additions & 17 deletions crates/deepbiop-fa/src/encode/option.rs
@@ -6,45 +6,31 @@ use pyo3::prelude::*;
use pyo3_stub_gen::derive::*;

pub const BASES: &[u8] = b"ATCGN";
-pub const QUAL_OFFSET: u8 = 33;

#[gen_stub_pyclass]
#[pyclass(module = "deepbiop.fa")]
#[derive(Debug, Builder, Default, Clone, Serialize, Deserialize)]
pub struct FaEncoderOption {
-#[pyo3(get, set)]
-#[builder(default = "QUAL_OFFSET")]
-pub qual_offset: u8,

#[pyo3(get, set)]
#[builder(default = "BASES.to_vec()")]
pub bases: Vec<u8>,

-#[pyo3(get, set)]
-#[builder(default = "2")]
-pub threads: usize,
}

#[gen_stub_pymethods]
#[pymethods]
impl FaEncoderOption {
#[new]
-fn py_new(qual_offset: u8, bases: String, threads: Option<usize>) -> Self {
+#[pyo3(signature = (bases))]
+fn py_new(bases: String) -> Self {
FaEncoderOptionBuilder::default()
-.qual_offset(qual_offset)
.bases(bases.as_bytes().to_vec())
-.threads(threads.unwrap_or(2))
.build()
.expect("Failed to build FqEncoderOption from Python arguments.")
}
}

impl Display for FaEncoderOption {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-write!(
-f,
-"FaEncoderOption {{ qual_offset: {}, bases: {:?}}}",
-self.qual_offset, self.bases
-)
+write!(f, "FaEncoderOption {{ bases: {:?} }}", self.bases)
}
}
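
Dropping `qual_offset` (and the threads knob) reflects that FASTA records carry no quality line, so the option type now only describes the base alphabet. A tiny self-contained sketch of the same `derive_builder` pattern, using a stand-in struct rather than the crate's own type:

use derive_builder::Builder;

pub const BASES: &[u8] = b"ATCGN";

/// Stand-in for `FaEncoderOption`; only the base alphabet is configurable.
#[derive(Debug, Builder, Default, Clone)]
pub struct DemoFaOption {
    /// Base alphabet used when encoding sequences.
    #[builder(default = "BASES.to_vec()")]
    pub bases: Vec<u8>,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Builder default: the ATCGN alphabet declared above.
    let opt = DemoFaOptionBuilder::default().build()?;
    assert_eq!(opt.bases, b"ATCGN");

    // Explicit alphabet, the way the CLI and the Python bindings pass one in.
    let custom = DemoFaOptionBuilder::default().bases(b"ACGT".to_vec()).build()?;
    assert_eq!(custom.bases, b"ACGT");
    Ok(())
}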
2 changes: 1 addition & 1 deletion crates/deepbiop-fa/src/encode/parquet.rs
@@ -178,11 +178,11 @@
#[test]
fn test_encode_fq_for_parquet() {
let option = FaEncoderOptionBuilder::default().build().unwrap();

let mut encoder = ParquetEncoderBuilder::default()
.option(option)
.build()
.unwrap();

let (record_batch, scheme) = encoder.encode("tests/data/test.fa").unwrap();
write_parquet("test.parquet", record_batch, scheme).unwrap();
// remove test.parquet
16 changes: 5 additions & 11 deletions crates/deepbiop-fa/src/python.rs
@@ -79,6 +79,7 @@ impl PyRecordData {

#[gen_stub_pyfunction(module = "deepbiop.fa")]
#[pyfunction]
+#[pyo3(signature = (records_data, file_path=None))]
fn write_fa(records_data: Vec<PyRecordData>, file_path: Option<PathBuf>) -> Result<()> {
let records: Vec<encode::RecordData> = records_data
.into_par_iter()
@@ -108,11 +109,9 @@ fn encode_fa_path_to_parquet_chunk(
chunk_size: usize,
parallel: bool,
bases: String,
-qual_offset: usize,
) -> Result<()> {
let option = encode::FaEncoderOptionBuilder::default()
.bases(bases.as_bytes().to_vec())
-.qual_offset(qual_offset as u8)
.build()?;

let mut fa_encoder = encode::ParquetEncoderBuilder::default()
@@ -124,15 +123,14 @@

#[gen_stub_pyfunction(module = "deepbiop.fa")]
#[pyfunction]
+#[pyo3(signature = (fa_path, bases, result_path=None))]
fn encode_fa_path_to_parquet(
fa_path: PathBuf,
bases: String,
-qual_offset: usize,
result_path: Option<PathBuf>,
) -> Result<()> {
let option = encode::FaEncoderOptionBuilder::default()
.bases(bases.as_bytes().to_vec())
-.qual_offset(qual_offset as u8)
.build()?;

let mut fa_encoder = encode::ParquetEncoderBuilder::default()
@@ -155,13 +153,9 @@

#[gen_stub_pyfunction(module = "deepbiop.fa")]
#[pyfunction]
-fn encode_fa_paths_to_parquet(
-fa_path: Vec<PathBuf>,
-bases: String,
-qual_offset: usize,
-) -> Result<()> {
+fn encode_fa_paths_to_parquet(fa_path: Vec<PathBuf>, bases: String) -> Result<()> {
fa_path.iter().for_each(|path| {
-encode_fa_path_to_parquet(path.clone(), bases.clone(), qual_offset, None).unwrap();
+encode_fa_path_to_parquet(path.clone(), bases.clone(), None).unwrap();
});
Ok(())
}
@@ -191,7 +185,7 @@ fn convert_multiple_fas_to_one_fa(
// register fq sub_module
pub fn register_fa_module(parent_module: &Bound<'_, PyModule>) -> PyResult<()> {
let sub_module_name = "fa";
-let child_module = PyModule::new_bound(parent_module.py(), sub_module_name)?;
+let child_module = PyModule::new(parent_module.py(), sub_module_name)?;

child_module.add_class::<PyRecordData>()?;
child_module.add_class::<encode::FaEncoderOption>()?;
Binary file added crates/deepbiop-fa/tests/data/test.parquet
1 change: 1 addition & 0 deletions crates/deepbiop-fq/src/encode/option.rs
@@ -35,6 +35,7 @@ pub struct FqEncoderOption {
#[pymethods]
impl FqEncoderOption {
#[new]
+#[pyo3(signature = (kmer_size, qual_offset, bases, vectorized_target, threads=None))]
fn py_new(
kmer_size: u8,
qual_offset: u8,
13 changes: 8 additions & 5 deletions crates/deepbiop-fq/src/predicts.rs
@@ -66,6 +66,7 @@ pub struct Predict {
#[pymethods]
impl Predict {
#[new]
+#[pyo3(signature = (prediction, seq, id, is_truncated, qual=None))]
pub fn new(
prediction: Vec<i8>,
seq: String,
@@ -148,6 +149,7 @@ impl Predict {
}

/// Show the information of the prediction
+#[pyo3(signature = (smooth_interval, text_width=None))]
pub fn show_info(
&self,
smooth_interval: Vec<(usize, usize)>,
@@ -175,15 +177,16 @@ })?;
})?;

// Convert JSON string to Python bytes
-Ok(PyBytes::new_bound(py, serialized.as_bytes()).into())
+Ok(PyBytes::new(py, serialized.as_bytes()).into())
}

fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-// Expect a bytes object for state
-let state_bytes: &PyBytes = state.extract(py)?;
+// Convert PyObject to PyBytes
+let state_bytes = state.downcast_bound::<PyBytes>(py)?;

-// Deserialize the JSON string into the current instance
-*self = serde_json::from_slice(state_bytes.as_bytes()).map_err(|e| {
+// Get the bytes and deserialize
+let bytes = state_bytes.as_bytes();
+*self = serde_json::from_slice(bytes).map_err(|e| {
PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(format!(
"Failed to deserialize: {}",
e
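
The `__getstate__`/`__setstate__` hunks are the same pyo3 0.23 migration seen elsewhere in this commit: `PyBytes::new` replaces `PyBytes::new_bound`, and the pickled state is recovered with `downcast_bound` instead of extracting a GIL-bound `&PyBytes`. A compact sketch of this serde-backed pickle pattern on a toy pyclass (not the crate's `Predict` type):

use pyo3::prelude::*;
use pyo3::types::PyBytes;
use serde::{Deserialize, Serialize};

/// Toy pyclass using the same serde-backed pickle hooks as `Predict`.
#[pyclass]
#[derive(Default, Serialize, Deserialize)]
struct Demo {
    #[pyo3(get, set)]
    label: String,
}

#[pymethods]
impl Demo {
    /// Serialize the instance to JSON bytes (pyo3 0.23: `PyBytes::new`).
    fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
        let serialized = serde_json::to_string(self).map_err(|e| {
            PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(format!("Failed to serialize: {e}"))
        })?;
        Ok(PyBytes::new(py, serialized.as_bytes()).into())
    }

    /// Restore the instance from JSON bytes; `downcast_bound` replaces the
    /// pre-upgrade `extract` into a GIL-bound `&PyBytes`.
    fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
        let state_bytes = state.downcast_bound::<PyBytes>(py)?;
        *self = serde_json::from_slice(state_bytes.as_bytes()).map_err(|e| {
            PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(format!("Failed to deserialize: {e}"))
        })?;
        Ok(())
    }
}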