Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Update fetch_records method to use fastq reader and improve… #54

Merged
merged 16 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
ce7622b
refactor: Update fetch_records method to use fastq reader and improve…
cauliyang Jan 17, 2025
a927fa5
style: Update pyo3 version and format dependencies in Cargo.toml
cauliyang Jan 21, 2025
eb54b5d
feat: Add fas2one and fqs2one commands for batch file conversion
cauliyang Jan 21, 2025
f595522
feat: Add functions to combine multiple FASTA and FASTQ files into bg…
cauliyang Jan 21, 2025
07edeb7
style: Format function signatures for consistency in type annotations
cauliyang Jan 21, 2025
5e8b356
style: Update type annotations for consistency across multiple files
cauliyang Jan 21, 2025
7747606
feat: Enhance ExtractFq command to support random selection of reads …
cauliyang Jan 21, 2025
5e0e3ab
feat: Add random selection functionality for FASTQ records and expose…
cauliyang Jan 21, 2025
f4829b6
style: Update type annotations for consistency across bam.pyi, core.p…
cauliyang Jan 21, 2025
fe1f7b4
feat: Update ExtractFa to support optional read selection by path or …
cauliyang Jan 21, 2025
d069d09
feat: Add output option for specifying file paths in ExtractFa and Ex…
cauliyang Jan 21, 2025
3a14d43
♻️ Refactor select_record_from_fq_by_random function to use reservoir…
cauliyang Jan 22, 2025
3a793df
♻️ Optimize record selection in select_record_from_fq_by_random funct…
cauliyang Jan 22, 2025
80a4563
♻️ Optimize record selection in select_record_from_fq_by_random funct…
cauliyang Jan 22, 2025
bb5be05
feat: Update output file extensions for FasToOne and FqsToOne command…
cauliyang Jan 22, 2025
d4c6d15
feat: Add markdown help option and documentation for deepbiop-cli
cauliyang Jan 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ repos:
- id: cargo-check

- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: "v0.9.1"
rev: "v0.9.2"
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix, --unsafe-fixes]
Expand Down
24 changes: 12 additions & 12 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ repository = "https://github.com/cauliyang/DeepBioP"
license = "Apache-2.0"

[workspace.dependencies]
pyo3 = { version = "0.23.3", features = [
"abi3-py39",
"extension-module",
"anyhow",
pyo3 = { version = "0.23.4", features = [
"abi3-py39",
"extension-module",
"anyhow",
] }
pyo3-stub-gen = "0.6.2"
thiserror = "2.0"
Expand All @@ -25,13 +25,13 @@ rayon = { version = "1.10" }
log = "0.4"
pyo3-log = "0.12.1"
noodles = { version = "0.87.0", features = [
"bgzf",
"core",
"csi",
"fasta",
"fastq",
"sam",
"bam",
"bgzf",
"core",
"csi",
"fasta",
"fastq",
"sam",
"bam",
] }

bio = "2.0"
Expand All @@ -58,7 +58,7 @@ candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.
colored = "3.0"
textwrap = "0.16"
flate2 = { version = "1.0.35", features = [
"zlib-ng",
"zlib-ng",
], default-features = false }

deepbiop-fq = { version = "0.1.14", path = "crates/deepbiop-fq" }
Expand Down
1 change: 1 addition & 0 deletions crates/deepbiop-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ clap-verbosity-flag = "3.0"
ctrlc = "3.4"
human-panic = "2.0"
env_logger = "0.11.6"
clap-markdown = "0.1.4"

[[bin]]
path = "src/main.rs"
Expand Down
6 changes: 5 additions & 1 deletion crates/deepbiop-cli/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@ pub use extractfq::*;
pub mod extractfa;
pub use extractfa::*;

use anyhow::Result;
pub mod fqs2one;
pub use fqs2one::*;
pub mod fas2one;
pub use fas2one::*;

use anyhow::Result;
// Set up threads only once, using the common_opts from the top-level Cli struct
pub fn set_up_threads(threads: Option<usize>) -> Result<()> {
log::info!("Threads number: {:?}", threads.unwrap());
Expand Down
50 changes: 43 additions & 7 deletions crates/deepbiop-cli/src/cli/extractfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,16 @@ pub struct ExtractFa {
fa: PathBuf,

/// Path to the selected reads
#[arg(value_name = "reads")]
reads: PathBuf,
#[arg(long, value_name = "reads", conflicts_with = "number")]
reads: Option<PathBuf>,

/// The number of selected reads by random
#[arg(long, value_name = "number", conflicts_with = "reads")]
number: Option<usize>,

/// output bgzip compressed file
#[arg(long, value_name = "output")]
output: Option<PathBuf>,

/// threads number
#[arg(short, long, default_value = "2")]
Expand Down Expand Up @@ -50,18 +58,46 @@ impl ExtractFa {
pub fn run(&self) -> Result<()> {
set_up_threads(self.threads)?;

let reads = parse_reads(&self.reads)?;
info!("load {} selected reads from {:?}", reads.len(), &self.reads);
let records = if let Some(reads_path) = &self.reads {
let reads = parse_reads(reads_path)?;
let records = fa::io::select_record_from_fa(&self.fa, &reads)?;
info!("load {} selected reads from {:?}", reads.len(), reads_path);
records
} else if let Some(number) = self.number {
let records = fa::io::select_record_from_fq_by_random(&self.fa, number)?;
info!("select {} reads by random", number);
records
} else {
return Err(anyhow::anyhow!(
"Either --reads or --number must be specified"
));
};

let records = fa::io::select_record_from_fa_by_stream(&self.fa, &reads)?;
info!("collect {} records", records.len());

if self.compressed {
let file_path = self.fa.with_extension("selected.fa.gz");
let file_path = if let Some(path) = &self.output {
let path = path.with_extension("fa.gz");
if path.exists() {
info!("{} already exists, overwriting", path.display());
}
path
} else {
self.fa.with_extension("selected.fa.gz")
};

info!("write to {}", &file_path.display());
fa::io::write_bzip_fa_parallel_for_noodle_record(&records, file_path, self.threads)?;
} else {
let file_path = self.fa.with_extension("selected.fa");
let file_path = if let Some(path) = &self.output {
let path = path.with_extension("fa");
if path.exists() {
info!("{} already exists, overwriting", path.display());
}
path
} else {
self.fa.with_extension("selected.fa")
};
info!("write to {}", &file_path.display());
fa::io::write_fa_for_noodle_record(&records, file_path)?;
}
Expand Down
58 changes: 46 additions & 12 deletions crates/deepbiop-cli/src/cli/extractfq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,29 @@ use bstr::BString;
use clap::Parser;
use deepbiop_fq as fq;

use super::set_up_threads;
use log::info;
use std::io::BufRead;
use std::io::BufReader;
use std::path::{Path, PathBuf};

use super::set_up_threads;
use log::info;

#[derive(Debug, Parser)]
pub struct ExtractFq {
/// path to the bam file
/// path to the fq file
#[arg(value_name = "fq")]
fq: PathBuf,

/// Path to the selected reads
#[arg(value_name = "reads")]
reads: PathBuf,
#[arg(long, value_name = "reads", conflicts_with = "number")]
reads: Option<PathBuf>,

/// The number of selected reads by random
#[arg(long, value_name = "number", conflicts_with = "reads")]
number: Option<usize>,

/// output bgzip compressed file
#[arg(long, value_name = "output")]
output: Option<PathBuf>,

/// threads number
#[arg(short, long, default_value = "2")]
Expand All @@ -35,7 +42,6 @@ fn parse_reads<P: AsRef<Path>>(reads: P) -> Result<HashSet<BString>> {
let file = std::fs::File::open(reads.as_ref())?;

let reader = BufReader::new(file);

let mut reads = HashSet::new();

for line in reader.lines() {
Expand All @@ -50,18 +56,46 @@ impl ExtractFq {
pub fn run(&self) -> Result<()> {
set_up_threads(self.threads)?;

let reads = parse_reads(&self.reads)?;
info!("load {} selected reads from {:?}", reads.len(), &self.reads);
let records = if let Some(reads_path) = &self.reads {
let reads = parse_reads(reads_path)?;
let records = fq::io::select_record_from_fq(&self.fq, &reads)?;

info!("load {} selected reads from {:?}", reads.len(), reads_path);
records
} else if let Some(number) = self.number {
let records = fq::io::select_record_from_fq_by_random(&self.fq, number)?;
info!("select {} reads by random", number);
records
} else {
return Err(anyhow::anyhow!(
"Either --reads or --number must be specified"
));
};

let records = fq::io::select_record_from_fq(&self.fq, &reads)?;
info!("collect {} records", records.len());

if self.compressed {
let file_path = self.fq.with_extension("selected.fq.gz");
let file_path = if let Some(path) = &self.output {
let path = path.with_extension("fq.gz");
if path.exists() {
info!("{} already exists, overwriting", path.display());
}
path
} else {
self.fq.with_extension("selected.fq.gz")
};
info!("write to {}", &file_path.display());
fq::io::write_bgzip_fq_parallel_for_noodle_record(&records, file_path, self.threads)?;
} else {
let file_path = self.fq.with_extension("selected.fq");
let file_path = if let Some(path) = &self.output {
let path = path.with_extension("fq");
if path.exists() {
info!("{} already exists, overwriting", path.display());
}
path
} else {
self.fq.with_extension("selected.fq")
};
info!("write to {}", &file_path.display());
fq::io::write_fq_for_noodle_record(&records, file_path)?;
}
Expand Down
30 changes: 30 additions & 0 deletions crates/deepbiop-cli/src/cli/fas2one.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use anyhow::Result;
use clap::Parser;
use deepbiop_fa as fa;

use std::path::PathBuf;

use super::set_up_threads;

// Command-line arguments for the `fas2one` subcommand.
//
// NOTE: the `///` comments on the fields below are picked up by clap's derive
// macro and shown as `--help` text, so they are user-facing strings.
#[derive(Debug, Parser)]
pub struct FasToOne {
    /// paths to the fa files to combine (one or more)
    // `Vec<T>` positionals are optional by default in clap; an empty input list
    // is meaningless for a merge, so reject it at parse time.
    #[arg(value_name = "fas", action = clap::ArgAction::Append, required = true)]
    fas: Vec<PathBuf>,

    /// output bgzip compressed file
    #[arg(long, value_name = "output")]
    output: PathBuf,

    /// threads number
    #[arg(short, long, default_value = "2")]
    threads: Option<usize>,
}

impl FasToOne {
    /// Combine every input file listed in `fas` into a single output file.
    ///
    /// The destination is derived from `output`: a path whose file name already
    /// ends in `.fa.gz` is used unchanged, otherwise its extension is replaced
    /// with `fa.gz` (so `merged` and `merged.fa` both become `merged.fa.gz`).
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `set_up_threads` or by
    /// `fa::io::convert_multiple_fas_to_one_bgzip_fa`.
    pub fn run(&self) -> Result<()> {
        set_up_threads(self.threads)?;

        // `with_extension` only swaps the text after the last dot, so a path that
        // already ends in `.fa.gz` would otherwise become `<name>.fa.fa.gz`.
        let already_suffixed = self
            .output
            .file_name()
            .and_then(|name| name.to_str())
            .is_some_and(|name| name.ends_with(".fa.gz"));

        let output = if already_suffixed {
            self.output.clone()
        } else {
            self.output.with_extension("fa.gz")
        };

        fa::io::convert_multiple_fas_to_one_bgzip_fa(&self.fas, output, true)?;
        Ok(())
    }
}
2 changes: 1 addition & 1 deletion crates/deepbiop-cli/src/cli/fq2parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ pub struct FqToParquet {
chunk_size: usize,

/// result path
#[arg(long, value_name = "result")]
#[arg(long, value_name = "output")]
output: Option<PathBuf>,

/// threads number
Expand Down
30 changes: 30 additions & 0 deletions crates/deepbiop-cli/src/cli/fqs2one.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use anyhow::Result;
use clap::Parser;
use deepbiop_fq as fq;

use std::path::PathBuf;

use super::set_up_threads;

// Command-line arguments for the `fqs2one` subcommand.
//
// NOTE: the `///` comments on the fields below are picked up by clap's derive
// macro and shown as `--help` text, so they are user-facing strings.
#[derive(Debug, Parser)]
pub struct FqsToOne {
    /// paths to the fq files to combine (one or more)
    // `Vec<T>` positionals are optional by default in clap; an empty input list
    // is meaningless for a merge, so reject it at parse time.
    #[arg(value_name = "fqs", action = clap::ArgAction::Append, required = true)]
    fqs: Vec<PathBuf>,

    /// output bgzip compressed file
    #[arg(long, value_name = "output")]
    output: PathBuf,

    /// threads number
    #[arg(short, long, default_value = "2")]
    threads: Option<usize>,
}

impl FqsToOne {
    /// Combine every input file listed in `fqs` into a single output file.
    ///
    /// The destination is derived from `output`: a path whose file name already
    /// ends in `.fq.gz` is used unchanged, otherwise its extension is replaced
    /// with `fq.gz` (so `merged` and `merged.fq` both become `merged.fq.gz`).
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `set_up_threads` or by
    /// `fq::io::convert_multiple_fqs_to_one_bgzip_fq`.
    pub fn run(&self) -> Result<()> {
        set_up_threads(self.threads)?;

        // `with_extension` only swaps the text after the last dot, so a path that
        // already ends in `.fq.gz` would otherwise become `<name>.fq.fq.gz`.
        let already_suffixed = self
            .output
            .file_name()
            .and_then(|name| name.to_str())
            .is_some_and(|name| name.ends_with(".fq.gz"));

        let output = if already_suffixed {
            self.output.clone()
        } else {
            self.output.with_extension("fq.gz")
        };

        fq::io::convert_multiple_fqs_to_one_bgzip_fq(&self.fqs, output, true)?;
        Ok(())
    }
}
24 changes: 24 additions & 0 deletions crates/deepbiop-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ struct Cli {

#[command(flatten)]
verbose: clap_verbosity_flag::Verbosity,

#[arg(long, hide = true)]
markdown_help: bool,
}

#[derive(Subcommand, Debug)]
Expand Down Expand Up @@ -51,6 +54,12 @@ pub enum Commands {

/// Extract fasta reads from a fasta file.
ExtractFa(cli::ExtractFa),

/// Multiple Fastqs to one Fastq conversion.
FqsToOne(cli::FqsToOne),

/// Multiple Fastas to one Fasta conversion.
FasToOne(cli::FasToOne),
}

impl Display for Commands {
Expand All @@ -64,6 +73,8 @@ impl Display for Commands {
Commands::ExtractFq(_) => write!(f, "extractfq"),
Commands::ExtractFa(_) => write!(f, "extractfa"),
Commands::FqToParquet(_) => write!(f, "fq2parquet"),
Commands::FqsToOne(_) => write!(f, "fqs2one"),
Commands::FasToOne(_) => write!(f, "fas2one"),
}
}
}
Expand Down Expand Up @@ -99,6 +110,11 @@ fn main() -> Result<()> {
return Ok(());
}

if cli.markdown_help {
clap_markdown::print_help_markdown::<Cli>();
return Ok(());
}

match &cli.command {
Some(Commands::CountChimeric(count_chimeric)) => {
count_chimeric.run().unwrap();
Expand Down Expand Up @@ -132,6 +148,14 @@ fn main() -> Result<()> {
extractfa.run().unwrap();
}

Some(Commands::FqsToOne(fqs2one)) => {
fqs2one.run().unwrap();
}

Some(Commands::FasToOne(fas2one)) => {
fas2one.run().unwrap();
}

None => {
println!("No command provided!");
}
Expand Down
1 change: 1 addition & 0 deletions crates/deepbiop-fa/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ lexical = { workspace = true }
flate2 = { workspace = true }
bstr = { workspace = true }
walkdir = { workspace = true }
rand = { workspace = true }

parquet = { workspace = true }
arrow = { workspace = true }
Expand Down
Loading
Loading