Skip to content

Commit

Permalink
feat: add memchr and clap dependencies, enhance filterx functions wit…
Browse files Browse the repository at this point in the history
…h new parameters and types
  • Loading branch information
dwpeng committed Nov 14, 2024
1 parent 9b4aa50 commit ce9cb99
Show file tree
Hide file tree
Showing 26 changed files with 613 additions and 109 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ flate2 = { version = "1.0.34", features = ["zlib-rs"] }
regex = "1.11.1"
colored = "2.1.0"
lazy_static = "1.5.0"
memchr = "2.7.4"


[profile.dev]
Expand Down
19 changes: 18 additions & 1 deletion src/filterx/src/args.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use clap::{ArgAction, Args, Parser, Subcommand, ValueHint};
use filterx_source::{FastaRecordType, QualityType};

static LONG_ABOUT: &'static str = include_str!("./long.txt");

Expand Down Expand Up @@ -55,7 +56,7 @@ pub struct ShareArgs {
#[clap(short='o', long, value_hint=ValueHint::FilePath)]
pub output: Option<String>,

/// output as table format
/// output as table format, only output to stdout
#[clap(short = 't', long, default_value = "false", action = ArgAction::SetTrue)]
pub table: Option<bool>,
}
Expand Down Expand Up @@ -110,6 +111,14 @@ pub struct FastaCommand {
/// limit sequence number, 0 means no limit
#[clap(long, default_value = "0")]
pub limit: Option<usize>,

/// sequence type, default is DNA
#[clap(long, default_value = "auto")]
pub r#type: Option<FastaRecordType>,

/// detect sequence type by first N sequences
#[clap(long, default_value = "3")]
pub detect_size: Option<usize>,
}

#[derive(Debug, Clone, Parser)]
Expand All @@ -132,6 +141,14 @@ pub struct FastqCommand {
/// limit sequence number, 0 means no limit
#[clap(long, default_value = "0")]
pub limit: Option<usize>,

/// quality type, phred33, phred64, auto, auto: will try to detect
#[clap(long, default_value = "auto")]
pub phred: Option<QualityType>,

/// detect quality type by first N sequences
#[clap(long, default_value = "100")]
pub detect_size: Option<usize>,
}

#[derive(Debug, Clone, Parser)]
Expand Down
6 changes: 3 additions & 3 deletions src/filterx/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::args::{Cli, Command};
use crate::files::csv::filterx_csv;
use crate::files::fasta::filterx_fasta;
use crate::files::fastq::filterx_fastq;
use crate::files::gxf::filterx_gxf;
use crate::files::gxf::{filterx_gxf, GxfType};
use crate::files::sam::filterx_sam;
use crate::files::vcf::filterx_vcf;

Expand All @@ -18,7 +18,7 @@ pub fn cli() -> FilterxResult<()> {
Command::Fastq(cmd) => filterx_fastq(cmd),
Command::Sam(cmd) => filterx_sam(cmd),
Command::Vcf(cmd) => filterx_vcf(cmd),
Command::GFF(cmd) => filterx_gxf(cmd),
Command::GTF(cmd) => filterx_gxf(cmd),
Command::GFF(cmd) => filterx_gxf(cmd, GxfType::Gff),
Command::GTF(cmd) => filterx_gxf(cmd, GxfType::Gtf),
}
}
9 changes: 8 additions & 1 deletion src/filterx/src/files/fasta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ pub fn filterx_fasta(cmd: FastaCommand) -> FilterxResult<()> {
},
chunk: long,
no_comment,
r#type,
limit,
detect_size,
} = cmd;

let _limit = match limit {
Expand All @@ -39,7 +41,12 @@ pub fn filterx_fasta(cmd: FastaCommand) -> FilterxResult<()> {
let names = names.iter().map(|x| x.to_string()).collect();

let expr = util::merge_expr(expr);
let mut source = FastaSource::new(path.as_str(), !no_comment.unwrap())?;
let mut source = FastaSource::new(
path.as_str(),
!no_comment.unwrap(),
r#type.unwrap(),
detect_size.unwrap(),
)?;
let output = util::create_buffer_writer(output)?;
let mut output = Box::new(output);
if expr.is_empty() {
Expand Down
10 changes: 9 additions & 1 deletion src/filterx/src/files/fastq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ pub fn filterx_fastq(cmd: FastqCommand) -> FilterxResult<()> {
chunk: long,
no_comment,
no_quality,
phred,
limit,
detect_size,
} = cmd;

let _limit = match limit {
Expand Down Expand Up @@ -51,7 +53,13 @@ pub fn filterx_fastq(cmd: FastqCommand) -> FilterxResult<()> {

let names = names.iter().map(|x| x.to_string()).collect::<Vec<String>>();
let expr = util::merge_expr(expr);
let mut source = FastqSource::new(path.as_str(), !no_comment.unwrap(), !no_quality.unwrap())?;
let mut source = FastqSource::new(
path.as_str(),
!no_comment.unwrap(),
!no_quality.unwrap(),
phred.unwrap(),
detect_size.unwrap(),
)?;
let output = util::create_buffer_writer(output)?;
let mut output = Box::new(output);
if expr.is_empty() {
Expand Down
20 changes: 17 additions & 3 deletions src/filterx/src/files/gxf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,22 @@ fn init_gxf_schema() -> Option<SchemaRef> {
util::create_schemas(files)
}

pub fn filterx_gxf(cmd: GFFCommand) -> FilterxResult<()> {
// Selects which of the two annotation flavours a `filterx_gxf` run is handling.
// The CLI dispatcher passes `Gff` for the GFF sub-command and `Gtf` for the GTF
// sub-command; the value is later converted into the matching `SourceType`.
#[derive(Debug, Clone, Copy, PartialEq, clap::ValueEnum)]
pub enum GxfType {
// Converted to `SourceType::Gff`.
Gff,
// Converted to `SourceType::Gtf`.
Gtf,
}

/// Maps each GXF flavour onto the source type of the same name, so callers can
/// write `gxf_type.into()` wherever a `SourceType` is expected.
impl From<GxfType> for SourceType {
    fn from(kind: GxfType) -> Self {
        match kind {
            GxfType::Gff => Self::Gff,
            GxfType::Gtf => Self::Gtf,
        }
    }
}

pub fn filterx_gxf(cmd: GFFCommand, gxf_type: GxfType) -> FilterxResult<()> {
let GFFCommand {
share_args:
ShareArgs {
Expand All @@ -33,7 +48,6 @@ pub fn filterx_gxf(cmd: GFFCommand) -> FilterxResult<()> {
table,
},
} = cmd;

let comment_prefix = "#";
let separator = "\t";
let writer = util::create_buffer_writer(output.clone())?;
Expand All @@ -55,7 +69,7 @@ pub fn filterx_gxf(cmd: GFFCommand) -> FilterxResult<()> {
)?;
let mut s = DataframeSource::new(lazy_df.clone());
s.set_init_column_names(&names);
let mut vm = Vm::from_source(Source::new(s.into(), SourceType::Gxf));
let mut vm = Vm::from_source(Source::new(s.into(), gxf_type.into()));
let expr = util::merge_expr(expr);
let writer = Box::new(writer);
vm.set_writer(writer);
Expand Down
15 changes: 15 additions & 0 deletions src/filterx_engine/src/engine_macro.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,18 @@ macro_rules! builtin_function {
)*
};
}

/// Guards a construct that must not be evaluated inside a `print` format string.
///
/// `$vm` is the virtual machine being evaluated against and `$target` is a
/// string literal naming the offending construct (for example `"="`).
/// When the VM's `mode` equals `VmMode::Printable`, a coloured hint naming
/// `$target` is emitted through `$vm.hint` and `print_and_exit` is called;
/// in any other mode the macro does nothing.
#[macro_export]
macro_rules! execuable {
    ($vm:expr, $target:literal) => {
        // The path is spelled out through `$crate` instead of emitting a `use`
        // item: an exported macro must not depend on what `crate` means at the
        // call site, and a `use` here would collide with itself if the macro
        // were invoked twice in the same block.
        if $vm.mode == $crate::vm::VmMode::Printable {
            let h = &mut $vm.hint;
            h.white("Can't use ")
                .red($target)
                .white(" in builtin function")
                .green(" `print`.")
                .print_and_exit();
        }
    };
}
3 changes: 2 additions & 1 deletion src/filterx_engine/src/eval/assign.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ use filterx_core::FilterxResult;
use crate::eval::Eval;
use crate::vm::Vm;

use crate::eval;
use crate::{eval, execuable};

impl<'a> Eval<'a> for ast::StmtAssign {
type Output = Value;
fn eval(&self, vm: &'a mut Vm) -> FilterxResult<Self::Output> {
execuable!(vm, "=");
if self.targets.len() != 1 {
let h = &mut vm.hint;
h.white("Dosn't support unpacking multiple assignment expression")
Expand Down
51 changes: 20 additions & 31 deletions src/filterx_engine/src/eval/call/builtin/column/print.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,19 @@
use polars::{
frame::DataFrame,
io::SerWriter,
prelude::{format_str, IntoLazy},
};
use polars::{io::SerWriter, prelude::format_str};

use crate::vm::VmMode;

use super::super::*;
use filterx_source::Source;
use polars::prelude::{col, Expr};
use regex::Regex;

use lazy_static::lazy_static;

lazy_static! {
static ref REGEX_PATTERN: Regex = Regex::new(r"\{([\(\)a-zA-Z0-9_\-+*\\ ]*)\}").unwrap();
static ref REGEX_PATTERN: Regex = Regex::new(r"\{([\(\)a-zA-Z0-9_\-+/*\\ ]*)\}").unwrap();
static ref REGEX_VARNAME: Regex = Regex::new(r"^[_a-zA-Z]+[a-zA-Z_0-9]*$").unwrap();
}

fn parse_format_string(
source_type: SourceType,
valid_names: Option<&Vec<String>>,
s: &str,
) -> FilterxResult<(String, Option<Vec<Expr>>)> {
fn parse_format_string(s: &str, vm: &mut Vm) -> FilterxResult<(String, Option<Vec<Expr>>)> {
// value: "xxxxx" -> "xxxxx"
// value: "xxx_{seq}" -> "xxx_{}" and col("seq")
// value: "xxx_{seq}_{seq}" -> "xxx_{}_{}" and col("seq"), col("seq")
Expand All @@ -38,11 +31,6 @@ fn parse_format_string(
let re = &REGEX_PATTERN;
let fmt = re.replace_all(s, "{}").to_string();
let mut cols = Vec::new();
let source = DataframeSource::new(DataFrame::empty().lazy());
let mut vm = Vm::from_source(Source::new(source.into(), source_type));
if let Some(valid_names) = valid_names {
vm.source_mut().set_init_column_names(valid_names);
}
for cap in re.captures_iter(s) {
let item = cap.get(1).unwrap().as_str();
if item.is_empty() {
Expand All @@ -58,12 +46,15 @@ fn parse_format_string(
}
let ast = vm.ast(item)?;
if !ast.is_expression() {
return Err(FilterxError::RuntimeError(
"Error format string, only support expression".to_string(),
));
let h = &mut vm.hint;
h.white("Only support expression in ")
.cyan("print")
.white(", but got ")
.red(item)
.print_and_exit();
}
let ast = ast.expression().unwrap();
let ast = ast.eval(&mut vm)?;
let ast = ast.eval(vm)?;
let value = ast.expr()?;
cols.push(value);
}
Expand All @@ -73,17 +64,17 @@ fn parse_format_string(
#[test]
fn test_parse_format_string() {
use polars::prelude::col;

let mut vm = Vm::mock(SourceType::Fasta);
let s = "xxx_{seq}";
let (fmt, cols) = parse_format_string(SourceType::Fasta, None, s).unwrap();
let (fmt, cols) = parse_format_string(s, &mut vm).unwrap();
assert_eq!(fmt, "xxx_{}");
assert!(cols.is_some());
let cols = cols.unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0], col("seq"));

let s = "xxx_{seq}_{seq}";
let (fmt, cols) = parse_format_string(SourceType::Fasta, None, s).unwrap();
let (fmt, cols) = parse_format_string(s, &mut vm).unwrap();
assert_eq!(fmt, "xxx_{}_{}");
assert!(cols.is_some());
let cols = cols.unwrap();
Expand All @@ -92,12 +83,12 @@ fn test_parse_format_string() {
assert_eq!(cols[1], col("seq"));

let s = "xxx";
let (fmt, cols) = parse_format_string(SourceType::Fasta, None, s).unwrap();
let (fmt, cols) = parse_format_string(s, &mut vm).unwrap();
assert_eq!(fmt, "xxx");
assert!(cols.is_none());

let s = "xxx{len(seq)}";
let (fmt, cols) = parse_format_string(SourceType::Fasta, None, s).unwrap();
let (fmt, cols) = parse_format_string(s, &mut vm).unwrap();
assert_eq!(fmt, "xxx{}");
assert!(cols.is_some());
let cols = cols.unwrap();
Expand All @@ -123,11 +114,9 @@ pub fn print<'a>(vm: &'a mut Vm, args: &Vec<ast::Expr>) -> FilterxResult<value::
let (fmt, cols) = if let Some(value) = vm.expr_cache.get(&value) {
(value.0.clone(), value.1.clone())
} else {
let (fmt_, cols_) = parse_format_string(
vm.source_type(),
Some(&vm.source().ret_column_names),
&value,
)?;
vm.set_mode(VmMode::Printable);
let (fmt_, cols_) = parse_format_string(&value, vm)?;
vm.set_mode(VmMode::Expression);
let cols_ = cols_.unwrap_or(vec![]);
vm.expr_cache
.insert(value.clone(), (fmt_.clone(), cols_.clone()));
Expand Down
2 changes: 1 addition & 1 deletion src/filterx_engine/src/eval/call/builtin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ pub use super::*;
pub use crate::ast;
pub use crate::eval::Eval;
pub use crate::vm::Vm;
pub use crate::{eval, eval_col};
pub use crate::{eval, eval_col, execuable};
pub use filterx_core::{value, FilterxError, FilterxResult};
pub use filterx_source::{source::SourceType, DataframeSource};

Expand Down
22 changes: 18 additions & 4 deletions src/filterx_engine/src/eval/call/builtin/sequence/gc.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use super::super::*;
use filterx_source::FastaRecordType;
use polars::prelude::*;

use polars::prelude::col;
use polars_arrow::{
array::{ArrayRef, Float32Array, Utf8ViewArray},
buffer::Buffer,
Expand Down Expand Up @@ -52,9 +52,23 @@ fn compute_gc(s: Column) -> PolarsResult<Option<Column>> {

pub fn gc<'a>(vm: &'a mut Vm, args: &Vec<ast::Expr>) -> FilterxResult<value::Value> {
expect_args_len(args, 1)?;
if vm.source.source_type.is_fasta() || vm.source.source_type.is_fastq() {
if vm.source.source_type.is_fasta() {
let fasta = vm.source.get_fasta()?;
match fasta.record_type {
FastaRecordType::Protein => {
let h = &mut vm.hint;
h.white("gc: protein sequence is not supported")
.print_and_exit();
}
_ => {}
}
}
}
let col_name = eval_col!(vm, &args[0], "gc: expected a column name as first argument");
let col_name = col_name.column()?;
vm.source_mut().has_column(col_name);
let e = col(col_name).map(compute_gc, GetOutput::float_type());
let name = col_name.column()?;
let e = col_name.expr()?;
vm.source_mut().has_column(name);
let e = e.map(compute_gc, GetOutput::float_type());
return Ok(value::Value::named_expr(None, e));
}
4 changes: 3 additions & 1 deletion src/filterx_engine/src/eval/call/builtin/sequence/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,7 @@ builtin_function! {
gc,
revcomp,
to_fasta,
to_fastq
to_fastq,
qual,
phred
}
15 changes: 15 additions & 0 deletions src/filterx_engine/src/eval/call/builtin/sequence/phred.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
use super::super::*;
/// Reports the quality encoding recorded on the current FASTQ source and exits.
///
/// For any source type other than `SourceType::Fastq` a hint saying that only
/// fastq is supported is printed instead. Both paths finish through
/// `print_and_exit`; the only early return is the error propagated by
/// `get_fastq()?`.
pub fn phred(vm: &mut Vm) -> FilterxResult<value::Value> {
    // Guard clause: anything that is not FASTQ gets the "unsupported" hint.
    if vm.source_type() != SourceType::Fastq {
        let hint = &mut vm.hint;
        hint.white("phred: Only ")
            .cyan("fastq")
            .white(" format is supported for now.")
            .print_and_exit();
    }

    // Render the quality type up front so the source borrow has ended before
    // the hint builder is borrowed mutably.
    let quality = vm.source.get_fastq()?.quality_type.to_string();
    let hint = &mut vm.hint;
    hint.white("phred: ").green(&quality).print_and_exit();
}
Loading

0 comments on commit ce9cb99

Please sign in to comment.