From f75837594e42fc4b6aed30b6abfe5325e69f6862 Mon Sep 17 00:00:00 2001
From: dwpeng
Date: Fri, 8 Nov 2024 19:57:10 +0800
Subject: [PATCH] feat: add support for FASTA and FASTQ formats with corresponding converting functions

---
 src/engine/eval/call/builtin/to_fasta.rs |  93 ++++++++++++++++++
 src/engine/eval/call/builtin/to_fastq.rs | 117 +++++++++++++++++++++++
 2 files changed, 210 insertions(+)
 create mode 100644 src/engine/eval/call/builtin/to_fasta.rs
 create mode 100644 src/engine/eval/call/builtin/to_fastq.rs

diff --git a/src/engine/eval/call/builtin/to_fasta.rs b/src/engine/eval/call/builtin/to_fasta.rs
new file mode 100644
index 0000000..37f8491
--- /dev/null
+++ b/src/engine/eval/call/builtin/to_fasta.rs
@@ -0,0 +1,93 @@
+use super::*;
+use crate::engine::vm::VmSourceType;
+use std::io::Write;
+
+/// Writes the rows of the current source to `vm.writer` as FASTA records.
+///
+/// The `name` and `seq` columns are located by position in
+/// `vm.source.ret_column_names`. If either is absent, or the two columns differ in
+/// length, the problem is reported through `vm.hint` via `print_and_exit()`, which
+/// does not return. Each row becomes a `>name` line followed by a sequence line.
+/// After every record `vm.status.consume_rows` is incremented; once it reaches
+/// `vm.status.limit_rows`, `vm.status.stop` is set and writing ends.
+///
+/// # Errors
+///
+/// Propagates errors from collecting the lazy frame and from writing a record.
+///
+/// # Panics
+///
+/// Panics if `vm.writer` holds no writer, or if a column position taken from
+/// `ret_column_names` is out of range for the collected frame.
+fn print_fasta(vm: &mut Vm) -> FilterxResult {
+    vm.status.printed = true;
+    let name_index = vm.source.ret_column_names.iter().position(|x| x == "name");
+    let seq_index = vm.source.ret_column_names.iter().position(|x| x == "seq");
+
+    // `print_and_exit()` never returns, so each `None` arm needs no fallback value.
+    let name_index = match name_index {
+        Some(index) => index,
+        None => {
+            let h = &mut vm.hint;
+            h.white("Lost ")
+                .cyan("'name'")
+                .white(" column.")
+                .print_and_exit()
+        }
+    };
+    let seq_index = match seq_index {
+        Some(index) => index,
+        None => {
+            let h = &mut vm.hint;
+            h.white("Lost ")
+                .cyan("'seq'")
+                .white(" column.")
+                .print_and_exit()
+        }
+    };
+
+    let df = vm.source.lazy().collect()?;
+    let columns = df.get_columns();
+    let name_col = &columns[name_index];
+    let seq_col = &columns[seq_index];
+    if name_col.len() != seq_col.len() {
+        let h = &mut vm.hint;
+        h.white("Length of ")
+            .cyan("'name'")
+            .white(" and ")
+            .cyan("'seq'")
+            .white(" columns are different.")
+            .print_and_exit();
+    }
+    let writer = vm.writer.as_mut().unwrap().as_mut();
+    for i in 0..name_col.len() {
+        // `i` is in range for both columns: their lengths were checked equal above.
+        let name = name_col.get(i).unwrap();
+        let seq = seq_col.get(i).unwrap();
+        // When `get_str()` yields nothing, the literal placeholder is written instead.
+        writeln!(writer, ">{}", name.get_str().unwrap_or("name"))?;
+        writeln!(writer, "{}", seq.get_str().unwrap_or("seq"))?;
+        vm.status.consume_rows += 1;
+        if vm.status.consume_rows >= vm.status.limit_rows {
+            vm.status.stop = true;
+            break;
+        }
+    }
+    Ok(value::Value::None)
+}
+
+/// Builtin entry point that prints the current source in FASTA format.
+///
+/// Delegates to `print_fasta` when `vm.source_type` is `Fasta` or `Fastq`; for any
+/// other source type it reports the unsupported format through `vm.hint` via
+/// `print_and_exit()`, which does not return.
+pub fn to_fasta(vm: &mut Vm) -> FilterxResult {
+    if vm.source_type == VmSourceType::Fasta || vm.source_type == VmSourceType::Fastq {
+        return print_fasta(vm);
+    }
+    let h = &mut vm.hint;
+    h.white("Only ")
+        .cyan("fastq, fasta ")
+        .white("formats are supported for now.")
+        .print_and_exit();
+}
diff --git a/src/engine/eval/call/builtin/to_fastq.rs b/src/engine/eval/call/builtin/to_fastq.rs
new file mode 100644
index 0000000..0bd61f4
--- /dev/null
+++ b/src/engine/eval/call/builtin/to_fastq.rs
@@ -0,0 +1,117 @@
+use super::*;
+use crate::engine::vm::VmSourceType;
+use std::io::Write;
+
+/// Writes the rows of the current source to `vm.writer` as FASTQ records.
+///
+/// The `name`, `seq` and (optionally) `qual` columns are located by position in
+/// `vm.source.ret_column_names`. If `name` or `seq` is absent, or the two columns
+/// differ in length, the problem is reported through `vm.hint` via
+/// `print_and_exit()`, which does not return. Each row becomes four lines: `@name`,
+/// the sequence, `+`, and the quality string. When there is no `qual` column the
+/// quality line is a run of `?` as long as the sequence. After every record
+/// `vm.status.consume_rows` is incremented; once it reaches
+/// `vm.status.limit_rows`, `vm.status.stop` is set and writing ends.
+///
+/// # Errors
+///
+/// Propagates errors from collecting the lazy frame and from writing a record.
+///
+/// # Panics
+///
+/// Panics if `vm.writer` holds no writer, or if a column position taken from
+/// `ret_column_names` is out of range for the collected frame.
+fn print_fastq(vm: &mut Vm) -> FilterxResult {
+    vm.status.printed = true;
+    let name_index = vm.source.ret_column_names.iter().position(|x| x == "name");
+    let seq_index = vm.source.ret_column_names.iter().position(|x| x == "seq");
+    let qual_index = vm.source.ret_column_names.iter().position(|x| x == "qual");
+
+    // `print_and_exit()` never returns, so each `None` arm needs no fallback value.
+    let name_index = match name_index {
+        Some(index) => index,
+        None => {
+            let h = &mut vm.hint;
+            h.white("Lost ")
+                .cyan("'name'")
+                .white(" column.")
+                .print_and_exit()
+        }
+    };
+    let seq_index = match seq_index {
+        Some(index) => index,
+        None => {
+            let h = &mut vm.hint;
+            h.white("Lost ")
+                .cyan("'seq'")
+                .white(" column.")
+                .print_and_exit()
+        }
+    };
+
+    let df = vm.source.lazy().collect()?;
+    let columns = df.get_columns();
+    let name_col = &columns[name_index];
+    let seq_col = &columns[seq_index];
+    let qual_col = qual_index.map(|index| &columns[index]);
+    if name_col.len() != seq_col.len() {
+        let h = &mut vm.hint;
+        h.white("Length of ")
+            .cyan("'name'")
+            .white(" and ")
+            .cyan("'seq'")
+            .white(" columns are different.")
+            .print_and_exit();
+    }
+    let writer = vm.writer.as_mut().unwrap().as_mut();
+    // Placeholder qualities, reused across rows and resized to each sequence.
+    let mut qual_buffer = String::with_capacity(512);
+    for i in 0..name_col.len() {
+        // `i` is in range for both columns: their lengths were checked equal above.
+        let name = name_col.get(i).unwrap();
+        let seq = seq_col.get(i).unwrap();
+        // When `get_str()` yields nothing, the literal placeholder is written instead.
+        let seq = seq.get_str().unwrap_or("seq");
+        writeln!(writer, "@{}", name.get_str().unwrap_or("name"))?;
+        writeln!(writer, "{}", seq)?;
+        writeln!(writer, "+")?;
+        match qual_col {
+            Some(qual_col) => {
+                // NOTE(review): assumes `qual` is as long as `name`; not checked here.
+                let q = qual_col.get(i).unwrap();
+                writeln!(writer, "{}", q.get_str().unwrap_or("qual"))?;
+            }
+            None => {
+                // `truncate` is a no-op when the buffer is already short enough, so
+                // afterwards only the missing `?` characters have to be appended.
+                qual_buffer.truncate(seq.len());
+                let missing = seq.len() - qual_buffer.len();
+                qual_buffer.extend(std::iter::repeat('?').take(missing));
+                writeln!(writer, "{}", qual_buffer)?;
+            }
+        }
+        vm.status.consume_rows += 1;
+        if vm.status.consume_rows >= vm.status.limit_rows {
+            vm.status.stop = true;
+            break;
+        }
+    }
+
+    Ok(value::Value::None)
+}
+
+/// Builtin entry point that prints the current source in FASTQ format.
+///
+/// Delegates to `print_fastq` when `vm.source_type` is `Fasta` or `Fastq`; for any
+/// other source type it reports the unsupported format through `vm.hint` via
+/// `print_and_exit()`, which does not return.
+pub fn to_fastq(vm: &mut Vm) -> FilterxResult {
+    if vm.source_type == VmSourceType::Fasta || vm.source_type == VmSourceType::Fastq {
+        return print_fastq(vm);
+    }
+    let h = &mut vm.hint;
+    h.white("Only ")
+        .cyan("fastq, fasta ")
+        .white("formats are supported for now.")
+        .print_and_exit();
+}