diff --git a/README.md b/README.md index ab6652e..fa49947 100644 --- a/README.md +++ b/README.md @@ -346,8 +346,11 @@ Options: Minimum quality score for FASTQ data [default: 0] -p, --num-threads The number of threads to use [default: 10] - --batch-size + --buffer-size [default: 16777216] + --batch-size + The size of each batch for processing taxid match results, used to control memory usage + [default: 16] -T, --confidence-threshold Confidence score threshold [default: 0] -g, --minimum-hit-groups @@ -358,8 +361,6 @@ Options: In comb. w/ -R, provide minimizer information in report -z, --report-zero-counts In comb. w/ -R, report taxa w/ 0 count - --full-output - output file contains all unclassified sequence -h, --help Print help (see more with '--help') -V, --version diff --git a/kr2r/Cargo.toml b/kr2r/Cargo.toml index fe5dcf4..21849b1 100644 --- a/kr2r/Cargo.toml +++ b/kr2r/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "kr2r" -version = "0.6.3" +version = "0.6.8" edition = "2021" authors = ["eric9n@gmail.com"] diff --git a/kr2r/src/args.rs b/kr2r/src/args.rs index f11bf9c..bfd1bf3 100644 --- a/kr2r/src/args.rs +++ b/kr2r/src/args.rs @@ -117,10 +117,9 @@ pub struct ClassifyArgs { #[clap(short = 'z', long, value_parser, default_value_t = false)] pub report_zero_counts: bool, - /// output file contains all unclassified sequence - #[clap(long, value_parser, default_value_t = false)] - pub full_output: bool, - + // /// output file contains all unclassified sequence + // #[clap(long, value_parser, default_value_t = false)] + // pub full_output: bool, /// A list of input file paths (FASTA/FASTQ) to be processed by the classify program. /// Supports fasta or fastq format files (e.g., .fasta, .fastq) and gzip compressed files (e.g., .fasta.gz, .fastq.gz). // #[clap(short = 'F', long = "files")] diff --git a/kr2r/src/bin/kun.rs b/kr2r/src/bin/kun.rs index d54f0f5..27c09e0 100644 --- a/kr2r/src/bin/kun.rs +++ b/kr2r/src/bin/kun.rs @@ -97,7 +97,7 @@ impl From for resolve::Args { kraken_output_dir: item.kraken_output_dir, report_kmer_data: item.report_kmer_data, report_zero_counts: item.report_zero_counts, - full_output: item.full_output, + // full_output: item.full_output, num_threads: item.num_threads, } } diff --git a/kr2r/src/bin/resolve.rs b/kr2r/src/bin/resolve.rs index 775b9a6..2aa73cf 100644 --- a/kr2r/src/bin/resolve.rs +++ b/kr2r/src/bin/resolve.rs @@ -8,7 +8,7 @@ use kr2r::utils::{find_and_trans_bin_files, find_and_trans_files, open_file}; use kr2r::HitGroup; // use rayon::prelude::*; use seqkmer::{buffer_map_parallel, trim_pair_info, OptionPair}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Read, Result, Write}; use std::path::{Path, PathBuf}; @@ -65,9 +65,9 @@ pub struct Args { #[clap(long = "output-dir", value_parser)] pub kraken_output_dir: Option, - /// output file contains all unclassified sequence - #[clap(long, value_parser, default_value_t = false)] - pub full_output: bool, + // /// output file contains all unclassified sequence + // #[clap(long, value_parser, default_value_t = false)] + // pub full_output: bool, /// Confidence score threshold, default is 0.0. #[clap( short = 'T', @@ -120,8 +120,7 @@ fn process_batch>( id_map: &HashMap)>, writer: &mut Box, value_mask: usize, -) -> Result<(TaxonCountersDash, usize, HashSet)> { - let hit_seq_id_set = HashSet::new(); +) -> Result<(TaxonCountersDash, usize)> { let confidence_threshold = args.confidence_threshold; let minimum_hit_groups = args.minimum_hit_groups; @@ -138,6 +137,7 @@ fn process_batch>( if let Some(item) = id_map.get(&k) { let mut rows = rows.to_owned(); rows.sort_unstable(); + let dna_id = trim_pair_info(&item.0); let range = OptionPair::from(((0, item.2), item.3.map(|size| (item.2, size + item.2)))); @@ -179,11 +179,7 @@ fn process_batch>( .expect("failed"); } - Ok(( - cur_taxon_counts, - classify_counter.load(Ordering::SeqCst), - hit_seq_id_set, - )) + Ok((cur_taxon_counts, classify_counter.load(Ordering::SeqCst))) } pub fn run(args: Args) -> Result<()> { @@ -218,7 +214,7 @@ pub fn run(args: Args) -> Result<()> { } None => Box::new(BufWriter::new(io::stdout())) as Box, }; - let (thread_taxon_counts, thread_classified, hit_seq_set) = process_batch::( + let (thread_taxon_counts, thread_classified) = process_batch::( sam_files, &args, &taxo, @@ -227,22 +223,22 @@ pub fn run(args: Args) -> Result<()> { value_mask, )?; - if args.full_output { - sample_id_map - .iter() - .filter(|(key, _)| !hit_seq_set.contains(key)) - .for_each(|(_, value)| { - let dna_id = trim_pair_info(&value.0); // 假设 key 是 &str 类型 - let output_line = format!( - "U\t{}\t0\t{}\t{}\n", - dna_id, - value.1, - if value.3.is_none() { "" } else { " |:| " } - ); - - writer.write_all(output_line.as_bytes()).unwrap(); - }); - } + // if args.full_output { + // sample_id_map + // .iter() + // .filter(|(key, _)| !hit_seq_set.contains(key)) + // .for_each(|(_, value)| { + // let dna_id = trim_pair_info(&value.0); // 假设 key 是 &str 类型 + // let output_line = format!( + // "U\t{}\t0\t{}\t{}\n", + // dna_id, + // value.1, + // if value.3.is_none() { "" } else { " |:| " } + // ); + + // writer.write_all(output_line.as_bytes()).unwrap(); + // }); + // } let mut sample_taxon_counts: HashMap< u64,