diff --git a/CHANGELOG.md b/CHANGELOG.md index a0e10ae..5a421e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +## sylph v0.6.1: improved automatic detection of sequencing error for -u option + +### Major + +* Improved automatic estimation of sequencing error for estimating unknown abundances/coverages. + +Explanation: + +The -u option estimates the % of sequences that are "unknown" i.e. not captured by the database and the "true" coverage. This requires knowledge of sequencing error. Previous versions failed when the sample was too diverse compared to sequencing depth (e.g. low-throughput sequencing or complex (ocean/soil) metagenomes). + +**New fallback added**: For short-reads only, if the diversity is too high relative to sequencing depth, (avg k-mer depth < 3) then 99.5% is used as a fallback sequence identity estimate. + ## sylph v0.6.0 release: New output column, lazy raw paired fastq profiling: 2024-04-06 ### Major diff --git a/Cargo.lock b/Cargo.lock index fa7b8b8..ee85031 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1152,7 +1152,7 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "sylph" -version = "0.6.0" +version = "0.6.1" dependencies = [ "assert_cmd", "bincode", diff --git a/src/cmdline.rs b/src/cmdline.rs index 925efe4..bfefbf3 100644 --- a/src/cmdline.rs +++ b/src/cmdline.rs @@ -104,8 +104,8 @@ pub struct ContainArgs { pub estimate_unknown: bool, - #[clap(short='I',long="read-seq-id", help_heading = "ALGORITHM", help = "Mean base-level identity (%) of reads. Only used in -u option as fallback if automatic detection fails.", default_value_t=99.5)] - pub seq_id: f64, + #[clap(short='I',long="read-seq-id", help_heading = "ALGORITHM", help = "Sequence identity (%) of reads. Only used in -u option and overrides automatic detection. ")] + pub seq_id: Option, //#[clap(short='l', long="read-length", help_heading = "ALGORITHM", help = "Read length (single-end length for pairs). Only necessary for short-read coverages when using --estimate-unknown. Not needed for long-reads" )] //pub read_length: Option, diff --git a/src/contain.rs b/src/contain.rs index 5ad6d30..dfbfb76 100644 --- a/src/contain.rs +++ b/src/contain.rs @@ -267,14 +267,14 @@ pub fn contain(mut args: ContainArgs, pseudotax_in: bool) { let first_read_file = read_files[j][0]; let sequence_sketch = sequence_sketch.unwrap(); - let mut kmer_id_opt = get_kmer_identity(&sequence_sketch, args.estimate_unknown); - if kmer_id_opt.is_none(){ - if args.estimate_unknown{ - log::warn!("{} sample has high diversity compared to sequencing depth or sequencing error (approx. avg depth < 3). Using {} (-I) as read accuracy estimate instead of automatic detection.", &first_read_file, args.seq_id); - } - kmer_id_opt = Some((args.seq_id/100.).powf(sequence_sketch.k as f64)); + let kmer_id_opt; + if args.seq_id.is_some(){ + kmer_id_opt = Some((args.seq_id.unwrap()/100.).powf(sequence_sketch.k as f64)); + } + else{ + kmer_id_opt = get_kmer_identity(&sequence_sketch, args.estimate_unknown); + log::debug!("{} has estimated identity {:.3}.", &first_read_file, kmer_id_opt.unwrap().powf(1./sequence_sketch.k as f64) * 100.); } - log::debug!("{} has estimated identity {:.3}.", &first_read_file, kmer_id_opt.unwrap().powf(1./sequence_sketch.k as f64) * 100.); let stats_vec_seq: Mutex> = Mutex::new(vec![]); genome_index_vec.par_iter().for_each(|i| { @@ -926,11 +926,7 @@ fn get_kmer_identity(seq_sketch: &SequencesSketch, estimate_unknown: bool) -> Op mov_avg_median /= n; log::debug!("Estimated continuous median k-mer count for {} is {:.3}", &seq_sketch.file_name, mov_avg_median); - - if mov_avg_median < MED_KMER_FOR_ID_EST{ - return None; - } - + let mut num_1s = 0; let mut num_not1s = 0; for count in seq_sketch.kmer_counts.values(){ @@ -943,10 +939,17 @@ fn get_kmer_identity(seq_sketch: &SequencesSketch, estimate_unknown: bool) -> Op } //0.1 so no div by 0 error let eps = num_not1s as f64 / (num_not1s as f64 + num_1s as f64 + 0.1); + //dbg!("Automatic id est, 1-to-2 ratio, 2-to-3", eps.powf(1./31.), num_1s as f64 / num_2s as f64, two_to_three); + + if mov_avg_median < MED_KMER_FOR_ID_EST && seq_sketch.mean_read_length < 400.{ + log::info!("{} short-read sample has high diversity compared to sequencing depth (approx. avg depth < 3). Using 99.5% as read accuracy estimate instead of automatic detection for --estimate-unknown.", &seq_sketch.file_name); + return Some(0.995f64.powf(seq_sketch.k as f64)); + } + if eps < 1.{ return Some(eps) } else{ - return None + return Some(1.) } }