v0.6.1 fixed bug with -I. pushing now to try and fix conda...

bluenote-1577 · Apr 29, 2024 · 02232c7 · 02232c7
1 parent 8158e6c
commit 02232c7
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 16 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,15 @@
+## sylph v0.6.1: improved automatic detection of sequencing error for -u option
+
+### Major
+
+* Improved automatic estimation of sequencing error for estimating unknown abundances/coverages.
+
+Explanation:
+
+The -u option estimates the % of sequences that are "unknown" i.e. not captured by the database and the "true" coverage. This requires knowledge of sequencing error. Previous versions failed when the sample was too diverse compared to sequencing depth (e.g. low-throughput sequencing or complex (ocean/soil) metagenomes). 
+
+**New fallback added**: For short-reads only, if the diversity is too high relative to sequencing depth, (avg k-mer depth < 3) then 99.5% is used as a fallback sequence identity estimate. 
+
 ## sylph v0.6.0 release: New output column, lazy raw paired fastq profiling: 2024-04-06 
 
 ### Major

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/src/cmdline.rs b/src/cmdline.rs
@@ -104,8 +104,8 @@ pub struct ContainArgs {
     pub estimate_unknown: bool,
 
 
-    #[clap(short='I',long="read-seq-id", help_heading = "ALGORITHM", help = "Mean base-level identity (%) of reads. Only used in -u option as fallback if automatic detection fails.", default_value_t=99.5)]
-    pub seq_id: f64,
+    #[clap(short='I',long="read-seq-id", help_heading = "ALGORITHM", help = "Sequence identity (%) of reads. Only used in -u option and overrides automatic detection. ")]
+    pub seq_id: Option<f64>,
 
     //#[clap(short='l', long="read-length", help_heading = "ALGORITHM", help = "Read length (single-end length for pairs). Only necessary for short-read coverages when using --estimate-unknown. Not needed for long-reads" )]
     //pub read_length: Option<usize>,

diff --git a/src/contain.rs b/src/contain.rs
@@ -267,14 +267,14 @@ pub fn contain(mut args: ContainArgs, pseudotax_in: bool) {
                 let first_read_file = read_files[j][0];
                 let sequence_sketch = sequence_sketch.unwrap();
 
-                let mut kmer_id_opt = get_kmer_identity(&sequence_sketch, args.estimate_unknown);
-                if kmer_id_opt.is_none(){
-                    if args.estimate_unknown{
-                        log::warn!("{} sample has high diversity compared to sequencing depth or sequencing error (approx. avg depth < 3). Using {} (-I) as read accuracy estimate instead of automatic detection.", &first_read_file, args.seq_id);
-                    }
-                    kmer_id_opt = Some((args.seq_id/100.).powf(sequence_sketch.k as f64));
+                let kmer_id_opt;
+                if args.seq_id.is_some(){
+                    kmer_id_opt = Some((args.seq_id.unwrap()/100.).powf(sequence_sketch.k as f64));
+                }
+                else{
+                    kmer_id_opt = get_kmer_identity(&sequence_sketch, args.estimate_unknown);
+                    log::debug!("{} has estimated identity {:.3}.", &first_read_file, kmer_id_opt.unwrap().powf(1./sequence_sketch.k as f64) * 100.);
                 }
-                log::debug!("{} has estimated identity {:.3}.", &first_read_file, kmer_id_opt.unwrap().powf(1./sequence_sketch.k as f64) * 100.);
 
                 let stats_vec_seq: Mutex<Vec<AniResult>> = Mutex::new(vec![]);
                 genome_index_vec.par_iter().for_each(|i| {
@@ -926,11 +926,7 @@ fn get_kmer_identity(seq_sketch: &SequencesSketch, estimate_unknown: bool) -> Op
 
     mov_avg_median /= n;
     log::debug!("Estimated continuous median k-mer count for {} is {:.3}", &seq_sketch.file_name, mov_avg_median);
-
-    if mov_avg_median < MED_KMER_FOR_ID_EST{
-        return None;
-    }
-
+
     let mut num_1s = 0;
     let mut num_not1s = 0;
     for count in seq_sketch.kmer_counts.values(){
@@ -943,10 +939,17 @@ fn get_kmer_identity(seq_sketch: &SequencesSketch, estimate_unknown: bool) -> Op
     }
     //0.1 so no div by 0 error
     let eps = num_not1s as f64 / (num_not1s as f64 + num_1s as f64 + 0.1);
+    //dbg!("Automatic id est, 1-to-2 ratio, 2-to-3", eps.powf(1./31.), num_1s as f64 / num_2s as f64, two_to_three);
+
+    if mov_avg_median < MED_KMER_FOR_ID_EST && seq_sketch.mean_read_length < 400.{
+        log::info!("{} short-read sample has high diversity compared to sequencing depth (approx. avg depth < 3). Using 99.5% as read accuracy estimate instead of automatic detection for --estimate-unknown.", &seq_sketch.file_name);
+        return Some(0.995f64.powf(seq_sketch.k as f64));
+    }
+
     if eps < 1.{
         return Some(eps)
     }
     else{
-        return None
+        return Some(1.)
     }
 }