Skip to content

Commit

Permalink
v0.6.1 fixed bug with -I. pushing now to try and fix conda...
Browse files Browse the repository at this point in the history
  • Loading branch information
bluenote-1577 committed Apr 29, 2024
1 parent 8158e6c commit 02232c7
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 16 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
## sylph v0.6.1: improved automatic detection of sequencing error for -u option

### Major

* Improved automatic estimation of sequencing error for estimating unknown abundances/coverages.

Explanation:

The -u option estimates the % of sequences that are "unknown" i.e. not captured by the database and the "true" coverage. This requires knowledge of sequencing error. Previous versions failed when the sample was too diverse compared to sequencing depth (e.g. low-throughput sequencing or complex (ocean/soil) metagenomes).

**New fallback added**: For short-reads only, if the diversity is too high relative to sequencing depth, (avg k-mer depth < 3) then 99.5% is used as a fallback sequence identity estimate.

## sylph v0.6.0 release: New output column, lazy raw paired fastq profiling: 2024-04-06

### Major
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions src/cmdline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ pub struct ContainArgs {
pub estimate_unknown: bool,


#[clap(short='I',long="read-seq-id", help_heading = "ALGORITHM", help = "Mean base-level identity (%) of reads. Only used in -u option as fallback if automatic detection fails.", default_value_t=99.5)]
pub seq_id: f64,
#[clap(short='I',long="read-seq-id", help_heading = "ALGORITHM", help = "Sequence identity (%) of reads. Only used in -u option and overrides automatic detection. ")]
pub seq_id: Option<f64>,

//#[clap(short='l', long="read-length", help_heading = "ALGORITHM", help = "Read length (single-end length for pairs). Only necessary for short-read coverages when using --estimate-unknown. Not needed for long-reads" )]
//pub read_length: Option<usize>,
Expand Down
29 changes: 16 additions & 13 deletions src/contain.rs
Original file line number Diff line number Diff line change
Expand Up @@ -267,14 +267,14 @@ pub fn contain(mut args: ContainArgs, pseudotax_in: bool) {
let first_read_file = read_files[j][0];
let sequence_sketch = sequence_sketch.unwrap();

let mut kmer_id_opt = get_kmer_identity(&sequence_sketch, args.estimate_unknown);
if kmer_id_opt.is_none(){
if args.estimate_unknown{
log::warn!("{} sample has high diversity compared to sequencing depth or sequencing error (approx. avg depth < 3). Using {} (-I) as read accuracy estimate instead of automatic detection.", &first_read_file, args.seq_id);
}
kmer_id_opt = Some((args.seq_id/100.).powf(sequence_sketch.k as f64));
let kmer_id_opt;
if args.seq_id.is_some(){
kmer_id_opt = Some((args.seq_id.unwrap()/100.).powf(sequence_sketch.k as f64));
}
else{
kmer_id_opt = get_kmer_identity(&sequence_sketch, args.estimate_unknown);
log::debug!("{} has estimated identity {:.3}.", &first_read_file, kmer_id_opt.unwrap().powf(1./sequence_sketch.k as f64) * 100.);
}
log::debug!("{} has estimated identity {:.3}.", &first_read_file, kmer_id_opt.unwrap().powf(1./sequence_sketch.k as f64) * 100.);

let stats_vec_seq: Mutex<Vec<AniResult>> = Mutex::new(vec![]);
genome_index_vec.par_iter().for_each(|i| {
Expand Down Expand Up @@ -926,11 +926,7 @@ fn get_kmer_identity(seq_sketch: &SequencesSketch, estimate_unknown: bool) -> Op

mov_avg_median /= n;
log::debug!("Estimated continuous median k-mer count for {} is {:.3}", &seq_sketch.file_name, mov_avg_median);

if mov_avg_median < MED_KMER_FOR_ID_EST{
return None;
}


let mut num_1s = 0;
let mut num_not1s = 0;
for count in seq_sketch.kmer_counts.values(){
Expand All @@ -943,10 +939,17 @@ fn get_kmer_identity(seq_sketch: &SequencesSketch, estimate_unknown: bool) -> Op
}
//0.1 so no div by 0 error
let eps = num_not1s as f64 / (num_not1s as f64 + num_1s as f64 + 0.1);
//dbg!("Automatic id est, 1-to-2 ratio, 2-to-3", eps.powf(1./31.), num_1s as f64 / num_2s as f64, two_to_three);

if mov_avg_median < MED_KMER_FOR_ID_EST && seq_sketch.mean_read_length < 400.{
log::info!("{} short-read sample has high diversity compared to sequencing depth (approx. avg depth < 3). Using 99.5% as read accuracy estimate instead of automatic detection for --estimate-unknown.", &seq_sketch.file_name);
return Some(0.995f64.powf(seq_sketch.k as f64));
}

if eps < 1.{
return Some(eps)
}
else{
return None
return Some(1.)
}
}

0 comments on commit 02232c7

Please sign in to comment.