nf-core · ilight1542 · Sep 13, 2024 · Oct 4, 2024 · Oct 18, 2024 · Oct 25, 2024
diff --git a/conf/modules.config b/conf/modules.config
@@ -447,29 +447,14 @@ process {
         ]
     }
 
-    withName: SAMTOOLS_FASTQ_MAPPED {
+    withName: SAMTOOLS_FASTQ_METAGENOMICS {
         tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" }
         ext.args = [
-            params.metagenomics_input == 'all' ? '' : '-F 4',
+            params.metagenomics_input == 'mapped' ? '-F 4': '',
+            params.metagenomics_input == 'unmapped' ? '-f 4': '',
+            // 'all' matches neither condition, so no -F or -f flag is set and every read is written to fastq
         ].join(' ').trim()
-        ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_mapped" }
-        publishDir = [
-            [
-                // data
-                path: { "${params.outdir}/read_filtering/fastq/data/" },
-                mode: params.publish_dir_mode,
-                pattern: '*.fastq.gz',
-                enabled: params.bamfiltering_generatemappedfastq
-            ]
-        ]
-    }
-
-    withName: SAMTOOLS_FASTQ_UNMAPPED {
-        tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" }
-        ext.args = [
-            '-f 4',
-        ].join(' ').trim()
-        ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_unmapped" }
+        ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_metagenomics_fastq_${params.metagenomics_input}" }
         publishDir = [
             [
                 // data
@@ -481,7 +466,7 @@ process {
         ]
     }
 
-    withName: 'CAT_FASTQ_UNMAPPED|CAT_FASTQ_MAPPED' {
+    withName: 'CAT_FASTQ_METAGENOMICS' {
         tag = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" }
         ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" }
         publishDir = [
@@ -926,6 +911,153 @@ process {
         ]
     }
 
+    withName: BBMAP_BBDUK {
+        publishDir = [
+            enabled: params.metagenomics_complexity_savefastq, // publish only when requested
+            pattern: '*.{fastq.gz,log}',
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/metagenomics/complexity_filter/bbduk/" }
+        ]
+        ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_complexity" }
+        ext.args = { "entropymask=f entropy=${params.metagenomics_complexity_entropy}" } // entropy threshold comes from params
+        tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" }
+    }
+
+    withName: MALT_RUN {
+        ext.prefix = { "${meta.label}_${meta.id}-run" }
+        publishDir = [
+            pattern: '*.{rma6,log,sam.gz}',
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/metagenomics/profiling/malt/" }
+        ]
+        ext.args = [ // built once from params; empty entries are kept in the join, only the outer ends are trimmed
+            "-m ${params.metagenomics_malt_mode}",
+            "-at ${params.metagenomics_malt_alignmentmode}",
+            "-top ${params.metagenomics_malt_toppercent}",
+            "-id ${params.metagenomics_malt_minpercentidentity}",
+            "-mq ${params.metagenomics_malt_maxqueries}",
+            "--memoryMode ${params.metagenomics_malt_memorymode}",
+            params.metagenomics_malt_minsupportmode == "percent" ? "-supp ${params.metagenomics_malt_minsupportpercent}" : "-sup ${params.metagenomics_malt_minsupportreads}", // percent-based vs read-count-based minimum support
+            params.metagenomics_malt_savereads ? "--alignments ./" : "" // only request alignment output when saving reads
+        ].join(' ').trim()
+    }
+
+    withName: CAT_CAT_MALT {
+        publishDir = [
+            pattern: '*.{log}',
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/metagenomics/profiling/malt/" } // same directory as the MALT_RUN outputs
+        ]
+        ext.prefix = { "${meta.id}_runtime_log_concatenated.log" } // prefix already carries the .log extension
+    }
+
+    withName: KRAKEN2_KRAKEN2 {
+        publishDir = [
+            pattern: '*.{txt,fastq.gz}',
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/metagenomics/profiling/kraken2/" }
+        ]
+        ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" }
+        ext.args = [
+            params.metagenomics_kraken2_saveminimizers ? "--report-minimizer-data" : "" // optional flag, off unless the param is set
+        ].join(' ').trim()
+        tag = { "${meta.sample_id}|single_end_mode_${meta.single_end}" }
+    }
+
+    withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ {
+        ext.prefix = { "${meta.single_end}" } // NOTE(review): prefix is only the single_end value - confirm this is intended
+        publishDir = [
+            pattern: '*.{txt,fastq.gz}',
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/metagenomics/profiling/krakenuniq/" }
+        ]
+        tag = { "single_end_mode_${meta.single_end}" }
+    }
+
+    withName: METAPHLAN_METAPHLAN {
+        ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" }
+        publishDir = [
+            pattern: '*.{biom,txt}', // only biom and txt outputs are published
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/metagenomics/profiling/metaphlan/" }
+        ]
+    }
+
+    withName: MALTEXTRACT {
+        ext.args = { [
+            "-f ${params.metagenomics_maltextract_filter}",
+            "-a ${params.metagenomics_maltextract_toppercent}",
+            "--minPI ${params.metagenomics_maltextract_minpercentidentity}",
+            params.metagenomics_maltextract_destackingoff ? "--destackingOff" : "",
+            params.metagenomics_maltextract_downsamplingoff ? "--downSampOff" : "",
+            params.metagenomics_maltextract_duplicateremovaloff ? "--dupRemOff" : "",
+            params.metagenomics_maltextract_matches ? "--matches" : "",
+            params.metagenomics_maltextract_megansummary ? "--meganSummary" : "",
+            params.metagenomics_maltextract_usetopalignment ? "--useTopAlignment" : "",
+            meta.strandedness == "single" ? '--singleStranded' : '', // meta is per-task, so the whole list must sit inside a closure
+        ].join(' ').trim() }
+        publishDir = [
+            path: { "${params.outdir}/metagenomics/postprocessing/maltextract/" },
+            mode: params.publish_dir_mode,
+            pattern: 'results',
+            saveAs: { "${meta.id}" } // the published 'results' entry is renamed to the meta id
+        ]
+    }
+
+    withName: MEGAN_RMA2INFO {
+        publishDir = [
+            pattern: '*.{txt.gz,megan}',
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/metagenomics/postprocessing/megan_summaries/" }
+        ]
+        ext.prefix = { "${meta.id}" }
+        ext.args = "-c2c Taxonomy" // fixed argument, not derived from params
+        tag = { "${meta.id}" }
+    }
+
+    withName: AMPS {
+        errorStrategy = 'ignore' // needed because AMPS fails the run when read counts are low: https://github.com/rhuebler/HOPS/issues/9
+        publishDir = [
+            pattern: 'results',
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/metagenomics/postprocessing/maltextract/" } // shares the maltextract output directory
+        ]
+    }
+
+    withName: TAXPASTA_MERGE {
+        ext.args = { "--profiler ${meta.profiler} --output ${meta.profiler}_taxpasta_table.tsv" } // output table is named after the profiler
+        publishDir = [
+            pattern: '*.{csv,tsv,ods,xlsx,arrow,parquet,biom}',
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/metagenomics/postprocessing/taxpasta/" }
+        ]
+    }
+
+    withName: TAXPASTA_STANDARDISE {
+        publishDir = [
+            path: { "${params.outdir}/metagenomics/postprocessing/taxpasta/" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{csv,tsv,ods,xlsx,arrow,parquet,biom}'
+        ]
+        ext.args = { "--profiler ${meta.profiler} --output ${meta.profiler}_taxpasta_table.tsv" } // '<profiler>_taxpasta_table.tsv', with the '_' separator
+    }
+
+    //
+    // QUALIMAP
+    //
+
+    withName: 'QUALIMAP_BAMQC_WITHBED|QUALIMAP_BAMQC_NOBED' {
+        publishDir = [
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, // publish everything except versions.yml
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/mapstats/qualimap/${meta.reference}/" }, // one sub-directory per reference
+        ]
+        tag = { "${meta.reference}|${meta.sample_id}" }
+    }
+
+    //
+    // DAMAGE CALCULATION
+    //
     withName: DAMAGEPROFILER {
         tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" }
         ext.args   = [

diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md
@@ -721,7 +721,7 @@ HOP001 ERR8958750 0 4 paired double half /workspace/eager/testing/test_data/ERR8
 HOP001 ERR8958751 0 2 paired double half /workspace/eager/testing/test_data/ERR8958751_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958751_2.fastq.gz_reduced.fastq.gz NA NA
 HOP001 ERR8958752 0 2 paired double half /workspace/eager/testing/test_data/ERR8958752_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958752_2.fastq.gz_reduced.fastq.gz NA NA
 HOP001 ERR8958753 0 2 paired double half /workspace/eager/testing/test_data/ERR8958753_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958753_2.fastq.gz_reduced.fastq.gz NA NA
-HOP001 ERR8958754 0 2 paired double none /workspace/eager/testing/test_data/ERR8958754_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958754_2.fastq.gz_reduced.fastq.gz NA NA" | sed 's/ /\t/g' > test.tsv
+HOP001 ERR8958754 0 2 paired double none /workspace/eager/testing/test_data/ERR8958754_1.fastq.gz_reduced.fastq.gz /workspace/eager/testing/test_data/ERR8958754_2.fastq.gz_reduced.fastq.gz NA NA" | sed 's/NA/ /g' | sed 's/ /\t/g'  > test.tsv
 
 nextflow run ../main.nf -profile docker \
   --input test.tsv \
@@ -738,6 +738,16 @@ nextflow run ../main.nf -profile docker \
   --metagenomics_malt_group_size 3
 ```
 
+# kraken2
+
+nextflow run main.nf -profile docker \
+ --input testing/test.tsv \
+ --outdir ./out \
+ --run_metagenomics \
+ --metagenomics_profiling_tool kraken2 \
+ --metagenomics_profiling_database /workspace/eager/testing/eager_test.tar.gz \
+ --preprocessing_skippairmerging
+
 ## Mapping statistics
 
 ### ENDOSPY

diff --git a/docs/development/metagenomics_paired_end.md b/docs/development/metagenomics_paired_end.md
@@ -0,0 +1,18 @@
+## investigation notes for updating code to allow for PE inputs into metagenomics profiling (eg for kraken, malt)
+
+see
+https://github.com/nf-core/eager/issues/945
+
+current issue is that the reads that go into mapping are not by default extracted as singletons and non-singletons, so we lose that information
+Then downstream the inputs into the krakenuniq module (even if split correctly with meta vars) don't have the correct headers to parse the PE nature of the reads (since they have all been concatenated anyways, and just were ORIGINALLY PE)
+
+So: needs to be fixed up higher (eg in bamfiltering.nf, likely with a new adjustment to the SAMTOOLS_FASTQ_UNMAPPED, SAMTOOLS_FASTQ_MAPPED, and SAMTOOLS_VIEW_BAM_FILTERING modules )
+
+ISSUE FOUND: the output of PE reads from bamfiltering.nf (fastq_mapped & fastq_unmapped) is OK, but when overlap merging is not done, cat_fastq concatenates the `other` reads into one mate file and the `singleton` reads into the other mate file (see the commands below), so the two merged files no longer contain matching read pairs
+"""
+cat input1/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_other.fastq.gz input3/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_1.fastq.gz > JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_1.merged.fastq.gz
+cat input2/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_singleton.fastq.gz input4/JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_unmapped_2.fastq.gz > JK2782_JK2782_TGGCCGATCAACGA_Mammoth_MT_Krause_2.merged.fastq.gz
+"""
+
+A decision is needed on what behavior is wanted for the unmapped `singleton` and `other` reads; after that, the call to cat_fastq for PE reads can likely be removed
+Possibly just split to also have the singletons parsed separately?
diff --git a/modules.json b/modules.json
@@ -187,7 +187,7 @@
                     },
                     "krakenuniq/preloadedkrakenuniq": {
                         "branch": "master",
-                        "git_sha": "a6eb17f65b3ee5761c25c075a6166c9f76733cee",
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
                         "installed_by": ["modules"]
                     },
                     "malt/run": {

diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/environment.yml b/modules/nf-core/krakenuniq/preloadedkrakenuniq/environment.yml