From e6ba4967f4b561cd5ba5fb2f887ab564bed6030c Mon Sep 17 00:00:00 2001
From: lbresadola <77614727+lbresadola@users.noreply.github.com>
Date: Mon, 17 May 2021 10:21:06 +0200
Subject: [PATCH 1/6] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d6dcb1c..0f70558 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ GATK has been providing a well known best practices document on BAM preprocessin
 
 We aim at providing a single implementation of the BAM preprocessing pipeline that can be used across different situations. For this purpose there are some required steps and some optional steps. This is implemented as a Nextflow pipeline to simplify parallelization of execution in the cluster. The default configuration uses reference genome hg19, if another reference is needed the adequate resources must be provided. The reference genome resources  for hg19 were downloaded from https://software.broadinstitute.org/gatk/download/bundle
 
-The input is a tab-separated values file where each line corresponds to one input BAM. The output is another tab-separated values file with the absolute paths of the preprocessed and indexed BAMs.
+The input is a tab-separated values file where each line corresponds to one input BAM. The output is another tab-separated values file with the absolute paths of the preprocessed and indexed BAMs. 
 
 ## Implementation
 

From df621769d174494972dbb40cbff3eb63cd2fd1f3 Mon Sep 17 00:00:00 2001
From: priesgof <priesgoferreiro@gmail.com>
Date: Tue, 25 May 2021 21:30:18 +0200
Subject: [PATCH 2/6] ensure the BAM for MarkDuplicates is queryname-sorted

---
 environment.yml |  2 +-
 main.nf         | 33 +++++++++++++++++++++++++++------
 nextflow.config |  4 ++--
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/environment.yml b/environment.yml
index 56ab345..c8a1021 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,6 +1,6 @@
 # You can use this file to create a conda environment for this pipeline:
 #   conda env create -f environment.yml
-name: tronflow-bam-preprocessing-1.3.1
+name: tronflow-bam-preprocessing-1.4.0
 channels:
   - conda-forge
   - bioconda
diff --git a/main.nf b/main.nf
index 8d1282d..043a0e3 100755
--- a/main.nf
+++ b/main.nf
@@ -83,9 +83,10 @@ process prepareBam {
     output:
       set val(name),
         val("${bam.baseName}"),
-        val(type), file("${bam.baseName}.prepared.bam"),
-        file("${bam.baseName}.prepared.bai")  into prepared_bams
+        val(type), file("${bam.baseName}.prepared.bam") into prepared_bams
 
+    script:
+    order = params.skip_deduplication ? "--SORT_ORDER coordinate": "--SORT_ORDER queryname"
     """
     mkdir tmp
 
@@ -109,8 +110,7 @@ process prepareBam {
     --RGSM ${type} \
     --RGLB 1 \
     --RGPL ${params.platform} \
-    --SORT_ORDER coordinate \
-    --CREATE_INDEX true
+    ${order}
     """
 }
 
@@ -126,7 +126,7 @@ if (!params.skip_deduplication) {
 	    publishDir "${publish_dir}/${name}/metrics", mode: "copy", pattern: "*.dedup_metrics"
 
 	    input:
-	    	set name, bam_name, type, file(bam), file(bai) from prepared_bams
+	    	set name, bam_name, type, file(bam) from prepared_bams
 
 	    output:
 	    	set val(name), val(bam_name), val(type),
@@ -149,7 +149,28 @@ if (!params.skip_deduplication) {
 	}
 }
 else {
-    prepared_bams.into{ deduplicated_bams; deduplicated_bams_for_metrics; deduplicated_bams_for_hs_metrics}
+    process indexBam {
+	    cpus "1"
+        memory "8g"
+	    tag "${name}"
+
+	    input:
+	    	set name, bam_name, type, file(bam) from prepared_bams
+
+	    output:
+	    	set val(name), val(bam_name), val(type),
+	    	    file("${bam}"), file("${bam.baseName}.bai") into deduplicated_bams,
+	    	    deduplicated_bams_for_metrics, deduplicated_bams_for_hs_metrics
+
+        script:
+	    """
+	    mkdir tmp
+
+        gatk BuildBamIndex \
+        --java-options '-Xmx8g  -Djava.io.tmpdir=tmp' \
+        --INPUT  ${bam}
+	    """
+	}
 }
 
 if (! params.skip_metrics) {
diff --git a/nextflow.config b/nextflow.config
index 11b6001..62d5efb 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -57,12 +57,12 @@ dag {
   //file = "${params.output}/pipeline_dag.svg"
 }
 
-VERSION = '1.3.1'
+VERSION = '1.4.0'
 DOI = 'https://zenodo.org/badge/latestdoi/358400957'
 
 manifest {
   name = 'TRON-Bioinformatics/tronflow-bam-preprocessing'
-  author = 'Pablo Riesgo-Ferreiro, Özlem Muslu'
+  author = 'Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola'
   homePage = 'https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing'
   description = 'Picard and GATK BAM preprocessing pipeline'
   mainScript = 'main.nf'

From e05f43b5214dd2c6ae0ac9ade908b3c9d39ab73f Mon Sep 17 00:00:00 2001
From: priesgof <priesgoferreiro@gmail.com>
Date: Wed, 26 May 2021 11:06:11 +0200
Subject: [PATCH 3/6] change default settings for reports

---
 nextflow.config | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 62d5efb..61ef140 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -27,6 +27,10 @@ profiles {
     params.known_indels2 = "$baseDir/test_data/Mills_and_1000G_gold_standard.indels.hg19.sites.sorted.minimal.vcf"
     params.intervals = "$baseDir/test_data/minimal_intervals.intervals"
     params.dbsnp = "$baseDir/test_data/dbsnp_138.hg19.minimal.vcf"
+    timeline.enabled = false
+    report.enabled = false
+    trace.enabled = false
+    dag.enabled = false
   }
 }
 
@@ -40,23 +44,6 @@ process.shell = ['/bin/bash', '-euo', 'pipefail']
 
 cleanup = true
 
-timeline {
-  enabled = true
-  //file = "${params.output}/execution_timeline.html"
-}
-report {
-  enabled = true
-  //file = "${params.output}/execution_report.html"
-}
-trace {
-  enabled = true
-  //file = "${params.output}/execution_trace.txt"
-}
-dag {
-  enabled = true
-  //file = "${params.output}/pipeline_dag.svg"
-}
-
 VERSION = '1.4.0'
 DOI = 'https://zenodo.org/badge/latestdoi/358400957'
 

From 4f02aa70e886f2a29cf494ccc4503f9580f63201 Mon Sep 17 00:00:00 2001
From: priesgof <priesgoferreiro@gmail.com>
Date: Wed, 26 May 2021 11:27:04 +0200
Subject: [PATCH 4/6] remove duplicate reads optionally

---
 Makefile        | 16 ++++++++--------
 README.md       |  1 +
 main.nf         |  3 +++
 nextflow.config |  1 +
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index 0daa2b7..90add80 100644
--- a/Makefile
+++ b/Makefile
@@ -8,11 +8,11 @@ clean:
 	rm -rf .nextflow*
 
 test:
-	nextflow main.nf -profile test,conda --output output/test1
-	nextflow main.nf -profile test,conda --skip_bqsr --output output/test2
-	nextflow main.nf -profile test,conda --skip_realignment --output output/test3
-	nextflow main.nf -profile test,conda --skip_deduplication --output output/test4
-	nextflow main.nf -profile test,conda --output output/test5 --skip_metrics
-	nextflow main.nf -profile test,conda --output output/test6 --intervals false
-	nextflow main.nf -profile test,conda --output output/test7 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt
-	nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10
+	#nextflow main.nf -profile test,conda --output output/test1
+	#nextflow main.nf -profile test,conda --skip_bqsr --output output/test2
+	#nextflow main.nf -profile test,conda --skip_realignment --output output/test3
+	#nextflow main.nf -profile test,conda --skip_deduplication --output output/test4
+	#nextflow main.nf -profile test,conda --output output/test5 --skip_metrics
+	#nextflow main.nf -profile test,conda --output output/test6 --intervals false
+	#nextflow main.nf -profile test,conda --output output/test7 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt
+	nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10 --remove_duplicates false
diff --git a/README.md b/README.md
index 0f70558..40de942 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,7 @@ Optional input:
     * --skip_bqsr: optionally skip BQSR (default: false)
     * --skip_realignment: optionally skip realignment (default: false)
     * --skip_deduplication: optionally skip deduplication (default: false)
+    * --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
     * --skip_metrics: optionally skip metrics (default: false)
     * --output: the folder where to publish output (default: ./output)
     * --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)
diff --git a/main.nf b/main.nf
index 043a0e3..eeb8baf 100755
--- a/main.nf
+++ b/main.nf
@@ -13,6 +13,7 @@ params.hs_metrics_per_base_coverage = false
 params.skip_bqsr = false
 params.skip_realignment = false
 params.skip_deduplication = false
+params.remove_duplicates = true
 params.skip_metrics = false
 params.output = false
 params.platform = "ILLUMINA"
@@ -136,6 +137,7 @@ if (!params.skip_deduplication) {
 
         script:
         dedup_metrics = params.skip_metrics ? "": "--metrics-file ${bam.baseName}.dedup_metrics"
+        remove_duplicates = params.remove_duplicates ? "--remove-all-duplicates true" : "--remove-all-duplicates false"
 	    """
 	    mkdir tmp
 
@@ -144,6 +146,7 @@ if (!params.skip_deduplication) {
         --input  ${bam} \
         --output ${bam.baseName}.dedup.bam \
         --conf 'spark.executor.cores=${task.cpus}' \
+        ${remove_duplicates} \
         ${dedup_metrics}
 	    """
 	}
diff --git a/nextflow.config b/nextflow.config
index 61ef140..af79486 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -86,6 +86,7 @@ Optional input:
     * --skip_bqsr: optionally skip BQSR (default: false)
     * --skip_realignment: optionally skip realignment (default: false)
     * --skip_deduplication: optionally skip deduplication (default: false)
+    * --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
     * --skip_metrics: optionally skip metrics (default: false)
     * --output: the folder where to publish output (default: ./output)
     * --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)

From 7ff227bf6bcb5514c63c68708a933814cad72c66 Mon Sep 17 00:00:00 2001
From: priesgof <priesgoferreiro@gmail.com>
Date: Wed, 26 May 2021 11:32:21 +0200
Subject: [PATCH 5/6] uncomment tests

---
 Makefile | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 90add80..ab970e7 100644
--- a/Makefile
+++ b/Makefile
@@ -8,11 +8,11 @@ clean:
 	rm -rf .nextflow*
 
 test:
-	#nextflow main.nf -profile test,conda --output output/test1
-	#nextflow main.nf -profile test,conda --skip_bqsr --output output/test2
-	#nextflow main.nf -profile test,conda --skip_realignment --output output/test3
-	#nextflow main.nf -profile test,conda --skip_deduplication --output output/test4
-	#nextflow main.nf -profile test,conda --output output/test5 --skip_metrics
-	#nextflow main.nf -profile test,conda --output output/test6 --intervals false
-	#nextflow main.nf -profile test,conda --output output/test7 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt
+	nextflow main.nf -profile test,conda --output output/test1
+	nextflow main.nf -profile test,conda --skip_bqsr --output output/test2
+	nextflow main.nf -profile test,conda --skip_realignment --output output/test3
+	nextflow main.nf -profile test,conda --skip_deduplication --output output/test4
+	nextflow main.nf -profile test,conda --output output/test5 --skip_metrics
+	nextflow main.nf -profile test,conda --output output/test6 --intervals false
+	nextflow main.nf -profile test,conda --output output/test7 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt
 	nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10 --remove_duplicates false

From 92c925e5cf7f0a638c989735638335123f24f08e Mon Sep 17 00:00:00 2001
From: priesgof <priesgoferreiro@gmail.com>
Date: Wed, 26 May 2021 12:03:59 +0200
Subject: [PATCH 6/6] limit index memory usage for CI environment

---
 main.nf         | 6 ++++--
 nextflow.config | 2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index eeb8baf..ca9b716 100755
--- a/main.nf
+++ b/main.nf
@@ -31,6 +31,8 @@ params.bqsr_cpus = 3
 params.bqsr_memory = "4g"
 params.metrics_cpus = 1
 params.metrics_memory = "8g"
+params.index_cpus = 1
+params.index_memory = "8g"
 
 
 
@@ -153,8 +155,8 @@ if (!params.skip_deduplication) {
 }
 else {
     process indexBam {
-	    cpus "1"
-        memory "8g"
+	    cpus "${params.index_cpus}"
+        memory "${params.index_memory}"
 	    tag "${name}"
 
 	    input:
diff --git a/nextflow.config b/nextflow.config
index af79486..ec102b4 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -23,6 +23,8 @@ profiles {
     params.bqsr_memory = "3g"
     params.metrics_cpus = 1
     params.metrics_memory = "3g"
+    params.index_cpus = 1
+    params.index_memory = "3g"
     params.known_indels1 = "$baseDir/test_data/1000G_phase1.indels.hg19.sites.minimal.vcf"
     params.known_indels2 = "$baseDir/test_data/Mills_and_1000G_gold_standard.indels.hg19.sites.sorted.minimal.vcf"
     params.intervals = "$baseDir/test_data/minimal_intervals.intervals"