diff --git a/Makefile b/Makefile
index 0daa2b7..ab970e7 100644
--- a/Makefile
+++ b/Makefile
@@ -15,4 +15,4 @@ test:
 	nextflow main.nf -profile test,conda --output output/test5 --skip_metrics
 	nextflow main.nf -profile test,conda --output output/test6 --intervals false
 	nextflow main.nf -profile test,conda --output output/test7 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt
-	nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10
+	nextflow main.nf -profile test,conda --output output/test8 --hs_metrics_target_coverage target_coverage.txt --hs_metrics_per_base_coverage per_base_coverage.txt --collect_hs_metrics_min_base_quality 10 --collect_hs_metrics_min_mapping_quality 10 --remove_duplicates false
diff --git a/README.md b/README.md
index d6dcb1c..40de942 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ GATK has been providing a well known best practices document on BAM preprocessin
 We aim at providing a single implementation of the BAM preprocessing pipeline that can be used across different situations. For this purpose there are some required steps and some optional steps. This is implemented as a Nextflow pipeline to simplify parallelization of execution in the cluster. The default configuration uses reference genome hg19, if another reference is needed the adequate resources must be provided. The reference genome resources for hg19 were downloaded from https://software.broadinstitute.org/gatk/download/bundle
 
-The input is a tab-separated values file where each line corresponds to one input BAM. The output is another tab-separated values file with the absolute paths of the preprocessed and indexed BAMs. 
+The input is a tab-separated values file where each line corresponds to one input BAM. The output is another tab-separated values file with the absolute paths of the preprocessed and indexed BAMs.
 
 ## Implementation
@@ -74,6 +74,7 @@ Optional input:
  * --skip_bqsr: optionally skip BQSR (default: false)
  * --skip_realignment: optionally skip realignment (default: false)
  * --skip_deduplication: optionally skip deduplication (default: false)
+ * --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
  * --skip_metrics: optionally skip metrics (default: false)
  * --output: the folder where to publish output (default: ./output)
  * --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)
diff --git a/environment.yml b/environment.yml
index 56ab345..c8a1021 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,6 +1,6 @@
 # You can use this file to create a conda environment for this pipeline:
 # conda env create -f environment.yml
-name: tronflow-bam-preprocessing-1.3.1
+name: tronflow-bam-preprocessing-1.4.0
 channels:
   - conda-forge
   - bioconda
diff --git a/main.nf b/main.nf
index 8d1282d..ca9b716 100755
--- a/main.nf
+++ b/main.nf
@@ -13,6 +13,7 @@ params.hs_metrics_per_base_coverage = false
 params.skip_bqsr = false
 params.skip_realignment = false
 params.skip_deduplication = false
+params.remove_duplicates = true
 params.skip_metrics = false
 params.output = false
 params.platform = "ILLUMINA"
@@ -30,6 +31,8 @@ params.bqsr_cpus = 3
 params.bqsr_memory = "4g"
 params.metrics_cpus = 1
 params.metrics_memory = "8g"
+params.index_cpus = 1
+params.index_memory = "8g"
 
 
@@ -83,9 +86,10 @@ process prepareBam {
     output:
       set val(name), val("${bam.baseName}"),
-          val(type), file("${bam.baseName}.prepared.bam"),
-          file("${bam.baseName}.prepared.bai") into prepared_bams
+          val(type), file("${bam.baseName}.prepared.bam") into prepared_bams
 
     script:
+    order = params.skip_deduplication ? "--SORT_ORDER coordinate": "--SORT_ORDER queryname"
     """
     mkdir tmp
@@ -109,8 +113,7 @@ process prepareBam {
     --RGSM ${type} \
     --RGLB 1 \
     --RGPL ${params.platform} \
-    --SORT_ORDER coordinate \
-    --CREATE_INDEX true
+    ${order}
     """
 }
@@ -126,7 +129,7 @@ if (!params.skip_deduplication) {
     publishDir "${publish_dir}/${name}/metrics", mode: "copy", pattern: "*.dedup_metrics"
 
     input:
-      set name, bam_name, type, file(bam), file(bai) from prepared_bams
+      set name, bam_name, type, file(bam) from prepared_bams
 
     output:
       set val(name), val(bam_name), val(type),
@@ -136,6 +139,7 @@
     script:
     dedup_metrics = params.skip_metrics ? "": "--metrics-file ${bam.baseName}.dedup_metrics"
+    remove_duplicates = params.remove_duplicates ? "--remove-all-duplicates true" : "--remove-all-duplicates false"
     """
     mkdir tmp
@@ -144,12 +148,34 @@
     --input ${bam} \
     --output ${bam.baseName}.dedup.bam \
     --conf 'spark.executor.cores=${task.cpus}' \
+    ${remove_duplicates} \
     ${dedup_metrics}
     """
   }
 }
 else {
-  prepared_bams.into{ deduplicated_bams; deduplicated_bams_for_metrics; deduplicated_bams_for_hs_metrics}
+  process indexBam {
+    cpus "${params.index_cpus}"
+    memory "${params.index_memory}"
+    tag "${name}"
+
+    input:
+      set name, bam_name, type, file(bam) from prepared_bams
+
+    output:
+      set val(name), val(bam_name), val(type),
+          file("${bam}"), file("${bam.baseName}.bai") into deduplicated_bams,
+          deduplicated_bams_for_metrics, deduplicated_bams_for_hs_metrics
+
+    script:
+    """
+    mkdir tmp
+
+    gatk BuildBamIndex \
+    --java-options '-Xmx8g -Djava.io.tmpdir=tmp' \
+    --INPUT ${bam}
+    """
+  }
 }
 
 if (!params.skip_metrics) {
diff --git a/nextflow.config b/nextflow.config
index 11b6001..ec102b4 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -23,10 +23,16 @@ profiles {
     params.bqsr_memory = "3g"
     params.metrics_cpus = 1
     params.metrics_memory = "3g"
+    params.index_cpus = 1
+    params.index_memory = "3g"
     params.known_indels1 = "$baseDir/test_data/1000G_phase1.indels.hg19.sites.minimal.vcf"
     params.known_indels2 = "$baseDir/test_data/Mills_and_1000G_gold_standard.indels.hg19.sites.sorted.minimal.vcf"
     params.intervals = "$baseDir/test_data/minimal_intervals.intervals"
     params.dbsnp = "$baseDir/test_data/dbsnp_138.hg19.minimal.vcf"
+    timeline.enabled = false
+    report.enabled = false
+    trace.enabled = false
+    dag.enabled = false
   }
 }
@@ -40,29 +46,12 @@
 process.shell = ['/bin/bash', '-euo', 'pipefail']
 
 cleanup = true
 
-timeline {
-  enabled = true
-  //file = "${params.output}/execution_timeline.html"
-}
-report {
-  enabled = true
-  //file = "${params.output}/execution_report.html"
-}
-trace {
-  enabled = true
-  //file = "${params.output}/execution_trace.txt"
-}
-dag {
-  enabled = true
-  //file = "${params.output}/pipeline_dag.svg"
-}
-
-VERSION = '1.3.1'
+VERSION = '1.4.0'
 DOI = 'https://zenodo.org/badge/latestdoi/358400957'
 
 manifest {
   name = 'TRON-Bioinformatics/tronflow-bam-preprocessing'
-  author = 'Pablo Riesgo-Ferreiro, Özlem Muslu'
+  author = 'Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola'
   homePage = 'https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing'
   description = 'Picard and GATK BAM preprocessing pipeline'
   mainScript = 'main.nf'
@@ -99,6 +88,7 @@ Optional input:
   * --skip_bqsr: optionally skip BQSR (default: false)
   * --skip_realignment: optionally skip realignment (default: false)
   * --skip_deduplication: optionally skip deduplication (default: false)
+  * --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
   * --skip_metrics: optionally skip metrics (default: false)
   * --output: the folder where to publish output (default: ./output)
   * --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)
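To try the new deduplication switch locally, a minimal sketch using the same test profile as the Makefile targets (the output directories here are illustrative, not part of the change):

```bash
# 1.4.0 default: MarkDuplicatesSpark runs with --remove-all-duplicates true,
# so duplicate reads are dropped from the output BAM
nextflow main.nf -profile test,conda --output output/dedup_removed

# opt out: keep the 1.3.x behaviour of flagging duplicates without removing them
nextflow main.nf -profile test,conda --output output/dedup_flagged --remove_duplicates false
```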
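The removal of `--CREATE_INDEX true` and the new `--SORT_ORDER` ternary in `prepareBam` fit together: GATK's MarkDuplicatesSpark expects queryname-sorted (or queryname-grouped) input, which is presumably why the prepared BAM is now queryname-sorted whenever deduplication runs, with sorting and indexing happening downstream. When deduplication is skipped, the prepared BAM stays coordinate-sorted and the new `indexBam` process creates the missing `.bai` with `gatk BuildBamIndex`. A sketch of exercising that branch (output path illustrative):

```bash
# coordinate-sorted output; the .bai now comes from the new indexBam process
nextflow main.nf -profile test,conda --output output/no_dedup --skip_deduplication
```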
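Since the always-on `timeline`/`report`/`trace`/`dag` blocks are gone from `nextflow.config` (and explicitly disabled in the test profile), execution reports now have to be requested per run. A possible invocation using Nextflow's standard CLI options:

```bash
# opt back in to execution reports for a single run
nextflow main.nf -profile test,conda --output output/test1 \
    -with-report -with-timeline -with-trace -with-dag
```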