Skip to content

Commit

Permalink
Merge pull request #10 from TRON-Bioinformatics/rna-support
Browse files (browse the repository at this point in the history)
RNA support
  • Loading branch information
priesgo authored Oct 18, 2022
2 parents 802fa8c + 7c514be commit 6a64388
Show file tree
Hide file tree
Showing 11 changed files with 138 additions and 56 deletions.
1 change: 1 addition & 0 deletions .github/workflows/automated_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,5 @@ jobs:
key: ${{ runner.os }}-tronflow-bam-preprocessing
- name: Run tests
run: |
export NXF_VER=22.04.5
make
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ test:
bash tests/test_07.sh
bash tests/test_08.sh
bash tests/test_09.sh
bash tests/test_10.sh
19 changes: 13 additions & 6 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
nextflow.enable.dsl = 2

include { PREPARE_BAM; INDEX_BAM } from './modules/01_prepare_bam'
include { MARK_DUPLICATES } from './modules/02_mark_duplicates'
include { METRICS; HS_METRICS; COVERAGE_ANALYSIS } from './modules/03_metrics'
include { MARK_DUPLICATES; SPLIT_CIGAR_N_READS } from './modules/02_mark_duplicates'
include { METRICS; HS_METRICS; COVERAGE_ANALYSIS; FLAGSTAT } from './modules/03_metrics'
include { REALIGNMENT_AROUND_INDELS } from './modules/04_realignment_around_indels'
include { BQSR; CREATE_OUTPUT } from './modules/05_bqsr'

Expand All @@ -26,6 +26,7 @@ params.output = 'output'
params.platform = "ILLUMINA"
params.collect_hs_metrics_min_base_quality = false
params.collect_hs_metrics_min_mapping_quality = false
params.split_cigarn = false

// computational resources
params.prepare_bam_cpus = 3
Expand Down Expand Up @@ -84,7 +85,7 @@ else if (params.input_files) {

workflow {

PREPARE_BAM(input_files)
PREPARE_BAM(input_files, params.reference)

if (!params.skip_deduplication) {
MARK_DUPLICATES(PREPARE_BAM.out.prepared_bams)
Expand All @@ -95,24 +96,30 @@ workflow {
deduplicated_bams = INDEX_BAM.out.indexed_bams
}

if (params.split_cigarn) {
SPLIT_CIGAR_N_READS(deduplicated_bams, params.reference)
deduplicated_bams = SPLIT_CIGAR_N_READS.out.split_cigarn_bams
}

if (! params.skip_metrics) {
if (params.intervals) {
HS_METRICS(deduplicated_bams)
}
METRICS(deduplicated_bams)
METRICS(deduplicated_bams, params.reference)
COVERAGE_ANALYSIS(deduplicated_bams)
FLAGSTAT(deduplicated_bams)
}

if (!params.skip_realignment) {
REALIGNMENT_AROUND_INDELS(deduplicated_bams)
REALIGNMENT_AROUND_INDELS(deduplicated_bams, params.reference)
realigned_bams = REALIGNMENT_AROUND_INDELS.out.realigned_bams
}
else {
realigned_bams = deduplicated_bams
}

if (!params.skip_bqsr) {
BQSR(realigned_bams)
BQSR(realigned_bams, params.reference)
preprocessed_bams = BQSR.out.recalibrated_bams
}
else {
Expand Down
41 changes: 20 additions & 21 deletions modules/01_prepare_bam.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ params.prepare_bam_memory = "8g"
params.index_cpus = 1
params.index_memory = "8g"
params.platform = "ILLUMINA"
params.reference = false
params.skip_deduplication = false
params.output = 'output'

/*
Expand All @@ -18,35 +16,31 @@ process PREPARE_BAM {
tag "${name}"
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"

conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0 bioconda::samtools=1.12" : null)
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)

input:
tuple val(name), val(type), file(bam)
val(reference)

output:
tuple val(name), val(type), file("${name}.prepared.bam"), emit: prepared_bams
file("software_versions.${task.process}.txt")

script:
order = params.skip_deduplication ? "--SORT_ORDER coordinate": "--SORT_ORDER queryname"
"""
mkdir tmp
samtools sort \
--threads ${params.prepare_bam_cpus} \
-o ${name}.sorted.bam ${bam}
gatk AddOrReplaceReadGroups \
--java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \
--VALIDATION_STRINGENCY SILENT \
--INPUT ${name}.sorted.bam \
--INPUT ${bam} \
--OUTPUT /dev/stdout \
--REFERENCE_SEQUENCE ${params.reference} \
--REFERENCE_SEQUENCE ${reference} \
--RGPU 1 \
--RGID 1 \
--RGSM ${type} \
--RGLB 1 \
--RGPL ${params.platform} ${order} | \
--RGPL ${params.platform} | \
gatk CleanSam \
--java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \
--INPUT /dev/stdin \
Expand All @@ -55,13 +49,10 @@ process PREPARE_BAM {
--java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \
--INPUT /dev/stdin \
--OUTPUT ${name}.prepared.bam \
--SEQUENCE_DICTIONARY ${params.reference}
rm -f ${name}.sorted.bam
--SEQUENCE_DICTIONARY ${reference}
echo ${params.manifest} >> software_versions.${task.process}.txt
gatk --version >> software_versions.${task.process}.txt
samtools --version >> software_versions.${task.process}.txt
"""
}

Expand All @@ -71,24 +62,32 @@ process INDEX_BAM {
tag "${name}"
publishDir "${params.output}/${name}", mode: "copy", pattern: "software_versions.*"

conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null)

input:
tuple val(name), val(type), file(bam)

output:
tuple val(name), val(type), file("${bam}"), file("${bam.baseName}.bai"), emit: indexed_bams
tuple val(name), val(type), file("${name}.sorted.bam"), file("${name}.sorted.bam.bai"), emit: indexed_bams
file("software_versions.${task.process}.txt")

script:
"""
mkdir tmp
gatk BuildBamIndex \
--java-options '-Xmx8g -Djava.io.tmpdir=./tmp' \
--INPUT ${bam}
# sort
sambamba sort \
--nthreads=${task.cpus} \
--tmpdir=./tmp \
--out=${name}.sorted.bam \
${bam}
# indexes the output BAM file
sambamba index \
--nthreads=${task.cpus} \
${name}.sorted.bam ${name}.sorted.bam.bai
echo ${params.manifest} >> software_versions.${task.process}.txt
gatk --version >> software_versions.${task.process}.txt
sambamba --version >> software_versions.${task.process}.txt
"""
}
71 changes: 52 additions & 19 deletions modules/02_mark_duplicates.nf
Original file line number Diff line number Diff line change
@@ -1,50 +1,83 @@
params.mark_duplicates_cpus = 2
params.mark_duplicates_memory = "16g"
params.remove_duplicates = true
params.skip_metrics = false
params.output = 'output'


process MARK_DUPLICATES {
cpus "${params.mark_duplicates_cpus}"
memory "${params.mark_duplicates_memory}"
tag "${name}"
publishDir "${params.output}/${name}/metrics/mark_duplicates", mode: "copy", pattern: "*.dedup_metrics.txt"
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"

conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null)

input:
tuple val(name), val(type), file(bam)

output:
tuple val(name), val(type), file("${name}.dedup.bam"), file("${name}.dedup.bam.bai"), emit: deduplicated_bams
file("${name}.dedup_metrics.txt") optional true
file("software_versions.${task.process}.txt")

script:
dedup_metrics = params.skip_metrics ? "": "--METRICS_FILE ${name}.dedup_metrics.txt"
remove_duplicates = params.remove_duplicates ? "--REMOVE_DUPLICATES true" : "--REMOVE_DUPLICATES false"
remove_duplicates_param = params.remove_duplicates ? "--remove-duplicates" : ""
"""
mkdir tmp
gatk SortSam \
--java-options '-Xmx${params.mark_duplicates_memory} -Djava.io.tmpdir=./tmp' \
--INPUT ${bam} \
--OUTPUT ${name}.sorted.bam \
--SORT_ORDER coordinate
gatk MarkDuplicates \
--java-options '-Xmx${params.mark_duplicates_memory} -Djava.io.tmpdir=./tmp' \
--INPUT ${name}.sorted.bam \
--OUTPUT ${name}.dedup.bam \
--ASSUME_SORT_ORDER coordinate \
--CREATE_INDEX true ${remove_duplicates} ${dedup_metrics}
# sort
sambamba sort \
--nthreads=${task.cpus} \
--tmpdir=./tmp \
--out=${name}.sorted.bam \
${bam}
cp ${name}.dedup.bai ${name}.dedup.bam.bai
# removes duplicates (sorted from the alignment process)
sambamba markdup ${remove_duplicates_param} \
--nthreads=${task.cpus} \
--tmpdir=./tmp \
${name}.sorted.bam ${name}.dedup.bam
rm -f ${name}.sorted.bam
# indexes the output BAM file
sambamba index \
--nthreads=${task.cpus} \
${name}.dedup.bam ${name}.dedup.bam.bai
echo ${params.manifest} >> software_versions.${task.process}.txt
sambamba --version >> software_versions.${task.process}.txt
"""
}

process SPLIT_CIGAR_N_READS {
cpus "${params.prepare_bam_cpus}"
memory "${params.prepare_bam_memory}"
tag "${name}"
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"

conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)

input:
tuple val(name), val(type), file(bam), file(bai)
val(reference)

output:
tuple val(name), val(type), file("${name}.split_cigarn.bam"), file("${name}.split_cigarn.bam.bai"), emit: split_cigarn_bams
file("software_versions.${task.process}.txt")

script:
"""
mkdir tmp
gatk SplitNCigarReads \
--java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \
--input ${bam} \
--output ${name}.split_cigarn.bam \
--create-output-bam-index true \
--reference ${reference}
cp ${name}.split_cigarn.bai ${name}.split_cigarn.bam.bai
echo ${params.manifest} >> software_versions.${task.process}.txt
gatk --version >> software_versions.${task.process}.txt
"""
Expand Down
32 changes: 30 additions & 2 deletions modules/03_metrics.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ params.metrics_cpus = 1
params.metrics_memory = "8g"
params.collect_hs_metrics_min_base_quality = false
params.collect_hs_metrics_min_mapping_quality = false
params.reference = false
params.output = 'output'
params.intervals = false

Expand Down Expand Up @@ -63,6 +62,7 @@ process METRICS {

input:
tuple val(name), val(type), file(bam), file(bai)
val(reference)

output:
file("*_metrics") optional true
Expand All @@ -76,7 +76,7 @@ process METRICS {
--java-options '-Xmx${params.metrics_memory} -Djava.io.tmpdir=./tmp' \
--INPUT ${bam} \
--OUTPUT ${name} \
--REFERENCE_SEQUENCE ${params.reference} \
--REFERENCE_SEQUENCE ${reference} \
--PROGRAM QualityScoreDistribution \
--PROGRAM MeanQualityByCycle \
--PROGRAM CollectAlignmentSummaryMetrics \
Expand Down Expand Up @@ -122,3 +122,31 @@ process COVERAGE_ANALYSIS {
samtools --version >> software_versions.${task.process}.txt
"""
}

/*
Computes alignment flag statistics for a BAM file with `sambamba flagstat`
and writes them to "${name}.flagstat.csv".

Input:  tuple (name, type, bam, bai). Only `name` and `bam` are referenced by
        the directives and the script below; `type` and `bai` are part of the
        tuple but never used explicitly in this process.
Output: "${name}.flagstat.csv" and "software_versions.${task.process}.txt".
*/
process FLAGSTAT {
// CPU and memory requests come from the shared metrics parameters
cpus "${params.metrics_cpus}"
memory "${params.metrics_memory}"
tag "${name}"
// the flagstat report is copied to the per-sample metrics/flagstat folder,
// the software versions file to the per-sample output folder
publishDir "${params.output}/${name}/metrics/flagstat", mode: "copy", pattern: "*.flagstat.csv"
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
// a conda environment with sambamba 0.8.2 is requested only when params.enable_conda is set
conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null)
input:
tuple val(name), val(type), file(bam), file(bai)
output:
file("${name}.flagstat.csv")
file("software_versions.${task.process}.txt")
script:
// sambamba's standard output is redirected into the report file; the thread count
// follows the cpus directive via task.cpus.
// NOTE(review): --tabular presumably switches sambamba to CSV output, matching the
// .csv file name -- confirm against the sambamba flagstat documentation.
// The pipeline manifest and the sambamba version are then appended to the
// software versions file.
"""
sambamba flagstat \
--nthreads=${task.cpus} \
--tabular \
${bam} > ${name}.flagstat.csv
echo ${params.manifest} >> software_versions.${task.process}.txt
sambamba --version >> software_versions.${task.process}.txt
"""
}
6 changes: 3 additions & 3 deletions modules/04_realignment_around_indels.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ params.realignment_around_indels_cpus = 2
params.realignment_around_indels_memory = "31g"
params.known_indels1 = false
params.known_indels2 = false
params.reference = false
params.output = 'output'


Expand All @@ -19,6 +18,7 @@ process REALIGNMENT_AROUND_INDELS {

input:
tuple val(name), val(type), file(bam), file(bai)
val(reference)

output:
tuple val(name), val(type), file("${name}.realigned.bam"), file("${name}.realigned.bai"), emit: realigned_bams
Expand All @@ -36,12 +36,12 @@ process REALIGNMENT_AROUND_INDELS {
gatk3 -Xmx${params.realignment_around_indels_memory} -Djava.io.tmpdir=./tmp -T RealignerTargetCreator \
--input_file ${bam} \
--out ${name}.RA.intervals \
--reference_sequence ${params.reference} ${known_indels1} ${known_indels2}
--reference_sequence ${reference} ${known_indels1} ${known_indels2}
gatk3 -Xmx${params.realignment_around_indels_memory} -Djava.io.tmpdir=./tmp -T IndelRealigner \
--input_file ${bam} \
--out ${name}.realigned.bam \
--reference_sequence ${params.reference} \
--reference_sequence ${reference} \
--targetIntervals ${name}.RA.intervals \
--consensusDeterminationModel USE_SW \
--LODThresholdForCleaning 0.4 \
Expand Down
Loading

0 comments on commit 6a64388

Please sign in to comment.