Merge pull request #155 from CenterForMedicalGeneticsGhent/dev

Release PR v1.4.0
nf-cmgg · Dec 6, 2023 · 384ed60 · 384ed60
2 parents 1e8a16e + 649c2b9
commit 384ed60
Show file tree

Hide file tree

Showing 261 changed files with 12,554 additions and 4,136 deletions.
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -2,6 +2,7 @@
     "name": "nfcore",
     "image": "nfcore/gitpod:latest",
     "remoteUser": "gitpod",
+    "runArgs": ["--privileged"],
 
     // Configure tool-specific properties.
     "customizations": {

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -27,7 +27,8 @@ jobs:
           - "23.04.0"
           - "latest-everything"
         test:
-          - "default"
+          - "vardict"
+          - "haplotypecaller"
           - "fails"
           - "annotate_test"
           - "filter_test"

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -70,8 +70,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          pip install nf-core
+          python -m pip install --upgrade --force-reinstall git+https://github.com/nf-core/tools.git@dev
 
       - name: Run nf-core lint
         env:

diff --git a/.github/workflows/release-announcments.yml b/.github/workflows/release-announcments.yml
@@ -0,0 +1,68 @@
+name: release-announcements
+# Automatic release toot and tweet announcements
+on:
+  release:
+    types: [published]
+  workflow_dispatch:
+
+jobs:
+  toot:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: rzr/fediverse-action@master
+        with:
+          access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }}
+          host: "mstdn.science" # custom host if not "mastodon.social" (default)
+          # GitHub event payload
+          # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release
+          message: |
+            Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}!
+
+            Please see the changelog: ${{ github.event.release.html_url }}
+
+  send-tweet:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - name: Install dependencies
+        run: pip install tweepy==4.14.0
+      - name: Send tweet
+        shell: python
+        run: |
+          import os
+          import tweepy
+
+          client = tweepy.Client(
+              access_token=os.getenv("TWITTER_ACCESS_TOKEN"),
+              access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"),
+              consumer_key=os.getenv("TWITTER_CONSUMER_KEY"),
+              consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"),
+          )
+          tweet = os.getenv("TWEET")
+          client.create_tweet(text=tweet)
+        env:
+          TWEET: |
+            Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}!
+
+            Please see the changelog: ${{ github.event.release.html_url }}
+          TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }}
+          TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }}
+          TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }}
+          TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
+
+  bsky-post:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: zentered/bluesky-post-action@v0.0.2
+        with:
+          post: |
+            Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}!
+
+            Please see the changelog: ${{ github.event.release.html_url }}
+        env:
+          BSKY_IDENTIFIER: ${{ secrets.BSKY_IDENTIFIER }}
+          BSKY_PASSWORD: ${{ secrets.BSKY_PASSWORD }}
+          #
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -1,4 +1,3 @@
-repository_type: pipeline
 lint:
   files_exist:
     - CODE_OF_CONDUCT.md
@@ -32,5 +31,11 @@ lint:
     - lib/NfcoreTemplate.groovy
     - .prettierignore
     - LICENSE
-  actions_ci: false
   pipeline_name_conventions: false
+  actions_ci: false
+repository_type: pipeline
+template:
+  author: nvnieuwk
+  description: A nextflow pipeline for calling and annotating variants
+  name: nf-cmgg-germline
+  prefix: CenterForMedicalGeneticsGhent
diff --git a/.prettierignore b/.prettierignore
@@ -2,7 +2,7 @@ email_template.html
 adaptivecard.json
 slackreport.json
 .nextflow*
-.nf-test/
+.nf-test*
 work/
 data/
 results/

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,31 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## v1.4.0 - Kingly Kortrijk - [December 6 2023]
+
+### New Features
+
+1. Added the `--callers` parameter to specify the variant caller to use. Currently only `haplotypecaller` and `vardict` are supported.
+2. Added the `vardict` variant caller.
+3. Added the `--vardict_min_af` parameter to specify the minimum allele frequency for `vardict`. This option is also available in the samplesheet as `vardict_min_af` to set it dynamically per sample.
+4. Added the `--output_genomicsdb` option to specify whether a GenomicsDB should be outputted or not. This will be `true` when using `only_merge`.
+5. Added `--normalize` options for decomposing and normalizing of variants after calling and genotyping.
+6. Added `WGS`, `WES`, `SeqCap`, `HyperCap` and `seqplorer` profiles that can be used to set the default parameters for these types of runs.
+
+### Improvements
+
+1. Refactored the pipeline to accommodate future additions of variant callers and genotypers
+2. Removed a lot of unnecessary bloat
+3. Improved GenomicsDBImport (can now be multithreaded and runs a lot faster). This will make very big runs more possible.
+4. Changed `coverage_fast` to `mosdepth_slow`, reversing the effect of the parameter. By default mosdepth will now be run with `--fast-mode`. This can be disabled using the new `mosdepth_slow` parameter.
+5. Automatically merge the regions that are within 150 bps of each other for the variant calling. This way it's ensured that indel calling happens correctly.
+
+### Fixes
+
+1. Fixed an issue with the outputting of the validation PNG files, now all three types of PNGs are outputted.
+2. Fixed a small issue where VCFs without a sample created by the callers could not be used by `bcftools concat`. These files will now be filtered from the input of the command.
+3. Removed the `--maxentscan` parameter because this file is automatically present in the container
+
 ## v1.3.0 - Happy Hasselt - [July 10 2023]
 
 ### New Features

diff --git a/README.md b/README.md
@@ -7,48 +7,53 @@
 
 ## Introduction
 
-**nf-cmgg-germline** is a bioinformatics best-practice analysis pipeline for calling, joint-genotyping and annotating small variants variants for WGS and WES data. It uses HaplotypeCaller to call variants, GenotypeGVCFs to genotype and EnsemblVEP to annotate.
+**nf-cmgg-germline** is a bioinformatics best-practice analysis pipeline for calling, joint-genotyping and annotating small variants for WGS and WES data. It uses HaplotypeCaller and/or Vardict to call variants, GenotypeGVCFs to genotype and EnsemblVEP to annotate.
 
 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
 
 ## Pipeline summary
 
 ![metro graph](docs/images/nf-cmgg-germline_metro.png)
 
+:::note
+If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how
+to set up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline)
+with `-profile test` before running the workflow on actual data.
+:::
+
 ## Quick Start
 
-1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.10.5`)
+1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=23.04.0`)
 2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_.
 
 `samplesheet.csv`:
 
 ```csv
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
+sample,family,cram,crai
+SAMPLE_1,FAMILY_1,SAMPLE_1.cram,SAMPLE_1.crai
 ```
 
-Each row represents a fastq file (single-end) or a pair of fastq files (paired end).
+Each row represents a single sample to be analysed. More information can be found in the [usage](docs/usage.md) documentation.
 
 -->
 
 Now, you can run the pipeline using:
 
-   <!-- TODO nf-core: Update the example "typical command" below used to run the pipeline -->
-
 ```bash
-nextflow run CenterForMedicalGeneticsGhent/nf-cmgg-germline --input samplesheet.csv --outdir <OUTDIR> --genome GRCh37 -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
+nextflow run CenterForMedicalGeneticsGhent/nf-cmgg-germline --input samplesheet.csv --outdir <OUTDIR> --genome GRCh38 -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
 ```
 
-> **Warning:**
-> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those
-> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;
-> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files).
+:::warning
+Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those
+provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;
+see [docs](https://nf-co.re/usage/configuration#custom-configuration-files).
+:::
 
 ## Credits
 
-nf-cmgg-germline was originally written and is maintained by @nvnieuwk.
+nf-cmgg-germline was originally written and is maintained by [@nvnieuwk](https://github.com/nvnieuwk).
 
-Special thanks to @matthdsm for the many tips and feedback and to @mvheetve for testing the pipeline.
+Special thanks to [@matthdsm](https://github.com/matthdsm) for the many tips and feedback and to [@mvheetve](https://github.com/mvheetve) for testing the pipeline.
 
 ## Contributions and Support
 

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -1,5 +1,5 @@
 report_comment: >
-  This report has been generated by the <a href="https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-germline/1.3.0" target="_blank">CenterForMedicalGeneticsGhent/nf-cmgg-germline</a>
+  This report has been generated by the <a href="https://github.com/CenterForMedicalGeneticsGhent/nf-cmgg-germline/1.4.0" target="_blank">CenterForMedicalGeneticsGhent/nf-cmgg-germline</a>
   analysis pipeline.
 report_section_order:
   "CenterForMedicalGeneticsGhent-nf-cmgg-germline-methods-description":

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,3 +1,3 @@
-sample,family,cram,crai,roi,ped,truth_vcf,truth_tbi,truth_bed
-DNA049572,Proband_12345,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/cram/test.cram,,,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-germline/dev/assets/test.ped,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/vcf/test.vcf.gz,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/vcf/test.vcf.gz.tbi,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/regions/roi.bed
-NA24385D2_NVQ_034,Proband_12345,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/cram/test2.cram,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/cram/test2.cram.crai,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/regions/roi.bed,,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/vcf/test2.vcf.gz,,
+sample,family,cram,crai,roi,ped,truth_vcf,truth_tbi,truth_bed,vardict_min_af
+DNA049572,Proband_12345,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/cram/test.cram,,,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-germline/dev/assets/test.ped,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/vcf/test.vcf.gz,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/vcf/test.vcf.gz.tbi,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/regions/roi.bed,0.01
+NA24385D2_NVQ_034,Proband_12345,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/cram/test2.cram,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/cram/test2.cram.crai,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/regions/roi.bed,,https://raw.githubusercontent.com/CenterForMedicalGeneticsGhent/nf-cmgg-test-datasets/main/data/genomics/homo_sapiens/illumina/vcf/test2.vcf.gz,,,
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -18,47 +18,60 @@
             "cram": {
                 "type": "string",
                 "pattern": "^\\S+\\.cram$",
-                "format": "file-path"
+                "format": "file-path",
+                "exists": true
             },
             "crai": {
                 "type": "string",
                 "pattern": "^\\S+\\.crai$",
-                "format": "file-path"
+                "format": "file-path",
+                "exists": true
             },
             "gvcf": {
                 "type": "string",
                 "pattern": "^\\S+vcf(.gz)?$",
-                "format": "file-path"
+                "format": "file-path",
+                "exists": true
             },
             "tbi": {
                 "type": "string",
                 "pattern": "^\\S+\\.tbi$",
-                "format": "file-path"
+                "format": "file-path",
+                "exists": true
             },
             "roi": {
                 "type": "string",
                 "pattern": "^\\S+\\.bed(\\.gz)?$",
-                "format": "file-path"
+                "format": "file-path",
+                "exists": true
             },
             "ped": {
                 "type": "string",
                 "pattern": "^\\S+\\.ped$",
-                "format": "file-path"
+                "format": "file-path",
+                "exists": true
             },
             "truth_vcf": {
                 "type": "string",
                 "pattern": "^\\S+\\.vcf\\.gz$",
-                "format": "file-path"
+                "format": "file-path",
+                "exists": true
             },
             "truth_tbi": {
                 "type": "string",
                 "pattern": "^\\S+\\.tbi$",
-                "format": "file-path"
+                "format": "file-path",
+                "exists": true
             },
             "truth_bed": {
                 "type": "string",
                 "pattern": "^\\S+\\.bed$",
-                "format": "file-path"
+                "format": "file-path",
+                "exists": true
+            },
+            "vardict_min_af": {
+                "type": "number",
+                "meta": ["vardict_min_af"]
             }
         },
         "required": ["sample"]

diff --git a/assets/vardict.header.vcf.gz b/assets/vardict.header.vcf.gz
diff --git a/conf/HyperCap.config b/conf/HyperCap.config
@@ -0,0 +1,20 @@
+/*
+========================================================================================
+    Nextflow config file for HyperCap runs
+========================================================================================
+*/
+
+params {
+    callers             = "vardict"
+    filter              = true
+    normalize           = true
+    annotate            = true
+    vcfanno             = true
+    scatter_count       = 5
+    vep_dbnsfp          = true
+    vep_spliceai        = true
+    vep_spliceregion    = true
+    vep_mastermind      = true
+    vep_maxentscan      = true
+    vep_eog             = true
+}
diff --git a/conf/SeqCap.config b/conf/SeqCap.config
@@ -0,0 +1,12 @@
+/*
+========================================================================================
+    Nextflow config file for SeqCap runs
+========================================================================================
+*/
+
+params {
+    callers   = "vardict"
+    filter    = true
+    normalize = true
+    scatter_count = 14
+}
diff --git a/conf/WES.config b/conf/WES.config
@@ -0,0 +1,11 @@
+/*
+========================================================================================
+    Nextflow config file for WES runs
+========================================================================================
+*/
+
+params {
+    roi             = "${params.genomes_base}/Hsapiens/GRCh38.p14/regions/CMGG_WES_analysis_ROI_v4.bed"
+    scatter_count   = 14
+
+}
diff --git a/conf/WGS.config b/conf/WGS.config
@@ -0,0 +1,9 @@
+/*
+========================================================================================
+    Nextflow config file for WGS runs
+========================================================================================
+*/
+
+params {
+    scatter_count = 30
+}