diff --git a/.gitignore b/.gitignore index efca3e5..7ddc738 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ resources/** logs/** .snakemake .snakemake/** +.test/results/* diff --git a/.test/config/config.yml b/.test/config/config.yml new file mode 100644 index 0000000..51020b9 --- /dev/null +++ b/.test/config/config.yml @@ -0,0 +1,34 @@ +samplesheet: "config/samples.tsv" + +get_genome: + database: "ncbi" + assembly: "GCF_000006785.2" + fasta: Null + gff: Null + gff_source_type: + [ + "RefSeq": "gene", + "RefSeq": "pseudogene", + "RefSeq": "CDS", + "Protein Homology": "CDS", + ] + +simulate_reads: + read_length: 100 + read_number: 100000 + random_freq: 0.01 + +cutadapt: + threep_adapter: "-a ATCGTAGATCGG" + fivep_adapter: "-A GATGGCGATAGG" + default: ["-q 10 ", "-m 25 ", "-M 100", "--overlap=5"] + +multiqc: + config: "config/multiqc_config.yml" + +report: + export_figures: True + export_dir: "figures/" + figure_width: 875 + figure_height: 500 + figure_resolution: 125 diff --git a/.test/config/multiqc_config.yml b/.test/config/multiqc_config.yml new file mode 100644 index 0000000..2f9d44b --- /dev/null +++ b/.test/config/multiqc_config.yml @@ -0,0 +1,2 @@ +remove_sections: + - samtools-stats \ No newline at end of file diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv new file mode 100644 index 0000000..b931d64 --- /dev/null +++ b/.test/config/samples.tsv @@ -0,0 +1,3 @@ +sample condition replicate read1 read2 +sample1 wild_type 1 sample1.bwa.read1.fastq.gz sample1.bwa.read2.fastq.gz +sample2 wild_type 2 sample2.bwa.read1.fastq.gz sample2.bwa.read2.fastq.gz diff --git a/README.md b/README.md index aab998b..ecaed19 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,109 @@ # Snakemake workflow: `` -[![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) -[![GitHub actions status](https://github.com///workflows/Tests/badge.svg?branch=main)](https://github.com///actions?query=branch%3Amain+workflow%3ATests) - +[![Snakemake](https://img.shields.io/badge/snakemake-≥8.0.0-brightgreen.svg)](https://snakemake.github.io) +[![GitHub actions status](https://github.com/MPUSP/snakemake-workflow-template/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/MPUSP/snakemake-workflow-template/actions/workflows/main.yml) +[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) +[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1D355C.svg?labelColor=000000)](https://sylabs.io/docs/) +[![workflow catalog](https://img.shields.io/badge/Snakemake%20workflow%20catalog-darkgreen)](https://snakemake.github.io/snakemake-workflow-catalog) A Snakemake workflow for `` +- [Snakemake workflow: ``](#snakemake-workflow-name) + - [Usage](#usage) + - [Workflow overview](#workflow-overview) + - [Running the workflow](#running-the-workflow) + - [Input data](#input-data) + - [Execution](#execution) + - [Parameters](#parameters) + - [Authors](#authors) + - [References](#references) + - [TODO](#todo) ## Usage The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=%2F). -If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) sitory and its DOI (see above). +If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this repository or its DOI. + +## Workflow overview + +This workflow is a best-practice workflow for ``. +The workflow is built using [snakemake](https://snakemake.readthedocs.io/en/stable/) and consists of the following steps: + +1. Parse sample sheet containing sample meta data (`python`) +2. Simulate short read sequencing data on the fly (`dwgsim`) +3. Check quality of input read data (`FastQC`) +4. Trim adapters from input data (`cutadapt`) +5. Collect statistics from tool output (`MultiQC`) + +## Running the workflow + +### Input data + +This template workflow contains artifical sequencing data in `*.fastq.gz` format. +The test data is located in `.test/data`. Input files are supplied with a mandatory table, whose location is indicated in the `config.yml` file (default: `.test/samples.tsv`). The sample sheet has the following layout: + +| sample | condition | replicate | data_folder | fq1 | +| -------- | --------- | --------- | ----------- | ------------------------ | +| RPF-RTP1 | RPF-RTP | 1 | data | RPF-RTP1_R1_001.fastq.gz | +| RPF-RTP2 | RPF-RTP | 2 | data | RPF-RTP2_R1_001.fastq.gz | + +### Execution + +To run the workflow from command line, change the working directory. + +```bash +cd path/to/snakemake-workflow-name +``` + +Adjust options in the default config file `config/config.yml`. +Before running the entire workflow, you can perform a dry run using: + +```bash +snakemake --dry-run +``` + +To run the complete workflow with test files using **conda**, execute the following command. The definition of the number of compute cores is mandatory. + +```bash +snakemake --cores 10 --sdm conda --directory .test +``` + +To run the workflow with **singularity** / **apptainer**, use: + +```bash +snakemake --cores 10 --sdm conda apptainer --directory .test +``` + +### Parameters + +This table lists all parameters that can be used to run the workflow. + +| parameter | type | details | default | +| ---------------------- | ---- | ------------------------------------------- | -------------------------------------------- | +| **samplesheet** | | | | +| path | str | path to samplesheet, mandatory | "config/samples.tsv" | +| **cutadapt** | | | | +| fivep_adapter | str | sequence of the 5' adapter | Null | +| threep_adapter | str | sequence of the 3' adapter | `ATCGTAGATCGGAAGAGCACACGTCTGAA` | +| default | str | additional options passed to `cutadapt` | [`-q 10 `, `-m 22 `, `-M 52`, `--overlap=3`] | + +## Authors + +- Firstname Lastname + - Affiliation + - ORCID profile + - home page + +## References + +> Köster, J., Mölder, F., Jablonski, K. P., Letcher, B., Hall, M. B., Tomkins-Tinch, C. H., Sochat, V., Forster, J., Lee, S., Twardziok, S. O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., & Nahnsen, S. *Sustainable data analysis with Snakemake*. F1000Research, 10:33, 10, 33, **2021**. https://doi.org/10.12688/f1000research.29032.2. -# TODO +## TODO * Replace `` and `` everywhere in the template (also under .github/workflows) with the correct `` name and owning user or organization. * Replace `` with the workflow name (can be the same as ``). * Replace `` with a description of what the workflow does. +* Update the workflow description, parameters, running options, authors and references in the `README.md` +* Update the `README.md` badges. Add or remove badges for `conda`/`singularity`/`apptainer` usage depending on the workflow's capability * The workflow will occur in the snakemake-workflow-catalog once it has been made public. Then the link under "Usage" will point to the usage instructions if `` and `` were correctly set. \ No newline at end of file diff --git a/config/config.yml b/config/config.yml new file mode 100644 index 0000000..94fffcd --- /dev/null +++ b/config/config.yml @@ -0,0 +1,34 @@ +samplesheet: ".test/config/samples.tsv" + +get_genome: + database: "ncbi" + assembly: "GCF_000006785.2" + fasta: Null + gff: Null + gff_source_type: + [ + "RefSeq": "gene", + "RefSeq": "pseudogene", + "RefSeq": "CDS", + "Protein Homology": "CDS", + ] + +simulate_reads: + read_length: 100 + read_number: 100000 + random_freq: 0.01 + +cutadapt: + threep_adapter: "-a ATCGTAGATCGG" + fivep_adapter: "-A GATGGCGATAGG" + default: ["-q 10 ", "-m 25 ", "-M 100", "--overlap=5"] + +multiqc: + config: "config/multiqc_config.yml" + +report: + export_figures: True + export_dir: "figures/" + figure_width: 875 + figure_height: 500 + figure_resolution: 125 diff --git a/config/multiqc_config.yml b/config/multiqc_config.yml new file mode 100644 index 0000000..2f9d44b --- /dev/null +++ b/config/multiqc_config.yml @@ -0,0 +1,2 @@ +remove_sections: + - samtools-stats \ No newline at end of file diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml new file mode 100644 index 0000000..6a6d8fd --- /dev/null +++ b/config/schemas/config.schema.yml @@ -0,0 +1,44 @@ +$schema: "http://json-schema.org/draft-07/schema#" +description: an entry in the sample sheet +properties: + samplesheet: + type: string + description: sample name/identifier + + get_genome: + properties: + database: + type: ["string", "null"] + assembly: + type: ["string", "null"] + fasta: + type: ["string", "null"] + gff: + type: ["string", "null"] + gff_source_type: + type: array + + simulate_reads: + properties: + read_length: + type: number + read_number: + type: number + random_freq: + type: number + + cutadapt: + properties: + threep_adapter: + type: string + fivep_adapter: + type: string + default: + type: array + + multiqc: + properties: + config: + type: string + +required: ["samplesheet", "get_genome", "simulate_reads", "cutadapt", "multiqc"] diff --git a/config/schemas/samples.schema.yml b/config/schemas/samples.schema.yml new file mode 100644 index 0000000..d7ecee3 --- /dev/null +++ b/config/schemas/samples.schema.yml @@ -0,0 +1,25 @@ +$schema: "http://json-schema.org/draft-07/schema#" +description: an entry in the sample sheet +properties: + sample: + type: string + description: sample name/identifier + condition: + type: string + description: sample condition that will be compared during differential analysis + replicate: + type: number + default: 1 + description: consecutive numbers representing multiple replicates of one condition + read1: + type: string + description: names of fastq.gz files, read 1 + read2: + type: string + description: names of fastq.gz files, read 2 (optional) + +required: + - sample + - condition + - replicate + - read1 diff --git a/workflow/Snakefile b/workflow/Snakefile index 3826ec5..793538f 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,4 +1,45 @@ -# Main entrypoint of the workflow. -# Please follow the best practices: +# Main entrypoint of the workflow. +# Please follow the best practices: # https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html, -# in particular regarding the standardized folder structure mentioned there. +# in particular regarding the standardized folder structure mentioned there. + + +import os +import pandas as pd + + +# load configuration +# ----------------------------------------------------- +configfile: "config/config.yml" + + +# container definition: uncomment to include a singularity image, e.g. from github's container registry +# container: "oras://ghcr.io//:" + + +# load rules +# ----------------------------------------------------- +include: "rules/common.smk" +include: "rules/process_reads.smk" + + +# optional messages, log and error handling +# ----------------------------------------------------- +onstart: + print("\n--- Analysis started ---\n") + + +onsuccess: + print("--- Workflow finished! ---") + + +onerror: + print("--- An error occurred! ---") + + +# target rules +# ----------------------------------------------------- +rule all: + input: + "results/multiqc/multiqc_report.html", + default_target: True diff --git a/workflow/envs/cutadapt.yml b/workflow/envs/cutadapt.yml new file mode 100644 index 0000000..e6bbff7 --- /dev/null +++ b/workflow/envs/cutadapt.yml @@ -0,0 +1,6 @@ +name: cutadapt +channels: + - conda-forge + - bioconda +dependencies: + - cutadapt=4.9 diff --git a/workflow/envs/fastqc.yml b/workflow/envs/fastqc.yml new file mode 100644 index 0000000..25e6d86 --- /dev/null +++ b/workflow/envs/fastqc.yml @@ -0,0 +1,6 @@ +name: fastqc +channels: + - conda-forge + - bioconda +dependencies: + - fastqc=0.12.1 diff --git a/workflow/envs/get_genome.yml b/workflow/envs/get_genome.yml new file mode 100644 index 0000000..79d325e --- /dev/null +++ b/workflow/envs/get_genome.yml @@ -0,0 +1,9 @@ +name: get_genome +channels: + - conda-forge + - bioconda +dependencies: + - unzip=6.0 + - ncbi-datasets-cli=16.23.0 + - bcbio-gff=0.7.1 + - samtools=1.20 diff --git a/workflow/envs/multiqc.yml b/workflow/envs/multiqc.yml new file mode 100644 index 0000000..b99e9c3 --- /dev/null +++ b/workflow/envs/multiqc.yml @@ -0,0 +1,7 @@ +name: multiqc +channels: + - conda-forge + - bioconda +dependencies: + - python=3.9 + - multiqc=1.14 diff --git a/workflow/envs/simulate_reads.yml b/workflow/envs/simulate_reads.yml new file mode 100644 index 0000000..b8d2db2 --- /dev/null +++ b/workflow/envs/simulate_reads.yml @@ -0,0 +1,6 @@ +name: get_genome +channels: + - conda-forge + - bioconda +dependencies: + - dwgsim=1.1.14 diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk new file mode 100644 index 0000000..4482e09 --- /dev/null +++ b/workflow/rules/common.smk @@ -0,0 +1,17 @@ +# import basic packages +import pandas as pd +from snakemake.utils import validate +from os import path + + +# read sample sheet +samples = ( + pd.read_csv(config["samplesheet"], sep="\t", dtype={"sample": str}) + .set_index("sample", drop=False) + .sort_index() +) + + +# validate sample sheet and config file +validate(samples, schema="../../config/schemas/samples.schema.yml") +validate(config, schema="../../config/schemas/config.schema.yml") diff --git a/workflow/rules/process_reads.smk b/workflow/rules/process_reads.smk new file mode 100644 index 0000000..ec49a82 --- /dev/null +++ b/workflow/rules/process_reads.smk @@ -0,0 +1,122 @@ +# ----------------------------------------------------- # +# EXAMPLE WORKFLOW # +# ----------------------------------------------------- # + + +# module to fetch genome from NCBI or Ensemble +# ----------------------------------------------------- +rule get_genome: + output: + path=directory("results/get_genome"), + fasta="results/get_genome/genome.fasta", + gff="results/get_genome/genome.gff", + conda: + "../envs/get_genome.yml" + message: + """--- Parsing genome GFF and FASTA files.""" + params: + database=config["get_genome"]["database"], + assembly=config["get_genome"]["assembly"], + fasta=config["get_genome"]["fasta"], + gff=config["get_genome"]["gff"], + log: + path="results/get_genome/log/get_genome.log", + script: + "../scripts/get_genome.py" + + +# module simulate reads +# ----------------------------------------------------- +rule simulate_reads: + input: + fasta=rules.get_genome.output.fasta, + output: + fastq1="results/simulate_reads/{sample}.bwa.read1.fastq.gz", + fastq2="results/simulate_reads/{sample}.bwa.read2.fastq.gz", + conda: + "../envs/simulate_reads.yml" + message: + """--- Simulating reads with DWGSIM.""" + params: + read_length=config["simulate_reads"]["read_length"], + read_number=config["simulate_reads"]["read_number"], + random_freq=config["simulate_reads"]["random_freq"], + log: + path="results/simulate_reads/{sample}.log", + shell: + "prefix=`echo {output.fastq1} | cut -f 1 -d .`;" + "dwgsim -1 {params.read_length} -2 {params.read_length} " + "-N {params.read_number} -o 1 -y {params.random_freq} {input.fasta} ${{prefix}} &> {log.path}" + + +# module to make QC report +# ----------------------------------------------------- +rule fastqc: + input: + fastq="results/simulate_reads/{sample}.bwa.{read}.fastq.gz", + output: + report="results/fastqc/{sample}.bwa.{read}_fastqc.html", + conda: + "../envs/fastqc.yml" + message: + """--- Checking fastq files with FastQC.""" + log: + path="results/fastqc/{sample}.bwa.{read}.log", + shell: + "prefix=`echo {output.report} | cut -f 1-2 -d /`;" + "fastqc --nogroup --extract --quiet --threads {threads} -o ${{prefix}} {input.fastq} > {log}" + + +# module to trim adapters from reads +# ----------------------------------------------------- +rule cutadapt: + input: + fastq1="results/simulate_reads/{sample}.bwa.read1.fastq.gz", + fastq2="results/simulate_reads/{sample}.bwa.read2.fastq.gz", + output: + fastq1="results/cutadapt/{sample}.bwa.read1.fastq.gz", + fastq2="results/cutadapt/{sample}.bwa.read2.fastq.gz", + conda: + "../envs/cutadapt.yml" + message: + """--- Trim adapters from reads.""" + params: + threep_adapter=config["cutadapt"]["threep_adapter"], + fivep_adapter=config["cutadapt"]["fivep_adapter"], + default=config["cutadapt"]["default"], + log: + stdout="results/cutadapt/{sample}.bwa.log", + stderr="results/cutadapt/{sample}.bwa.stderr", + threads: int(workflow.cores * 0.25) + shell: + "cutadapt {params.threep_adapter} {params.fivep_adapter} --cores {threads} " + "-o {output.fastq1} -p {output.fastq2} {input.fastq1} {input.fastq1} > {log.stdout} 2> {log.stderr}" + + +# module to run multiQC on input + processed files +# ----------------------------------------------------- +rule multiqc: + input: + expand( + "results/cutadapt/{sample}.bwa.{read}.fastq.gz", + sample=samples.index, + read=["read1", "read2"], + ), + expand( + "results/fastqc/{sample}.bwa.{read}_fastqc.html", + sample=samples.index, + read=["read1", "read2"], + ), + output: + report="results/multiqc/multiqc_report.html", + conda: + "../envs/multiqc.yml" + message: + """--- Generating MultiQC report for seq data.""" + params: + config=config["multiqc"]["config"], + log: + path="results/multiqc/log/multiqc.log", + shell: + "outdir=`echo {output.report} | cut -f 1-2 -d /`; " + "multiqc -c {params.config} --force --verbose --dirs --outdir ${{outdir}} results &> {log.path}" diff --git a/workflow/scripts/get_genome.py b/workflow/scripts/get_genome.py new file mode 100644 index 0000000..107dad9 --- /dev/null +++ b/workflow/scripts/get_genome.py @@ -0,0 +1,159 @@ +#!/usr/bin/python3 + +# GET GENOME +# ----------------------------------------------------------------------------- +# +# This script attempts to download genome sequence (FASTA) and +# genome annotation (GFF / GTF) files from NCBI using the NCBI datasets +# API, or a similar database. Also, the genome sequence is indexed using samtools. +# Alternatively, a FASTA and GFF file can be +# supplied by the user. Input is roughly checked for validity. + +from os import path +from BCBio.GFF import GFFExaminer +from BCBio import GFF +from subprocess import getoutput + +input_database = snakemake.params["database"] +input_assembly = snakemake.params["assembly"] +input_fasta = snakemake.params["fasta"] +input_gff = snakemake.params["gff"] +output_path = snakemake.output["path"] +output_fasta = snakemake.output["fasta"] +output_gff = snakemake.output["gff"] +output_log = snakemake.log["path"] +log = [] +error = [] + + +def check_fasta(input_fasta, log=[], error=[]): + with open(input_fasta, "r") as fasta_file: + fasta = fasta_file.read() + n_items = fasta.count(">") + if n_items: + log += [f"Supplied fasta file '{input_fasta}' was found"] + log += [f"Supplied fasta file contains {n_items} items"] + else: + error += ["The supplied fasta file contains no valid entries starting with '>'"] + return fasta, log, error + + +def check_gff(input_gff, log=[], error=[]): + with open(input_gff, "r") as gff_file: + gff_examiner = GFFExaminer() + log += [f"Supplied GFF file '{input_gff}' was found"] + gff_summary = gff_examiner.available_limits(gff_file) + log += [ + f"Supplied GFF file contains the following items:", + "--------------------", + ] + for item in gff_summary["gff_source_type"]: + log += ["-".join(item) + " : " + str(gff_summary["gff_source_type"][item])] + with open(input_gff, "r") as gff_file: + new_gff = [] + gff_source_type = [] + for i in snakemake.config["get_genome"]["gff_source_type"]: + gff_source_type += list(i.items()) + limits = dict(gff_source_type=gff_source_type) + for rec in GFF.parse(gff_file, limit_info=limits): + for recfeat in rec.features: + rec_keys = recfeat.qualifiers.keys() + if not "Name" in rec_keys: + if "locus_tag" in rec_keys: + recfeat.qualifiers["Name"] = recfeat.qualifiers["locus_tag"] + else: + error += [ + "required fields 'Name','locus_tag' missing in *.gff file" + ] + else: + if "locus_tag" in rec_keys: + recfeat.qualifiers["trivial_name"] = recfeat.qualifiers["Name"] + recfeat.qualifiers["Name"] = recfeat.qualifiers["locus_tag"] + if not "ID" in rec_keys: + if "locus_tag" in rec_keys: + recfeat.qualifiers["ID"] = recfeat.qualifiers["locus_tag"] + elif "Name" in rec_keys: + recfeat.qualifiers["ID"] = recfeat.qualifiers["Name"] + else: + error += [ + "required fields 'ID','locus_tag' missing in *.gff file" + ] + new_gff += [rec] + return new_gff, log, error + + +if input_database.lower() == "ncbi": + ncbi_result = getoutput( + f"datasets summary genome accession {input_assembly} --as-json-lines | " + + "dataformat tsv genome --fields accession,annotinfo-release-date,organism-name" + ) + if ncbi_result.startswith("Error"): + error += [ncbi_result] + error += [ + "The supplied refseq/genbank ID was not valid. Example for correct input: 'GCF_000009045.1'" + ] + elif len(ncbi_result) == 0: + error += [ + "The result from fetching NCBI genome data has zero length. Please check your internet connection!" + ] + else: + ncbi_genome = [ + i.split("\t") + for i in ncbi_result.split("\n") + if not (i.startswith("New version") or i.startswith("Warning")) + ] + ncbi_genome = dict(zip(ncbi_genome[0], ncbi_genome[1])) + log += ["Found the following genome(s):"] + for k in ncbi_genome.keys(): + log += ["{0}: {1}".format(k, ncbi_genome.get(k))] + refseq_id = ncbi_genome.get("Assembly Accession") + if not refseq_id.startswith("GCF_"): + error += ["The RefSeq ID '{0}' has no valid format.".format(refseq_id)] + ncbi_command = ( + f"datasets download genome accession {refseq_id}" + + f" --filename {output_path}/database.zip --include genome,gff3; " + + f"cd {output_path}; unzip database.zip; rm database.zip" + ) + copy_command = ( + f"cp {output_path}/ncbi_dataset/data/{refseq_id}/*.fna {output_fasta}; " + + f"cp {output_path}/ncbi_dataset/data/{refseq_id}/genomic.gff {output_gff}" + ) + index_command = f"samtools faidx {output_fasta}" + str_out = getoutput(ncbi_command) + str_cp = getoutput(copy_command) + # import and check files + fasta, log, error = check_fasta(output_fasta, log, error) + index_out = getoutput(index_command) + gff, log, error = check_gff(output_gff, log, error) + # write cleaned gff file + with open(output_gff, "w") as gff_out: + GFF.write(gff, gff_out) + +elif input_database.lower() == "manual": + if not path.exists(input_fasta): + error += ["The parameter 'fasta' is not a valid path to a FASTA file"] + elif not path.exists(input_gff): + error += ["The parameter 'gff' is not a valid path to a GFF/GTF file"] + else: + # import and check files + fasta, log, error = check_fasta(input_fasta, log, error) + gff, log, error = check_gff(input_gff, log, error) + # export fasta and gff files + with open(output_fasta, "w") as fasta_out: + fasta_out.write(fasta) + with open(output_gff, "w") as gff_out: + GFF.write(gff, gff_out) +else: + error += ["The parameter 'database' is none of 'ncbi', 'manual'"] + +# print error/log messages +if error: + print("\n".join(error)) + raise ValueError( + "Location or format of the supplied genome files was not correct, quitting" + ) +else: + log += ["Module finished successfully\n"] + log = ["GET_GENOME: " + i for i in log] + with open(output_log, "w") as log_file: + log_file.write("\n".join(log))