From 1dfa7adb0120880ae5e85c57551d5e698a057497 Mon Sep 17 00:00:00 2001
From: jahn <jahn@mpusp.mpg.de>
Date: Fri, 29 Nov 2024 16:50:04 +0100
Subject: [PATCH] feat: complete, reproducible example workflow

---
 .gitignore                        |   1 +
 .test/config/config.yml           |  34 +++++++
 .test/config/multiqc_config.yml   |   2 +
 .test/config/samples.tsv          |   3 +
 README.md                         |  98 +++++++++++++++++-
 config/config.yml                 |  34 +++++++
 config/multiqc_config.yml         |   2 +
 config/schemas/config.schema.yml  |  44 +++++++++
 config/schemas/samples.schema.yml |  25 +++++
 workflow/Snakefile                |  47 ++++++++-
 workflow/envs/cutadapt.yml        |   6 ++
 workflow/envs/fastqc.yml          |   6 ++
 workflow/envs/get_genome.yml      |   9 ++
 workflow/envs/multiqc.yml         |   7 ++
 workflow/envs/simulate_reads.yml  |   6 ++
 workflow/rules/common.smk         |  17 ++++
 workflow/rules/process_reads.smk  | 122 +++++++++++++++++++++++
 workflow/scripts/get_genome.py    | 159 ++++++++++++++++++++++++++++++
 18 files changed, 614 insertions(+), 8 deletions(-)
 create mode 100644 .test/config/config.yml
 create mode 100644 .test/config/multiqc_config.yml
 create mode 100644 .test/config/samples.tsv
 create mode 100644 config/config.yml
 create mode 100644 config/multiqc_config.yml
 create mode 100644 config/schemas/config.schema.yml
 create mode 100644 config/schemas/samples.schema.yml
 create mode 100644 workflow/envs/cutadapt.yml
 create mode 100644 workflow/envs/fastqc.yml
 create mode 100644 workflow/envs/get_genome.yml
 create mode 100644 workflow/envs/multiqc.yml
 create mode 100644 workflow/envs/simulate_reads.yml
 create mode 100644 workflow/rules/common.smk
 create mode 100644 workflow/rules/process_reads.smk
 create mode 100644 workflow/scripts/get_genome.py
diff --git a/.gitignore b/.gitignore
index efca3e5..7ddc738 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@ resources/**
 logs/**
 .snakemake
 .snakemake/**
+.test/results/*
diff --git a/.test/config/config.yml b/.test/config/config.yml
new file mode 100644
index 0000000..51020b9
--- /dev/null
+++ b/.test/config/config.yml
@@ -0,0 +1,34 @@
+samplesheet: "config/samples.tsv"
+
+get_genome:
+  database: "ncbi"
+  assembly: "GCF_000006785.2"
+  fasta: Null
+  gff: Null
+  gff_source_type:
+    [
+      "RefSeq": "gene",
+      "RefSeq": "pseudogene",
+      "RefSeq": "CDS",
+      "Protein Homology": "CDS",
+    ]
+
+simulate_reads:
+  read_length: 100
+  read_number: 100000
+  random_freq: 0.01
+
+cutadapt:
+  threep_adapter: "-a ATCGTAGATCGG"
+  fivep_adapter: "-A GATGGCGATAGG"
+  default: ["-q 10 ", "-m 25 ", "-M 100", "--overlap=5"]
+
+multiqc:
+  config: "config/multiqc_config.yml"
+
+report:
+  export_figures: True
+  export_dir: "figures/"
+  figure_width: 875
+  figure_height: 500
+  figure_resolution: 125
diff --git a/.test/config/multiqc_config.yml b/.test/config/multiqc_config.yml
new file mode 100644
index 0000000..2f9d44b
--- /dev/null
+++ b/.test/config/multiqc_config.yml
@@ -0,0 +1,2 @@
+remove_sections:
+  - samtools-stats
\ No newline at end of file
diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv
new file mode 100644
index 0000000..b931d64
--- /dev/null
+++ b/.test/config/samples.tsv
@@ -0,0 +1,3 @@
+sample	condition	replicate	read1	read2
+sample1	wild_type	1	sample1.bwa.read1.fastq.gz	sample1.bwa.read2.fastq.gz
+sample2	wild_type	2	sample2.bwa.read1.fastq.gz	sample2.bwa.read2.fastq.gz
diff --git a/README.md b/README.md
index aab998b..ecaed19 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,109 @@
 # Snakemake workflow: `<name>`
 
-[![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io)
-[![GitHub actions status](https://github.com/<owner>/<repo>/workflows/Tests/badge.svg?branch=main)](https://github.com/<owner>/<repo>/actions?query=branch%3Amain+workflow%3ATests)
-
+[![Snakemake](https://img.shields.io/badge/snakemake-≥8.0.0-brightgreen.svg)](https://snakemake.github.io)
+[![GitHub actions status](https://github.com/MPUSP/snakemake-workflow-template/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/MPUSP/snakemake-workflow-template/actions/workflows/main.yml)
+[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
+[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1D355C.svg?labelColor=000000)](https://sylabs.io/docs/)
+[![workflow catalog](https://img.shields.io/badge/Snakemake%20workflow%20catalog-darkgreen)](https://snakemake.github.io/snakemake-workflow-catalog)
 
 A Snakemake workflow for `<description>`
 
+- [Snakemake workflow: `<name>`](#snakemake-workflow-name)
+  - [Usage](#usage)
+  - [Workflow overview](#workflow-overview)
+  - [Running the workflow](#running-the-workflow)
+    - [Input data](#input-data)
+    - [Execution](#execution)
+    - [Parameters](#parameters)
+  - [Authors](#authors)
+  - [References](#references)
+  - [TODO](#todo)
 
 ## Usage
 
 The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=<owner>%2F<repo>).
 
-If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) <repo>sitory and its DOI (see above).
+If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this repository or its DOI.
+
+## Workflow overview
+
+This workflow is a best-practice workflow for `<detailed description>`.
+The workflow is built using [snakemake](https://snakemake.readthedocs.io/en/stable/) and consists of the following steps:
+
+1. Parse sample sheet containing sample meta data (`python`)
+2. Simulate short read sequencing data on the fly (`dwgsim`)
+3. Check quality of input read data (`FastQC`)
+4. Trim adapters from input data (`cutadapt`)
+5. Collect statistics from tool output (`MultiQC`)
+
+## Running the workflow
+
+### Input data
+
+This template workflow contains artifical sequencing data in `*.fastq.gz` format.
+The test data is located in `.test/data`. Input files are supplied with a mandatory table, whose location is indicated in the `config.yml` file (default: `.test/samples.tsv`). The sample sheet has the following layout:
+
+| sample   | condition | replicate | data_folder | fq1                      |
+| -------- | --------- | --------- | ----------- | ------------------------ |
+| RPF-RTP1 | RPF-RTP   | 1         | data        | RPF-RTP1_R1_001.fastq.gz |
+| RPF-RTP2 | RPF-RTP   | 2         | data        | RPF-RTP2_R1_001.fastq.gz |
+
+### Execution
+
+To run the workflow from command line, change the working directory.
+
+```bash
+cd path/to/snakemake-workflow-name
+```
+
+Adjust options in the default config file `config/config.yml`.
+Before running the entire workflow, you can perform a dry run using:
+
+```bash
+snakemake --dry-run
+```
+
+To run the complete workflow with test files using **conda**, execute the following command. The definition of the number of compute cores is mandatory.
+
+```bash
+snakemake --cores 10 --sdm conda --directory .test
+```
+
+To run the workflow with **singularity** / **apptainer**, use:
+
+```bash
+snakemake --cores 10 --sdm conda apptainer --directory .test
+```
+
+### Parameters
+
+This table lists all parameters that can be used to run the workflow.
+
+| parameter              | type | details                                     | default                                      |
+| ---------------------- | ---- | ------------------------------------------- | -------------------------------------------- |
+| **samplesheet**        |      |                                             |                                              |
+| path                   | str  | path to samplesheet, mandatory              | "config/samples.tsv"                         |
+| **cutadapt**           |      |                                             |                                              |
+| fivep_adapter          | str  | sequence of the 5' adapter                  | Null                                         |
+| threep_adapter         | str  | sequence of the 3' adapter                  | `ATCGTAGATCGGAAGAGCACACGTCTGAA`              |
+| default                | str  | additional options passed to `cutadapt`     | [`-q 10 `, `-m 22 `, `-M 52`, `--overlap=3`] |
+
+## Authors
+
+- Firstname Lastname
+  - Affiliation
+  - ORCID profile
+  - home page
+
+## References
+
+> Köster, J., Mölder, F., Jablonski, K. P., Letcher, B., Hall, M. B., Tomkins-Tinch, C. H., Sochat, V., Forster, J., Lee, S., Twardziok, S. O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., & Nahnsen, S. *Sustainable data analysis with Snakemake*. F1000Research, 10:33, 10, 33, **2021**. https://doi.org/10.12688/f1000research.29032.2.
 
-# TODO
+## TODO
 
 * Replace `<owner>` and `<repo>` everywhere in the template (also under .github/workflows) with the correct `<repo>` name and owning user or organization.
 * Replace `<name>` with the workflow name (can be the same as `<repo>`).
 * Replace `<description>` with a description of what the workflow does.
+* Update the workflow description, parameters, running options, authors and references in the `README.md`
+* Update the `README.md` badges. Add or remove badges for `conda`/`singularity`/`apptainer` usage depending on the workflow's capability
 * The workflow will occur in the snakemake-workflow-catalog once it has been made public. Then the link under "Usage" will point to the usage instructions if `<owner>` and `<repo>` were correctly set.
\ No newline at end of file
diff --git a/config/config.yml b/config/config.yml
new file mode 100644
index 0000000..94fffcd
--- /dev/null
+++ b/config/config.yml
@@ -0,0 +1,34 @@
+samplesheet: ".test/config/samples.tsv"
+
+get_genome:
+  database: "ncbi"
+  assembly: "GCF_000006785.2"
+  fasta: Null
+  gff: Null
+  gff_source_type:
+    [
+      "RefSeq": "gene",
+      "RefSeq": "pseudogene",
+      "RefSeq": "CDS",
+      "Protein Homology": "CDS",
+    ]
+
+simulate_reads:
+  read_length: 100
+  read_number: 100000
+  random_freq: 0.01
+
+cutadapt:
+  threep_adapter: "-a ATCGTAGATCGG"
+  fivep_adapter: "-A GATGGCGATAGG"
+  default: ["-q 10 ", "-m 25 ", "-M 100", "--overlap=5"]
+
+multiqc:
+  config: "config/multiqc_config.yml"
+
+report:
+  export_figures: True
+  export_dir: "figures/"
+  figure_width: 875
+  figure_height: 500
+  figure_resolution: 125
diff --git a/config/multiqc_config.yml b/config/multiqc_config.yml
new file mode 100644
index 0000000..2f9d44b
--- /dev/null
+++ b/config/multiqc_config.yml
@@ -0,0 +1,2 @@
+remove_sections:
+  - samtools-stats
\ No newline at end of file
diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml
new file mode 100644
index 0000000..6a6d8fd
--- /dev/null
+++ b/config/schemas/config.schema.yml
@@ -0,0 +1,44 @@
+$schema: "http://json-schema.org/draft-07/schema#"
+description: an entry in the sample sheet
+properties:
+  samplesheet:
+    type: string
+    description: sample name/identifier
+
+  get_genome:
+    properties:
+      database:
+        type: ["string", "null"]
+      assembly:
+        type: ["string", "null"]
+      fasta:
+        type: ["string", "null"]
+      gff:
+        type: ["string", "null"]
+      gff_source_type:
+        type: array
+
+  simulate_reads:
+    properties:
+      read_length:
+        type: number
+      read_number:
+        type: number
+      random_freq:
+        type: number
+
+  cutadapt:
+    properties:
+      threep_adapter:
+        type: string
+      fivep_adapter:
+        type: string
+      default:
+        type: array
+
+  multiqc:
+    properties:
+      config:
+        type: string
+
+required: ["samplesheet", "get_genome", "simulate_reads", "cutadapt", "multiqc"]
diff --git a/config/schemas/samples.schema.yml b/config/schemas/samples.schema.yml
new file mode 100644
index 0000000..d7ecee3
--- /dev/null
+++ b/config/schemas/samples.schema.yml
@@ -0,0 +1,25 @@
+$schema: "http://json-schema.org/draft-07/schema#"
+description: an entry in the sample sheet
+properties:
+  sample:
+    type: string
+    description: sample name/identifier
+  condition:
+    type: string
+    description: sample condition that will be compared during differential analysis
+  replicate:
+    type: number
+    default: 1
+    description: consecutive numbers representing multiple replicates of one condition
+  read1:
+    type: string
+    description: names of fastq.gz files, read 1
+  read2:
+    type: string
+    description: names of fastq.gz files, read 2 (optional)
+
+required:
+  - sample
+  - condition
+  - replicate
+  - read1
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 3826ec5..793538f 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -1,4 +1,45 @@
-# Main entrypoint of the workflow. 
-# Please follow the best practices: 
+# Main entrypoint of the workflow.
+# Please follow the best practices:
 # https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html,
-# in particular regarding the standardized folder structure mentioned there. 
+# in particular regarding the standardized folder structure mentioned there.
+
+
+import os
+import pandas as pd
+
+
+# load configuration
+# -----------------------------------------------------
+configfile: "config/config.yml"
+
+
+# container definition: uncomment to include a singularity image, e.g. from github's container registry
+# container: "oras://ghcr.io/<user>/<repository>:<version>"
+
+
+# load rules
+# -----------------------------------------------------
+include: "rules/common.smk"
+include: "rules/process_reads.smk"
+
+
+# optional messages, log and error handling
+# -----------------------------------------------------
+onstart:
+    print("\n--- Analysis started ---\n")
+
+
+onsuccess:
+    print("--- Workflow finished! ---")
+
+
+onerror:
+    print("--- An error occurred! ---")
+
+
+# target rules
+# -----------------------------------------------------
+rule all:
+    input:
+        "results/multiqc/multiqc_report.html",
+    default_target: True
diff --git a/workflow/envs/cutadapt.yml b/workflow/envs/cutadapt.yml
new file mode 100644
index 0000000..e6bbff7
--- /dev/null
+++ b/workflow/envs/cutadapt.yml
@@ -0,0 +1,6 @@
+name: cutadapt
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - cutadapt=4.9
diff --git a/workflow/envs/fastqc.yml b/workflow/envs/fastqc.yml
new file mode 100644
index 0000000..25e6d86
--- /dev/null
+++ b/workflow/envs/fastqc.yml
@@ -0,0 +1,6 @@
+name: fastqc
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - fastqc=0.12.1
diff --git a/workflow/envs/get_genome.yml b/workflow/envs/get_genome.yml
new file mode 100644
index 0000000..79d325e
--- /dev/null
+++ b/workflow/envs/get_genome.yml
@@ -0,0 +1,9 @@
+name: get_genome
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - unzip=6.0
+  - ncbi-datasets-cli=16.23.0
+  - bcbio-gff=0.7.1
+  - samtools=1.20
diff --git a/workflow/envs/multiqc.yml b/workflow/envs/multiqc.yml
new file mode 100644
index 0000000..b99e9c3
--- /dev/null
+++ b/workflow/envs/multiqc.yml
@@ -0,0 +1,7 @@
+name: multiqc
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.9
+  - multiqc=1.14
diff --git a/workflow/envs/simulate_reads.yml b/workflow/envs/simulate_reads.yml
new file mode 100644
index 0000000..b8d2db2
--- /dev/null
+++ b/workflow/envs/simulate_reads.yml
@@ -0,0 +1,6 @@
+name: get_genome
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - dwgsim=1.1.14
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
new file mode 100644
index 0000000..4482e09
--- /dev/null
+++ b/workflow/rules/common.smk
@@ -0,0 +1,17 @@
+# import basic packages
+import pandas as pd
+from snakemake.utils import validate
+from os import path
+
+
+# read sample sheet
+samples = (
+    pd.read_csv(config["samplesheet"], sep="\t", dtype={"sample": str})
+    .set_index("sample", drop=False)
+    .sort_index()
+)
+
+
+# validate sample sheet and config file
+validate(samples, schema="../../config/schemas/samples.schema.yml")
+validate(config, schema="../../config/schemas/config.schema.yml")
diff --git a/workflow/rules/process_reads.smk b/workflow/rules/process_reads.smk
new file mode 100644
index 0000000..ec49a82
--- /dev/null
+++ b/workflow/rules/process_reads.smk
@@ -0,0 +1,122 @@
+# ----------------------------------------------------- #
+# EXAMPLE WORKFLOW                                      #
+# ----------------------------------------------------- #
+
+
+# module to fetch genome from NCBI or Ensemble
+# -----------------------------------------------------
+rule get_genome:
+    output:
+        path=directory("results/get_genome"),
+        fasta="results/get_genome/genome.fasta",
+        gff="results/get_genome/genome.gff",
+    conda:
+        "../envs/get_genome.yml"
+    message:
+        """--- Parsing genome GFF and FASTA files."""
+    params:
+        database=config["get_genome"]["database"],
+        assembly=config["get_genome"]["assembly"],
+        fasta=config["get_genome"]["fasta"],
+        gff=config["get_genome"]["gff"],
+    log:
+        path="results/get_genome/log/get_genome.log",
+    script:
+        "../scripts/get_genome.py"
+
+
+# module simulate reads
+# -----------------------------------------------------
+rule simulate_reads:
+    input:
+        fasta=rules.get_genome.output.fasta,
+    output:
+        fastq1="results/simulate_reads/{sample}.bwa.read1.fastq.gz",
+        fastq2="results/simulate_reads/{sample}.bwa.read2.fastq.gz",
+    conda:
+        "../envs/simulate_reads.yml"
+    message:
+        """--- Simulating reads with DWGSIM."""
+    params:
+        read_length=config["simulate_reads"]["read_length"],
+        read_number=config["simulate_reads"]["read_number"],
+        random_freq=config["simulate_reads"]["random_freq"],
+    log:
+        path="results/simulate_reads/{sample}.log",
+    shell:
+        "prefix=`echo {output.fastq1} | cut -f 1 -d .`;"
+        "dwgsim -1 {params.read_length} -2 {params.read_length} "
+        "-N {params.read_number} -o 1 -y {params.random_freq} {input.fasta} ${{prefix}} &> {log.path}"
+
+
+# module to make QC report
+# -----------------------------------------------------
+rule fastqc:
+    input:
+        fastq="results/simulate_reads/{sample}.bwa.{read}.fastq.gz",
+    output:
+        report="results/fastqc/{sample}.bwa.{read}_fastqc.html",
+    conda:
+        "../envs/fastqc.yml"
+    message:
+        """--- Checking fastq files with FastQC."""
+    log:
+        path="results/fastqc/{sample}.bwa.{read}.log",
+    shell:
+        "prefix=`echo {output.report} | cut -f 1-2 -d /`;"
+        "fastqc --nogroup --extract --quiet --threads {threads} -o ${{prefix}} {input.fastq} > {log}"
+
+
+# module to trim adapters from reads
+# -----------------------------------------------------
+rule cutadapt:
+    input:
+        fastq1="results/simulate_reads/{sample}.bwa.read1.fastq.gz",
+        fastq2="results/simulate_reads/{sample}.bwa.read2.fastq.gz",
+    output:
+        fastq1="results/cutadapt/{sample}.bwa.read1.fastq.gz",
+        fastq2="results/cutadapt/{sample}.bwa.read2.fastq.gz",
+    conda:
+        "../envs/cutadapt.yml"
+    message:
+        """--- Trim adapters from reads."""
+    params:
+        threep_adapter=config["cutadapt"]["threep_adapter"],
+        fivep_adapter=config["cutadapt"]["fivep_adapter"],
+        default=config["cutadapt"]["default"],
+    log:
+        stdout="results/cutadapt/{sample}.bwa.log",
+        stderr="results/cutadapt/{sample}.bwa.stderr",
+    threads: int(workflow.cores * 0.25)
+    shell:
+        "cutadapt {params.threep_adapter} {params.fivep_adapter} --cores {threads} "
+        "-o {output.fastq1} -p {output.fastq2} {input.fastq1} {input.fastq1} > {log.stdout} 2> {log.stderr}"
+
+
+# module to run multiQC on input + processed files
+# -----------------------------------------------------
+rule multiqc:
+    input:
+        expand(
+            "results/cutadapt/{sample}.bwa.{read}.fastq.gz",
+            sample=samples.index,
+            read=["read1", "read2"],
+        ),
+        expand(
+            "results/fastqc/{sample}.bwa.{read}_fastqc.html",
+            sample=samples.index,
+            read=["read1", "read2"],
+        ),
+    output:
+        report="results/multiqc/multiqc_report.html",
+    conda:
+        "../envs/multiqc.yml"
+    message:
+        """--- Generating MultiQC report for seq data."""
+    params:
+        config=config["multiqc"]["config"],
+    log:
+        path="results/multiqc/log/multiqc.log",
+    shell:
+        "outdir=`echo {output.report} | cut -f 1-2 -d /`; "
+        "multiqc -c {params.config} --force --verbose --dirs --outdir ${{outdir}} results &> {log.path}"
diff --git a/workflow/scripts/get_genome.py b/workflow/scripts/get_genome.py
new file mode 100644
index 0000000..107dad9
--- /dev/null
+++ b/workflow/scripts/get_genome.py
@@ -0,0 +1,159 @@
+#!/usr/bin/python3
+
+# GET GENOME
+# -----------------------------------------------------------------------------
+#
+# This script attempts to download genome sequence (FASTA) and
+# genome annotation (GFF / GTF) files from NCBI using the NCBI datasets
+# API, or a similar database. Also, the genome sequence is indexed using samtools.
+# Alternatively, a FASTA and GFF file can be
+# supplied by the user. Input is roughly checked for validity.
+
+from os import path
+from BCBio.GFF import GFFExaminer
+from BCBio import GFF
+from subprocess import getoutput
+
+input_database = snakemake.params["database"]
+input_assembly = snakemake.params["assembly"]
+input_fasta = snakemake.params["fasta"]
+input_gff = snakemake.params["gff"]
+output_path = snakemake.output["path"]
+output_fasta = snakemake.output["fasta"]
+output_gff = snakemake.output["gff"]
+output_log = snakemake.log["path"]
+log = []
+error = []
+
+
+def check_fasta(input_fasta, log=[], error=[]):
+    with open(input_fasta, "r") as fasta_file:
+        fasta = fasta_file.read()
+    n_items = fasta.count(">")
+    if n_items:
+        log += [f"Supplied fasta file '{input_fasta}' was found"]
+        log += [f"Supplied fasta file contains {n_items} items"]
+    else:
+        error += ["The supplied fasta file contains no valid entries starting with '>'"]
+    return fasta, log, error
+
+
+def check_gff(input_gff, log=[], error=[]):
+    with open(input_gff, "r") as gff_file:
+        gff_examiner = GFFExaminer()
+        log += [f"Supplied GFF file '{input_gff}' was found"]
+        gff_summary = gff_examiner.available_limits(gff_file)
+        log += [
+            f"Supplied GFF file contains the following items:",
+            "--------------------",
+        ]
+        for item in gff_summary["gff_source_type"]:
+            log += ["-".join(item) + " : " + str(gff_summary["gff_source_type"][item])]
+    with open(input_gff, "r") as gff_file:
+        new_gff = []
+        gff_source_type = []
+        for i in snakemake.config["get_genome"]["gff_source_type"]:
+            gff_source_type += list(i.items())
+        limits = dict(gff_source_type=gff_source_type)
+        for rec in GFF.parse(gff_file, limit_info=limits):
+            for recfeat in rec.features:
+                rec_keys = recfeat.qualifiers.keys()
+                if not "Name" in rec_keys:
+                    if "locus_tag" in rec_keys:
+                        recfeat.qualifiers["Name"] = recfeat.qualifiers["locus_tag"]
+                    else:
+                        error += [
+                            "required fields 'Name','locus_tag' missing in *.gff file"
+                        ]
+                else:
+                    if "locus_tag" in rec_keys:
+                        recfeat.qualifiers["trivial_name"] = recfeat.qualifiers["Name"]
+                        recfeat.qualifiers["Name"] = recfeat.qualifiers["locus_tag"]
+                if not "ID" in rec_keys:
+                    if "locus_tag" in rec_keys:
+                        recfeat.qualifiers["ID"] = recfeat.qualifiers["locus_tag"]
+                    elif "Name" in rec_keys:
+                        recfeat.qualifiers["ID"] = recfeat.qualifiers["Name"]
+                    else:
+                        error += [
+                            "required fields 'ID','locus_tag' missing in *.gff file"
+                        ]
+            new_gff += [rec]
+    return new_gff, log, error
+
+
+if input_database.lower() == "ncbi":
+    ncbi_result = getoutput(
+        f"datasets summary genome accession {input_assembly} --as-json-lines | "
+        + "dataformat tsv genome --fields accession,annotinfo-release-date,organism-name"
+    )
+    if ncbi_result.startswith("Error"):
+        error += [ncbi_result]
+        error += [
+            "The supplied refseq/genbank ID was not valid. Example for correct input: 'GCF_000009045.1'"
+        ]
+    elif len(ncbi_result) == 0:
+        error += [
+            "The result from fetching NCBI genome data has zero length. Please check your internet connection!"
+        ]
+    else:
+        ncbi_genome = [
+            i.split("\t")
+            for i in ncbi_result.split("\n")
+            if not (i.startswith("New version") or i.startswith("Warning"))
+        ]
+        ncbi_genome = dict(zip(ncbi_genome[0], ncbi_genome[1]))
+        log += ["Found the following genome(s):"]
+        for k in ncbi_genome.keys():
+            log += ["{0}: {1}".format(k, ncbi_genome.get(k))]
+        refseq_id = ncbi_genome.get("Assembly Accession")
+        if not refseq_id.startswith("GCF_"):
+            error += ["The RefSeq ID '{0}' has no valid format.".format(refseq_id)]
+        ncbi_command = (
+            f"datasets download genome accession {refseq_id}"
+            + f" --filename {output_path}/database.zip --include genome,gff3; "
+            + f"cd {output_path}; unzip database.zip; rm database.zip"
+        )
+        copy_command = (
+            f"cp {output_path}/ncbi_dataset/data/{refseq_id}/*.fna {output_fasta}; "
+            + f"cp {output_path}/ncbi_dataset/data/{refseq_id}/genomic.gff {output_gff}"
+        )
+        index_command = f"samtools faidx {output_fasta}"
+        str_out = getoutput(ncbi_command)
+        str_cp = getoutput(copy_command)
+        # import and check files
+        fasta, log, error = check_fasta(output_fasta, log, error)
+        index_out = getoutput(index_command)
+        gff, log, error = check_gff(output_gff, log, error)
+        # write cleaned gff file
+        with open(output_gff, "w") as gff_out:
+            GFF.write(gff, gff_out)
+
+elif input_database.lower() == "manual":
+    if not path.exists(input_fasta):
+        error += ["The parameter 'fasta' is not a valid path to a FASTA file"]
+    elif not path.exists(input_gff):
+        error += ["The parameter 'gff' is not a valid path to a GFF/GTF file"]
+    else:
+        # import and check files
+        fasta, log, error = check_fasta(input_fasta, log, error)
+        gff, log, error = check_gff(input_gff, log, error)
+        # export fasta and gff files
+        with open(output_fasta, "w") as fasta_out:
+            fasta_out.write(fasta)
+        with open(output_gff, "w") as gff_out:
+            GFF.write(gff, gff_out)
+else:
+    error += ["The parameter 'database' is none of 'ncbi', 'manual'"]
+
+# print error/log messages
+if error:
+    print("\n".join(error))
+    raise ValueError(
+        "Location or format of the supplied genome files was not correct, quitting"
+    )
+else:
+    log += ["Module finished successfully\n"]
+    log = ["GET_GENOME: " + i for i in log]
+    with open(output_log, "w") as log_file:
+        log_file.write("\n".join(log))