Merge pull request #4 from cokelaer/main

Update pipeline to set singularity
sequana · Aug 29, 2022 · 3bb1305 · 3bb1305
2 parents 6b5c2f5 + d757db5
commit 3bb1305
Show file tree

Hide file tree

Showing 10 changed files with 125 additions and 89 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -3,19 +3,23 @@ name: Tests
 on:
   push:
     branches:
-      - master
+      - main
+      - dev
   pull_request:
     branches-ignore: []
+  schedule:
+    - cron: '0 0 * * SUN'
 
 jobs:
   build-linux:
     runs-on: ubuntu-latest
     strategy:
       max-parallel: 5
       matrix:
-        python: [3.7,3.8,3.9]
+        python: [3.7, 3.8, 3.9]
       fail-fast: false
 
+
     steps:
 
     - name: install graphviz
@@ -34,12 +38,10 @@ jobs:
       run: |
         # $CONDA is an environment variable pointing to the root of the miniconda directory
         echo $CONDA/bin >> $GITHUB_PATH
-        conda update ruamel_yaml
 
     - name: conda
       run: |
-        conda install -c conda-forge --quiet mamba  python=${{ matrix.python }}
-        mamba install -c bioconda -c conda-forge --quiet -y bowtie samtools bamtools pigz bedtools
+        conda install -c conda-forge -c bioconda  -y python=${{ matrix.python }} 'bowtie>=1.3.0' samtools bamtools pigz bedtools
 
     - name: Install dependencies
       run: |
@@ -48,6 +50,7 @@ jobs:
     - name: install package itself
       run: |
          pip install .
+         pip install requests --upgrade
 
     - name: testing
       run: |

diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
@@ -8,9 +8,9 @@ on:
 jobs:
   build-n-publish:
     name: Build and publish to PyPI and TestPyPI
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04
     steps:
-    - uses: actions/checkout@master
+    - uses: actions/checkout@main
     - name: Set up Python 3.7
       uses: actions/setup-python@v1
       with:
@@ -26,14 +26,14 @@ jobs:
           python setup.py sdist
 
     - name: Publish distribution to Test PyPI
-      uses: pypa/gh-action-pypi-publish@master
+      uses: pypa/gh-action-pypi-publish@release/v1
       with:
         user: __token__
         password: ${{ secrets.TEST_PYPI_API_TOKEN }}
         repository_url: https://test.pypi.org/legacy/
     - name: Publish distribution to PyPI
       if: startsWith(github.ref, 'refs/tags')
-      uses: pypa/gh-action-pypi-publish@master
+      uses: pypa/gh-action-pypi-publish@release/v1
       with:
         user: __token__
         password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/README.rst b/README.rst
@@ -62,8 +62,10 @@ Requirements
 
 This pipelines requires the following executable(s):
 
-- bowtie1
+- bowtie1 >= 1.3.0
+- bedtools
 - samtools
+- bamtools
 - pigz
 
 .. image:: https://raw.githubusercontent.com/sequana/ribofinder/master/sequana_pipelines/ribofinder/dag.png
@@ -102,6 +104,7 @@ Changelog
 ========= ====================================================================
 Version   Description
 ========= ====================================================================
+0.12.0    * set singularity containers
 0.11.1    * Fix config file (removing hard-coded path)
 0.11.0    * Fix multiqc plot using same fix as in sequna_rnaseq pipelines
           * add utility plot to check rate of  ribosomal per sequence and also

diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,18 @@
+name: sequana_variant_calling
+
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+
+dependencies:
+  - bowtie>=1.3.0
+  - samtools
+  - bamtools
+  - pigz
+  - bedtools
+  - pip
+  - pip:
+    - sequana
+
+
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,2 @@
-sequana>=0.12.7
-sequana_pipetools>=0.7.1
+sequana>=0.14.1
+sequana_pipetools>=0.9.2
diff --git a/sequana_pipelines/ribofinder/main.py b/sequana_pipelines/ribofinder/main.py
@@ -1,12 +1,8 @@
-# -*- coding: utf-8 -*-
 #
 #  This file is part of Sequana software
 #
 #  Copyright (c) 2016 - Sequana Development Team
 #
-#  File author(s):
-#      Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
-#
 #  Distributed under the terms of the 3-clause BSD license.
 #  The full license is in the LICENSE file, distributed with this software.
 #
@@ -17,7 +13,6 @@
 import sys
 import os
 import argparse
-import shutil
 import subprocess
 
 from sequana_pipetools.options import *

diff --git a/sequana_pipelines/ribofinder/requirements.txt b/sequana_pipelines/ribofinder/requirements.txt
@@ -1,3 +1,5 @@
 bowtie
 samtools
+bedtools
+bamtools
 pigz
diff --git a/sequana_pipelines/ribofinder/ribofinder.rules b/sequana_pipelines/ribofinder/ribofinder.rules
@@ -12,13 +12,10 @@
 #  Documentation: https://github.com/sequana/fastqc/README.rst
 ##############################################################################
 """Ribofinder pipeline"""
-import glob
-import os
 import shutil
-from os.path import join
+import os
 from collections import Counter
 
-import sequana
 
 from sequana_pipetools import PipelineManager
 from sequana_pipetools import snaketools as sm
@@ -29,86 +26,107 @@ from sequana.gff3 import GFF3
 configfile: "config.yaml"
 
 manager = PipelineManager("ribofinder", config)
-manager.setup(globals(), mode="warning")
 
 
 
-__data__input = manager.getrawdata()
-
 rule pipeline:
-    input: 
+    input:
         ".sequana/rulegraph.svg",
         "outputs/proportions.png",
         "outputs/RPKM.png",
         "multiqc/multiqc_report.html"
 
-__fasta_file__ = config['general']['reference_file']
-if config['general']['genbank_file']:
-    __annot_file__   = config['general']['genbank_file']
-elif config['general']['gff_file']:
-    __annot_file__   = config['general']['gff_file']
-elif config['general']['rRNA_file']:
-    __annot_file__   = config['general']['rRNA_file']
 
-__prefix_name__ = "indexing/features"
+def get_annot_file():
+    if config['general']['genbank_file']:
+        return config['general']['genbank_file']
+    elif config['general']['gff_file']:
+        return config['general']['gff_file']
+    elif config['general']['rRNA_file']:
+        return config['general']['rRNA_file']
+        raise ValueError("You must provide a genbank of gff or file with rRNA sequences.")
+
+
+input_fasta = config['general']['reference_file']
 
 
 if manager.config.general.rRNA_file:
-    __bowtie1_index_rna__fasta = config["general"]["rRNA_file"]
-    if os.path.exists(__bowtie1_index_rna__fasta) is False:
-        log.error(f"File {__bowtie1_index_rna__fasta} does not exists. Check your config file")
+    user_file = config["general"]["rRNA_file"]
+    if os.path.exists(user_file) is False:
+        log.error(f"File {user_file} does not exists. Check your config file")
         sys.exit(1)
-elif __annot_file__.endswith(".gbk"):
+    os.makedirs("build_feature_fasta", exist_ok=True)
+    shutil.copy(user_file, "build_feature_fasta/feature.fasta")
+elif get_annot_file().endswith(".gbk"):
     # This is for genbank input
-    __build_feature_fasta__output = "build_feature_fasta/feature.fasta"
-    for this in [__fasta_file__, __annot_file__]:
+    for this in [input_fasta, get_annot_file()]:
         if os.path.exists(this) is False:
-            raise IOError("File {} not found".format(__fasta_file__))
+            raise IOError("File {} not found".format(input_fasta))
+
     rule build_feature_fasta:
         input:
-            fasta = __fasta_file__,
-            annot = __annot_file__
-        output: __build_feature_fasta__output
+            fasta = input_fasta,
+            annot = get_annot_file()
+        output: "build_feature_fasta/feature.fasta"
         run:
             from sequana.genbank import GenBank
             gg = GenBank(input.annot)
             sequence = gg.extract_fasta(input.fasta, features=['rRNA'])
             with open(output[0], "w") as fout:
                 fout.write(sequence)
-    __bowtie1_index_rna__fasta = __build_feature_fasta__output
-elif __annot_file__.endswith(".gff") or __annot_file__.endswith(".gff3"):
-    for this in [__fasta_file__, __annot_file__]:
+
+elif get_annot_file().endswith(".gff") or get_annot_file().endswith(".gff3"):
+    for this in [input_fasta, get_annot_file()]:
         if os.path.exists(this) is False:
-            raise IOError("File {} not found".format(__fasta_file__))
-    # extract rRNA feature from GFF and get corresponding fasta
-    # and gff. if no match for rRNA, save empty fasta as AAAAAAAAAAA
-    __build_feature_fasta__output = "build_feature_fasta/feature.fasta"
+            raise IOError("File {} not found".format(input_fasta))
 
-    __extract_fasta_from_bed__input =       __fasta_file__
-    __extract_fasta_from_bed__gff =         __annot_file__
-    __extract_fasta_from_bed__feature =     config["general"]["rRNA_feature"]
-    __extract_fasta_from_bed__output =          __build_feature_fasta__output
-    __extract_fasta_from_bed__output_features = __prefix_name__ + "_rRNA.gff"
-    __extract_fasta_from_bed__log =             "indexing/get_rRNA.log"
-    # ----------------------------------------------------------------------------
-    include: sm.modules["extract_fasta_from_bed"]
-    __bowtie1_index_rna__fasta = __extract_fasta_from_bed__output
+
+    rule build_feature_fasta:
+        """ extract rRNA feature from GFF and get corresponding fasta and gff. 
+
+        if no match for rRNA, save empty fasta as AAAAAAAAAAA
+        """
+        input:
+            fasta = input_fasta,
+            gff = get_annot_file()
+        output:
+            fasta = "build_feature_fasta/feature.fasta",
+            fai = "build_feature_fasta/feature.fasta.fai",
+            gff = "build_feature_fasta/feature_rRNA.gff"
+        params:
+           feature = config['general']['rRNA_feature']
+        container:
+            "https://zenodo.org/record/7031863/files/sequana_tools_0.14.2.img"
+        shell:
+            """
+            # used to be gawk but awk is more generic.
+            awk '{{ if ($3=="{params.feature}") print }}' {input.gff} > {output.gff}
+            if [ -s {output.gff} ]
+            then
+                bedtools getfasta -fi {input.fasta} -bed {output.gff}  -fo {output.fasta}
+            else :
+                echo -e ">empty\\nAAAAAAAAAAAAAA" > {output.fasta}
+            fi
+            samtools faidx {output.fasta}
+            """
 
 
-__bowtie1_reference__ = __bowtie1_index_rna__fasta.rsplit(".",1)[0] + "_rRNA.1.ebwt"
 
 
 
 rule bowtie1_indexing:
     input:
-        reference= __bowtie1_index_rna__fasta
+        reference= "build_feature_fasta/feature.fasta"
     output:
-        __bowtie1_reference__
+        "build_feature_fasta/feature_rRNA.1.ebwt"
     log:
         "indexing/bowtie_rRNA.log"
     params:
         options=""
-    threads: 2
+    threads: 
+        2
+    container:
+        "https://zenodo.org/record/7031863/files/sequana_tools_0.14.2.img"
     wrapper:
         "main/wrappers/bowtie1/build"
 
@@ -126,6 +144,8 @@ rule unpigz:
     input: manager.getrawdata()
     output: temp("{sample}/unpigz/{sample}.fastq")
     threads: 4
+    container:
+        "https://zenodo.org/record/7031863/files/sequana_tools_0.14.2.img"
     shell:
         """
         unpigz -p {threads} -fk --stdout {input[0]} > {output}
@@ -137,15 +157,18 @@ rule unpigz:
 rule bowtie1_mapping_rna:
     input:
         fastq="{sample}/unpigz/{sample}.fastq",
-        index=__bowtie1_reference__
+        index="build_feature_fasta/feature_rRNA.1.ebwt"
     output:
         bam="{sample}/bowtie1/{sample}.bam",
         sorted="{sample}/bowtie1/{sample}.sorted.bam"
     log:
         "{sample}/bowtie1/{sample}.log"
     params:
         options=""
-    threads: config['bowtie1_mapping_rna']['threads']
+    threads: 
+        config['bowtie1_mapping_rna']['threads']
+    container:
+        "https://zenodo.org/record/7031863/files/sequana_tools_0.14.2.img"
     wrapper:
         "main/wrappers/bowtie1/align"
 
@@ -178,13 +201,13 @@ rule fix_bowtie1_log:
 rule plotting:
     input:
         bam_files=expand("{sample}/bowtie1/{sample}.sorted.bam", sample=manager.samples),
-        fasta_file=__bowtie1_index_rna__fasta
+        fasta_file= "build_feature_fasta/feature.fasta"
     output:
         png="outputs/proportions.png",
         rpkm="outputs/RPKM.png"
     run:
         import pandas as pd
-        from sequana import BAM
+        from sequana import BAM, FastA
         from pylab import tight_layout, savefig, xlabel, ylabel, clf
 
         results = []
@@ -214,7 +237,7 @@ rule plotting:
         savefig(output['png'])
 
         # now, we shown the RPKM (read count normalised by gene length)
-        from sequana import FastA
+
         f = FastA(input['fasta_file'])
         df = df.T
         L = [ f.get_lengths_as_dict()[x] for x in df.index]
@@ -235,6 +258,8 @@ rule bam_indexing:
         "{sample}/bowtie1/{sample}.sorted.bam"
     output:
         "{sample}/bowtie1/{sample}.sorted.bam.bai"
+    container:
+        "https://zenodo.org/record/7031863/files/sequana_tools_0.14.2.img"
     shell:
         """
         bamtools index -in {input}