diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..7a9a9b1 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,39 @@ +name: build + +on: + push: + branches: + - master + - main + - develop + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11"] + snakemake-version: ["7.32.3"] + steps: + - uses: actions/checkout@v4 + - uses: mamba-org/setup-micromamba@v1 + with: + environment-name: build + cache-environment: true + condarc: | + channels: + - conda-forge + - bioconda + create-args: >- + python=${{ matrix.python-version }} + snakemake=${{ matrix.snakemake-version }} + setuptools + pip + pytest + - name: Test + run: | + python -m pytest + env: + TMPDIR: ${{ runner.temp }} + shell: micromamba-shell {0} diff --git a/.github/workflows/docker-auto.yml b/.github/workflows/docker-auto.yml index debc9ac..d7bb35b 100644 --- a/.github/workflows/docker-auto.yml +++ b/.github/workflows/docker-auto.yml @@ -6,11 +6,6 @@ on: - main paths: - "docker/**" - pull_request: - branches: - - main - paths: - - "docker/**" jobs: generate-matrix: diff --git a/.github/workflows/projects.yml b/.github/workflows/projects.yml new file mode 100644 index 0000000..61a2816 --- /dev/null +++ b/.github/workflows/projects.yml @@ -0,0 +1,14 @@ +name: Add issues/PRs to user projects + +on: + issues: + types: + - assigned + pull_request: + types: + - assigned + +jobs: + add-to-project: + uses: CCBR/.github/.github/workflows/auto-add-user-project.yml@v0.1.0 + secrets: inherit diff --git a/.gitignore b/.gitignore index e5e822c..67793ae 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,5 @@ resources/mouse_mm9_circRNAs_putative_spliced_sequence.fa.gz workflow/scripts/copy_strand_info.py .tests/lint_workdir **/tmp* +**__pycache__/** +*.pyc diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..dc577a5 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,75 @@ +# CHARLIE development version + +- Major updates to convert CHARLIE from a biowulf-specific to a platform-agnostic pipeline (#102, @kelly-sovacool): + - All rules now use containers instead of envmodules. + - Default config and cluster config files are provided for use on biowulf and FRCE. + - New entry `TEMPDIR` in the config file sets the temporary directory location for rules that require transient storage. + - New `--singcache` argument to provide a singularity cache dir location. The singularity cache dir is automatically set inside `/data/$USER/` or `$WORKDIR/` if `--singcache` is not provided. + +# CHARLIE 0.10.1 + +- strand are reported together, strand from all callers are reported, +- both + and - flanking sites are reported, +- rev-comp function updated, +- updated versions of tools to match available tools on BIOWULF. + +# CHARLIE 0.9.0 + +Significant upgrades since the last release: + +- updates to wrapper script, many new arguments/options added +- new per-sample counts table format +- new all-sample master counts matrix with min-nreads filtering and ntools column to show number of tools supporting the circRNA call +- new version of Snakemake +- cluster_status script added for forced completion of pipeline upon TIMEOUTs +- updated flowchart from lucid charts +- added circRNAfinder, find_circ, circExplorer2_bwa and other tools +- optimized execution and resource requirements +- updated viral annotations (Thanks Sara!) +- new method to extract linear counts, create linear BAMs using circExplorer2 outputs +- new job reporting using jobby and its derivatives +- separated creation of BWA and BOWTIE2 index from creation of STAR index to speed things up +- parallelized find_circ +- better cleanup (eg. deleting \_STARgenome folders, etc.) for much smaller digital footprint +- multitude of comments throughout the snakefiles including listing of output file column descriptions +- preliminary GH actions added + +# CHARLIE 0.7.0 + +- 5 circRNA callers +- all-sample counts matrix with annotations + +# CHARLIE 0.6.9 + +- Optimized pysam scripts +- fixed premature completion of singularity rules + +# CHARLIE 0.6.5 + +- updated config.yaml to use the latest HSV-1 annotations received from Sarah (050421) + +# CHARLIE 0.6.4 + +- create linear reads BAM file +- create linear reads BigWigs for each region in the .regions file. + +# CHARLIE 0.6.3 + +- QOS not working for Taka... removed from cluster.json +- recall rule requires python/3.7 ... env module updated + +# CHARLIE 0.6.2 + +- BSJ files are in BSJ subfolder... bug fix for v0.6.1 + +# CHARLIE 0.6.1 + +- customBSJs recalled from STAR alignments + - only for PE + - removes erroneously called CircExplorer BSJs +- create sense and anti-sense BSJ BAMs and BW for each reference (host+viruses) +- find reads which contribute to CIRI BSJs but not on the STAR list of BSJ reads, see if they contribute to novel (not called by STAR) BSJs and append novel BSJs to customBSJ list + +# CHARLIE 0.6.0 + +cutadapt_min_length to cutadapt rule... setting it to 15 in config (for miRNAs, Biot and short viral features) diff --git a/CITATION.cff b/CITATION.cff index 7c6d608..3d35d31 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -6,6 +6,11 @@ authors: orcid: https://orcid.org/0000-0001-8978-8495 affiliation: Advanced Biomedical Computational Science, Frederick National Laboratory for Cancer Research, Frederick, MD 21702, USA +- family-names: Sovacool + given-names: Kelly + orcid: https://orcid.org/0000-0003-3283-829X + affiliation: Advanced Biomedical Computational Science, Frederick National Laboratory + for Cancer Research, Frederick, MD 21702, USA title: 'CHARLIE: Circrnas in Host And viRuses anaLysis pIpEline' url: https://ccbr.github.io/CHARLIE/ repository-code: https://github.com/CCBR/CHARLIE diff --git a/README.md b/README.md index b785065..048286c 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,23 @@ # CHARLIE -![img](https://img.shields.io/github/issues/CCBR/CHARLIE?style=for-the-badge)![img](https://img.shields.io/github/forks/CCBR/CHARLIE?style=for-the-badge)![img](https://img.shields.io/github/stars/CCBR/CHARLIE?style=for-the-badge)![img](https://img.shields.io/github/license/CCBR/CHARLIE?style=for-the-badge) +![img](https://img.shields.io/github/issues/CCBR/CHARLIE?style=for-the-badge)![img](https://img.shields.io/github/forks/CCBR/CHARLIE?style=for-the-badge)![img](https://img.shields.io/github/stars/CCBR/CHARLIE?style=for-the-badge)![img](https://img.shields.io/github/license/CCBR/CHARLIE?style=for-the-badge) ### Table of Contents + - [CHARLIE - **C**ircrnas in **H**ost **A**nd vi**R**uses ana**L**ysis p**I**p**E**line](#charlie) - - [Table of Contents](#table-of-contents) - - [1. Introduction](#1-introduction) - - [2. Flowchart](#2-flowchart) - - [3. Software Dependencies](#3-software-dependencies) - - [4. Usage](#4-usage) - - [5. License](#5-license) - - [6. Testing](#6-testing) - - [6.1 Test data](#61-test-data) - - [6.2 Expected output](#62-expected-output) + - [Table of Contents](#table-of-contents) + - [1. Introduction](#1-introduction) + - [2. Flowchart](#2-flowchart) + - [3. Software Dependencies](#3-software-dependencies) + - [4. Usage](#4-usage) + - [5. License](#5-license) + - [6. Testing](#6-testing) + - [6.1 Test data](#61-test-data) + - [6.2 Expected output](#62-expected-output) ### 1. Introduction - **C**ircrnas in **H**ost **A**nd vi**R**uses ana**L**ysis p**I**p**E**line +**C**ircrnas in **H**ost **A**nd vi**R**uses ana**L**ysis p**I**p**E**line Things to know about CHARLIE: @@ -24,26 +25,26 @@ Things to know about CHARLIE: - Primirarily developed to run on [BIOWULF](https://hpc.nih.gov/) - Reach out to [Vishal Koparde](mailto:vishal.koparde@nihgov) for questions/comments/requests. - This circularRNA detection pipeline uses CIRCExplorer2, CIRI2 and many other tools in parallel to detect, quantify and annotate circRNAs. Here is a list of tools that can be run using CHARLIE: -| circRNA Detection Tool | Aligner(s) | Run by default | -| ---------------------- | ---------- | -------------- | -| [CIRCExplorer2](https://github.com/YangLab/CIRCexplorer2) | STAR1 | Yes | -| [CIRI2](https://sourceforge.net/projects/ciri/files/CIRI2/) | BWA1 | Yes | -| [CIRCExplorer2](https://github.com/YangLab/CIRCexplorer2) | BWA1 | Yes | -| [CLEAR](https://github.com/YangLab/CLEAR) | STAR1 | Yes | -| [DCC](https://github.com/dieterich-lab/DCC) | STAR2 | Yes | -| [circRNAFinder](https://github.com/bioxfu/circRNAFinder) | STAR3 | Yes | -| [find_circ](https://github.com/marvin-jens/find_circ) | Bowtie2 | Yes | -| [MapSplice](https://github.com/merckey/MapSplice2) | BWA2 | No | -| [NCLScan](https://github.com/TreesLab/NCLscan) | NovoAlign | No | +| circRNA Detection Tool | Aligner(s) | Run by default | +| ----------------------------------------------------------- | ---------------- | -------------- | +| [CIRCExplorer2](https://github.com/YangLab/CIRCexplorer2) | STAR1 | Yes | +| [CIRI2](https://sourceforge.net/projects/ciri/files/CIRI2/) | BWA1 | Yes | +| [CIRCExplorer2](https://github.com/YangLab/CIRCexplorer2) | BWA1 | Yes | +| [CLEAR](https://github.com/YangLab/CLEAR) | STAR1 | Yes | +| [DCC](https://github.com/dieterich-lab/DCC) | STAR2 | Yes | +| [circRNAFinder](https://github.com/bioxfu/circRNAFinder) | STAR3 | Yes | +| [find_circ](https://github.com/marvin-jens/find_circ) | Bowtie2 | Yes | +| [MapSplice](https://github.com/merckey/MapSplice2) | BWA2 | No | +| [NCLScan](https://github.com/TreesLab/NCLscan) | NovoAlign | No | > Note: STAR1, STAR2, STAR3 denote 3 different sets of alignment parameters, etc. > Note: BWA1, BWA2 denote 2 different alignment parameters, etc. ### 2. Flowchart + ![](docs/images/CHARLIE_v0.8.x.png) For complete documentation with tutorial go [here](https://CCBR.github.io/CHARLIE/). @@ -54,33 +55,32 @@ For complete documentation with tutorial go [here](https://CCBR.github.io/CHARLI The following version of various bioinformatics tools are using within CHARLIE: -| tool | version | -| ------------- | --------- | -| blat | 3.5 | -| bedtools | 2.30.0 | -| bowtie | 2-2.5.1 | -| bowtie | 1.3.1 | -| bwa | 0.7.17 | -| circexplorer2 | 2.3.8 | -| cufflinks | 2.2.1 | -| cutadapt | 4.4 | -| fastqc | 0.11.9 | -| hisat | 2.2.2.1 | -| java | 18.0.1.1 | -| multiqc | 1.9 | -| parallel | 20231122 | -| perl | 5.34 | -| picard | 2.27.3 | -| python | 2.7 | -| python | 3.8 | -| sambamba | 0.8.2 | -| samtools | 1.16.1 | -| STAR | 2.7.6a | -| stringtie | 2.2.1 | -| ucsc | 450 | -| R | 4.0.5 | -| novocraft | 4.03.05 | - +| tool | version | +| ------------- | -------- | +| blat | 3.5 | +| bedtools | 2.30.0 | +| bowtie | 2-2.5.1 | +| bowtie | 1.3.1 | +| bwa | 0.7.17 | +| circexplorer2 | 2.3.8 | +| cufflinks | 2.2.1 | +| cutadapt | 4.4 | +| fastqc | 0.11.9 | +| hisat | 2.2.2.1 | +| java | 18.0.1.1 | +| multiqc | 1.9 | +| parallel | 20231122 | +| perl | 5.34 | +| picard | 2.27.3 | +| python | 2.7 | +| python | 3.8 | +| sambamba | 0.8.2 | +| samtools | 1.16.1 | +| STAR | 2.7.6a | +| stringtie | 2.2.1 | +| ucsc | 450 | +| R | 4.0.5 | +| novocraft | 4.03.05 | ### 4. Usage @@ -155,6 +155,7 @@ Required Arguments: Optional Arguments: +--singcache|-c : singularity cache directory. Default is `/data/${USER}/.singularity` if available, or falls back to `${WORKDIR}/.singularity`. Use this flag to specify a different singularity cache directory. --host|-g : supply host at command line. hg38 or mm39. (--runmode=init only) --additives|-a : supply comma-separated list of additives at command line. ERCC or BAC16Insert or both (--runmode=init only) --viruses|-v : supply comma-separated list of viruses at command line (--runmode=init only) @@ -219,8 +220,8 @@ This will create the folder provided by `-w=`. The user should have write permis Test data (1 paired-end subsample and 1 single-end subsample) have been including under the `.tests/dummy_fastqs` folder. After running in `-m=init`, `samples.tsv` should be edited to point the copies of the above mentioned samples with the column headers: -- sampleName -- path_to_R1_fastq +- sampleName +- path_to_R1_fastq - path_to_R2_fastq Column `path_to_R2_fastq` will be blank in case of single-end samples. @@ -234,6 +235,7 @@ bash -w= -m=dryrun This will create the reference fasta and gtf file based on the selections made in the `config.yaml`. #### Run + If `-m=dryrun` was sucessful, then simply do `-m=run`. The output will look something like this ``` @@ -307,5 +309,5 @@ Expected output from the sample data is stored under `.tests/expected_output`. More details about running test data can be found [here](https://ccbr.github.io/CHARLIE/tutorial). > DISCLAIMER: -> +> > CHARLIE is built to be run only on [BIOWULF](https://hpc.nih.gov). A newer HPC-agnostic version of CHARLIE is planned for 2024. diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..539f9fc --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.10.1-dev diff --git a/charlie b/charlie index 9a946c8..a147db9 100755 --- a/charlie +++ b/charlie @@ -5,7 +5,7 @@ # CHARLIE set -eo pipefail -module purge +## TODO module statements can only run on biowulf # decide trigger trigger="mtime" @@ -23,6 +23,16 @@ function get_git_commitid_tag() { echo -ne "$gid\t$tag" } +# determine if the platform is biowulf, FRCE, or something else and use modules accordingly +function get_platform() { + if command -v scontrol &> /dev/null ; then + platform=$(scontrol show config | grep ClusterName | sed 's/.*= //') + else + platform=unknown + fi + echo $platform +} + ########################################################################################## # initial setup ########################################################################################## @@ -40,15 +50,35 @@ GIT_COMMIT_TAG=$(get_git_commitid_tag $PIPELINE_HOME) # Some more set up ########################################################################################## +PYTHONVERSION="3" +SNAKEMAKEVERSION="7" CLUSTER_SBATCH_CMD="sbatch --parsable --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name {cluster.name} --output {cluster.output} --error {cluster.error}" -# if [ "$HOSTNAME" == "biowulf.nih.gov" ];then -# if [ "$SLURM_CLUSTER_NAME" == "biowulf" ];then +PARTITION='norm' +CONDA_ACTIVATE='' +PATH_PREPEND='' +MODULE_LOAD='' +PLATFORM=$(get_platform) +if [ "$PLATFORM" == "biowulf" ]; then EXTRA_SINGULARITY_BINDS="/lscratch" CLUSTER_SBATCH_CMD="$CLUSTER_SBATCH_CMD --gres {cluster.gres}" -# fi -PYTHONVERSION="3.7" -SNAKEMAKEVERSION="7.19.1" -# SNAKEMAKEVERSION="5.24.1" + PARTITION="ccr,$PARTITION" + CONDA_ACTIVATE='. "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" && conda activate py311' + MODULE_LOAD="module load python/$PYTHONVERSION snakemake/$SNAKEMAKEVERSION singularity; $CONDA_ACTIVATE" +elif [ "$PLATFORM" == "fnlcr" ]; then + EXTRA_SINGULARITY_BINDS="/scratch/local" + # activate conda env + CONDA_ACTIVATE=". '/mnt/projects/CCBR-Pipelines/resources/miniconda3/etc/profile.d/conda.sh' && conda activate py311" + # make sure spooker is in the path + PATH_PREPEND='export PATH="/mnt/projects/CCBR-Pipelines/bin:$PATH"' + MODULE_LOAD="module load singularity; $PATH_PREPEND; $CONDA_ACTIVATE" +else + EXTRA_SINGULARITY_BINDS="" + echo """WARNING: detected platform is $PLATFORM. Please edit the following files for compatibility with your computing environment: + config.yaml + cluster.json + submit_script.sbatch + """ +fi # set defaults HOST="hg38" @@ -70,12 +100,12 @@ function usage() { cat << EOF ########################################################################################## Welcome to charlie(v0.10.1) - _______ __ __ _______ ______ ___ ___ _______ + _______ __ __ _______ ______ ___ ___ _______ | || | | || _ || _ | | | | | | | | || |_| || |_| || | || | | | | | ___| -| || || || |_||_ | | | | | |___ +| || || || |_||_ | | | | | |___ | _|| || || __ || |___ | | | ___| -| |_ | _ || _ || | | || || | | |___ +| |_ | _ || _ || | | || || | | |___ |_______||__| |__||__| |__||___| |_||_______||___| |_______| C_ircrnas in H_ost A_nd vi_R_uses ana_L_ysis p_I_p_E_line @@ -143,15 +173,15 @@ Optional Arguments: Example commands: - bash ${SCRIPTNAME} -w=/my/ouput/folder -m=init - bash ${SCRIPTNAME} -w=/my/ouput/folder -m=dryrun - bash ${SCRIPTNAME} -w=/my/ouput/folder -m=run + bash ${SCRIPTNAME} -w=/my/output/folder -m=init + bash ${SCRIPTNAME} -w=/my/output/folder -m=dryrun + bash ${SCRIPTNAME} -w=/my/output/folder -m=run ########################################################################################## -VersionInfo: - python : $PYTHONVERSION - snakemake : $SNAKEMAKEVERSION +VersionInfo: + python : $PYTHONVERSION + snakemake : $SNAKEMAKEVERSION pipeline_home : $PIPELINE_HOME git commit/tag : $GIT_COMMIT_TAG @@ -190,13 +220,13 @@ sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" \ -e "s/HOST/${HOST}/g" \ -e "s/ADDITIVES/${ADDITIVES}/g" \ -e "s/VIRUSES/${VIRUSES}/g" \ - ${PIPELINE_HOME}/config/config.yaml > $CONFIGFILE + ${PIPELINE_HOME}/config/$PLATFORM/config.yaml > $CONFIGFILE fi if [ ! -f $WORKDIR/nclscan.config ];then sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" -e "s/WORKDIR/${WORKDIR//\//\\/}/g" ${PIPELINE_HOME}/resources/NCLscan.config.template > $WORKDIR/nclscan.config fi if [ ! -f $CLUSTERFILE ];then -cp ${PIPELINE_HOME}/resources/cluster.json $CLUSTERFILE +cp ${PIPELINE_HOME}/config/$PLATFORM/cluster.json $CLUSTERFILE fi if [ ! -f $WORKDIR/samples.tsv ];then cp $MANIFEST $WORKDIR/samples.tsv @@ -267,14 +297,27 @@ function reconfig(){ echo "$WORKDIR/config.yaml has been updated!" } + +# check whether required dependencies are in the path +function check_deps() { + for dep in python snakemake singularity; do + command -v $dep &> /dev/null || err "$dep not found in PATH" + done +} + +# load modules if available, or check whether they're in the path +function load_modules() { + eval $MODULE_LOAD + check_deps +} + ########################################################################################## # RUNCHECK ... check essential files and load required packages ########################################################################################## function runcheck(){ check_essential_files - module load python/$PYTHONVERSION - module load snakemake/$SNAKEMAKEVERSION + load_modules } ########################################################################################## @@ -305,7 +348,7 @@ function touch() { function unlock() { runcheck - run "--unlock" + run "--unlock" } ########################################################################################## @@ -313,8 +356,8 @@ function unlock() { ########################################################################################## function set_singularity_binds(){ -# this functions tries find what folders to bind -# biowulf specific + # this functions tries find what folders to bind + # TODO parse config file with pyyaml to determine singularity bind paths echo "$PIPELINE_HOME" > ${WORKDIR}/tmp1 echo "$WORKDIR" >> ${WORKDIR}/tmp1 grep -o '\/.*' <(cat ${WORKDIR}/config.yaml ${WORKDIR}/samples.tsv)|dos2unix|tr '\t' '\n'|grep -v ' \|\/\/'|sort|uniq >> ${WORKDIR}/tmp1 @@ -344,7 +387,6 @@ function runlocal() { runcheck set_singularity_binds if [ "$SLURM_JOB_ID" == "" ];then err "runlocal can only be done on an interactive node"; exit 1; fi - module load singularity run "local" } @@ -373,10 +415,10 @@ function create_runinfo { echo "Pipeline Dir: $PIPELINE_HOME" > ${WORKDIR}/runinfo.yaml echo "Git Commit/Tag: $GIT_COMMIT_TAG" >> ${WORKDIR}/runinfo.yaml userlogin=$(whoami) - if [[ `which finger 2>/dev/null` ]];then - username=$(finger $userlogin |grep ^Login | awk -F"Name: " '{print $2}'); - elif [[ `which lslogins 2>/dev/null` ]];then - username=$(lslogins -u $userlogin | grep ^Geco | awk -F": " '{print $2}' | awk '{$1=$1;print}'); + if [[ `which finger 2>/dev/null` ]];then + username=$(finger $userlogin |grep ^Login | awk -F"Name: " '{print $2}'); + elif [[ `which lslogins 2>/dev/null` ]];then + username=$(lslogins -u $userlogin | grep ^Geco | awk -F": " '{print $2}' | awk '{$1=$1;print}'); else username="";fi echo "Login: $userlogin" >> ${WORKDIR}/runinfo.yaml echo "Name: $username" >> ${WORKDIR}/runinfo.yaml @@ -394,21 +436,21 @@ function preruncleanup() { echo "Running..." # check initialization - check_essential_files + check_essential_files cd $WORKDIR modtime="" ## Archive previous run files - if [ -f ${WORKDIR}/snakemake.log ];then + if [ -f ${WORKDIR}/snakemake.log ];then modtime=$(stat ${WORKDIR}/snakemake.log |grep Modify|awk '{print $2,$3}'|awk -F"." '{print $1}'|sed "s/ //g"|sed "s/-//g"|sed "s/://g") mv ${WORKDIR}/snakemake.log ${WORKDIR}/stats/snakemake.${modtime}.log - if [ -f ${WORKDIR}/snakemake.log.HPC_summary.txt ];then + if [ -f ${WORKDIR}/snakemake.log.HPC_summary.txt ];then mv ${WORKDIR}/snakemake.log.HPC_summary.txt ${WORKDIR}/stats/snakemake.${modtime}.log.HPC_summary.txt fi - if [ -f ${WORKDIR}/snakemake.stats ];then + if [ -f ${WORKDIR}/snakemake.stats ];then mv ${WORKDIR}/snakemake.stats ${WORKDIR}/stats/snakemake.${modtime}.stats fi - if [ -f ${WORKDIR}/snakemake.log.jobinfo ];then + if [ -f ${WORKDIR}/snakemake.log.jobinfo ];then mv ${WORKDIR}/snakemake.log.jobinfo ${WORKDIR}/stats/snakemake.${modtime}.log.jobinfo fi fi @@ -433,104 +475,106 @@ function run() { if [ "$1" == "local" ];then - preruncleanup + preruncleanup - snakemake -s $SNAKEFILE\ - --directory $WORKDIR \ - --printshellcmds \ - --use-singularity \ - --singularity-args "$SINGULARITY_BINDS" \ - --use-envmodules \ - --latency-wait 120 \ - --configfile $CONFIGFILE \ - --cores all \ - --rerun-incomplete \ - --rerun-triggers $trigger \ - --retries 2 \ - --keep-going \ - --stats ${WORKDIR}/snakemake.stats \ - 2>&1|tee ${WORKDIR}/snakemake.log - - if [ "$?" -eq "0" ];then - snakemake -s $SNAKEFILE \ - --report ${WORKDIR}/runlocal_snakemake_report.html \ + $EXPORT_SING_CACHE_DIR_CMD + + snakemake -s $SNAKEFILE\ --directory $WORKDIR \ - --configfile $CONFIGFILE - fi + --printshellcmds \ + --use-singularity \ + --singularity-args "$SINGULARITY_BINDS" \ + --use-envmodules \ + --latency-wait 120 \ + --configfile $CONFIGFILE \ + --cores all \ + --rerun-incomplete \ + --rerun-triggers $trigger \ + --retries 2 \ + --keep-going \ + --stats ${WORKDIR}/snakemake.stats \ + 2>&1|tee ${WORKDIR}/snakemake.log + + if [ "$?" -eq "0" ];then + snakemake -s $SNAKEFILE \ + --report ${WORKDIR}/runlocal_snakemake_report.html \ + --directory $WORKDIR \ + --configfile $CONFIGFILE + fi elif [ "$1" == "slurm" ];then - - preruncleanup - cat > ${WORKDIR}/submit_script.sbatch << EOF + preruncleanup + + cat > ${WORKDIR}/submit_script.sbatch << EOF #!/bin/bash #SBATCH --job-name="charlie" #SBATCH --mem=40g -#SBATCH --partition="ccr,norm" +#SBATCH --partition="$PARTITION" #SBATCH --time=48:00:00 #SBATCH --cpus-per-task=2 -module load python/$PYTHONVERSION -module load snakemake/$SNAKEMAKEVERSION -module load singularity - cd \$SLURM_SUBMIT_DIR +$MODULE_LOAD +$EXPORT_SING_CACHE_DIR_CMD snakemake -s $SNAKEFILE \ ---directory $WORKDIR \ ---use-singularity \ ---singularity-args "$SINGULARITY_BINDS" \ ---use-envmodules \ ---printshellcmds \ ---latency-wait 120 \ ---configfile $CONFIGFILE \ ---cluster-config $CLUSTERFILE \ ---cluster "$CLUSTER_SBATCH_CMD" \ ---cluster-status $CLUSTERSTATUSCMD \ --j 500 \ ---rerun-incomplete \ ---rerun-triggers $trigger \ ---retries 2 \ ---keep-going \ ---stats ${WORKDIR}/snakemake.stats \ -2>&1|tee ${WORKDIR}/snakemake.log + --directory $WORKDIR \ + --use-singularity \ + --singularity-args "$SINGULARITY_BINDS" \ + --use-envmodules \ + --printshellcmds \ + --latency-wait 120 \ + --configfile $CONFIGFILE \ + --cluster-config $CLUSTERFILE \ + --cluster "$CLUSTER_SBATCH_CMD" \ + --cluster-status $CLUSTERSTATUSCMD \ + -j 500 \ + --rerun-incomplete \ + --rerun-triggers $trigger \ + --retries 2 \ + --keep-going \ + --stats ${WORKDIR}/snakemake.stats \ + 2>&1 | tee ${WORKDIR}/snakemake.log if [ "\$?" -eq "0" ];then snakemake -s $SNAKEFILE \ --directory $WORKDIR \ --report ${WORKDIR}/runslurm_snakemake_report.html \ - --configfile $CONFIGFILE + --configfile $CONFIGFILE fi EOF - sbatch ${WORKDIR}/submit_script.sbatch + sbatch ${WORKDIR}/submit_script.sbatch elif [ "$1" == "--touch" ];then -snakemake $1 -s $SNAKEFILE \ ---directory $WORKDIR \ ---configfile $CONFIGFILE \ ---cores 1 + snakemake $1 -s $SNAKEFILE \ + --directory $WORKDIR \ + --configfile $CONFIGFILE \ + --cores 1 else # dry-run and unlock -echo $CLUSTER_SBATCH_CMD - -snakemake $1 -s $SNAKEFILE \ ---directory $WORKDIR \ ---use-envmodules \ ---printshellcmds \ ---latency-wait 120 \ ---configfile $CONFIGFILE \ ---cluster-config $CLUSTERFILE \ ---cluster "$CLUSTER_SBATCH_CMD" \ --j 500 \ ---rerun-incomplete \ ---rerun-triggers $trigger \ ---keep-going \ ---reason \ ---stats ${WORKDIR}/snakemake.stats + echo $CLUSTER_SBATCH_CMD + + snakemake $1 -s $SNAKEFILE \ + --directory $WORKDIR \ + --use-envmodules \ + --use-singularity \ + --singularity-args "$SINGULARITY_BINDS" \ + --printshellcmds \ + --latency-wait 120 \ + --configfile $CONFIGFILE \ + --cluster-config $CLUSTERFILE \ + --cluster "$CLUSTER_SBATCH_CMD" \ + -j 500 \ + --rerun-incomplete \ + --rerun-triggers $trigger \ + --keep-going \ + --reason \ + --stats ${WORKDIR}/snakemake.stats fi @@ -571,6 +615,9 @@ function main(){ -w=*|--workdir=*) WORKDIR="${i#*=}" ;; + -c=*|--singcache=*) + SING_CACHE_DIR="${i#*=}" + ;; -z|--changegrp) CHANGEGRP=1 ;; @@ -582,11 +629,11 @@ function main(){ ;; -v=*|--viruses=*) VIRUSES="${i#*=}" - ;; + ;; -s=*|--manifest=*) MANIFEST="${i#*=}" if [ ! -f $MANIFEST ];then err "File $MANIFEST does NOT exist!";fi - ;; + ;; -h|--help) usage && exit 0; ;; @@ -598,6 +645,17 @@ function main(){ WORKDIR=$(readlink -f "$WORKDIR") echo "Working Dir: $WORKDIR" + if [[ -z "$SING_CACHE_DIR" ]]; then + if [[ -d "/data/$USER" ]]; then + SING_CACHE_DIR="/data/$USER/.singularity" + else + SING_CACHE_DIR="${WORKDIR}/.singularity" + fi + echo "singularity cache dir (--singcache) is not set, using ${SING_CACHE_DIR}" + fi + mkdir -p $SING_CACHE_DIR + EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\"" + # required files CONFIGFILE="${WORKDIR}/config.yaml" CLUSTERFILE="${WORKDIR}/cluster.json" @@ -628,8 +686,3 @@ function main(){ ########################################################################################## main "$@" - - - - - diff --git a/resources/cluster.json b/config/biowulf/cluster.json similarity index 84% rename from resources/cluster.json rename to config/biowulf/cluster.json index ebe392c..028a2e2 100644 --- a/resources/cluster.json +++ b/config/biowulf/cluster.json @@ -5,9 +5,9 @@ "partition": "ccr,norm", "threads": "2", "time": "4:00:00", - "name" : "{rule}.{wildcards}", - "output" : "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.out", - "error" : "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.err" + "name": "{rule}.{wildcards}", + "output": "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.out", + "error": "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.err" }, "cutadapt": { "mem": "120g", @@ -22,12 +22,12 @@ "find_circ_align": { "mem": "120g", "threads": "56", - "time": "6:00:00" + "time": "6:00:00" }, "find_circ": { "mem": "120g", "threads": "56", - "time": "6:00:00" + "time": "6:00:00" }, "mapsplice": { "mem": "200g", @@ -80,7 +80,7 @@ "star_circrnafinder": { "mem": "200g", "threads": "56", - "time": "6:00:00" + "time": "6:00:00" }, "estimate_duplication": { "mem": "200g", @@ -92,7 +92,7 @@ "threads": "4", "time": "4:00:00" }, - "create_circExplorer_linear_spliced_bams":{ + "create_circExplorer_linear_spliced_bams": { "mem": "120g", "threads": "56", "time": "8:00:00" @@ -102,11 +102,11 @@ }, "split_splice_reads_BAM_create_BW": { "mem": "120g", - "time": "24:00:00" + "time": "24:00:00" }, "split_linear_reads_BAM_create_BW": { "mem": "120g", - "time": "24:00:00" + "time": "24:00:00" }, "alignment_stats": { "time": "1:00:00" diff --git a/config/biowulf/config.yaml b/config/biowulf/config.yaml new file mode 100644 index 0000000..da1eb19 --- /dev/null +++ b/config/biowulf/config.yaml @@ -0,0 +1,131 @@ +## you probably need to change or comment/uncomment some of these +# +# The working dir... output will be in the results subfolder of the workdir +workdir: "WORKDIR" + +# temporary directory for intermediate files that are not saved +tempdir: '/lscratch/$SLURM_JOB_ID' + +# tab delimited samples file ... should have the following 3 columns +# sampleName path_to_R1_fastq path_to_R2_fastq +samples: "WORKDIR/samples.tsv" + +# Should the CLEAR pipeline be run? True or False WITHOUT quotes +run_clear: True + +# Should the DCC pipeline be run? True or False WITHOUT quote +run_dcc: True + +# Should the MapSplice pipeline be run? True or False WITHOUT quotes +run_mapsplice: False +mapsplice_min_map_len: 50 +mapsplice_filtering: 2 # 1=less stringent 2=default + +# Should the circRNA_finder be run? True or False WITHOUT quotes +run_circRNAFinder: True +# Should the NCLscan pipeline be run? True or False WITHOUT quotes +# This can only be run for PE data +run_nclscan: False +nclscan_config: "WORKDIR/nclscan.config" + +# Should we also run find_circ? True or False WITHOUT quotes +run_findcirc: False +# findcirc_params: "--noncanonical --allhits" # this gives way too many circRNAs +findcirc_params: "--noncanonical" + +# select references .... host + viruses(comma-separated): +# select host: # options are hg38 or mm39 +# host: "hg38" +# additives: "ERCC" # options are ERCC and BAC16Insert +# viruses: "NC_009333.1" +host: "HOST" +additives: "ADDITIVES" +viruses: "VIRUSES" +# select viruses and other (ERCC/BAC): options are +# ERCC +# BAC16Insert +# +# | RefSeq Sequence | RefSeq assembly accession | Notes | +# | ---------------- | ------------------------- | ----------------------------------------------------- | +# | NC_007605.1 | GCF_002402265.1 | Human gammaherpesvirus 4 (Epstein-Barr virus) | +# | NC_000898.1 | GCF_000846365.1 | Human betaherpesvirus 6B | +# | NC_001664.4 | GCF_000845685.2 | Human betaherpesvirus 6A | +# | NC_001716.2 | GCF_000848125.1 | Human betaherpesvirus 7 | +# | NC_006273.2 | GCF_000845245.1 | Human betaherpesvirus 5 | +# | NC_009333.1 | GCF_000838265.1 | Human gammaherpesvirus 8 | +# | NC_045512.2 | GCF_009858895.2 | Severe acute respiratory syndrome-related coronavirus | +# | MN485971.1 | xx | HIV from Belgium ... GTF is hand curated | +# +# | RefSeq Sequence | RefSeq assembly accession | Notes | +# | ---------------- | ------------------------- | ------------------------------------------------------------ | +# | NC_001806.2 | GCF_000859985.2 | [Human alphaherpesvirus 1 (Herpes simplex virus type 1)](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=10298&lvl=3&lin=f&keep=1&srchmode=1&unlock) (strain 17) | +# +# | RefSeq Sequence | RefSeq assembly accession | Notes | +# | ---------------- | ------------------------- | ------------------------------------------------------------ | +# | KT899744.1 | KT899744.1 | HSV-1 strain KOS | +# | MH636806.1 | MH636806.1 | MHV68 (Murine herpesvirus 68 strain WUMS) | +# +# comma separated list +# STAR 1-pass junction filtering.... +# 1st pass of STAR generates a list of splice junctions which are filtered to be parsed to the second pass of STAR +# Separate filters can be applied to the "host"+"additives" and "viruses" defined above +# Typically, since "host"+"additives" annotations are much more well-established we filter out noncanonical and unannotated +# while keeping everything for the poorly annotated viruses +star_1pass_filter_host_noncanonical: "True" +star_1pass_filter_host_unannotated: "True" +star_1pass_filter_viruses_noncanonical: "False" +star_1pass_filter_viruses_unannotated: "False" + +alignTranscriptsPerReadNmax: "20000" + +# BSJ filters in bp: +minsize_host: 150 +minsize_virus: 150 +maxsize_host: 1000000000 +maxsize_virus: 5000 + +## you most probably dont need to change these +scriptsdir: "PIPELINE_HOME/workflow/scripts" +resourcesdir: "PIPELINE_HOME/resources" + +# default cluster +# cluster: "PIPELINE_HOME/resources/cluster.json" +cluster: "WORKDIR/cluster.json" + +adapters: "PIPELINE_HOME/resources/TruSeq_and_nextera_adapters.consolidated.fa" +circexplorer_bsj_circRNA_min_reads: 3 # in addition to "known" and "low-conf" circRNAs identified by circexplorer, we also include those found in back_spliced.bed file but not classified as known/low-conf only if the number of reads supporting the BSJ call is greater than this number +minreadcount: 3 # this is used to filter circRNAs while creating the per-sample counts table +flanksize: 18 # 18bp flank on either side of the BSJ .. used by multiple BSJ callers +dcc_strandedness: "-ss" # "-ss" for stranded library and "--nonstrand" for unstranded +cutadapt_min_length: 15 +cutadapt_n: 5 +cutadapt_max_n: 0.5 +cutadapt_O: 5 +cutadapt_q: 20 +high_confidence_core_callers: "circExplorer,circExplorer_bwa" +high_confidence_core_callers_plus_n: 1 + +ciri_perl_script: "/opt2/CIRI_v2.0.6/CIRI2.pl" # path in docker container +# change this path to a directory containing fasta and GTF files for all host and virus genomes +fastas_gtfs_dir: "/data/CCBR_Pipeliner/db/PipeDB/charlie/fastas_gtfs" + +annotation_lookups: + hg38: "PIPELINE_HOME/resources/hg38_2_hg19_lookup.txt" + mm39: "PIPELINE_HOME/resources/mm39_circBase_annotation_lookup.txt" + +containers: + base: "docker://nciccbr/ccbr_ubuntu_base_20.04:v6" + bowtie1: "docker://nciccbr/charlie_bowtie1:v0.1.0" + circexplorer: "docker://nciccbr/ccbr_circexplorer:v1.0" + circRNA_finder: "docker://nciccbr/charlie_circrna_finder:v1" + ciri: "docker://nciccbr/charlie_ciri2:v1" + clear: "docker://nciccbr/ccbr_clear:2" + cutadapt: "docker://nciccbr/charlie_cutadapt_fqfilter:v1" + dcc: "docker://nciccbr/charlie_dcc:v0.2.0" + fastqc: "docker://nciccbr/ccrgb_qctools:v4.0" + mapsplice: "docker://cgrlab/mapsplice2:latest" + multiqc: "docker://nciccbr/ccbr_multiqc_1.15:v1" + picard: "docker://nciccbr/ccbr_picard_2.27.5:v1" + R: "docker://nciccbr/ccbr_r_4.3.0:v1" + star: "docker://nciccbr/ccbr_star_2.7.6a:latest" + star_ucsc_cufflinks: "docker://nciccbr/charlie_star_ucsc_cufflinks:v0.4.0" diff --git a/config/fnlcr/cluster.json b/config/fnlcr/cluster.json new file mode 100644 index 0000000..fbc50f9 --- /dev/null +++ b/config/fnlcr/cluster.json @@ -0,0 +1,119 @@ +{ + "__default__": { + "mem": "40g", + "partition": "norm", + "threads": "2", + "time": "4:00:00", + "name": "{rule}.{wildcards}", + "output": "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.out", + "error": "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.err" + }, + "cutadapt": { + "mem": "120g", + "threads": "32", + "time": "6:00:00" + }, + "dcc": { + "mem": "120g", + "threads": "4", + "time": "4:00:00" + }, + "find_circ_align": { + "mem": "120g", + "threads": "32", + "time": "6:00:00" + }, + "find_circ": { + "mem": "120g", + "threads": "32", + "time": "6:00:00" + }, + "mapsplice": { + "mem": "200g", + "threads": "32", + "time": "48:00:00" + }, + "mapsplice_postprocess": { + "mem": "120g", + "threads": "4", + "time": "4:00:00" + }, + "nclscan": { + "mem": "512g", + "threads": "32", + "time": "4:00:00", + "partition": "largemem" + }, + "fastqc": { + "mem": "40g", + "threads": "4", + "time": "4:00:00" + }, + "ciri": { + "mem": "512g", + "threads": "32", + "time": "4:00:00", + "partition": "largemem" + }, + "filter_ciri_bam_for_BSJs": { + "mem": "512g", + "threads": "4", + "time": "24:00:00", + "partition": "largemem" + }, + "create_index": { + "mem": "200g", + "threads": "32", + "time": "12:00:00" + }, + "star1p": { + "mem": "200g", + "threads": "32", + "time": "6:00:00" + }, + "star2p": { + "mem": "200g", + "threads": "32", + "time": "6:00:00" + }, + "star_circrnafinder": { + "mem": "200g", + "threads": "32", + "time": "6:00:00" + }, + "estimate_duplication": { + "mem": "200g", + "threads": "4", + "time": "4:00:00" + }, + "create_circExplorer_BSJ_bam": { + "mem": "120g", + "threads": "4", + "time": "4:00:00" + }, + "create_circExplorer_linear_spliced_bams": { + "mem": "120g", + "threads": "32", + "time": "8:00:00" + }, + "clear": { + "time": "1:00:00" + }, + "split_splice_reads_BAM_create_BW": { + "mem": "120g", + "time": "24:00:00" + }, + "split_linear_reads_BAM_create_BW": { + "mem": "120g", + "time": "24:00:00" + }, + "alignment_stats": { + "time": "1:00:00" + }, + "merge_per_sample": { + "time": "1:00:00" + }, + "merge_SJ_tabs": { + "time": "1:00:00" + } +} diff --git a/config/fnlcr/config.yaml b/config/fnlcr/config.yaml new file mode 100644 index 0000000..de47366 --- /dev/null +++ b/config/fnlcr/config.yaml @@ -0,0 +1,131 @@ +## you probably need to change or comment/uncomment some of these +# +# The working dir... output will be in the results subfolder of the workdir +workdir: "WORKDIR" + +# temporary directory for intermediate files that are not saved +tempdir: "/scratch/local" + +# tab delimited samples file ... should have the following 3 columns +# sampleName path_to_R1_fastq path_to_R2_fastq +samples: "WORKDIR/samples.tsv" + +# Should the CLEAR pipeline be run? True or False WITHOUT quotes +run_clear: True + +# Should the DCC pipeline be run? True or False WITHOUT quote +run_dcc: True + +# Should the MapSplice pipeline be run? True or False WITHOUT quotes +run_mapsplice: False +mapsplice_min_map_len: 50 +mapsplice_filtering: 2 # 1=less stringent 2=default + +# Should the circRNA_finder be run? True or False WITHOUT quotes +run_circRNAFinder: True +# Should the NCLscan pipeline be run? True or False WITHOUT quotes +# This can only be run for PE data +run_nclscan: False +nclscan_config: "WORKDIR/nclscan.config" + +# Should we also run find_circ? True or False WITHOUT quotes +run_findcirc: False +# findcirc_params: "--noncanonical --allhits" # this gives way too many circRNAs +findcirc_params: "--noncanonical" + +# select references .... host + viruses(comma-separated): +# select host: # options are hg38 or mm39 +# host: "hg38" +# additives: "ERCC" # options are ERCC and BAC16Insert +# viruses: "NC_009333.1" +host: "HOST" +additives: "ADDITIVES" +viruses: "VIRUSES" +# select viruses and other (ERCC/BAC): options are +# ERCC +# BAC16Insert +# +# | RefSeq Sequence | RefSeq assembly accession | Notes | +# | ---------------- | ------------------------- | ----------------------------------------------------- | +# | NC_007605.1 | GCF_002402265.1 | Human gammaherpesvirus 4 (Epstein-Barr virus) | +# | NC_000898.1 | GCF_000846365.1 | Human betaherpesvirus 6B | +# | NC_001664.4 | GCF_000845685.2 | Human betaherpesvirus 6A | +# | NC_001716.2 | GCF_000848125.1 | Human betaherpesvirus 7 | +# | NC_006273.2 | GCF_000845245.1 | Human betaherpesvirus 5 | +# | NC_009333.1 | GCF_000838265.1 | Human gammaherpesvirus 8 | +# | NC_045512.2 | GCF_009858895.2 | Severe acute respiratory syndrome-related coronavirus | +# | MN485971.1 | xx | HIV from Belgium ... GTF is hand curated | +# +# | RefSeq Sequence | RefSeq assembly accession | Notes | +# | ---------------- | ------------------------- | ------------------------------------------------------------ | +# | NC_001806.2 | GCF_000859985.2 | [Human alphaherpesvirus 1 (Herpes simplex virus type 1)](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=10298&lvl=3&lin=f&keep=1&srchmode=1&unlock) (strain 17) | +# +# | RefSeq Sequence | RefSeq assembly accession | Notes | +# | ---------------- | ------------------------- | ------------------------------------------------------------ | +# | KT899744.1 | KT899744.1 | HSV-1 strain KOS | +# | MH636806.1 | MH636806.1 | MHV68 (Murine herpesvirus 68 strain WUMS) | +# +# comma separated list +# STAR 1-pass junction filtering.... +# 1st pass of STAR generates a list of splice junctions which are filtered to be parsed to the second pass of STAR +# Separate filters can be applied to the "host"+"additives" and "viruses" defined above +# Typically, since "host"+"additives" annotations are much more well-established we filter out noncanonical and unannotated +# while keeping everything for the poorly annotated viruses +star_1pass_filter_host_noncanonical: "True" +star_1pass_filter_host_unannotated: "True" +star_1pass_filter_viruses_noncanonical: "False" +star_1pass_filter_viruses_unannotated: "False" + +alignTranscriptsPerReadNmax: "20000" + +# BSJ filters in bp: +minsize_host: 150 +minsize_virus: 150 +maxsize_host: 1000000000 +maxsize_virus: 5000 + +## you most probably dont need to change these +scriptsdir: "PIPELINE_HOME/workflow/scripts" +resourcesdir: "PIPELINE_HOME/resources" + +# default cluster +# cluster: "PIPELINE_HOME/resources/cluster.json" +cluster: "WORKDIR/cluster.json" + +adapters: "PIPELINE_HOME/resources/TruSeq_and_nextera_adapters.consolidated.fa" +circexplorer_bsj_circRNA_min_reads: 3 # in addition to "known" and "low-conf" circRNAs identified by circexplorer, we also include those found in back_spliced.bed file but not classified as known/low-conf only if the number of reads supporting the BSJ call is greater than this number +minreadcount: 3 # this is used to filter circRNAs while creating the per-sample counts table +flanksize: 18 # 18bp flank on either side of the BSJ .. used by multiple BSJ callers +dcc_strandedness: "-ss" # "-ss" for stranded library and "--nonstrand" for unstranded +cutadapt_min_length: 15 +cutadapt_n: 5 +cutadapt_max_n: 0.5 +cutadapt_O: 5 +cutadapt_q: 20 +high_confidence_core_callers: "circExplorer,circExplorer_bwa" +high_confidence_core_callers_plus_n: 1 + +ciri_perl_script: "/opt2/CIRI_v2.0.6/CIRI2.pl" # path in docker container +# change this path to a directory containing fasta and GTF files for all host and virus genomes +fastas_gtfs_dir: "/mnt/projects/CCBR-Pipelines/db/charlie/fastas_gtfs" + +annotation_lookups: + hg38: "PIPELINE_HOME/resources/hg38_2_hg19_lookup.txt" + mm39: "PIPELINE_HOME/resources/mm39_circBase_annotation_lookup.txt" + +containers: + base: "docker://nciccbr/ccbr_ubuntu_base_20.04:v6" + bowtie1: "docker://nciccbr/charlie_bowtie1:v0.1.0" + circexplorer: "docker://nciccbr/ccbr_circexplorer:v1.0" + circRNA_finder: "docker://nciccbr/charlie_circrna_finder:v1" + ciri: "docker://nciccbr/charlie_ciri2:v1" + clear: "docker://nciccbr/ccbr_clear:2" + cutadapt: "docker://nciccbr/charlie_cutadapt_fqfilter:v1" + dcc: "docker://nciccbr/charlie_dcc:v0.2.0" + fastqc: "docker://nciccbr/ccrgb_qctools:v4.0" + mapsplice: "docker://cgrlab/mapsplice2:latest" + multiqc: "docker://nciccbr/ccbr_multiqc_1.15:v1" + picard: "docker://nciccbr/ccbr_picard_2.27.5:v1" + R: "docker://nciccbr/ccbr_r_4.3.0:v1" + star: "docker://nciccbr/ccbr_star_2.7.6a:latest" + star_ucsc_cufflinks: "docker://nciccbr/charlie_star_ucsc_cufflinks:v0.4.0" diff --git a/config/lint.config.yaml b/config/lint.config.yaml index aeb3108..fde2b7e 100644 --- a/config/lint.config.yaml +++ b/config/lint.config.yaml @@ -17,7 +17,7 @@ run_dcc: True # Should the MapSplice pipeline be run? True or False WITHOUT quotes run_mapsplice: True mapsplice_min_map_len: 50 -mapsplice_filtering: 2 # 1=less stringent 2=default +mapsplice_filtering: 2 # 1=less stringent 2=default # # Should the circRNA_finder be run? True or False WITHOUT quotes run_circRNAFinder: True @@ -27,7 +27,6 @@ run_nclscan: True nclscan_config: ".tests/lint_workdir/nclscan.config" # - # select references .... host + viruses(comma-separated): # select host: # options are hg38 or mm39 host: "host" @@ -36,7 +35,7 @@ viruses: "virus" # select viruses and other (ERCC/BAC): options are # ERCC # BAC16Insert -# +# # | RefSeq Sequence | RefSeq assembly accession | Notes | # | ---------------- | ------------------------- | ----------------------------------------------------- | # | NC_007605.1 | GCF_002402265.1 | Human gammaherpesvirus 4 (Epstein-Barr virus) | @@ -70,10 +69,9 @@ star_1pass_filter_viruses_unannotated: "False" # BSJ filters in bp: minsize_host: 150 -minsize_virus: 150 +minsize_virus: 150 maxsize_host: 1000000000 -maxsize_virus: 5000 - +maxsize_virus: 5000 # ## Resources @@ -95,8 +93,6 @@ maxsize_virus: 5000 # * ERCC sequences # * viruses: - - # ref_fa: "/data/Ziegelbauer_lab/resources/mm39_ERCC_HSV1_MHV68/mm39_ERCC_HSV1_MHV68.fa" # ref_gtf: "/data/Ziegelbauer_lab/resources/mm39_ERCC_HSV1_MHV68/mm39_ERCC_HSV1_MHV68.gtf" # regions: "/data/Ziegelbauer_lab/resources/mm39_ERCC_HSV1_MHV68/mm39_ERCC_HSV1_MHV68.regions" @@ -106,16 +102,12 @@ maxsize_virus: 5000 # ref_bowtie1_index: "/data/Ziegelbauer_lab/resources/mm39_ERCC_HSV1_MHV68/mm39_ERCC_HSV1_MHV68" # genepred_w_geneid: "/data/Ziegelbauer_lab/resources/mm39_ERCC_HSV1_MHV68/mm39_ERCC_HSV1_MHV68.genes.genepred_w_geneid" - # - - ## you most probably dont need to change these scriptsdir: "workflow/scripts" resourcesdir: "resources" -tools: "resources/tools.yaml" -cluster: "resources/cluster.json" +cluster: "config/unknown/cluster.json" adapters: "resources/TruSeq_and_nextera_adapters.consolidated.fa" circexplorer_bsj_circRNA_min_reads: 2 # in addition to "known" and "low-conf" circRNAs identified by circexplorer, we also include those found in back_spliced.bed file but not classified as known/low-conf only if the number of reads supporting the BSJ call is greater than this number minreadcount: 2 # this is used to filter circRNAs while creating the per-sample counts table diff --git a/config/unknown/cluster.json b/config/unknown/cluster.json new file mode 100644 index 0000000..fbc50f9 --- /dev/null +++ b/config/unknown/cluster.json @@ -0,0 +1,119 @@ +{ + "__default__": { + "mem": "40g", + "partition": "norm", + "threads": "2", + "time": "4:00:00", + "name": "{rule}.{wildcards}", + "output": "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.out", + "error": "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.err" + }, + "cutadapt": { + "mem": "120g", + "threads": "32", + "time": "6:00:00" + }, + "dcc": { + "mem": "120g", + "threads": "4", + "time": "4:00:00" + }, + "find_circ_align": { + "mem": "120g", + "threads": "32", + "time": "6:00:00" + }, + "find_circ": { + "mem": "120g", + "threads": "32", + "time": "6:00:00" + }, + "mapsplice": { + "mem": "200g", + "threads": "32", + "time": "48:00:00" + }, + "mapsplice_postprocess": { + "mem": "120g", + "threads": "4", + "time": "4:00:00" + }, + "nclscan": { + "mem": "512g", + "threads": "32", + "time": "4:00:00", + "partition": "largemem" + }, + "fastqc": { + "mem": "40g", + "threads": "4", + "time": "4:00:00" + }, + "ciri": { + "mem": "512g", + "threads": "32", + "time": "4:00:00", + "partition": "largemem" + }, + "filter_ciri_bam_for_BSJs": { + "mem": "512g", + "threads": "4", + "time": "24:00:00", + "partition": "largemem" + }, + "create_index": { + "mem": "200g", + "threads": "32", + "time": "12:00:00" + }, + "star1p": { + "mem": "200g", + "threads": "32", + "time": "6:00:00" + }, + "star2p": { + "mem": "200g", + "threads": "32", + "time": "6:00:00" + }, + "star_circrnafinder": { + "mem": "200g", + "threads": "32", + "time": "6:00:00" + }, + "estimate_duplication": { + "mem": "200g", + "threads": "4", + "time": "4:00:00" + }, + "create_circExplorer_BSJ_bam": { + "mem": "120g", + "threads": "4", + "time": "4:00:00" + }, + "create_circExplorer_linear_spliced_bams": { + "mem": "120g", + "threads": "32", + "time": "8:00:00" + }, + "clear": { + "time": "1:00:00" + }, + "split_splice_reads_BAM_create_BW": { + "mem": "120g", + "time": "24:00:00" + }, + "split_linear_reads_BAM_create_BW": { + "mem": "120g", + "time": "24:00:00" + }, + "alignment_stats": { + "time": "1:00:00" + }, + "merge_per_sample": { + "time": "1:00:00" + }, + "merge_SJ_tabs": { + "time": "1:00:00" + } +} diff --git a/config/config.yaml b/config/unknown/config.yaml similarity index 81% rename from config/config.yaml rename to config/unknown/config.yaml index 40f5016..ed104d6 100644 --- a/config/config.yaml +++ b/config/unknown/config.yaml @@ -17,7 +17,7 @@ run_dcc: True # Should the MapSplice pipeline be run? True or False WITHOUT quotes run_mapsplice: False mapsplice_min_map_len: 50 -mapsplice_filtering: 2 # 1=less stringent 2=default +mapsplice_filtering: 2 # 1=less stringent 2=default # # Should the circRNA_finder be run? True or False WITHOUT quotes run_circRNAFinder: True @@ -31,19 +31,18 @@ run_findcirc: False # findcirc_params: "--noncanonical --allhits" # this gives way too many circRNAs findcirc_params: "--noncanonical" - # select references .... host + viruses(comma-separated): # select host: # options are hg38 or mm39 # host: "hg38" # additives: "ERCC" # options are ERCC and BAC16Insert # viruses: "NC_009333.1" -host: "HOST" -additives: "ADDITIVES" -viruses: "VIRUSES" +host: "HOST" +additives: "ADDITIVES" +viruses: "VIRUSES" # select viruses and other (ERCC/BAC): options are # ERCC # BAC16Insert -# +# # | RefSeq Sequence | RefSeq assembly accession | Notes | # | ---------------- | ------------------------- | ----------------------------------------------------- | # | NC_007605.1 | GCF_002402265.1 | Human gammaherpesvirus 4 (Epstein-Barr virus) | @@ -75,16 +74,17 @@ star_1pass_filter_host_unannotated: "True" star_1pass_filter_viruses_noncanonical: "False" star_1pass_filter_viruses_unannotated: "False" +alignTranscriptsPerReadNmax: "20000" + # BSJ filters in bp: minsize_host: 150 -minsize_virus: 150 +minsize_virus: 150 maxsize_host: 1000000000 -maxsize_virus: 5000 +maxsize_virus: 5000 ## you most probably dont need to change these scriptsdir: "PIPELINE_HOME/workflow/scripts" resourcesdir: "PIPELINE_HOME/resources" -tools: "PIPELINE_HOME/resources/tools.yaml" # default cluster # cluster: "PIPELINE_HOME/resources/cluster.json" @@ -103,12 +103,27 @@ cutadapt_q: 20 high_confidence_core_callers: "circExplorer,circExplorer_bwa" high_confidence_core_callers_plus_n: 1 -ciri_perl_script: "/data/CCBR_Pipeliner/bin/CIRI_v2.0.6/CIRI2.pl" -nclscan_dir: "/data/CCBR_Pipeliner/bin/NCLscan-1.7.0" -circrnafinder_dir: "/data/CCBR_Pipeliner/bin/circRNA_finder-1.2" -find_circ_dir: "/data/CCBR_Pipeliner/bin/find_circ" -fastas_gtfs_dir: "/data/CCBR_Pipeliner/db/PipeDB/charlie/fastas_gtfs" +ciri_perl_script: "/opt2/CIRI_v2.0.6/CIRI2.pl" # path in docker container +# change this path to a directory containing fasta and GTF files for all host and virus genomes +fastas_gtfs_dir: "PIPELINE_HOME/resources/fastas_gtfs" annotation_lookups: hg38: "PIPELINE_HOME/resources/hg38_2_hg19_lookup.txt" mm39: "PIPELINE_HOME/resources/mm39_circBase_annotation_lookup.txt" + +containers: + base: "docker://nciccbr/ccbr_ubuntu_base_20.04:v6" + bowtie1: "docker://nciccbr/charlie_bowtie1:v0.1.0" + circexplorer: "docker://nciccbr/ccbr_circexplorer:v1.0" + circRNA_finder: "docker://nciccbr/charlie_circrna_finder:v1" + ciri: "docker://nciccbr/charlie_ciri2:v1" + clear: "docker://nciccbr/ccbr_clear:2" + cutadapt: "docker://nciccbr/charlie_cutadapt_fqfilter:v1" + dcc: "docker://nciccbr/charlie_dcc:v0.2.0" + fastqc: "docker://nciccbr/ccrgb_qctools:v4.0" + mapsplice: "docker://cgrlab/mapsplice2:latest" + multiqc: "docker://nciccbr/ccbr_multiqc_1.15:v1" + picard: "docker://nciccbr/ccbr_picard_2.27.5:v1" + R: "docker://nciccbr/ccbr_r_4.3.0:v1" + star: "docker://nciccbr/ccbr_star_2.7.6a:latest" + star_ucsc_cufflinks: "docker://nciccbr/charlie_star_ucsc_cufflinks:v0.4.0" diff --git a/docker/bowtie1/Dockerfile b/docker/bowtie1/Dockerfile new file mode 100644 index 0000000..6386538 --- /dev/null +++ b/docker/bowtie1/Dockerfile @@ -0,0 +1,23 @@ +FROM nciccbr/ccbr_ubuntu_base_20.04:v6 + +# build time variables +ARG BUILD_DATE="000000" +ENV BUILD_DATE=${BUILD_DATE} +ARG BUILD_TAG="000000" +ENV BUILD_TAG=${BUILD_TAG} +ARG REPONAME="000000" +ENV REPONAME=${REPONAME} + +# install conda packages +COPY environment.txt /data2/ +RUN mamba install -c conda-forge -c bioconda --file /data2/environment.txt +ENV R_LIBS_USER=/opt2/conda/lib/R/library/ + +# Save Dockerfile in the docker +COPY Dockerfile /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} +RUN chmod a+r /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} + +# cleanup +WORKDIR /data2 +RUN apt-get clean && apt-get purge \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/docker/bowtie1/environment.txt b/docker/bowtie1/environment.txt new file mode 100644 index 0000000..14ff580 --- /dev/null +++ b/docker/bowtie1/environment.txt @@ -0,0 +1 @@ +bowtie=1.3.1 \ No newline at end of file diff --git a/docker/bowtie1/meta.yml b/docker/bowtie1/meta.yml new file mode 100644 index 0000000..0baf464 --- /dev/null +++ b/docker/bowtie1/meta.yml @@ -0,0 +1,4 @@ +dockerhub_namespace: nciccbr +image_name: charlie_bowtie1 +version: v0.1.0 +container: "$(dockerhub_namespace)/$(image_name):$(version)" diff --git a/docker/circRNA_finder/Dockerfile b/docker/circRNA_finder/Dockerfile new file mode 100644 index 0000000..0262596 --- /dev/null +++ b/docker/circRNA_finder/Dockerfile @@ -0,0 +1,30 @@ +FROM nciccbr/ccbr_ubuntu_base_20.04:v6 + +# build time variables +ARG BUILD_DATE="000000" +ENV BUILD_DATE=${BUILD_DATE} +ARG BUILD_TAG="000000" +ENV BUILD_TAG=${BUILD_TAG} +ARG REPONAME="000000" +ENV REPONAME=${REPONAME} + +# install conda packages +COPY environment.txt /data2/ +RUN mamba install -c conda-forge -c bioconda --file /data2/environment.txt +ENV R_LIBS_USER=/opt2/conda/lib/R/library/ + +# install circRNA_finder +WORKDIR /opt2 +ENV VERSION=1.2 +RUN wget https://github.com/orzechoj/circRNA_finder/archive/refs/tags/v${VERSION}.tar.gz -O circRNA.tar.gz && \ + tar -xzvf circRNA.tar.gz +ENV PATH="/opt2/circRNA_finder-${VERSION}/:$PATH" + +# Save Dockerfile in the docker +COPY Dockerfile /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} +RUN chmod a+r /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} + +# cleanup +WORKDIR /data2 +RUN apt-get clean && apt-get purge \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/docker/circRNA_finder/environment.txt b/docker/circRNA_finder/environment.txt new file mode 100644 index 0000000..fd233e3 --- /dev/null +++ b/docker/circRNA_finder/environment.txt @@ -0,0 +1,2 @@ +samtools +STAR \ No newline at end of file diff --git a/docker/circRNA_finder/meta.yml b/docker/circRNA_finder/meta.yml new file mode 100644 index 0000000..ae5bcb3 --- /dev/null +++ b/docker/circRNA_finder/meta.yml @@ -0,0 +1,4 @@ +dockerhub_namespace: nciccbr +image_name: charlie_circrna_finder +version: v1 +container: "$(dockerhub_namespace)/$(image_name):$(version)" diff --git a/docker/ciri2/Dockerfile b/docker/ciri2/Dockerfile new file mode 100644 index 0000000..c8ed18e --- /dev/null +++ b/docker/ciri2/Dockerfile @@ -0,0 +1,24 @@ +FROM nciccbr/ccbr_ubuntu_base_20.04:v6 + +# build time variables +ARG BUILD_DATE="000000" +ENV BUILD_DATE=${BUILD_DATE} +ARG BUILD_TAG="000000" +ENV BUILD_TAG=${BUILD_TAG} +ARG REPONAME="000000" +ENV REPONAME=${REPONAME} + +# install CIRI +WORKDIR /opt2 +RUN wget -O CIRI.zip https://sourceforge.net/projects/ciri/files/CIRI2/CIRI_v2.0.6.zip/download && \ + unzip CIRI.zip +ENV PATH="/opt2/CIRI2_v2.0.6/:$PATH" + +# Save Dockerfile in the docker +COPY Dockerfile /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} +RUN chmod a+r /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} + +# cleanup +WORKDIR /data2 +RUN apt-get clean && apt-get purge \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/docker/ciri2/meta.yml b/docker/ciri2/meta.yml new file mode 100644 index 0000000..65fe3ed --- /dev/null +++ b/docker/ciri2/meta.yml @@ -0,0 +1,4 @@ +dockerhub_namespace: nciccbr +image_name: charlie_ciri2 +version: v1 +container: "$(dockerhub_namespace)/$(image_name):$(version)" diff --git a/docker/clear/Dockerfile b/docker/clear/Dockerfile new file mode 100755 index 0000000..9f53a10 --- /dev/null +++ b/docker/clear/Dockerfile @@ -0,0 +1,19 @@ +FROM nciccbr/ccbr_ubuntu_base_20.04:v6 + +RUN apt-get update && apt-get install -y zlib1g zlib1g-dev git build-essential + +# Circexplorer2 --> UCSC boostlibraries cufflinks +RUN apt-get install -y libboost1.67-tools-dev + +# install conda packages +COPY environment.yml /data2/ +ENV CONDA_ENV=clear +RUN mamba env create -n ${CONDA_ENV} -f /data2/environment.yml && \ + echo "conda activate ${CONDA_ENV}" > ~/.bashrc +ENV PATH="/opt2/conda/envs/${CONDA_ENV}/bin:$PATH" +RUN python -m pip install git+https://github.com/YangLab/CLEAR.git +RUN which circ_quant && circ_quant -h + +ENV PATH="/opt2:$PATH" +RUN chmod -R a+rX /opt2 +WORKDIR /data2 diff --git a/docker/clear/environment.yml b/docker/clear/environment.yml new file mode 100644 index 0000000..17c438d --- /dev/null +++ b/docker/clear/environment.yml @@ -0,0 +1,22 @@ +name: clear +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bedtools + - bowtie + - circexplorer2 + - cufflinks + - docopt + - hisat2 + - pip + - pybedtools + - pysam + - python=2.7 + - scipy + - setuptools + - stringtie + - tophat + - ucsc-gtftogenepred + - ucsc-genepredtogtf diff --git a/docker/clear/meta.yml b/docker/clear/meta.yml new file mode 100644 index 0000000..b04cd5d --- /dev/null +++ b/docker/clear/meta.yml @@ -0,0 +1,4 @@ +dockerhub_namespace: nciccbr +image_name: ccbr_clear +version: 2 +container: "$(dockerhub_namespace)/$(image_name):$(version)" diff --git a/docker/cutadapt_fqfilter/Dockerfile b/docker/cutadapt_fqfilter/Dockerfile new file mode 100644 index 0000000..de3475e --- /dev/null +++ b/docker/cutadapt_fqfilter/Dockerfile @@ -0,0 +1,26 @@ +FROM nciccbr/ccbr_ubuntu_base_20.04:v6 + +# build time variables +ARG BUILD_DATE="000000" +ENV BUILD_DATE=${BUILD_DATE} +ARG BUILD_TAG="000000" +ENV BUILD_TAG=${BUILD_TAG} +ARG REPONAME="000000" +ENV REPONAME=${REPONAME} + +# install conda packages +COPY environment.yml /data2/ +ENV CONDA_ENV=cutadapt +RUN mamba env create -n ${CONDA_ENV} -f /data2/environment.yml && \ + echo "conda activate ${CONDA_ENV}" > ~/.bashrc +ENV PATH="/opt2/conda/envs/${CONDA_ENV}/bin:$PATH" +ENV R_LIBS_USER=/opt2/conda/lib/R/library/ + +# Save Dockerfile in the docker +COPY Dockerfile /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} +RUN chmod a+r /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} + +# cleanup +WORKDIR /data2 +RUN apt-get clean && apt-get purge \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/docker/cutadapt_fqfilter/environment.yml b/docker/cutadapt_fqfilter/environment.yml new file mode 100644 index 0000000..4bc48f7 --- /dev/null +++ b/docker/cutadapt_fqfilter/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - cutadapt + - fastq-filter \ No newline at end of file diff --git a/docker/cutadapt_fqfilter/meta.yml b/docker/cutadapt_fqfilter/meta.yml new file mode 100644 index 0000000..ed3a6b6 --- /dev/null +++ b/docker/cutadapt_fqfilter/meta.yml @@ -0,0 +1,4 @@ +dockerhub_namespace: nciccbr +image_name: charlie_cutadapt_fqfilter +version: v1 +container: "$(dockerhub_namespace)/$(image_name):$(version)" diff --git a/docker/dcc/Dockerfile b/docker/dcc/Dockerfile new file mode 100644 index 0000000..6338f48 --- /dev/null +++ b/docker/dcc/Dockerfile @@ -0,0 +1,27 @@ +FROM nciccbr/ccbr_ubuntu_base_20.04:v6 + +# build time variables +ARG BUILD_DATE="000000" +ENV BUILD_DATE=${BUILD_DATE} +ARG BUILD_TAG="000000" +ENV BUILD_TAG=${BUILD_TAG} +ARG REPONAME="000000" +ENV REPONAME=${REPONAME} + +# install DCC development version +ENV DCC_VERSION=4418a9a12a7f6e883734459f8117eba5d585b59a +WORKDIR /opt2 +RUN wget https://github.com/dieterich-lab/DCC/archive/${DCC_VERSION}.zip -O dcc.zip && \ + unzip dcc.zip && \ + cd DCC-${DCC_VERSION} && \ + python setup.py install +RUN which DCC && DCC -h + +# Save Dockerfile in the docker +COPY Dockerfile /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} +RUN chmod a+r /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} + +# cleanup +WORKDIR /data2 +RUN apt-get clean && apt-get purge \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/docker/dcc/environment.yml b/docker/dcc/environment.yml new file mode 100644 index 0000000..18e2f93 --- /dev/null +++ b/docker/dcc/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::dcc=0.5.0 \ No newline at end of file diff --git a/docker/dcc/meta.yml b/docker/dcc/meta.yml new file mode 100644 index 0000000..a3a1393 --- /dev/null +++ b/docker/dcc/meta.yml @@ -0,0 +1,4 @@ +dockerhub_namespace: nciccbr +image_name: charlie_dcc +version: v0.2.0 +container: "$(dockerhub_namespace)/$(image_name):$(version)" diff --git a/docker/star_ucsc_cufflinks/Dockerfile b/docker/star_ucsc_cufflinks/Dockerfile new file mode 100644 index 0000000..39ef4e3 --- /dev/null +++ b/docker/star_ucsc_cufflinks/Dockerfile @@ -0,0 +1,42 @@ +FROM nciccbr/ccbr_ubuntu_base_20.04:v6 + +# build time variables +ARG BUILD_DATE="000000" +ENV BUILD_DATE=${BUILD_DATE} +ARG BUILD_TAG="000000" +ENV BUILD_TAG=${BUILD_TAG} +ARG REPONAME="000000" +ENV REPONAME=${REPONAME} + +# install conda packages +COPY environment.yml /data2/ +ENV CONDA_ENV=py3.6 +RUN mamba env create -n ${CONDA_ENV} -f /data2/environment.yml && \ + echo "conda activate ${CONDA_ENV}" > ~/.bashrc +ENV PATH="/opt2/conda/envs/${CONDA_ENV}/bin:$PATH" +ENV R_LIBS_USER=/opt2/conda/lib/R/library/ + +# install find_circ +WORKDIR /opt2 +ENV VERSION=1.2 +RUN wget https://github.com/marvin-jens/find_circ/archive/refs/tags/v${VERSION}.tar.gz -O find_circ.tar.gz && \ + tar -xzvf find_circ.tar.gz && \ + cd find_circ-${VERSION} +ENV PATH="/opt2/find_circ-${VERSION}/:$PATH" + +# install nclscan +ENV NCLSCAN_VERSION=1.7.0 +RUN wget https://github.com/TreesLab/NCLscan/archive/refs/tags/v${NCLSCAN_VERSION}.tar.gz -O nclscan.tar.gz && \ + tar -xzvf nclscan.tar.gz -C /opt2/ && \ + cd /opt2/NCLscan-${NCLSCAN_VERSION}/bin/ && \ + make +ENV PATH="/opt2/NCLscan-${NCLSCAN_VERSION}/:/opt2/NCLscan-${NCLSCAN_VERSION}/bin:${PATH}" + +# Save Dockerfile in the docker +COPY Dockerfile /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} +RUN chmod a+r /opt2/Dockerfile_${REPONAME}.${BUILD_TAG} + +# cleanup +WORKDIR /data2 +RUN apt-get clean && apt-get purge \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/docker/star_ucsc_cufflinks/environment.yml b/docker/star_ucsc_cufflinks/environment.yml new file mode 100644 index 0000000..07edb22 --- /dev/null +++ b/docker/star_ucsc_cufflinks/environment.yml @@ -0,0 +1,23 @@ +channels: + - conda-forge + - bioconda +dependencies: + - argparse + - bedtools=2.29.0 + - blat=35 + - bowtie2=2.5.1 + - bwa=0.7.17 + - cufflinks=2.2.1 + - gffread + - HTSeq + - novoalign=3.07.00 + - numpy + - pandas + - pysam + - python=3.6 + - sambamba=0.8.2 + - samtools=1.16.1 + - star=2.7.6a + - ucsc-bedgraphtobigwig + - ucsc-bedsort + - ucsc-gtftogenepred \ No newline at end of file diff --git a/docker/star_ucsc_cufflinks/meta.yml b/docker/star_ucsc_cufflinks/meta.yml new file mode 100644 index 0000000..12f9f38 --- /dev/null +++ b/docker/star_ucsc_cufflinks/meta.yml @@ -0,0 +1,4 @@ +dockerhub_namespace: nciccbr +image_name: charlie_star_ucsc_cufflinks +version: v0.4.0 +container: "$(dockerhub_namespace)/$(image_name):$(version)" diff --git a/docs/platforms.md b/docs/platforms.md new file mode 100644 index 0000000..7633ed9 --- /dev/null +++ b/docs/platforms.md @@ -0,0 +1,19 @@ + +CHARLIE was originally developed to run on biowulf, but it can run on other computing platforms too. +There are a few additional steps to configure CHARLIE. + +1. Clone CHARLIE. + ```sh + git clone https://github.com/CCBR/charlie + ``` + +1. Initialize your project working directory. + ```sh + + ``` + +1. Create a directory of reference files. + +1. Edit your project's config file. + +1. If you are using a SLURM job scheduler, edit `cluster.json` and `submit_script.sbatch`. \ No newline at end of file diff --git a/docs/tutorial.md b/docs/tutorial.md index 74bbca1..a2f3382 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -128,6 +128,7 @@ Required Arguments: Optional Arguments: +--singcache|-c : singularity cache directory. Default is `/data/${USER}/.singularity` if available, or falls back to `${WORKDIR}/.singularity`. Use this flag to specify a different singularity cache directory. --host|-g : supply host at command line. hg38 or mm39. (--runmode=init only) --additives|-a : supply comma-separated list of additives at command line. ERCC or BAC16Insert or both (--runmode=init only) --viruses|-v : supply comma-separated list of viruses at command line (--runmode=init only) @@ -174,6 +175,7 @@ The above command creates `` folder and creates 2 subfolders This file is used to fine tune the execution of the pipeline by setting: * sample sheet location ... aka `samples.tsv` +* the temporary directory -- make sure this is correct for your computing environment. * which circRNA finding tools to use by editing these: * run_clear: True * run_dcc: True diff --git a/resources/NCLscan.config.template b/resources/NCLscan.config.template index ec16214..0f53d48 100644 --- a/resources/NCLscan.config.template +++ b/resources/NCLscan.config.template @@ -2,9 +2,8 @@ ### NCLscan Configuration ### ############################# -## The directory of NCLscan -NCLscan_dir = /data/Ziegelbauer_lab/tools/NCLscan-1.7.0 - +## The directory of NCLscan in the docker container +NCLscan_dir = /opt2/NCLscan-1.7.0 ## The directory of references and indices ## The script "create_reference.py" would create the needed references and indices here. @@ -27,14 +26,14 @@ lncRNA_transcripts = WORKDIR/ref/ref.dummy.fa ## External tools -## these are set to "module load" on BIOWULF - -bedtools_bin = /usr/local/apps/bedtools/2.29.0/bin/bedtools -blat_bin = /usr/local/apps/blat/3.5/blat -bwa_bin = /usr/local/apps/bwa/0.7.17/bwa -samtools_bin = /usr/local/apps/samtools/1.15.1/bin/samtools -novoalign_bin = /usr/local/apps/novocraft/4.03.05/novoalign -novoindex_bin = /usr/local/apps/novocraft/4.03.05/novoindex +## these are all in the $PATH in the docker container + +bedtools_bin = bedtools +blat_bin = blat +bwa_bin = bwa +samtools_bin = samtools +novoalign_bin = novoalign +novoindex_bin = novoindex ## Bin diff --git a/resources/tools.yaml b/resources/tools.yaml deleted file mode 100644 index 95cd4a9..0000000 --- a/resources/tools.yaml +++ /dev/null @@ -1,47 +0,0 @@ -bedtools: - version: "bedtools/2.30.0" -bowtie2: - version: "bowtie/2-2.5.1" -bowtie1: - version: "bowtie/1.3.1" -bwa: - version: "bwa/0.7.17" -circexplorer: - version: "circexplorer2/2.3.8" -cufflinks: - version: "cufflinks/2.2.1" -cutadapt: - version: "cutadapt/4.4" -fastqc: - version: "fastqc/0.11.9" -hisat: - version: "hisat/2.2.2.1-ngs3.0.1" -java: - version: "java/18.0.1.1" -multiqc: - version: "multiqc/1.9" -parallel: - version: "parallel/current" -perl: - version: "perl/5.34" -picard: - version: "picard/2.27.3" -python27: - version: "python/2.7" -python37: - version: "python/3.8" -sambamba: - version: "sambamba/0.8.2" -samtools: - version: "samtools/1.16.1" -star: - version: "STAR/2.7.6a" - alignTranscriptsPerReadNmax: "20000" -stringtie: - version: "stringtie/2.2.1" -ucsc: - version: "ucsc/450" -R: - version: "R/4.0.5" -ncl_required_modules: "python/2.7 bedtools/2.29.0 blat/3.5 bwa/0.7.17 samtools/1.15.1 novocraft/4.03.05" - diff --git a/src/charlie/__init__.py b/src/charlie/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/charlie/util.py b/src/charlie/util.py new file mode 100644 index 0000000..c0ca0e8 --- /dev/null +++ b/src/charlie/util.py @@ -0,0 +1,14 @@ +import os + + +def smk_base(rel_path): + basedir = os.path.split( + os.path.split(os.path.dirname(os.path.realpath(__file__)))[0] + )[0] + return os.path.join(basedir, rel_path) + + +def get_version(): + with open(smk_base("VERSION"), "r") as f: + version = f.readline().strip() + return version diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..4ac46e9 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,22 @@ +import os +import re +import sys + +SCRIPT_DIR = os.path.join( + os.path.split(os.path.dirname(os.path.abspath(__file__)))[0], "src", "charlie" +) +sys.path.append(SCRIPT_DIR) +from util import * + + +def test_smk_base(): + assert smk_base("").endswith("CHARLIE/") + + +def test_semantic_version(): + assert bool( + re.match( + r"^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$", + get_version(), + ) + ) diff --git a/version.txt b/version.txt deleted file mode 100644 index ac39a10..0000000 --- a/version.txt +++ /dev/null @@ -1 +0,0 @@ -0.9.0 diff --git a/workflow/Snakefile b/workflow/Snakefile index 597bfbd..04bb067 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -224,7 +224,7 @@ rule all: expand(join(WORKDIR, "results", "{sample}", "circExplorer", "{sample}.circularRNA_known.txt") ,sample=SAMPLES), # annotations with "known" GENCODE genes and NOT "known" circRNAs! expand(join(WORKDIR, "results", "{sample}", "circExplorer", "{sample}.circExplorer.annotation_counts.tsv") ,sample=SAMPLES), # circExplorer ... polished BSJ output expand(join(WORKDIR, "results", "{sample}", "circExplorer", "{sample}.BSJ.bed.gz") ,sample=SAMPLES), # filtered BSJs with additional columns like : - # 1. (combination of SAM bitflags in the alignment), eg. 339##419##2385 + # 1. (combination of SAM bitflags in the alignment), eg. 339##419##2385 # 2. comma-separated list of readids (with HI number) supporting the BSJ, eg. J00170:207:H73WTBBXY:2:1108:8197:34266##2,J00170:207:H73WTBBXY:2:1108:6126:34371##2 # 3. splice-site flanking 2 bps eg. GG##CT @@ -232,7 +232,7 @@ rule all: expand(join(WORKDIR, "results", "{sample}", "circExplorer", "{sample}.BSJ.bam") ,sample=SAMPLES), # use called circExplorer circRNA and the chimeric BAM to extract BSJ only alignments expand(join(WORKDIR, "results", "{sample}", "circExplorer", "{sample}.BSJ.plus.bam") ,sample=SAMPLES), # BSJ only alignments to BSJs on the sense strand expand(join(WORKDIR, "results", "{sample}", "circExplorer", "{sample}.BSJ.minus.bam") ,sample=SAMPLES), # BSJ only alignments to BSJs on the anti-sense strand - + expand(join(WORKDIR, "results", "{sample}", "circExplorer", "{sample}.linear_BSJ.bam") ,sample=SAMPLES), # linear reads in BSJ inclusion zone expand(join(WORKDIR, "results", "{sample}", "circExplorer", "{sample}.spliced_BSJ.bam") ,sample=SAMPLES), # spliced-only alignments in the sample expand(join(WORKDIR, "results", "{sample}", "circExplorer", "{sample}.linear.bam") ,sample=SAMPLES), # linear reads ... all non chimeric @@ -252,7 +252,7 @@ rule all: ## circExplorer ... found read counts table expand(join(WORKDIR, "results", "{sample}", "circExplorer", "{sample}.rid2jid.tsv.gz") ,sample=SAMPLES), expand(join(WORKDIR, "results", "{sample}", "circExplorer", "{sample}.readcounts.tsv") ,sample=SAMPLES), # counts table has expected counts, found BSJ counts, linear counts (same and opposite strand), splices BSJ counts - + ## circExplorer_BWA expand(join(WORKDIR, "results", "{sample}", "circExplorer_BWA", "{sample}.circularRNA_known.txt") ,sample=SAMPLES ), # annotations with "known" GENCODE genes and NOT "known" circRNAs! expand(join(WORKDIR, "results", "{sample}", "circExplorer_BWA", "{sample}.circExplorer_bwa.annotation_counts.tsv") ,sample=SAMPLES), @@ -263,34 +263,34 @@ rule all: # ## ciri expand(join(WORKDIR, "results", "{sample}", "ciri", "{sample}.ciri.out.filtered") ,sample=SAMPLES), expand(join(WORKDIR, "results", "{sample}", "ciri", "{sample}.ciri.bam") ,sample=SAMPLES), - + # ## DCC # # expand(join(WORKDIR,"results","{sample}","DCC","{sample}.dcc.counts_table.tsv"),sample=SAMPLES), get_dcc_target_files(RUN_DCC), - + # ## MapSplice get_mapsplice_target_files(RUN_MAPSPLICE), - + # ## NCLscan get_nclscan_target_files(RUN_NCLSCAN), - + # ## circRNA_finder get_circrnafinder_target_files(RUN_CIRCRNAFINDER), - + # ## find_circ get_find_circ_target_files(RUN_FINDCIRC), - + ## alignment stats join(WORKDIR, "results", "alignmentstats.txt"), - + # # ## merged counts per sample table of all counts/annotations from all circRNA callers expand(join(WORKDIR, "results", "{sample}", "{sample}.circRNA_counts.txt.gz"),sample=SAMPLES), - + # # ## master counts file join(WORKDIR, "results", "circRNA_master_counts.tsv.gz"), # HQ only BAMs - expand(join(WORKDIR, "results", "HQ_BSJ_bams","{sample}.HQ_only.BSJ.bam"),sample=SAMPLES) + expand(join(WORKDIR, "results", "HQ_BSJ_bams","{sample}.HQ_only.BSJ.bam"),sample=SAMPLES) # ## multiqc report # join(WORKDIR,"multiqc_report.html") @@ -303,15 +303,20 @@ include: "rules/post_align_processing.smk" include: "rules/findcircrna.smk" include: "rules/post_findcircrna_processing.smk" -jobby_cmd = f'mkdir -p {WORKDIR}/logs && run_jobby_on_snakemake_log {WORKDIR}/snakemake.log | tee {WORKDIR}/logs/snakemake.log.jobby | cut -f2,3,18 > {WORKDIR}/logs/snakemake.log.jobby.short' -spook_cmd = f'spooker {WORKDIR} CHARLIE' +on_finish_cmd = f""" +sleep 10 +mkdir -p {WORKDIR}/log && run_jobby_on_snakemake_log {WORKDIR}/snakemake.log | tee {WORKDIR}/log/snakemake.log.jobby | cut -f2,3,18 > {WORKDIR}/log/snakemake.log.jobby.short +if command -v spooker &> /dev/null; then + spooker {WORKDIR} CHARLIE +else + echo 'WARNING: spooker is not available in the PATH' +fi +""" def on_complete(msg): print(msg) - print(jobby_cmd) - shell(jobby_cmd) - print(spook_cmd) - shell(spook_cmd) + print(on_finish_cmd) + shell(on_finish_cmd) onsuccess: on_complete("OnSuccess") diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 6983e76..47f06cf 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -52,21 +52,13 @@ rule star1p: flanksize=FLANKSIZE, outdir=join(WORKDIR, "results", "{sample}", "STAR1p"), starindexdir=STAR_INDEX_DIR, - alignTranscriptsPerReadNmax=TOOLS["star"]["alignTranscriptsPerReadNmax"], - randomstr=str(uuid.uuid4()), - envmodules: - TOOLS["star"]["version"], + alignTranscriptsPerReadNmax=config["alignTranscriptsPerReadNmax"], + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", + container: config['containers']["star"] threads: getthreads("star1p") shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi - -if [ ! -d {params.outdir} ];then mkdir {params.outdir};fi if [ "{params.peorse}" == "PE" ];then # paired-end overhang=$(zcat {input.R1} {input.R2} | awk -v maxlen=100 'NR%4==2 {{if (length($1) > maxlen+0) maxlen=length($1)}}; END {{print maxlen-1}}') @@ -100,7 +92,7 @@ if [ "{params.peorse}" == "PE" ];then --alignEndsProtrude 10 ConcordantPair \\ --outFilterIntronMotifs None \\ --sjdbGTFfile {input.gtf} \\ - --outTmpDir ${{TMPDIR}} \\ + --outTmpDir {params.tmpdir} \\ --sjdbOverhang $overhang rm -rf {params.sample}_p1._STARgenome @@ -136,7 +128,7 @@ if [ "{params.peorse}" == "PE" ];then --alignEndsProtrude 10 ConcordantPair \\ --outFilterIntronMotifs None \\ --sjdbGTFfile {input.gtf} \\ - --outTmpDir ${{TMPDIR}} \\ + --outTmpDir {params.tmpdir} \\ --sjdbOverhang $overhang rm -rf {params.sample}_mate1._STARgenome @@ -172,7 +164,7 @@ if [ "{params.peorse}" == "PE" ];then --alignEndsProtrude 10 ConcordantPair \\ --outFilterIntronMotifs None \\ --sjdbGTFfile {input.gtf} \\ - --outTmpDir ${{TMPDIR}} \\ + --outTmpDir {params.tmpdir} \\ --sjdbOverhang $overhang rm -rf {params.sample}_mate2._STARgenome @@ -212,7 +204,7 @@ else --alignEndsProtrude 10 ConcordantPair \\ --outFilterIntronMotifs None \\ --sjdbGTFfile {input.gtf} \\ - --outTmpDir ${{TMPDIR}} \\ + --outTmpDir {params.tmpdir} \\ --sjdbOverhang $overhang mkdir -p $(dirname {output.mate1_chimeric_junctions}) touch {output.mate1_chimeric_junctions} @@ -247,6 +239,7 @@ rule merge_SJ_tabs: filter2_noncanonical=config["star_1pass_filter_viruses_noncanonical"], filter2_unannotated=config["star_1pass_filter_viruses_unannotated"], threads: getthreads("merge_SJ_tabs") + container: config['containers']['base'] shell: """ set -exo pipefail @@ -303,25 +296,19 @@ rule star2p: flanksize=FLANKSIZE, outdir=join(WORKDIR, "results", "{sample}", "STAR2p"), starindexdir=STAR_INDEX_DIR, - alignTranscriptsPerReadNmax=TOOLS["star"]["alignTranscriptsPerReadNmax"], - randomstr=str(uuid.uuid4()), - envmodules: - TOOLS["star"]["version"], - TOOLS["sambamba"]["version"], - TOOLS["samtools"]["version"], + alignTranscriptsPerReadNmax=config["alignTranscriptsPerReadNmax"], + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", + container: config['containers']['star_ucsc_cufflinks'] threads: getthreads("star2p") shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d {params.outdir} ];then mkdir {params.outdir};fi limitSjdbInsertNsj=$(wc -l {input.pass1sjtab}|awk '{{print $1+1}}') if [ "$limitSjdbInsertNsj" -lt "400000" ];then limitSjdbInsertNsj="400000";fi + +output_prefix=$(dirname {output.bam})/{params.sample}_p2. + if [ "{params.peorse}" == "PE" ];then # paired-end overhang=$(zcat {input.R1} {input.R2} | awk -v maxlen=100 'NR%4==2 {{if (length($1) > maxlen+0) maxlen=length($1)}}; END {{print maxlen-1}}') @@ -344,7 +331,7 @@ if [ "{params.peorse}" == "PE" ];then --readFilesIn {input.R1} {input.R2} \\ --readFilesCommand zcat \\ --runThreadN {threads} \\ - --outFileNamePrefix {params.sample}_p2. \\ + --outFileNamePrefix $output_prefix \\ --sjdbFileChrStartEnd {input.pass1sjtab} \\ --chimSegmentMin 15 \\ --chimScoreMin 15 \\ @@ -359,7 +346,7 @@ if [ "{params.peorse}" == "PE" ];then --outFilterIntronMotifs None \\ --sjdbGTFfile {input.gtf} \\ --quantMode GeneCounts \\ - --outTmpDir ${{TMPDIR}} \\ + --outTmpDir {params.tmpdir} \\ --sjdbOverhang $overhang \\ --outBAMcompression 0 \\ --outSAMattributes All @@ -389,7 +376,7 @@ else --readFilesIn {input.R1} \\ --readFilesCommand zcat \\ --runThreadN {threads} \\ - --outFileNamePrefix {params.sample}_p2. \\ + --outFileNamePrefix $output_prefix \\ --sjdbFileChrStartEnd {input.pass1sjtab} \\ --chimSegmentMin 15 \\ --chimScoreMin 15 \\ @@ -404,43 +391,43 @@ else --outFilterIntronMotifs None \\ --sjdbGTFfile {input.gtf} \\ --quantMode GeneCounts \\ - --outTmpDir ${{TMPDIR}} \\ + --outTmpDir {params.tmpdir} \\ --sjdbOverhang $overhang \\ --outBAMcompression 0 \\ --outSAMattributes All - rm -rf {params.sample}_p2._STARgenome + rm -rf ${{output_prefix}}_STARgenome fi sleep 120 -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi -samtools view -H {output.unsortedbam} > ${{TMPDIR}}/{params.sample}_p2.non_chimeric.sam -cp ${{TMPDIR}}/{params.sample}_p2.non_chimeric.sam ${{TMPDIR}}/{params.sample}_p2.chimeric.sam +mkdir -p {params.tmpdir} +samtools view -H {output.unsortedbam} > {params.tmpdir}/{params.sample}_p2.non_chimeric.sam +cp {params.tmpdir}/{params.sample}_p2.non_chimeric.sam {params.tmpdir}/{params.sample}_p2.chimeric.sam # ref https://github.com/alexdobin/STAR/issues/678 -samtools view -@ {threads} {output.unsortedbam} | grep "ch:A:1" >> ${{TMPDIR}}/{params.sample}_p2.chimeric.sam -samtools view -@ {threads} {output.unsortedbam} | grep -v "ch:A:1" >> ${{TMPDIR}}/{params.sample}_p2.non_chimeric.sam +samtools view -@ {threads} {output.unsortedbam} | grep "ch:A:1" >> {params.tmpdir}/{params.sample}_p2.chimeric.sam +samtools view -@ {threads} {output.unsortedbam} | grep -v "ch:A:1" >> {params.tmpdir}/{params.sample}_p2.non_chimeric.sam ls -alrth for i in 1 2 3;do - if [ ! -d ${{TMPDIR}}/{params.randomstr}_${{i}} ];then mkdir -p ${{TMPDIR}}/{params.randomstr}_${{i}};fi + mkdir -p {params.tmpdir}_${{i}} done -samtools view -@ {threads} -b -S ${{TMPDIR}}/{params.sample}_p2.chimeric.sam | \\ +samtools view -@ {threads} -b -S {params.tmpdir}/{params.sample}_p2.chimeric.sam | \\ samtools sort \\ -l 9 \\ - -T ${{TMPDIR}}/{params.randomstr}_1 \\ + -T {params.tmpdir}_1 \\ --write-index \\ -@ {threads} \\ --output-fmt BAM \\ -o {output.chimeric_bam} - -samtools view -@ {threads} -b -S ${{TMPDIR}}/{params.sample}_p2.non_chimeric.sam | \\ +samtools view -@ {threads} -b -S {params.tmpdir}/{params.sample}_p2.non_chimeric.sam | \\ samtools sort \\ -l 9 \\ - -T ${{TMPDIR}}/{params.randomstr}_2 \\ + -T {params.tmpdir}_2 \\ --write-index \\ -@ {threads} \\ --output-fmt BAM \\ -o {output.non_chimeric_bam} - samtools sort \\ -l 9 \\ - -T ${{TMPDIR}}/{params.randomstr}_3 \\ + -T {params.tmpdir}_3 \\ --write-index \\ -@ {threads} \\ --output-fmt BAM \\ @@ -478,21 +465,13 @@ rule star_circrnafinder: workdir=WORKDIR, flanksize=FLANKSIZE, starindexdir=STAR_INDEX_DIR, - alignTranscriptsPerReadNmax=TOOLS["star"]["alignTranscriptsPerReadNmax"], - randomstr=str(uuid.uuid4()), - envmodules: - TOOLS["star"]["version"], - TOOLS["sambamba"]["version"], - TOOLS["samtools"]["version"], + alignTranscriptsPerReadNmax=config["alignTranscriptsPerReadNmax"], + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", + container: config['containers']['star_ucsc_cufflinks'] threads: getthreads("star_circrnafinder") shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi outdir=$(dirname {output.chimericsam}) if [ ! -d $outdir ];then mkdir -p $outdir;fi @@ -517,7 +496,7 @@ if [ "{params.peorse}" == "PE" ];then --outFilterMultimapNmax 2 \\ --outFileNamePrefix {params.sample}. \\ --outBAMcompression 0 \\ - --outTmpDir $TMPDIR \\ + --outTmpDir {params.tmpdir} \\ --sjdbGTFfile {input.gtf} else @@ -539,7 +518,7 @@ else --outFilterMultimapNmax 2 \\ --outFileNamePrefix {params.sample}. \\ --outBAMcompression 0 \\ - --outTmpDir $TMPDIR \\ + --outTmpDir {params.tmpdir} \\ --sjdbGTFfile {input.gtf} fi @@ -550,10 +529,6 @@ rm -rf {params.sample}._STARpass1 rm -f {params.sample}.Aligned.out.bam sleep 120 - -# Used to sort the BAM file after effect , but realized that it is not required by circRNA_Finder scripts -# Hence deleting it to save digital footprint by making it temp in output block - """ rule find_circ_align: @@ -569,26 +544,18 @@ rule find_circ_align: "{sample}", "find_circ", "{sample}_anchors.fastq.gz", - ), + ), params: sample="{sample}", reffa=REF_FA, peorse=get_peorse, - find_circ_dir=FIND_CIRC_DIR, - randomstr=str(uuid.uuid4()), - envmodules: - TOOLS["python27"]["version"], - TOOLS["samtools"]["version"], TOOLS['bowtie2']['version'] + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", + container: config['containers']['star_ucsc_cufflinks'] threads: getthreads("find_circ_align") shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi +mkdir -p {params.tmpdir} refdir=$(dirname {input.bt2}) outdir=$(dirname {output.anchorsfq}) @@ -604,8 +571,8 @@ bowtie2 \\ -q \\ -1 {input.R1} \\ -2 {input.R2} \\ - > ${{TMPDIR}}/{params.sample}.sam -else + > {params.tmpdir}/{params.sample}.sam +else bowtie2 \\ -p {threads} \\ --very-sensitive \\ @@ -614,35 +581,35 @@ bowtie2 \\ -x ${{refdir}}/ref \\ -q \\ -U {input.R1} \\ - > ${{TMPDIR}}/{params.sample}.sam + > {params.tmpdir}/{params.sample}.sam fi -samtools view -@{threads} -hbuS -o ${{TMPDIR}}/{params.sample}.unsorted.bam ${{TMPDIR}}/{params.sample}.sam +samtools view -@{threads} -hbuS -o {params.tmpdir}/{params.sample}.unsorted.bam {params.tmpdir}/{params.sample}.sam samtools sort -@{threads} \\ -u \\ --write-index \\ --output-fmt BAM \\ - -T ${{TMPDIR}}/{params.sample}.samtoolssort \\ - -o ${{TMPDIR}}/{params.sample}.sorted.bam ${{TMPDIR}}/{params.sample}.unsorted.bam + -T {params.tmpdir}/{params.sample}.samtoolssort \\ + -o {params.tmpdir}/{params.sample}.sorted.bam {params.tmpdir}/{params.sample}.unsorted.bam samtools view -@{threads} \\ --output-fmt BAM \\ --write-index \\ - -o ${{TMPDIR}}/{params.sample}.unmapped.bam \\ + -o {params.tmpdir}/{params.sample}.unmapped.bam \\ -f4 \\ - ${{TMPDIR}}/{params.sample}.sorted.bam + {params.tmpdir}/{params.sample}.sorted.bam -{params.find_circ_dir}/unmapped2anchors.py \\ - ${{TMPDIR}}/{params.sample}.unmapped.bam | \\ - gzip -c - > ${{TMPDIR}}/{params.sample}.anchors.fastq.gz +unmapped2anchors.py \\ + {params.tmpdir}/{params.sample}.unmapped.bam | \\ + gzip -c - > {params.tmpdir}/{params.sample}.anchors.fastq.gz -mv ${{TMPDIR}}/{params.sample}.anchors.fastq.gz {output.anchorsfq} -mv ${{TMPDIR}}/{params.sample}.unmapped.b* ${{outdir}}/ +mv {params.tmpdir}/{params.sample}.anchors.fastq.gz {output.anchorsfq} +mv {params.tmpdir}/{params.sample}.unmapped.b* ${{outdir}}/ -sleep 300 +sleep 300 -rm -rf $TMPDIR +rm -rf {params.tmpdir} """ @@ -659,12 +626,10 @@ rule estimate_duplication: params: sample="{sample}", memG=getmemG("estimate_duplication"), - envmodules: - TOOLS["picard"]["version"], - TOOLS["java"]["version"], + container: config['containers']["picard"] shell: """ set -exo pipefail -java -Xmx{params.memG} -jar ${{PICARD_JARPATH}}/picard.jar MarkDuplicates I={input.bam} O=/dev/shm/{params.sample}.mark_dup.bam M={output.metrics} +picard -Xmx{params.memG} MarkDuplicates -I {input.bam} -O /dev/shm/{params.sample}.mark_dup.bam -M {output.metrics} rm -f /dev/shm/{params.sample}* """ diff --git a/workflow/rules/create_index.smk b/workflow/rules/create_index.smk index 373f6c2..419c036 100644 --- a/workflow/rules/create_index.smk +++ b/workflow/rules/create_index.smk @@ -20,16 +20,8 @@ rule create_index: script1=join(SCRIPTS_DIR, "_add_geneid2genepred.py"), script2=join(SCRIPTS_DIR, "_multifasta2separatefastas.sh"), script3=join(SCRIPTS_DIR, "fix_gtfs.py"), - randomstr=str(uuid.uuid4()), - nclscan_dir=config["nclscan_dir"], nclscan_config=config["nclscan_config"], - envmodules: - TOOLS["star"]["version"], - TOOLS["bwa"]["version"], - TOOLS["samtools"]["version"], - TOOLS["ucsc"]["version"], - TOOLS["cufflinks"]["version"], - TOOLS["python37"]["version"], + container: config['containers']['star_ucsc_cufflinks'] threads: getthreads("create_index") shell: """ @@ -44,7 +36,7 @@ samtools faidx {params.reffa} && \\ python {params.script3} --ingtf {params.refgtf} --outgtf {output.fixed_gtf} gffread -w {output.transcripts_fa} -g {params.reffa} {output.fixed_gtf} touch {output.lncRNA_transcripts_fa} -{params.nclscan_dir}/bin/create_reference.py -c {params.nclscan_config} +create_reference.py -c {params.nclscan_config} gtfToGenePred -ignoreGroupsWithoutExons {output.fixed_gtf} ref.genes.genepred && \\ python {params.script1} {output.fixed_gtf} ref.genes.genepred > {output.genepred_w_geneid} @@ -61,9 +53,6 @@ STAR \\ bash {params.script2} {params.reffa} {params.refdir}/separate_fastas ls {params.refdir}/separate_fastas/*.fa | awk {AWK1} > {output.fastalst} # may have to create bowtie1 index here.. has to be a separate rule ... see below - - - """ @@ -79,8 +68,7 @@ rule create_mapsplice_index: separate_fastas=join(REF_DIR, "separate_fastas"), ebwt=join(REF_DIR, "separate_fastas_index"), threads: getthreads("create_mapsplice_index") - container: - "docker://cgrlab/mapsplice2:latest" + container: "docker://cgrlab/mapsplice2:latest" shell: """ set -exo pipefail @@ -99,7 +87,7 @@ rule create_bwa_index: log=join(REF_DIR,"bwa_index.log") params: reffa=REF_FA - envmodules: TOOLS["bwa"]["version"] + container: config['containers']["base"] shell:""" set -exo pipefail refdir=$(dirname {params.reffa}) @@ -115,7 +103,7 @@ rule create_bowtie2_index: bt2=join(REF_DIR,"ref.1.bt2") params: reffa=REF_FA - envmodules: TOOLS["bowtie2"]["version"] + container: config['containers']["base"] shell:""" set -exo pipefail refdir=$(dirname {params.reffa}) @@ -131,7 +119,7 @@ rule create_bowtie1_index: bt2=join(REF_DIR,"ref.1.ebwt") params: reffa=REF_FA - envmodules: TOOLS["bowtie1"]["version"] + container: config['containers']["bowtie1"] shell:""" set -exo pipefail refdir=$(dirname {params.reffa}) diff --git a/workflow/rules/findcircrna.smk b/workflow/rules/findcircrna.smk index 28e5cc7..7edec8d 100644 --- a/workflow/rules/findcircrna.smk +++ b/workflow/rules/findcircrna.smk @@ -101,7 +101,7 @@ def get_per_sample_files_to_merge(wildcards): # | 10 | exonStarts | Exon start positions | # | 11 | exonEnds | Exon end positions | -# outout "known" file has the following columns: +# output "known" file has the following columns: # | # | ColName | Description | # |----|-------------|-------------------------------------| # | 1 | chrom | Chromosome | @@ -197,25 +197,18 @@ rule circExplorer: minsize_virus=config["minsize_virus"], maxsize_virus=config["maxsize_virus"], bash_script=join(SCRIPTS_DIR,"_run_circExplorer_star.sh"), - randomstr=str(uuid.uuid4()), + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", # script=join(SCRIPTS_DIR, "circExplorer_get_annotated_counts_per_sample.py"), # this produces an annotated counts table to which counts found in BAMs need to be appended threads: getthreads("circExplorer") - envmodules: - TOOLS["circexplorer"]["version"], + container: config['containers']["circexplorer"] shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi -if [ ! -d {params.outdir} ];then mkdir {params.outdir};fi +mkdir -p {params.outdir} {params.tmpdir} cd {params.outdir} bash {params.bash_script} \\ --junctionfile {input.junctionfile} \\ - --tmpdir $TMPDIR \\ + --tmpdir {params.tmpdir} \\ --outdir {params.outdir} \\ --samplename {params.sample} \\ --genepred {params.genepred} \\ @@ -283,20 +276,12 @@ rule ciri: minsize_virus=config["minsize_virus"], maxsize_virus=config["maxsize_virus"], script=join(SCRIPTS_DIR, "filter_ciriout.py"), - randomstr=str(uuid.uuid4()), + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", threads: getthreads("ciri") - envmodules: - TOOLS["bwa"]["version"], - TOOLS["samtools"]["version"], + container: config['containers']['ciri'] shell: """ -set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi +mkdir -p {params.outdir} {params.tmpdir} cd {params.outdir} if [ "{params.peorse}" == "PE" ];then ## paired-end @@ -317,8 +302,8 @@ perl {params.ciripl} \\ -F {params.reffa} \\ -A {input.gtf} \\ -G {output.cirilog} -T {threads} -# samtools view -@{threads} -T {params.reffa} -CS {params.sample}.bwa.sam | samtools sort -l 9 -T $TMPDIR --write-index -@{threads} -O CRAM -o {output.ciribam} - -samtools view -@{threads} -bS {params.sample}.bwa.sam | samtools sort -l 9 -T $TMPDIR --write-index -@{threads} -O BAM -o {output.ciribam} - +# samtools view -@{threads} -T {params.reffa} -CS {params.sample}.bwa.sam | samtools sort -l 9 -T {params.tmpdir} --write-index -@{threads} -O CRAM -o {output.ciribam} - +samtools view -@{threads} -bS {params.sample}.bwa.sam | samtools sort -l 9 -T {params.tmpdir} --write-index -@{threads} -O BAM -o {output.ciribam} - rm -rf {params.sample}.bwa.sam python {params.script} \\ --ciriout {output.ciriout} \\ @@ -374,25 +359,19 @@ rule circExplorer_bwa: minsize_virus=config["minsize_virus"], maxsize_virus=config["maxsize_virus"], bash_script=join(SCRIPTS_DIR,"_run_circExplorer_bwa.sh"), - randomstr=str(uuid.uuid4()), + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", # script=join(SCRIPTS_DIR, "circExplorer_get_annotated_counts_per_sample.py"), # this produces an annotated counts table to which counts found in BAMs need to be appended threads: getthreads("circExplorer") - envmodules: - TOOLS["circexplorer"]["version"], + container: config['containers']["circexplorer"] shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi -if [ ! -d {params.outdir} ];then mkdir {params.outdir};fi +mkdir -p {params.tmpdir} {params.outdir} + cd {params.outdir} bash {params.bash_script} \\ --bwabam {input.ciribam} \\ - --tmpdir $TMPDIR \\ + --tmpdir {params.tmpdir} \\ --outdir {params.outdir} \\ --samplename {params.sample} \\ --genepred {params.genepred} \\ @@ -427,8 +406,7 @@ rule create_ciri_count_matrix: lookup=ANNOTATION_LOOKUP, outdir=join(WORKDIR, "results"), hostID=HOST + "ID", - envmodules: - TOOLS["python37"]["version"], + container: config['containers']['base'] shell: """ set -exo pipefail @@ -459,8 +437,7 @@ rule create_circexplorer_count_matrix: lookup=ANNOTATION_LOOKUP, outdir=join(WORKDIR, "results"), hostID=HOST + "ID", - envmodules: - TOOLS["python37"]["version"], + container: config['containers']['base'] shell: """ cd {params.outdir} @@ -477,7 +454,7 @@ python {params.script2} {params.lookup} {params.hostID} # and to extract "Relative expression of circRNA" for downstream purposes # CLEAR does not quantify "Relative expression of circRNA" for novel circRNA, ie., # circRNAs not labeled as "known" possible due to poor genome annotation. -# circRNA is labled as "known" if its coordinates match with exons of known genes! +# circRNA is labelled as "known" if its coordinates match with exons of known genes! # quant.txt is a TSV with the following columns: # | # | ColName | Description | # |----|-------------|-------------------------------------| @@ -510,8 +487,7 @@ rule clear: quantfile=join(WORKDIR, "results", "{sample}", "CLEAR", "quant.txt"), params: genepred=rules.create_index.output.genepred_w_geneid, - container: - "docker://nciccbr/ccbr_clear:latest" + container: config['containers']['clear'] threads: getthreads("clear") shell: """ @@ -579,6 +555,7 @@ rule annotate_clear_output: lookup=ANNOTATION_LOOKUP, cleardir=join(WORKDIR, "results", "{sample}", "CLEAR"), hostID=HOST + "ID", + container: config['containers']['base'] shell: """ set -exo pipefail @@ -626,6 +603,7 @@ rule dcc_create_samplesheets: ss=join(WORKDIR, "results", "{sample}", "DCC", "samplesheet.txt"), m1=join(WORKDIR, "results", "{sample}", "DCC", "mate1.txt"), m2=join(WORKDIR, "results", "{sample}", "DCC", "mate2.txt"), + container: config['containers']['dcc'] shell: """ set -exo pipefail @@ -659,8 +637,7 @@ echo "{input.f3}" > {output.m2} # | 6 | Strand | | # | 7 | Start-End Region | eg. intron-intergenic, exon-exon, intergenic-intron, etc. | # | 8 | OverallRegion | the genomic features circRNA coordinates interval covers | - - +# # output dcc.counts_table.tsv has the following columns: # | # | ColName | # |---|----------------| @@ -690,14 +667,13 @@ rule dcc: "{sample}.dcc.counts_table.tsv.filtered", ), threads: getthreads("dcc") - envmodules: - TOOLS["python27"]["version"], + container: config['containers']['dcc'] params: peorse=get_peorse, dcc_strandedness=config["dcc_strandedness"], rep=REPEATS_GTF, fa=REF_FA, - randomstr=str(uuid.uuid4()), + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", script=join(SCRIPTS_DIR, "create_dcc_per_sample_counts_table.py"), bsj_min_nreads=config["minreadcount"], refregions=REF_REGIONS, @@ -712,19 +688,12 @@ rule dcc: shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi +mkdir -p {params.tmpdir} -. "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" -conda activate DCC cd $(dirname {output.cr}) if [ "{params.peorse}" == "PE" ];then DCC @{input.ss} \\ - --temp ${{TMPDIR}}/DCC \\ + --temp {params.tmpdir}/DCC \\ --threads {threads} \\ --detect --gene \\ --bam {input.bam} \\ @@ -738,7 +707,7 @@ DCC @{input.ss} \\ -mt2 @{input.m2} else DCC @{input.ss} \\ - --temp ${{TMPDIR}}/DCC \\ + --temp {params.tmpdir}/DCC \\ --threads {threads} \\ --detect --gene \\ --bam {input.bam} \\ @@ -746,15 +715,15 @@ DCC @{input.ss} \\ --annotation {input.gtf} \\ --chrM -G \\ --rep_file {params.rep} \\ - --refseq {params.fa} + --refseq {params.fa} fi -ls -alrth ${{TMPDIR}} +ls -alrth {params.tmpdir} -paste {output.cr} {output.linear} | cut -f1-5,9 > ${{TMPDIR}}/CircRNALinearCount +paste {output.cr} {output.linear} | cut -f1-5,9 > {params.tmpdir}/CircRNALinearCount python {params.script} \\ - --CircCoordinates {output.cc} --CircRNALinearCount ${{TMPDIR}}/CircRNALinearCount -o {output.ct} + --CircCoordinates {output.cc} --CircRNALinearCount {params.tmpdir}/CircRNALinearCount -o {output.ct} python {params.script2} \\ --in_dcc_counts_table {output.ct} \\ @@ -855,19 +824,13 @@ rule mapsplice: separate_fastas=join(REF_DIR, "separate_fastas"), ebwt=join(REF_DIR, "separate_fastas_index"), outdir=join(WORKDIR, "results", "{sample}", "MapSplice"), - randomstr=str(uuid.uuid4()), + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", threads: getthreads("mapsplice") - container: - "docker://cgrlab/mapsplice2:latest" + container: config['containers']['mapsplice'] shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi +mkdir -p {params.tmpdir} MSHOME="/opt/MapSplice2" # singularity exec -B /data/Ziegelbauer_lab,/data/kopardevn \ @@ -877,12 +840,12 @@ if [ "{params.peorse}" == "PE" ];then R1fn=$(basename {input.R1}) R2fn=$(basename {input.R2}) -zcat {input.R1} > ${{TMPDIR}}/${{R1fn%.*}} -zcat {input.R2} > ${{TMPDIR}}/${{R2fn%.*}} +zcat {input.R1} > {params.tmpdir}/${{R1fn%.*}} +zcat {input.R2} > {params.tmpdir}/${{R2fn%.*}} python $MSHOME/mapsplice.py \\ - -1 ${{TMPDIR}}/${{R1fn%.*}} \\ - -2 ${{TMPDIR}}/${{R2fn%.*}} \\ + -1 {params.tmpdir}/${{R1fn%.*}} \\ + -2 {params.tmpdir}/${{R2fn%.*}} \\ -c {params.separate_fastas} \\ -p {threads} \\ --min-map-len {params.minmaplen} \\ @@ -897,10 +860,10 @@ python $MSHOME/mapsplice.py \\ else R1fn=$(basename {input.R1}) -zcat {input.R1} > ${{TMPDIR}}/${{R1fn%.*}} +zcat {input.R1} > {params.tmpdir}/${{R1fn%.*}} python $MSHOME/mapsplice.py \ - -1 ${{TMPDIR}}/${{R1fn%.*}} \ + -1 {params.tmpdir}/${{R1fn%.*}} \ -c {params.separate_fastas} \ -p {threads} \ -x {params.ebwt} \ @@ -951,13 +914,11 @@ rule mapsplice_postprocess: bai=join( WORKDIR, "results", "{sample}", "MapSplice", "{sample}.mapsplice.cram.crai" ), - envmodules: - TOOLS["samtools"]["version"], - TOOLS["python27"]["version"], + container: config['containers']['star_ucsc_cufflinks'] params: script=join(SCRIPTS_DIR, "create_mapsplice_per_sample_counts_table.py"), memG=getmemG("mapsplice_postprocess"), - randomstr=str(uuid.uuid4()), + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", bsj_min_nreads=config["minreadcount"], refregions=REF_REGIONS, reffa=REF_FA, @@ -972,12 +933,7 @@ rule mapsplice_postprocess: shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi +mkdir -p {params.tmpdir} python {params.script} \\ --circularRNAstxt {input.circRNAs} \\ -o {output.ct} \\ @@ -991,8 +947,8 @@ python {params.script} \\ --host_filter_max {params.maxsize_host} \\ --virus_filter_min {params.minsize_virus} \\ --virus_filter_max {params.maxsize_virus} -cd $TMPDIR -samtools view -@{threads} -T {params.reffa} -CS {input.sam} | samtools sort -l 9 -T $TMPDIR --write-index -@{threads} -O CRAM -o alignments.cram - +cd {params.tmpdir} +samtools view -@{threads} -T {params.reffa} -CS {input.sam} | samtools sort -l 9 -T {params.tmpdir} --write-index -@{threads} -O CRAM -o alignments.cram - rsync -az --progress alignments.cram {output.bam} rsync -az --progress alignments.cram.crai {output.bai} """ @@ -1043,17 +999,15 @@ rule nclscan: "NCLscan", "{sample}.nclscan.counts_table.tsv.filtered", ), - envmodules: - TOOLS["ncl_required_modules"], + container: config['containers']['star_ucsc_cufflinks'] threads: getthreads("nclscan") params: workdir=WORKDIR, sample="{sample}", peorse=get_peorse, - nclscan_dir=config["nclscan_dir"], nclscan_config=config["nclscan_config"], script=join(SCRIPTS_DIR, "create_nclscan_per_sample_counts_table.py"), - randomstr=str(uuid.uuid4()), + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", bsj_min_nreads=config["minreadcount"], refregions=REF_REGIONS, host=HOST, @@ -1066,18 +1020,13 @@ rule nclscan: shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi +mkdir -p {params.tmpdir} outdir=$(dirname {output.result}) results_bn=$(basename {output.result}) if [ "{params.peorse}" == "PE" ];then -{params.nclscan_dir}/NCLscan.py -c {params.nclscan_config} -pj {params.sample} -o $TMPDIR --fq1 {input.R1} --fq2 {input.R2} -rsync -az --progress ${{TMPDIR}}/${{results_bn}} {output.result} +NCLscan.py -c {params.nclscan_config} -pj {params.sample} -o {params.tmpdir} --fq1 {input.R1} --fq2 {input.R2} +rsync -az --progress {params.tmpdir}/${{results_bn}} {output.result} python {params.script} \\ --result {output.result} \\ -o {output.ct} \\ @@ -1091,14 +1040,6 @@ python {params.script} \\ --host_filter_max {params.maxsize_host} \\ --virus_filter_min {params.minsize_virus} \\ --virus_filter_max {params.maxsize_virus} -# else -# outdir=$(dirname {output.result}) -# if [ ! -d $outdir ];then -# mkdir -p $outdir -# fi -# touch {output.result} -# touch {output.ct} -# This part is redundant as it is already taken care of by get_nclscan_target_files function! fi """ @@ -1145,23 +1086,13 @@ rule circrnafinder: "{sample}.Chimeric.out.sorted.bam", ), params: - postProcessStarAlignment_script=join( - config["circrnafinder_dir"], "postProcessStarAlignment.pl" - ), bsj_min_nreads=config["minreadcount"], - randomstr=str(uuid.uuid4()), - envmodules: - TOOLS["perl"]["version"], - TOOLS["samtools"]["version"], + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", + container: config['containers']['circRNA_finder'] shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi +mkdir -p {params.tmpdir} starDir=$(dirname {input.chimericsam}) outDir=$(dirname {output.bed}) @@ -1169,7 +1100,7 @@ outDir=$(dirname {output.bed}) if [ -d $outDir ];then rm -rf $outDir;fi if [ ! -d $outDir ];then mkdir -p $outDir;fi -{params.postProcessStarAlignment_script} \\ +postProcessStarAlignment.pl \\ --starDir ${{starDir}}/ \\ --outDir ${{outDir}}/ @@ -1223,30 +1154,19 @@ rule find_circ: params: sample="{sample}", reffa=REF_FA, - find_cir_dir=FIND_CIRC_DIR, find_circ_params=config['findcirc_params'], min_reads=config['circexplorer_bsj_circRNA_min_reads'], collapse_script=join(SCRIPTS_DIR,"_collapse_find_circ.py"), - randomstr=str(uuid.uuid4()), - envmodules: - TOOLS["python27"]["version"], - TOOLS["bowtie2"]["version"], - TOOLS["samtools"]["version"], - TOOLS["parallel"]["version"], + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", + container: config['containers']['star_ucsc_cufflinks'] threads: getthreads("find_circ") shell: """ set -exo pipefail python --version which python -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi - -if [ ! -d $TMPDIR ]; then mkdir -p $TMPDIR;fi -cd $TMPDIR +mkdir -p {params.tmpdir} +cd {params.tmpdir} refdir=$(dirname {input.bt2}) outdir=$(dirname {output.find_circ_bsj_bed}) @@ -1264,41 +1184,41 @@ outdir=$(dirname {output.find_circ_bsj_bed}) # These _A and _B pairs should be retained in the fastq splits # find number of lines in fastq file -cp {input.anchorsfq} ${{TMPDIR}} +cp {input.anchorsfq} {params.tmpdir} fname=$(basename {input.anchorsfq}) fname_wo_gz=$(echo $fname|sed "s/.gz//g") pigz -d $fname total_lines=$(wc -l ${{fname_wo_gz}} | awk '{{print $1}}') split_nlines=$(echo $total_lines| awk '{{print sprintf("%d", $1/10)}}' | awk '{{print sprintf("%d",($1+7)/8+1)}}' | awk '{{print sprintf("%d",$1*8)}}') -split -d -l $split_nlines --suffix-length 1 $fname_wo_gz ${{TMPDIR}}/{params.sample}.samsplit. +split -d -l $split_nlines --suffix-length 1 $fname_wo_gz {params.tmpdir}/{params.sample}.samsplit. -if [ -f ${{TMPDIR}}/do_find_circ ];then rm -f ${{TMPDIR}}/do_find_circ;fi +if [ -f {params.tmpdir}/do_find_circ ];then rm -f {params.tmpdir}/do_find_circ;fi for i in $(seq 0 9);do bowtie2 -p {threads} \\ --score-min=C,-15.0 \\ --reorder --mm \\ - -q -U ${{TMPDIR}}/{params.sample}.samsplit.${{i}} \\ - -x ${{refdir}}/ref > ${{TMPDIR}}/{params.sample}.samsplit.${{i}}.sam + -q -U {params.tmpdir}/{params.sample}.samsplit.${{i}} \\ + -x ${{refdir}}/ref > {params.tmpdir}/{params.sample}.samsplit.${{i}}.sam -cat <>${{TMPDIR}}/do_find_circ -cat ${{TMPDIR}}/{params.sample}.samsplit.${{i}}.sam | \\ -{params.find_cir_dir}/find_circ.py \\ +cat <>{params.tmpdir}/do_find_circ +cat {params.tmpdir}/{params.sample}.samsplit.${{i}}.sam | \\ +find_circ.py \\ --genome={params.reffa} \\ --prefix={params.sample}.find_circ \\ --name={params.sample} \\ {params.find_circ_params} \\ --stats=${{outdir}}/{params.sample}.bowtie2_stats.${{i}}.txt \\ - --reads=${{TMPDIR}}/{params.sample}.bowtie2_spliced_reads.${{i}}.fa \\ - > ${{TMPDIR}}/{params.sample}.splice_sites.${{i}}.bed + --reads={params.tmpdir}/{params.sample}.bowtie2_spliced_reads.${{i}}.fa \\ + > {params.tmpdir}/{params.sample}.splice_sites.${{i}}.bed EOF done -parallel -j 10 < ${{TMPDIR}}/do_find_circ +parallel -j 10 < {params.tmpdir}/do_find_circ -cat ${{TMPDIR}}/{params.sample}.splice_sites.*.bed > ${{TMPDIR}}/{params.sample}.splice_sites.bed +cat {params.tmpdir}/{params.sample}.splice_sites.*.bed > {params.tmpdir}/{params.sample}.splice_sites.bed -grep CIRCULAR ${{TMPDIR}}/{params.sample}.splice_sites.bed | \\ +grep CIRCULAR {params.tmpdir}/{params.sample}.splice_sites.bed | \\ grep ANCHOR_UNIQUE \\ > {output.find_circ_bsj_bed} @@ -1360,6 +1280,7 @@ rule merge_per_sample: minreadcount=config["minreadcount"], # this filter is redundant as inputs are already pre-filtered. high_confidence_core_callers=config["high_confidence_core_callers"], # comma separated list ... default circExplorer,circExplorer_bwa high_confidence_core_callers_plus_n=config["high_confidence_core_callers_plus_n"] # number of callers in addition to core callers that need to call the circRNA for it to be called "High Confidence" + container: config['containers']['star_ucsc_cufflinks'] shell: """ python3 {params.script} \\ @@ -1381,185 +1302,6 @@ bash {output.merge_bash_script} """ -# if RUN_DCC and RUN_MAPSPLICE and RUN_NCLSCAN: -# rule merge_per_sample: -# input: -# unpack(get_per_sample_files_to_merge) -# output: -# merged_counts=join(WORKDIR,"results","{sample}","{sample}.circRNA_counts.txt.gz"), -# params: -# script=join(SCRIPTS_DIR,"_merge_per_sample_counts_table.py"), -# samplename="{sample}", -# peorse=get_peorse, -# reffa=REF_FA, -# minreadcount=config['minreadcount'] # this filter is redundant as inputs are already pre-filtered. -# envmodules: -# TOOLS["python37"]["version"] -# shell:""" -# set -exo pipefail -# outdir=$(dirname {output.merged_counts}) - -# parameters=" --circExplorer {input.circExplorer}" -# parameters="$parameters --ciri {input.CIRI}" -# parameters="$parameters --dcc {input.DCC}" -# parameters="$parameters --mapsplice {input.MapSplice}" -# if [[ "{params.peorse}" == "PE" ]]; then # NCLscan is run only if the sample is PE -# parameters="$parameters --nclscan {input.NCLscan}" -# fi -# parameters="$parameters --min_read_count_reqd {params.minreadcount}" -# parameters="$parameters --reffa {params.reffa}" -# parameters="$parameters --samplename {params.samplename} -o {output.merged_counts}" - -# echo "python {params.script} $parameters" -# python {params.script} $parameters -# """ -# elif RUN_DCC and not RUN_MAPSPLICE and not RUN_NCLSCAN: -# rule merge_per_sample: -# input: -# unpack(get_per_sample_files_to_merge) -# output: -# merged_counts=join(WORKDIR,"results","{sample}","{sample}.circRNA_counts.txt.gz"), -# params: -# script=join(SCRIPTS_DIR,"_merge_per_sample_counts_table.py"), -# samplename="{sample}", -# peorse=get_peorse, -# reffa=REF_FA, -# minreadcount=config['minreadcount'] # this filter is redundant as inputs are already pre-filtered. -# envmodules: -# TOOLS["python37"]["version"] -# shell:""" -# set -exo pipefail -# outdir=$(dirname {output.merged_counts}) - -# parameters=" --circExplorer {input.circExplorer}" -# parameters="$parameters --ciri {input.CIRI}" -# parameters="$parameters --dcc {input.DCC}" -# parameters="$parameters --min_read_count_reqd {params.minreadcount}" -# parameters="$parameters --reffa {params.reffa}" -# parameters="$parameters --samplename {params.samplename} -o {output.merged_counts}" - -# echo "python {params.script} $parameters" -# python {params.script} $parameters -# """ -# elif RUN_DCC and RUN_MAPSPLICE and not RUN_NCLSCAN: -# rule merge_per_sample: -# input: -# unpack(get_per_sample_files_to_merge) -# output: -# merged_counts=join(WORKDIR,"results","{sample}","{sample}.circRNA_counts.txt.gz"), -# params: -# script=join(SCRIPTS_DIR,"_merge_per_sample_counts_table.py"), -# samplename="{sample}", -# peorse=get_peorse, -# reffa=REF_FA, -# minreadcount=config['minreadcount'] # this filter is redundant as inputs are already pre-filtered. -# envmodules: -# TOOLS["python37"]["version"] -# shell:""" -# set -exo pipefail -# outdir=$(dirname {output.merged_counts}) - -# parameters=" --circExplorer {input.circExplorer}" -# parameters="$parameters --ciri {input.CIRI}" -# parameters="$parameters --dcc {input.DCC}" -# parameters="$parameters --mapsplice {input.MapSplice}" -# parameters="$parameters --min_read_count_reqd {params.minreadcount}" -# parameters="$parameters --reffa {params.reffa}" -# parameters="$parameters --samplename {params.samplename} -o {output.merged_counts}" - -# echo "python {params.script} $parameters" -# python {params.script} $parameters -# """ -# elif not RUN_DCC and RUN_MAPSPLICE and not RUN_NCLSCAN: -# rule merge_per_sample: -# input: -# unpack(get_per_sample_files_to_merge) -# output: -# merged_counts=join(WORKDIR,"results","{sample}","{sample}.circRNA_counts.txt.gz"), -# params: -# script=join(SCRIPTS_DIR,"_merge_per_sample_counts_table.py"), -# samplename="{sample}", -# peorse=get_peorse, -# reffa=REF_FA, -# minreadcount=config['minreadcount'] # this filter is redundant as inputs are already pre-filtered. -# envmodules: -# TOOLS["python37"]["version"] -# shell:""" -# set -exo pipefail -# outdir=$(dirname {output.merged_counts}) - -# parameters=" --circExplorer {input.circExplorer}" -# parameters="$parameters --ciri {input.CIRI}" -# parameters="$parameters --mapsplice {input.MapSplice}" -# parameters="$parameters --min_read_count_reqd {params.minreadcount}" -# parameters="$parameters --reffa {params.reffa}" -# parameters="$parameters --samplename {params.samplename} -o {output.merged_counts}" - -# echo "python {params.script} $parameters" -# python {params.script} $parameters -# """ -# elif not RUN_DCC and not RUN_MAPSPLICE and RUN_NCLSCAN: -# rule merge_per_sample: -# input: -# unpack(get_per_sample_files_to_merge) -# output: -# merged_counts=join(WORKDIR,"results","{sample}","{sample}.circRNA_counts.txt.gz"), -# params: -# script=join(SCRIPTS_DIR,"_merge_per_sample_counts_table.py"), -# samplename="{sample}", -# peorse=get_peorse, -# reffa=REF_FA, -# minreadcount=config['minreadcount'] # this filter is redundant as inputs are already pre-filtered. -# envmodules: -# TOOLS["python37"]["version"] -# shell:""" -# set -exo pipefail -# outdir=$(dirname {output.merged_counts}) - -# parameters=" --circExplorer {input.circExplorer}" -# parameters="$parameters --ciri {input.CIRI}" -# if [[ "{params.peorse}" == "PE" ]]; then # NCLscan is run only if the sample is PE -# parameters="$parameters --nclscan {input.NCLscan}" -# fi -# parameters="$parameters --min_read_count_reqd {params.minreadcount}" -# parameters="$parameters --reffa {params.reffa}" -# parameters="$parameters --samplename {params.samplename} -o {output.merged_counts}" - -# echo "python {params.script} $parameters" -# python {params.script} $parameters -# """ -# else: # DCC, MapSplice and NCLScan are all off! -# rule merge_per_sample: -# input: -# unpack(get_per_sample_files_to_merge) -# output: -# merged_counts=join(WORKDIR,"results","{sample}","{sample}.circRNA_counts.txt.gz"), -# params: -# script=join(SCRIPTS_DIR,"_merge_per_sample_counts_table.py"), -# samplename="{sample}", -# peorse=get_peorse, -# reffa=REF_FA, -# minreadcount=config['minreadcount'] # this filter is redundant as inputs are already pre-filtered. -# envmodules: -# TOOLS["python37"]["version"] -# shell:""" -# set -exo pipefail -# outdir=$(dirname {output.merged_counts}) - -# parameters=" --circExplorer {input.circExplorer}" -# parameters="$parameters --ciri {input.CIRI}" -# parameters="$parameters --min_read_count_reqd {params.minreadcount}" -# parameters="$parameters --reffa {params.reffa}" -# parameters="$parameters --samplename {params.samplename} -o {output.merged_counts}" - -# echo "python {params.script} $parameters" -# python {params.script} $parameters -# """ - - -# localrules: -# create_master_counts_file, - # rule create_master_counts_file: # merge all per-sample counts tables into a single giant counts matrix and annotate it with known circRNA databases @@ -1576,8 +1318,7 @@ rule create_master_counts_file: resultsdir=join(WORKDIR, "results"), lookup_table=ANNOTATION_LOOKUP, bsj_min_nreads=config["circexplorer_bsj_circRNA_min_reads"], - envmodules: - TOOLS["python37"]["version"], + container: config['containers']['base'] shell: """ set -exo pipefail diff --git a/workflow/rules/init.smk b/workflow/rules/init.smk index 1ce5296..9aea06c 100644 --- a/workflow/rules/init.smk +++ b/workflow/rules/init.smk @@ -17,7 +17,7 @@ def check_existence(filename): """ filename = filename.strip() if not os.path.exists(filename): - sys.exit("File: {} does not exists!".format(filename)) + raise FileNotFoundError(f"File: {filename} does not exist") return True @@ -28,10 +28,8 @@ def check_readaccess(filename): filename = filename.strip() check_existence(filename) if not os.access(filename, os.R_OK): - sys.exit( - "File: {} exists, but user cannot read from file due to permissions!".format( - filename - ) + raise PermissionError( + f"File: {filename} exists, but user cannot read from file due to permissions" ) return True @@ -43,10 +41,8 @@ def check_writeaccess(filename): filename = filename.strip() check_existence(filename) if not os.access(filename, os.W_OK): - sys.exit( - "File: {} exists, but user cannot write to file due to permissions!".format( - filename - ) + raise PermissionError( + f"File: {filename} exists, but user cannot write to file due to permissions!" ) return True @@ -88,8 +84,12 @@ def _convert_to_int(variable): return -1 # Unknown -# resouce absolute path +# resource absolute path WORKDIR = config["workdir"] +TEMPDIR = config["tempdir"] +if not os.access(TEMPDIR, os.W_OK): + raise PermissionError(f"TEMPDIR {TEMPDIR} cannot be written to.\n\tHint: does the path exist and do you have write permissions?") + SCRIPTS_DIR = config["scriptsdir"] RESOURCES_DIR = config["resourcesdir"] FASTAS_GTFS_DIR = config["fastas_gtfs_dir"] @@ -108,9 +108,9 @@ N_RUN_FINDCIRC = _convert_to_int(RUN_FINDCIRC) MAPSPLICE_MIN_MAP_LEN = config["mapsplice_min_map_len"] MAPSPLICE_FILTERING = config["mapsplice_filtering"] FLANKSIZE = config['flanksize'] -FIND_CIRC_DIR = config['find_circ_dir'] HQCC=config["high_confidence_core_callers"].replace(" ","") CALLERS=["circExplorer","ciri","circExplorer_bwa"] + # if RUN_CLEAR: CALLERS.append("clear") if RUN_DCC: CALLERS.append("dcc") if RUN_MAPSPLICE: CALLERS.append("mapsplice") @@ -122,7 +122,7 @@ HQCC=HQCC.split(",") HQCC=list(map(lambda x:x.lower(),HQCC)) for caller in HQCC: if not caller in CALLERS: - sys.exit("Caller: {} will not be running but included in high_confidence_core_callers!".format(caller)) + raise ValueError(f"Caller: {caller} will not be running but included in high_confidence_core_callers. Please edit config.yaml") REF_DIR = join(WORKDIR, "ref") if not os.path.exists(REF_DIR): os.mkdir(REF_DIR) @@ -156,7 +156,7 @@ if VIRUSES != "": else: HOST_VIRUSES = HOST HOST_ADDITIVES_VIRUSES = HOST_ADDITIVES - + REPEATS_GTF = join(FASTAS_GTFS_DIR, HOST + ".repeats.gtf") HOST_ADDITIVES_VIRUSES = HOST_ADDITIVES_VIRUSES.split(",") @@ -182,7 +182,7 @@ if not os.path.exists(join(WORKDIR, "fastqs")): if not os.path.exists(join(WORKDIR, "results")): os.mkdir(join(WORKDIR, "results")) -REQUIRED_FILES = [config[f] for f in ["samples", "tools", "cluster"]] +REQUIRED_FILES = [config[f] for f in ["samples", "cluster"]] REQUIRED_FILES.append(ANNOTATION_LOOKUP) REQUIRED_FILES.extend(FASTAS) REQUIRED_FILES.extend(REGIONS) @@ -230,12 +230,6 @@ for sample in SAMPLES: SAMPLESDF.loc[[sample], "R2"] = R2filenewname else: SAMPLESDF.loc[[sample], "PEorSE"] = "SE" -# print(SAMPLESDF) -# sys.exit() - -## Load tools from YAML file -with open(config["tools"]) as f: - TOOLS = yaml.safe_load(f) ## Load cluster.json with open(config["cluster"]) as json_file: diff --git a/workflow/rules/post_align_processing.smk b/workflow/rules/post_align_processing.smk index 532e1eb..21b0bc5 100644 --- a/workflow/rules/post_align_processing.smk +++ b/workflow/rules/post_align_processing.smk @@ -1,92 +1,3 @@ -# rule split_splice_reads_BAM_create_BW: -# input: -# bam=rules.create_spliced_reads_bam.output.bam -# output: -# bam=join(WORKDIR,"results","{sample}","STAR2p","{sample}.spliced_reads."+HOST+".bam"), -# bw=join(WORKDIR,"results","{sample}","STAR2p","{sample}.spliced_reads."+HOST+".bw") -# params: -# sample="{sample}", -# workdir=WORKDIR, -# memG=getmemG("split_splice_reads_BAM_create_BW"), -# outdir=join(WORKDIR,"results","{sample}","STAR2p"), -# regions=REF_REGIONS, -# randomstr=str(uuid.uuid4()) -# threads: getthreads("split_splice_reads_BAM_create_BW") -# envmodules: TOOLS["samtools"]["version"],TOOLS["bedtools"]["version"],TOOLS["ucsc"]["version"],TOOLS["sambamba"]["version"] -# shell:""" -# if [ -d /lscratch/${{SLURM_JOB_ID}} ];then -# TMPDIR="/lscratch/${{SLURM_JOB_ID}}" -# else -# TMPDIR="/dev/shm/{params.randomstr}" -# mkdir -p $TMPDIR -# fi -# cd {params.outdir} -# bam_basename="$(basename {input.bam})" -# while read a b;do -# bam="${{bam_basename%.*}}.${{a}}.bam" -# samtools view {input.bam} $b -b > ${{TMPDIR}}/${{bam%.*}}.tmp.bam -# sambamba sort --memory-limit={params.memG} --tmpdir=${{TMPDIR}} --nthreads={threads} --out=$bam ${{TMPDIR}}/${{bam%.*}}.tmp.bam -# bw="${{bam%.*}}.bw" -# bdg="${{bam%.*}}.bdg" -# sizes="${{bam%.*}}.sizes" -# bedtools genomecov -bga -split -ibam $bam > $bdg -# bedSort $bdg $bdg -# if [ "$(wc -l $bdg|awk '{{print $1}}')" != "0" ];then -# samtools view -H $bam|grep ^@SQ|cut -f2,3|sed "s/SN://g"|sed "s/LN://g" > $sizes -# bedGraphToBigWig $bdg $sizes $bw -# else -# touch $bw -# fi -# rm -f $bdg $sizes -# done < {params.regions} -# cd $TMPDIR && rm -f * -# """ - -# rule split_linear_reads_BAM_create_BW: -# # This rule is identical to split_splice_reads_BAM_create_BW, "spliced_reads" is replaced by "linear_reads" -# input: -# bam=rules.create_linear_reads_bam.output.bam -# output: -# bam=join(WORKDIR,"results","{sample}","STAR2p","{sample}.linear_reads."+HOST+".bam"), -# bw=join(WORKDIR,"results","{sample}","STAR2p","{sample}.linear_reads."+HOST+".bw") -# params: -# sample="{sample}", -# workdir=WORKDIR, -# memG=getmemG("split_linear_reads_BAM_create_BW"), -# outdir=join(WORKDIR,"results","{sample}","STAR2p"), -# regions=REF_REGIONS, -# randomstr=str(uuid.uuid4()) -# threads: getthreads("split_linear_reads_BAM_create_BW") -# envmodules: TOOLS["samtools"]["version"],TOOLS["bedtools"]["version"],TOOLS["ucsc"]["version"],TOOLS["sambamba"]["version"] -# shell:""" -# if [ -d /lscratch/${{SLURM_JOB_ID}} ];then -# TMPDIR="/lscratch/${{SLURM_JOB_ID}}" -# else -# TMPDIR="/dev/shm/{params.randomstr}" -# mkdir -p $TMPDIR -# fi -# cd {params.outdir} -# bam_basename="$(basename {input.bam})" -# while read a b;do -# bam="${{bam_basename%.*}}.${{a}}.bam" -# samtools view {input.bam} $b -b > ${{TMPDIR}}/${{bam%.*}}.tmp.bam -# sambamba sort --memory-limit={params.memG} --tmpdir=${{TMPDIR}} --nthreads={threads} --out=$bam ${{TMPDIR}}/${{bam%.*}}.tmp.bam -# bw="${{bam%.*}}.bw" -# bdg="${{bam%.*}}.bdg" -# sizes="${{bam%.*}}.sizes" -# bedtools genomecov -bga -split -ibam $bam > $bdg -# bedSort $bdg $bdg -# if [ "$(wc -l $bdg|awk '{{print $1}}')" != "0" ];then -# samtools view -H $bam|grep ^@SQ|cut -f2,3|sed "s/SN://g"|sed "s/LN://g" > $sizes -# bedGraphToBigWig $bdg $sizes $bw -# else -# touch $bw -# fi -# rm -f $bdg $sizes -# done < {params.regions} -# cd $TMPDIR && rm -f * -# """ - localrules: merge_genecounts, @@ -111,8 +22,7 @@ rule merge_genecounts: params: outdir=join(WORKDIR, "results"), rscript=join(SCRIPTS_DIR, "merge_ReadsPerGene_counts.R"), - envmodules: - TOOLS["R"]["version"], + container: config['containers']["R"] shell: """ set -exo pipefail diff --git a/workflow/rules/post_findcircrna_processing.smk b/workflow/rules/post_findcircrna_processing.smk index 038e3fd..192cc23 100644 --- a/workflow/rules/post_findcircrna_processing.smk +++ b/workflow/rules/post_findcircrna_processing.smk @@ -61,22 +61,13 @@ rule create_circExplorer_BSJ_bam: scriptse=join(SCRIPTS_DIR, "_create_circExplorer_BSJ_bam_se.py"), flankscript=join(SCRIPTS_DIR, "_append_splice_site_flanks_to_BSJs.py"), bam2bwscript=join(SCRIPTS_DIR, "bam_to_bigwig.sh"), - randomstr=str(uuid.uuid4()), - envmodules: - TOOLS["python37"]["version"], - TOOLS["samtools"]["version"], - TOOLS["bedtools"]["version"], - TOOLS["ucsc"]["version"], + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", + container: config['containers']["star_ucsc_cufflinks"] threads: getthreads("create_circExplorer_BSJ_bam") shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi +mkdir -p {params.tmpdir} outdir=$(dirname {output.BSJbam}) BSJbedbn=$(basename {output.BSJbed}) @@ -86,61 +77,59 @@ if [ "{params.peorse}" == "PE" ];then python3 {params.scriptpe} \\ --inbam {input.chimericbam} \\ --sample_counts_table {input.countstable} \\ - --plusbam ${{TMPDIR}}/{params.sample}.BSJ.plus.bam \\ - --minusbam ${{TMPDIR}}/{params.sample}.BSJ.minus.bam \\ - --outbam ${{TMPDIR}}/{params.sample}.BSJ.bam \\ - --bed ${{TMPDIR}}/${{BSJbedbn}} \\ + --plusbam {params.tmpdir}/{params.sample}.BSJ.plus.bam \\ + --minusbam {params.tmpdir}/{params.sample}.BSJ.minus.bam \\ + --outbam {params.tmpdir}/{params.sample}.BSJ.bam \\ + --bed {params.tmpdir}/${{BSJbedbn}} \\ --sample_name {params.sample} \\ --junctionsfound {output.BSJfoundcounts} \\ --regions {params.refregions} \\ --host "{params.host}" \\ --additives "{params.additives}" \\ --viruses "{params.viruses}" \\ - --outputhostbams --outputvirusbams --outdir $TMPDIR + --outputhostbams --outputvirusbams --outdir {params.tmpdir} else python3 {params.scriptse} \\ --inbam {input.chimericbam} \\ --sample_counts_table {input.countstable} \\ - --plusbam ${{TMPDIR}}/{params.sample}.BSJ.plus.bam \\ - --minusbam ${{TMPDIR}}/{params.sample}.BSJ.minus.bam \\ - --outbam ${{TMPDIR}}/{params.sample}.BSJ.bam \\ - --bed ${{TMPDIR}}/${{BSJbedbn}} \\ + --plusbam {params.tmpdir}/{params.sample}.BSJ.plus.bam \\ + --minusbam {params.tmpdir}/{params.sample}.BSJ.minus.bam \\ + --outbam {params.tmpdir}/{params.sample}.BSJ.bam \\ + --bed {params.tmpdir}/${{BSJbedbn}} \\ --sample_name {params.sample} \\ --junctionsfound {output.BSJfoundcounts} \\ --regions {params.refregions} \\ --host "{params.host}" \\ --additives "{params.additives}" \\ --viruses "{params.viruses}" \\ - --outputhostbams --outputvirusbams --outdir $TMPDIR + --outputhostbams --outputvirusbams --outdir {params.tmpdir} fi -samtools sort -l 9 -T $TMPDIR --write-index -@{threads} -O BAM -o {output.plusBSJbam} ${{TMPDIR}}/{params.sample}.BSJ.plus.bam -samtools sort -l 9 -T $TMPDIR --write-index -@{threads} -O BAM -o {output.minusBSJbam} ${{TMPDIR}}/{params.sample}.BSJ.minus.bam -samtools sort -l 9 -T $TMPDIR --write-index -@{threads} -O BAM -o {output.BSJbam} ${{TMPDIR}}/{params.sample}.BSJ.bam +samtools sort -l 9 -T {params.tmpdir} --write-index -@{threads} -O BAM -o {output.plusBSJbam} {params.tmpdir}/{params.sample}.BSJ.plus.bam +samtools sort -l 9 -T {params.tmpdir} --write-index -@{threads} -O BAM -o {output.minusBSJbam} {params.tmpdir}/{params.sample}.BSJ.minus.bam +samtools sort -l 9 -T {params.tmpdir} --write-index -@{threads} -O BAM -o {output.BSJbam} {params.tmpdir}/{params.sample}.BSJ.bam for b in {output.plusBSJbam} {output.minusBSJbam} {output.BSJbam} # for b in {output.plusBSJbam} {output.minusBSJbam} do - bash {params.bam2bwscript} $b $TMPDIR + bash {params.bam2bwscript} $b {params.tmpdir} done for i in $(echo {params.host}|tr ',' ' ');do - samtools sort -l 9 -T $TMPDIR --write-index -@{threads} -O BAM -o ${{outdir}}/{params.sample}.${{i}}.BSJ.bam ${{TMPDIR}}/{params.sample}.${{i}}.BSJ.bam - bash {params.bam2bwscript} ${{outdir}}/{params.sample}.${{i}}.BSJ.bam $TMPDIR + samtools sort -l 9 -T {params.tmpdir} --write-index -@{threads} -O BAM -o ${{outdir}}/{params.sample}.${{i}}.BSJ.bam {params.tmpdir}/{params.sample}.${{i}}.BSJ.bam + bash {params.bam2bwscript} ${{outdir}}/{params.sample}.${{i}}.BSJ.bam {params.tmpdir} done for i in $(echo {params.viruses}|tr ',' ' ');do - samtools sort -l 9 -T $TMPDIR --write-index -@{threads} -O BAM -o ${{outdir}}/{params.sample}.${{i}}.BSJ.bam ${{TMPDIR}}/{params.sample}.${{i}}.BSJ.bam - bash {params.bam2bwscript} ${{outdir}}/{params.sample}.${{i}}.BSJ.bam $TMPDIR + samtools sort -l 9 -T {params.tmpdir} --write-index -@{threads} -O BAM -o ${{outdir}}/{params.sample}.${{i}}.BSJ.bam {params.tmpdir}/{params.sample}.${{i}}.BSJ.bam + bash {params.bam2bwscript} ${{outdir}}/{params.sample}.${{i}}.BSJ.bam {params.tmpdir} done -python3 {params.flankscript} --reffa {params.reffa} --inbsjbedgz ${{TMPDIR}}/${{BSJbedbn}} --outbsjbedgz {output.BSJbed} - +python3 {params.flankscript} --reffa {params.reffa} --inbsjbedgz {params.tmpdir}/${{BSJbedbn}} --outbsjbedgz {output.BSJbed} - -rm -rf $TMPDIR +rm -rf {params.tmpdir} """ @@ -213,28 +202,16 @@ rule create_circExplorer_linear_spliced_bams: additives=ADDITIVES, viruses=VIRUSES, peorse=get_peorse, - bashscript=join(SCRIPTS_DIR, "_create_circExplorer_linear_bam.sh"), - # pythonscript=join(SCRIPTS_DIR,"_extract_circExplorer_linear_reads.py"), - # bam2bwscript=join(SCRIPTS_DIR,"bam_to_bigwig.sh"), + bashscript=join(SCRIPTS_DIR, "_create_circExplorer_linear_bam.v2.sh"), outdir=join(WORKDIR, "results", "{sample}", "circExplorer"), - randomstr=str(uuid.uuid4()), - envmodules: - TOOLS["python37"]["version"], - TOOLS["samtools"]["version"], - TOOLS["bedtools"]["version"], - TOOLS["ucsc"]["version"], - TOOLS["parallel"]["version"], + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", + container: config['containers']["star_ucsc_cufflinks"] threads: getthreads("create_circExplorer_linear_spliced_bams") shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ -d $TMPDIR ];then rm -rf $TMPDIR;fi -mkdir -p $TMPDIR +if [ -d {params.tmpdir} ];then rm -rf {params.tmpdir};fi +mkdir -p {params.tmpdir} cd {params.outdir} @@ -245,7 +222,7 @@ bash {params.bashscript} \\ --samplename {params.sample} \\ --peorse {params.peorse} \\ --bsjbed {input.bsjbedgz} \\ - --tmpdir $TMPDIR \\ + --tmpdir {params.tmpdir} \\ --rid2jid {output.rid2jid} \\ --filteredbam {output.filtered_bam} \\ --linearbsjlist {output.linear_readids} \\ @@ -258,8 +235,9 @@ bash {params.bashscript} \\ --additives "{params.additives}" \\ --viruses "{params.viruses}" \\ --linearbam {output.linear_bam} \\ - --splicedbam {output.spliced_bam} -rm -rf $TMPDIR + --splicedbam {output.spliced_bam} \\ + --threads {threads} +rm -rf {params.tmpdir} """ @@ -302,16 +280,11 @@ rule create_circExplorer_merged_found_counts_table: SCRIPTS_DIR, "create_circExplorer_per_sample_counts_table.py" ), outdir=join(WORKDIR, "results", "{sample}", "circExplorer"), - randomstr=str(uuid.uuid4()), + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}" -else - TMPDIR="/dev/shm/{params.randomstr}" - mkdir -p $TMPDIR -fi +mkdir -p {params.tmpdir} python3 {params.pythonscript} \\ -b {input.bsj_found_counts} \\ -l {input.linear_spliced_counts} \\ @@ -324,32 +297,6 @@ python3 {params.pythonscript2} \\ """ -# localrules: create_circExplorer_per_sample_counts_table -# rule create_circExplorer_per_sample_counts_table: -# input: -# annotation_counts=rules.circExplorer.output.annotation_counts_table, -# found_counts=rules.create_circExplorer_merged_found_counts_table.output.found_counts_table -# output: -# count_counts_table=join(WORKDIR,"results","{sample}","circExplorer","{sample}.circExplorer.counts_table.tsv") -# params: -# sample="{sample}", -# pythonscript=join(SCRIPTS_DIR,"create_circExplorer_per_sample_counts_table.py"), -# outdir=join(WORKDIR,"results","{sample}","circExplorer"), -# randomstr=str(uuid.uuid4()) -# shell:""" -# set -exo pipefail -# if [ -d /lscratch/${{SLURM_JOB_ID}} ];then -# TMPDIR="/lscratch/${{SLURM_JOB_ID}}" -# else -# TMPDIR="/dev/shm/{params.randomstr}" -# mkdir -p $TMPDIR -# fi -# python3 {params.pythonscript} \\ -# --annotationcounts {input.annotation_counts} \\ -# --allfoundcounts {input.found_counts} \\ -# --countstable {output.count_counts_table} -# """ - if RUN_MAPSPLICE: rule alignment_stats: @@ -369,34 +316,27 @@ if RUN_MAPSPLICE: peorse=get_peorse, run_mapsplice=N_RUN_MAPSPLICE, bash2nreads_pyscript=join(SCRIPTS_DIR, "_bam_get_alignment_stats.py"), - randomstr=str(uuid.uuid4()), + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", threads: getthreads("alignment_stats") - envmodules: - TOOLS["python37"]["version"], - TOOLS["parallel"]["version"], + container: config['containers']["base"] shell: """ set -exo pipefail - if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}" - else - TMPDIR="/dev/shm/{params.randomstr}" - mkdir -p $TMPDIR - fi + mkdir -p {params.tmpdir} for bamfile in {input};do bamfile_bn=$(basename $bamfile) if [ "{params.peorse}" == "PE" ];then - echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} --pe > ${{TMPDIR}}/${{bamfile_bn}}.counts" + echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} --pe > {params.tmpdir}/${{bamfile_bn}}.counts" else - echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} > ${{TMPDIR}}/${{bamfile_bn}}.counts" + echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} > {params.tmpdir}/${{bamfile_bn}}.counts" fi - done > ${{TMPDIR}}/do_bamstats - parallel -j 2 < ${{TMPDIR}}/do_bamstats + done > {params.tmpdir}/do_bamstats + parallel -j 2 < {params.tmpdir}/do_bamstats print_bam_results () {{ bamfile=$1 bamfile_bn=$(basename $bamfile) - stats_file=${{TMPDIR}}/${{bamfile_bn}}.counts + stats_file={params.tmpdir}/${{bamfile_bn}}.counts prefix=$2 while read b a;do echo -ne "${{prefix}}_${{a}}\\t${{b}}\\n";done < $stats_file }} @@ -426,34 +366,27 @@ else: peorse=get_peorse, run_mapsplice=N_RUN_MAPSPLICE, bash2nreads_pyscript=join(SCRIPTS_DIR, "_bam_get_alignment_stats.py"), - randomstr=str(uuid.uuid4()), + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", threads: getthreads("alignment_stats") - envmodules: - TOOLS["python37"]["version"], - TOOLS["parallel"]["version"], + container: config['containers']["base"] shell: """ set -exo pipefail - if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}" - else - TMPDIR="/dev/shm/{params.randomstr}" - mkdir -p $TMPDIR - fi + mkdir -p {params.tmpdir} for bamfile in {input};do bamfile_bn=$(basename $bamfile) if [ "{params.peorse}" == "PE" ];then - echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} --pe > ${{TMPDIR}}/${{bamfile_bn}}.counts" + echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} --pe > {params.tmpdir}/${{bamfile_bn}}.counts" else - echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} > ${{TMPDIR}}/${{bamfile_bn}}.counts" + echo "python3 {params.bash2nreads_pyscript} --inbam $bamfile --regions {params.regions} > {params.tmpdir}/${{bamfile_bn}}.counts" fi - done > ${{TMPDIR}}/do_bamstats - parallel -j 2 < ${{TMPDIR}}/do_bamstats + done > {params.tmpdir}/do_bamstats + parallel -j 2 < {params.tmpdir}/do_bamstats print_bam_results () {{ bamfile=$1 bamfile_bn=$(basename $bamfile) - stats_file=${{TMPDIR}}/${{bamfile_bn}}.counts + stats_file={params.tmpdir}/${{bamfile_bn}}.counts prefix=$2 while read b a;do echo -ne "${{prefix}}_${{a}}\\t${{b}}\\n";done < $stats_file }} @@ -483,25 +416,21 @@ rule merge_alignment_stats: output: join(WORKDIR, "results", "alignmentstats.txt"), params: - randomstr=str(uuid.uuid4()), + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", shell: """ set -exo pipefail -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}" -else - TMPDIR="/dev/shm/{params.randomstr}" - mkdir -p $TMPDIR -fi +mkdir -p {params.tmpdir} + count=0 for f in {input};do count=$((count+1)) if [ "$count" == "1" ];then cp $f {output} else - cut -f2 $f > ${{TMPDIR}}/${{count}} - paste {output} ${{TMPDIR}}/${{count}} > ${{TMPDIR}}/${{count}}.tmp - mv ${{TMPDIR}}/${{count}}.tmp {output} + cut -f2 $f > {params.tmpdir}/${{count}} + paste {output} {params.tmpdir}/${{count}} > {params.tmpdir}/${{count}}.tmp + mv {params.tmpdir}/${{count}}.tmp {output} fi done """ @@ -520,9 +449,7 @@ rule create_hq_bams: host=HOST, additives=ADDITIVES, viruses=VIRUSES, - envmodules: - TOOLS["python37"]["version"], - TOOLS["samtools"]["version"], + container: config['containers']["base"] shell:""" set -exo pipefail outdir=$(dirname {output.outbam}) @@ -544,224 +471,3 @@ for bam in $(ls {params.samplename}.*.HQ_only.BSJ.bam);do fi done """ - - - -# localrules: venn -# rule venn: -# input: -# circexplorerout=rules.circExplorer.output.annotations, -# ciriout=rules.ciri.output.cirioutfiltered -# output: -# png=join(WORKDIR,"results","{sample}","{sample}.venn_mqc.png"), -# cirionly=join(WORKDIR,"results","{sample}","{sample}.cirionly.lst"), -# circexploreronly=join(WORKDIR,"results","{sample}","{sample}.circexploreronly.lst"), -# common=join(WORKDIR,"results","{sample}","{sample}.common.lst") -# params: -# script1=join(SCRIPTS_DIR,"venn.R"), -# sample="{sample}", -# outdir=join(WORKDIR,"results","{sample}") -# container: "docker://nciccbr/ccbr_venn:latest" -# threads: getthreads("venn") -# shell:""" -# set -exo pipefail -# cut -f1 {input.ciriout}|grep -v circRNA_ID > /dev/shm/{params.sample}.ciri.lst -# cut -f1-3 {input.circexplorerout}|awk -F"\\t" '{{print $1\":\"$2+1\"|\"$3}}' > /dev/shm/{params.sample}.circExplorer.lst -# if [[ "$(cat /dev/shm/{params.sample}.ciri.lst | wc -l)" != "0" ]];then -# if [[ "$(cat /dev/shm/{params.sample}.circExplorer.lst | wc -l)" != "0" ]];then -# 2set_venn.R \\ -# -l /dev/shm/{params.sample}.ciri.lst \\ -# -r /dev/shm/{params.sample}.circExplorer.lst \\ -# -p {output.png} \\ -# -m {output.cirionly} \\ -# -s {output.circexploreronly} \\ -# -c1 "CIRI2" \\ -# -c2 "CircExplorer2" \\ -# -c {output.common} \\ -# -t {params.sample} -# else -# for o in {output} -# do -# touch $o -# done -# fi -# fi -# rm -f /dev/shm{params.sample}* -# """ - - -# rule add_strand_to_circExplorer: -# input: -# backsplicedjunctions=rules.circExplorer.output.backsplicedjunctions, -# bed=rules.split_BAM_create_BW.output.bed -# output: -# backsplicedjunctions=join(WORKDIR,"results","{sample}","circExplorer","{sample}.back_spliced_junction.bed"), -# params: -# sample="{sample}", -# workdir=WORKDIR, -# pythonscript=join(SCRIPTS_DIR,"copy_strand_info.py"), -# threads: 2 -# # envmodules: -# shell:""" -# cp {input.backsplicedjunctions} /dev/shm -# bsj_basename="$(basename {input.backsplicedjunctions})" -# python {params.pythonscript} --from {input.bed} --to /dev/shm/${{bsj_basename}} --output {output.backsplicedjunctions} -# """ - -# rule recall_valid_BSJ_split_BAM_by_strand_create_BW: -# input: -# bam=rules.create_circExplorer_BSJ_bam.output.BSJbam, -# output: -# bam=join(WORKDIR,"results","{sample}","STAR2p","BSJs","{sample}.BSJ."+HOST+".bam"), -# plusbam=join(WORKDIR,"results","{sample}","STAR2p","BSJs","{sample}.BSJ."+HOST+".plus.bam"), -# minusbam=join(WORKDIR,"results","{sample}","STAR2p","BSJs","{sample}.BSJ."+HOST+".minus.bam"), -# bed=join(WORKDIR,"results","{sample}","STAR2p","BSJs","{sample}.BSJ."+HOST+".bed"), -# bw=join(WORKDIR,"results","{sample}","STAR2p","BSJs","{sample}.BSJ."+HOST+".bw"), -# plusbw=join(WORKDIR,"results","{sample}","STAR2p","BSJs","{sample}.BSJ."+HOST+".plus.bw"), -# minusbw=join(WORKDIR,"results","{sample}","STAR2p","BSJs","{sample}.BSJ."+HOST+".minus.bw"), -# validBSJbed=join(WORKDIR,"results","{sample}","customBSJs","{sample}.valid_STAR_BSJ.bed") -# params: -# sample="{sample}", -# workdir=WORKDIR, -# memG=getmemG("recall_valid_BSJ_split_BAM_by_strand_create_BW"), -# outdir=join(WORKDIR,"results","{sample}","STAR2p","BSJs"), -# pythonscript=join(SCRIPTS_DIR,"validate_BSJ_reads_and_split_BSJ_bam_by_strand.py"), -# regions=REF_REGIONS, -# randomstr=str(uuid.uuid4()) -# threads: getthreads("recall_valid_BSJ_split_BAM_by_strand_create_BW") -# envmodules: TOOLS["samtools"]["version"],TOOLS["bedtools"]["version"],TOOLS["ucsc"]["version"],TOOLS["sambamba"]["version"],TOOLS["python37"]["version"] -# shell:""" -# set -exo pipefail -# if [ -d /lscratch/${{SLURM_JOB_ID}} ];then -# TMPDIR="/lscratch/${{SLURM_JOB_ID}}" -# else -# TMPDIR="/dev/shm/{params.randomstr}" -# mkdir -p $TMPDIR -# fi -# cd {params.outdir} -# bam_basename="$(basename {input.bam})" -# while read a b;do -# bam="${{bam_basename%.*}}.${{a}}.bam" -# plusbam="${{bam_basename%.*}}.${{a}}.plus.bam" -# minusbam="${{bam_basename%.*}}.${{a}}.minus.bam" -# bed="${{bam_basename%.*}}.${{a}}.bed" -# samtools view {input.bam} $b -b > ${{TMPDIR}}/${{bam%.*}}.tmp.bam -# sambamba sort --memory-limit={params.memG} --tmpdir=${{TMPDIR}} --nthreads={threads} --out=$bam ${{TMPDIR}}/${{bam%.*}}.tmp.bam -# bw="${{bam%.*}}.bw" -# bdg="${{bam%.*}}.bdg" -# plusbw="${{bam%.*}}.plus.bw" -# plusbdg="${{bam%.*}}.plus.bdg" -# minusbw="${{bam%.*}}.minus.bw" -# minusbdg="${{bam%.*}}.minus.bdg" -# sizes="${{bam%.*}}.sizes" -# # create strand specific bams -# python {params.pythonscript} \ -# -i $bam \ -# -p $plusbam \ -# -m $minusbam \ -# -b $bed -# bedSort $bed $bed -# # create bedgraphs -# bedtools genomecov -bga -split -ibam $bam > $bdg -# bedSort $bdg $bdg -# bedtools genomecov -bga -split -ibam $plusbam > $plusbdg -# bedSort $plusbdg $plusbdg -# bedtools genomecov -bga -split -ibam $minusbam > $minusbdg -# bedSort $minusbdg $minusbdg -# # create bigwigs -# if [ "$(wc -l $bdg|awk '{{print $1}}')" != "0" ];then -# samtools view -H $bam|grep ^@SQ|cut -f2,3|sed "s/SN://g"|sed "s/LN://g" > $sizes -# bedGraphToBigWig $bdg $sizes $bw -# if [ "$(wc -l $plusbdg|awk '{{print $1}}')" != "0" ];then -# bedGraphToBigWig $plusbdg $sizes $plusbw -# else -# touch $plusbw -# fi -# if [ "$(wc -l $minusbdg|awk '{{print $1}}')" != "0" ];then -# bedGraphToBigWig $minusbdg $sizes $minusbw -# else -# touch $minusbw -# fi -# else -# touch $bw -# fi -# rm -f $bdg $sizes - -# done < {params.regions} -# cat {params.sample}.BSJ.*.bed |cut -f1-6 > ${{TMPDIR}}/{sample}.tmp.bed -# bedSort ${{TMPDIR}}/{sample}.tmp.bed ${{TMPDIR}}/{sample}.tmp.bed -# awk -F"\\t" -v OFS="\\t" '{{$4="S"NR;print}}' ${{TMPDIR}}/{sample}.tmp.bed > {output.validBSJbed} -# cd $TMPDIR && rm -f * -# """ - -# rule add_novel_ciri_BSJs_to_customBSJ: -# input: -# ciriout=rules.ciri.output.ciriout, -# hg38bed=rules.recall_valid_BSJ_split_BAM_by_strand_create_BW.output.bed, -# validBSJbed=rules.recall_valid_BSJ_split_BAM_by_strand_create_BW.output.validBSJbed, -# output: -# novelBSJ=join(WORKDIR,"results","{sample}","customBSJs","{sample}.novel_CIRI_BSJ.bed"), -# cirireadids=temp(join(WORKDIR,"results","{sample}","customBSJs","{sample}.ciri.readids")), -# combinedBSJ=join(WORKDIR,"results","{sample}","customBSJs","{sample}.BSJ.bed") -# params: -# sample="{sample}", -# outdir=join(WORKDIR,"results","{sample}","customBSJs"), -# list_compare_script=join(SCRIPTS_DIR,"_compare_lists.py"), -# randomstr=str(uuid.uuid4()) -# envmodules: TOOLS["ucsc"]["version"],TOOLS["python37"]["version"] -# shell:""" -# set -exo pipefail -# if [ -d /lscratch/${{SLURM_JOB_ID}} ];then -# TMPDIR="/lscratch/${{SLURM_JOB_ID}}" -# else -# TMPDIR="/dev/shm/{params.randomstr}" -# mkdir -p $TMPDIR -# fi -# cd {params.outdir} -# grep -v junction_reads_ID {input.ciriout}|awk -F"\\t" '{{print substr($NF,1,length($NF)-1)}}'|tr ',' '\\n'|sort|uniq > {params.sample}.ciri.readids -# stardir=$(dirname {input.hg38bed}) -# cat ${{stardir}}/{params.sample}.BSJ.*.bed|awk -F"\\t" '{{print $NF}}'|tr ',' '\\n'|awk -F"##" '{{print $1}}'|sort|uniq > {params.sample}.circExplorer.readids -# python {params.list_compare_script} {params.sample}.ciri.readids {params.sample}.circExplorer.readids 1 -# mv a_only.lst {params.sample}.ciri.novel.readids && rm -f a_* b_* -# head -n1 {input.ciriout} > {params.sample}.ciri.novel.out -# while read a;do grep $a {input.ciriout};done < {params.sample}.ciri.novel.readids |sort|uniq >> {params.sample}.ciri.novel.out -# grep -v circRNA_ID {params.sample}.ciri.novel.out | awk -F"\\t" -v OFS="\\t" '{{print $2,$3-1,$4,$2":"$3-1"-"$4,$5,$11}}' > ${{TMPDIR}}/{params.sample}.novel_CIRI_BSJ.tmp.bed -# awk -F"\\t" '{{print $4}}' ${{TMPDIR}}/{params.sample}.novel_CIRI_BSJ.tmp.bed > ${{TMPDIR}}/{params.sample}.novel_CIRI_BSJ.tmp.lst -# awk -F"\\t" '{{print $1":"$2"-"$3}}' {input.validBSJbed} > ${{TMPDIR}}/{params.sample}.circExplorer.BSJ.lst -# python {params.list_compare_script} ${{TMPDIR}}/{params.sample}.novel_CIRI_BSJ.tmp.lst ${{TMPDIR}}/{params.sample}.circExplorer.BSJ.lst 1 -# mv a_only.lst {params.sample}.ciri.novel.BSJ.lst && rm -f a_* b_* -# while read a;do grep $a ${{TMPDIR}}/{params.sample}.novel_CIRI_BSJ.tmp.bed;done < {params.sample}.ciri.novel.BSJ.lst > ${{TMPDIR}}/{params.sample}.ciri.novel.BSJ.bed -# bedSort ${{TMPDIR}}/{params.sample}.ciri.novel.BSJ.bed ${{TMPDIR}}/{params.sample}.ciri.novel.BSJ.bed -# awk -F"\\t" -v OFS="\\t" '{{$4="C"NR;print}}' ${{TMPDIR}}/{params.sample}.ciri.novel.BSJ.bed > {output.novelBSJ} -# cat {input.validBSJbed} {output.novelBSJ} > {output.combinedBSJ} -# bedSort {output.combinedBSJ} {output.combinedBSJ} -# rm -f {params.sample}.circExplorer.readids {params.sample}.ciri.novel.* -# cd $TMPDIR && rm -f * -# """ -# rule filter_ciri_bam_for_BSJs: -# input: -# bam=rules.ciri.output.ciribam, -# ciriout=rules.ciri.output.cirioutfiltered, -# readids=rules.add_novel_ciri_BSJs_to_customBSJ.output.cirireadids -# output: -# bam=join(WORKDIR,"results","{sample}","ciri","{sample}.bwa.BSJ.bam") -# params: -# sample="{sample}", -# memG=getmemG("filter_ciri_bam_for_BSJs"), -# script=join(SCRIPTS_DIR,"filter_bam_by_readids.py"), -# randomstr=str(uuid.uuid4()) -# threads: getthreads("filter_ciri_bam_for_BSJs") -# envmodules: TOOLS["python37"]["version"],TOOLS["sambamba"]["version"] -# shell:""" -# set -exo pipefail -# if [ -d /lscratch/${{SLURM_JOB_ID}} ];then -# TMPDIR="/lscratch/${{SLURM_JOB_ID}}" -# else -# TMPDIR="/dev/shm/{params.randomstr}" -# mkdir -p $TMPDIR -# fi -# sambamba sort --memory-limit={params.memG} --tmpdir=${{TMPDIR}} --nthreads={threads} --out=${{TMPDIR}}/{params.sample}.ciri.sorted.bam {input.bam} -# python {params.script} --inputBAM ${{TMPDIR}}/{params.sample}.ciri.sorted.bam --outputBAM ${{TMPDIR}}/{params.sample}.tmp.bam --readids {input.readids} -# sambamba sort --memory-limit={params.memG} --tmpdir=${{TMPDIR}} --nthreads={threads} --out={output.bam} ${{TMPDIR}}/{params.sample}.tmp.bam -# cd $TMPDIR && rm -f * -# """ diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk index daf2f99..1abeaed 100644 --- a/workflow/rules/preprocessing.smk +++ b/workflow/rules/preprocessing.smk @@ -30,75 +30,57 @@ rule cutadapt: cutadapt_O=config["cutadapt_O"], cutadapt_q=config["cutadapt_q"], adapters=join(RESOURCES_DIR, "TruSeq_and_nextera_adapters.consolidated.fa"), - randomstr=str(uuid.uuid4()), - envmodules: - TOOLS["cutadapt"]["version"], + tmpdir=f"{TEMPDIR}/{str(uuid.uuid4())}", + container: config['containers']['cutadapt'] threads: getthreads("cutadapt") shell: """ -set -exo pipefail - -# set TMPDIR -if [ -d /lscratch/${{SLURM_JOB_ID}} ];then - TMPDIR="/lscratch/${{SLURM_JOB_ID}}/{params.randomstr}" -else - TMPDIR="/dev/shm/{params.randomstr}" -fi -if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi - -# set conda environment -. "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" -conda activate fastqfilter - -if [ ! -d {params.outdir} ];then mkdir {params.outdir};fi - -of1bn=$(basename {output.of1}) -of2bn=$(basename {output.of2}) - -if [ "{params.peorse}" == "PE" ];then - ## Paired-end - cutadapt --pair-filter=any \\ - --nextseq-trim=2 \\ - --trim-n \\ - --max-n {params.cutadapt_max_n} \\ - -n {params.cutadapt_n} -O {params.cutadapt_O} \\ - -q {params.cutadapt_q},{params.cutadapt_q} -m {params.cutadapt_min_length}:{params.cutadapt_min_length} \\ - -b file:{params.adapters} \\ - -B file:{params.adapters} \\ - -j {threads} \\ - -o ${{TMPDIR}}/${{of1bn}} -p ${{TMPDIR}}/${{of2bn}} \\ - {input.R1} {input.R2} - -# filter for average read quality - fastq-filter \\ - -q {params.cutadapt_q} \\ - -o {output.of1} -o {output.of2} \\ - ${{TMPDIR}}/${{of1bn}} ${{TMPDIR}}/${{of2bn}} - -else - ## Single-end - cutadapt \\ - --nextseq-trim=2 \\ - --trim-n \\ - --max-n {params.cutadapt_max_n} \\ - -n {params.cutadapt_n} -O {params.cutadapt_O} \\ - -q {params.cutadapt_q},{params.cutadapt_q} -m {params.cutadapt_min_length} \\ - -b file:{params.adapters} \\ - -j {threads} \\ - -o ${{TMPDIR}}/${{of1bn}} \\ - {input.R1} - - touch {output.of2} - -# filter for average read quality - fastq-filter \\ - -q {params.cutadapt_q} \\ - -o {output.of1} \\ - ${{TMPDIR}}/${{of1bn}} - -fi - - - - -""" + set -exo pipefail + + mkdir -p {params.tmpdir} + of1bn=$(basename {output.of1}) + of2bn=$(basename {output.of2}) + + if [ "{params.peorse}" == "PE" ];then + ## Paired-end + cutadapt --pair-filter=any \\ + --nextseq-trim=2 \\ + --trim-n \\ + --max-n {params.cutadapt_max_n} \\ + -n {params.cutadapt_n} -O {params.cutadapt_O} \\ + -q {params.cutadapt_q},{params.cutadapt_q} -m {params.cutadapt_min_length}:{params.cutadapt_min_length} \\ + -b file:{params.adapters} \\ + -B file:{params.adapters} \\ + -j {threads} \\ + -o {params.tmpdir}/${{of1bn}} -p {params.tmpdir}/${{of2bn}} \\ + {input.R1} {input.R2} + + # filter for average read quality + fastq-filter \\ + -q {params.cutadapt_q} \\ + -o {output.of1} -o {output.of2} \\ + {params.tmpdir}/${{of1bn}} {params.tmpdir}/${{of2bn}} + + else + ## Single-end + cutadapt \\ + --nextseq-trim=2 \\ + --trim-n \\ + --max-n {params.cutadapt_max_n} \\ + -n {params.cutadapt_n} -O {params.cutadapt_O} \\ + -q {params.cutadapt_q},{params.cutadapt_q} -m {params.cutadapt_min_length} \\ + -b file:{params.adapters} \\ + -j {threads} \\ + -o {params.tmpdir}/${{of1bn}} \\ + {input.R1} + + touch {output.of2} + + # filter for average read quality + fastq-filter \\ + -q {params.cutadapt_q} \\ + -o {output.of1} \\ + {params.tmpdir}/${{of1bn}} + + fi + """ diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 576812e..4682dd9 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -33,8 +33,7 @@ rule fastqc: params: outdir=join(WORKDIR, "qc", "fastqc"), threads: getthreads("fastqc") - envmodules: - TOOLS["fastqc"]["version"], + container: config['containers']['fastqc'] shell: """ set -exo pipefail @@ -85,8 +84,7 @@ rule multiqc: html=join(WORKDIR, "multiqc_report.html"), params: outdir=WORKDIR, - envmodules: - TOOLS["multiqc"]["version"], + container: config['containers']['multiqc'] shell: """ cd {params.outdir} diff --git a/workflow/scripts/_create_circExplorer_linear_bam.sh b/workflow/scripts/_create_circExplorer_linear_bam.sh deleted file mode 100644 index 1435396..0000000 --- a/workflow/scripts/_create_circExplorer_linear_bam.sh +++ /dev/null @@ -1,317 +0,0 @@ -#!/usr/bin/env bash - module load parallel - module load python/3.7 - module load bedtools - module load ucsc - module load samtools - - -set -e -x -o pipefail - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -# only for debugging -SCRIPT_DIR2="/vf/users/Ziegelbauer_lab/Pipelines/circRNA/activeDev/workflow/scripts" -# add the following line when not debugging -SCRIPT_DIR2="$SCRIPT_DIR" -SCRIPT_NAME=$( basename -- "${BASH_SOURCE[0]}" ) - -ARGPARSE_DESCRIPTION="create linear and splice BAMs (and BIGWIGs)" - source /opt2/argparse.bash || \ - source ${SCRIPT_DIR}/../../resources/argparse.bash || \ - source <(curl -s https://raw.githubusercontent.com/CCBR/Tools/master/scripts/argparse.bash) || \ - exit 1 -argparse "$@" < ${tmpdir}/${sample_name}.bed - -python ${SCRIPT_DIR2}/_process_bamtobed.py \ - --inbed ${tmpdir}/${sample_name}.bed \ - --outbed ${tmpdir}/${sample_name}.readends.bed \ - --linear ${tmpdir}/${sample_name}.linear.readids.gz \ - --spliced ${tmpdir}/${sample_name}.spliced.readids.gz \ - -bedSort ${tmpdir}/${sample_name}.readends.bed ${tmpdir}/${sample_name}.readends.bed - -################################################################################################### -# get BSJ ends BED -################################################################################################### - -printtime $SCRIPT_NAME $start0 $start "get bed BSJ ends from BSJ bed file" -start=$(date +%s.%N) - -zcat $bsjbedgz |awk -F"\t" -v OFS="\t" '{print $1, $2, $2, $1"##"$2"##"$3"##"$6, $5, $6}' - > ${tmpdir}/BSJ.ends.bed -zcat $bsjbedgz |awk -F"\t" -v OFS="\t" '{print $1, $3, $3, $1"##"$2"##"$3"##"$6, $5, $6}' - >> ${tmpdir}/BSJ.ends.bed -bedSort ${tmpdir}/BSJ.ends.bed ${tmpdir}/BSJ.ends.bed - -################################################################################################### -# finding max readlength -################################################################################################### - -printtime $SCRIPT_NAME $start0 $start "finding max readlength" -start=$(date +%s.%N) - -python3 ${SCRIPT_DIR2}/bam_get_max_readlen.py -i $filtered_bam > ${tmpdir}/${sample_name}.maxrl -maxrl=$(cat ${tmpdir}/${sample_name}.maxrl) -two_maxrl=$((maxrl*2)) -echo $two_maxrl - -#two_maxrl="302" -################################################################################################### -# find closest known BSJs with primary alignments and then keep alignments with some alignment within the "inclusion zone" -# this is run in parallel -################################################################################################### - -printtime $SCRIPT_NAME $start0 $start "creating rid2jidgzip" -start=$(date +%s.%N) - -shuf ${tmpdir}/${sample_name}.readends.bed | \ - split --lines=10000 --suffix-length=5 --numeric-suffixes=1 - ${tmpdir}/${sample_name}.readends.part. -for f in `ls ${tmpdir}/${sample_name}.readends.part.?????`;do -echo """ -bedSort $f $f && \ -bedtools closest -nonamecheck \ - -a $f \ - -b ${tmpdir}/BSJ.ends.bed -d | \ - awk -F\"\\t\" -v OFS=\"\\t\" -v limit=$two_maxrl '{if (\$NF > 0 && \$NF <= limit) {print \$4,\$10,\$NF}}' | \ - sort --buffer-size=2G --unique --parallel=2 > ${f}.rid2jid.closest.txt -""" -done > ${tmpdir}/para.tmp -grep -v "^$" ${tmpdir}/para.tmp > ${tmpdir}/para -parallel -j $threads < ${tmpdir}/para - -#cat ${tmpdir}/${sample_name}.readends.part.?????.rid2jid.closest.txt > ${rid2jidgzip%.*}.tmp -for f in $(find $tmpdir -name "${sample_name}.readends.part.*.rid2jid.closest.txt");do -#for f in $(ls ${tmpdir}/${sample_name}.readends.part.?????.rid2jid.closest.txt);do - cat $f -done > ${rid2jidgzip%.*}.tmp -sort -k1,1 -k3,3n --buffer-size=20G --parallel=$threads --unique ${rid2jidgzip%.*}.tmp | \ - awk -F"\t" -v OFS="\t" '{a[$1]++;if (a[$1]==1) {print $1,$2}}' > ${rid2jidgzip%.*} - -pigz -p4 -f ${rid2jidgzip%.*} && rm -f ${rid2jidgzip%.*}.tmp - -################################################################################################### -# filter linear and spliced readids to those near BSJs only -################################################################################################### - -printtime $SCRIPT_NAME $start0 $start "filter linear and spliced readids to those near BSJs only; creating counts table" -start=$(date +%s.%N) - -python3 ${SCRIPT_DIR2}/_filter_linear_spliced_readids_w_rid2jid.py \ - --linearin ${tmpdir}/${sample_name}.linear.readids.gz \ - --splicedin ${tmpdir}/${sample_name}.spliced.readids.gz \ - --rid2jid ${rid2jidgzip} \ - --linearout ${linearrids} \ - --splicedout ${splicedrids} \ - --jidcounts ${jidcounts} - -# rm -rf ${tmpdir}/${sample_name}.readends.part.* -# the above statement leads to /usr/bin/rm: Argument list too long -# hence, -find ${tmpdir} -maxdepth 1 -name "${sample_name}.readends.part.*" -print0 | xargs -0 rm -f - -################################################################################################### -# create BAMS from readids -################################################################################################### - -printtime $SCRIPT_NAME $start0 $start "creating linear and spliced BAMs for near BSJ reads and all reads" -start=$(date +%s.%N) - -if [ -f ${tmpdir}/para2 ];then rm -f ${tmpdir}/para2;fi - -linearbam_bn=$(basename $linearbam) -splicedbam_bn=$(basename $splicedbam) -linearbam_all_bn=$(basename $linearbam_all) -splicedbam_all_bn=$(basename $splicedbam_all) - -echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_bn} --readids $linearrids" >> ${tmpdir}/para2 -echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_bn} --readids $splicedrids" >> ${tmpdir}/para2 -echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_all_bn} --readids ${tmpdir}/${sample_name}.linear.readids.gz" >> ${tmpdir}/para2 -echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_all_bn} --readids ${tmpdir}/${sample_name}.spliced.readids.gz" >> ${tmpdir}/para2 - -parallel -j 4 < ${tmpdir}/para2 - -printtime $SCRIPT_NAME $start0 $start "sorting linear and spliced BAMs for near BSJ reads and all reads" -start=$(date +%s.%N) - -if [ -d "${tmpdir}/sorttmp" ];then rm -rf ${tmpdir}/sorttmp;fi && mkdir -p ${tmpdir}/sorttmp -samtools sort -l 9 -T ${tmpdir}/sorttmp --write-index -@${threads} -O BAM -o ${linearbam} ${tmpdir}/${linearbam_bn} - -if [ -d "${tmpdir}/sorttmp" ];then rm -rf ${tmpdir}/sorttmp;fi && mkdir -p ${tmpdir}/sorttmp -samtools sort -l 9 -T ${tmpdir}/sorttmp --write-index -@${threads} -O BAM -o ${splicedbam} ${tmpdir}/${splicedbam_bn} - -if [ -d "${tmpdir}/sorttmp" ];then rm -rf ${tmpdir}/sorttmp;fi && mkdir -p ${tmpdir}/sorttmp -samtools sort -l 9 -T ${tmpdir}/sorttmp --write-index -@${threads} -O BAM -o ${linearbam_all} ${tmpdir}/${linearbam_all_bn} - -if [ -d "${tmpdir}/sorttmp" ];then rm -rf ${tmpdir}/sorttmp;fi && mkdir -p ${tmpdir}/sorttmp -samtools sort -l 9 -T ${tmpdir}/sorttmp --write-index -@${threads} -O BAM -o ${splicedbam_all} ${tmpdir}/${splicedbam_all_bn} - -################################################################################################### -# split BAMs by regions -################################################################################################### - - -if [ -f ${tmpdir}/para3 ];then rm -f ${tmpdir}/para3;fi - -echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $linearbam --sample_name $sample_name --regions $regions --prefix linear_BSJ --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 -echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $splicedbam --sample_name $sample_name --regions $regions --prefix spliced_BSJ --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 -echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $linearbam_all --sample_name $sample_name --regions $regions --prefix linear --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 -echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $splicedbam_all --sample_name $sample_name --regions $regions --prefix spliced --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 - -parallel -j 4 < ${tmpdir}/para3 - -# fi -# two_maxrl="302" - -################################################################################################### -# convert BAMs to Bigwigs -################################################################################################### - -printtime $SCRIPT_NAME $start0 $start "creating linear and spliced BIGWIGs" -start=$(date +%s.%N) - -for folder in $(ls ${tmpdir}/b2b*);do rm -rf $folder;done - -if [ -f ${tmpdir}/para3 ];then rm -f ${tmpdir}/para4;fi - -echo "mkdir -p ${tmpdir}/b2b_1 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $linearbam ${tmpdir}/b2b_1" >> ${tmpdir}/para4 -echo "mkdir -p ${tmpdir}/b2b_2 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $splicedbam ${tmpdir}/b2b_2" >> ${tmpdir}/para4 -echo "mkdir -p ${tmpdir}/b2b_3 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $linearbam_all ${tmpdir}/b2b_3" >> ${tmpdir}/para4 -echo "mkdir -p ${tmpdir}/b2b_4 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $splicedbam_all ${tmpdir}/b2b_4" >> ${tmpdir}/para4 -count=4 -for prefix in "linear_BSJ" "spliced_BSJ" "linear" "spliced";do -for b in $(echo "$host $viruses"|tr ',' ' ');do - count=$((count+1)) - bam="${outdir}/${sample_name}.${prefix}.${b}.bam" - echo "mkdir -p ${tmpdir}/b2b_${count} && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $bam ${tmpdir}/b2b_${count}" >> ${tmpdir}/para4 -done -done - -parallel -j 12 < ${tmpdir}/para4 - -#rm -rf ${tmpdir}/* -printtime $SCRIPT_NAME $start0 $start "Done!" - diff --git a/workflow/scripts/_create_circExplorer_linear_bam.v2.sh b/workflow/scripts/_create_circExplorer_linear_bam.v2.sh index 5ee32ff..f415ad9 100644 --- a/workflow/scripts/_create_circExplorer_linear_bam.v2.sh +++ b/workflow/scripts/_create_circExplorer_linear_bam.v2.sh @@ -1,11 +1,4 @@ #!/usr/bin/env bash -# module load parallel -# module load python/3.7 -# module load bedtools -# module load ucsc -# module load samtools - - set -e -x -o pipefail SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) @@ -40,12 +33,6 @@ parser.add_argument('--threads',required=False, default=56, help='number of thre EOF threads=$THREADS -if [ "$SLURM_JOB_ID" != "" ];then -alloccpu=$(sacct -j $SLURM_JOB_ID --format "AllocCPUS"|tail -n1|awk '{print $1}') -if [ "$alloccpu" -lt "$threads" ];then - threads=$alloccpu -fi -fi start0=$(date +%s.%N) @@ -152,7 +139,7 @@ python3 ${SCRIPT_DIR}/filter_bam.py \ fi -nreads=$(samtools view /gpfs/gsfs8/users/CBLCCBR/kopardevn_tmp/issue57_testing_2/results/GI1_T/circExplorer/GI1_T.bam|wc -l) +nreads=$(samtools view ${filtered_bam} |wc -l) if [ "$nreads" == "0" ];then echo "SOMETHING WENT WRONG ...filtered BAM is empty!!" exit diff --git a/workflow/scripts/annotate_clear_quant.py b/workflow/scripts/annotate_clear_quant.py index 3539cae..39e3df9 100644 --- a/workflow/scripts/annotate_clear_quant.py +++ b/workflow/scripts/annotate_clear_quant.py @@ -2,11 +2,6 @@ # This script annotates quant_txt file from CLEAR workflow with hg38_2_hg19_lookup.txt import pandas -import re -import numpy as np -from pathlib import Path -import os -import matplotlib.pyplot as plt import sys indexcol=sys.argv[3] # hg38 or mm39 diff --git a/workflow/scripts/bam_split_by_regions.py b/workflow/scripts/bam_split_by_regions.py index 5a11dcb..9da14e2 100644 --- a/workflow/scripts/bam_split_by_regions.py +++ b/workflow/scripts/bam_split_by_regions.py @@ -1,7 +1,5 @@ import pysam -import sys import argparse -import gzip import os import time diff --git a/workflow/scripts/filter_bam.py b/workflow/scripts/filter_bam.py index 7c1b7b8..5f02829 100644 --- a/workflow/scripts/filter_bam.py +++ b/workflow/scripts/filter_bam.py @@ -1,12 +1,7 @@ import pysam -import sys import argparse -import gzip -import pprint def main(): - # debug = True - debug = False parser = argparse.ArgumentParser( description="Remove all non-proper-pair, chimeric, secondary, supplementary, unmapped alignments from input BAM file" ) diff --git a/workflow/scripts/make_star_index.sh b/workflow/scripts/make_star_index.sh index d8898e8..f81cc53 100644 --- a/workflow/scripts/make_star_index.sh +++ b/workflow/scripts/make_star_index.sh @@ -1,4 +1,3 @@ -module load STAR/2.7.0f mkdir -p ./STAR_index_no_GTF && STAR \ --runThreadN 56 \