Merge pull request #1 from moka-guys/v1.0.0

V1.0.0
moka-guys · Jan 5, 2022 · 354adbf · 354adbf
2 parents 0cd431b + 4f86282
commit 354adbf
Show file tree

Hide file tree

Showing 5 changed files with 232 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,30 @@
-# dnanexus_ED_readcount_analysis
-This app calculates the readcount for a batch of samples for CNV calling using ExomeDepth.
+# dnanexus_ED_readcount_analysis_v1.0.0
+Exome depth is run in two stages. Firstly, read counts are calculated, before CNVs are called using the read counts. Read counts are calculated over the entire genome whereas the CNV calling can be performed using a subpanel.
+
+# What does the app do?
+This app runs the read count calculation stage.
+
+Using the provided DNANexus project and the list of Pan numbers the app downloads BAMs and BAI.
+
+A Docker image containing Exome depth is downloaded from 001 - In this release [#1220d31](https://github.com/moka-guys/seglh-cnv/tree/1220d31e2eed1d4488eb461e70615a0fad8b5eb1) is used. #TODO make this a release
+
+The `readCount.R` script is then called, producing a readcount file (`PanXXXXexomedepth_readCount.RData`) which can be used as an input for the ED_cnv_calling app https://github.com/moka-guys/dnanexus_ED_cnv_calling
+
+If the panel of normals is not provided then intrabatch normalisation is performed.
+# Inputs
+* DNAnexus project name where the BAMs and indexes are saved in a folder called '/output'
+* Reference_genome (*.fa.gz or *.fa) in build 37
+* List of comma seperated pan numbers
+* Bedfile covering the capture region
+* Optional: panel of normals
+
+# Output
+readCount.RData - Read count data and selected references per sample
+readCount.csv - Model parameters and QC metrics output (can be used to build a QC classifier, see below)
+
+# Panel of Normals 
+Run following command to create panel of normals
+`docker run -v /home/dnanexus:/home/dnanexus ${DOCKERIMAGENAME} readCount.R /home/dnanexus/out/exomedepth_output/exomedepth_output/$bedfile_prefix/normals.RData $reference_genome_path $bedfile_path $bam_list`
+
+# Created by
+This app was created within the Viapath Genome Informatics section
diff --git a/dxapp.json b/dxapp.json
@@ -0,0 +1,93 @@
+{
+  "name": "ED_readcount_analysis_v1.0.0",
+  "title": "ED_readcount_analysis_v1.0.0",
+  "summary": "v1.0.0 - Step 1 of CNV calling using ExomeDepth",
+  "dxapi": "1.0.0",
+  "inputSpec": [
+    {
+      "name": "project_name",
+      "label": "project_name",
+      "help": "The project containing the bamfiles.",
+      "class": "string"
+    },
+    {
+      "name": "reference_genome",
+      "label": "reference fasta file",
+      "help": "reference_genome",
+      "class": "file",
+      "patterns": ["*.fa","*.fa.gz"],
+      "suggestions": [
+        {
+          "name": "hs37d5.fa.gz",
+          "value": 
+            {
+              "$dnanexus_link": 
+              {
+                "project": "project-ByfFPz00jy1fk6PjpZ95F27J",
+                "id": "file-B6ZY7VG2J35Vfvpkj8y0KZ01"
+              }
+            }
+        }
+      ],
+      "optional": false
+    },
+    {
+      "name": "bamfile_pannumbers",
+      "label": "bamfile_pannumbers",
+      "help": "comma separated string on pan numbers found within the BAM file name",
+      "class": "string"
+    },
+    {
+      "name": "bedfile",
+      "label": "Read count BED",
+      "help": "BED file used to count reads",
+      "class": "file",
+      "patterns": ["*.bed"],
+      "optional": false
+    },
+    {
+      "name": "normals_RData",
+      "label": "Panel of normals",
+      "help": "Rdata file for panel of normals",
+      "class": "file",
+      "patterns": ["*.RData"],
+      "optional": true
+    }
+  ],
+  "outputSpec": [
+    {
+      "name": "exomedepth_output",
+      "label": "exomedepth output",
+      "help": "PDF output from ExomeDepth.",
+      "class": "array:file"
+    }
+  ],
+  "runSpec": {
+    "interpreter": "bash",
+    "timeoutPolicy": {
+      "*": {
+        "hours": 48
+      }
+    },
+    "distribution": "Ubuntu",
+    "release": "16.04",
+    "version": "1",
+    "file": "src/code.sh"
+  },
+  "access": {
+    "network": [
+      "*"
+    ],
+    "allProjects": "VIEW"
+  },
+  "ignoreReuse": false,
+  "regionalOptions": {
+    "aws:us-east-1": {
+      "systemRequirements": {
+        "*": {
+          "instanceType": "mem1_ssd1_v2_x4"
+        }
+      }
+    }
+  }
+}
diff --git a/resources/usr/bin/mark-section b/resources/usr/bin/mark-section
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+echo '{"error": {"type": "AppError", "message": "Error while '"$@"'; please refer to the job log for more details."}}' > ~/job_error.json
diff --git a/resources/usr/bin/mark-success b/resources/usr/bin/mark-success
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+rm -f ~/job_error.json
diff --git a/src/code.sh b/src/code.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# exomedepth_cnv_analysis_v1.0.0
+
+# The following line causes bash to exit at any point if there is any error
+# and to output each line as it is executed -- useful for debugging
+set -e -x -o pipefail
+
+### Set up parameters
+# split project name to get the NGS run number
+run=${project_name##*_}
+
+#read the DNA Nexus api key as a variable
+API_KEY=$(dx cat project-FQqXfYQ0Z0gqx7XG9Z2b4K43:mokaguys_nexus_auth_key)
+
+# make output dir and folder to hold downloaded files
+mkdir -p /home/dnanexus/out/exomedepth_output/exomedepth_output/$bedfile_prefix/ /home/dnanexus/to_test
+
+mark-section "Downloading inputs"
+# download all inputs
+dx-download-all-inputs --parallel
+
+mark-section "Determining reference genome"
+if  [[ $reference_genome_name == *.tar* ]]
+	then
+		echo "reference is tarball"
+		exit 1
+elif [[ $reference_genome_name == *.gz ]]
+	then 
+		gunzip $reference_genome_path
+		reference_fasta=$(echo $reference_genome_path | sed 's/\.gz//g')
+elif [[ $reference_genome_name == *.fa ]]
+	then
+		reference_fasta=$reference_genome_path
+fi 
+
+mark-section "determine run specific variables"
+#Extract samplename to name output files
+samplename=$(python -c "basename='$bedfile_prefix'; print basename.split('_R1')[0]")
+echo $samplename
+echo "read_depth_bed="$bedfile
+echo "reference_genome="$reference_fasta
+echo "panel="$bamfile_pannumbers
+echo "bedfile_prefix="$bedfile_prefix
+echo "normals_RData="$normal_RData
+output_RData_file="/home/dnanexus/out/exomedepth_output/exomedepth_output/${bedfile_prefix}/${bedfile_prefix}_readCount.RData"
+echo "output RData file="$output_RData_file
+
+
+mark-section "Download all relevant BAMs"
+# make and cd to test dir
+cd to_test
+# $bamfile_pannumbers is a comma seperated list of pannumbers that should be analysed together.
+# split this into an array and loop through to download BAM and BAI files
+IFS=',' read -ra pannum_array <<<  $bamfile_pannumbers
+for panel in ${pannum_array[@]}
+do
+	# check there is at least one file with that pan number to download otherwise the dx download command will fail
+	if (( $(dx ls $project_name:output/*001.ba* --auth $API_KEY | grep $panel -c) > 0 ));
+	then
+		#download all the BAM and BAI files for this project/pan number
+		dx download $project_name:output/*$panel*001.ba* --auth $API_KEY
+	fi
+done
+
+# Get list of all BAMs in to_test
+# NB (include full filepath to ensure the output are absolute paths (needed for docker run))
+bam_list=(/home/dnanexus/to_test/*bam)
+
+# count the BAM files. make sure there are at least 3 samples for this pan number, else stop
+filecount="${#bam_list[@]}"
+if (( $filecount < 3 )); then
+	echo "LESS THAN THREE BAM FILES FOUND FOR THIS ANALYSIS" 1>&2
+	exit 1
+fi
+
+# cd out of to_test
+cd /home/dnanexus
+
+mark-section "setting up Exomedepth docker image"
+# Location of the ExomeDepth docker file
+docker_file=project-ByfFPz00jy1fk6PjpZ95F27J:file-G6kfZYQ0jy1vZ0BF33KZpQjJ
+# download the docker file from 001_Tools...
+dx download $docker_file --auth "${API_KEY}"
+docker load -i '/home/dnanexus/seglh_exomedepth_1220d31.tgz'
+
+
+mark-section "Calculate read depths using docker image"
+# docker run - mount the home directory as a share
+# call the readCount.R script
+# supply following arguments
+#  	- output_RData_file path
+#  	- reference_fasta_path 
+#  	- bedfile_path 
+#	- bam_list 
+#	- normals_RData_path
+# The log (PanXXXXexomedepth_readCount.csv) written to same location as the $output_RData_file
+
+
+# Run ReadCount script in docker container
+docker run -v /home/dnanexus:/home/dnanexus seglh/exomedepth:1220d31 readCount.R $output_RData_file $reference_fasta $bedfile_path ${bam_list[@]} $normals_RData_path
+
+# Upload results
+dx-upload-all-outputs
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#!/bin/bash

		echo '{"error": {"type": "AppError", "message": "Error while '"$@"'; please refer to the job log for more details."}}' > ~/job_error.json