-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbsv-R1-S7_bqsr_recal_make_input.sh
executable file
·95 lines (81 loc) · 2.67 KB
/
bsv-R1-S7_bqsr_recal_make_input.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/bin/bash
#########################################################
#
# Platform: NCI Gadi HPC
# Description: make inputs file for parallel execution of GATK BaseRecalibrator
# Details:
# Comma-delimited list for BQSR must have N entries per sample,
# where N is the maximum number of chunks that GATK recommends
# BQSR can operate over (min 100 Mb per chunk, N = genome_size / 100 Mb,
# rounded down to integer)
# Contig names cannot be used to name the output files as
# they contain special characters and this makes bash have a hissy fit
# For tumour/normal, run time is ~ half for normal so best to split
# into 2 jobs
# This script assumes all non-cancer samples are designated
# 'N' ('normal'), and are lower coverage. All other phenotype IDs are assigned
# to 'tumour'
# If no binomial grouping is desired, change group=true to group=false
# Sample info is read from <cohort>.config
#
# Author: Cali Willet
# cali.willet@sydney.edu.au
# Date last modified: 14/10/2020
#
# If you use this script towards a publication, please acknowledge the
# Sydney Informatics Hub (or co-authorship, where appropriate).
#
# Suggested acknowledgement:
# The authors acknowledge the scientific and technical assistance
# <or e.g. bioinformatics assistance of <PERSON>> of Sydney Informatics
# Hub and resources and services from the National Computational
# Infrastructure (NCI), which is supported by the Australian Government
# with access facilitated by the University of Sydney.
#
#########################################################
# Cohort name; the sample config is read from <cohort>.config.
# NOTE: '<cohort>' is a template placeholder and has to be replaced with the
# real cohort name before the script is run - as written, bash parses the
# angle brackets as redirections rather than as part of the value.
cohort=<cohort>
# Sample config for this cohort (first line is treated as a header).
config=${cohort}.config

# true:  samples whose lab ID ends in '-N' go to the normal list and all
#        other samples go to the tumour list
# false: every sample goes to the single combined list
group=false

t_input=./Inputs/bqsr_recal.inputs-tumour
n_input=./Inputs/bqsr_recal.inputs-normal
input=./Inputs/bqsr_recal.inputs

# Start from a clean slate: the task lists are appended to, so stale copies
# from a previous run must be removed first.
mkdir -p ./Inputs
rm -f -- "${t_input}" "${n_input}" "${input}"

#######################################
# Collect the interval list files of a directory into the 'intervals' array.
# The shell glob is used instead of parsing 'ls' output so that paths
# containing whitespace stay intact; glob results are sorted by the shell.
# Globals:
#   intervals (written) - array of paths matching <dir>/*list, or empty
# Arguments:
#   $1 - directory of text files generated by GATK SplitIntervals
# Outputs:
#   A message on stderr when no matching file exists
# Returns:
#   0 if at least one file was found, 1 otherwise
#######################################
find_interval_lists() {
  local interval_dir=$1

  intervals=("${interval_dir}"/*list)

  # An unmatched glob is left behind as the literal pattern; treat that as
  # "no files" instead of passing a bogus path downstream.
  if [[ ! -e ${intervals[0]} ]]; then
    printf 'No interval list files found matching %s/*list\n' \
      "${interval_dir}" >&2
    intervals=()
    return 1
  fi
  return 0
}

find_interval_lists ./Reference/BQSR_intervals
#######################################
# Append BaseRecalibrator task lines for every sample listed in a config file.
# The first line of the config is skipped as a header. For each remaining row
# the second whitespace-delimited field is taken as the lab sample ID, and one
# line "labSampleID,index,interval_file" is appended per entry of 'intervals'.
# Rows without a second field (including blank lines) are skipped, and a final
# row lacking a trailing newline is still processed.
# Globals:
#   group     (read) - 'true' sends IDs ending in '-N' to n_input and all
#                      other IDs to t_input; any other value sends every
#                      sample to input
#   input, n_input, t_input (read) - task list files that are appended to
#   intervals (read) - array of interval list files
# Arguments:
#   $1 - path to the sample config file
# Outputs:
#   A message on stderr when the config cannot be read
# Returns:
#   1 if the config file is not readable, 0 otherwise
#######################################
make_bqsr_inputs() {
  local config_file=$1
  local header_line sample_id lab_sample_id rest group_input i

  if [[ ! -r ${config_file} ]]; then
    printf 'Cannot read sample config: %s\n' "${config_file}" >&2
    return 1
  fi

  {
    read -r header_line # discard the header row

    # '|| [[ -n ... ]]' keeps a last row that has no trailing newline.
    while read -r sample_id lab_sample_id rest || [[ -n ${lab_sample_id} ]]; do
      # Blank or single-column rows carry no lab sample ID.
      [[ -n ${lab_sample_id} ]] || continue

      if [[ ${group} = true ]]; then
        if [[ ${lab_sample_id} = *-N ]]; then
          group_input=${n_input}
        else
          group_input=${t_input}
        fi
      else
        group_input=${input}
      fi

      for ((i = 0; i < ${#intervals[@]}; i++)); do
        # Data goes through %s so that '%' or '\' in an ID or path is
        # written literally instead of being interpreted by printf.
        printf '%s,%s,%s\n' "${lab_sample_id}" "${i}" "${intervals[i]}" \
          >> "${group_input}"
      done
    done
  } < "${config_file}"

  return 0
}

make_bqsr_inputs "${config}"
#######################################
# Print the number of tasks (lines) held in an inputs file, if it exists.
# Arguments:
#   $1 - label printed in front of the count
#   $2 - path to the inputs file
# Outputs:
#   "<label>: <line count>" on stdout; nothing when the file does not exist
# Returns:
#   0 when the file is absent; otherwise the status of the final printf
#######################################
report_task_count() {
  local label=$1
  local inputs_file=$2
  local tasks

  # Quoted test: an empty or space-containing path is simply "not a file".
  [[ -f ${inputs_file} ]] || return 0

  tasks=$(wc -l < "${inputs_file}")
  printf '%s: %s\n' "${label}" "${tasks}"
}

# Only the lists that were actually written are reported.
report_task_count 'Number of BaseRecalibrator tasks to run' "${input}"
report_task_count 'Number of BaseRecalibrator normal sample tasks to run' "${n_input}"
report_task_count 'Number of BaseRecalibrator tumour sample tasks to run' "${t_input}"