-
Notifications
You must be signed in to change notification settings - Fork 1
/
SVprotocol_part5.sh
262 lines (231 loc) · 10.1 KB
/
SVprotocol_part5.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#!/usr/bin/sh
#===============================================================================
#* This pipeline is used to conduct differential gene expression analysis
#===============================================================================
function usage()
{
echo "
******This pipeline is used to conduct differential gene expression (DGE) analysis
Usage: SVprotocol_part5.sh <The long option is recommended>
-s, --samplelist (required) sample list, one sample per line
-c, --config (required) a config file including basic setting.
-p, --project (required) a project name as the suffix of subdirectory. default = project
-j, --thread (optional) the number of thread, default = 5
-h, --help (optional) This small usage guide
1) In your_gene_expression_analysis_dir, you must have a subdirectory named as fastq/<sample_name> including two FASTQ file (<sample_name>_1.fastq.gz and <sample_name>_2.fastq.gz)
2) Please conduct quality control by yourself.
This pipeline directly use the fastq files with no quality control.
3) Please prepare a config file following the example file
1. Containing the path of software to be used
2. You must use the variable name shown in the example file because, in this pipeline, we will use these variable names
4) Please just include chromosome-level sequence or sequence with size
larger than 250kb in the reference genome fasta file
5) <project>: Usually, we will provide a series of samples and run DGE for
one destination, e.g. normal vs infected samples. For the convenience of subsequent analysis, this pipeline will put all results into a subdirectory with the suffix name <project>.
6) Example:
SVprotocol_part5.sh --samplelist SRR.list
--config example.config
--project RM
--thread 5
"
}
OPTIONS=hs:c:j:
LONGOPTIONS=help,samplelist:,config:,thread:
PARSED=$(getopt --options=$OPTIONS --longoptions=$LONGOPTIONS --name "$0" -- "$@")
if [ $? != 0 ]; then
echo "ERROR: Terminating..."
exit 2
fi
eval set -- "${PARSED}"
FLAG_sample=0
FLAG_config=0
thread=5
project="project"
while true
do
case "$1" in
-s|--samplelist)
samplelist=$2
FLAG_sample=1
shift 2
;;
-c|--config)
config=$2
FLAG_config=1
shift 2
;;
-j|--thread)
thread=$2
shift 2
;;
-p|--project)
project=$2
shift 2
;;
-h|--help)
usage
shift
exit 0
;;
--)
shift
break
;;
*)
echo "ERROR: Unknown argument $1 . Please check again."
exit 3
;;
esac
done
until [ $FLAG_sample -eq 1 -a $FLAG_config -eq 1 ]
do
echo "ERROR: missing required variables. Please check again."
exit 1
done
function check_existed_file()
{
local file_name=$1
if [ -f $file_name ]
then
file_size=`ls -l $file_name |awk '{print $5}'`
if [ $file_size -gt 0 ]
then
return 0
else
return 1
fi
else
return 1
fi
}
#===============================================================================
#****/ Main
#===============================================================================
source $config
cd $your_gene_expression_analysis_dir
echo "SVprotocol_part5.sh"
printf "\t--samplelist\t%s\n" "$samplelist"
printf "\t--config\t%s\n" "$config"
printf "\t--project\t%s\n" "$project"
printf "\t--thread\t%s\n" "$thread"
printf "%-20s\t%-s\n" "Hostname:" "`hostname`"
printf "%-20s\t%-s\n" "Working directory:" "`pwd`"
printf "INFO\t%-s\t%-s\n" "`date`" "Start time"
#************/ Create directory
mkdir -p $your_gene_expression_analysis_dir/110.hisat2.index_genome 2>/dev/null
mkdir -p $your_gene_expression_analysis_dir/120.hisat2.genome_out.${project} 2>/dev/null
mkdir -p $your_gene_expression_analysis_dir/130.htseq.genome_out.${project} 2>/dev/null
mkdir -p $your_gene_expression_analysis_dir/140.gfold.${project} 2>/dev/null
#************/ 110.hisat2.index_genome /***************************
hisat2_index_dir=$your_gene_expression_analysis_dir/110.hisat2.index_genome
hisat2_index_prefix=$ref_genome_prefix_name
cd $hisat2_index_dir
existed_file=$hisat2_index_dir/${hisat2_index_prefix}.1.ht2
check_existed_file $existed_file
if [ $? -eq 1 ]
then
#* build index only with genome file (1h) / 3 CPUs
$hisat2_build -p $thread $ref_genome_fasta $hisat2_index_dir/$hisat2_index_prefix
printf "INFO\t%-s\t%-s\n" "`date`" "hisat2 index with genome finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "hisat2 index has been executed before, we skip this step."
fi
#************/ 120.hisat2.genome_out /***********************************
cd $your_gene_expression_analysis_dir
hisat2_analysis_dir=$your_gene_expression_analysis_dir/120.hisat2.genome_out.${project}
genome_index=$your_gene_expression_analysis_dir/110.hisat2.index_genome/$hisat2_index_prefix
cat $samplelist | while read sample
do
INDIR=$your_gene_expression_analysis_dir/fastq/$sample
OUTDIR=$hisat2_analysis_dir
existed_file=$hisat2_analysis_dir/$sample.sorted.bam
check_existed_file $existed_file
if [ $? -eq 1 ]
then
## run hisat2
$hisat2 -p $thread -x $genome_index -1 $INDIR/${sample}_1.fastq.gz -2 $INDIR/${sample}_2.fastq.gz --seed 110 -S $OUTDIR/$sample.sam && $samtools view -@ $thread -bS -h $OUTDIR/$sample.sam > $OUTDIR/$sample.bam && $samtools sort -@ $thread -T $sample $OUTDIR/$sample.bam > $OUTDIR/$sample.sorted.bam && $samtools index $OUTDIR/$sample.sorted.bam && rm -f $OUTDIR/$sample.sam $OUTDIR/$sample.bam >/dev/null 2>&1
printf "INFO\t%-s\t%-s\n" "`date`" "$sample hisat2 mapping finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "$sample hisat2 mapping has been executed before, we skip this step."
fi
done
#************/ 130.htseq.genome_out /***********************************
cd $your_gene_expression_analysis_dir
htseq_analysis_dir=$your_gene_expression_analysis_dir/130.htseq.genome_out.${project}
cat $samplelist | while read sample
do
INDIR=$hisat2_analysis_dir
OUTDIR=$htseq_analysis_dir
existed_file=$OUTDIR/$sample.sorted.htseq.gene.count
check_existed_file $existed_file
if [ $? -eq 1 ]
then
## run HTSeq basing on gene_id
# -s no : indicates we're using an unstranded RNA-seq library.
# -r pos : tells htseq-count that our BAM file is coordinate sorted.
# -f bam : indicates that our input file is in BAM format.
# --type FEATURE_TYPE:
# Feature type (3rd column in GTF file) to be used, all
# features of other type are ignored (default, suitable
# for Ensembl GTF files: exon)
#--idattr IDATTR:
# read count是基于feature来统计的;
# GTF attribute to be used as feature ID (default,
# suitable for Ensembl GTF files: gene_id). All feature
# of the right type (see -t option) within the same GTF
# attribute will be added together. The typical way of
# using this option is to count all exonic reads from
# each gene and add the exons but other uses are
# possible as well.
#--nonunique=<nonunique mode> Mode to handle reads that align to or are assigned to more than one feature in the overlap <mode> of choice (see -m option). <nonunique mode> are none, all, fraction, and random (default: none)
$htseq_count -s no -r pos -f bam --nprocesses=$thread --mode=union --type exon --idattr gene_id --additional-attr=gene_name --nonunique=all $INDIR/$sample.sorted.bam $ref_genome_gtf > $OUTDIR/$sample.sorted.htseq.gene.count
printf "INFO\t%-s\t%-s\n" "`date`" "$sample htseq count finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "$sample htseq count has been executed before, we skip this step."
fi
done
#** Combine all sample's gene count
cd $htseq_analysis_dir
mat_input_suffix="sorted.htseq.gene.count"
$perl $this_protocol_script_path/code_convertCount2Matrix.pl --dir $htseq_analysis_dir --srr_list $your_gene_expression_analysis_dir/$samplelist --out_prefix $project --input_suffix $mat_input_suffix
#** output file: <project>.htseq.gene.count.mat
#************/ 140.gfold /***********************************
cd $your_gene_expression_analysis_dir
gfold_analysis_dir=$your_gene_expression_analysis_dir/140.gfold.${project}
cd $gfold_analysis_dir
ln -s $htseq_analysis_dir/$project.htseq.gene.count.mat >/dev/null 2>&1
#**** Convert gff to bed file to be used in analysis
# output files
# out_prefix.gff.cds.bed
# out_prefix.gff.mRNA.bed
# out_prefix.gff.UTR.bed
# out_prefix.gff.promoter.bed
# out_prefix.gff.upstream.bed
# out_prefix.gff.downstream.bed
out_gff_convert_dir=`dirname $ref_genome_gff`
out_prefix=$out_gff_convert_dir/$ref_genome_prefix_name
existed_file=$out_prefix.gff.cds.bed
check_existed_file $existed_file
if [ $? -eq 1 ]
then
$perl $this_protocol_script_path/code_gff2SVAnnBed.pl $ref_genome_gff $out_prefix
printf "INFO\t%-s\t%-s\n" "`date`" "gff.cds.bed finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "gff.cds.bed has been executed before, we skip this step."
fi
#**** Convert gene count to gfold input
cd $gfold_analysis_dir
gff_cds_bed=$out_gff_convert_dir/$ref_genome_prefix_name.gff.cds.bed
sample=`head -n1 $your_gene_expression_analysis_dir/$samplelist`
existed_file=$OUTDIR/$sample.gfold.input
check_existed_file $existed_file
if [ $? -eq 1 ]
then
$Rscript $this_protocol_script_path/code_convertHTSeq2GFOLD.r --dir $your_gene_expression_analysis_dir/140.gfold.$project --htseq_mat $project.htseq.gene.count.mat --cds_bed $gff_cds_bed --prefix $project
printf "INFO\t%-s\t%-s\n" "`date`" "$sample gfold input data finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "$sample gfold input data has been executed before, we skip this step."
fi
#************/ Ending
printf "INFO\t%-s\t%-s\n" "`date`" "Congratulations"
printf "INFO\t%-s\t%-s\n" "`date`" "End"