forked from ZhenyanLuo/FindPlantNLRs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
NLR_classification.sh
169 lines (139 loc) · 7.92 KB
/
NLR_classification.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/env bash
#From a bash script from https://github.com/peritob/Myrtaceae_NLR_workflow
#Use for NLR classification step in the snakemake pipeline
#This script can be used individually by running 'bash run_classification.sh {interproscan.tsv} {augustus.gff3}'
TSV_FILE=$1
GFF3_FILE=$2
# try to get a sensible prefix for the output files
OUT_PREFIX=${TSV_FILE%.tsv}
OUT_PREFIX=${OUT_PREFIX%.fasta}
OUT_PREFIX=${OUT_PREFIX%_augustus_aa}
echo tsv file: $TSV_FILE
echo gff3 file: $GFF3_FILE
echo prefix: $OUT_PREFIX
# Build a temporary working file which is the gff3 file but with two columns inserted on the left.
# The first column contains the gene ID without the extension (extracted from column 9).
# The second column contains the row number so we can return to the original gff3 order.
gawk 'BEGIN {FS="\t"; OFS="\t"} {split($9, a, "[=\\.;]"); print a[2], NR, $0}' $GFF3_FILE | sort -k 1b,1 > GFF3_temp
#
# Identify NBARC
#
gawk 'BEGIN {FS="\t"} $5=="PF00931" {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > ${OUT_PREFIX}_NBARC.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_NBARC.list | sort -k 1b,1 | uniq > NBARC_temp_1
join NBARC_temp_1 GFF3_temp | sort -k2,2 -n > NBARC_temp_2
# Reformat rows to produce new gff3 file. This outputs the annotated gene co-ordinates and orientation based on the original genome using braker outputs.
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' NBARC_temp_2 > ${OUT_PREFIX}_NBARC.gff3
for files in NBARC_temp_*; do rm ${files}; done
#
# Identify NLR
#
gawk 'BEGIN {FS="\t"} ($5=="G3DSA:3.80.10.10" || $5=="PF08263" || $5=="PF13516" || $5=="PF12799" || $5=="PF13306" || $5=="PF13855") {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > NLR_temp_1
join ${OUT_PREFIX}_NBARC.list NLR_temp_1 > ${OUT_PREFIX}_NLR.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_NLR.list | sort -k 1b,1 | uniq > NLR_temp_2
join NLR_temp_2 GFF3_temp | sort -k2,2 -n > NLR_temp_3
# Reformat rows to produce new gff3 file
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' NLR_temp_3 > ${OUT_PREFIX}_NLR.gff3
for files in NLR_temp_*; do rm ${files}; done
#
# Identify RNB
#
gawk 'BEGIN {FS="\t"} $5=="PF05659" {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > RNB_temp_1
join ${OUT_PREFIX}_NBARC.list RNB_temp_1 > ${OUT_PREFIX}_RNB.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_RNB.list | sort -k 1b,1 | uniq > RNB_temp_2
join RNB_temp_2 GFF3_temp | sort -k2,2 -n > RNB_temp_3
# Reformat rows to produce new gff3 file
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' RNB_temp_3 > ${OUT_PREFIX}_RNB.gff3
for files in RNB_temp_* ; do rm ${files} ; done
#
# Identify RNL
#
gawk 'BEGIN {FS="\t"} $5=="PF05659" {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > RNL_temp_1
join ${OUT_PREFIX}_NLR.list RNL_temp_1 > ${OUT_PREFIX}_RNL.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_RNL.list | sort -k 1b,1 | uniq > RNL_temp_2
join RNL_temp_2 GFF3_temp | sort -k2,2 -n > RNL_temp_3
# Reformat rows to produce new gff3 file
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' RNL_temp_3 > ${OUT_PREFIX}_RNL.gff3
for files in RNL_temp_* ; do rm ${files} ; done
#
# Identify RxNB
#
gawk 'BEGIN {FS="\t"} $5=="PF18052" {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > RxNB_temp_1
join ${OUT_PREFIX}_NBARC.list RxNB_temp_1 > ${OUT_PREFIX}_RxNB.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_RxNB.list | sort -k 1b,1 | uniq > RxNB_temp_2
join RxNB_temp_2 GFF3_temp | sort -k2,2 -n > RxNB_temp_3
# Reformat rows to produce new gff3 file
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' RxNB_temp_3 > ${OUT_PREFIX}_RxNB.gff3
for files in RxNB_temp_* ; do rm ${files} ; done
#
# Identify RxNL
#
gawk 'BEGIN {FS="\t"} $5=="PF18052" {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > RxNL_temp_1
join ${OUT_PREFIX}_NLR.list RxNL_temp_1 > ${OUT_PREFIX}_RxNL.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_RxNL.list | sort -k 1b,1 | uniq > RxNL_temp_2
join RxNL_temp_2 GFF3_temp | sort -k2,2 -n > RxNL_temp_3
# Reformat rows to produce new gff3 file
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' RxNL_temp_3 > ${OUT_PREFIX}_RxNL.gff3
for files in RxNL_temp_* ; do rm ${files} ; done
#
# Identify TNB
#
gawk 'BEGIN {FS="\t"} $5=="PF01582" {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > TNB_temp_1
join ${OUT_PREFIX}_NBARC.list TNB_temp_1 > ${OUT_PREFIX}_TNB.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_TNB.list | sort -k 1b,1 | uniq > TNB_temp_2
join TNB_temp_2 GFF3_temp | sort -k2,2 -n > TNB_temp_3
# Reformat rows to produce new gff3 file
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' TNB_temp_3 > ${OUT_PREFIX}_TNB.gff3
for files in TNB_temp_* ; do rm ${files} ; done
#
# Identify TNL
#
gawk 'BEGIN {FS="\t"} $5=="PF01582" {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > TNL_temp_1
join ${OUT_PREFIX}_NLR.list TNL_temp_1 > ${OUT_PREFIX}_TNL.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_TNL.list | sort -k 1b,1 | uniq > TNL_temp_2
join TNL_temp_2 GFF3_temp | sort -k2,2 -n > TNL_temp_3
# Reformat rows to produce new gff3 file
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' TNL_temp_3 > ${OUT_PREFIX}_TNL.gff3
for files in TNL_temp_* ; do rm ${files} ; done
#
# Identify CNB
#
gawk 'BEGIN {FS="\t"} $5=="Coil" {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > CNB_temp_1
join ${OUT_PREFIX}_NBARC.list CNB_temp_1 > CNB_temp_2
# Do not want genes that have any of these
gawk 'BEGIN {FS="\t"} ($5=="PF18052" || $5=="PF05659" || $5=="PF01419" || $5=="PF01582") {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > CNB_temp_exclude
grep -Fxv -f CNB_temp_exclude CNB_temp_2 > ${OUT_PREFIX}_CNB.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_CNB.list | sort -k 1b,1 | uniq > CNB_temp_3
join CNB_temp_3 GFF3_temp | sort -k2,2 -n > CNB_temp_4
# Reformat rows to produce new gff3 file
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' CNB_temp_4 > ${OUT_PREFIX}_CNB.gff3
for files in CNB_temp_* ; do rm ${files} ; done
#
# Identify CNL
#
# Working on from CNB
join ${OUT_PREFIX}_CNB.list ${OUT_PREFIX}_NLR.list > ${OUT_PREFIX}_CNL.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_CNL.list | sort -k 1b,1 | uniq > CNL_temp_2
join CNL_temp_2 GFF3_temp | sort -k2,2 -n > CNL_temp_3
# Reformat rows to produce new gff3 file
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' CNL_temp_3 > ${OUT_PREFIX}_CNL.gff3
for files in CNL_temp_* ; do rm ${files} ; done
# Identify JNB
#
gawk 'BEGIN {FS="\t"} $5=="PF01419" {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > JNB_temp_1
join ${OUT_PREFIX}_NBARC.list JNB_temp_1 > ${OUT_PREFIX}_JNB.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_JNB.list | sort -k 1b,1 | uniq > JNB_temp_2
join JNB_temp_2 GFF3_temp | sort -k2,2 -n > JNB_temp_3
# Reformat rows to produce new gff3 file
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' JNB_temp_3 > ${OUT_PREFIX}_JNB.gff3
for files in JNB_temp_* ; do rm ${files} ; done
#
# Identify JNL
#
gawk 'BEGIN {FS="\t"} $5=="PF01419" {print $1}' $TSV_FILE | sort -k 1b,1 | uniq > JNL_temp_1
join ${OUT_PREFIX}_NLR.list JNL_temp_1 > ${OUT_PREFIX}_JNL.list
gawk '{split($1, a, "."); print a[1]}' ${OUT_PREFIX}_JNL.list | sort -k 1b,1 | uniq > JNL_temp_2
join JNL_temp_2 GFF3_temp | sort -k2,2 -n > JNL_temp_3
# Reformat rows to produce new gff3 file
gawk 'BEGIN {OFS="\t"} {split($3, a, "[:\\-+]"); print a[1], $4, $5, (a[2]+$6), (a[2]+$7), $8, $9, $10, $11}' JNL_temp_3 > ${OUT_PREFIX}_JNL.gff3
for files in JNL_temp_* ; do rm ${files} ; done
rm GFF3_temp