-
Notifications
You must be signed in to change notification settings - Fork 6
/
neoFusion.py
286 lines (247 loc) · 11.9 KB
/
neoFusion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
import argparse
parser = argparse.ArgumentParser(description='It is used for FusionNeoantigen Analysis',formatter_class=argparse.RawDescriptionHelpFormatter,add_help=True)
subparsers = parser.add_subparsers(help='commands')
denovo = subparsers.add_parser('denovo',help='use STAR-Fuion to call Fusion transcript',add_help=False,formatter_class=argparse.RawDescriptionHelpFormatter)
Req = denovo.add_argument_group('Required')
Req.add_argument('--left',action='store',metavar='<left.fq>',required=True,help='fastq file')
Req.add_argument('--genome',action='store',metavar='<reference>',required=True,help='download from https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB')
Req.add_argument('--hla',action='store',metavar='<hla_allel>',required=True,help='format HLA-A02:01,HLA-B02:01')
Opt = denovo.add_argument_group('Optional')
Opt.add_argument('--right',action='store',metavar='<rigth.fq>',help='Optional,but highly recommended')
Opt.add_argument('-o',action='store',metavar='<output_dir>',default='STAR-Fusion',help='default STAR-Fusion')
Opt.add_argument('-t',action='store',metavar='<threads>',type=int,default=4, help='default 4')
Opt.add_argument('-l',action='store',metavar='<length of pep>',type=str,default='9,10,11', help='default 9,10,11')
Opt.add_argument('-e',action='store',metavar='<FFPM>',type=float,default=0.1,help='minimum fusion fragments per million rna-seq frags')
Opt.add_argument('-p',action='store',metavar='<prefix>',type=str,default='', help='outfile prefix,defalut None')
Opt.add_argument('-f','--filter',action='store_true',default=False,help='filter fusion in non-cancer tissue and cell')
Opt.add_argument('-h','--help',action='help', help='show this help message and exit')
midway = subparsers.add_parser('midway',help='user provide the fusion transcipt',add_help=False,formatter_class=argparse.RawTextHelpFormatter)
Req = midway.add_argument_group('Required')
Req.add_argument('--fusion',action='store',metavar='<protein>',required=True,help='fusion protein in fasta format')
Req.add_argument('--hla',action='store',metavar='<hla_allel>',required=True,help='format HLA-A02:01,HLA-B02:01')
Opt = midway.add_argument_group('Optional')
Opt.add_argument('-e',action='store',metavar='<FFPM>',type=float,default=0.1,help='minimum fusion fragments per million rna-seq frags')
Opt.add_argument('-p',action='store',metavar='<prefix>',type=str,default='', help='outfile prefix,defalut None')
Opt.add_argument('-l',action='store',metavar='<length of pep>',type=str,default='9,10,11', help='default 9,10,11')
Opt.add_argument('-h','--help',action='help', help='show this help message and exit')
args = parser.parse_args()
import os,re,sys,glob,subprocess,pickle,warnings
import pandas as pd,numpy as np
from subprocess import PIPE
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo as matlist
from Bio.SeqIO.FastaIO import SimpleFastaParser
from multiprocessing import Pool
from math import log, exp
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from collections import defaultdict
warnings.filterwarnings('ignore')
hydro_score = dict(R=-4.5,K=-3.9,N=-3.5,D=-3.5,Q=-3.5,E=-3.5,H=-3.2,P=-1.6, Y=-1.3,W=-0.9,S=-.8,
T=-0.7,G=-0.4,A=1.8,M=1.9,C=2.5,F=2.8,L=3.8,V=4.2,I=4.5)
def getWildpep(mut_pep):
path = '/usr/local/src/pepmatch_db_x86_64'
with open('/tmp/tmp.fa','w') as fout:
fout.write('{}\n'.format(mut_pep))
ref_seq = '/usr/local/data/reference_peptide_{}.txt'.format(len(mut_pep))
cmd = '{} -thr 5 /tmp/tmp.fa {} > /tmp/tmp.tsv'.format(path,ref_seq)
subprocess.call(cmd,shell=True)
cmd = "grep -v '#' /tmp/tmp.tsv"
line = subprocess.Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE)
lines = line.stdout.readline().decode().strip().split()
return lines[3],lines[5]
def cal(x):
return 1 / (1 + np.exp(5 *(x-2)))
def f1(Rm,Rn,H,R,mismatch,comb):
score = Rm * (1 - Rn / 2 ** mismatch) * R * comb * H
return score
def getR(neo_seq,iedb_seq):
align_score = []
a = 26
k = 4.86936
for seq in iedb_seq:
aln_score = aligner(neo_seq,seq)
if aln_score:
localds_core = max([line[2] for line in aln_score])
align_score.append(localds_core)
bindingEnergies = list(map(lambda x: -k * (a - x), align_score))
lZk = logSum(bindingEnergies + [0])
lGb = logSum(bindingEnergies)
R=exp(lGb-lZk)
return R
def aligner(seq1,seq2):
matrix = matlist.blosum62
gap_open = -11
gap_extend = -1
aln = pairwise2.align.localds(seq1.upper(), seq2.upper(), matrix, gap_open, gap_extend)
return aln
def logSum(v):
ma = max(v)
return log(sum(map(lambda x: exp(x-ma),v))) + ma
def getIEDBseq():
iedb_seq = []
with open('/usr/local/data/IEDB.fasta', 'r') as fin:
for t, seq in SimpleFastaParser(fin):
iedb_seq.append(seq)
return iedb_seq
def getModel():
mymodel = {}
model_9 = '/usr/local/data/hg_xgb_9.dat'
model_10 = '/usr/local/data/hg_xgb_10.dat'
model_11 = '/usr/local/data/hg_xgb_11.dat'
with open(model_9, 'rb') as fin1, open(model_10, 'rb') as fin2, open(model_11, 'rb') as fin3:
mymodel[9] = pickle.load(fin1)
mymodel[10] = pickle.load(fin2)
mymodel[11] = pickle.load(fin3)
return mymodel
def do_hydro_vector(pep):
hydro_vector = []
for i in pep:
hydro_vector.append(hydro_score[i.upper()])
return hydro_vector
def getH(pep,mymodel):
hydro_vector = do_hydro_vector(pep)
if len(hydro_vector) == 8:
H = .38
return H
else:
H = round(mymodel[len(hydro_vector)].predict_proba(hydro_vector)[0][1],3)
return H
def cal_netctlpan(hla,pep):
with open('/tmp/tmp.fa', 'w') as fout:
fout.write('>pep\n{}\n'.format(pep))
cmd = 'python /usr/local/src/netchop/predict.py --method netctlpan --length {} --allele {} ' \
'--epitope_threshold -99.9 --threshold -99.9 --noplot /tmp/tmp.fa > /tmp/tmp.tsv'.format(len(pep),hla)
subprocess.call(cmd,shell=True)
while True:
line = subprocess.Popen('grep {} /tmp/tmp.tsv'.format(pep),shell=True,stdin=PIPE,stdout=PIPE)
lines = line.stdout.readline().decode().strip().split()
if len(lines) == 0:
os.system(cmd)
else:
return lines[3],lines[4],lines[5]
def cal_netMHCpan(hla,pep):
with open('/tmp/tmp.fa', 'w') as fout:
fout.write('>pep\n{}\n'.format(pep))
cmd = '/usr/local/src/netMHCpan-4.0/netMHCpan -a {} -l {} -f /tmp/tmp.fa -s -BA > /tmp/tmp.tsv'.format(hla,len(pep))
subprocess.call(cmd,shell=True)
cmd = 'grep {} /tmp/tmp.tsv'.format(pep)
result = subprocess.Popen(cmd,shell=True,stdin=PIPE,stdout=PIPE)
results = result.stdout.readline().decode().strip().split()
return results[11],results[12],results[13]
def STAR_Fusion(args):
print ('######## Running STAR_Fusion ########')
if args.right:
cmd = '/usr/local/src/STAR-Fusion/STAR-Fusion --left_fq {} --right_fq {} --examine_coding_effect --verbose_level 0 ' \
'--output_dir {} --CPU {} --genome_lib_dir {} '.format(args.left, args.right, args.o,args.t, args.genome)
else:
cmd = '/usr/local/src/STAR-Fusion/STAR-Fusion --left_fq {} --examine_coding_effect --verbose_level 0 ' \
'--output_dir {} --CPU {} --genome_lib_dir {}'.format(args.left, args.o, args.t, args.genome)
subprocess.call(cmd,shell=True)
print ('######## STAR_Fusion Complete Successfully ########')
def filterFusion(args):
filein = '{}/star-fusion.fusion_predictions.abridged.coding_effect.tsv'.format(args.o)
fileout = '{}fusion_filter.txt'.format(args.p)
index = 1
if os.path.isfile(filein):
with open(filein,'r') as fin,open(fileout,'w') as fout:
for line in fin:
lines = line.strip().split('\t')
if line.startswith('#') or lines[-3] == '.' or lines[8] == 'NO_LDAS' or float(lines[9]) <= args.e:
continue
if args.filter is True:
if isNormal(lines[0]):
continue
pep = lines[-3].split('*')[0]
fout.write('>fusion{}\n{}\n'.format(index,pep))
index += 1
def isNormal(x):
mylist = []
with open('/usr/local/data/normal_fusion.tsv','r') as fin:
for line in fin:
mylist.append(line.strip())
xs = x.split('--')
if xs[0] + '_' + xs[1] in mylist or xs[1] + '_' + xs[0] in mylist:
return True
else:
return False
def netMHCpan(args,filein):
print('\n######## Running netMHCpan #############')
if not os.path.isfile(filein) or os.path.getsize(filein) ==0:
print ('fusion_filter.txt is empty,no fusion exists')
sys.exit(2)
else:
cmd = '/usr/local/src/netMHCpan-4.0/netMHCpan -a {} -f {} -l {} -s -BA > {}netMHCpan.txt'.format(args.hla,filein,args.l,args.p)
subprocess.call(cmd,shell=True)
cmd = 'grep Error {}netMHCpan.txt'.format(args.p)
line = subprocess.Popen(cmd,shell=True,stdin=PIPE,stdout=PIPE)
if line.stdout.readline():
print ('netMHCpan error,please check input for netMHCpan: HLA,protein,length')
sys.exit(2)
else:
print('######## netMHCpan Complete Successfully ########')
def netMHCpanD(args):
filein = '{}fusion_filter.txt'.format(args.p)
netMHCpan(args,filein)
def netMHCpanM(args):
filein = args.fusion
netMHCpan(args,filein)
def parsernetMHCpan(args):
print('\n######## Running neoAntigen Scoring ############')
filein = '{}netMHCpan.txt'.format(args.p)
fileout = '/tmp/{}neoScore.txt'.format(args.p)
ref = ''
iedb_seq = getIEDBseq()
mymodel = getModel()
with open('/usr/local/data/ref_annot_fusion.pep','r') as fin:
for t,seq in SimpleFastaParser(fin):
ref = seq
break
with open(filein,'r') as fin,open(fileout,'w') as fout:
fout.write('HLA\tSample\tmismatch\t')
fout.write('MTpep\tMTpep_score\tMTpep_aff\tMTpep_rank\tMTpep_comb\t')
fout.write('WTpep\tWTpep_score\tWTpep_aff\tWTpep_rank\tWTpep_comb\t')
fout.write('Hydro_Model\tR\tScore\n')
for line in fin:
if re.search('=\s+WB',line) or re.search('=\s+SB',line):
lines = line.strip().split()
mtpep = lines[2]
if ref.find(mtpep) != -1:
continue
sample = lines[10]
mtpep, mtpep_score, mtpep_aff, mtpep_rank = lines[2], lines[11], lines[12], lines[13]
wtpep, mismatch = getWildpep(mtpep)
hla = lines[1].replace('*', '')
wtpep_score, wtpep_aff, wtpep_rank = cal_netMHCpan(hla, wtpep)
mtpep_tap, mtpep_cleavage, mtpep_comb = cal_netctlpan(hla, mtpep)
wtpep_tap, wtpep_cleavage, wtpep_comb = cal_netctlpan(hla, wtpep)
R = getR(mtpep, iedb_seq)
H = getH(mtpep, mymodel)
Rm = cal(float(mtpep_rank))
Rn = cal(float(wtpep_rank))
score = f1(Rm=Rm,Rn=Rn,H=float(H),R=R,mismatch=float(mismatch),comb=float(mtpep_comb))
if score < 0 :
score = 0
out = '\t'.join((hla, sample, str(mismatch), mtpep, mtpep_score, mtpep_aff, mtpep_rank, mtpep_comb,
wtpep, wtpep_score, wtpep_aff, wtpep_rank, wtpep_comb, str(H), str(R)))
fout.write('{}\t{}\n'.format(out,str(score)))
filein = '/tmp/{}neoScore.txt'.format(args.p)
fileout = '{}neoScore.txt'.format(args.p)
if os.path.isfile(filein) and os.path.getsize(filein) > 0:
dat = pd.read_csv(filein,sep='\t',header=0)
dat.sort_values(by='Score',ascending=False,inplace=True)
dat.to_csv(fileout,sep='\t',header=True,index=False)
print('\n######## Complete Successfuly ############')
def isDenovo(args):
try:
if args.left:
return True
except:
return False
if isDenovo(args):
STAR_Fusion(args)
filterFusion(args)
netMHCpanD(args)
else:
netMHCpanM(args)
parsernetMHCpan(args)