-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathn.py
453 lines (365 loc) · 18.2 KB
/
n.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
import os
import re
import commands
import sys
print 'input file >>',sys.argv[1]
print 'Intergenic distance condition(bp) >>',sys.argv[2]
print 'Database table name >>',sys.argv[3]
#update : use for only select neighborhood (skip blast process)
##############################################################################################################
##############################################################################################################
#Version7 fix bug, loss some diverging neighobor gene (replace < with <= and remove method add method split_gene_position specific for divergent transciption)
#to select the neighboring gene around the query gene
#add input from user
#Version6 connect to photosyn db
#Version5 modify output report , remove > sign from query name and neighbor, add Diverging_neighbor type
#Version4 Include neighboring gene from diverging transcription
#Version3 complete neighboring genes selection in condition of in_range X bp, overlaped frame, circular genome, read both strands
def create_arrange_genome(genome_id):
#original file is arrange_orf.py
#To arrange orf file, separate 2 different strand
#strand_1 for 5' to 3'
#strand_2 for 3' to 5'
file_name="./OnlyGenome/"+genome_id+"_prodigal.orf.fsa" # need to edit !!!
os.system("awk '/>/' "+file_name+" > out.orf_name")
strand_1=''
strand_2=''
for i in open('out.orf_name'):
i=i.replace('\n','')
col=i.split('_')
cds=col[len(col)-1]
cds_strand=cds.split('-')
if int(cds_strand[1])-int(cds_strand[0]) >0:
strand_1=strand_1+i+'\n'
else:
strand_2=strand_2+i+'\n'
write_to_file=open("strand_1.orf_name",'w')
write_to_file.write(strand_1)
write_to_file.close()
write_to_file=open("strand_2.orf_name",'w')
write_to_file.write(strand_2)
write_to_file.close()
#for remove intermediate files
os.system("rm out.orf_name")
def select_neighborhood(query_first_input,input_name,chr_id,dist,circle, chr_len, out_cyto, min_operonic, operonic_dist):
Cytoscape=""
if os.path.exists(query_first_input+'.neighbor') == False:
out_neighbor_list=open(query_first_input+'.neighbor','w')
out_neighbor_list.close()
out_neighbor_list=open(query_first_input+'.neighbor','a')
query=input_name
genome_id=query.split('_')
#To arrange orf file
create_arrange_genome(chr_id)
#query is AE006470_AE01278_CDS_203664-202288
num_gene=query
tab=query.split('_')
gene_position_str_end=tab[len(tab)-1].split('-')
query_str = gene_position_str_end[0]
query_end = gene_position_str_end[1]
out_neighbor_list.write("#Find gene neighborhood for query "+num_gene+'\n')
#To check which strand
if int(query_end)-int(query_str) > 0:
out_neighbor_list.write("#read orf file strand 1\n")
neighboring_gene=neighboring(dist,circle, chr_len,query, "plus", "strand_1.orf_name", query_str, query_end, num_gene,'neighbor',query, '000')
out_neighbor_list.write(neighboring_gene[0])
Cytoscape = Cytoscape+neighboring_gene[1]
if len(neighboring_gene[0]) ==0:
out_neighbor_list.write("#No neighboring gene found\n")
out_neighbor_list.write("#==============================================================================\n")
else:
first_cds = split_position(neighboring_gene[4], 'plus')
first_cds_str = first_cds[0]
first_cds_end = first_cds[1]
#search_gene_in_range(str_position, operonic_distance, oposit_strand, first_cds_str)
divergent=search_gene_in_range(first_cds_str-min_operonic, operonic_dist-min_operonic, "minus", circle, chr_len, first_cds_str)
if divergent[0] == None:
out_neighbor_list.write("#No diverging transcription found\n")
out_neighbor_list.write("#==============================================================================\n")
else:
divergent_position = split_gene_position(divergent[0]) #method (neighboring) will invert str and end position again
diverging_str = divergent_position[0]
diverging_end = divergent_position[1]
neighboring_gene_diverging = neighboring(dist,circle, chr_len, divergent[0], "minus", "strand_2.orf_name", diverging_str, diverging_end, num_gene+' Diverging neighborhood','Diverging_neighbor',query,divergent[1])
Cytoscape = Cytoscape+neighboring_gene_diverging[1]
out_neighbor_list.write('#Found Diverging trascription near '+neighboring_gene[4]+'\n')
out_neighbor_list.write('#Diverging trascription\n'+neighboring_gene_diverging[0])
out_neighbor_list.write("#==============================================================================\n")
elif int(query_end)-int(query_str) < 0:
out_neighbor_list.write("#read orf file strand 2\n")
neighboring_gene=neighboring(dist,circle, chr_len,query, 'minus', "strand_2.orf_name", query_str, query_end, num_gene,'neighbor',query,'000')
out_neighbor_list.write(neighboring_gene[0])
Cytoscape=Cytoscape+neighboring_gene[1]
if len(neighboring_gene[0]) ==0:
out_neighbor_list.write("#No neighboring gene found\n")
out_neighbor_list.write( "#==============================================================================\n")
else:
first_cds = split_position(neighboring_gene[4], 'minus')
first_cds_str = first_cds[0]
first_cds_end = first_cds[1]
#search_gene_in_range(str_position, operonic_distance, oposit_strand, first_cds_str)
divergent=search_gene_in_range(first_cds_str+min_operonic, operonic_dist-min_operonic, "plus", circle, chr_len, first_cds_str)#if min_operonic = 200, operonic_dist should be operonic_dist-min_operonic or if operonic dist = 1000 then 1000-200=800
if divergent[0] == None:
out_neighbor_list.write("#No diverging transcription found\n")
out_neighbor_list.write( "#==============================================================================\n")
else:
divergent_position = split_gene_position(divergent[0]) #method (neighboring) will invert str and end position again
diverging_str = divergent_position[0]
diverging_end = divergent_position[1]
neighboring_gene_diverging = neighboring(dist,circle, chr_len, divergent[0], "plus", "strand_1.orf_name", diverging_str, diverging_end, num_gene+' Diverging neighborhood','Diverging_neighbor',query,divergent[1])
Cytoscape = Cytoscape+neighboring_gene_diverging[1]
out_neighbor_list.write('#Found Diverging trascription near '+neighboring_gene[4]+'\n')
out_neighbor_list.write('#Diverging trascription\n'+neighboring_gene_diverging[0])
out_neighbor_list.write("#==============================================================================\n")
else:
print "Query input is incorrect"
if out_cyto == True:
write_to_file=open(query_first_input+'.cyto','w')
write_to_file.write(Cytoscape)
write_to_file.close()
#print Cytoscape
out_neighbor_list.close()
############################################
#neighboring(dist,circle, chr_len,col (query full name), "minus", "strand_1.orf_name", query_str, query_end, num_gene_of_query, original_query_gene, operonic_distance)
def neighboring(dist,circle, chr_len,col, strand, file_input, query_str, query_end, num_gene, type, original_query,operonic_dist):
col=col.replace('>','')
neighbor=""
Cytoscape=""
cds_str=[]
cds_end=[]
first_cds=""
query_str_in_def = int(query_str)
query_end_in_def = int(query_end)
strand_sign='+'
if strand == 'minus':
strand_sign='-'
query_str_in_def = int(query_end)
query_end_in_def = int(query_str)
file_orf1=open(file_input,'r')
file_orf1_out=file_orf1.read()
orf1_out=file_orf1_out.split('\n')
file_orf1.close()
del orf1_out[len(orf1_out)-1]
#for report start gene of diverging transcription
if type == 'Diverging_neighbor':
original_query=original_query.replace('>','')
neighbor=neighbor+original_query+'\t'+col+'\t'+strand_sign+'\t'+"Separated_"+str(operonic_dist)+'_bp\t'+'*'+'\t'+type+'\t'+'\n'
col=original_query ###
k=0 # k is position of pivot
while k <= len(orf1_out)-1:
db_match = split_position(orf1_out[k],strand)
db_str = db_match[0]
db_end = db_match[1]
#To match between query and orf file
#print query_str_in_def, db_str, query_end_in_def , db_end
if query_str_in_def == db_str and query_end_in_def == db_end:
first_cds= orf1_out[k]
print "#Query gene "+str(num_gene)+" Found"
cds_str.append(int(db_str))
cds_end.append(int(db_end))
#Right direction >>
l=1 # l is position of neighborhood according to the arranged genes in file strand.orf_name
#k postion of matched gene
if k == len(orf1_out)-1:
l=0
while k+l <= len(orf1_out)-1:
#k+l-1 first next position
db = split_position(orf1_out[k+l-1],strand)
db_str = db[0]
db_end = db[1]
#k+l second next position
db2 = split_position(orf1_out[k+l],strand)
db2_str = db2[0]
db2_end = db2[1]
db3 = split_position(orf1_out[0],strand)
db3_str = db3[0]
db3_end = db3[1]
if int(db_str)<= int(db2_str) <= (int(db_end)+int(dist)) : ####fix
intergenic_dist=int(db2_str) - int(db_end) #+1 for first bp counting ####fix
#Checking neighborhood conditions
overlap='Separated_'+str(intergenic_dist)+'_bp'
#to check wether overlaped gene [3]
if int(db_str)<= int(db2_str) <= int(db_end) :
overlapped_dist= int(db_end) - int(db2_str) +1
overlap='Overlapped_'+str(overlapped_dist)+'_bp'
########### OUTPUT UPDATE ###########
neighbor_name2=orf1_out[k+l].replace('>','')
neighbor=neighbor+col+'\t'+neighbor_name2+'\t'+strand_sign+'\t'+overlap+'\t'+'>'+'\t'+type+'\n'
Cytoscape=Cytoscape+orf1_out[k+l-1]+'\t'+orf1_out[k+l]+'\n'
if strand == 'minus': #To save the first cds of operon
first_cds= orf1_out[k+l]
db2=split_position(orf1_out[k+l],strand)
cds_str.append(int(db2[0]))
cds_end.append(int(db2[1]))
#Checking circular genome neighborhood
if circle == True and k+l == len(orf1_out)-1 and int(db3_str) <= (int(db2_end) + dist ) - chr_len :
#need to separate condition because k+l == len(orf1_out)-1 is not in first loop
cir_intergenic_dist = ( chr_len - int(db2_end) ) + int(db3_str)
########### OUTPUT UPDATE ###########
neighbor_name=orf1_out[0].replace('>','')
neighbor=neighbor+col+'\t'+neighbor_name+'\t'+strand_sign+'\t'+"Separated_"+str(cir_intergenic_dist)+'_bp'+'\t'+'>'+'\t'+type+'\t'+"Circular_neighbor"+'\n'
Cytoscape=Cytoscape+orf1_out[-1]+'\t'+orf1_out[0]+'\n'
cds_str.append(int(db3_str))
cds_end.append(int(db3_end))
if strand == 'minus':
first_cds= orf1_out[0]
l= -k+1 # to reset l for covering circular genome
l=l+1
else :
break
#Left direction <<
l= 0 # l is position of neighborhood according to the arranged genes in file strand.orf_name
while k+l <= k :#
db = split_position(orf1_out[k+l],strand)
db_str = db[0]
db_end = db[1]
db2 = split_position(orf1_out[k+l-1],strand)
db2_str = db2[0]
db2_end = db2[1]
ex_len= int(db_str) - dist #excessive chromosome
ex_len_chr= chr_len + ex_len
if int(dist) < int(db_str) and (k+l-1 != 0) and (int(db_str)-int(dist)) <= int(db2_end) <= int(db_end): ####fix #Checking whether first position and circular chlomosome !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
intergenic_dist= int(db_str) - int(db2_end) ####fix
overlap='Separated_'+str(intergenic_dist)+'_bp'
#to check wether overlapped gene
if int(db_str)<= int(db2_end) <= int(db_end):
overlapped_dist = int(db2_end) - int(db_str) +1
overlap='overlapped_'+str(overlapped_dist)+'_bp'
########### OUTPUT UPDATE ###########
neighbor_name3=orf1_out[k+l-1].replace('>','')
neighbor=neighbor+col+'\t'+neighbor_name3+'\t'+strand_sign+'\t'+overlap+'\t'+'<'+'\t'+type+'\n'
Cytoscape=Cytoscape+orf1_out[k+l]+'\t'+orf1_out[k+l-1]+'\n'
if strand == 'plus': #To save the first cds of operon
first_cds= orf1_out[k+l-1]
cds_str.append(int(db2_str))
cds_end.append(int(db2_end))
l=l-1
elif circle == True and int(dist) >= int(db_str) and k+l == 0 and int(ex_len_chr) <= int(db2_end) <= int(chr_len): #Checking whether first position and circular chlomosome
#because array position can be a minus sign e.g. array[-1] >> the last array , no need to reset 'l' like above
cir_intergenic_dist = int(db_str) + ( chr_len - int(db2_end) )
########### OUTPUT UPDATE ###########
neighbor_name4=orf1_out[k+l-1].replace('>','')
neighbor=neighbor+col+'\t'+neighbor_name4+'\t'+strand_sign+'\t'+"Separated_"+str(cir_intergenic_dist )+'_bp'+'\t'+'<'+'\t'+type+'\t'+"Circular_neighbor"+'\n'
Cytoscape=Cytoscape+orf1_out[k+l]+'\t'+orf1_out[k+l-1]+'\n'
if strand == 'plus': #To save the first cds of operon
first_cds= orf1_out[k+l-1]
cds_str.append(int(db2_str))
cds_end.append(int(db2_end))
l=l-1
else:
break
#else:
#print "No gene match"
k=k+1
#neighbor; list of neighboring gene and details, Cytoscape; list of neighboring gene in cytoscape relationship format, cds_position; position of genes
return neighbor, Cytoscape, cds_str, cds_end, first_cds
#to convert name to only location
def split_position(name,strand):
tab2=name.split('_')
db2_position_str_end=tab2[len(tab2)-1].split('-')
db2_str = int(db2_position_str_end[0])
db2_end = int(db2_position_str_end[1])
if strand == 'minus':
db2_str = int(db2_position_str_end[1])
db2_end = int(db2_position_str_end[0])
return db2_str, db2_end
def split_gene_position(name):
tab2=name.split('_')
db2_position_str_end=tab2[len(tab2)-1].split('-')
db2_str = int(db2_position_str_end[0])
db2_end = int(db2_position_str_end[1])
return db2_str, db2_end
def search_gene_in_range(db_position, distance, strand, circular, chr_len,first_cds_str): #first_cds_str= position of query gene for searching
accual_operonic_dist=None
list_diverging=None
filename="strand_1.orf_name"
if strand == "minus":
filename="strand_2.orf_name"
open_arrange_file=open(filename,'r')
arrange_file=open_arrange_file.read()
arrange_file=arrange_file.split('\n')
open_arrange_file.close()
del arrange_file[len(arrange_file)-1]
#db_position is strat position of query which run on different strand
index_count = 0
while index_count <= len(arrange_file)-1 and index_count >= -(len(arrange_file)-1):
position=split_position(arrange_file[index_count],strand)
if strand == 'minus':
#run on minus strand
if circular == True and (db_position-distance) < 0: #for checking circular chr.
if int(position[1]) in range(chr_len+(db_position-distance), chr_len): #circular genome
list_diverging = arrange_file[index_count] #return only nearest gene of the query
accual_operonic_dist = chr_len - int(position[1]) + int(first_cds_str) +1
break
else:
if int(position[1]) in range(db_position-distance, db_position):
list_diverging = arrange_file[index_count] #return only nearest gene of the query
accual_operonic_dist = int(first_cds_str) - int(position[1]) +1
break
index_count = index_count-1
else:
#run on plus strand
if circular == True and (db_position+distance) > chr_len: #distance for plus strand is positive value
if int(position[0]) in range(0, (db_position+distance)-chr_len): #circular genome
list_diverging = arrange_file[index_count] #return only nearest gene of the query
accual_operonic_dist = chr_len - int(first_cds_str) + int(position[0]) +1
break
else:
if int(position[0]) in range(db_position, db_position+distance):
list_diverging = arrange_file[index_count] #return only nearest gene of the query
accual_operonic_dist = int(position[0]) - int(first_cds_str) +1
break
index_count = index_count+1
return list_diverging, accual_operonic_dist
##############################################################################################################
##############################################################################################################
##########################
#write from list to file
def write2file(list_1,out_1):
file_out=open(out_1,'w')
for i in list_1:
line=''
for j in i:
line=line+str(j)+'\t'
file_out.write(line+'\n')
file_out.close()
#########################
import sqlite3
conn = sqlite3.connect("Photosyn.db")
c = conn.cursor()
DB_mcl_tablename=sys.argv[3]
#blast_query_to_genome("list_genomes.txt", "list_query.txt")
#report_file(0,0,1e-10) #set criteria for qiden, qcov, evalue
#select_neighborhood(250, True , 4986812, True, 200, 1000) #select_neighborhood(query_list_input,query_name,FIX_id,intergenic_dist,circle_chromosome,chromosome_length, Cytoscape_output, interoperonic_min_distance, interoperonic_max_distance)
#Make quey protein id file =>' default is query_prot_id '
#os.system('ls ./query_cluster/ > list_query_prot_id')
file_query_cluster=sys.argv[1]
os.system('cut -f1 '+file_query_cluster+' |sort -u > query_cluster_id.ext')
for cluster_id in open('query_cluster_id.ext'):
cluster_id=cluster_id.replace('\n','')
#os.system('cp ./query_cluster/'+cluster_id+' ./query_prot_id')#optional
#in this case , I extract cluster id from database to be queries
print 'Extract protein sequences from cluster ID '+str(cluster_id)
c.execute("SELECT Genome_accession.FIX_ID,cds.genome_acc,Chr_lenght,Chr_circulation,"+DB_mcl_tablename+".seq_id FROM "+DB_mcl_tablename+",cds,Genome_accession WHERE cluster_id="+str(cluster_id)+" AND cds.cds_ID="+DB_mcl_tablename+".seq_id AND Genome_accession.genome_acc=cds.genome_acc")
result = c.fetchall()
query_first_input='query_prot_id_'+str(cluster_id)+'.ext'
write2file(result,query_first_input)
for genomes in open(query_first_input):
genomes=genomes.replace('\n','')
col=genomes.split()
if col[3] == 'circular':
cir=True
else:
cir=False
gen=int(sys.argv[2])
op_min=200
op_max=1000
print 'Read genome : '+col[1]+' (FIX_ID '+str(col[0])+')'
print 'Chromosome length : '+str(col[2])
print 'Circular chromosome : '+str(cir)
print 'Intergenic distance cutoff : '+str(gen)
print 'Interoperonic minimum distance : '+str(op_min)
print 'Interoperonic maximum distance : '+str(op_max)
select_neighborhood(query_first_input,col[4],str(col[0]),gen, cir , int(col[2]), True, op_min, op_max)
#select_neighborhood(query_list_input,query_name,FIX_id,intergenic_dist,circle_chromosome,chromosome_length, Cytoscape_output, interoperonic_min_distance, interoperonic_max_distance)
conn.close()