-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotate.py
479 lines (372 loc) · 19.5 KB
/
annotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio.Blast.Applications import NcbiblastxCommandline
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
import argparse
import os
import glob
# Make a directory if it doesn't already exist.
# Parameters: the name of the directory.
def make_directory(dir_name):
if not os.path.exists(dir_name):
os.makedirs(dir_name)
# Given a file, or a the full path to the file,
# return only the file name with no file extension.
def return_only_name(full_path):
path_and_name = os.path.split(full_path)
name_and_extension = os.path.splitext(path_and_name[1])
return name_and_extension[0]
# Given an id for a protein record from our Chlamy
# fasta database, return the id of that protein.
# (Eg: jgi|Chlre4|28893|estExt_fgenesh1_pg.C_80374
# returns 28893)
def return_id_number(protein_id):
id_components = protein_id.split("|")
return id_components[2]
# Given blast id for a protein record, return the id
# of that protein.
# (Eg: gnl|BL_ORD_ID|10482 jgi|Chlre4|153454|Chlre2_kg.scaffold_58000016
# returns 153454)
def return_id_number_blast(protein_id):
id_components = protein_id.split("|")
return id_components[4]
# Given a sequence and a name for that sequence
# create a SeqRecord.
# Parameters: a sequence as a string, and a name
# for that sequence.
# Return a SeqRecord.
def create_fasta_record(seq, record_name):
return SeqRecord(Seq(seq), name=record_name, id=record_name, description="")
# Given a SeqIO record, add it to a (multi) fasta file.
def add_to_fasta_file(record, multi_fasta):
opened = open(multi_fasta, 'a')
SeqIO.write(record, opened, "fasta")
# Create individual fasta files from a multi-fasta file.
# Parameters: a multi-fasta file, name for a directory
# to put all the new individual fasta files in.
def make_individual_fatsa_files(multi_fasta, fasta_dir):
make_directory(fasta_dir)
# Create an individual fasta file for each of our contigs
for record in SeqIO.parse(multi_fasta,"fasta"):
file_name = record.id
SeqIO.write(record, fasta_dir + "/" + file_name +".fasta", "fasta")
# Create individual protein files from a multi-fasta file.
# Parameters: a multi-fasta protein file, name for a directory
# to put all the new protein fasta files in.
def make_individual_protein_files(protein_fasta, fasta_dir):
make_directory(fasta_dir)
# Create an individual fasta file for each of our contigs
for record in SeqIO.parse(protein_fasta,"fasta"):
file_name = return_id_number(record.id)
SeqIO.write(record, fasta_dir + "/" + file_name +".fasta", "fasta")
# Make an individual fasta file out of the subsequence
# of a given fasta file.
# Parameters: the input fasta file to use, where to put the new file,
# the beginning and end points in the sequence to cut out for the new
# file.
def make_individual_fasta_file(input_fasta, output_name, seq_begin, seq_end):
# adjust the 1 based indexing of the sequences, to the 0-based in the files
seq_begin = seq_begin - 1
seq_end = seq_end
record = SeqIO.read(input_fasta, "fasta")
record.seq = record.seq[seq_begin:seq_end]
SeqIO.write(record, output_name, "fasta")
# Return the length of the sequence in a given fasta file.
def return_length(input_fasta):
record = SeqIO.read(input_fasta, "fasta")
return len(record.seq)
# Given a line from a gff file (generated by Exonerate)
# replace the feature names with ones
# that Augustus will recognize.
def change_feature_names(line):
line = line.replace("cds","CDS")
line = line.replace("splice3","ass")
line = line.replace("splice5","dss")
return line
# Given a line from a gff file (generated by Exonerate)
# add a "source=X" tag to the end of the line
# for Augustus.
# Parameters: the line to modify, the source character
# to add (source_char = 'P', for example, will add the tag "source=P")
def add_source_for_augustus(line, source_char):
# cut off the newline character
line = line[0:-1]
# add our desired source. And the newline char bakc.
line = line + ";source=" + source_char + "\n"
return line
# Given a gff file generated by Exonerate
# replace the feature names with ones
# that Augustus will recognize, and
# add a source tag to the end.
# Parameters: the file to modify, and the name
# of the output file to generate.
def modify_gff_file(to_modify, output, source_char):
to_modify = to_modify.encode("ascii")
output = output.encode("ascii")
begin = []
end = []
with open(to_modify) as infile, open(output, "w") as outfile:
for line in infile:
if line.startswith("#") == False: # ignore comment lines
line = change_feature_names(line)
line = add_source_for_augustus(line, source_char)
partitioned = line.split("\t")
#if partitioned[2] == "gene":
#begin.append(int(partitioned[3]))
#end.append(int(partitioned[4]))
outfile.write(line)
#lowest_start = min(begin)
#highest_end = max(end)
#return lowest_start, highest_end
# For a given contig, concatenate all its Augustus gff output,
# and save in final output folder.
# Parameters: the contig ("Contig412" for example), the output
# folder with all the Augustus gff files, the folder to store
# the final output, and an original genomic start dict,
# where the keys are the file names (eg Contig412_1231534)
def concatenate_augustus_for_contig(contig, augustus_output, final_output, original_genomic_start_dict):
# make sure we have a directory for the final output
make_directory(final_output)
final_output_file = final_output + "/" + contig + ".gff"
# look at all Augustus output
all_files = glob.glob(augustus_output + "/*")
for output in all_files:
file_name = return_only_name(output)
original_genomic_start = original_genomic_start_dict[file_name]
# only look at output for this contig
if file_name.startswith(contig):
with open(output) as infile, open(final_output_file, "a") as outfile:
for line in infile:
if line.startswith("# start gene"):
outfile.write(line)
continue
if line.startswith("#") == False:
partitioned = line.split("\t")
if partitioned[1] == "AUGUSTUS":
# adjust coordinates
#print "\n\n\n"
#print "genomic start " + str(original_genomic_start)
#print "current 3 " + partitioned[3]
#print "current 4 " + partitioned[4]
#print "readjusted " + str(original_coordinates(int(partitioned[3]), original_genomic_start))
partitioned[3] = str(original_coordinates(int(partitioned[3]), original_genomic_start))
partitioned[4] = str(original_coordinates(int(partitioned[4]), original_genomic_start))
#print "in partitioned[3] " + partitioned[3]
#print "in partitioned[4] " + partitioned[4]
line = "\t".join(partitioned)
#print "to write to file " + line
outfile.write(line)
# For a given contig, concatenate all of the protein sequences
# of the Augustus output into a multi-fasta file.
# Parameters: the contig ("Contig412" for example), the output
# folder with all the Augustus gff files, the folder to store
# the final output
def generate_multi_fasta_from_augustus(contig, augustus_output, final_output):
# make sure we have a directory for the final output
make_directory(final_output)
final_output_file = final_output + "/" + contig + ".fasta"
protein_seq = []
reading_seq = False
# look at all Augustus output
all_files = glob.glob(augustus_output + "/*")
for output in all_files:
file_name = return_only_name(output)
# only look at output for this contig
if file_name.startswith(contig):
with open(output) as infile, open(final_output_file, "a") as outfile:
for line in infile:
if line.startswith("# protein sequence ="):
reading_seq = True
partitioned = line.split("[")
for char in partitioned[1]:
if char != "]" and char != "\n" and char != "" and char != "#" and char != " ":
protein_seq.append(char)
if line.endswith("]\n"):
reading_seq = False
protein_seq = ''.join(protein_seq)
record = create_fasta_record(protein_seq, file_name)
add_to_fasta_file(record, final_output_file)
protein_seq = []
continue
if reading_seq == True:
for char in line:
if char != "]" and char != "\n" and char != "" and char != "#" and char != " ":
protein_seq.append(char)
if line.endswith("]\n"):
reading_seq = False
protein_seq = ''.join(protein_seq)
record = create_fasta_record(protein_seq, file_name)
add_to_fasta_file(record, final_output_file)
protein_seq = []
# Run blastx on a directory contaning fasta files. Put the
# output in a directory specified by blastx_output_dir.
# Parameters: name of a directory with all the fasta files,
# the protein data-base to use, and the name for an output directory
# for blastx.
def blastx_fasta_files(fasta_dir, protein_db, blastx_output_dir):
make_directory(blastx_output_dir)
all_files = glob.glob(fasta_dir + "/*")
for fasta_file in all_files:
contig_name = return_only_name(fasta_file)
saved_output = blastx_output_dir + "/" + contig_name + ".xml"
#if contig_name == "Contig325": #our shortest contig
print "Running blastx on " + contig_name + "..."
blastx_cline = NcbiblastxCommandline(cmd='blastx', query=fasta_file, db=protein_db, outfmt=5, out = saved_output)
#print blastx_cline
stdout, stderr = blastx_cline()
# Given a blastx alignment, determine if it
# "matched well."
# Parameters: an e value.
# Return: True, if it passes our thresholding
# strategy. False otherwise.
def blastx_filter(e):
if e < .0001:
return True
else:
return False
# Widen the end points of a given protein hit.
# Be sure we don't jump over the contig end point.
def adjust_end_points(start, end, total_length):
new_start = start - 1000
new_end = end + 1000
if new_end > total_length:
new_end = total_length
if new_start < 1:
new_start = 1
return new_start, new_end
# Return to the original coordiante system on a contig.
# Parameters: the current index of an element, the original
# genomic start point of this partition.
# Return: the coordinate in the original system.
def original_coordinates(current, original_genomic_start):
original_coord = original_genomic_start + current - 1
return original_coord
# Given hsps, determine the genomic end points
# of the hit.
# Parameters: an hsp object, or a list of them.
# Return: The genomic endpoints of the hit:
# start, end.
def find_end_points(hsps):
begin = []
end = []
# each hsp_data is one hit for the protein
for hsp_data in hsps:
print "start: " + str(hsp_data.query_start) + "\tend: " + str(hsp_data.query_end)
begin.append(int(hsp_data.query_start))
end.append(int(hsp_data.query_end))
lowest_start = min(begin)
highest_end = max(end)
return lowest_start, highest_end
# Filter blastx output, run Exonerate on partitioned contigs
# and whatver proteins they have significant matches with,
# run Augustus on the Exonerate output.
# Parameters: blastx_output_dir--the blastx output,
# smaller_genomic_regions--for contig partitions, exonerate_out_dir--
# to put the exonerate output, exonerate_mod_dir--where
# modified gff files will be saved, augustus_out--where
# Augustus output will be saved, cfg_file--Absolute path to the Augustus
# extrinsicCfgFile you want to use contig_dir--where the individual contig
# files are, protein_dir--where the individiual protein files are,
# final_output--for final output, out_type--g for gff file, a for multi-fasta
# of proteins
def exonerate_augustus(blastx_output_dir, smaller_genomic_regions, exonerate_out_dir, exonerate_mod_dir, augustus_out, cfg_file, contig_dir, protein_dir, final_output, out_type):
# make sure we have directories for Contig partitions, Exonerate, Augustus, and final outputs
make_directory(smaller_genomic_regions)
make_directory(exonerate_out_dir)
make_directory(exonerate_mod_dir)
make_directory(augustus_out)
genomic_start = 1
genomic_start_dict = {}
# grab all blastx output files
all_files = glob.glob(blastx_output_dir + "/*")
# look at the xml output files for all contigs
for xml_output in all_files:
contig_number = return_only_name(xml_output)
opened_file = open(xml_output)
blast_record = NCBIXML.read(opened_file)
#for_testing = contig_number == "Contig133"
print "Now processing " + contig_number + "..."
# look at all protein hits
descriptions_alignments = zip(blast_record.descriptions, blast_record.alignments)
for element in descriptions_alignments:
description = element[0]
alignment = element[1]
hsps = alignment.hsps
protein_length = alignment.length
significant = blastx_filter(description.e)
protein_of_interest = description.title
protein_file_id = return_id_number_blast(protein_of_interest)
# only continue analysis with proteins that pass our filter
if significant: #and for_testing:# and protein_file_id == "132151":
genomic_start, genomic_end = find_end_points(hsps)
protein_of_interest = description.title
protein_file_id = return_id_number_blast(protein_of_interest)
protein_file = protein_dir + "/" + protein_file_id + ".fasta"
contig_file = contig_dir + "/" + contig_number + ".fasta"
contig_length = return_length(contig_file)
# save Contig partition, Exonerate and Augustus output as Contig#_ProteinID (eg: Contig325_188153)
new_file_name = contig_number + "_" + protein_file_id
# widen the blast hit boundaries
genomic_start, genomic_end = adjust_end_points(genomic_start, genomic_end, contig_length)
print genomic_start, genomic_end
genomic_start_dict[new_file_name] = genomic_start
# partition the contig according to the blast hit
partitioned_contig = smaller_genomic_regions + "/" + new_file_name + ".fasta"
make_individual_fasta_file(contig_file, partitioned_contig, genomic_start, genomic_end)
exonerate_output_file = exonerate_out_dir + "/" + new_file_name + ".gff"
exonerate_argument = "exonerate --model protein2genome -q " + protein_file + " -t " + partitioned_contig + " --showtargetgff yes --showsugar no --showalignment no --showvulgar no --verbose 0 > " + exonerate_output_file
os.system(exonerate_argument)
# modify the Exonerate output, so that Augustus can use it
exonerate_modified_file = exonerate_mod_dir + "/" + new_file_name + ".gff"
modify_gff_file(exonerate_output_file, exonerate_modified_file, 'M')
augustus_out_file = augustus_out + "/" + new_file_name + ".gff"
augustus_arg = "augustus " + partitioned_contig + " --species=chlamydomonas --strand=both --hintsfile=" + exonerate_modified_file + " --outfile=" + augustus_out_file #+ " --extrinsicCfgFile=" + cfg_file
os.system(augustus_arg)
if out_type == 'g' or True:
concatenate_augustus_for_contig(contig_number, augustus_out, final_output, genomic_start_dict)
if out_type == 'a' or True:
generate_multi_fasta_from_augustus(contig_number, augustus_out, final_output)
print "Done with " + contig_number + "."
################
# "Main":
################
# build the command-line parser
parser = argparse.ArgumentParser(description='CSE 182: Genome annotation project.')
parser.add_argument('-out', required=True, choices=['g', 'a'], help ='g for GFF output, a for a multi_fasta file of predicted protein, c for cDNA')
parser.add_argument('-input', required=True, help='Fasta data base with all contigs')
parser.add_argument('-p', required=True, help='Protein ortholog data base file')
parser.add_argument('-m', required=False, help='Optional cDNA file')
parser.add_argument('-cfg', required=False, help='Absolute path to the Augustus extrinsicCfgFile you want to use. Ex: /home/david/augustus.2.7/config/extrinsic/extrinsic.MP.cfg')
# parse command line arguments
args = parser.parse_args()
# directories to create and use
individual_contig_dir = "individual_contigs"
individual_protein_dir = "individual_proteins"
blastx_output = "blastx_output"
exonerate_output = "exonerate_raw_output"
exonerate_modified_output = "exonerate_modified_output"
augustus_output = "augustus_output"
smaller_genomic_regions = "smaller_genomic_regions"
final_output = "final_output"
print "\nCreating individual fasta files for all our contigs:"
print "Saving them in folder " + "'" + individual_contig_dir + "' ..."
# break up all the contigs into individual contig files
make_individual_fatsa_files(args.input, individual_contig_dir)
print "\nCreating individual fasta files for proteins in",args.p
print "Saving them in folder " + "'" + individual_protein_dir + "' ..."
# break up all the proteins into individual protein files
make_individual_protein_files(args.p, individual_protein_dir)
print "\nRunning blastx on each of our contigs:"
print "Saving output in folder " + "'" + blastx_output + "'"
# run blastx on all contigs
blastx_fasta_files(individual_contig_dir, args.p, blastx_output)
print "\nRunning Exonerate, and then Augustus on each contig and protein sequence that matched well"
print "Saving initial contig partitions in "+ "'" + smaller_genomic_regions + "'"
print "Saving raw Exonerate output in folder " + "'" + exonerate_output + "'"
print "Saving modified gff Exonerate output in folder " + "'" + exonerate_modified_output + "'"
print "Saving Augustus gff output in folder " + "'" + augustus_output + "'"
print "Saving Final output in folder " + "'" + final_output + "'"
# filter blastx output...run Exonerate and Augustus
exonerate_augustus(blastx_output, smaller_genomic_regions, exonerate_output, exonerate_modified_output, augustus_output, args.cfg, individual_contig_dir, individual_protein_dir, final_output, args.out)