-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathitis_taxonomy.py
91 lines (73 loc) · 2.64 KB
/
itis_taxonomy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import urllib2
import Bio.Phylo as bp
from Bio.Phylo import Newick
import os
import tarfile
# --- configuration ---------------------------------------------------------
# Location of the ITIS table archive that is downloaded below.
url = 'http://www.itis.gov/downloads/itisMySQLTables.tar.gz'
# Separator used when splitting each line of the extracted table files.
col_delimiter = '|'
# Output tree: the format name handed to Bio.Phylo and the file written.
tree_format = 'newick'
tree_filename = 'itis_taxonomy.newick'
# Working directory holding the downloaded archive and the extracted
# tables; it is created the first time the script runs.
data_dir = 'data/'
if not os.path.exists(data_dir):
    os.mkdir(data_dir)
# download the taxonomy archive (skipped when a local copy already exists)
filename = os.path.join(data_dir, url.split('/')[-1])
if os.path.exists(filename):
    print('Using existing copy of %s' % filename)
else:
    print('Downloading %s...' % filename)
    r = urllib2.urlopen(urllib2.Request(url))
    try:
        # Refuse redirected responses. This is an explicit check rather than
        # an `assert`, because asserts are stripped when Python runs with -O.
        if r.geturl() != url:
            raise IOError('Unexpected redirect from %s to %s'
                          % (url, r.geturl()))
        # Stream to a temporary name and rename only on success, so that an
        # interrupted download is never mistaken for a complete archive by
        # the "existing copy" check on the next run.
        partial_filename = filename + '.part'
        with open(partial_filename, 'wb') as output_file:
            # Copy in fixed-size chunks instead of holding the whole archive
            # in memory; read() returns an empty string at end of stream.
            for chunk in iter(lambda: r.read(1024 * 1024), b''):
                output_file.write(chunk)
        os.rename(partial_filename, filename)
    finally:
        # Always release the connection, even if the write fails.
        r.close()
# extract the tables
# Each table is pulled out of the archive into data_dir under its bare name,
# regardless of which directory it sits in inside the archive. Tables that
# were already extracted by a previous run are left alone.
for extract in ('taxonomic_units', 'longnames'):
    if os.path.exists(os.path.join(data_dir, extract)):
        print('Using existing copy of %s' % extract)
        continue
    print('Extracting %s from %s...' % (extract, filename))
    archive = tarfile.open(name=filename, mode='r:gz')
    try:
        # Match on the last path component only.
        matches = [m for m in archive.getmembers()
                   if m.name.split('/')[-1] == extract]
        if not matches:
            raise IOError('%s not found in %s' % (extract, filename))
        member = matches[0]
        # Rename the member so it is written directly into data_dir instead
        # of recreating the archive's internal directory layout.
        member.name = extract
        # Pass the TarInfo itself rather than looking it up again by its
        # (just modified) name.
        archive.extract(member, path=data_dir)
    finally:
        # Close the archive even when the lookup or extraction fails.
        archive.close()
# get names for all ITIS TSNs from longnames table
# Builds `names`, a dict mapping each TSN (kept as a string) to its name.
# Every line must split into exactly two fields on col_delimiter; any other
# field count raises ValueError from the unpacking below.
print('Getting names...')
names = {}
with open(os.path.join(data_dir, 'longnames')) as names_file:
    for record in names_file:
        tsn, long_name = record.strip().split(col_delimiter)
        names[tsn] = long_name
# read all node info from taxonomic_units
# Builds `nodes`, a dict mapping TSN -> Newick.Clade. Each clade temporarily
# carries its parent's TSN in a `parent` attribute so that the tree can be
# linked together once every node has been read.
print('Reading taxonomy...')
nodes = {}
with open(os.path.join(data_dir, 'taxonomic_units')) as nodes_file:
    for record in nodes_file:
        fields = record.strip().split(col_delimiter)
        tax_id = fields[0]
        usage = fields[10]
        parent_id = fields[17]
        # Read but not used for filtering; the check on it is disabled.
        uncertain_parent = fields[23]
        # Keep only taxa whose usage is 'accepted' or 'valid'.
        if usage not in ('accepted', 'valid'):
            continue
        clade = Newick.Clade(name=names[tax_id])
        clade.parent = parent_id
        nodes[tax_id] = clade
# An unnamed clade registered under TSN '0' serves as the root of the tree.
root_node = Newick.Clade()
nodes['0'] = root_node
# create tree from nodes dictionary
# Attach every node to the clade registered under its parent's TSN. Nodes
# whose parent TSN is not in `nodes` are left unattached.
print('Building tree...')
for node_id, child in nodes.iteritems():
    if node_id != '0':  # the root has no parent to attach to
        parent_node = nodes.get(child.parent)
        if parent_node is not None:
            parent_node.clades.append(child)
        # The temporary parent-TSN attribute is no longer needed.
        del child.parent
tree = Newick.Tree(root=root_node)
# write tree to file
# Bio.Phylo.write takes an iterable of trees, hence the single-element list.
print('Writing %s tree to %s...' % (tree_format, tree_filename))
bp.write([tree], tree_filename, tree_format)
# The stray '' literal that was implicitly concatenated onto this message
# has been dropped; the printed text is unchanged.
print('Done!')