-
Notifications
You must be signed in to change notification settings - Fork 1
/
extract_tool_xml.py
104 lines (96 loc) · 4.8 KB
/
extract_tool_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""
Extract attributes and their values from the XML files of all tools.
It works on the local repository of all tools.
"""
import os
import urllib2
import pandas as pd
import xml.etree.ElementTree as et
import utils
import json
class ExtractToolXML:
    """
    Collect descriptive attributes from the XML files of all tools and
    export them as one CSV table.

    NOTE: every public method, including ``__init__``, is a classmethod, so
    the configuration (``repo_path``, ``tools_directories``,
    ``file_extension``) is stored on the class itself, not on an instance.
    """

    @classmethod
    def __init__(cls):
        """ Init method: remember the tools folder and list its entries. """
        cls.repo_path = 'data/alltools/'  # path to the directory containing all tools
        cls.tools_directories = list(os.listdir(cls.repo_path))
        cls.file_extension = 'xml'

    @classmethod
    def read_tool_directory(cls):
        """
        Loop through all the directories in the Galaxy tools folder, convert
        every tool XML file into a record and write the records to
        ``data/all_tools.csv`` next to this script.

        Only the first record seen for a given tool id is kept.
        Side effect: the process working directory is changed to the
        ``data`` folder beside this file before the CSV is written.
        """
        all_tools = list()
        seen_ids = set()  # ids already collected, for O(1) duplicate checks
        for directory in cls.tools_directories:
            tool_path = os.path.join(cls.repo_path, directory)
            # a stray plain file in the repository folder is not a tool directory
            if not os.path.isdir(tool_path):
                continue
            for file_name in os.listdir(tool_path):
                if not file_name.endswith(cls.file_extension):
                    continue
                record = cls.convert_xml_dataframe(os.path.join(tool_path, file_name))
                # None (or an empty record) means the file is not a usable tool
                if not record:
                    continue
                # remove any duplication of tool ids
                tool_id = record["id"]
                if tool_id not in seen_ids:
                    seen_ids.add(tool_id)
                    all_tools.append(record)
        tools_dataframe = pd.DataFrame(all_tools)
        dname = os.path.dirname(os.path.abspath(__file__))
        os.chdir(os.path.join(dname, 'data'))
        tools_dataframe.to_csv('all_tools.csv')

    @classmethod
    def convert_xml_dataframe(cls, xml_file_path):
        """
        Convert xml file of a tool to a record with its attributes.

        The record always holds "id" and "name"; "description", "inputs",
        "outputs", "help" and "edam_topics" are added when the matching
        child element is present and yields a value.

        Returns the record as a dict, or None when the file cannot be
        read/parsed, is not a ``<tool>`` with a non-empty id, or any other
        error occurs while extracting its attributes.
        """
        try:
            root = et.parse(xml_file_path).getroot()
            record_id = root.get("id")
            # read those xml only if it is a tool with a usable id
            if root.tag != "tool" or not record_id:
                return None
            record = {"id": record_id, "name": root.get("name")}
            for child in root:
                if child.tag == 'description':
                    record[child.tag] = str(child.text) if child.text else ""
                elif child.tag == 'inputs':
                    # only dataset parameters carry an input file format
                    data_params = [
                        param for param in child.findall('.//param')
                        if param.get('type') == 'data'
                    ]
                    record[child.tag] = cls._join_formats(data_params)
                elif child.tag == 'outputs':
                    record[child.tag] = cls._join_formats(child)
                elif child.tag == 'help':
                    help_section = cls._extract_help(child.text)
                    # help is optional: a missing section must not discard the tool
                    if help_section is not None:
                        record[child.tag] = help_section
                elif child.tag == "edam_topics":
                    edam_annotations = cls._fetch_edam_annotations(child)
                    if edam_annotations:
                        record[child.tag] = utils._remove_special_chars(edam_annotations)
            return record
        except Exception:
            # Best-effort extraction: unreadable or malformed files, and a
            # failed EDAM lookup, are skipped by the caller rather than
            # aborting the whole run.
            return None

    @staticmethod
    def _join_formats(elements):
        """ Return the distinct 'format' attributes of elements, comma-joined in first-seen order. """
        file_formats = list()
        for element in elements:
            file_format = element.get('format')
            if file_format is not None and file_format not in file_formats:
                file_formats.append(file_format)
        return ",".join(file_formats)

    @staticmethod
    def _extract_help(help_text):
        """ Return the cleaned paragraph following the first 'What it does'/'Syntax' paragraph, or None. """
        if not help_text:
            return None
        paragraphs = help_text.split('\n\n')
        # the last paragraph has no successor, so it cannot act as a heading
        for index, paragraph in enumerate(paragraphs[:-1]):
            if 'What it does' in paragraph or 'Syntax' in paragraph:
                return utils._remove_special_chars(paragraphs[index + 1])
        return None

    @staticmethod
    def _fetch_edam_annotations(edam_topics):
        """ Return description, label and synonyms of every edam_topic child, space-separated. """
        # accumulated across all topics so earlier topics are not overwritten
        edam_annotations = ''
        for topic in edam_topics:
            if topic.tag != "edam_topic":
                continue
            edam_text = urllib2.urlopen( 'https://www.ebi.ac.uk/ols/api/ontologies/edam/terms?iri=http://edamontology.org/' + topic.text )
            try:
                edam_json = json.loads(edam_text.read())
            finally:
                edam_text.close()
            edam_terms = edam_json["_embedded"]["terms"][0]
            edam_annotations += edam_terms["description"][0] + ' '
            edam_annotations += edam_terms["label"] + ' '
            for syn in edam_terms["synonyms"]:
                edam_annotations += syn + ' '
        return edam_annotations
if __name__ == "__main__":
    # Script entry point: build the extractor, then export every tool to CSV.
    ExtractToolXML().read_tool_directory()