-
Notifications
You must be signed in to change notification settings - Fork 0
/
domainanalyzer.py
124 lines (124 loc) · 5.75 KB
/
domainanalyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Maps each column-style ``arg_type`` to the column label it selects from the
# stacked matches table. Labels are kept exactly as originally spelled.
# NOTE(review): InterProScan JSON may spell two of these "proteinClass" and
# "goXRefs" -- confirm against real output before changing the labels.
_MATCH_COLUMNS = {
    "signature": "signature",
    "location": "locations",
    "evalue": "evalue",
    "score": "score",
    "modelac": "model-ac",
    "scope": "scope",
    "accession": "accession",
    "name": "name",
    "proteinClass": "ProteinClass",
    "graftPoint": "graftPoint",
    "goxRefs": "goxRefs",
}


def _load_interpro_results(path):
    """Return the ``"results"`` list parsed from the JSON file at *path*."""
    import json

    with open(path, encoding="utf-8") as handle:
        return json.load(handle)["results"]


def _matches_frame(results):
    """Stack the ``"matches"`` records of every result into one DataFrame."""
    import pandas as pd

    return pd.concat([pd.DataFrame(entry["matches"]) for entry in results])


def _signature_values(results):
    """Return a one-column frame holding each signature dict's values view."""
    signatures = _matches_frame(results)["signature"]
    return signatures.apply(lambda signature: signature.values()).to_frame()


def bacterialProteomeDomainAnalyzer(file, arg_type=None):
    """Extract one view of an InterProScan JSON prediction file.

    The file is expected to hold a top-level ``"results"`` list whose items
    carry a ``"sequence"`` string and a ``"matches"`` list of records. The
    matches of all results are stacked into a single pandas DataFrame, and
    ``arg_type`` selects which part of the data is returned.

    Keyword arguments:
    file -- path of the JSON file written by InterProScan
    arg_type -- name of the requested view:
        "sequence": all sequences joined into one string;
        "interpro_normalize": the full stacked matches DataFrame;
        "signature", "location", "evalue", "score", "modelac", "scope",
        "accession", "name", "proteinClass", "graftPoint", "goxRefs":
        the corresponding column of that DataFrame, as a Series;
        "prediction_locations": one-column DataFrame with the values view
        of every signature dict;
        "getdomains": list of lists with the non-dict, non-empty values of
        every signature dict.
    Return: the requested view, or None when arg_type is not one of the
    names above (the file is not opened in that case).
    Raises: KeyError when the requested key or column is absent from the
    data; OSError / json.JSONDecodeError when the file cannot be read.
    """
    # Non-string selectors can never match a view name; return early so that
    # unhashable values do not break the dictionary lookup below.
    if not isinstance(arg_type, str):
        return None

    if arg_type == "sequence":
        return "".join(
            entry["sequence"] for entry in _load_interpro_results(file)
        )

    if arg_type in _MATCH_COLUMNS:
        frame = _matches_frame(_load_interpro_results(file))
        return frame[_MATCH_COLUMNS[arg_type]]

    if arg_type == "interpro_normalize":
        return _matches_frame(_load_interpro_results(file))

    if arg_type == "prediction_locations":
        return _signature_values(_load_interpro_results(file))

    if arg_type == "getdomains":
        domains = []
        values_column = _signature_values(_load_interpro_results(file))
        for values in values_column["signature"]:
            # Keep only the scalar fields: nested dicts and empty or missing
            # entries are dropped.
            domains.append([
                value
                for value in values
                if not isinstance(value, dict)
                and value is not None
                and value != ""
            ])
        return domains

    return None