-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatchms_scoring.py
179 lines (168 loc) · 7.8 KB
/
matchms_scoring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python
"""
Author: Junda Huang
Script using matchms scoring functions to get matching scores
for imput spectra in mgf/msp again mgf/msp spetra references library
spectra
Command line input: python3 matchms_scoring.py <query>.mgf
<referrence_library>.msp/mgf <output>.txt
"""
# import statements
from sys import argv
import numpy as np
from matchms.importing import load_from_mgf
from matchms.importing import load_from_msp
from matchms.filtering import default_filters
from matchms.filtering import normalize_intensities
from matchms import calculate_scores, Spectrum, Scores
from matchms.similarity import CosineGreedy
from matchms.Spikes import Spikes
# functions
def matchms_score(q, q_spectra, ref_spectra, reference):
"""
"""
if '.msp' in reference:
for r in load_from_msp(reference):
#tolerence = float(q.metadata['precursormz'])
tolerence = float(q.metadata['pepmass'][0])
if 'precursormz' in r.metadata.keys():
target = float(r.metadata['precursormz'].replace(',','.'))
if abs(target - tolerence) <= 0.5:
r = normalize_intensities(default_filters(r))
ref_spectra.append(r)
elif '.mgf' in reference:
for r in load_from_mgf(reference):
tolerence = float(q.metadata['precursormz'])
#tolerence = float(q.metadata['pepmass'][0])
if 'pepmass' in r.metadata.keys():
target = float(r.metadata['pepmass'][0])
#target = float(r.metadata['precursormz'])
if abs(target - tolerence) <= 0.01:
#if abs(target - tolerence) <= 5 * tolerence * (10**(-6)):
r = normalize_intensities(default_filters(r))
ref_spectra.append(r)
if ref_spectra != []:
matches = calculate_scores(ref_spectra, q_spectra, CosineGreedy())
return matches
else:
return None
def un_normalize(spectrum):
"""
"""
s = spectrum.clone()
max_in = np.max(s.peaks.intensities)
mz, intensity = s.peaks
un_norm = intensity * max_in
s.peaks = Spikes(mz = mz, intensities = un_norm)
return s
def matchms_to_file(mgf_file, references_file, output):
"""
Find spectra matches from references library, calculate matches scores
and write into output file
Params:
mgf_file(string): file name of the query mgf file
references_file(string): file name of the reference library file (msp)
output(string): file name of the output text file
Returns:
output(string): file name of the output text file
"""
queries = load_from_mgf(mgf_file)
#queries = load_from_msp('test.msp')
#queries = load_from_mgf('Qe05947-neg.mgf')
#references = load_from_msp(references_file_mgf)
with open (output, 'w') as out:
for q in queries:
q_spectra = [normalize_intensities(default_filters(q))]
ref_spectra = []
matches = matchms_score(q, q_spectra, ref_spectra, \
references_file)
if matches != None:
for match in matches:
(reference, query, match) = match
if reference is not query and match["matches"] >= 1 and \
abs((int(reference.metadata['rtinseconds'])/60) \
-(int(query.metadata['retention'])/1000000))<=0.4:
#out.write(f"Reference precursormz:\
# {reference.metadata['precursormz']}\n")
out.write(f"Reference precursormz:\
{reference.metadata['pepmass']}\n")
out.write(f"Reference rettime:\
{int(reference.metadata['rtinseconds'])/60}\n")
#out.write(f"Reference Name:\
# {reference.metadata['name']}\n")
if 'formula' in reference.metadata.keys():
out.write(f"Reference Formula:\
{reference.metadata['formula']}\n")
out.write(f"Query rettime:\
{int(query.metadata['retention'])/1000000}\n")
#out.write(f"Query rettime:\
# {int(query.metadata['rtinseconds'])/60}\n")
out.write(f"Score: {match['score']:.4f}\n")
out.write(f"Number of matching peaks:\
{match['matches']}\n")
out.write("----------------------------\n")
if match['score'] >= 0.95 and match["matches"] >= 5:
spec_r = un_normalize(reference).plot()
spec_q = un_normalize(q_spectra[0]).plot()
spec_r.savefig\
('msclust-aif/newspecmatch/05012022/outputlibmsms/lib{}.png'.\
format(reference.metadata['pepmass'][0]))
spec_q.savefig\
('msclust-aif/newspecmatch/05012022/outputlibmsms/aif{}.png'.\
format(query.metadata['precursormz']))
'''
spec_r = un_normalize(reference).plot()
spec_q = un_normalize(q_spectra[0]).plot()
spec_r.savefig('output/{}.png'.\
format(reference.metadata['pepmass']))
spec_q.savefig('output/{}.png'.\
format(query.metadata['precursormz']))'''
"""
for r in load_from_msp(references_file):
tolerence = float(q.metadata['precursormz'])
#tolerence = float(q.metadata['pepmass'][0])
if 'precursormz' in r.metadata.keys():
target = float(r.metadata['precursormz'].replace(',','.'))
if abs(target - tolerence) <= 0.5:
r = normalize_intensities(default_filters(r))
ref_spectra.append(r)
if ref_spectra != []:
for match in calculate_scores(ref_spectra, q_spectra, \
CosineGreedy()):
(reference, query, match) = match
if reference is not query and match["matches"] >= 1:
out.write(f"Reference precursormz:\
{reference.metadata['precursormz']}\n")
out.write(f"Reference Name:\
{reference.metadata['name']}\n")
out.write(f"Reference Formula:\
{reference.metadata['formula']}\n")
out.write(f"Query scan id:\
{query.metadata['scan nr']}\n")
out.write(f"Query mass:\
{query.metadata['precursormz']}\n")
out.write(f"Score: {match['score']:.4f}\n")
out.write(f"Number of matching peaks:\
{match['matches']}\n")
out.write("----------------------------\n")"""
return matches
def main(query, ref, output):
'''
The main function
Params:
mgf_file(string): file name of the query mgf file
references_file(string): file name of the reference library file (msp)
output(string): file name of the output text file
Returns:
output(string): file name of the output text file
'''
matches = matchms_to_file(query, ref, output)
return matches
# main
if __name__ == '__main__':
# parsing files from command line
query = argv[1]
references_library = argv[2]
output_file = argv[3]
# main
scores = main(query, references_library, output_file)