-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathblast.py
executable file
·68 lines (61 loc) · 1.82 KB
/
blast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python
import click as ck
import numpy as np
import pandas as pd
import gzip
import math
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): MAXLEN is not referenced by any code visible in this file;
# presumably a maximum sequence length carried over from a related script --
# confirm before relying on it or removing it.
MAXLEN = 1000
@ck.command()
def main():
    """Correlate vector cosine similarity with normalised pairwise scores.

    Reads the score table returned by ``load_sw_scores()`` and a pickled
    DataFrame from ``data/vectors.pkl`` (columns ``proteins`` and
    ``vectors``).  For every pair ``(i, j)`` with ``i < j`` two values are
    collected:

    * the cosine similarity between vector ``i`` and vector ``j``;
    * the score of protein ``j`` under query ``i`` divided by the score of
      protein ``i`` against itself.

    A pair counts as ``0.0`` when the query has no score for the partner, or
    when the query's self-score is missing or zero (so the ratio is
    undefined).  The Spearman and Pearson correlations between the two
    series are printed.
    """
    res = load_sw_scores()

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle('data/vectors.pkl')
    proteins = df['proteins'].values
    vectors = df['vectors'].values

    # Pack the per-protein vectors into one (n_proteins, dim) float32 matrix
    # so that all pairwise cosine similarities come from a single call.
    vec_matrix = np.empty((len(vectors), len(vectors[0])), dtype=np.float32)
    for i, vec in enumerate(vectors):
        vec_matrix[i, :] = vec
    cosine_sim = cosine_similarity(vec_matrix)

    cosine = []
    blast = []
    n_proteins = len(proteins)
    for i, p1 in enumerate(proteins):
        # Look the query up once per row instead of once per pair.
        hits = res.get(p1, {})
        self_score = hits.get(p1)
        for j in range(i + 1, n_proteins):
            cosine.append(cosine_sim[i, j])
            score = hits.get(proteins[j])
            if score is None or not self_score:
                # No score for this pair, or nothing to normalise by: a
                # missing/zero self-score must not abort the whole run.
                blast.append(0.0)
            else:
                blast.append(score / self_score)

    print(spearmanr(cosine, blast))
    print(pearsonr(cosine, blast))
def load_blast_sim():
    """Load symmetric pairwise similarity scores from ``data/sim.blst``.

    Every non-blank line is split on whitespace: the first two fields are
    identifiers and the third is parsed as a float.  The score is stored in
    both directions, and when a pair occurs more than once the last
    occurrence wins.  Blank or whitespace-only lines are skipped.

    Returns:
        dict: ``{id_a: {id_b: score}}`` with ``[a][b]`` and ``[b][a]`` both
        populated.

    Raises:
        IndexError: a non-blank line has fewer than three fields.
        ValueError: the third field is not a valid float.
    """
    res = {}
    with open('data/sim.blst') as f:
        for line in f:
            it = line.split()
            if not it:
                # e.g. a trailing empty line at the end of the file
                continue
            p1, p2 = it[0], it[1]
            s = float(it[2])
            res.setdefault(p1, {})[p2] = s
            res.setdefault(p2, {})[p1] = s
    return res
def load_sw_scores():
    """Load per-query integer scores from ``data/scores.sw``.

    The file is processed line by line, with surrounding whitespace
    stripped:

    * a line starting with ``query:`` opens a new record -- the text after
      the prefix becomes the current query id and its table is reset to
      empty;
    * a line starting with ``score: `` is expected to look like
      ``<int> -- <target id>`` and stores the integer under the target id
      for the current query;
    * any other line is ignored.

    Returns:
        dict: ``{query_id: {target_id: score}}`` with ``int`` scores.

    Raises:
        ValueError: a score line appears before any query line, or the
            score field is not an integer.
        IndexError: a score line does not contain the ``' -- '`` separator.
    """
    res = {}
    cur_prot = None
    with open('data/scores.sw') as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if line.startswith('query:'):
                # NOTE(review): this prefix is matched without a trailing
                # space while 'score: ' includes one; if the file writes
                # "query: ID" the key keeps a leading space -- confirm
                # against the actual file format.
                cur_prot = line[6:]
                res[cur_prot] = {}
            elif line.startswith('score: '):
                if cur_prot is None:
                    # Previously this surfaced as an opaque unbound-variable
                    # error; report the malformed input explicitly instead.
                    raise ValueError(
                        'data/scores.sw line %d: score line before any '
                        'query line' % lineno)
                it = line[7:].split(' -- ')
                res[cur_prot][it[1]] = int(it[0])
    return res
# Invoke the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()