# mRMR.py
import numpy as np
from gromacs_utils import make_ndx, read_ndx, warn_file_exists, colors
import os
import cPickle
import time
from ConfigParser import RawConfigParser
def entropy(arr):
"""
Calculates the entropy of a distribution:
S = - SUM_i (p_i * ln p_i)
    This is the true entropy if arr is a normalized probability distribution; if the p_i are raw counts
    rather than probabilities, you must apply the appropriate normalization yourself (see h_fast below).
:param arr: a probability distribution
:return: entropy of arr
"""
return np.sum([-p * np.log(p) if p else 0 for p in arr])
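# Illustrative sanity check for entropy() (toy values, not part of the pipeline):
# a fair two-outcome distribution has entropy ln(2), and zero-probability entries
# contribute nothing.
#   >>> entropy([0.5, 0.5])    # ~0.6931 = ln(2)
#   >>> entropy([1.0, 0.0])    # 0.0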
def h_fast(x):
"""
Calculates the column entropies of the matrix x. This assumes that the columns of x have been binned but NOT
histogrammed. That is, it will perform the following calculation:
S = -SUM_i [n_i/N ln(n_i/N)]
This is actually implemented as:
S = ln(N) - 1/N * SUM_i[n_i ln n_i]
    If x is m x n dimensional, then this will return an array of length n.
:param x: An mxn matrix of observations. m is the number of observations, and n is the number of observables. In the
case of mRMR, m is the number of frames and n is the number of pairs of residues (x is essentially a processed xvg
from gmx mindist).
:return: column entropy (n dimensional array)
"""
n_samples, _ = x.shape
bin_range = [np.min(x), np.max(x)]
n_bins = bin_range[1] - bin_range[0] + 1
# Bincount along columns
hist = np.apply_along_axis(lambda y: np.histogram(y, bins=n_bins, range=bin_range)[0], axis=0, arr=x)
# Compute: -SUM_i[n_i ln n_i] along cols
count_entropy = np.apply_along_axis(entropy, axis=0, arr=hist)
# Compute: ln(N) - 1/N * SUM_i[n_i ln n_i]
return np.add(np.divide(count_entropy, n_samples), np.log(n_samples))
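# Illustrative check for h_fast() (hypothetical toy data): a single column of
# N = 4 binned observations [0, 0, 1, 1] has counts n = (2, 2), so
# S = ln(4) - (1/4)*(2*ln(2) + 2*ln(2)) = ln(2) ~ 0.6931.
#   >>> h_fast(np.array([[0], [0], [1], [1]]))    # array([ 0.6931...])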
def mi_fast(x, y):
"""
A clever algorithm for calculating the mutual information of the arrays x and y. Pulled from Kasson git: numpy_mi.py
:param x: an array that has been binned but not yet been histogrammed.
:param y: an array that has been binned but not yet been histogrammed.
:return: mutual information of x and y.
"""
n_samples = len(x)
bin_range = [min(np.min(x), np.min(y)), max(np.max(x), np.max(y))]
    n_bins = bin_range[1] - bin_range[0] + 1
xy_c = np.histogram2d(x, y, bins=n_bins, range=[bin_range, bin_range])[0]
x_c = np.sum(xy_c, 1)
y_c = np.sum(xy_c, 0)
Hxy = entropy(np.ravel(xy_c)) / n_samples + np.log(n_samples)
Hx = entropy(np.ravel(x_c)) / n_samples + np.log(n_samples)
Hy = entropy(np.ravel(y_c)) / n_samples + np.log(n_samples)
return Hx + Hy - Hxy
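# Illustrative properties of mi_fast() (toy data, not part of the pipeline): two
# identical columns share all of their information, so MI equals the column entropy,
# while independent columns give MI near zero (up to finite-sample bias).
#   >>> x = np.random.randint(0, 10, 1000)
#   >>> y = np.random.randint(0, 10, 1000)
#   >>> mi_fast(x, x)    # ~ entropy of x
#   >>> mi_fast(x, y)    # ~ 0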
class MRMRConfig(object):
def __init__(self, configuration_file, rewrite=False):
parser = RawConfigParser()
parser.read(configuration_file)
dist_filename = parser.get('files', 'distances-filename')
entropy_filename = parser.get('files', 'entropy-filename')
self.mRMR_filename = parser.get('files', 'mRMR-filename')
self.num_bins = parser.getint('parameters', 'num-bins')
self.bin_width = parser.getfloat('parameters', 'bin-width')
self.aa = parser.getboolean('parameters', 'aa')
self.chains = parser.getint('parameters', 'chains')
self.dist_range = [float(i) for i in parser.get('parameters', 'distance-range').split()]
self.weights = [float(i) for i in parser.get('parameters', 'weights').split()]
self.resi_select = parser.get('parameters', 'excluded-residues').split()
if not self.resi_select:
self.resi_select = None
print "Loading distance file %s" % dist_filename
init = time.time()
        self.dists = cPickle.load(open(dist_filename, 'rb'))
print "Loaded %s in %f seconds" % (dist_filename, time.time() - init)
# If no entropy has been provided, entropy must be calculated for permitted residues.
print colors.HEADER + "\tENTROPY CALCULATION" + colors.ENDC
if os.path.exists(entropy_filename) and not rewrite:
warn_file_exists(entropy_filename)
            self.entropy = cPickle.load(open(entropy_filename, 'rb'))
else:
print "No entropy file was found or you have chosen to rewrite. Calculating entropies..."
self.entropy = h_fast(self.dists)
print "Dumping entopies to", entropy_filename
            cPickle.dump(self.entropy, open(entropy_filename, 'wb'))
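    # A minimal configuration sketch, inferred from the options read in __init__ above.
    # The section and option names match the parser calls; the file names and values
    # below are placeholders, not defaults shipped with the code:
    #
    #   [files]
    #   distances-filename = dists.pkl
    #   entropy-filename = entropies.pkl
    #   mRMR-filename = mRMR_pairs.txt
    #
    #   [parameters]
    #   num-bins = 100
    #   bin-width = 0.05
    #   aa = True
    #   chains = 2
    #   distance-range = 2.0 5.0
    #   weights = 0.5 0.5
    #   excluded-residues = PRO GLY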
def _get_permitted_pairs(self, resi_tpr=None):
"""
        This method restricts the pairs considered in the mRMR algorithm based on their average
        distances and/or residue names. DEER measurements are reliable only for distances of roughly
        2 - 5 nm, so we keep only pairs whose average simulated distance falls within the allowed range
        (self.dist_range). Similarly, we may not wish to mutate residues that are integral to secondary
        structure (or to the lack of it), e.g. PRO, so any residue named in self.resi_select is
        eliminated from the calculation as well.
        The distance matrix self.dists (dists[i, j] = binned distance of pair j at frame i), the allowed
        distance range self.dist_range, and the excluded residue names self.resi_select (PDB naming
        scheme) all come from the configuration file read in __init__.
        :param resi_tpr: a single tpr so that residue names can be mapped to atom indices and therefore
        to pair numbers.
"""
_, num_pairs = self.dists.shape
permitted_pairs = [i for i in range(num_pairs)]
# EXCLUDE BASED ON DISTANCE
if self.dist_range is not None:
bin_range = np.divide(self.dist_range, self.bin_width)
print "Choosing pairs with distances between", self.dist_range[0], "and", self.dist_range[1], "nm"
print "That corresponds to bin numbers between", bin_range[0], "and", bin_range[1]
bin_mean = np.mean(self.dists, axis=0)
for i in range(num_pairs):
if not (bin_range[0] < bin_mean[i] < bin_range[1]):
permitted_pairs.remove(i)
# EXCLUDE BASED ON RESIDUE NAME
if self.resi_select is not None:
            # Generate two index files:
            # 1. CA/BB atoms belonging to the excluded residue names (self.resi_select)
            # 2. All CA/BB atoms
full_selection = "(resname " + " or resname ".join(self.resi_select) + ")"
if self.aa:
selection = 'mol 1 and name CA and ' + full_selection + \
'; mol 2 and name CA and ' + full_selection
else:
selection = 'mol 1 and name BB and ' + full_selection + \
'; mol 2 and name BB and ' + full_selection
print "Eliminating pairs with selection:", self.resi_select
make_ndx(tpr_filename=resi_tpr, ndx_filename='selected_restrict_resis.ndx', selection=selection)
make_ndx(tpr_filename=resi_tpr, ndx_filename='all_resis.ndx', AA=self.aa)
restrict_dict = read_ndx('selected_restrict_resis.ndx')
all_dict = read_ndx('all_resis.ndx')
mol1all, mol2all, _, _ = all_dict.keys()
mol1restrict, mol2restrict = restrict_dict.keys()
pair_count = 0
for resi1 in all_dict[mol1all]:
for resi2 in all_dict[mol2all]:
if ((resi1 in restrict_dict[mol1restrict]) or
(resi2 in restrict_dict[mol2restrict])) and \
(pair_count in permitted_pairs):
permitted_pairs.remove(pair_count)
pair_count += 1
return permitted_pairs
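    # Example of the distance-to-bin mapping used in _get_permitted_pairs() (hypothetical
    # numbers): with bin-width = 0.05 nm, a distance-range of "2.0 5.0" nm corresponds to
    # mean bin numbers between 2.0/0.05 = 40 and 5.0/0.05 = 100, so pairs whose mean binned
    # distance falls outside (40, 100) are removed from permitted_pairs.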
def mRMR(self, iters, store_in_memory=True, resi_tpr=None):
"""
        This method is an implementation of the mRMR algorithm described in http://dx.doi.org/10.1142/S0219720005001004
        (here, however, we use a weighted sum of both the MID and MIQ criteria, not just one or the other).
        The inputs other than the parameters below come from the configuration file read in __init__:
        self.dists is the matrix of distances (dists[i, j] = distance of pair j at frame i) and MUST be
        pre-binned; because of memory concerns the xvg is read in as bin numbers (integers) rather than
        raw distances (floats), which is why self.bin_width is needed to map bin numbers back to absolute
        distances in _get_permitted_pairs(). self.entropy holds the per-pair entropies (loaded from, or
        dumped to, the entropy file in __init__), self.weights are the [MID_weight, MIQ_weight] criterion
        weights, self.dist_range is the allowed distance range (e.g. [2.0, 5.0] for the usual DEER case),
        and self.resi_select lists residue names (PDB naming scheme) to eliminate from the calculation.
        :param iters: number of mRMR pairs you wish to calculate.
        :param store_in_memory: if True, cache pair-pair mutual information in a matrix so that
        symmetric entries are not recomputed.
        :param resi_tpr: a single tpr so that residue names can be mapped to atom indices and therefore
        to pair numbers.
        :return: an array of pair numbers, ordered from highest to lowest mRMR.
"""
_, n_pairs = self.dists.shape # Get the number of pairs in your matrix
print "The number of pairs is", n_pairs
permitted_resis = self._get_permitted_pairs(resi_tpr=resi_tpr)
print "Saving permitted pairs to permitted_pairs.txt in this directory..."
np.savetxt('permitted_pairs.txt', permitted_resis)
# Initialize a set omega. At each iteration you will pull a pair from this set
# based on the mRMR criterion of your choice (subtraction or division). The pair
# will be placed into the set s. s is the final set of highest mRMR pairs.
omega = [i for i in permitted_resis] # At first, all permitted pairs are present in omega
s = [] # s starts as an empty set
if store_in_memory:
mi_mat = np.zeros(shape=(n_pairs, n_pairs)) # To-do: make mi_mat only as big as num of permitted resis
print colors.HEADER + "\n\tBEGINNING mRMR CALCULATION" + colors.ENDC
mRMR_file = open(self.mRMR_filename, "w")
for m in range(iters): # Number of times you want to perform mRMR
print "m =", m, "calculation"
start_time = time.time()
if m == 0: # At the first step, choose the highest entropy pair.
temp_H = [self.entropy[i] for i in permitted_resis]
s.append(omega.pop(np.argmax(temp_H)))
# Now you have lost the nice property that omega[i] = i. You must be
# careful in all following calculations to ensure you pop the right
# element of omega.
# Must calculate the mutual information between pairs of pairs.
# MI = H(Y_i) + H(Y_j) - H(Y_i, Y_j)
                # Then the final mRMR score combines H(Y_test) minus (MID) and divided by
                # (MIQ) the mean of MI(Y_test, Y_s), weighted by self.weights.
else:
# high_idx are the high mRMR pairs currently in the set s. You
# will evaluate the mutual information between each of these
# pairs and a test pair. The test pair is test_idx
mRMR_scores = []
for test_idx in omega: # Loop through test pairs
pair_pair_mi = 0
for high_idx in s: # Loop through mRMR pairs
if store_in_memory:
                            # First, check whether the pair-pair mutual information still needs to be
                            # computed and stored - by symmetry, a previously stored value for either
                            # (test, high) or (high, test) can be reused instead of recalculated.
if (mi_mat[test_idx, high_idx] == 0) and (mi_mat[high_idx, test_idx] == 0):
mi_mat[test_idx, high_idx] = mi_fast(self.dists[:, test_idx], self.dists[:, high_idx])
mi_mat[high_idx, test_idx] = mi_mat[test_idx, high_idx]
pair_pair_mi += mi_mat[test_idx, high_idx]
else:
pair_pair_mi += mi_fast(self.dists[:, test_idx], self.dists[:, high_idx])
                    mRMR_scores.append(self.weights[0] * (self.entropy[test_idx] - pair_pair_mi / m) +
                                       self.weights[1] * self.entropy[test_idx] / (pair_pair_mi / m))
omega_idx = np.argmax(mRMR_scores) # Find the test pair with the highest mRMR value
s.append(omega.pop(omega_idx)) # Pop that test pair from omega, place in s
mRMR_file.write("%i\n" % s[m])
end_time = time.time()
print "Time elapsed for m =", m, "calculation:", end_time-start_time
mRMR_file.close()
return np.array(s)
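# A minimal usage sketch. The configuration and tpr file names below are placeholders
# (not files shipped with this module); adapt them to your own setup.
if __name__ == '__main__':
    config = MRMRConfig('mRMR.cfg', rewrite=False)
    top_pairs = config.mRMR(iters=10, store_in_memory=True, resi_tpr='topol.tpr')
    print "Selected pairs (highest to lowest mRMR):", top_pairs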