-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathpreprocessing_python.py
317 lines (273 loc) · 12.7 KB
/
preprocessing_python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
import numpy as np, json, os, matplotlib.pyplot as plt
#========================================================================================#
def get_distance_matrix(M):
'''
##############################################
Input
M = Data matrix. Rows are datapoints (e.g. cell) and columns are features (e.g. genes)
Output (D)
D = All Pairwise euclidian distances between points in M
##############################################
'''
D = np.zeros((M.shape[0],M.shape[0]))
for i in range(M.shape[0]):
Mtiled = np.tile(M[i,:][None,:],(M.shape[0],1))
D[i,:] = np.sqrt(np.sum((Mtiled - M)**2, axis=1))
return D
#========================================================================================#
def filter_cells(E, min_reads):
'''
##############################################
Filter out cells with total UMI count < min_reads
Input
E = Expression matrix. Rows correspond to cells and columns to genes
min_reads = Minimum number of reads required for a cell to survive filtering
Output (Efiltered, cell_filter)
Efiltered = Filtered expression matrix
cell_filter = Boolean mask that reports filtering. True means that the cell is
kept; False means the cells is removed
##############################################
'''
total_counts = np.sum(E,axis=1)
cell_filter = total_counts >= min_reads
if np.sum(cell_filter) == 0:
return None, cell_filter
else: return E[cell_filter,:],cell_filter
#========================================================================================#
def row_normalize(E):
'''
##############################################
Normalize so that every cell has the same total read count. Only use genes that make
up <5% of the total reads in every cell.
Input
E = Expression matrix. Rows correspond to cells and columns to genes.
Output (Enormalized)
Enormalized = Normalized version of the original expression matrix.
##############################################
'''
total_counts = np.sum(E,axis=1)
tc_tiled = np.tile(total_counts[:,None],(1,E.shape[1]))
included = np.all(E < tc_tiled * 0.05, axis=0)
tc_include = np.sum(E[:,included],axis=1)
tc_tiled = np.tile(tc_include[:,None],(1,E.shape[1]))+.000001
return E / tc_tiled * np.mean(total_counts)
#========================================================================================#
def Zscore(E):
'''
##############################################
Z-score standardize the expression of each gene.
Input
E = Expression matrix. Rows correspond to cells and columns to genes.
Output (EZ)
EZ = Z-score standardized version of the original expression matrix.
##############################################
'''
means = np.tile(np.mean(E,axis=0)[None,:],(E.shape[0],1))
stds = np.tile(np.std(E,axis=0)[None,:],(E.shape[0],1))
return (E - means) / (stds + .0001)
#========================================================================================#
def filter_genes(E, Ecutoff, Vcutoff):
'''
##############################################
Filter genes based on expression level and variability. Only keep genes with mean
expression > Ecutoff and Fano factor > Vcutoff
Input
E = Expression matrix. Rows correspond to cells and columns to genes.
Ecutoff = minimum mean expression to keep a gene
Vcutoff = minimum Fano factor to keep a genes
Output (Efiltered, gene_filter)
Efiltered = Filtered expression matrix
gene_filter = Boolean mask that reports filtering. True means that the gene is
kept; False means the gene is removed
##############################################
'''
mean_filter = np.mean(E,axis=0)> Ecutoff
var_filter = np.var(E,axis=0) / (np.mean(E,axis=0)+.0001) > Vcutoff
gene_filter = np.all([mean_filter,var_filter],axis=0)
return E[:,gene_filter], gene_filter
#========================================================================================#
def get_knn_edges(dmat, k):
'''
##############################################
Calculate knn-graph edges from a distance matrix.
Input
dmat = Square distance matrix. (dmat)_ij = the distance between i and k
k = Number of edges to assign each node (i.e. k in the knn-graph)
Output (edge_list)
edge_list = A list of unique undirected edges in the knn graph. Each edge comes in
the form of a tuple (i,j) representing an edge between i and j.
##############################################
'''
edge_dict = {}
for i in range(dmat.shape[0]):
for j in np.nonzero(dmat[i,:] <= sorted(dmat[i,:])[k])[0]:
if i != j:
ii,jj = tuple(sorted([i,j]))
edge_dict[(ii,jj)] = dmat[i,j]
return edge_dict.keys()
#========================================================================================#
def get_PCA(E,numpc):
'''
##############################################
Reduce to the dimension of a data matrix using principal components analysis (PCA)
Input
E = Data matrix. Rows are data points (e.g. cells) and columns are features (e.g. genes)
numpc = Final number of PCs to use
Output (Epca)
Epca = PC-reduced data matrix. If E has N rows, then Epca has shape (N,numpc).
##############################################
'''
from sklearn.decomposition import PCA
pca = PCA(n_components=numpc)
return pca.fit_transform(E)
#========================================================================================#
def save_spring_dir(E,D,k,gene_list,project_directory, custom_colors={},cell_groupings={}, use_genes=[], coarse_grain_X=1):
'''
##############################################
Builds a SPRING project directory and transforms data into SPRING-readable formats
Input (Required)
E = (numpy array) matrix of gene expression. Rows correspond to
celles and columns correspond to genes.
D = (numpy array) distance matrix for construction of knn graph.
Any distance matrix can be used as long as higher values
correspond to greater distances.
k = Number of edges assigned to each node in knn graph
gene_list = An ordered list of gene names with length length E.shape[1]
project_directory = Path to a directory where SPRING readable files will be
written. The directory does not have to exist before running
this function.
Input (Optional)
cell_groupings = Dictionary with one key-value pair for each cell grouping.
The key is the name of the grouping (e.g. "SampleID") and
the value is a list of labels (e.g. ["sample1","sample2"...])
If there are N cells total (i.e. E.shape[0] == N), then the
list of labels should have N entries.
custom_colors = Dictionary with one key-value pair for each custom color.
The key is the name of the color track and the value is a
list of scalar values (i.e. color intensities). If there are
N cells total (i.e. E.shape[0] == N), then the list of labels
should have N entries.
coarse_grain_X = The fold-change reduction in nodes to achieve through graph
coarse graining. If 1, then no coarse graining will be
performed
##############################################
'''
# Convert arry data types
E = np.array(E.tolist())
D = np.array(D.tolist())
if not os.path.exists(project_directory):
os.makedirs(project_directory)
if not project_directory[-1] == '/': project_directory += '/'
# Build graph
#print 'Building graph'
edges = get_knn_edges(D,k)
# Coarse grain
if coarse_grain_X < 1:
edges,E,custom_colors,cell_groupings = coarse_grain(edges,E,k,custom_colors,cell_groupings,coarse_grain_X)
# save genesets
#print 'Saving gene sets'
custom_colors['Uniform'] = np.zeros(E.shape[0])
write_color_tracks(custom_colors, project_directory+'color_data_gene_sets.csv')
all = []
# save gene colortracks
#print 'Saving coloring tracks'
if not os.path.exists(project_directory+'gene_colors'):
os.makedirs(project_directory+'gene_colors')
II = len(gene_list) / 50 + 1
for j in range(50):
fname = project_directory+'/gene_colors/color_data_all_genes-'+repr(j)+'.csv'
if len(use_genes) > 0: all_gene_colors = {g : E[:,i+II*j] for i,g in enumerate(gene_list[II*j:II*(j+1)]) if g in use_genes}
else: all_gene_colors = {g : E[:,i+II*j] for i,g in enumerate(gene_list[II*j:II*(j+1)]) if np.mean(E[:,i+II*j])>0.05}
write_color_tracks(all_gene_colors, fname)
all += all_gene_colors.keys()
# Create and save a dictionary of color profiles to be used by the visualizer
#print 'Color stats'
color_stats = {}
for i in range(E.shape[1]):
mean = float(np.mean(E[:,i]))
std = float(np.std(E[:,i]))
max = float(np.max(E[:,i]))
centile = float(np.percentile(E[:,i],99.6))
color_stats[gene_list[i]] = (mean,std,0,max,centile)
for k,v in custom_colors.items():
color_stats[k] = (0,1,np.min(v),np.max(v)+.01,np.percentile(v,99))
json.dump(color_stats,open(project_directory+'/color_stats.json','w'),indent=4, sort_keys=True)
# save cell labels
#print 'Saving categorical color data'
categorical_coloring_data = {}
for k,labels in cell_groupings.items():
labels = [(l if type(l)==str else repr(l)) for l in labels]
label_colors = {l:frac_to_hex(float(i)/len(set(labels))) for i,l in enumerate(list(set(labels)))}
categorical_coloring_data[k] = {'label_colors':label_colors, 'label_list':labels}
json.dump(categorical_coloring_data,open(project_directory+'/categorical_coloring_data.json','w'),indent=4)
#print 'Writing graph'
nodes = [{'name':i,'number':i} for i in range(E.shape[0])]
edges = [{'source':i, 'target':j, 'distance':0} for i,j in edges]
out = {'nodes':nodes,'links':edges}
open(project_directory+'graph_data.json','w').write(json.dumps(out,indent=4, separators=(',', ': ')))
#========================================================================================#
def coarse_grain(edges,E,k,custom_colors,cell_groupings,coarse_grain_X):
# make a list of base nodes, give each one a unique index
# make -1 be a special index that means "unassigned"
# loop through edges, each one member has a label but the other does not, propogate
# repeat until all nodes are assigned
N = len(E)
node_index = np.ones(N) * -1
base_filter = np.random.uniform(0,1,len(node_index)) < coarse_grain_X
node_index[base_filter] = np.arange(np.sum(base_filter))
for count,(i,j) in enumerate(edges):
if node_index[i] != -1 and node_index[j] == -1:
node_index[j] = node_index[i]
if node_index[j] != -1 and node_index[i] == -1:
node_index[i] = node_index[j]
# now make a new list of edges and color vector and coordinates
new_E = []
new_custom_colors = {k:[] for k in custom_colors}
new_cell_groupings = {k:[] for k in cell_groupings}
for i in range(np.sum(base_filter)):
new_E.append(np.mean(E[node_index==i,:],axis=0))
for k in new_custom_colors:
new_custom_colors[k].append(np.mean(np.array(custom_colors[k])[node_index==i]))
for k in new_cell_groupings:
new_cell_groupings[k].append(most_common([cell_groupings[k][ii] for ii in np.nonzero(node_index==i)[0]]))
new_E = np.array(new_E)
new_edges = {}
for i,j in edges:
ne = tuple(sorted([node_index[i],node_index[j]]))
if not ne in new_edges: new_edges[ne] = 0.
new_edges[ne] += 1.
edge_weights = [v for k,v in new_edges.items()]
cutoff = np.percentile(edge_weights,100./len(edge_weights)*coarse_grain_X*len(edges)*1.5)
new_edges = [e for e,v in new_edges.items() if v >= cutoff]
return new_edges,new_E,new_custom_colors,new_cell_groupings
#========================================================================================#
def most_common(names):
counts = {n:0 for n in names}
for n in names: counts[n] += 1
highest = np.max([v for k,v in counts.items()])
for k,v in counts.items():
if v==highest: return k
#========================================================================================#
def row_sum_normalize(A):
print A.shape
d = np.sum(A,axis=1)
A = A / np.tile(d[:,None],(1,A.shape[1]))
return A
#========================================================================================#
def write_graph(n_nodes, edges,path):
nodes = [{'name':i,'number':i} for i in range(n_nodes)]
edges = [{'source':i, 'target':j, 'distance':0} for i,j in edges]
out = {'nodes':nodes,'links':edges}
open(path+'/graph_data.json','w').write(json.dumps(out,indent=4, separators=(',', ': ')))
#========================================================================================#
def write_color_tracks(ctracks, fname):
out = []
for name,score in ctracks.items():
line = ','.join([name]+[repr(round(x,1)) for x in score])
out += [line]
out = sorted(out,key=lambda x: x.split(',')[0])
open(fname,'w').write('\n'.join(out))
#========================================================================================#
def frac_to_hex(frac):
rgb = tuple(np.array(np.array(plt.cm.jet(frac)[:3])*255,dtype=int))
return '#%02x%02x%02x' % rgb