From 048e5de182268a4a1db65e26f61b2c1b4602ce01 Mon Sep 17 00:00:00 2001
From: Brian Hie
Date: Mon, 18 Nov 2019 10:11:29 -0500
Subject: [PATCH 1/2] implement sketch-based acceleration of integration

---
 bin/mouse_brain.py          |  14 ++--
 bin/mouse_brain_sketched.py |  87 +++++++++++++++++++++
 scanorama/__init__.py       |   2 +
 scanorama/scanorama.py      | 147 +++++++++++++++++++-----------------
 setup.py                    |   5 +-
 5 files changed, 176 insertions(+), 79 deletions(-)
 create mode 100644 bin/mouse_brain_sketched.py

diff --git a/bin/mouse_brain.py b/bin/mouse_brain.py
index 83bbee3..bd02236 100644
--- a/bin/mouse_brain.py
+++ b/bin/mouse_brain.py
@@ -28,25 +28,23 @@
 if __name__ == '__main__':
     process(data_names, min_trans=100)
-    
+
     datasets, genes_list, n_cells = load_names(data_names)
     datasets, genes = merge_datasets(datasets, genes_list, ds_names=data_names)
     datasets_dimred, genes = process_data(datasets, genes, verbose=True)
-    
+
     t0 = time()
     datasets_dimred = assemble(
         datasets_dimred, batch_size=BATCH_SIZE,
-        geosketch=True, geosketch_max=6900
     )
     print('Integrated panoramas in {:.3f}s'.format(time() - t0))
-    
+
     t0 = time()
     datasets_dimred, datasets, genes = correct(
         datasets, genes_list, ds_names=data_names,
         return_dimred=True, batch_size=BATCH_SIZE,
-        geosketch=True, geosketch_max=6900
     )
     print('Integrated and batch corrected panoramas in {:.3f}s'
           .format(time() - t0))
@@ -59,12 +57,12 @@
         names.append(data_names[i])
         curr_label += 1
     labels = np.array(labels, dtype=int)
-    
+
     mouse_brain_genes = [
         'Gja1', 'Flt1', 'Gabra6', 'Syt1', 'Gabrb2', 'Gabra1',
         'Meg3', 'Mbp', 'Rgs5',
     ]
-    
+
     # Downsample for visualization purposes
     datasets_dimred = []
     for i in range(len(data_names)):
@@ -83,7 +81,7 @@
                           image_suffix='.png')
     np.savetxt('data/{}_embedding.txt'.format(NAMESPACE),
                embedding, delimiter='\t')
-    
+
     cell_labels = (
         open('data/cell_labels/mouse_brain_cluster.txt')
         .read().rstrip().split()
diff --git a/bin/mouse_brain_sketched.py b/bin/mouse_brain_sketched.py
new file mode 100644
index 0000000..24e96f0
--- /dev/null
+++ b/bin/mouse_brain_sketched.py
@@ -0,0 +1,87 @@
+import numpy as np
+from scanorama import *
+from scipy.sparse import vstack
+from sklearn.preprocessing import normalize, LabelEncoder
+import sys
+from time import time
+
+from benchmark import write_table
+from process import load_names, process
+
+np.random.seed(0)
+
+NAMESPACE = 'mouse_brain_sketched'
+BATCH_SIZE = 10000
+
+data_names = [
+    'data/mouse_brain/nuclei',
+    'data/mouse_brain/dropviz/Cerebellum_ALT',
+    'data/mouse_brain/dropviz/Cortex_noRep5_FRONTALonly',
+    'data/mouse_brain/dropviz/Cortex_noRep5_POSTERIORonly',
+    'data/mouse_brain/dropviz/EntoPeduncular',
+    'data/mouse_brain/dropviz/GlobusPallidus',
+    'data/mouse_brain/dropviz/Hippocampus',
+    'data/mouse_brain/dropviz/Striatum',
+    'data/mouse_brain/dropviz/SubstantiaNigra',
+    'data/mouse_brain/dropviz/Thalamus',
+]
+
+if __name__ == '__main__':
+    process(data_names, min_trans=100)
+
+    datasets, genes_list, n_cells = load_names(data_names)
+
+    datasets, genes = merge_datasets(datasets, genes_list, ds_names=data_names)
+
+    datasets_dimred, genes = process_data(datasets, genes, verbose=True)
+
+    t0 = time()
+    datasets_dimred, genes = integrate(
+        datasets, genes_list, ds_names=data_names,
+        sketch=True, sketch_method='geosketch', sketch_max=2000,
+    )
+    print('Sketched and integrated panoramas in {:.3f}s'
+          .format(time() - t0))
+
+    labels = []
+    names = []
+    curr_label = 0
+    for i, a in enumerate(datasets_dimred):
+        labels += list(np.zeros(a.shape[0]) + curr_label)
+        names.append(data_names[i])
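+        # Advance the label so cells from each dataset get a distinct
+        # color in the visualizations below.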
+        curr_label += 1
+    labels = np.array(labels, dtype=int)
+
+    mouse_brain_genes = [
+        'Gja1', 'Flt1', 'Gabra6', 'Syt1', 'Gabrb2', 'Gabra1',
+        'Meg3', 'Mbp', 'Rgs5',
+    ]
+
+    # Downsample for visualization purposes
+    for i in range(len(data_names)):
+        ds = datasets_dimred[i]
+        rand_idx = np.random.choice(ds.shape[0], size=int(ds.shape[0]/10),
+                                    replace=False)
+        datasets_dimred[i] = ds[rand_idx, :]
+        datasets[i] = datasets[i][rand_idx, :]
+
+    embedding = visualize(datasets_dimred,
+                          labels, NAMESPACE + '_ds', names,
+                          gene_names=mouse_brain_genes, genes=genes,
+                          gene_expr=vstack(datasets),
+                          multicore_tsne=True,
+                          image_suffix='.png')
+
+    cell_labels = (
+        open('data/cell_labels/mouse_brain_cluster.txt')
+        .read().rstrip().split()
+    )
+    le = LabelEncoder().fit(cell_labels)
+    cell_labels = le.transform(cell_labels)
+    cell_types = le.classes_
+
+    visualize(None,
+              cell_labels, NAMESPACE + '_type', cell_types,
+              embedding=embedding, image_suffix='.png')
diff --git a/scanorama/__init__.py b/scanorama/__init__.py
index 9bb2aa2..b9e8080 100644
--- a/scanorama/__init__.py
+++ b/scanorama/__init__.py
@@ -1 +1,3 @@
 from .scanorama import *
+
+__version__ = '1.5'
diff --git a/scanorama/scanorama.py b/scanorama/scanorama.py
index ecba066..63aa92f 100644
--- a/scanorama/scanorama.py
+++ b/scanorama/scanorama.py
@@ -34,8 +34,7 @@
 def correct(datasets_full, genes_list, return_dimred=False,
             batch_size=BATCH_SIZE, verbose=VERBOSE, ds_names=None,
             dimred=DIMRED, approx=APPROX, sigma=SIGMA, alpha=ALPHA, knn=KNN,
-            return_dense=False, hvg=None, union=False,
-            geosketch=False, geosketch_max=20000, seed=0):
+            return_dense=False, hvg=None, union=False, seed=0):
     """Integrate and batch correct a list of data sets.
 
     Parameters
@@ -101,7 +100,6 @@
         expr_datasets=datasets, # Modified in place.
         verbose=verbose, knn=knn, sigma=sigma, approx=approx,
         alpha=alpha, ds_names=ds_names, batch_size=batch_size,
-        geosketch=geosketch, geosketch_max=geosketch_max,
     )
 
     if return_dense:
@@ -115,8 +113,8 @@
 # Integrate a list of data sets.
 def integrate(datasets_full, genes_list, batch_size=BATCH_SIZE,
               verbose=VERBOSE, ds_names=None, dimred=DIMRED, approx=APPROX,
-              sigma=SIGMA, alpha=ALPHA, knn=KNN, geosketch=False,
-              geosketch_max=20000, n_iter=1, union=False, hvg=None, seed=0):
+              sigma=SIGMA, alpha=ALPHA, knn=KNN, union=False, hvg=None, seed=0,
+              sketch=False, sketch_method='geosketch', sketch_max=10000):
     """Integrate a list of data sets.
 
     Parameters
@@ -147,6 +145,16 @@
         Use this number of top highly variable genes based on dispersion.
     seed: `int`, optional (default: 0)
         Random seed to use.
+    sketch: `bool`, optional (default: False)
+        Apply sketching-based acceleration by first downsampling the datasets.
+        See Hie et al., Cell Systems (2019).
+    sketch_method: {'geosketch', 'uniform'}, optional (default: 'geosketch')
+        Apply the given sketching method to the data. Only used if
+        `sketch=True`.
+    sketch_max: `int`, optional (default: 10000)
+        If a dataset has more cells than `sketch_max`, downsample to
+        `sketch_max` using the method provided in `sketch_method`. Only used
+        if `sketch=True`.
 
     Returns
     -------
@@ -165,12 +173,21 @@
     datasets_dimred, genes = process_data(datasets, genes, hvg=hvg,
                                           dimred=dimred)
 
-    for _ in range(n_iter):
+    if sketch:
+        datasets_dimred = integrate_sketch(
+            datasets_dimred, sketch_method=sketch_method, N=sketch_max,
+            integration_fn=assemble, integration_fn_args={
+                'verbose': verbose, 'knn': knn, 'sigma': sigma,
+                'approx': approx, 'alpha': alpha, 'ds_names': ds_names,
+                'batch_size': batch_size,
+            },
+        )
+
+    else:
         datasets_dimred = assemble(
             datasets_dimred,  # Assemble in low dimensional space.
             verbose=verbose, knn=knn, sigma=sigma, approx=approx,
             alpha=alpha, ds_names=ds_names, batch_size=batch_size,
-            geosketch=geosketch, geosketch_max=geosketch_max,
         )
 
     return datasets_dimred, genes
@@ -584,21 +601,10 @@ def fill_table(table, i, curr_ds, datasets, base_ds=0,
 
 # Fill table of alignment scores.
 def find_alignments_table(datasets, knn=KNN, approx=APPROX, verbose=VERBOSE,
-                          prenormalized=False, geosketch=False,
-                          geosketch_max=20000):
+                          prenormalized=False):
     if not prenormalized:
         datasets = [ normalize(ds, axis=1) for ds in datasets ]
 
-    if geosketch:
-        # Only match cells in geometric sketches.
-        from ample import gs, uniform
-        global gs_idxs
-        if gs_idxs is None:
-            gs_idxs = [ uniform(X, geosketch_max, replace=False)
-                        if X.shape[0] > geosketch_max else range(X.shape[0])
-                        for X in datasets ]
-        datasets = [ datasets[i][gs_idx, :] for i, gs_idx in enumerate(gs_idxs) ]
-
     table = {}
     for i in range(len(datasets)):
         if len(datasets[:i]) > 0:
@@ -631,13 +637,6 @@
             if verbose > 1:
                 table_print[i, j] += table1[(i, j)]
 
-            if geosketch:
-                # Translate matches within geometric sketches to original indices.
-                matches_mnn = matches[(i, j)]
-                matches[(i, j)] = [
-                    (gs_idxs[i][a], gs_idxs[j][b]) for a, b in matches_mnn
-                ]
-
     if verbose > 1:
         print(table_print)
     return table1, table_print, matches
@@ -646,12 +645,10 @@
 # Find the matching pairs of cells between datasets.
 def find_alignments(datasets, knn=KNN, approx=APPROX, verbose=VERBOSE,
-                    alpha=ALPHA, prenormalized=False,
-                    geosketch=False, geosketch_max=20000):
+                    alpha=ALPHA, prenormalized=False):
     table1, _, matches = find_alignments_table(
         datasets, knn=knn, approx=approx, verbose=verbose,
         prenormalized=prenormalized,
-        geosketch=geosketch, geosketch_max=geosketch_max
     )
 
     alignments = [ (i, j) for (i, j), val in reversed(
@@ -771,14 +768,13 @@ def transform(curr_ds, curr_ref, ds_ind, ref_ind, sigma=SIGMA, cn=False,
 def assemble(datasets, verbose=VERBOSE, view_match=False, knn=KNN,
              sigma=SIGMA, approx=APPROX, alpha=ALPHA, expr_datasets=None,
              ds_names=None, batch_size=None,
-             geosketch=False, geosketch_max=20000, alignments=None, matches=None):
+             alignments=None, matches=None):
     if len(datasets) == 1:
         return datasets
 
     if alignments is None and matches is None:
         alignments, matches = find_alignments(
             datasets, knn=knn, approx=approx, alpha=alpha, verbose=verbose,
-            geosketch=geosketch, geosketch_max=geosketch_max
         )
 
     ds_assembled = {}
@@ -951,6 +947,57 @@
     return datasets
 
+# Sketch-based acceleration of integration.
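+#
+# The function below proceeds in three steps, using the sketching functions
+# from the geosketch package: (1) downsample each dataset to at most N cells,
+# either by geometric sketching or uniformly at random; (2) integrate only
+# the small sketches with the usual integration routine (assemble() by
+# default); (3) propagate the result back to all cells by matching each
+# sketch cell to its nearest neighbors in the full dataset and smoothing the
+# resulting correction vectors with transform().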
+def integrate_sketch(datasets_dimred, sketch_method='geosketch', N=10000,
+                     integration_fn=assemble, integration_fn_args={}):
+
+    from geosketch import gs, uniform
+
+    if sketch_method.lower() in ('geosketch', 'gs'):
+        sampling_fn = gs
+    else:
+        sampling_fn = uniform
+
+    # Sketch each dataset.
+    sketch_idxs = [
+        sorted(set(sampling_fn(X, N, replace=False)))
+        if X.shape[0] > N else list(range(X.shape[0]))
+        for X in datasets_dimred
+    ]
+    datasets_sketch = [ X[idx] for X, idx in zip(datasets_dimred, sketch_idxs) ]
+
+    # Integrate the dataset sketches.
+    datasets_int = integration_fn(datasets_sketch[:], **integration_fn_args)
+
+    # Apply integrated coordinates back to full data.
+    for i, (X_dimred, X_sketch) in enumerate(zip(datasets_dimred, datasets_sketch)):
+        X_int = datasets_int[i]
+
+        # Match each sketch cell to its nearest neighbors in the full dataset.
+        neigh = NearestNeighbors(n_neighbors=3).fit(X_dimred)
+        _, neigh_idx = neigh.kneighbors(X_sketch)
+
+        ds_idxs, ref_idxs = [], []
+        for ref_idx in range(neigh_idx.shape[0]):
+            for k_idx in range(neigh_idx.shape[1]):
+                ds_idxs.append(neigh_idx[ref_idx, k_idx])
+                ref_idxs.append(ref_idx)
+
+        # Smooth the sketch-to-integrated correction vectors over all cells.
+        bias = transform(X_dimred, X_int, ds_idxs, ref_idxs, 15, batch_size=1000)
+
+        datasets_int[i] = X_dimred + bias
+
+    return datasets_int
+
 # Non-optimal dataset assembly. Simply accumulate datasets into a
 # reference.
 def assemble_accum(datasets, verbose=VERBOSE, knn=KNN, sigma=SIGMA,
@@ -976,41 +1023,3 @@
             datasets[j] = ds1 + bias
 
     return datasets
-
-def interpret_alignments(datasets, expr_datasets, genes,
-                         verbose=VERBOSE, knn=KNN, approx=APPROX,
-                         alpha=ALPHA, n_permutations=None):
-    if n_permutations is None:
-        n_permutations = float(len(genes) * 30)
-
-    alignments, matches = find_alignments(
-        datasets, knn=knn, approx=approx, alpha=alpha, verbose=verbose
-    )
-
-    for i, j in alignments:
-        # Compute average bias vector that aligns two datasets together.
-        ds_i = expr_datasets[i]
-        ds_j = expr_datasets[j]
-        if i < j:
-            match = matches[(i, j)]
-        else:
-            match = matches[(j, i)]
-        i_ind = [ a for a, _ in match ]
-        j_ind = [ b for _, b in match ]
-        avg_bias = np.absolute(
-            np.mean(ds_j[j_ind, :] - ds_i[i_ind, :], axis=0)
-        )
-
-        # Construct null distribution and compute p-value.
-        null_bias = (
-            ds_j[np.random.randint(ds_j.shape[0], size=n_permutations), :] -
-            ds_i[np.random.randint(ds_i.shape[0], size=n_permutations), :]
-        )
-        p = ((np.sum(np.greater_equal(
-            np.absolute(np.tile(avg_bias, (n_permutations, 1))),
-            np.absolute(null_bias)
-        ), axis=0, dtype=float) + 1) / (n_permutations + 1))
-
-        print('>>>> Stats for alignment {}'.format((i, j)))
-        for k in range(len(p)):
-            print('{}\t{}'.format(genes[k], p[k]))
diff --git a/setup.py b/setup.py
index 49fe012..d353c6e 100644
--- a/setup.py
+++ b/setup.py
@@ -2,14 +2,15 @@
 setup(
     name='scanorama',
-    version='1.4',
+    version='1.5',
     description='Panoramic stitching of heterogeneous single cell transcriptomic data',
     url='https://github.com/brianhie/scanorama',
-    download_url='https://github.com/brianhie/scanorama/archive/v1.4.tar.gz',
+    download_url='https://github.com/brianhie/scanorama/archive/v1.5.tar.gz',
     packages=find_packages(exclude=['bin', 'conf', 'data', 'target']),
     install_requires=[
         'annoy>=1.11.5',
         'fbpca>=1.0',
+        'geosketch>=1.0',
         'intervaltree==2.1.0',
         'matplotlib>=2.0.2',
         'numpy>=1.12.0',

From e63fe9b21538e47dd189951840685180b3b5b65c Mon Sep 17 00:00:00 2001
From: Brian Hie
Date: Mon, 18 Nov 2019 10:17:22 -0500
Subject: [PATCH 2/2] update readme with sketching

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 8c8b1a9..b9d4cc1 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,9 @@
 Scanorama enables batch-correction and integration of heterogeneous scRNA-seq data sets, which is described in the paper ["Efficient integration of heterogeneous single-cell transcriptomes using Scanorama"](https://www.nature.com/articles/s41587-019-0113-3) by Brian Hie, Bryan Bryson, and Bonnie Berger. This repository contains the Scanorama source code as well as scripts necessary for reproducing the results in the paper.
 
-**Scanorama is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods, including those for imputation and highly-variable gene filtering. The results from Scanorama integration and batch correction can then be used as input to other tools for scRNA-seq clustering, visualization, and analysis.**
+Scanorama is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods, including those for imputation and highly-variable gene filtering. The results from Scanorama integration and batch correction can then be used as input to other tools for scRNA-seq clustering, visualization, and analysis.
+
+Scanorama integration can also be greatly accelerated using tools for data sketching, as described in the paper ["Geometric sketching compactly summarizes the single-cell transcriptomic landscape", Cell Systems (2019)](https://www.cell.com/cell-systems/fulltext/S2405-4712\(19\)30152-8) and implemented [here](https://github.com/brianhie/geosketch).
 
 ## API example usage
 
@@ -88,7 +90,7 @@
 cd scanorama/
 python setup.py install --user
 ```
 
-If you are running inside an anaconda environment, first install annoy by doing: 
+If you are running inside an anaconda environment, first install annoy by doing:
 ```
 conda install -c conda-forge python-annoy
 ```
@@ -166,11 +168,9 @@
 
 - Make sure the input matrices are cells-by-genes, not the transpose.
 
 - For the example scripts, be sure to run `bin/process.py` first, although this is not necessary if you are using Scanorama through the API.
-
+- For large data set integration under memory constraints (e.g., if you run into a `MemoryError`), try lowering the `batch_size` parameter to reduce memory usage, or use sketch-based acceleration via the `sketch` parameter to `integrate()` (see the usage sketch below) to reduce both memory usage and runtime.
 
 - Scanorama versions 0.2 through 0.6.1 had default parameters that resulted in non-optimal batch correction results (integration was unaffected). Upgrade to the latest version for a fix to this issue.
 
 ## Questions
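For reference, here is a minimal usage sketch of the `sketch` option that this patch adds to `integrate()`. The random sparse matrices and generated gene names are placeholders standing in for real cells-by-genes expression data; `integrate()` returns the integrated low-dimensional embedding for each dataset along with the merged gene list:

```python
import scipy.sparse as ss
from scanorama import integrate

# Placeholder inputs: two cells-by-genes matrices over a shared gene list.
genes = ['gene_{}'.format(i) for i in range(1000)]
genes_list = [genes, genes]
datasets = [
    ss.random(5000, len(genes), density=0.1, format='csr'),
    ss.random(4000, len(genes), density=0.1, format='csr'),
]

# Sketch each dataset down to at most 2,000 cells (geometric sketching by
# default), integrate the sketches, then propagate the correction vectors
# back to every cell.
datasets_dimred, genes = integrate(
    datasets, genes_list,
    sketch=True, sketch_method='geosketch', sketch_max=2000,
)
```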