From ce06250c597258cba1c6161beec6893ab88e7ae6 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 2 Jan 2025 01:25:17 +0000 Subject: [PATCH 1/3] implement cellranger order of magnitude algorithm for emptyDropsCellRanger --- R/emptyDropsCellRanger.R | 137 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) diff --git a/R/emptyDropsCellRanger.R b/R/emptyDropsCellRanger.R index 8ec4df5..7339ed4 100644 --- a/R/emptyDropsCellRanger.R +++ b/R/emptyDropsCellRanger.R @@ -7,6 +7,7 @@ #' The matrix should only contain barcodes for an individual sample, prior to any filtering for barcodes. #' Alternatively, a \linkS4class{SummarizedExperiment} containing such an object. #' @param n.expected.cells An integer scalar specifying the number of expected cells in a sample. +#' If missing, will try to estimate this from the data using the order of magnitude algorithm from CellRanger. #' Corresponds to the \code{nExpectedCells} argument in \pkg{STARsolo}. #' @param max.percentile A numeric scalar between 0 and 1 used to define the maximum UMI count in the simple filtering algorithm. #' Corresponds to the \code{maxPercentile} argument in \pkg{STARsolo}. @@ -100,7 +101,7 @@ NULL .empty_drops_cell_ranger <- function(m, # STARsolo arguments ## simple filtering - n.expected.cells=3000, # nExpectedCells + n.expected.cells=NULL, # nExpectedCells max.percentile=0.99, # maxPercentile max.min.ratio=10, # maxMinRatio @@ -120,7 +121,7 @@ NULL bpstart(BPPARAM) on.exit(bpstop(BPPARAM)) } - + # This function is an approximate implementation of the # `--soloCellFilter EmptyDrops_CR` filtering approach # of STARsolo 2.7.9a (https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1), @@ -181,6 +182,14 @@ NULL ) } + if (is.null(n.expected.cells)) { + n.expected.cells <- filter_cellular_barcodes_ordmag(colSums(m))$filtered_bcs + if (n.expected.cells < 1) { + warning("Could not estimate the number of expected cells; using 3000 instead.") + n.expected.cells <- 3000 + } + } + stats <- .test_empty_drops(m=m, ambient.FUN=ambfun, niters=niters, test.ambient=FALSE, ignore=NULL, alpha=Inf, round=round, BPPARAM=BPPARAM) tmp <- stats$PValue @@ -206,3 +215,127 @@ setMethod("emptyDropsCellRanger", "ANY", .empty_drops_cell_ranger) setMethod("emptyDropsCellRanger", "SummarizedExperiment", function(m, ..., assay.type="counts") { .empty_drops_cell_ranger(assay(m, assay.type), ...) }) + +# The following functions are adopted from CellRanger +# https://github.com/10XGenomics/cellranger/blob/fe0616967771151209c3a6f97f4de92bf55ac0bd/lib/python/cellranger/cell_calling_helpers.py#L841 +## Cell calling constants +ORDMAG_NUM_BOOTSTRAP_SAMPLES = 100 +ORDMAG_RECOVERED_CELLS_QUANTILE = 0.99 +MIN_RECOVERED_CELLS_PER_GEM_GROUP = 50 +MAX_RECOVERED_CELLS_PER_GEM_GROUP = 262144 # 1 << 18 + +find_within_ordmag <- function(x, baseline_idx) { + x_ascending = sort(x) + baseline = x_ascending[length(x_ascending) - baseline_idx] + cutoff = round(0.1 * baseline) + cutoff[cutoff <= 1] = 1 + # Return the index corresponding to the cutoff in descending order + length(x) - findInterval(cutoff, x_ascending) +} + +# Estimate the number of recovered cells by trying to find ordmag(recovered) =~ filtered. +# - Search for a result such that some loss(recovered_cells, filtered_cells) is minimized. +# - Here I'm using (obs - exp)**2 / exp, which approximates a proportion for small differences +# but blows up for large differences. +# - Test over a log2-spaced range of values from 1..262_144 +estimate_recovered_cells_ordmag <- function(nonzero_bc_counts, max_expected_cells) { + recovered_cells = seq(1, log2(max_expected_cells), length.out = 2000) + recovered_cells = unique(round(2^recovered_cells)) + + baseline_bc_idx = round(recovered_cells * (1 - ORDMAG_RECOVERED_CELLS_QUANTILE)) + baseline_bc_idx[baseline_bc_idx > length(nonzero_bc_counts)] = length(nonzero_bc_counts) + + filtered_cells = find_within_ordmag(nonzero_bc_counts, baseline_bc_idx) + loss = (filtered_cells - recovered_cells)^2 / recovered_cells + idx = which.min(loss) + c(recovered_cells[idx], loss[idx]) +} + +# All barcodes that are close to within an order of magnitude of a top barcode. +# Takes all barcodes that are close to within an order of magnitude of a +# top barcode that likely represents a cell. +filter_cellular_barcodes_ordmag <- function(bc_counts) { + nonzero_bc_counts = bc_counts[bc_counts > 0] + if (length(nonzero_bc_counts) == 0) { + stop("All barcodes have zero count; cannot proceed.") + } + + set.seed(0) + # Set the most cells to examine based on the empty drops range for this chemistry + max_expected_cells = MAX_RECOVERED_CELLS_PER_GEM_GROUP + recovered_cells_loss = + sapply(seq(1, ORDMAG_NUM_BOOTSTRAP_SAMPLES), function(x) { + estimate_recovered_cells_ordmag( + sample(nonzero_bc_counts, length(nonzero_bc_counts),replace = TRUE), max_expected_cells + ) + }) + recovered_cells = mean(recovered_cells_loss[1,]) + loss = mean(recovered_cells_loss[2,]) + + recovered_cells = max(round(recovered_cells), MIN_RECOVERED_CELLS_PER_GEM_GROUP) + + message(paste0("Found recovered_cells = ", recovered_cells, " with loss = ", loss)) + + baseline_bc_idx = round(recovered_cells * (1 - ORDMAG_RECOVERED_CELLS_QUANTILE)) + baseline_bc_idx = min(baseline_bc_idx, length(nonzero_bc_counts)) + + # Bootstrap sampling; run algo with many random samples of the data + top_n_boot = sapply(seq(1, ORDMAG_NUM_BOOTSTRAP_SAMPLES), function(x) { + find_within_ordmag( + sample(nonzero_bc_counts, length(nonzero_bc_counts), replace = TRUE), baseline_bc_idx + ) + }) + + metrics = summarize_bootstrapped_top_n(top_n_boot, nonzero_bc_counts) + + # Get the filtered barcodes + top_n = metrics.filtered_bcs + + if (top_n > len(nonzero_bc_counts)) { + stop("Invalid selection of 0-count barcodes!") + } + return(metrics) +} + +summarize_bootstrapped_top_n <- function(top_n_boot, nonzero_counts) { + top_n_bcs_mean = mean(top_n_boot) + # mimick cellranger, but not sure if we want to use sample var or population var + top_n_bcs_var = sum((top_n_boot - top_n_bcs_mean)^2)/length(top_n_boot) + top_n_bcs_sd = sqrt(top_n_bcs_var) + result = list( + filtered_bcs = 0, + filtered_bcs_lb = 0, + filtered_bcs_ub = 0, + filtered_bcs_var = 0, + filtered_bcs_cv = 0, + filtered_bcs_cutoff = 0 + ) + + result$filtered_bcs_var = top_n_bcs_var + result$filtered_bcs_cv = top_n_bcs_sd/top_n_bcs_mean + result$filtered_bcs_lb = round(qnorm(0.025, mean = top_n_bcs_mean, sd = top_n_bcs_sd, lower.tail = TRUE), 0) + result$filtered_bcs_ub = round(qnorm(0.975, mean = top_n_bcs_mean, sd = top_n_bcs_sd, lower.tail = TRUE), 0) + + nbcs = round(top_n_bcs_mean) + result$filtered_bcs = nbcs + + # make sure that if a barcode with count x is selected, we select all barcodes with count >= x + # this is true for each bootstrap sample, but is not true when we take the mean + + if (nbcs > 0) { + sorted_counts = nonzero_counts[order(nonzero_counts, decreasing = TRUE)] + + cutoff = sorted_counts[nbcs] + index = nbcs + while (((index) < length(sorted_counts)) && sorted_counts[index] == cutoff) { + index = index + 1 + } + # if we end up grabbing too many barcodes, revert to initial estimate + if ((index - nbcs) > 0.20 * nbcs) { + return(result) + } + result$filtered_bcs = index + result$filtered_bcs_cutoff = cutoff + } + return(result) +} From 6d6eef44f88202f44df7950ff397207c9af35dfd Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 2 Jan 2025 01:45:20 +0000 Subject: [PATCH 2/3] add a assertion when ambient pool size is too small --- R/emptyDropsCellRanger.R | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/R/emptyDropsCellRanger.R b/R/emptyDropsCellRanger.R index 7339ed4..e537443 100644 --- a/R/emptyDropsCellRanger.R +++ b/R/emptyDropsCellRanger.R @@ -156,7 +156,18 @@ NULL } ambient <- logical(length(totals)) - ambient[o[min(ind.min, length(totals)):min(ind.max, length(totals))]] <- TRUE + final.ind.min = min(ind.min, length(totals)) + final.ind.max = min(ind.max, length(totals)) + if ((final.ind.max - final.ind.min) < 100) { + stop(paste0( + "The ambient pool size (", + final.ind.max - final.ind.min + 1, + ") is too small; cannot proceed.", + "Please adjust `ind.min` and `ind.max` to increase the size. ", + "One suggestion is to set `ind.min = sum(colSums(m) > N)` and `ind.max = ncol(m)`, ", + "where N is a threshold of UMI count, such as 100, and m is the count matrix.")) + } + ambient[o[final.ind.min:final.ind.max]] <- TRUE ambient.m <- mat[,ambient,drop=FALSE] ambient.prof <- rowSums(ambient.m) From a04ccee482fa5018b943f13d5456ffd9fa1ff200 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 2 Jan 2025 02:07:41 +0000 Subject: [PATCH 3/3] add a assertion when ambient pool size is too small --- R/emptyDropsCellRanger.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/emptyDropsCellRanger.R b/R/emptyDropsCellRanger.R index e537443..33bcdb8 100644 --- a/R/emptyDropsCellRanger.R +++ b/R/emptyDropsCellRanger.R @@ -162,7 +162,7 @@ NULL stop(paste0( "The ambient pool size (", final.ind.max - final.ind.min + 1, - ") is too small; cannot proceed.", + ") is too small; cannot proceed. ", "Please adjust `ind.min` and `ind.max` to increase the size. ", "One suggestion is to set `ind.min = sum(colSums(m) > N)` and `ind.max = ncol(m)`, ", "where N is a threshold of UMI count, such as 100, and m is the count matrix."))