From 407cf2a64008ec089f5054088e835abd18df438b Mon Sep 17 00:00:00 2001 From: Aaron Lun Date: Sat, 7 Dec 2024 01:30:19 -0800 Subject: [PATCH] Improve the stability of the knee point identification. (#117) The new algorithm is based on maximizing the distance from a line between the plateau and the inflection point, which avoids problems with the instability of the empirical second derivative, even with smoothing. Also did a minor fix to action to avoid redundant runs on PR. --- .github/workflows/check-bioc.yml | 4 +-- DESCRIPTION | 4 +-- NAMESPACE | 3 -- R/barcodeRanks.R | 47 +++++++++++--------------------- inst/NEWS.Rd | 6 ++++ man/barcodeRanks.Rd | 22 +++------------ tests/testthat/test-misc.R | 7 ++--- 7 files changed, 32 insertions(+), 61 deletions(-) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 9b9fca5..46d21d1 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -22,8 +22,8 @@ on: push: - branch: - - 'devel' + branches: + - devel pull_request: name: R-CMD-check-bioc diff --git a/DESCRIPTION b/DESCRIPTION index 12dcb1b..b23c104 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: DropletUtils -Version: 1.27.0 -Date: 2024-07-23 +Version: 1.27.1 +Date: 2024-12-04 Title: Utilities for Handling Single-Cell Droplet Data Authors@R: c( person("Aaron", "Lun", role = "aut"), diff --git a/NAMESPACE b/NAMESPACE index ef24259..b46ba4e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -118,7 +118,6 @@ importFrom(scuttle,isOutlier) importFrom(scuttle,librarySizeFactors) importFrom(scuttle,sumCountsAcrossCells) importFrom(scuttle,sumCountsAcrossFeatures) -importFrom(stats,fitted) importFrom(stats,kmeans) importFrom(stats,mad) importFrom(stats,median) @@ -126,14 +125,12 @@ importFrom(stats,optimize) importFrom(stats,p.adjust) importFrom(stats,pnbinom) importFrom(stats,ppois) -importFrom(stats,predict) importFrom(stats,qnbinom) importFrom(stats,quantile) importFrom(stats,rexp) importFrom(stats,rgamma) importFrom(stats,rpois) importFrom(stats,runif) -importFrom(stats,smooth.spline) importFrom(utils,head) importFrom(utils,read.delim) importFrom(utils,tail) diff --git a/R/barcodeRanks.R b/R/barcodeRanks.R index db7f295..6312da4 100644 --- a/R/barcodeRanks.R +++ b/R/barcodeRanks.R @@ -11,7 +11,7 @@ #' @param exclude.from An integer scalar specifying the number of highest ranking barcodes to exclude from spline fitting. #' Ignored if \code{fit.bounds} is specified. #' @param assay.type Integer or string specifying the assay containing the count matrix. -#' @param df Integer scalar specifying the number of degrees of freedom, to pass to \code{\link{smooth.spline}}. +#' @param df Deprecated and ignored. #' @param ... For the generic, further arguments to pass to individual methods. #' #' For the SummarizedExperiment method, further arguments to pass to the ANY method. @@ -34,20 +34,11 @@ #' \item The inflection point is computed as the point on the rank/total curve where the first derivative is minimized. #' The derivative is computed directly from all points on the curve with total counts greater than \code{lower}. #' This avoids issues with erratic behaviour of the curve at lower totals. -#' \item The knee point is defined as the point on the curve where the signed curvature is minimized. -#' This requires calculation of the second derivative, which is much more sensitive to noise in the curve. -#' To overcome this, a smooth spline is fitted to the log-total counts against the log-rank using \code{\link{smooth.spline}}. -#' Derivatives are then calculated from the fitted spline using \code{\link{predict}}. +#' \item The knee point is defined as the point on the curve that is furthest from the straight line drawn between the \code{fit.bounds} locations on the curve. +#' We used to minimize the signed curvature to identify the knee point but this relies on the second derivative, +#' which was too unstable even after smoothing. #' } #' -#' @section Details on curve fitting: -#' We supply a relatively low default setting of \code{df} to avoid overfitting the spline, -#' as this results in unstability in the higher derivatives (and thus the curvature). -#' \code{df} and other arguments to \code{\link{smooth.spline}} can be tuned -#' if the estimated knee point is not at an appropriate location. -#' We also restrict the fit to lie within the bounds defined by \code{fit.bounds} to focus on the region containing the knee point. -#' This allows us to obtain an accurate fit with low \code{df} rather than attempting to model the entire curve. -#' #' If \code{fit.bounds} is not specified, the lower bound is automatically set to the inflection point #' as this should lie below the knee point on typical curves. #' The upper bound is set to the point at which the first derivative is closest to zero, @@ -63,8 +54,6 @@ #' \describe{ #' \item{\code{rank}:}{Numeric, the rank of each barcode (averaged across ties).} #' \item{\code{total}:}{Numeric, the total counts for each barcode.} -#' \item{\code{fitted}:}{Numeric, the fitted value from the spline for each barcode. -#' This is \code{NA} for points with \code{x} outside of \code{fit.bounds}.} #' } #' #' The metadata contains \code{knee}, a numeric scalar containing the total count at the knee point; @@ -85,7 +74,6 @@ #' # Making a plot. #' plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total") #' o <- order(br.out$rank) -#' lines(br.out$rank[o], br.out$fitted[o], col="red") #' abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2) #' abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2) #' legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), @@ -98,7 +86,6 @@ #' @name barcodeRanks NULL -#' @importFrom stats smooth.spline predict fitted #' @importFrom utils tail #' @importFrom Matrix colSums #' @importFrom S4Vectors DataFrame metadata<- @@ -134,21 +121,20 @@ NULL if (is.null(fit.bounds)) { new.keep <- left.edge:right.edge } else { - new.keep <- y > log10(fit.bounds[1]) & y < log10(fit.bounds[2]) + new.keep <- which(y > log10(fit.bounds[1]) & y < log10(fit.bounds[2])) } - # Smoothing to avoid error multiplication upon differentiation. - # Minimizing the signed curvature and returning the total for the knee point. - fitted.vals <- rep(NA_real_, length(keep)) - + # Using the maximum distance to identify the knee point. if (length(new.keep) >= 4) { - fit <- smooth.spline(x[new.keep], y[new.keep], df=df, ...) - fitted.vals[keep][new.keep] <- 10^fitted(fit) - - d1 <- predict(fit, deriv=1)$y - d2 <- predict(fit, deriv=2)$y - curvature <- d2/(1 + d1^2)^1.5 - knee <- 10^(y[new.keep][which.min(curvature)]) + curx <- x[new.keep] + cury <- y[new.keep] + xbounds <- curx[c(1L, length(new.keep))] + ybounds <- cury[c(1L, length(new.keep))] + gradient <- diff(ybounds)/diff(xbounds) + intercept <- ybounds[1] - xbounds[1] * gradient + above <- which(cury >= curx * gradient + intercept) + dist <- abs(gradient * curx[above] - cury[above] + intercept)/sqrt(gradient^2 + 1) + knee <- 10^(cury[above[which.max(dist)]]) } else { # Sane fallback upon overly aggressive filtering by 'exclude.from', 'lower'. knee <- 10^(y[new.keep[1]]) @@ -157,8 +143,7 @@ NULL # Returning a whole stack of useful stats. out <- DataFrame( rank=.reorder(run.rank, stuff$lengths, o), - total=.reorder(run.totals, stuff$lengths, o), - fitted=.reorder(fitted.vals, stuff$lengths, o) + total=.reorder(run.totals, stuff$lengths, o) ) rownames(out) <- colnames(m) metadata(out) <- list(knee=knee, inflection=inflection) diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd index 4abadaa..32ce03e 100644 --- a/inst/NEWS.Rd +++ b/inst/NEWS.Rd @@ -2,6 +2,12 @@ \title{DropletUtils News} \encoding{UTF-8} +\section{Version 1.28.0}{\itemize{ +\item Use a more stable algorithm for identifying the knee point in \code{barcodeRanks()}. +The new algorithm is based on maximizing the distance from a line between the plateau and the inflection point. +Previously, we tried to minimize the signed curvature but this was susceptible to many local minima due to the instability of the empirical second derivative, even after smoothing. +}} + \section{Version 1.18.0}{\itemize{ \item Added an \code{intersect.genes=} option to \code{read10xCounts()} for samples with inconsistent gene information. Automatically fix empty chromosome names for mitochondrial genes in certain Cellranger outputs. diff --git a/man/barcodeRanks.Rd b/man/barcodeRanks.Rd index 98dba42..8a12de8 100644 --- a/man/barcodeRanks.Rd +++ b/man/barcodeRanks.Rd @@ -39,7 +39,7 @@ from which to obtain a section of the curve for spline fitting.} \item{exclude.from}{An integer scalar specifying the number of highest ranking barcodes to exclude from spline fitting. Ignored if \code{fit.bounds} is specified.} -\item{df}{Integer scalar specifying the number of degrees of freedom, to pass to \code{\link{smooth.spline}}.} +\item{df}{Deprecated and ignored.} \item{BPPARAM}{A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed.} @@ -50,8 +50,6 @@ A \linkS4class{DataFrame} where each row corresponds to a column of \code{m}, an \describe{ \item{\code{rank}:}{Numeric, the rank of each barcode (averaged across ties).} \item{\code{total}:}{Numeric, the total counts for each barcode.} -\item{\code{fitted}:}{Numeric, the fitted value from the spline for each barcode. -This is \code{NA} for points with \code{x} outside of \code{fit.bounds}.} } The metadata contains \code{knee}, a numeric scalar containing the total count at the knee point; @@ -75,20 +73,10 @@ presumably reflecting the difference between empty droplets with little RNA and \item The inflection point is computed as the point on the rank/total curve where the first derivative is minimized. The derivative is computed directly from all points on the curve with total counts greater than \code{lower}. This avoids issues with erratic behaviour of the curve at lower totals. -\item The knee point is defined as the point on the curve where the signed curvature is minimized. -This requires calculation of the second derivative, which is much more sensitive to noise in the curve. -To overcome this, a smooth spline is fitted to the log-total counts against the log-rank using \code{\link{smooth.spline}}. -Derivatives are then calculated from the fitted spline using \code{\link{predict}}. +\item The knee point is defined as the point on the curve that is furthest from the straight line drawn between the \code{fit.bounds} locations on the curve. +We used to minimize the signed curvature to identify the knee point but this relies on the second derivative, +which was too unstable even after smoothing. } -} -\section{Details on curve fitting}{ - -We supply a relatively low default setting of \code{df} to avoid overfitting the spline, -as this results in unstability in the higher derivatives (and thus the curvature). -\code{df} and other arguments to \code{\link{smooth.spline}} can be tuned -if the estimated knee point is not at an appropriate location. -We also restrict the fit to lie within the bounds defined by \code{fit.bounds} to focus on the region containing the knee point. -This allows us to obtain an accurate fit with low \code{df} rather than attempting to model the entire curve. If \code{fit.bounds} is not specified, the lower bound is automatically set to the inflection point as this should lie below the knee point on typical curves. @@ -100,7 +88,6 @@ to avoid spuriously large numerical derivatives from unstable parts of the curve Note that only points with total counts above \code{lower} will be considered for curve fitting, regardless of how \code{fit.bounds} is defined. } - \examples{ # Mocking up some data: set.seed(2000) @@ -113,7 +100,6 @@ names(br.out) # Making a plot. plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total") o <- order(br.out$rank) -lines(br.out$rank[o], br.out$fitted[o], col="red") abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2) abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2) legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), diff --git a/tests/testthat/test-misc.R b/tests/testthat/test-misc.R index 5048893..bd0f719 100644 --- a/tests/testthat/test-misc.R +++ b/tests/testthat/test-misc.R @@ -11,7 +11,6 @@ test_that("barcodeRanks runs to completion", { brout <- barcodeRanks(my.counts, lower=limit) expect_equal(brout$total, totals) expect_identical(brout$rank, rank(-totals, ties.method="average")) - expect_true(all(is.na(brout$fitted[totals <= limit]))) # Trying again with a higher limit. limit2 <- 200 @@ -21,9 +20,8 @@ test_that("barcodeRanks runs to completion", { # Specifying the boundaries. bounds <- c(200, 1000) brout3 <- barcodeRanks(my.counts, lower=limit, fit.bounds=bounds) - is.okay <- totals > bounds[1] & totals < bounds[2] - expect_true(all(is.na(brout3$fitted[!is.okay]))) - expect_true(all(!is.na(brout3$fitted[is.okay]))) + knee <- metadata(brout3)$knee + expect_true(knee >= bounds[1] && knee <= bounds[2]) # Respecting column names. alt <- my.counts @@ -32,7 +30,6 @@ test_that("barcodeRanks runs to completion", { expect_identical(rownames(brout2), colnames(alt)) expect_identical(names(brout2$rank), NULL) expect_identical(names(brout2$total), NULL) - expect_identical(names(brout2$fitted), NULL) # Trying out silly inputs. expect_error(barcodeRanks(my.counts[,0]), "insufficient")