From 407cf2a64008ec089f5054088e835abd18df438b Mon Sep 17 00:00:00 2001
From: Aaron Lun <infinite.monkeys.with.keyboards@gmail.com>
Date: Sat, 7 Dec 2024 01:30:19 -0800
Subject: [PATCH] Improve the stability of the knee point identification.
 (#117)

The new algorithm is based on maximizing the distance from a line
between the plateau and the inflection point, which avoids problems with
the instability of the empirical second derivative, even with smoothing.

Also did a minor fix to action to avoid redundant runs on PR.
---
 .github/workflows/check-bioc.yml |  4 +--
 DESCRIPTION                      |  4 +--
 NAMESPACE                        |  3 --
 R/barcodeRanks.R                 | 47 +++++++++++---------------------
 inst/NEWS.Rd                     |  6 ++++
 man/barcodeRanks.Rd              | 22 +++------------
 tests/testthat/test-misc.R       |  7 ++---
 7 files changed, 32 insertions(+), 61 deletions(-)

diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
index 9b9fca5..46d21d1 100644
--- a/.github/workflows/check-bioc.yml
+++ b/.github/workflows/check-bioc.yml
@@ -22,8 +22,8 @@
 
 on:
   push:
-    branch:
-      - 'devel'
+    branches:
+      - devel
   pull_request:
 
 name: R-CMD-check-bioc
diff --git a/DESCRIPTION b/DESCRIPTION
index 12dcb1b..b23c104 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: DropletUtils
-Version: 1.27.0
-Date: 2024-07-23
+Version: 1.27.1
+Date: 2024-12-04
 Title: Utilities for Handling Single-Cell Droplet Data
 Authors@R: c(
     person("Aaron", "Lun", role = "aut"),
diff --git a/NAMESPACE b/NAMESPACE
index ef24259..b46ba4e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -118,7 +118,6 @@ importFrom(scuttle,isOutlier)
 importFrom(scuttle,librarySizeFactors)
 importFrom(scuttle,sumCountsAcrossCells)
 importFrom(scuttle,sumCountsAcrossFeatures)
-importFrom(stats,fitted)
 importFrom(stats,kmeans)
 importFrom(stats,mad)
 importFrom(stats,median)
@@ -126,14 +125,12 @@ importFrom(stats,optimize)
 importFrom(stats,p.adjust)
 importFrom(stats,pnbinom)
 importFrom(stats,ppois)
-importFrom(stats,predict)
 importFrom(stats,qnbinom)
 importFrom(stats,quantile)
 importFrom(stats,rexp)
 importFrom(stats,rgamma)
 importFrom(stats,rpois)
 importFrom(stats,runif)
-importFrom(stats,smooth.spline)
 importFrom(utils,head)
 importFrom(utils,read.delim)
 importFrom(utils,tail)
diff --git a/R/barcodeRanks.R b/R/barcodeRanks.R
index db7f295..6312da4 100644
--- a/R/barcodeRanks.R
+++ b/R/barcodeRanks.R
@@ -11,7 +11,7 @@
 #' @param exclude.from An integer scalar specifying the number of highest ranking barcodes to exclude from spline fitting.
 #' Ignored if \code{fit.bounds} is specified.
 #' @param assay.type Integer or string specifying the assay containing the count matrix.
-#' @param df Integer scalar specifying the number of degrees of freedom, to pass to \code{\link{smooth.spline}}.
+#' @param df Deprecated and ignored.
 #' @param ... For the generic, further arguments to pass to individual methods.
 #'
 #' For the SummarizedExperiment method, further arguments to pass to the ANY method.
@@ -34,20 +34,11 @@
 #' \item The inflection point is computed as the point on the rank/total curve where the first derivative is minimized.
 #' The derivative is computed directly from all points on the curve with total counts greater than \code{lower}.
 #' This avoids issues with erratic behaviour of the curve at lower totals.
-#' \item The knee point is defined as the point on the curve where the signed curvature is minimized.
-#' This requires calculation of the second derivative, which is much more sensitive to noise in the curve.
-#' To overcome this, a smooth spline is fitted to the log-total counts against the log-rank using \code{\link{smooth.spline}}.
-#' Derivatives are then calculated from the fitted spline using \code{\link{predict}}.
+#' \item The knee point is defined as the point on the curve that is furthest from the straight line drawn between the \code{fit.bounds} locations on the curve.
+#' We used to minimize the signed curvature to identify the knee point but this relies on the second derivative,
+#' which was too unstable even after smoothing.
 #' }
 #'
-#' @section Details on curve fitting:
-#' We supply a relatively low default setting of \code{df} to avoid overfitting the spline, 
-#' as this results in unstability in the higher derivatives (and thus the curvature).
-#' \code{df} and other arguments to \code{\link{smooth.spline}} can be tuned 
-#' if the estimated knee point is not at an appropriate location.
-#' We also restrict the fit to lie within the bounds defined by \code{fit.bounds} to focus on the region containing the knee point.
-#' This allows us to obtain an accurate fit with low \code{df} rather than attempting to model the entire curve.
-#' 
 #' If \code{fit.bounds} is not specified, the lower bound is automatically set to the inflection point
 #' as this should lie below the knee point on typical curves.
 #' The upper bound is set to the point at which the first derivative is closest to zero, 
@@ -63,8 +54,6 @@
 #' \describe{
 #' \item{\code{rank}:}{Numeric, the rank of each barcode (averaged across ties).}
 #' \item{\code{total}:}{Numeric, the total counts for each barcode.}
-#' \item{\code{fitted}:}{Numeric, the fitted value from the spline for each barcode.
-#' This is \code{NA} for points with \code{x} outside of \code{fit.bounds}.}
 #' }
 #' 
 #' The metadata contains \code{knee}, a numeric scalar containing the total count at the knee point;
@@ -85,7 +74,6 @@
 #' # Making a plot.
 #' plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total")
 #' o <- order(br.out$rank)
-#' lines(br.out$rank[o], br.out$fitted[o], col="red")
 #' abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2)
 #' abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2)
 #' legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), 
@@ -98,7 +86,6 @@
 #' @name barcodeRanks
 NULL
 
-#' @importFrom stats smooth.spline predict fitted
 #' @importFrom utils tail
 #' @importFrom Matrix colSums
 #' @importFrom S4Vectors DataFrame metadata<-
@@ -134,21 +121,20 @@ NULL
     if (is.null(fit.bounds)) {
         new.keep <- left.edge:right.edge
     } else {
-        new.keep <- y > log10(fit.bounds[1]) & y < log10(fit.bounds[2])
+        new.keep <- which(y > log10(fit.bounds[1]) & y < log10(fit.bounds[2]))
     }
 
-    # Smoothing to avoid error multiplication upon differentiation.
-    # Minimizing the signed curvature and returning the total for the knee point.
-    fitted.vals <- rep(NA_real_, length(keep))
-
+    # Using the maximum distance to identify the knee point.
     if (length(new.keep) >= 4) {
-        fit <- smooth.spline(x[new.keep], y[new.keep], df=df, ...)
-        fitted.vals[keep][new.keep] <- 10^fitted(fit)
-
-        d1 <- predict(fit, deriv=1)$y
-        d2 <- predict(fit, deriv=2)$y
-        curvature <- d2/(1 + d1^2)^1.5
-        knee <- 10^(y[new.keep][which.min(curvature)])
+        curx <- x[new.keep]
+        cury <- y[new.keep]
+        xbounds <- curx[c(1L, length(new.keep))]
+        ybounds <- cury[c(1L, length(new.keep))]
+        gradient <- diff(ybounds)/diff(xbounds)
+        intercept <- ybounds[1] - xbounds[1] * gradient
+        above <- which(cury >= curx * gradient + intercept)
+        dist <- abs(gradient * curx[above] - cury[above] + intercept)/sqrt(gradient^2 + 1)
+        knee <- 10^(cury[above[which.max(dist)]])
     } else {
         # Sane fallback upon overly aggressive filtering by 'exclude.from', 'lower'.
         knee <- 10^(y[new.keep[1]]) 
@@ -157,8 +143,7 @@ NULL
     # Returning a whole stack of useful stats.
     out <- DataFrame(
         rank=.reorder(run.rank, stuff$lengths, o), 
-        total=.reorder(run.totals, stuff$lengths, o),
-        fitted=.reorder(fitted.vals, stuff$lengths, o)
+        total=.reorder(run.totals, stuff$lengths, o)
     )
     rownames(out) <- colnames(m)
     metadata(out) <- list(knee=knee, inflection=inflection)
diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd
index 4abadaa..32ce03e 100644
--- a/inst/NEWS.Rd
+++ b/inst/NEWS.Rd
@@ -2,6 +2,12 @@
 \title{DropletUtils News}
 \encoding{UTF-8}
 
+\section{Version 1.28.0}{\itemize{
+\item Use a more stable algorithm for identifying the knee point in \code{barcodeRanks()}.
+The new algorithm is based on maximizing the distance from a line between the plateau and the inflection point.
+Previously, we tried to minimize the signed curvature but this was susceptible to many local minima due to the instability of the empirical second derivative, even after smoothing.
+}}
+
 \section{Version 1.18.0}{\itemize{
 \item Added an \code{intersect.genes=} option to \code{read10xCounts()} for samples with inconsistent gene information.
 Automatically fix empty chromosome names for mitochondrial genes in certain Cellranger outputs.
diff --git a/man/barcodeRanks.Rd b/man/barcodeRanks.Rd
index 98dba42..8a12de8 100644
--- a/man/barcodeRanks.Rd
+++ b/man/barcodeRanks.Rd
@@ -39,7 +39,7 @@ from which to obtain a section of the curve for spline fitting.}
 \item{exclude.from}{An integer scalar specifying the number of highest ranking barcodes to exclude from spline fitting.
 Ignored if \code{fit.bounds} is specified.}
 
-\item{df}{Integer scalar specifying the number of degrees of freedom, to pass to \code{\link{smooth.spline}}.}
+\item{df}{Deprecated and ignored.}
 
 \item{BPPARAM}{A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed.}
 
@@ -50,8 +50,6 @@ A \linkS4class{DataFrame} where each row corresponds to a column of \code{m}, an
 \describe{
 \item{\code{rank}:}{Numeric, the rank of each barcode (averaged across ties).}
 \item{\code{total}:}{Numeric, the total counts for each barcode.}
-\item{\code{fitted}:}{Numeric, the fitted value from the spline for each barcode.
-This is \code{NA} for points with \code{x} outside of \code{fit.bounds}.}
 }
 
 The metadata contains \code{knee}, a numeric scalar containing the total count at the knee point;
@@ -75,20 +73,10 @@ presumably reflecting the difference between empty droplets with little RNA and
 \item The inflection point is computed as the point on the rank/total curve where the first derivative is minimized.
 The derivative is computed directly from all points on the curve with total counts greater than \code{lower}.
 This avoids issues with erratic behaviour of the curve at lower totals.
-\item The knee point is defined as the point on the curve where the signed curvature is minimized.
-This requires calculation of the second derivative, which is much more sensitive to noise in the curve.
-To overcome this, a smooth spline is fitted to the log-total counts against the log-rank using \code{\link{smooth.spline}}.
-Derivatives are then calculated from the fitted spline using \code{\link{predict}}.
+\item The knee point is defined as the point on the curve that is furthest from the straight line drawn between the \code{fit.bounds} locations on the curve.
+We used to minimize the signed curvature to identify the knee point but this relies on the second derivative,
+which was too unstable even after smoothing.
 }
-}
-\section{Details on curve fitting}{
-
-We supply a relatively low default setting of \code{df} to avoid overfitting the spline, 
-as this results in unstability in the higher derivatives (and thus the curvature).
-\code{df} and other arguments to \code{\link{smooth.spline}} can be tuned 
-if the estimated knee point is not at an appropriate location.
-We also restrict the fit to lie within the bounds defined by \code{fit.bounds} to focus on the region containing the knee point.
-This allows us to obtain an accurate fit with low \code{df} rather than attempting to model the entire curve.
 
 If \code{fit.bounds} is not specified, the lower bound is automatically set to the inflection point
 as this should lie below the knee point on typical curves.
@@ -100,7 +88,6 @@ to avoid spuriously large numerical derivatives from unstable parts of the curve
 Note that only points with total counts above \code{lower} will be considered for curve fitting,
 regardless of how \code{fit.bounds} is defined.
 }
-
 \examples{
 # Mocking up some data: 
 set.seed(2000)
@@ -113,7 +100,6 @@ names(br.out)
 # Making a plot.
 plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total")
 o <- order(br.out$rank)
-lines(br.out$rank[o], br.out$fitted[o], col="red")
 abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2)
 abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2)
 legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), 
diff --git a/tests/testthat/test-misc.R b/tests/testthat/test-misc.R
index 5048893..bd0f719 100644
--- a/tests/testthat/test-misc.R
+++ b/tests/testthat/test-misc.R
@@ -11,7 +11,6 @@ test_that("barcodeRanks runs to completion", {
     brout <- barcodeRanks(my.counts, lower=limit)
     expect_equal(brout$total, totals)
     expect_identical(brout$rank, rank(-totals, ties.method="average"))
-    expect_true(all(is.na(brout$fitted[totals <= limit])))
 
     # Trying again with a higher limit.
     limit2 <- 200
@@ -21,9 +20,8 @@ test_that("barcodeRanks runs to completion", {
     # Specifying the boundaries.
     bounds <- c(200, 1000)
     brout3 <- barcodeRanks(my.counts, lower=limit, fit.bounds=bounds)
-    is.okay <- totals > bounds[1] & totals < bounds[2]
-    expect_true(all(is.na(brout3$fitted[!is.okay])))
-    expect_true(all(!is.na(brout3$fitted[is.okay])))
+    knee <- metadata(brout3)$knee
+    expect_true(knee >= bounds[1] && knee <= bounds[2])
 
     # Respecting column names.
     alt <- my.counts
@@ -32,7 +30,6 @@ test_that("barcodeRanks runs to completion", {
     expect_identical(rownames(brout2), colnames(alt))
     expect_identical(names(brout2$rank), NULL)
     expect_identical(names(brout2$total), NULL)
-    expect_identical(names(brout2$fitted), NULL)
 
     # Trying out silly inputs.
     expect_error(barcodeRanks(my.counts[,0]), "insufficient")