From a9394392ada67919a96920b533a31e7c666b2a8e Mon Sep 17 00:00:00 2001
From: agdenadel
Date: Wed, 25 Sep 2024 13:37:58 -0700
Subject: [PATCH] Initial commit of recall (compared to callback: adds various
 null distributions, renames method)

---
 .Rbuildignore                                 |   7 +
 .github/.gitignore                            |   1 +
 .github/workflows/check-standard.yml          |  49 ++
 .github/workflows/docker-image.yml            |  18 +
 .github/workflows/pkgdown.yaml                |  48 ++
 .github/workflows/super-linter.yml            |  29 +
 .gitignore                                    |  54 ++
 DESCRIPTION                                   |  36 ++
 Dockerfile                                    |   4 +
 LICENSE                                       |   2 +
 LICENSE.md                                    |  21 +
 NAMESPACE                                     |   5 +
 R/copula.R                                    |  92 +++
 R/countsplit.R                                |   0
 R/estimate_negative_binomial.R                | 186 ++++++
 R/estimate_zipoisson.R                        |  60 ++
 R/recall.R                                    | 591 ++++++++++++++++++
 R/seurat_workflow.R                           |  66 ++
 README.md                                     |  75 +++
 _pkgdown.yml                                  |   3 +
 man/FindClustersCountsplit.Rd                 |  64 ++
 man/FindClustersRecall.Rd                     |  64 ++
 man/compute_knockoff_filter.Rd                |  39 ++
 man/estimate_negative_binomial.Rd             |  22 +
 man/estimate_negative_binomial_copula.Rd      |  40 ++
 man/estimate_zi_poisson.Rd                    |  20 +
 ...et_seurat_obj_with_artificial_variables.Rd |  36 ++
 man/rzipoisson.Rd                             |  24 +
 man/seurat_workflow.Rd                        |  40 ++
 vignettes/basic-usage.Rmd                     |  73 +++
 30 files changed, 1769 insertions(+)
 create mode 100644 .Rbuildignore
 create mode 100644 .github/.gitignore
 create mode 100644 .github/workflows/check-standard.yml
 create mode 100644 .github/workflows/docker-image.yml
 create mode 100644 .github/workflows/pkgdown.yaml
 create mode 100644 .github/workflows/super-linter.yml
 create mode 100644 .gitignore
 create mode 100644 DESCRIPTION
 create mode 100644 Dockerfile
 create mode 100644 LICENSE
 create mode 100644 LICENSE.md
 create mode 100644 NAMESPACE
 create mode 100644 R/copula.R
 create mode 100644 R/countsplit.R
 create mode 100644 R/estimate_negative_binomial.R
 create mode 100644 R/estimate_zipoisson.R
 create mode 100644 R/recall.R
 create mode 100644 R/seurat_workflow.R
 create mode 100644 README.md
 create mode 100644 _pkgdown.yml
 create mode 100644 man/FindClustersCountsplit.Rd
 create mode 100644 man/FindClustersRecall.Rd
 create mode 100644 man/compute_knockoff_filter.Rd
 create mode 100644 man/estimate_negative_binomial.Rd
 create mode 100644 man/estimate_negative_binomial_copula.Rd
 create mode 100644 man/estimate_zi_poisson.Rd
 create mode 100644 man/get_seurat_obj_with_artificial_variables.Rd
 create mode 100644 man/rzipoisson.Rd
 create mode 100644 man/seurat_workflow.Rd
 create mode 100644 vignettes/basic-usage.Rmd

diff --git a/.Rbuildignore b/.Rbuildignore
new file mode 100644
index 0000000..3cb2e90
--- /dev/null
+++ b/.Rbuildignore
+^LICENSE\.md$
+^Dockerfile$
+^tmp$
+^_pkgdown\.yml$
+^docs$
+^pkgdown$
+^\.github$
\ No newline at end of file
diff --git a/.github/.gitignore b/.github/.gitignore
new file mode 100644
index 0000000..2d19fc7
--- /dev/null
+++ b/.github/.gitignore
+*.html
diff --git a/.github/workflows/check-standard.yml b/.github/workflows/check-standard.yml
new file mode 100644
index 0000000..74d8c97
--- /dev/null
+++ b/.github/workflows/check-standard.yml
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+name: R-CMD-check
+
+jobs:
+  R-CMD-check:
+    runs-on: ${{ matrix.config.os }}
+
+    name: ${{ matrix.config.os }} (${{ matrix.config.r }})
+
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {os: macos-latest,   r: 'release'}
+          - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
+          - {os: ubuntu-latest,   r: 'release'}
+          - {os: ubuntu-latest,   r: 'oldrel-1'}
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      R_KEEP_PKG_SOURCE: yes
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: ${{ matrix.config.r }}
+          http-user-agent: ${{ matrix.config.http-user-agent }}
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck
+          needs: check
+
+      - uses: r-lib/actions/check-r-package@v2
+        with:
+          upload-snapshots: true
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
new file mode 100644
index 0000000..df305df
--- /dev/null
+++ b/.github/workflows/docker-image.yml
+name: Docker Image CI
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Build the Docker image
+      run: docker build . --file Dockerfile --tag recall:$(date +%s)
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
new file mode 100644
index 0000000..a7276e8
--- /dev/null
+++ b/.github/workflows/pkgdown.yaml
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+  release:
+    types: [published]
+  workflow_dispatch:
+
+name: pkgdown
+
+jobs:
+  pkgdown:
+    runs-on: ubuntu-latest
+    # Only restrict concurrency for non-PR jobs
+    concurrency:
+      group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::pkgdown, local::.
+          needs: website
+
+      - name: Build site
+        run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
+        shell: Rscript {0}
+
+      - name: Deploy to GitHub pages 🚀
+        if: github.event_name != 'pull_request'
+        uses: JamesIves/github-pages-deploy-action@v4.5.0
+        with:
+          clean: false
+          branch: gh-pages
+          folder: docs
diff --git a/.github/workflows/super-linter.yml b/.github/workflows/super-linter.yml
new file mode 100644
index 0000000..ca69cec
--- /dev/null
+++ b/.github/workflows/super-linter.yml
+# This workflow executes several linters on changed files based on languages used in your code base whenever
+# you push code or open a pull request.
+#
+# You can adjust the behavior by modifying this file.
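+# For example, setting VALIDATE_ALL_CODEBASE below to true lints the entire
+# repository instead of only the files changed in a push or pull request.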
+# For more information, see:
+# https://github.com/github/super-linter
+name: Lint Code Base
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+jobs:
+  run-lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+        with:
+          # Full git history is needed to get a proper list of changed files within `super-linter`
+          fetch-depth: 0
+
+      - name: Lint Code Base
+        uses: github/super-linter@v4
+        env:
+          VALIDATE_ALL_CODEBASE: false
+          DEFAULT_BRANCH: "main"
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2adc561
--- /dev/null
+++ b/.gitignore
+# History files
+.Rhistory
+.Rapp.history
+
+# Session Data files
+.RData
+.RDataTmp
+
+# User-specific files
+.Ruserdata
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# R Environment Variables
+.Renviron
+
+# pkgdown site
+docs/
+
+# translation temp files
+po/*~
+
+# RStudio Connect folder
+rsconnect/
+.Rproj.user
+.Rdata
+.DS_Store
+docs
+inst/doc
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..d8dabbf
--- /dev/null
+++ b/DESCRIPTION
+Package: recall
+Title: Calibrated clustering with artificial variables to avoid over-clustering in single-cell RNA-sequencing
+Version: 0.0.0
+Authors@R:
+    person("Alan", "DenAdel", , "alan_denadel@brown.edu", role = c("aut", "cre"),
+           comment = c(ORCID = "0000-0002-7985-6789"))
+Description: recall (Calibrated Clustering with Artificial Variables) is a method for protecting
+    against over-clustering by controlling for the impact of double-dipping. The approach
+    can be applied to any clustering algorithm (the Louvain and Leiden algorithms are
+    implemented, with plans for K-means and hierarchical clustering). The method provides
+    state-of-the-art clustering performance, can rapidly analyze large-scale scRNA-seq
+    studies, and is compatible with the Seurat library.
+Encoding: UTF-8
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.3.2
+Imports:
+    Matrix,
+    Seurat (>= 5.0.1),
+    SingleCellExperiment,
+    scDesign3,
+    SummarizedExperiment,
+    MASS,
+    fitdistrplus,
+    lamW,
+    knockoff,
+    future,
+    stats,
+    cli,
+    stringr,
+    countsplit
+License: MIT + file LICENSE
+Suggests:
+    knitr,
+    rmarkdown
+VignetteBuilder: knitr
+URL: https://lcrawlab.github.io/recall/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..93727f6
--- /dev/null
+++ b/Dockerfile
+FROM rocker/verse:4.0.5
+
+COPY . /recall
+RUN R -e "install.packages('/recall', type = 'source', repos = NULL)"
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6459969
--- /dev/null
+++ b/LICENSE
+YEAR: 2024
+COPYRIGHT HOLDER: recall authors
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..f6addd8
--- /dev/null
+++ b/LICENSE.md
+# MIT License
+
+Copyright (c) 2024 recall authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..c817b7f
--- /dev/null
+++ b/NAMESPACE
+# Generated by roxygen2: do not edit by hand
+
+export(FindClustersCountsplit)
+export(FindClustersRecall)
+export(seurat_workflow)
diff --git a/R/copula.R b/R/copula.R
new file mode 100644
index 0000000..2a5344b
--- /dev/null
+++ b/R/copula.R
+simulate_data_scDesign3 <- function(data_matrix, cores, family) {
+  sce <- SingleCellExperiment::SingleCellExperiment(list(counts = data_matrix))
+  SummarizedExperiment::colData(sce)$cell_type <- "1" # scDesign3 needs a cell type so we just make it the same for all cells
+
+  simulated_data <- scDesign3::scdesign3(sce,
+                                         celltype = "cell_type",
+                                         pseudotime = NULL,
+                                         spatial = NULL,
+                                         other_covariates = NULL,
+                                         empirical_quantile = FALSE,
+                                         usebam = TRUE, # to speed up marginal inference
+                                         mu_formula = "1",
+                                         sigma_formula = "1",
+                                         corr_formula = "1",
+                                         family_use = family, # this is the key parameter
+                                         nonzerovar = FALSE,
+                                         n_cores = cores,
+                                         parallelization = "mcmapply",
+                                         important_feature = "all",
+                                         nonnegative = FALSE,
+                                         copula = "gaussian",
+                                         fastmvn = TRUE)
+
+  ko <- simulated_data$new_count
+
+  return(ko)
+}
+
+
+#' @title Simulate synthetic null data from a copula model via scDesign3.
+#'
+#' @description Given data, simulates a synthetic null copy of the data using a
+#' zero-inflated Poisson marginal for each gene and a Gaussian copula between
+#' genes, via scDesign3.
+#'
+#' @param data_matrix The data to estimate parameters from.
+#' @param cores The number of CPU cores to use in estimation by scDesign3.
+#' @returns A matrix of synthetic null data with the same dimensions as data_matrix.
+#' @name estimate_negative_binomial_copula
+estimate_zi_poisson_copula <- function(data_matrix, cores) {
+  family <- "zip"
+  ko <- simulate_data_scDesign3(data_matrix, cores, family)
+  return(ko)
+}
+
+
+#' @title Simulate synthetic null data from a copula model via scDesign3.
+#'
+#' @description Given data, simulates a synthetic null copy of the data using a
+#' negative binomial marginal for each gene and a Gaussian copula between
+#' genes, via scDesign3.
+#'
+#' @param data_matrix The data to estimate parameters from.
+#' @param cores The number of CPU cores to use in estimation by scDesign3.
+#' @returns A matrix of synthetic null data with the same dimensions as data_matrix.
+#' @name estimate_negative_binomial_copula
+estimate_negative_binomial_copula <- function(data_matrix, cores) {
+  family <- "nb"
+  ko <- simulate_data_scDesign3(data_matrix, cores, family)
+  return(ko)
+}
+
+
+#' @title Simulate synthetic null data from a copula model via scDesign3.
+#'
+#' @description Given data, simulates a synthetic null copy of the data using a
+#' Poisson marginal for each gene and a Gaussian copula between genes, via
+#' scDesign3.
+#'
+#' @param data_matrix The data to estimate parameters from.
+#' @param cores The number of CPU cores to use in estimation by scDesign3.
+#' @returns A matrix of synthetic null data with the same dimensions as data_matrix.
+#' @name estimate_negative_binomial_copula
+estimate_poisson_copula <- function(data_matrix, cores) {
+  family <- "poisson"
+  ko <- simulate_data_scDesign3(data_matrix, cores, family)
+  return(ko)
+}
+
+
+#' @title Simulate synthetic null data from a copula model via scDesign3.
+#'
+#' @description Given data, simulates a synthetic null copy of the data using a
+#' Gaussian marginal for each gene and a Gaussian copula between genes, via
+#' scDesign3.
+#'
+#' @param data_matrix The data to estimate parameters from.
+#' @param cores The number of CPU cores to use in estimation by scDesign3.
+#' @returns A matrix of synthetic null data with the same dimensions as data_matrix.
+#' @name estimate_negative_binomial_copula
+estimate_gaussian_copula <- function(data_matrix, cores) {
+  family <- "gaussian"
+  ko <- simulate_data_scDesign3(data_matrix, cores, family)
+  return(ko)
+}
diff --git a/R/countsplit.R b/R/countsplit.R
new file mode 100644
index 0000000..e69de29
diff --git a/R/estimate_negative_binomial.R b/R/estimate_negative_binomial.R
new file mode 100644
index 0000000..5b9cbea
--- /dev/null
+++ b/R/estimate_negative_binomial.R
+
+#' @title Maximum likelihood estimation for the negative binomial
+#' distribution.
+#'
+#' @description Given data, computes the maximum likelihood estimators
+#' for the negative binomial distribution with parameters: size and mu.
+#'
+#' @param data The data to estimate parameters from.
+#' @param verbose Whether or not to show all logging.
+#' @returns Maximum likelihood estimators size and mu for the negative
+#' binomial distribution
+#' @name estimate_negative_binomial
+estimate_negative_binomial <- function(data, verbose = FALSE) {
+
+  if (verbose) { message("Attempting MLE method 1") }
+  mle1 <- tryCatch(
+    {
+      nb_fit <- MASS::fitdistr(data, "negative binomial", method = "Nelder-Mead")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("MLE method 1 failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("MLE method 1 had a warning. Warning message:\n") }
+      if (verbose) { message(cond) }
+      if (verbose) { message("\n") }
+      NA
+    }
+  )
+
+  if (verbose) { message("Attempting MLE method 2") }
+  mle2 <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "mle")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+
+    },
+    error = function(cond) {
+      if (verbose) { message("MLE method 2 failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("MLE method 2 had a warning. Warning message:") }
+      if (verbose) { message(cond) }
+      NA
+    }
+  )
+
+  if (verbose) { message("Attempting MME") }
+  mme <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "mme")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("MME failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("MME had a warning. Warning message:") }
+      if (verbose) { message(cond) }
+      NA
+    }
+  )
+
+
+  if (verbose) { message("Attempting MME with warnings") }
+  # retry MME, this time accepting results that only raise warnings
+  mme2 <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "mme")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("MME failed with an error.") }
+      NA
+    }
+  )
+
+  if (verbose) { message("Attempting MSE") }
+  mse <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "mse")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("MSE failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("MSE had a warning. Warning message:") }
+      if (verbose) { message(cond) }
+      NA
+    }
+  )
+
+  if (verbose) { message("Attempting QME") }
+  qme <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "qme")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("QME failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("QME had a warning. Warning message:") }
+      if (verbose) { message(cond) }
+      NA
+    }
+  )
+
+
+  if (verbose) { message("Attempting MGE") }
+  mge <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "mge")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("MGE failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("MGE had a warning. Warning message:") }
+      if (verbose) { message(cond) }
+      NA
+    }
+  )
+
+
+
+  stop("All negative binomial estimation methods failed.")
+
+}
diff --git a/R/estimate_zipoisson.R b/R/estimate_zipoisson.R
new file mode 100644
index 0000000..070de4a
--- /dev/null
+++ b/R/estimate_zipoisson.R
+
+# https://en.wikipedia.org/wiki/Zero-inflated_model#Estimators_of_ZIP_parameters
+# https://math.stackexchange.com/questions/2761563/maximum-likelihood-estimation-for-zero-inflated-poisson-distribution
+# https://ieeexplore.ieee.org/document/9032203
+
+#' @title Maximum likelihood estimation for the zero-inflated Poisson distribution
+#' with Poisson parameter lambda and zero proportion prop.zero.
+#'
+#' @description Given data, computes the maximum likelihood estimators
+#' for the zero-inflated Poisson distribution.
+#'
+#' @param data The data to estimate parameters from.
+#' @returns Maximum likelihood estimators of the zero-inflated Poisson
+#' distribution
+#' @name estimate_zi_poisson
+estimate_zi_poisson <- function(data) {
+  num.zeros <- sum(data == 0)
+  r0 <- 1 / length(data) * num.zeros
+
+  x.bar <- mean(data)
+
+  gamma <- x.bar / (1 - r0)
+
+  lambda.hat <- lamW::lambertW0(-gamma * exp(-gamma)) + gamma
+
+  pi.hat <- 1 - x.bar / lambda.hat
+
+
+  return.list <- list("lambda.hat" = lambda.hat, "pi.hat" = pi.hat)
+  return(return.list)
+}
+
+
+#' @title Random data generation for the zero-inflated Poisson distribution
+#' with Poisson parameter lambda and zero proportion prop.zero.
+#'
+#' @description Given the number of samples desired, a Poisson parameter,
+#' lambda, and a zero proportion, prop.zero, simulates the number of desired
+#' samples from ZIP(lambda, prop.zero).
+#'
+#' @param n The number of samples to be simulated.
+#' @param lambda The Poisson rate parameter.
+#' @param prop.zero The proportion of excess zeroes.
+#' @returns Simulated data from ZIP(lambda, prop.zero).
+#' @name rzipoisson
+rzipoisson <- function(n, lambda, prop.zero) {
+  data <- c()
+
+
+  for (i in 1:n) {
+    if (stats::runif(1) < prop.zero) {
+      data[i] <- 0
+    }
+    else {
+      data[i] <- stats::rpois(1, lambda)
+    }
+  }
+  return(data)
+}
diff --git a/R/recall.R b/R/recall.R
new file mode 100644
index 0000000..969bf84
--- /dev/null
+++ b/R/recall.R
+
+#' @title Returns a Seurat object that contains additional (fake) RNA
+#' expression counts.
+#'
+#' @description Given a Seurat object, returns a new Seurat object whose RNA
+#' expression counts include the
+#' variable features from the original object and an equal number of artificial
+#' features.
+#'
+#' @param seurat_obj A Seurat object containing RNA expression counts.
+#' @param assay The assay to generate artificial variables from.
+#' @param null_method The generating distribution for the synthetic null
+#' variables ("ZIP", "NB", "ZIP-copula", "NB-copula", "Poisson-copula", or
+#' "Gaussian-copula").
+#' @param cores The number of cores to use in generating synthetic null variables.
+#' @param verbose Whether or not to show logging.
+#' @returns A Seurat object that contains the original variable features and an
+#' equal number of artificial features.
+#' @name get_seurat_obj_with_artificial_variables
+get_seurat_obj_with_artificial_variables <- function(seurat_obj, assay = "RNA", null_method = "ZIP", verbose = TRUE, cores) {
+
+  if (verbose) {
+    message("Pulling data from Seurat object")
+  }
+
+  var_features <- Seurat::VariableFeatures(seurat_obj)
+  seurat_obj_data <- as.data.frame(t(as.matrix(Seurat::GetAssayData(seurat_obj, assay = assay, layer = "counts")[var_features, ])))
+
+  if (verbose) {
+    message("Computing artificial features")
+  }
+
+  if (null_method == "ZIP") {
+    estimates <- lapply(seurat_obj_data, estimate_zi_poisson)
+    sampling_function <- function(x) {
+      rzipoisson(nrow(seurat_obj_data),
+                 x$lambda.hat,
+                 x$pi.hat)
+    }
+    ko <- as.data.frame(lapply(estimates, sampling_function))
+
+  }
+  else if (null_method == "NB") {
+    estimates <- lapply(seurat_obj_data, estimate_negative_binomial)
+    sampling_function <- function(x) {
+      stats::rnbinom(nrow(seurat_obj_data),
+                     size = x$size,
+                     mu = x$mu)
+    }
+    ko <- as.data.frame(lapply(estimates, sampling_function))
+  }
+  else if (null_method == "ZIP-copula") {
+    ko <- estimate_zi_poisson_copula(seurat_obj_data, cores)
+  }
+  else if (null_method == "NB-copula") {
+    ko <- estimate_negative_binomial_copula(seurat_obj_data, cores)
+  }
+  else if (null_method == "Poisson-copula") {
+    ko <- estimate_poisson_copula(seurat_obj_data, cores)
+  }
+  else if (null_method == "Gaussian-copula") {
+    ko <- estimate_gaussian_copula(seurat_obj_data, cores)
+  }
+  else {
+    stop("You selected a null_method that is not supported. Choose from: ZIP, NB, ZIP-copula, NB-copula, Poisson-copula, Gaussian-copula.")
+  }
+
+
+  num_variable_features <- length(var_features)
+  colnames(ko) <- paste0(rep("knockoff", num_variable_features), 1:num_variable_features)
+  combined_data <- cbind(seurat_obj_data, ko)
+
+  # sparsify augmented data matrix and transpose for use in Seurat
+  combined_data <- Matrix::Matrix(t(combined_data), sparse = TRUE)
+
+  new_project_name <- paste0(seurat_obj@project.name, "_with_knockoffs")
+  new_seurat_obj <- Seurat::CreateSeuratObject(counts = combined_data, project = new_project_name)
+
+  return(new_seurat_obj)
+}
+
+
+
+
+
+
+#' @title Returns the genes selected by the knockoff filter
+#'
+#' @description Given a Seurat object and two cluster identities, returns the
+#' genes selected by the knockoff filter and their W statistics.
+#'
+#' @param seurat_obj A Seurat object
+#' @param cluster1 The identity of the first cluster of interest.
+#' @param cluster2 The identity of the second cluster of interest.
+#' @param q The desired rate to control the FDR at
+#' @param return_all Determines if the returned object will contain all genes
+#' or just the selected genes.
+#' @param num_cores The number of cores for computing marker genes in parallel.
+#' @param shared_memory_max The maximum size for shared global variables.
+#' @returns A list containing a data frame of the selected genes and their W
+#' statistics (all genes when return_all is TRUE), along with the knockoff
+#' filter threshold.
+#' @name compute_knockoff_filter
+compute_knockoff_filter <- function(seurat_obj,
+                                    cluster1,
+                                    cluster2,
+                                    q,
+                                    return_all = FALSE,
+                                    num_cores = 1,
+                                    shared_memory_max) {
+  options(future.globals.maxSize = shared_memory_max)
+  # allow FindMarkers to compute marker genes in parallel across cores
+  future::plan("multicore", workers = as.numeric(num_cores))
+
+  markers <- Seurat::FindMarkers(seurat_obj,
+                                 ident.1 = cluster1,
+                                 ident.2 = cluster2,
+                                 logfc.threshold = 0,
+                                 min.pct = 0)
+
+
+  # FindMarkers orders by p-value, so we can't rely on position to know which genes are which
+  knockoff_indices <- grepl("^knockoff", rownames(markers))
+  original_indices <- !knockoff_indices
+
+  # subset the markers data.frame into originals and knockoffs
+  knockoff_markers <- markers[knockoff_indices, ]
+  original_markers <- markers[original_indices, ]
+
+  all_genes <- rownames(seurat_obj)
+
+  # get indices of knockoffs and originals from seurat_obj, should be [FALSE, ..., FALSE, TRUE, ..., TRUE]
+  knockoff_indices_sorted <- grepl("^knockoff", all_genes)
+  original_indices_sorted <- !knockoff_indices_sorted
+
+  knockoff_names_sorted <- all_genes[knockoff_indices_sorted]
+  original_names_sorted <- all_genes[original_indices_sorted]
+
+  # sort markers data.frames by their original orderings
+  knockoff_markers_sorted <- knockoff_markers[knockoff_names_sorted, ]
+  original_markers_sorted <- original_markers[original_names_sorted, ]
+
+  original_p_values <- original_markers_sorted$p_val
+  knockoff_p_values <- knockoff_markers_sorted$p_val
+
+  log_original_p_values <- -log10(original_p_values)
+  log_knockoff_p_values <- -log10(knockoff_p_values)
+
+  W <- log_original_p_values - log_knockoff_p_values
+
+  thres <- knockoff::knockoff.threshold(W, fdr = q, offset = 1)
+
+
+  if (return_all) {
+    all_features <- as.data.frame(list("gene" = original_names_sorted, "W" = W))
+
+    ret <- list("all_features" = all_features, "threshold" = thres)
+
+    return(ret)
+  }
+  selected_indices <- which(W >= thres) # todo check if this should be > (case where threshold is Inf, but there are still some Inf -log p)
+
+  selected_genes <- original_names_sorted[selected_indices]
+  selected_Ws <- W[selected_indices]
+
+  selected_features <- as.data.frame(list("selected_gene" = selected_genes, "W" = selected_Ws))
+
+  selected_features <- selected_features[order(selected_features$W, decreasing = TRUE), ]
+
+  ret <- list("selected_features" = selected_features, "threshold" = thres)
+
+  return(ret)
+}
+
+
+
+
+
+
+
+
+#' @title Finds calibrated clusters in a Seurat object with the recall algorithm.
+#'
+#' @description Given a preprocessed Seurat object, augments the data with
+#' synthetic null genes, clusters the augmented data, and iteratively lowers
+#' the clustering resolution until every pair of clusters shows statistically
+#' significant differential expression under the knockoff filter.
+#'
+#' @param seurat_obj The Seurat object that will be analyzed.
+#' @param resolution_start The starting resolution to be used for the
+#' clustering algorithm (Louvain and Leiden algorithms).
+#' @param num_clusters_start The starting number of clusters to be used for the
+#' clustering algorithm (K-means and Hierarchical clustering algorithms).
+#' @param reduction_percentage The amount that the starting parameter will be
+#' reduced by after each iteration (between 0 and 1).
+#' @param dims The dimensions to use as input features (e.g., 1:10).
+#' @param algorithm The clustering algorithm to be used.
+#' @param null_method The generating distribution for the synthetic null
+#' variables ("ZIP", "NB", "ZIP-copula", "NB-copula", "Poisson-copula", or
+#' "Gaussian-copula").
+#' @param assay The assay to generate artificial variables from.
+#' @param cores The number of cores to compute marker genes in parallel.
+#' @param shared_memory_max The maximum size for shared global variables.
+#' Increase this variable if you see the following error:
+#' The total size of the X globals that need to be exported for the future expression
+#' ('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
+#' (option 'future.globals.maxSize'). The X largest globals are ...
+#' @param verbose Whether or not to show all logging.
+#' @returns Returns a Seurat object where the idents have been updated with the
+#' clusters determined via the recall algorithm.
+#' The latest clustering results will be stored in the object metadata under
+#' 'recall_clusters'. Note that 'recall_clusters' will be overwritten every
+#' time FindClustersRecall is run.
+#' @name FindClustersRecall
+#' @export
+FindClustersRecall <- function(seurat_obj,
+                               resolution_start = 0.8,
+                               reduction_percentage = 0.2,
+                               num_clusters_start = 20,
+                               dims = 1:10,
+                               algorithm = "louvain", # todo implement all algos
+                               null_method = "ZIP",
+                               assay = "RNA",
+                               cores = 1,
+                               shared_memory_max = 8000 * 1024^2,
+                               verbose = TRUE) {
+
+  # todo check function arguments for validity
+
+  augmented_with_artificial_variables_seurat_obj <- get_seurat_obj_with_artificial_variables(seurat_obj,
+                                                                                             assay = assay,
+                                                                                             null_method = null_method,
+                                                                                             verbose = verbose,
+                                                                                             cores = cores)
+
+  num_variable_features <- 2 * length(Seurat::VariableFeatures(seurat_obj))
+
+
+  # Pre-process data
+  options(future.globals.maxSize = shared_memory_max)
+  future::plan("multicore", workers = as.numeric(cores))
+
+  if (verbose) {
+    message(paste("Number of cores:", cores))
+  }
+
+  augmented_with_artificial_variables_seurat_obj <- Seurat::NormalizeData(augmented_with_artificial_variables_seurat_obj,
+                                                                          verbose = FALSE)
+
+  augmented_with_artificial_variables_seurat_obj <- Seurat::FindVariableFeatures(augmented_with_artificial_variables_seurat_obj,
+                                                                                 selection.method = "vst",
+                                                                                 nfeatures = num_variable_features,
+                                                                                 verbose = FALSE)
+
+  augmented_with_artificial_variables_seurat_obj <- Seurat::ScaleData(augmented_with_artificial_variables_seurat_obj, verbose = FALSE)
+  augmented_with_artificial_variables_seurat_obj <- Seurat::RunPCA(augmented_with_artificial_variables_seurat_obj,
+                                                                   features = Seurat::VariableFeatures(object = augmented_with_artificial_variables_seurat_obj),
+                                                                   verbose = FALSE)
+
+  augmented_with_artificial_variables_seurat_obj <- Seurat::FindNeighbors(augmented_with_artificial_variables_seurat_obj,
+                                                                          dims = dims,
+                                                                          verbose = FALSE)
+
+  resolution_param <- resolution_start
+
+
+  first_iteration <- TRUE
+
+  while (TRUE) {
+    if (verbose) {
+      message("####################################################################")
+      message(paste("Finding clusters with", stringr::str_to_title(algorithm), "algorithm"))
+      message(paste("Resolution param:", resolution_param))
+    }
+
+    if (algorithm == "louvain") {
+      augmented_with_artificial_variables_seurat_obj <- Seurat::FindClusters(augmented_with_artificial_variables_seurat_obj,
+                                                                             resolution = resolution_param,
+                                                                             verbose = FALSE)
+    }
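+
+    # Note on the Leiden branch below: Seurat dispatches Leiden when
+    # algorithm = 4, and method = "igraph" keeps the partitioning on the sparse
+    # graph representation, which is typically lighter on memory than the
+    # default dense-matrix method for large datasets.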
+
+    if (algorithm == "leiden") {
+      #plan("sequential") # todo log number of cores being used # this is a weird one because leiden has a forked job hanging
+      augmented_with_artificial_variables_seurat_obj <- Seurat::FindClusters(augmented_with_artificial_variables_seurat_obj,
+                                                                             resolution = resolution_param,
+                                                                             algorithm = 4,
+                                                                             method = "igraph",
+                                                                             verbose = FALSE)
+    }
+
+    # Reduce resolution for next iteration of the loop
+    resolution_param <- (1 - reduction_percentage) * resolution_param
+
+    k <- length(levels(Seurat::Idents(augmented_with_artificial_variables_seurat_obj)))
+
+    if (verbose) {
+      message("Num clusters:")
+      message(k)
+    }
+
+    knock_idents <- levels(Seurat::Idents(augmented_with_artificial_variables_seurat_obj))
+
+    num_selected_matrix <- matrix(nrow = k, ncol = k)
+
+    found_no_sign_diff <- FALSE
+
+    num_clusters <- length(knock_idents)
+
+
+    if (verbose) {
+      progress_bar_length <- num_clusters * (num_clusters - 1) / 2
+      cli::cli_progress_bar("Processing cluster pairs:",
+                            total = progress_bar_length,
+                            clear = FALSE)
+    }
+
+    m <- 0
+    for (i in 1:num_clusters) {
+      for (j in 1:num_clusters) {
+        if (j >= i) {
+          next
+        }
+
+        m <- m + 1
+
+        if (verbose) {
+          cli::cli_progress_update()
+        }
+
+        markers_selected <- compute_knockoff_filter(seurat_obj = augmented_with_artificial_variables_seurat_obj,
+                                                    cluster1 = knock_idents[i],
+                                                    cluster2 = knock_idents[j],
+                                                    q = 0.05,
+                                                    num_cores = cores,
+                                                    shared_memory_max = shared_memory_max)
+
+        num_selected <- nrow(markers_selected$selected_features)
+
+        if (num_selected == 0) {
+          found_no_sign_diff <- TRUE
+          break
+        }
+
+        num_selected_matrix[i, j] <- num_selected
+        num_selected_matrix[j, i] <- num_selected
+
+      }
+      if (found_no_sign_diff) {
+        if (verbose) {
+          cli::cli_progress_done()
+          message("Found clusters with no significant differences.")
+          message("Progressing to next clustering iteration.")
+        }
+        first_iteration <- FALSE
+        break
+      }
+    }
+
+    if (found_no_sign_diff) {
+      next
+    }
+    break
+  }
+
+  if (first_iteration) {
+    warning("Only a single iteration occurred. The inferred cluster labels may be underclustered. To prevent this, you may want to re-run recall with a larger starting parameter.")
+  }
+
+  seurat_obj@meta.data$recall_clusters <- Seurat::Idents(augmented_with_artificial_variables_seurat_obj)
+  Seurat::Idents(seurat_obj) <- seurat_obj@meta.data$recall_clusters
+
+  return(seurat_obj)
+}
+
+
+
+
+
+#' @title Finds clusters in a Seurat object with count splitting to avoid double-dipping.
+#'
+#' @description Given a Seurat object, splits the counts into training and test
+#' matrices, clusters the training counts, and iteratively lowers the
+#' clustering resolution until every pair of clusters shows statistically
+#' significant differential expression on the held-out test counts.
+#'
+#' @param seurat_obj The Seurat object that will be analyzed.
+#' @param resolution_start The starting resolution to be used for the
+#' clustering algorithm (Louvain and Leiden algorithms).
+#' @param num_clusters_start The starting number of clusters to be used for the
+#' clustering algorithm (K-means and Hierarchical clustering algorithms).
+#' @param reduction_percentage The amount that the starting parameter will be
+#' reduced by after each iteration (between 0 and 1).
+#' @param dims The dimensions to use as input features (e.g., 1:10).
+#' @param algorithm The clustering algorithm to be used.
+#' @param null_method The generating distribution for the synthetic null
+#' variables ("ZIP", "NB", "ZIP-copula", "NB-copula", "Poisson-copula", or
+#' "Gaussian-copula").
+#' @param assay The assay to generate artificial variables from.
+#' @param cores The number of cores to compute marker genes in parallel.
+#' @param shared_memory_max The maximum size for shared global variables.
+#' Increase this variable if you see the following error:
+#' The total size of the X globals that need to be exported for the future expression
+#' ('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
+#' (option 'future.globals.maxSize'). The X largest globals are ...
+#' @param verbose Whether or not to show all logging.
+#' @returns Returns a Seurat object where the idents have been updated with the
+#' clusters determined via the countsplit algorithm.
+#' The latest clustering results will be stored in the object metadata under
+#' 'countsplit_clusters'. Note that 'countsplit_clusters' will be overwritten
+#' every time FindClustersCountsplit is run.
+#' @name FindClustersCountsplit
+#' @export
+FindClustersCountsplit <- function(seurat_obj,
+                                   resolution_start = 0.8,
+                                   reduction_percentage = 0.2,
+                                   num_clusters_start = 20,
+                                   dims = 1:10,
+                                   algorithm = "louvain", # todo implement all algos
+                                   null_method = "ZIP",
+                                   assay = "RNA",
+                                   cores = 1,
+                                   shared_memory_max = 8000 * 1024^2,
+                                   verbose = TRUE) {
+
+  options(future.globals.maxSize = shared_memory_max)
+  # set up multicore for FindMarkers
+  future::plan("multicore", workers = as.numeric(cores))
+
+  num_variable_features <- length(Seurat::VariableFeatures(seurat_obj))
+
+
+  # follow this issue: https://github.com/anna-neufeld/countsplit/issues/8
+
+  # todo only do this for variable features
+  split <- countsplit::countsplit(Seurat::GetAssayData(seurat_obj, assay = assay, layer = "counts"))
+  Xtrain <- split[[1]]
+  Xtest <- split[[2]]
+
+  seurat_obj_train <- Seurat::CreateSeuratObject(counts = Xtrain)
+  seurat_obj_test <- Seurat::CreateSeuratObject(counts = Xtest)
+
+
+  # process training data
+  seurat_obj_train <- Seurat::NormalizeData(seurat_obj_train,
+                                            verbose = FALSE)
+
+  seurat_obj_train <- Seurat::FindVariableFeatures(seurat_obj_train,
+                                                   selection.method = "vst",
+                                                   nfeatures = num_variable_features,
+                                                   verbose = FALSE)
+
+  seurat_obj_train <- Seurat::ScaleData(seurat_obj_train, verbose = FALSE)
+  seurat_obj_train <- Seurat::RunPCA(seurat_obj_train,
+                                     features = Seurat::VariableFeatures(object = seurat_obj_train),
+                                     verbose = FALSE)
+
+  seurat_obj_train <- Seurat::FindNeighbors(seurat_obj_train,
+                                            dims = dims,
+                                            verbose = FALSE)
+
+  # process test data (no need for PCA and FindNeighbors since we are just assigning idents based on training idents)
+  seurat_obj_test <- Seurat::NormalizeData(seurat_obj_test,
+                                           verbose = FALSE)
+
+  seurat_obj_test <- Seurat::FindVariableFeatures(seurat_obj_test,
+                                                  selection.method = "vst",
+                                                  nfeatures = num_variable_features,
+                                                  verbose = FALSE)
+
+  seurat_obj_test <- Seurat::ScaleData(seurat_obj_test, verbose = FALSE)
+
+
+
+  resolution_param <- resolution_start
+
+  first_iteration <- TRUE
+
+  while (TRUE) {
+    if (verbose) {
+      message("####################################################################")
+      message(paste("Finding clusters with", stringr::str_to_title(algorithm), "algorithm"))
+      message(paste("Resolution param:", resolution_param))
+    }
+
+    if (algorithm == "louvain") {
+      seurat_obj_train <- Seurat::FindClusters(seurat_obj_train,
+                                               resolution = resolution_param,
+                                               verbose = FALSE)
+    }
+
+    if (algorithm == "leiden") {
+      #plan("sequential") # todo log number of cores being used # this is a weird one because leiden has a forked job hanging
+      seurat_obj_train <- Seurat::FindClusters(seurat_obj_train,
+                                               resolution = resolution_param,
+                                               algorithm = 4,
+                                               method = "igraph",
+                                               verbose = FALSE)
+    }
+
+    # Reduce resolution for next iteration of the loop
+    resolution_param <- (1 - reduction_percentage) * resolution_param
+
+    Seurat::Idents(seurat_obj_test) <- Seurat::Idents(seurat_obj_train)
+
+    k <- length(levels(Seurat::Idents(seurat_obj_test)))
+
+    if (verbose) {
+      message("Num clusters:")
+      message(k)
+    }
+
+    countsplit_idents <- levels(Seurat::Idents(seurat_obj_test))
+
+    num_selected_matrix <- matrix(nrow = k, ncol = k)
+
+    found_no_sign_diff <- FALSE
+
+    num_clusters <- length(countsplit_idents)
+
+
+    if (verbose) {
+      progress_bar_length <- num_clusters * (num_clusters - 1) / 2
+      cli::cli_progress_bar("Processing cluster pairs:",
+                            total = progress_bar_length,
+                            clear = FALSE)
+    }
+
+    m <- 0
+    for (i in 1:num_clusters) {
+      for (j in 1:num_clusters) {
+        if (j >= i) {
+          next
+        }
+
+        m <- m + 1
+
+        if (verbose) {
+          cli::cli_progress_update()
+        }
+
+        markers_selected <- Seurat::FindMarkers(seurat_obj_test,
+                                                ident.1 = countsplit_idents[i],
+                                                ident.2 = countsplit_idents[j])
+
+        num_selected <- sum(markers_selected$p_val_adj < 0.05)
+
+        if (num_selected == 0) {
+          found_no_sign_diff <- TRUE
+          break
+        }
+
+        num_selected_matrix[i, j] <- num_selected
+        num_selected_matrix[j, i] <- num_selected
+
+      }
+      if (found_no_sign_diff) {
+        if (verbose) {
+          cli::cli_progress_done()
+          message("Found clusters with no significant differences.")
+          message("Progressing to next clustering iteration.")
+        }
+        first_iteration <- FALSE
+        break
+      }
+    }
+
+    if (found_no_sign_diff) {
+      next
+    }
+    break
+  }
+
+  if (first_iteration) {
+    warning("Only a single iteration occurred. The inferred cluster labels may be underclustered. To prevent this, you may want to re-run FindClustersCountsplit with a larger starting parameter.")
+  }
+
+
+  seurat_obj@meta.data$countsplit_clusters <- Seurat::Idents(seurat_obj_test)
+  Seurat::Idents(seurat_obj) <- seurat_obj@meta.data$countsplit_clusters
+
+  return(seurat_obj)
+
+}
\ No newline at end of file
diff --git a/R/seurat_workflow.R b/R/seurat_workflow.R
new file mode 100644
index 0000000..be5b4e8
--- /dev/null
+++ b/R/seurat_workflow.R
+#' @title Runs a typical Seurat workflow on a Seurat object (up to
+#' dimensionality reduction and clustering).
+#'
+#' @description Given a Seurat object, returns a new Seurat object that has been
+#' normalized, had variable features identified,
+#' scaled, had principal components computed, had clusters identified, and had
+#' tSNE and UMAP embeddings determined.
+#'
+#' @param seurat_obj A Seurat object that will be analyzed.
+#' @param num_variable_features The number of variable features to use in the
+#' analysis.
+#' @param resolution_param The resolution parameter to use when clustering.
+#' @param visualization_method Either "umap", "tsne", or "both".
+#' @param num_dims The number of principal components to use.
+#' @param algorithm The clustering algorithm to use, either "louvain" or
+#' "leiden".
+#' @returns A Seurat object containing the relevant analysis results.
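+#' @examples
+#' \dontrun{
+#' # Illustrative sketch: assumes `sobj` is an existing Seurat object with RNA counts.
+#' sobj <- seurat_workflow(sobj,
+#'                         num_variable_features = 2000,
+#'                         visualization_method = "both",
+#'                         algorithm = "leiden")
+#' }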
+#' @export
+#' @name seurat_workflow
+seurat_workflow <- function(seurat_obj,
+                            num_variable_features,
+                            resolution_param = 0.8,
+                            visualization_method = "umap",
+                            num_dims = 10,
+                            algorithm = "louvain") {
+  seurat_obj <- Seurat::NormalizeData(seurat_obj)
+
+  seurat_obj <- Seurat::FindVariableFeatures(seurat_obj,
+                                             selection.method = "vst",
+                                             nfeatures = num_variable_features)
+
+  seurat_obj <- Seurat::ScaleData(seurat_obj)
+
+  seurat_obj <- Seurat::RunPCA(seurat_obj,
+                               features = Seurat::VariableFeatures(object = seurat_obj))
+
+  seurat_obj <- Seurat::FindNeighbors(seurat_obj, dims = 1:num_dims)
+
+  if (algorithm == "louvain") {
+    seurat_obj <- Seurat::FindClusters(seurat_obj,
+                                       resolution = resolution_param)
+  }
+
+  if (algorithm == "leiden") {
+    seurat_obj <- Seurat::FindClusters(seurat_obj,
+                                       resolution = resolution_param,
+                                       algorithm = 4,
+                                       method = "igraph")
+  }
+
+  if (visualization_method == "umap") {
+    seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims)
+  }
+  if (visualization_method == "tsne") {
+    seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims)
+  }
+
+  if (visualization_method == "both") {
+    seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims)
+    seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims)
+  }
+
+  return(seurat_obj)
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2104bda
--- /dev/null
+++ b/README.md
+# recall (Calibrated Clustering with Artificial Variables)
+
+[![R CMD check](https://github.com/lcrawlab/recall/actions/workflows/check-standard.yml/badge.svg)](https://github.com/lcrawlab/recall/actions/workflows/check-standard.yml)
+[![Docker Image CI](https://github.com/lcrawlab/recall/actions/workflows/docker-image.yml/badge.svg)](https://github.com/lcrawlab/recall/actions/workflows/docker-image.yml)
+
+## Introduction
+
+Standard single-cell RNA-sequencing (scRNA-seq) pipelines nearly always include unsupervised clustering as a key step in identifying biologically distinct cell types. A follow-up step in these pipelines is to test for differential expression between the identified clusters. When algorithms over-cluster, downstream analyses will produce inflated P-values, resulting in increased false discoveries.
+Here, we present `recall` (Calibrated Clustering with Artificial Variables): a new method for protecting against over-clustering by controlling for the impact of double-dipping.
+Importantly, our approach can be applied to any clustering algorithm (implemented here are the Louvain, Leiden, K-means, and hierarchical clustering algorithms).
+`recall` provides state-of-the-art clustering performance and can rapidly analyze large-scale scRNA-seq studies, even on a personal laptop.
+
+## Installation
+
+You can install the latest development version by using the [devtools](https://CRAN.R-project.org/package=devtools) library. To install this package with devtools, use this command:
+
+```r
+devtools::install_github("lcrawlab/recall")
+```
+
+Although it is not explicitly a dependency, making sure you have `presto` installed will make `recall` much faster.
+
+```r
+devtools::install_github("immunogenomics/presto")
+```
+
+
+## Tutorial
+
+```r
+library(Seurat)
+library(SeuratData)
+
+library(recall)
+
+set.seed(123)
+
+# load pbmc3k dataset
+SeuratData::InstallData("pbmc3k")
+data("pbmc3k")
+
+pbmc3k <- UpdateSeuratObject(pbmc3k)
+
+pbmc3k <- NormalizeData(pbmc3k)
+pbmc3k <- FindVariableFeatures(pbmc3k)
+pbmc3k <- ScaleData(pbmc3k)
+pbmc3k <- RunPCA(pbmc3k)
+pbmc3k <- FindNeighbors(pbmc3k)
+pbmc3k <- RunUMAP(pbmc3k, dims = 1:10)
+
+pbmc_default <- FindClusters(pbmc3k)
+pbmc_recall <- FindClustersRecall(pbmc3k)
+
+DimPlot(pbmc_default) + DimPlot(pbmc_recall)
+```
+
+## Overview of the Method
+
+The `recall` algorithm consists of three simple steps:
+
+1. First, we generate synthetic null variables, inspired by knockoff variables (Barber and Candès, 2015), where we augment the single-cell data being analyzed with "fake" genes that are known not to contribute to any unique cell type.
+2. Second, we perform both preprocessing and clustering on this augmented dataset.
+3. Third, we calibrate the number of inferred clusters by using a hypothesis testing strategy with a data-dependent threshold to determine if there is a statistically significant difference between groups. If any pair of groups does not have statistically significant differences, then re-clustering occurs.
+
+The synthetic genes act as negative control variables; they go through the same analytic steps as the real data and are presented with the same opportunity to be identified as marker genes.
+The `recall` algorithm uses the guiding principle that well-calibrated clusters (i.e., those representing real groups) should have significantly differentially expressed genes after correcting for multiple hypothesis tests, while over-clustered groups will not.
+We use this rule to iteratively re-cluster cells until the inferred clusters are well-calibrated and the observed differences in expression between groups are not due to the effects of double-dipping.
+
+## Relevant Citations
+`recall` is available on bioRxiv [here](https://www.biorxiv.org/content/10.1101/2024.03.08.584180v1).
+
+A. DenAdel, M. Ramseier, A. Navia, A. Shalek, S. Raghavan, P. Winter, A. Amini, and L. Crawford. A knockoff calibration method to avoid over-clustering in single-cell RNA-sequencing. _bioRxiv_.
+
+## Questions and Feedback
+For questions or concerns with `recall`, please contact
+[Alan DenAdel](mailto:alan_denadel@brown.edu) or [Lorin Crawford](mailto:lcrawford@microsoft.com). Any feedback on the software, manuscript, and tutorials is appreciated.
diff --git a/_pkgdown.yml b/_pkgdown.yml
new file mode 100644
index 0000000..934c5da
--- /dev/null
+++ b/_pkgdown.yml
+url: https://lcrawlab.github.io/recall/
+template:
+  bootstrap: 5
\ No newline at end of file
diff --git a/man/FindClustersCountsplit.Rd b/man/FindClustersCountsplit.Rd
new file mode 100644
index 0000000..06e5bd9
--- /dev/null
+++ b/man/FindClustersCountsplit.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/recall.R
+\name{FindClustersCountsplit}
+\alias{FindClustersCountsplit}
+\title{Finds clusters in a Seurat object with count splitting to avoid double-dipping.}
+\usage{
+FindClustersCountsplit(
+  seurat_obj,
+  resolution_start = 0.8,
+  reduction_percentage = 0.2,
+  num_clusters_start = 20,
+  dims = 1:10,
+  algorithm = "louvain",
+  null_method = "ZIP",
+  assay = "RNA",
+  cores = 1,
+  shared_memory_max = 8000 * 1024^2,
+  verbose = TRUE
+)
+}
+\arguments{
+\item{seurat_obj}{The Seurat object that will be analyzed.}
+
+\item{resolution_start}{The starting resolution to be used for the
+clustering algorithm (Louvain and Leiden algorithms).}
+
+\item{reduction_percentage}{The amount that the starting parameter will be
+reduced by after each iteration (between 0 and 1).}
+
+\item{num_clusters_start}{The starting number of clusters to be used for the
+clustering algorithm (K-means and Hierarchical clustering algorithms).}
+
+\item{dims}{The dimensions to use as input features (e.g., 1:10).}
+
+\item{algorithm}{The clustering algorithm to be used.}
+
+\item{null_method}{The generating distribution for the synthetic null
+variables ("ZIP", "NB", "ZIP-copula", "NB-copula", "Poisson-copula", or
+"Gaussian-copula").}
+
+\item{assay}{The assay to generate artificial variables from.}
+
+\item{cores}{The number of cores to compute marker genes in parallel.}
+
+\item{shared_memory_max}{The maximum size for shared global variables.
+Increase this variable if you see the following error:
+The total size of the X globals that need to be exported for the future expression
+('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
+(option 'future.globals.maxSize'). The X largest globals are ...}
+
+\item{verbose}{Whether or not to show all logging.}
+}
+\value{
+Returns a Seurat object where the idents have been updated with the
+clusters determined via the countsplit algorithm.
+The latest clustering results will be stored in the object metadata under
+'countsplit_clusters'. Note that 'countsplit_clusters' will be overwritten
+every time FindClustersCountsplit is run.
+}
+\description{
+Given a Seurat object, splits the counts into training and test matrices,
+clusters the training counts, and iteratively lowers the clustering
+resolution until every pair of clusters shows statistically significant
+differential expression on the held-out test counts.
+} diff --git a/man/FindClustersRecall.Rd b/man/FindClustersRecall.Rd new file mode 100644 index 0000000..598ba9f --- /dev/null +++ b/man/FindClustersRecall.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recall.R +\name{FindClustersRecall} +\alias{FindClustersRecall} +\title{Runs a typical Seurat workflow on a Seurat object (up to +dimensionality reduction and clustering).} +\usage{ +FindClustersRecall( + seurat_obj, + resolution_start = 0.8, + reduction_percentage = 0.2, + num_clusters_start = 20, + dims = 1:10, + algorithm = "louvain", + null_method = "ZIP", + assay = "RNA", + cores = 1, + shared_memory_max = 8000 * 1024^2, + verbose = TRUE +) +} +\arguments{ +\item{seurat_obj}{The Seurat object that will be analyzed.} + +\item{resolution_start}{The starting resolution to be used for the +clustering algorithm (Louvain and Leiden algorithms).} + +\item{reduction_percentage}{The amount that the starting parameter will be +reduced by after each iteration (between 0 and 1).} + +\item{num_clusters_start}{The starting number of clusters to be used for the +clustering algorithm (K-means and Hierarchical clustering algorithms).} + +\item{dims}{The dimensions to use as input features (i.e. 1:10).} + +\item{algorithm}{The clustering algorithm to be used.} + +\item{null_method}{The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)} + +\item{assay}{The assay to generate artificial variables from.} + +\item{cores}{The number of cores to compute marker genes in parallel.} + +\item{shared_memory_max}{The maximum size for shared global variables. +Increased this variable if you see the following error: +The total size of the X globals that need to be exported for the future expression +('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB +(option 'future.globals.maxSize'). The X largest globals are ...} + +\item{verbose}{Whether or not to show all logging.} +} +\value{ +Returns a Seurat object where the idents have been updated with the +clusters determined via the recall algorithm. +Latest clustering results will be stored in the object metadata under +recall_clusters'. Note that 'recall_clusters' will be overwritten ever +time FindClustersRecall is run. +} +\description{ +Given a Seurat object, returns a new Seurat that has been +normalized, had variable features identified, scaled, had principal +components computed, hadclusters identified, and had tSNE and UMAP +embeddings determined. 
+} diff --git a/man/compute_knockoff_filter.Rd b/man/compute_knockoff_filter.Rd new file mode 100644 index 0000000..d5f5321 --- /dev/null +++ b/man/compute_knockoff_filter.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recall.R +\name{compute_knockoff_filter} +\alias{compute_knockoff_filter} +\title{Returns the genes selected by the knockoff filter} +\usage{ +compute_knockoff_filter( + seurat_obj, + cluster1, + cluster2, + q, + return_all = FALSE, + num_cores = 1, + shared_memory_max +) +} +\arguments{ +\item{seurat_obj}{A Seurat object} + +\item{cluster1}{The Idents of the cluster of interest in seurat_obj1} + +\item{cluster2}{The Idents of the cluster of interest in seurat_obj2} + +\item{q}{The desired rate to control the FDR at} + +\item{return_all}{Determines if the returned object will contain all genes +or just the selected genes.} + +\item{num_cores}{The number of cores for computing marker genes in parallel.} + +\item{shared_memory_max}{The maximum size for shared global variables.} +} +\value{ +todo +} +\description{ +Given two Seurat objects, returns the the genes selected by +the knockoff filter and their W statistics. +} diff --git a/man/estimate_negative_binomial.Rd b/man/estimate_negative_binomial.Rd new file mode 100644 index 0000000..b845e0c --- /dev/null +++ b/man/estimate_negative_binomial.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/estimate_negative_binomial.R +\name{estimate_negative_binomial} +\alias{estimate_negative_binomial} +\title{Maximum likelihood estimation for the negative binomial +distribution.} +\usage{ +estimate_negative_binomial(data, verbose = FALSE) +} +\arguments{ +\item{data}{The data to estimate parameters from.} + +\item{verbose}{Whether or not to show all logging.} +} +\value{ +Maximum likelihood estimators size and mu for the negative +binomial distribution +} +\description{ +Given data, computes the maximum likelihood estimators +for the negative binomial distribution with parameters: size and mu. 
+} diff --git a/man/estimate_negative_binomial_copula.Rd b/man/estimate_negative_binomial_copula.Rd new file mode 100644 index 0000000..c655a63 --- /dev/null +++ b/man/estimate_negative_binomial_copula.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/copula.R +\name{estimate_negative_binomial_copula} +\alias{estimate_negative_binomial_copula} +\alias{estimate_zi_poisson_copula} +\alias{estimate_poisson_copula} +\alias{estimate_gaussian_copula} +\title{todo} +\usage{ +estimate_zi_poisson_copula(data_matrix, cores) + +estimate_negative_binomial_copula(data_matrix, cores) + +estimate_poisson_copula(data_matrix, cores) + +estimate_gaussian_copula(data_matrix, cores) +} +\arguments{ +\item{data_matrix}{The data to estimate parameters from.} + +\item{cores}{The number of CPU cores to use in estimation by scDesign3.} +} +\value{ +todo + +todo + +todo + +todo +} +\description{ +Given data, computes todo + +Given data, computes todo + +Given data, computes todo + +Given data, computes todo +} diff --git a/man/estimate_zi_poisson.Rd b/man/estimate_zi_poisson.Rd new file mode 100644 index 0000000..4557dd2 --- /dev/null +++ b/man/estimate_zi_poisson.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/estimate_zipoisson.R +\name{estimate_zi_poisson} +\alias{estimate_zi_poisson} +\title{Maximum likelihood estimation for the zero-inflated Poisson distribution +with Poisson parameter lambda and zero proportion prop.zero.} +\usage{ +estimate_zi_poisson(data) +} +\arguments{ +\item{data}{The data to estimate parameters from.} +} +\value{ +Maximum likelihood estimators of the zero-inflated Poisson +distribution +} +\description{ +Given data, computes the maximum likelihood estimators +for the zero-inflated Poisson distribution. +} diff --git a/man/get_seurat_obj_with_artificial_variables.Rd b/man/get_seurat_obj_with_artificial_variables.Rd new file mode 100644 index 0000000..143699d --- /dev/null +++ b/man/get_seurat_obj_with_artificial_variables.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recall.R +\name{get_seurat_obj_with_artificial_variables} +\alias{get_seurat_obj_with_artificial_variables} +\title{Returns a Seurat object that contains additional (fake) RNA +expression counts.} +\usage{ +get_seurat_obj_with_artificial_variables( + seurat_obj, + assay = "RNA", + null_method = "ZIP", + verbose = TRUE, + cores +) +} +\arguments{ +\item{seurat_obj}{A Seurat object containing RNA expression counts.} + +\item{assay}{The assay to generate artificial variables from.} + +\item{null_method}{The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)} + +\item{verbose}{Whether or not to show logging.} + +\item{cores}{The number of cores to use in generating synthetic null variables.} +} +\value{ +A Seurat object that contains the original variable features and an +equal number of artificial features. +} +\description{ +Given a Seurat object, returns a new Seurat object whose RNA +expression counts includes the +variable features from the original object and an equal number of artificial +features. 
+} diff --git a/man/rzipoisson.Rd b/man/rzipoisson.Rd new file mode 100644 index 0000000..fa6cd0b --- /dev/null +++ b/man/rzipoisson.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/estimate_zipoisson.R +\name{rzipoisson} +\alias{rzipoisson} +\title{Random data generation for the zero-infalted Poisson distribution +with Poisson parameter lambda and zero proportion prop.zero.} +\usage{ +rzipoisson(n, lambda, prop.zero) +} +\arguments{ +\item{n}{The number of samples to be simulated.} + +\item{lambda}{The Poisson rate parameter.} + +\item{prop.zero}{The proportion of excess zeroes.} +} +\value{ +Simulated data from ZIP(lambda, prop.zero). +} +\description{ +Given the number of samples desired, a Poisson parameter, +lambda, and a zero proportion, prop.zero, simulates the number of desired +samples from ZIP(lambda, prop.zero). +} diff --git a/man/seurat_workflow.Rd b/man/seurat_workflow.Rd new file mode 100644 index 0000000..97b16e0 --- /dev/null +++ b/man/seurat_workflow.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/seurat_workflow.R +\name{seurat_workflow} +\alias{seurat_workflow} +\title{Runs a typical Seurat workflow on a Seurat object (up to +dimensionality reduction and clustering).} +\usage{ +seurat_workflow( + seurat_obj, + num_variable_features, + resolution_param = 0.8, + visualization_method = "umap", + num_dims = 10, + algorithm = "louvain" +) +} +\arguments{ +\item{seurat_obj}{A Seurat object that will be analyzed.} + +\item{num_variable_features}{The number of variable features to use in the +analysis.} + +\item{resolution_param}{The resolution parameter to use when clustering.} + +\item{visualization_method}{Either "umap" or "tsne".} + +\item{num_dims}{The number of principal components to use.} + +\item{algorithm}{The clustering algorithm to use, either "louvain" or +"leiden".} +} +\value{ +A Seurat object containing the relevant analysis results. +} +\description{ +Given a Seurat object, returns a new Seurat that has been +normalized, had variable features identified, +scaled, had principal components computed, had clusters identified, and had +tSNE and UMAP embeddings determined. +} diff --git a/vignettes/basic-usage.Rmd b/vignettes/basic-usage.Rmd new file mode 100644 index 0000000..4c42aaa --- /dev/null +++ b/vignettes/basic-usage.Rmd @@ -0,0 +1,73 @@ +--- +title: "Basic Usage on PBMC3k Data" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{basic-usage} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) + +knitr::opts_chunk$set(eval = FALSE) + +``` + +```{r setup} +suppressPackageStartupMessages({ +library(Seurat) +library(SeuratData) +library(recall) +}) +``` + + +First, we use the `SeuratData` data package to first download and then load +2700 PBMCs. The loaded `SeuratObject`, `pbmc3k`, is from an old version of +`Seurat`, and so we update the object to v5. + +```{r load_data} +set.seed(123) + +SeuratData::InstallData("pbmc3k") +data("pbmc3k") + +pbmc3k <- UpdateSeuratObject(pbmc3k) +``` + +Now, we use `Seurat` to perform the usual preprocessing steps that are performed prior to clustering. 
+
+```{r preprocessing}
+pbmc3k <- NormalizeData(pbmc3k)
+pbmc3k <- FindVariableFeatures(pbmc3k)
+pbmc3k <- ScaleData(pbmc3k)
+pbmc3k <- RunPCA(pbmc3k)
+pbmc3k <- FindNeighbors(pbmc3k)
+pbmc3k <- RunUMAP(pbmc3k, dims = 1:10)
+```
+
+The `recall` algorithm can be run with a single function call as a drop-in
+replacement for the `Seurat` function `FindClusters`.
+
+```{r run_recall}
+pbmc3k <- FindClustersRecall(pbmc3k)
+```
+
+The `recall` clusters are set to the idents of the `SeuratObject` that is
+returned by `FindClustersRecall`.
+
+```{r plot_umap}
+DimPlot(pbmc3k)
+```
+
+Cluster labels from `FindClustersRecall` are stored in the metadata in the
+column `pbmc3k@meta.data$recall_clusters`.
+
+```{r plot_umap2}
+DimPlot(pbmc3k, group.by = "recall_clusters")
+```
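+
+## Choosing a null distribution
+
+By default, `FindClustersRecall` fits a zero-inflated Poisson ("ZIP") model to
+each variable gene to generate the synthetic null genes. The `null_method`
+argument also accepts "NB" as well as scDesign3-based copula options
+("ZIP-copula", "NB-copula", "Poisson-copula", and "Gaussian-copula") that
+additionally model gene-gene correlation. A minimal sketch (not evaluated
+here; the copula options are slower and benefit from multiple cores):
+
+```{r null_methods, eval = FALSE}
+# negative binomial marginals instead of the default zero-inflated Poisson
+pbmc3k_nb <- FindClustersRecall(pbmc3k, null_method = "NB")
+
+# copula-based nulls preserve gene-gene correlation; these call scDesign3
+pbmc3k_cop <- FindClustersRecall(pbmc3k, null_method = "NB-copula", cores = 4)
+```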