From a9394392ada67919a96920b533a31e7c666b2a8e Mon Sep 17 00:00:00 2001
From: agdenadel
Date: Wed, 25 Sep 2024 13:37:58 -0700
Subject: [PATCH] Initial commit of recall (compared to callback: adds various
 null distributions, renames method)

---
 .Rbuildignore                                 |   7 +
 .github/.gitignore                            |   1 +
 .github/workflows/check-standard.yml          |  49 ++
 .github/workflows/docker-image.yml            |  18 +
 .github/workflows/pkgdown.yaml                |  48 ++
 .github/workflows/super-linter.yml            |  29 +
 .gitignore                                    |  54 ++
 DESCRIPTION                                   |  36 ++
 Dockerfile                                    |   4 +
 LICENSE                                       |   2 +
 LICENSE.md                                    |  21 +
 NAMESPACE                                     |   5 +
 R/copula.R                                    |  92 +++
 R/countsplit.R                                |   0
 R/estimate_negative_binomial.R                | 186 ++++++
 R/estimate_zipoisson.R                        |  60 ++
 R/recall.R                                    | 591 ++++++++++++++++++
 R/seurat_workflow.R                           |  66 ++
 README.md                                     |  75 +++
 _pkgdown.yml                                  |   3 +
 man/FindClustersCountsplit.Rd                 |  64 ++
 man/FindClustersRecall.Rd                     |  64 ++
 man/compute_knockoff_filter.Rd                |  39 ++
 man/estimate_negative_binomial.Rd             |  22 +
 man/estimate_negative_binomial_copula.Rd      |  40 ++
 man/estimate_zi_poisson.Rd                    |  20 +
 ...et_seurat_obj_with_artificial_variables.Rd |  36 ++
 man/rzipoisson.Rd                             |  24 +
 man/seurat_workflow.Rd                        |  40 ++
 vignettes/basic-usage.Rmd                     |  73 +++
 30 files changed, 1769 insertions(+)
 create mode 100644 .Rbuildignore
 create mode 100644 .github/.gitignore
 create mode 100644 .github/workflows/check-standard.yml
 create mode 100644 .github/workflows/docker-image.yml
 create mode 100644 .github/workflows/pkgdown.yaml
 create mode 100644 .github/workflows/super-linter.yml
 create mode 100644 .gitignore
 create mode 100644 DESCRIPTION
 create mode 100644 Dockerfile
 create mode 100644 LICENSE
 create mode 100644 LICENSE.md
 create mode 100644 NAMESPACE
 create mode 100644 R/copula.R
 create mode 100644 R/countsplit.R
 create mode 100644 R/estimate_negative_binomial.R
 create mode 100644 R/estimate_zipoisson.R
 create mode 100644 R/recall.R
 create mode 100644 R/seurat_workflow.R
 create mode 100644 README.md
 create mode 100644 _pkgdown.yml
 create mode 100644 man/FindClustersCountsplit.Rd
 create mode 100644 man/FindClustersRecall.Rd
 create mode 100644 man/compute_knockoff_filter.Rd
 create mode 100644 man/estimate_negative_binomial.Rd
 create mode 100644 man/estimate_negative_binomial_copula.Rd
 create mode 100644 man/estimate_zi_poisson.Rd
 create mode 100644 man/get_seurat_obj_with_artificial_variables.Rd
 create mode 100644 man/rzipoisson.Rd
 create mode 100644 man/seurat_workflow.Rd
 create mode 100644 vignettes/basic-usage.Rmd

diff --git a/.Rbuildignore b/.Rbuildignore
new file mode 100644
index 0000000..3cb2e90
--- /dev/null
+++ b/.Rbuildignore
+^LICENSE\.md$
+^Dockerfile$
+^tmp$
+^_pkgdown\.yml$
+^docs$
+^pkgdown$
+^\.github$
\ No newline at end of file
diff --git a/.github/.gitignore b/.github/.gitignore
new file mode 100644
index 0000000..2d19fc7
--- /dev/null
+++ b/.github/.gitignore
+*.html
diff --git a/.github/workflows/check-standard.yml b/.github/workflows/check-standard.yml
new file mode 100644
index 0000000..74d8c97
--- /dev/null
+++ b/.github/workflows/check-standard.yml
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+name: R-CMD-check
+
+jobs:
+  R-CMD-check:
+    runs-on: ${{ matrix.config.os }}
+
+    name: ${{ matrix.config.os }} (${{ matrix.config.r }})
+
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {os: macos-latest,   r: 'release'}
+          - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
+          - {os: ubuntu-latest,   r: 'release'}
+          - {os: ubuntu-latest,   r: 'oldrel-1'}
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      R_KEEP_PKG_SOURCE: yes
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: ${{ matrix.config.r }}
+          http-user-agent: ${{ matrix.config.http-user-agent }}
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck
+          needs: check
+
+      - uses: r-lib/actions/check-r-package@v2
+        with:
+          upload-snapshots: true
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
new file mode 100644
index 0000000..df305df
--- /dev/null
+++ b/.github/workflows/docker-image.yml
+name: Docker Image CI
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Build the Docker image
+      run: docker build . --file Dockerfile --tag recall:$(date +%s)
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
new file mode 100644
index 0000000..a7276e8
--- /dev/null
+++ b/.github/workflows/pkgdown.yaml
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+  release:
+    types: [published]
+  workflow_dispatch:
+
+name: pkgdown
+
+jobs:
+  pkgdown:
+    runs-on: ubuntu-latest
+    # Only restrict concurrency for non-PR jobs
+    concurrency:
+      group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::pkgdown, local::.
+          needs: website
+
+      - name: Build site
+        run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
+        shell: Rscript {0}
+
+      - name: Deploy to GitHub pages 🚀
+        if: github.event_name != 'pull_request'
+        uses: JamesIves/github-pages-deploy-action@v4.5.0
+        with:
+          clean: false
+          branch: gh-pages
+          folder: docs
diff --git a/.github/workflows/super-linter.yml b/.github/workflows/super-linter.yml
new file mode 100644
index 0000000..ca69cec
--- /dev/null
+++ b/.github/workflows/super-linter.yml
+# This workflow executes several linters on changed files based on languages used in your code base whenever
+# you push code or open a pull request.
+#
+# You can adjust the behavior by modifying this file.
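+# For example, setting VALIDATE_ALL_CODEBASE below to true lints the entire
+# repository instead of only the files changed in a push or pull request.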
+# For more information, see:
+# https://github.com/github/super-linter
+name: Lint Code Base
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+jobs:
+  run-lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+        with:
+          # Full git history is needed to get a proper list of changed files within `super-linter`
+          fetch-depth: 0
+
+      - name: Lint Code Base
+        uses: github/super-linter@v4
+        env:
+          VALIDATE_ALL_CODEBASE: false
+          DEFAULT_BRANCH: "main"
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2adc561
--- /dev/null
+++ b/.gitignore
+# History files
+.Rhistory
+.Rapp.history
+
+# Session Data files
+.RData
+.RDataTmp
+
+# User-specific files
+.Ruserdata
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# R Environment Variables
+.Renviron
+
+# pkgdown site
+docs/
+
+# translation temp files
+po/*~
+
+# RStudio Connect folder
+rsconnect/
+.Rproj.user
+.Rdata
+.DS_Store
+docs
+inst/doc
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..d8dabbf
--- /dev/null
+++ b/DESCRIPTION
+Package: recall
+Title: Calibrated clustering with artificial variables to avoid over-clustering in single-cell RNA-sequencing
+Version: 0.0.0
+Authors@R:
+    person("Alan", "DenAdel", , "alan_denadel@brown.edu", role = c("aut", "cre"),
+           comment = c(ORCID = "0000-0002-7985-6789"))
+Description: recall (Calibrated Clustering with Artificial Variables) is a method for protecting
+    against over-clustering by controlling for the impact of double-dipping. The approach
+    can be applied to any clustering algorithm (the Louvain and Leiden algorithms are
+    implemented, with plans for K-means and hierarchical clustering). The method provides
+    state-of-the-art clustering performance, can rapidly analyze large-scale scRNA-seq
+    studies, and is compatible with the Seurat library.
+Encoding: UTF-8
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.3.2
+Imports:
+    Matrix,
+    Seurat (>= 5.0.1),
+    SingleCellExperiment,
+    scDesign3,
+    SummarizedExperiment,
+    MASS,
+    fitdistrplus,
+    lamW,
+    knockoff,
+    future,
+    stats,
+    cli,
+    stringr,
+    countsplit
+License: MIT + file LICENSE
+Suggests:
+    knitr,
+    rmarkdown
+VignetteBuilder: knitr
+URL: https://lcrawlab.github.io/recall/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..93727f6
--- /dev/null
+++ b/Dockerfile
+FROM rocker/verse:4.0.5
+
+COPY . /recall
+RUN R -e "install.packages('/recall', type = 'source', repos = NULL)"
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6459969
--- /dev/null
+++ b/LICENSE
+YEAR: 2024
+COPYRIGHT HOLDER: recall authors
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..f6addd8
--- /dev/null
+++ b/LICENSE.md
+# MIT License
+
+Copyright (c) 2024 recall authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..c817b7f
--- /dev/null
+++ b/NAMESPACE
+# Generated by roxygen2: do not edit by hand
+
+export(FindClustersCountsplit)
+export(FindClustersRecall)
+export(seurat_workflow)
diff --git a/R/copula.R b/R/copula.R
new file mode 100644
index 0000000..2a5344b
--- /dev/null
+++ b/R/copula.R
+simulate_data_scDesign3 <- function(data_matrix, cores, family) {
+  sce <- SingleCellExperiment::SingleCellExperiment(list(counts = data_matrix))
+  SummarizedExperiment::colData(sce)$cell_type <- "1" # scDesign3 needs a cell type so we just make it the same for all cells
+
+  simulated_data <- scDesign3::scdesign3(sce,
+                                         celltype = "cell_type",
+                                         pseudotime = NULL,
+                                         spatial = NULL,
+                                         other_covariates = NULL,
+                                         empirical_quantile = FALSE,
+                                         usebam = TRUE, # to speed up marginal inference
+                                         mu_formula = "1",
+                                         sigma_formula = "1",
+                                         corr_formula = "1",
+                                         family_use = family, # this is the key parameter
+                                         nonzerovar = FALSE,
+                                         n_cores = cores,
+                                         parallelization = "mcmapply",
+                                         important_feature = "all",
+                                         nonnegative = FALSE,
+                                         copula = "gaussian",
+                                         fastmvn = TRUE)
+
+  ko <- simulated_data$new_count
+
+  return(ko)
+}
+
+
+#' @title Simulate synthetic null data from a copula model via scDesign3.
+#'
+#' @description Given data, simulates a synthetic null copy of the data using a
+#' zero-inflated Poisson marginal for each gene and a Gaussian copula between
+#' genes, via scDesign3.
+#'
+#' @param data_matrix The data to estimate parameters from.
+#' @param cores The number of CPU cores to use in estimation by scDesign3.
+#' @returns A matrix of synthetic null data with the same dimensions as data_matrix.
+#' @name estimate_negative_binomial_copula
+estimate_zi_poisson_copula <- function(data_matrix, cores) {
+  family <- "zip"
+  ko <- simulate_data_scDesign3(data_matrix, cores, family)
+  return(ko)
+}
+
+
+#' @title Simulate synthetic null data from a copula model via scDesign3.
+#'
+#' @description Given data, simulates a synthetic null copy of the data using a
+#' negative binomial marginal for each gene and a Gaussian copula between
+#' genes, via scDesign3.
+#'
+#' @param data_matrix The data to estimate parameters from.
+#' @param cores The number of CPU cores to use in estimation by scDesign3.
+#' @returns A matrix of synthetic null data with the same dimensions as data_matrix.
+#' @name estimate_negative_binomial_copula
+estimate_negative_binomial_copula <- function(data_matrix, cores) {
+  family <- "nb"
+  ko <- simulate_data_scDesign3(data_matrix, cores, family)
+  return(ko)
+}
+
+
+#' @title Simulate synthetic null data from a copula model via scDesign3.
+#'
+#' @description Given data, simulates a synthetic null copy of the data using a
+#' Poisson marginal for each gene and a Gaussian copula between genes, via
+#' scDesign3.
+#'
+#' @param data_matrix The data to estimate parameters from.
+#' @param cores The number of CPU cores to use in estimation by scDesign3.
+#' @returns A matrix of synthetic null data with the same dimensions as data_matrix.
+#' @name estimate_negative_binomial_copula
+estimate_poisson_copula <- function(data_matrix, cores) {
+  family <- "poisson"
+  ko <- simulate_data_scDesign3(data_matrix, cores, family)
+  return(ko)
+}
+
+
+#' @title Simulate synthetic null data from a copula model via scDesign3.
+#'
+#' @description Given data, simulates a synthetic null copy of the data using a
+#' Gaussian marginal for each gene and a Gaussian copula between genes, via
+#' scDesign3.
+#'
+#' @param data_matrix The data to estimate parameters from.
+#' @param cores The number of CPU cores to use in estimation by scDesign3.
+#' @returns A matrix of synthetic null data with the same dimensions as data_matrix.
+#' @name estimate_negative_binomial_copula
+estimate_gaussian_copula <- function(data_matrix, cores) {
+  family <- "gaussian"
+  ko <- simulate_data_scDesign3(data_matrix, cores, family)
+  return(ko)
+}
diff --git a/R/countsplit.R b/R/countsplit.R
new file mode 100644
index 0000000..e69de29
diff --git a/R/estimate_negative_binomial.R b/R/estimate_negative_binomial.R
new file mode 100644
index 0000000..5b9cbea
--- /dev/null
+++ b/R/estimate_negative_binomial.R
+
+#' @title Maximum likelihood estimation for the negative binomial
+#' distribution.
+#'
+#' @description Given data, computes the maximum likelihood estimators
+#' for the negative binomial distribution with parameters: size and mu.
+#'
+#' @param data The data to estimate parameters from.
+#' @param verbose Whether or not to show all logging.
+#' @returns Maximum likelihood estimators size and mu for the negative
+#' binomial distribution
+#' @name estimate_negative_binomial
+estimate_negative_binomial <- function(data, verbose = FALSE) {
+
+  if (verbose) { message("Attempting MLE method 1") }
+  mle1 <- tryCatch(
+    {
+      nb_fit <- MASS::fitdistr(data, "negative binomial", method = "Nelder-Mead")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("MLE method 1 failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("MLE method 1 had a warning. Warning message:\n") }
+      if (verbose) { message(cond) }
+      if (verbose) { message("\n") }
+      NA
+    }
+  )
+
+  if (verbose) { message("Attempting MLE method 2") }
+  mle2 <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "mle")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+
+    },
+    error = function(cond) {
+      if (verbose) { message("MLE method 2 failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("MLE method 2 had a warning. Warning message:") }
+      if (verbose) { message(cond) }
+      NA
+    }
+  )
+
+  if (verbose) { message("Attempting MME") }
+  mme <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "mme")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("MME failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("MME had a warning. Warning message:") }
+      if (verbose) { message(cond) }
+      NA
+    }
+  )
+
+
+  if (verbose) { message("Attempting MME with warnings") }
+  # retry MME, this time accepting results that only raise warnings
+  mme2 <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "mme")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("MME failed with an error.") }
+      NA
+    }
+  )
+
+  if (verbose) { message("Attempting MSE") }
+  mse <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "mse")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("MSE failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("MSE had a warning. Warning message:") }
+      if (verbose) { message(cond) }
+      NA
+    }
+  )
+
+  if (verbose) { message("Attempting QME") }
+  qme <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "qme")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("QME failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("QME had a warning. Warning message:") }
+      if (verbose) { message(cond) }
+      NA
+    }
+  )
+
+
+  if (verbose) { message("Attempting MGE") }
+  mge <- tryCatch(
+    {
+      nb_fit <- fitdistrplus::fitdist(data, "nbinom", method = "mge")
+      size <- nb_fit$estimate[["size"]]
+      mu <- nb_fit$estimate[["mu"]]
+
+      # check if method returned NaN or NA without throwing an error
+      if (is.na(mu) || is.na(size)) { stop() }
+
+      return.list <- list("size" = size, "mu" = mu)
+      return(return.list)
+    },
+    error = function(cond) {
+      if (verbose) { message("MGE failed with an error.") }
+      NA
+    },
+    warning = function(cond) {
+      if (verbose) { message("MGE had a warning. Warning message:") }
+      if (verbose) { message(cond) }
+      NA
+    }
+  )
+
+
+
+  stop("All negative binomial estimation methods failed.")
+
+}
diff --git a/R/estimate_zipoisson.R b/R/estimate_zipoisson.R
new file mode 100644
index 0000000..070de4a
--- /dev/null
+++ b/R/estimate_zipoisson.R
+
+# https://en.wikipedia.org/wiki/Zero-inflated_model#Estimators_of_ZIP_parameters
+# https://math.stackexchange.com/questions/2761563/maximum-likelihood-estimation-for-zero-inflated-poisson-distribution
+# https://ieeexplore.ieee.org/document/9032203
+
+#' @title Maximum likelihood estimation for the zero-inflated Poisson distribution
+#' with Poisson parameter lambda and zero proportion prop.zero.
+#'
+#' @description Given data, computes the maximum likelihood estimators
+#' for the zero-inflated Poisson distribution.
+#'
+#' @param data The data to estimate parameters from.
+#' @returns Maximum likelihood estimators of the zero-inflated Poisson
+#' distribution
+#' @name estimate_zi_poisson
+estimate_zi_poisson <- function(data) {
+  num.zeros <- sum(data == 0)
+  r0 <- 1 / length(data) * num.zeros
+
+  x.bar <- mean(data)
+
+  gamma <- x.bar / (1 - r0)
+
+  lambda.hat <- lamW::lambertW0(-gamma * exp(-gamma)) + gamma
+
+  pi.hat <- 1 - x.bar / lambda.hat
+
+
+  return.list <- list("lambda.hat" = lambda.hat, "pi.hat" = pi.hat)
+  return(return.list)
+}
+
+
+#' @title Random data generation for the zero-inflated Poisson distribution
+#' with Poisson parameter lambda and zero proportion prop.zero.
+#'
+#' @description Given the number of samples desired, a Poisson parameter,
+#' lambda, and a zero proportion, prop.zero, simulates the number of desired
+#' samples from ZIP(lambda, prop.zero).
+#'
+#' @param n The number of samples to be simulated.
+#' @param lambda The Poisson rate parameter.
+#' @param prop.zero The proportion of excess zeroes.
+#' @returns Simulated data from ZIP(lambda, prop.zero).
+#' @name rzipoisson
+rzipoisson <- function(n, lambda, prop.zero) {
+  data <- c()
+
+
+  for (i in 1:n) {
+    if (stats::runif(1) < prop.zero) {
+      data[i] <- 0
+    }
+    else {
+      data[i] <- stats::rpois(1, lambda)
+    }
+  }
+  return(data)
+}
diff --git a/R/recall.R b/R/recall.R
new file mode 100644
index 0000000..969bf84
--- /dev/null
+++ b/R/recall.R
+
+#' @title Returns a Seurat object that contains additional (fake) RNA
+#' expression counts.
+#'
+#' @description Given a Seurat object, returns a new Seurat object whose RNA
+#' expression counts include the
+#' variable features from the original object and an equal number of artificial
+#' features.
+#'
+#' @param seurat_obj A Seurat object containing RNA expression counts.
+#' @param assay The assay to generate artificial variables from.
+#' @param null_method The generating distribution for the synthetic null
+#' variables ("ZIP", "NB", "ZIP-copula", "NB-copula", "Poisson-copula", or
+#' "Gaussian-copula").
+#' @param cores The number of cores to use in generating synthetic null variables.
+#' @param verbose Whether or not to show logging.
+#' @returns A Seurat object that contains the original variable features and an
+#' equal number of artificial features.
+#' @name get_seurat_obj_with_artificial_variables
+get_seurat_obj_with_artificial_variables <- function(seurat_obj, assay = "RNA", null_method = "ZIP", verbose = TRUE, cores) {
+
+  if (verbose) {
+    message("Pulling data from Seurat object")
+  }
+
+  var_features <- Seurat::VariableFeatures(seurat_obj)
+  seurat_obj_data <- as.data.frame(t(as.matrix(Seurat::GetAssayData(seurat_obj, assay = assay, layer = "counts")[var_features, ])))
+
+  if (verbose) {
+    message("Computing artificial features")
+  }
+
+  if (null_method == "ZIP") {
+    estimates <- lapply(seurat_obj_data, estimate_zi_poisson)
+    sampling_function <- function(x) {
+      rzipoisson(nrow(seurat_obj_data),
+                 x$lambda.hat,
+                 x$pi.hat)
+    }
+    ko <- as.data.frame(lapply(estimates, sampling_function))
+
+  }
+  else if (null_method == "NB") {
+    estimates <- lapply(seurat_obj_data, estimate_negative_binomial)
+    sampling_function <- function(x) {
+      stats::rnbinom(nrow(seurat_obj_data),
+                     size = x$size,
+                     mu = x$mu)
+    }
+    ko <- as.data.frame(lapply(estimates, sampling_function))
+  }
+  else if (null_method == "ZIP-copula") {
+    ko <- estimate_zi_poisson_copula(seurat_obj_data, cores)
+  }
+  else if (null_method == "NB-copula") {
+    ko <- estimate_negative_binomial_copula(seurat_obj_data, cores)
+  }
+  else if (null_method == "Poisson-copula") {
+    ko <- estimate_poisson_copula(seurat_obj_data, cores)
+  }
+  else if (null_method == "Gaussian-copula") {
+    ko <- estimate_gaussian_copula(seurat_obj_data, cores)
+  }
+  else {
+    stop("You selected a null_method that is not supported. Choose from: ZIP, NB, ZIP-copula, NB-copula, Poisson-copula, Gaussian-copula.")
+  }
+
+
+  num_variable_features <- length(var_features)
+  colnames(ko) <- paste0(rep("knockoff", num_variable_features), 1:num_variable_features)
+  combined_data <- cbind(seurat_obj_data, ko)
+
+  # sparsify augmented data matrix and transpose for use in Seurat
+  combined_data <- Matrix::Matrix(t(combined_data), sparse = TRUE)
+
+  new_project_name <- paste0(seurat_obj@project.name, "_with_knockoffs")
+  new_seurat_obj <- Seurat::CreateSeuratObject(counts = combined_data, project = new_project_name)
+
+  return(new_seurat_obj)
+}
+
+
+
+
+
+
+#' @title Returns the genes selected by the knockoff filter
+#'
+#' @description Given a Seurat object and two cluster identities, returns the
+#' genes selected by the knockoff filter and their W statistics.
+#'
+#' @param seurat_obj A Seurat object
+#' @param cluster1 The identity of the first cluster of interest.
+#' @param cluster2 The identity of the second cluster of interest.
+#' @param q The desired rate to control the FDR at
+#' @param return_all Determines if the returned object will contain all genes
+#' or just the selected genes.
+#' @param num_cores The number of cores for computing marker genes in parallel.
+#' @param shared_memory_max The maximum size for shared global variables.
+#' @returns A list containing a data frame of the selected genes and their W
+#' statistics (all genes when return_all is TRUE), along with the knockoff
+#' filter threshold.
+#' @name compute_knockoff_filter
+compute_knockoff_filter <- function(seurat_obj,
+                                    cluster1,
+                                    cluster2,
+                                    q,
+                                    return_all = FALSE,
+                                    num_cores = 1,
+                                    shared_memory_max) {
+  options(future.globals.maxSize = shared_memory_max)
+  # allow FindMarkers to compute marker genes in parallel across cores
+  future::plan("multicore", workers = as.numeric(num_cores))
+
+  markers <- Seurat::FindMarkers(seurat_obj,
+                                 ident.1 = cluster1,
+                                 ident.2 = cluster2,
+                                 logfc.threshold = 0,
+                                 min.pct = 0)
+
+
+  # FindMarkers orders by p-value, so we can't rely on position to know which genes are which
+  knockoff_indices <- grepl("^knockoff", rownames(markers))
+  original_indices <- !knockoff_indices
+
+  # subset the markers data.frame into originals and knockoffs
+  knockoff_markers <- markers[knockoff_indices, ]
+  original_markers <- markers[original_indices, ]
+
+  all_genes <- rownames(seurat_obj)
+
+  # get indices of knockoffs and originals from seurat_obj, should be [FALSE, ..., FALSE, TRUE, ..., TRUE]
+  knockoff_indices_sorted <- grepl("^knockoff", all_genes)
+  original_indices_sorted <- !knockoff_indices_sorted
+
+  knockoff_names_sorted <- all_genes[knockoff_indices_sorted]
+  original_names_sorted <- all_genes[original_indices_sorted]
+
+  # sort markers data.frames by their original orderings
+  knockoff_markers_sorted <- knockoff_markers[knockoff_names_sorted, ]
+  original_markers_sorted <- original_markers[original_names_sorted, ]
+
+  original_p_values <- original_markers_sorted$p_val
+  knockoff_p_values <- knockoff_markers_sorted$p_val
+
+  log_original_p_values <- -log10(original_p_values)
+  log_knockoff_p_values <- -log10(knockoff_p_values)
+
+  W <- log_original_p_values - log_knockoff_p_values
+
+  thres <- knockoff::knockoff.threshold(W, fdr = q, offset = 1)
+
+
+  if (return_all) {
+    all_features <- as.data.frame(list("gene" = original_names_sorted, "W" = W))
+
+    ret <- list("all_features" = all_features, "threshold" = thres)
+
+    return(ret)
+  }
+  selected_indices <- which(W >= thres) # todo check if this should be > (case where threshold is Inf, but there are still some Inf -log p)
+
+  selected_genes <- original_names_sorted[selected_indices]
+  selected_Ws <- W[selected_indices]
+
+  selected_features <- as.data.frame(list("selected_gene" = selected_genes, "W" = selected_Ws))
+
+  selected_features <- selected_features[order(selected_features$W, decreasing = TRUE), ]
+
+  ret <- list("selected_features" = selected_features, "threshold" = thres)
+
+  return(ret)
+}
+
+
+
+
+
+
+
+
+#' @title Finds calibrated clusters in a Seurat object with the recall algorithm.
+#'
+#' @description Given a preprocessed Seurat object, augments the data with
+#' synthetic null genes, clusters the augmented data, and iteratively lowers
+#' the clustering resolution until every pair of clusters shows statistically
+#' significant differential expression under the knockoff filter.
+#'
+#' @param seurat_obj The Seurat object that will be analyzed.
+#' @param resolution_start The starting resolution to be used for the
+#' clustering algorithm (Louvain and Leiden algorithms).
+#' @param num_clusters_start The starting number of clusters to be used for the
+#' clustering algorithm (K-means and Hierarchical clustering algorithms).
+#' @param reduction_percentage The amount that the starting parameter will be
+#' reduced by after each iteration (between 0 and 1).
+#' @param dims The dimensions to use as input features (e.g., 1:10).
+#' @param algorithm The clustering algorithm to be used.
+#' @param null_method The generating distribution for the synthetic null
+#' variables ("ZIP", "NB", "ZIP-copula", "NB-copula", "Poisson-copula", or
+#' "Gaussian-copula").
+#' @param assay The assay to generate artificial variables from.
+#' @param cores The number of cores to compute marker genes in parallel.
+#' @param shared_memory_max The maximum size for shared global variables.
+#' Increase this variable if you see the following error:
+#' The total size of the X globals that need to be exported for the future expression
+#' ('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
+#' (option 'future.globals.maxSize'). The X largest globals are ...
+#' @param verbose Whether or not to show all logging.
+#' @returns Returns a Seurat object where the idents have been updated with the
+#' clusters determined via the recall algorithm.
+#' The latest clustering results will be stored in the object metadata under
+#' 'recall_clusters'. Note that 'recall_clusters' will be overwritten every
+#' time FindClustersRecall is run.
+#' @name FindClustersRecall
+#' @export
+FindClustersRecall <- function(seurat_obj,
+                               resolution_start = 0.8,
+                               reduction_percentage = 0.2,
+                               num_clusters_start = 20,
+                               dims = 1:10,
+                               algorithm = "louvain", # todo implement all algos
+                               null_method = "ZIP",
+                               assay = "RNA",
+                               cores = 1,
+                               shared_memory_max = 8000 * 1024^2,
+                               verbose = TRUE) {
+
+  # todo check function arguments for validity
+
+  augmented_with_artificial_variables_seurat_obj <- get_seurat_obj_with_artificial_variables(seurat_obj,
+                                                                                             assay = assay,
+                                                                                             null_method = null_method,
+                                                                                             verbose = verbose,
+                                                                                             cores = cores)
+
+  num_variable_features <- 2 * length(Seurat::VariableFeatures(seurat_obj))
+
+
+  # Pre-process data
+  options(future.globals.maxSize = shared_memory_max)
+  future::plan("multicore", workers = as.numeric(cores))
+
+  if (verbose) {
+    message(paste("Number of cores:", cores))
+  }
+
+  augmented_with_artificial_variables_seurat_obj <- Seurat::NormalizeData(augmented_with_artificial_variables_seurat_obj,
+                                                                          verbose = FALSE)
+
+  augmented_with_artificial_variables_seurat_obj <- Seurat::FindVariableFeatures(augmented_with_artificial_variables_seurat_obj,
+                                                                                 selection.method = "vst",
+                                                                                 nfeatures = num_variable_features,
+                                                                                 verbose = FALSE)
+
+  augmented_with_artificial_variables_seurat_obj <- Seurat::ScaleData(augmented_with_artificial_variables_seurat_obj, verbose = FALSE)
+  augmented_with_artificial_variables_seurat_obj <- Seurat::RunPCA(augmented_with_artificial_variables_seurat_obj,
+                                                                   features = Seurat::VariableFeatures(object = augmented_with_artificial_variables_seurat_obj),
+                                                                   verbose = FALSE)
+
+  augmented_with_artificial_variables_seurat_obj <- Seurat::FindNeighbors(augmented_with_artificial_variables_seurat_obj,
+                                                                          dims = dims,
+                                                                          verbose = FALSE)
+
+  resolution_param <- resolution_start
+
+
+  first_iteration <- TRUE
+
+  while (TRUE) {
+    if (verbose) {
+      message("####################################################################")
+      message(paste("Finding clusters with", stringr::str_to_title(algorithm), "algorithm"))
+      message(paste("Resolution param:", resolution_param))
+    }
+
+    if (algorithm == "louvain") {
+      augmented_with_artificial_variables_seurat_obj <- Seurat::FindClusters(augmented_with_artificial_variables_seurat_obj,
+                                                                             resolution = resolution_param,
+                                                                             verbose = FALSE)
+    }
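+
+    # Note on the Leiden branch below: Seurat dispatches Leiden when
+    # algorithm = 4, and method = "igraph" keeps the partitioning on the sparse
+    # graph representation, which is typically lighter on memory than the
+    # default dense-matrix method for large datasets.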
+
+    if (algorithm == "leiden") {
+      #plan("sequential") # todo log number of cores being used # this is a weird one because leiden has a forked job hanging
+      augmented_with_artificial_variables_seurat_obj <- Seurat::FindClusters(augmented_with_artificial_variables_seurat_obj,
+                                                                             resolution = resolution_param,
+                                                                             algorithm = 4,
+                                                                             method = "igraph",
+                                                                             verbose = FALSE)
+    }
+
+    # Reduce resolution for next iteration of the loop
+    resolution_param <- (1 - reduction_percentage) * resolution_param
+
+    k <- length(levels(Seurat::Idents(augmented_with_artificial_variables_seurat_obj)))
+
+    if (verbose) {
+      message("Num clusters:")
+      message(k)
+    }
+
+    knock_idents <- levels(Seurat::Idents(augmented_with_artificial_variables_seurat_obj))
+
+    num_selected_matrix <- matrix(nrow = k, ncol = k)
+
+    found_no_sign_diff <- FALSE
+
+    num_clusters <- length(knock_idents)
+
+
+    if (verbose) {
+      progress_bar_length <- num_clusters * (num_clusters - 1) / 2
+      cli::cli_progress_bar("Processing cluster pairs:",
+                            total = progress_bar_length,
+                            clear = FALSE)
+    }
+
+    m <- 0
+    for (i in 1:num_clusters) {
+      for (j in 1:num_clusters) {
+        if (j >= i) {
+          next
+        }
+
+        m <- m + 1
+
+        if (verbose) {
+          cli::cli_progress_update()
+        }
+
+        markers_selected <- compute_knockoff_filter(seurat_obj = augmented_with_artificial_variables_seurat_obj,
+                                                    cluster1 = knock_idents[i],
+                                                    cluster2 = knock_idents[j],
+                                                    q = 0.05,
+                                                    num_cores = cores,
+                                                    shared_memory_max = shared_memory_max)
+
+        num_selected <- nrow(markers_selected$selected_features)
+
+        if (num_selected == 0) {
+          found_no_sign_diff <- TRUE
+          break
+        }
+
+        num_selected_matrix[i, j] <- num_selected
+        num_selected_matrix[j, i] <- num_selected
+
+      }
+      if (found_no_sign_diff) {
+        if (verbose) {
+          cli::cli_progress_done()
+          message("Found clusters with no significant differences.")
+          message("Progressing to next clustering iteration.")
+        }
+        first_iteration <- FALSE
+        break
+      }
+    }
+
+    if (found_no_sign_diff) {
+      next
+    }
+    break
+  }
+
+  if (first_iteration) {
+    warning("Only a single iteration occurred. The inferred cluster labels may be underclustered. To prevent this, you may want to re-run recall with a larger starting parameter.")
+  }
+
+  seurat_obj@meta.data$recall_clusters <- Seurat::Idents(augmented_with_artificial_variables_seurat_obj)
+  Seurat::Idents(seurat_obj) <- seurat_obj@meta.data$recall_clusters
+
+  return(seurat_obj)
+}
+
+
+
+
+
+#' @title Finds clusters in a Seurat object with count splitting to avoid double-dipping.
+#'
+#' @description Given a Seurat object, splits the counts into training and test
+#' matrices, clusters the training counts, and iteratively lowers the
+#' clustering resolution until every pair of clusters shows statistically
+#' significant differential expression on the held-out test counts.
+#'
+#' @param seurat_obj The Seurat object that will be analyzed.
+#' @param resolution_start The starting resolution to be used for the
+#' clustering algorithm (Louvain and Leiden algorithms).
+#' @param num_clusters_start The starting number of clusters to be used for the
+#' clustering algorithm (K-means and Hierarchical clustering algorithms).
+#' @param reduction_percentage The amount that the starting parameter will be
+#' reduced by after each iteration (between 0 and 1).
+#' @param dims The dimensions to use as input features (e.g., 1:10).
+#' @param algorithm The clustering algorithm to be used.
+#' @param null_method The generating distribution for the synthetic null
+#' variables ("ZIP", "NB", "ZIP-copula", "NB-copula", "Poisson-copula", or
+#' "Gaussian-copula").
+#' @param assay The assay to generate artificial variables from.
+#' @param cores The number of cores to compute marker genes in parallel.
+#' @param shared_memory_max The maximum size for shared global variables.
+#' Increase this variable if you see the following error:
+#' The total size of the X globals that need to be exported for the future expression
+#' ('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
+#' (option 'future.globals.maxSize'). The X largest globals are ...
+#' @param verbose Whether or not to show all logging.
+#' @returns Returns a Seurat object where the idents have been updated with the
+#' clusters determined via the countsplit algorithm.
+#' The latest clustering results will be stored in the object metadata under
+#' 'countsplit_clusters'. Note that 'countsplit_clusters' will be overwritten
+#' every time FindClustersCountsplit is run.
+#' @name FindClustersCountsplit
+#' @export
+FindClustersCountsplit <- function(seurat_obj,
+                                   resolution_start = 0.8,
+                                   reduction_percentage = 0.2,
+                                   num_clusters_start = 20,
+                                   dims = 1:10,
+                                   algorithm = "louvain", # todo implement all algos
+                                   null_method = "ZIP",
+                                   assay = "RNA",
+                                   cores = 1,
+                                   shared_memory_max = 8000 * 1024^2,
+                                   verbose = TRUE) {
+
+  options(future.globals.maxSize = shared_memory_max)
+  # set up multicore for FindMarkers
+  future::plan("multicore", workers = as.numeric(cores))
+
+  num_variable_features <- length(Seurat::VariableFeatures(seurat_obj))
+
+
+  # follow this issue: https://github.com/anna-neufeld/countsplit/issues/8
+
+  # todo only do this for variable features
+  split <- countsplit::countsplit(Seurat::GetAssayData(seurat_obj, assay = assay, layer = "counts"))
+  Xtrain <- split[[1]]
+  Xtest <- split[[2]]
+
+  seurat_obj_train <- Seurat::CreateSeuratObject(counts = Xtrain)
+  seurat_obj_test <- Seurat::CreateSeuratObject(counts = Xtest)
+
+
+  # process training data
+  seurat_obj_train <- Seurat::NormalizeData(seurat_obj_train,
+                                            verbose = FALSE)
+
+  seurat_obj_train <- Seurat::FindVariableFeatures(seurat_obj_train,
+                                                   selection.method = "vst",
+                                                   nfeatures = num_variable_features,
+                                                   verbose = FALSE)
+
+  seurat_obj_train <- Seurat::ScaleData(seurat_obj_train, verbose = FALSE)
+  seurat_obj_train <- Seurat::RunPCA(seurat_obj_train,
+                                     features = Seurat::VariableFeatures(object = seurat_obj_train),
+                                     verbose = FALSE)
+
+  seurat_obj_train <- Seurat::FindNeighbors(seurat_obj_train,
+                                            dims = dims,
+                                            verbose = FALSE)
+
+  # process test data (no need for PCA and FindNeighbors since we are just assigning idents based on training idents)
+  seurat_obj_test <- Seurat::NormalizeData(seurat_obj_test,
+                                           verbose = FALSE)
+
+  seurat_obj_test <- Seurat::FindVariableFeatures(seurat_obj_test,
+                                                  selection.method = "vst",
+                                                  nfeatures = num_variable_features,
+                                                  verbose = FALSE)
+
+  seurat_obj_test <- Seurat::ScaleData(seurat_obj_test, verbose = FALSE)
+
+
+
+  resolution_param <- resolution_start
+
+  first_iteration <- TRUE
+
+  while (TRUE) {
+    if (verbose) {
+      message("####################################################################")
+      message(paste("Finding clusters with", stringr::str_to_title(algorithm), "algorithm"))
+      message(paste("Resolution param:", resolution_param))
+    }
+
+    if (algorithm == "louvain") {
+      seurat_obj_train <- Seurat::FindClusters(seurat_obj_train,
+                                               resolution = resolution_param,
+                                               verbose = FALSE)
+    }
+
+    if (algorithm == "leiden") {
+      #plan("sequential") # todo log number of cores being used # this is a weird one because leiden has a forked job hanging
+      seurat_obj_train <- Seurat::FindClusters(seurat_obj_train,
+                                               resolution = resolution_param,
+                                               algorithm = 4,
+                                               method = "igraph",
+                                               verbose = FALSE)
+    }
+
+    # Reduce resolution for next iteration of the loop
+    resolution_param <- (1 - reduction_percentage) * resolution_param
+
+    Seurat::Idents(seurat_obj_test) <- Seurat::Idents(seurat_obj_train)
+
+    k <- length(levels(Seurat::Idents(seurat_obj_test)))
+
+    if (verbose) {
+      message("Num clusters:")
+      message(k)
+    }
+
+    countsplit_idents <- levels(Seurat::Idents(seurat_obj_test))
+
+    num_selected_matrix <- matrix(nrow = k, ncol = k)
+
+    found_no_sign_diff <- FALSE
+
+    num_clusters <- length(countsplit_idents)
+
+
+    if (verbose) {
+      progress_bar_length <- num_clusters * (num_clusters - 1) / 2
+      cli::cli_progress_bar("Processing cluster pairs:",
+                            total = progress_bar_length,
+                            clear = FALSE)
+    }
+
+    m <- 0
+    for (i in 1:num_clusters) {
+      for (j in 1:num_clusters) {
+        if (j >= i) {
+          next
+        }
+
+        m <- m + 1
+
+        if (verbose) {
+          cli::cli_progress_update()
+        }
+
+        markers_selected <- Seurat::FindMarkers(seurat_obj_test,
+                                                ident.1 = countsplit_idents[i],
+                                                ident.2 = countsplit_idents[j])
+
+        num_selected <- sum(markers_selected$p_val_adj < 0.05)
+
+        if (num_selected == 0) {
+          found_no_sign_diff <- TRUE
+          break
+        }
+
+        num_selected_matrix[i, j] <- num_selected
+        num_selected_matrix[j, i] <- num_selected
+
+      }
+      if (found_no_sign_diff) {
+        if (verbose) {
+          cli::cli_progress_done()
+          message("Found clusters with no significant differences.")
+          message("Progressing to next clustering iteration.")
+        }
+        first_iteration <- FALSE
+        break
+      }
+    }
+
+    if (found_no_sign_diff) {
+      next
+    }
+    break
+  }
+
+  if (first_iteration) {
+    warning("Only a single iteration occurred. The inferred cluster labels may be underclustered. To prevent this, you may want to re-run FindClustersCountsplit with a larger starting parameter.")
+  }
+
+
+  seurat_obj@meta.data$countsplit_clusters <- Seurat::Idents(seurat_obj_test)
+  Seurat::Idents(seurat_obj) <- seurat_obj@meta.data$countsplit_clusters
+
+  return(seurat_obj)
+
+}
\ No newline at end of file
diff --git a/R/seurat_workflow.R b/R/seurat_workflow.R
new file mode 100644
index 0000000..be5b4e8
--- /dev/null
+++ b/R/seurat_workflow.R
+#' @title Runs a typical Seurat workflow on a Seurat object (up to
+#' dimensionality reduction and clustering).
+#'
+#' @description Given a Seurat object, returns a new Seurat object that has been
+#' normalized, had variable features identified,
+#' scaled, had principal components computed, had clusters identified, and had
+#' tSNE and UMAP embeddings determined.
+#'
+#' @param seurat_obj A Seurat object that will be analyzed.
+#' @param num_variable_features The number of variable features to use in the
+#' analysis.
+#' @param resolution_param The resolution parameter to use when clustering.
+#' @param visualization_method Either "umap", "tsne", or "both".
+#' @param num_dims The number of principal components to use.
+#' @param algorithm The clustering algorithm to use, either "louvain" or
+#' "leiden".
+#' @returns A Seurat object containing the relevant analysis results.
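+#' @examples
+#' \dontrun{
+#' # Illustrative sketch: assumes `sobj` is an existing Seurat object with RNA counts.
+#' sobj <- seurat_workflow(sobj,
+#'                         num_variable_features = 2000,
+#'                         visualization_method = "both",
+#'                         algorithm = "leiden")
+#' }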
+#' @export
+#' @name seurat_workflow
+seurat_workflow <- function(seurat_obj,
+                            num_variable_features,
+                            resolution_param = 0.8,
+                            visualization_method = "umap",
+                            num_dims = 10,
+                            algorithm = "louvain") {
+  seurat_obj <- Seurat::NormalizeData(seurat_obj)
+
+  seurat_obj <- Seurat::FindVariableFeatures(seurat_obj,
+                                             selection.method = "vst",
+                                             nfeatures = num_variable_features)
+
+  seurat_obj <- Seurat::ScaleData(seurat_obj)
+
+  seurat_obj <- Seurat::RunPCA(seurat_obj,
+                               features = Seurat::VariableFeatures(object = seurat_obj))
+
+  seurat_obj <- Seurat::FindNeighbors(seurat_obj, dims = 1:num_dims)
+
+  if (algorithm == "louvain") {
+    seurat_obj <- Seurat::FindClusters(seurat_obj,
+                                       resolution = resolution_param)
+  }
+
+  if (algorithm == "leiden") {
+    seurat_obj <- Seurat::FindClusters(seurat_obj,
+                                       resolution = resolution_param,
+                                       algorithm = 4,
+                                       method = "igraph")
+  }
+
+  if (visualization_method == "umap") {
+    seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims)
+  }
+  if (visualization_method == "tsne") {
+    seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims)
+  }
+
+  if (visualization_method == "both") {
+    seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims)
+    seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims)
+  }
+
+  return(seurat_obj)
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2104bda
--- /dev/null
+++ b/README.md
+# recall (Calibrated Clustering with Artificial Variables)
+
+[![R CMD check](https://github.com/lcrawlab/recall/actions/workflows/check-standard.yml/badge.svg)](https://github.com/lcrawlab/recall/actions/workflows/check-standard.yml)
+[![Docker Image CI](https://github.com/lcrawlab/recall/actions/workflows/docker-image.yml/badge.svg)](https://github.com/lcrawlab/recall/actions/workflows/docker-image.yml)
+
+## Introduction
+
+Standard single-cell RNA-sequencing (scRNA-seq) pipelines nearly always include unsupervised clustering as a key step in identifying biologically distinct cell types. A follow-up step in these pipelines is to test for differential expression between the identified clusters. When algorithms over-cluster, downstream analyses will produce inflated P-values, resulting in increased false discoveries.
+Here, we present `recall` (Calibrated Clustering with Artificial Variables): a new method for protecting against over-clustering by controlling for the impact of double-dipping.
+Importantly, our approach can be applied to any clustering algorithm (implemented here are the Louvain, Leiden, K-means, and hierarchical clustering algorithms).
+`recall` provides state-of-the-art clustering performance and can rapidly analyze large-scale scRNA-seq studies, even on a personal laptop.
+
+## Installation
+
+You can install the latest development version by using the [devtools](https://CRAN.R-project.org/package=devtools) library. To install this package with devtools, use this command:
+
+```r
+devtools::install_github("lcrawlab/recall")
+```
+
+Although it is not explicitly a dependency, making sure you have `presto` installed will make `recall` much faster.
+
+```r
+devtools::install_github("immunogenomics/presto")
+```
+
+
+## Tutorial
+
+```r
+library(Seurat)
+library(SeuratData)
+
+library(recall)
+
+set.seed(123)
+
+# load pbmc3k dataset
+SeuratData::InstallData("pbmc3k")
+data("pbmc3k")
+
+pbmc3k <- UpdateSeuratObject(pbmc3k)
+
+pbmc3k <- NormalizeData(pbmc3k)
+pbmc3k <- FindVariableFeatures(pbmc3k)
+pbmc3k <- ScaleData(pbmc3k)
+pbmc3k <- RunPCA(pbmc3k)
+pbmc3k <- FindNeighbors(pbmc3k)
+pbmc3k <- RunUMAP(pbmc3k, dims = 1:10)
+
+pbmc_default <- FindClusters(pbmc3k)
+pbmc_recall <- FindClustersRecall(pbmc3k)
+
+DimPlot(pbmc_default) + DimPlot(pbmc_recall)
+```
+
+## Overview of the Method
+
+The `recall` algorithm consists of three simple steps:
+
+1. First, we generate synthetic null variables, inspired by knockoff variables (Barber and Candès, 2015), where we augment the single-cell data being analyzed with "fake" genes that are known not to contribute to any unique cell type.
+2. Second, we perform both preprocessing and clustering on this augmented dataset.
+3. Third, we calibrate the number of inferred clusters by using a hypothesis testing strategy with a data-dependent threshold to determine if there is a statistically significant difference between groups. If any pair of groups does not have statistically significant differences, then re-clustering occurs.
+
+The synthetic genes act as negative control variables; they go through the same analytic steps as the real data and are presented with the same opportunity to be identified as marker genes.
+The `recall` algorithm uses the guiding principle that well-calibrated clusters (i.e., those representing real groups) should have significantly differentially expressed genes after correcting for multiple hypothesis tests, while over-clustered groups will not.
+We use this rule to iteratively re-cluster cells until the inferred clusters are well-calibrated and the observed differences in expression between groups are not due to the effects of double-dipping.
+
+## Relevant Citations
+`recall` is available on bioRxiv [here](https://www.biorxiv.org/content/10.1101/2024.03.08.584180v1).
+
+A. DenAdel, M. Ramseier, A. Navia, A. Shalek, S. Raghavan, P. Winter, A. Amini, and L. Crawford. A knockoff calibration method to avoid over-clustering in single-cell RNA-sequencing. _bioRxiv_.
+
+## Questions and Feedback
+For questions or concerns with `recall`, please contact
+[Alan DenAdel](mailto:alan_denadel@brown.edu) or [Lorin Crawford](mailto:lcrawford@microsoft.com). Any feedback on the software, manuscript, and tutorials is appreciated.
diff --git a/_pkgdown.yml b/_pkgdown.yml
new file mode 100644
index 0000000..934c5da
--- /dev/null
+++ b/_pkgdown.yml
+url: https://lcrawlab.github.io/recall/
+template:
+  bootstrap: 5
\ No newline at end of file
diff --git a/man/FindClustersCountsplit.Rd b/man/FindClustersCountsplit.Rd
new file mode 100644
index 0000000..06e5bd9
--- /dev/null
+++ b/man/FindClustersCountsplit.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/recall.R
+\name{FindClustersCountsplit}
+\alias{FindClustersCountsplit}
+\title{Finds clusters in a Seurat object with count splitting to avoid double-dipping.}
+\usage{
+FindClustersCountsplit(
+  seurat_obj,
+  resolution_start = 0.8,
+  reduction_percentage = 0.2,
+  num_clusters_start = 20,
+  dims = 1:10,
+  algorithm = "louvain",
+  null_method = "ZIP",
+  assay = "RNA",
+  cores = 1,
+  shared_memory_max = 8000 * 1024^2,
+  verbose = TRUE
+)
+}
+\arguments{
+\item{seurat_obj}{The Seurat object that will be analyzed.}
+
+\item{resolution_start}{The starting resolution to be used for the
+clustering algorithm (Louvain and Leiden algorithms).}
+
+\item{reduction_percentage}{The amount that the starting parameter will be
+reduced by after each iteration (between 0 and 1).}
+
+\item{num_clusters_start}{The starting number of clusters to be used for the
+clustering algorithm (K-means and Hierarchical clustering algorithms).}
+
+\item{dims}{The dimensions to use as input features (e.g., 1:10).}
+
+\item{algorithm}{The clustering algorithm to be used.}
+
+\item{null_method}{The generating distribution for the synthetic null
+variables ("ZIP", "NB", "ZIP-copula", "NB-copula", "Poisson-copula", or
+"Gaussian-copula").}
+
+\item{assay}{The assay to generate artificial variables from.}
+
+\item{cores}{The number of cores to compute marker genes in parallel.}
+
+\item{shared_memory_max}{The maximum size for shared global variables.
+Increase this variable if you see the following error:
+The total size of the X globals that need to be exported for the future expression
+('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
+(option 'future.globals.maxSize'). The X largest globals are ...}
+
+\item{verbose}{Whether or not to show all logging.}
+}
+\value{
+Returns a Seurat object where the idents have been updated with the
+clusters determined via the countsplit algorithm.
+The latest clustering results will be stored in the object metadata under
+'countsplit_clusters'. Note that 'countsplit_clusters' will be overwritten
+every time FindClustersCountsplit is run.
+}
+\description{
+Given a Seurat object, splits the counts into training and test matrices,
+clusters the training counts, and iteratively lowers the clustering
+resolution until every pair of clusters shows statistically significant
+differential expression on the held-out test counts.
+} diff --git a/man/FindClustersRecall.Rd b/man/FindClustersRecall.Rd new file mode 100644 index 0000000..598ba9f --- /dev/null +++ b/man/FindClustersRecall.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recall.R +\name{FindClustersRecall} +\alias{FindClustersRecall} +\title{Runs a typical Seurat workflow on a Seurat object (up to +dimensionality reduction and clustering).} +\usage{ +FindClustersRecall( + seurat_obj, + resolution_start = 0.8, + reduction_percentage = 0.2, + num_clusters_start = 20, + dims = 1:10, + algorithm = "louvain", + null_method = "ZIP", + assay = "RNA", + cores = 1, + shared_memory_max = 8000 * 1024^2, + verbose = TRUE +) +} +\arguments{ +\item{seurat_obj}{The Seurat object that will be analyzed.} + +\item{resolution_start}{The starting resolution to be used for the +clustering algorithm (Louvain and Leiden algorithms).} + +\item{reduction_percentage}{The amount that the starting parameter will be +reduced by after each iteration (between 0 and 1).} + +\item{num_clusters_start}{The starting number of clusters to be used for the +clustering algorithm (K-means and Hierarchical clustering algorithms).} + +\item{dims}{The dimensions to use as input features (i.e. 1:10).} + +\item{algorithm}{The clustering algorithm to be used.} + +\item{null_method}{The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)} + +\item{assay}{The assay to generate artificial variables from.} + +\item{cores}{The number of cores to compute marker genes in parallel.} + +\item{shared_memory_max}{The maximum size for shared global variables. +Increased this variable if you see the following error: +The total size of the X globals that need to be exported for the future expression +('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB +(option 'future.globals.maxSize'). The X largest globals are ...} + +\item{verbose}{Whether or not to show all logging.} +} +\value{ +Returns a Seurat object where the idents have been updated with the +clusters determined via the recall algorithm. +Latest clustering results will be stored in the object metadata under +recall_clusters'. Note that 'recall_clusters' will be overwritten ever +time FindClustersRecall is run. +} +\description{ +Given a Seurat object, returns a new Seurat that has been +normalized, had variable features identified, scaled, had principal +components computed, hadclusters identified, and had tSNE and UMAP +embeddings determined. 
+} diff --git a/man/compute_knockoff_filter.Rd b/man/compute_knockoff_filter.Rd new file mode 100644 index 0000000..d5f5321 --- /dev/null +++ b/man/compute_knockoff_filter.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recall.R +\name{compute_knockoff_filter} +\alias{compute_knockoff_filter} +\title{Returns the genes selected by the knockoff filter} +\usage{ +compute_knockoff_filter( + seurat_obj, + cluster1, + cluster2, + q, + return_all = FALSE, + num_cores = 1, + shared_memory_max +) +} +\arguments{ +\item{seurat_obj}{A Seurat object} + +\item{cluster1}{The Idents of the cluster of interest in seurat_obj1} + +\item{cluster2}{The Idents of the cluster of interest in seurat_obj2} + +\item{q}{The desired rate to control the FDR at} + +\item{return_all}{Determines if the returned object will contain all genes +or just the selected genes.} + +\item{num_cores}{The number of cores for computing marker genes in parallel.} + +\item{shared_memory_max}{The maximum size for shared global variables.} +} +\value{ +todo +} +\description{ +Given two Seurat objects, returns the the genes selected by +the knockoff filter and their W statistics. +} diff --git a/man/estimate_negative_binomial.Rd b/man/estimate_negative_binomial.Rd new file mode 100644 index 0000000..b845e0c --- /dev/null +++ b/man/estimate_negative_binomial.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/estimate_negative_binomial.R +\name{estimate_negative_binomial} +\alias{estimate_negative_binomial} +\title{Maximum likelihood estimation for the negative binomial +distribution.} +\usage{ +estimate_negative_binomial(data, verbose = FALSE) +} +\arguments{ +\item{data}{The data to estimate parameters from.} + +\item{verbose}{Whether or not to show all logging.} +} +\value{ +Maximum likelihood estimators size and mu for the negative +binomial distribution +} +\description{ +Given data, computes the maximum likelihood estimators +for the negative binomial distribution with parameters: size and mu. 
+} diff --git a/man/estimate_negative_binomial_copula.Rd b/man/estimate_negative_binomial_copula.Rd new file mode 100644 index 0000000..c655a63 --- /dev/null +++ b/man/estimate_negative_binomial_copula.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/copula.R +\name{estimate_negative_binomial_copula} +\alias{estimate_negative_binomial_copula} +\alias{estimate_zi_poisson_copula} +\alias{estimate_poisson_copula} +\alias{estimate_gaussian_copula} +\title{todo} +\usage{ +estimate_zi_poisson_copula(data_matrix, cores) + +estimate_negative_binomial_copula(data_matrix, cores) + +estimate_poisson_copula(data_matrix, cores) + +estimate_gaussian_copula(data_matrix, cores) +} +\arguments{ +\item{data_matrix}{The data to estimate parameters from.} + +\item{cores}{The number of CPU cores to use in estimation by scDesign3.} +} +\value{ +todo + +todo + +todo + +todo +} +\description{ +Given data, computes todo + +Given data, computes todo + +Given data, computes todo + +Given data, computes todo +} diff --git a/man/estimate_zi_poisson.Rd b/man/estimate_zi_poisson.Rd new file mode 100644 index 0000000..4557dd2 --- /dev/null +++ b/man/estimate_zi_poisson.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/estimate_zipoisson.R +\name{estimate_zi_poisson} +\alias{estimate_zi_poisson} +\title{Maximum likelihood estimation for the zero-inflated Poisson distribution +with Poisson parameter lambda and zero proportion prop.zero.} +\usage{ +estimate_zi_poisson(data) +} +\arguments{ +\item{data}{The data to estimate parameters from.} +} +\value{ +Maximum likelihood estimators of the zero-inflated Poisson +distribution +} +\description{ +Given data, computes the maximum likelihood estimators +for the zero-inflated Poisson distribution. +} diff --git a/man/get_seurat_obj_with_artificial_variables.Rd b/man/get_seurat_obj_with_artificial_variables.Rd new file mode 100644 index 0000000..143699d --- /dev/null +++ b/man/get_seurat_obj_with_artificial_variables.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recall.R +\name{get_seurat_obj_with_artificial_variables} +\alias{get_seurat_obj_with_artificial_variables} +\title{Returns a Seurat object that contains additional (fake) RNA +expression counts.} +\usage{ +get_seurat_obj_with_artificial_variables( + seurat_obj, + assay = "RNA", + null_method = "ZIP", + verbose = TRUE, + cores +) +} +\arguments{ +\item{seurat_obj}{A Seurat object containing RNA expression counts.} + +\item{assay}{The assay to generate artificial variables from.} + +\item{null_method}{The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)} + +\item{verbose}{Whether or not to show logging.} + +\item{cores}{The number of cores to use in generating synthetic null variables.} +} +\value{ +A Seurat object that contains the original variable features and an +equal number of artificial features. +} +\description{ +Given a Seurat object, returns a new Seurat object whose RNA +expression counts includes the +variable features from the original object and an equal number of artificial +features. 
+} diff --git a/man/rzipoisson.Rd b/man/rzipoisson.Rd new file mode 100644 index 0000000..fa6cd0b --- /dev/null +++ b/man/rzipoisson.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/estimate_zipoisson.R +\name{rzipoisson} +\alias{rzipoisson} +\title{Random data generation for the zero-infalted Poisson distribution +with Poisson parameter lambda and zero proportion prop.zero.} +\usage{ +rzipoisson(n, lambda, prop.zero) +} +\arguments{ +\item{n}{The number of samples to be simulated.} + +\item{lambda}{The Poisson rate parameter.} + +\item{prop.zero}{The proportion of excess zeroes.} +} +\value{ +Simulated data from ZIP(lambda, prop.zero). +} +\description{ +Given the number of samples desired, a Poisson parameter, +lambda, and a zero proportion, prop.zero, simulates the number of desired +samples from ZIP(lambda, prop.zero). +} diff --git a/man/seurat_workflow.Rd b/man/seurat_workflow.Rd new file mode 100644 index 0000000..97b16e0 --- /dev/null +++ b/man/seurat_workflow.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/seurat_workflow.R +\name{seurat_workflow} +\alias{seurat_workflow} +\title{Runs a typical Seurat workflow on a Seurat object (up to +dimensionality reduction and clustering).} +\usage{ +seurat_workflow( + seurat_obj, + num_variable_features, + resolution_param = 0.8, + visualization_method = "umap", + num_dims = 10, + algorithm = "louvain" +) +} +\arguments{ +\item{seurat_obj}{A Seurat object that will be analyzed.} + +\item{num_variable_features}{The number of variable features to use in the +analysis.} + +\item{resolution_param}{The resolution parameter to use when clustering.} + +\item{visualization_method}{Either "umap" or "tsne".} + +\item{num_dims}{The number of principal components to use.} + +\item{algorithm}{The clustering algorithm to use, either "louvain" or +"leiden".} +} +\value{ +A Seurat object containing the relevant analysis results. +} +\description{ +Given a Seurat object, returns a new Seurat that has been +normalized, had variable features identified, +scaled, had principal components computed, had clusters identified, and had +tSNE and UMAP embeddings determined. +} diff --git a/vignettes/basic-usage.Rmd b/vignettes/basic-usage.Rmd new file mode 100644 index 0000000..4c42aaa --- /dev/null +++ b/vignettes/basic-usage.Rmd @@ -0,0 +1,73 @@ +--- +title: "Basic Usage on PBMC3k Data" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{basic-usage} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) + +knitr::opts_chunk$set(eval = FALSE) + +``` + +```{r setup} +suppressPackageStartupMessages({ +library(Seurat) +library(SeuratData) +library(recall) +}) +``` + + +First, we use the `SeuratData` data package to first download and then load +2700 PBMCs. The loaded `SeuratObject`, `pbmc3k`, is from an old version of +`Seurat`, and so we update the object to v5. + +```{r load_data} +set.seed(123) + +SeuratData::InstallData("pbmc3k") +data("pbmc3k") + +pbmc3k <- UpdateSeuratObject(pbmc3k) +``` + +Now, we use `Seurat` to perform the usual preprocessing steps that are performed prior to clustering. 
+
+```{r preprocessing}
+pbmc3k <- NormalizeData(pbmc3k)
+pbmc3k <- FindVariableFeatures(pbmc3k)
+pbmc3k <- ScaleData(pbmc3k)
+pbmc3k <- RunPCA(pbmc3k)
+pbmc3k <- FindNeighbors(pbmc3k)
+pbmc3k <- RunUMAP(pbmc3k, dims = 1:10)
+```
+
+The `recall` algorithm can be run with a single function call as a drop-in
+replacement for the `Seurat` function `FindClusters`.
+
+```{r run_recall}
+pbmc3k <- FindClustersRecall(pbmc3k)
+```
+
+The `recall` clusters are set to the idents of the `SeuratObject` that is
+returned by `FindClustersRecall`.
+
+```{r plot_umap}
+DimPlot(pbmc3k)
+```
+
+Cluster labels from `FindClustersRecall` are stored in the metadata in the
+column `pbmc3k@meta.data$recall_clusters`.
+
+```{r plot_umap2}
+DimPlot(pbmc3k, group.by = "recall_clusters")
+```
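+
+## Choosing a null distribution
+
+By default, `FindClustersRecall` fits a zero-inflated Poisson ("ZIP") model to
+each variable gene to generate the synthetic null genes. The `null_method`
+argument also accepts "NB" as well as scDesign3-based copula options
+("ZIP-copula", "NB-copula", "Poisson-copula", and "Gaussian-copula") that
+additionally model gene-gene correlation. A minimal sketch (not evaluated
+here; the copula options are slower and benefit from multiple cores):
+
+```{r null_methods, eval = FALSE}
+# negative binomial marginals instead of the default zero-inflated Poisson
+pbmc3k_nb <- FindClustersRecall(pbmc3k, null_method = "NB")
+
+# copula-based nulls preserve gene-gene correlation; these call scDesign3
+pbmc3k_cop <- FindClustersRecall(pbmc3k, null_method = "NB-copula", cores = 4)
+```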