prepare for initial cran release

igordot · Jan 15, 2020 · 9ebc78e · 9ebc78e
1 parent 44fe496
commit 9ebc78e
Show file tree

Hide file tree

Showing 18 changed files with 431 additions and 157 deletions.
diff --git a/CRAN-RELEASE b/CRAN-RELEASE
@@ -0,0 +1,2 @@
+This package was submitted to CRAN on 2020-01-14.
+Once it is accepted, delete this file and tag the release (commit f7d2e89092).
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,24 +1,27 @@
 Type: Package
-Package: ClusterMole
-Title: A Collection of Cell Type Markers
-Version: 0.0.0.9000
+Package: clustermole
+Title: Unbiased Cell Type Identification of Single-Cell Transcriptomic Data
+Version: 1.0.0
 Authors@R: 
     person(given = "Igor",
            family = "Dolgalev",
            role = c("aut", "cre"),
            email = "igor.dolgalev@nyumc.org")
-Description: A collection of cell type markers.
+Description: A typical computational pipeline to process single-cell RNA sequencing (scRNA-seq) data involves clustering of cells. Assignment of cell type labels to those clusters is often a time-consuming process that involves manual inspection of the cluster marker genes complemented with a detailed literature search. This is especially challenging if you are not familiar with all the captured subpopulations or have unexpected contaminants. 'clustermole' provides a comprehensive meta collection of cell identity markers for thousands of human and mouse cell types sourced from a variety of databases as well as methods to query them.
 License: MIT + file LICENSE
 URL: https://github.com/igordot/clustermole
+BugReports: https://github.com/igordot/clustermole/issues
 Depends: 
     R (>= 3.4)
 Imports: 
     dplyr,
     GSVA (>= 1.26.0),
     magrittr,
+    methods,
     rlang (>= 0.1.2),
     tibble,
-    tidyr
+    tidyr,
+    utils
 Suggests: 
     covr,
     roxygen2,

diff --git a/LICENSE b/LICENSE
@@ -1,21 +1,2 @@
-MIT License
-
-Copyright (c) 2019
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+YEAR: 2019-2020
+COPYRIGHT HOLDER: Igor Dolgalev
diff --git a/LICENSE.md b/LICENSE.md
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2019
+Copyright (c) 2019-2020
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,9 +3,17 @@
 export("%>%")
 export(clustermole_enrichment)
 export(clustermole_markers)
+export(clustermole_overlaps)
+export(read_gmt)
 import(dplyr)
+import(methods)
+import(utils)
 importFrom(GSVA,gsva)
 importFrom(magrittr,"%>%")
 importFrom(rlang,.data)
+importFrom(stats,p.adjust)
+importFrom(stats,phyper)
 importFrom(tibble,as_tibble)
+importFrom(tibble,enframe)
 importFrom(tidyr,gather)
+importFrom(tidyr,unnest)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,3 @@
+# clustermole 1.0.0
+
+* Initial CRAN submission.
diff --git a/R/functions.R b/R/functions.R
@@ -1,21 +1,104 @@
 
-#' Retrieve a table of all cell type markers
+#' Retrieve the available cell type markers
 #'
-#' @return a data frame of markers with one gene per row
+#' @param species Species for the appropriate gene symbol format: "hs" for human or "mm" for mouse.
+#'
+#' @return A data frame of markers with one gene per row.
+#' @import dplyr
 #' @export
 #'
 #' @examples
-#' markers_tbl <- clustermole_markers()
-clustermole_markers <- function() {
-  markers
+#' markers <- clustermole_markers()
+#' head(markers)
+clustermole_markers <- function(species = "hs") {
+  m_tbl <- clustermole_markers_tbl
+  if (species == "hs") {
+    m_tbl %>%
+      dplyr::select(-.data$gene_mm) %>%
+      dplyr::rename(gene = .data$gene_hs)
+  } else if (species == "mm") {
+    m_tbl %>%
+      dplyr::select(-.data$gene_hs) %>%
+      dplyr::rename(gene = .data$gene_mm)
+  } else {
+    stop("unknown species")
+  }
+}
+
+#' Perform cell type overrepresentation analysis for a set of genes
+#'
+#' @param genes A vector of genes.
+#' @param species Species: "hs" for human or "mm" for mouse.
+#'
+#' @return A data frame of enrichment results with hypergeometric test p-values.
+#' @import methods
+#' @import dplyr
+#' @importFrom tibble as_tibble
+#' @importFrom tidyr gather
+#' @importFrom stats phyper p.adjust
+#' @export
+#'
+#' @examples
+#' my_genes <- c("CD2", "CD3D", "CD3E", "CD3G", "TRAC", "TRBC2", "LTB")
+#' my_overlaps <- clustermole_overlaps(genes = my_genes, species = "hs")
+#' head(my_overlaps)
+clustermole_overlaps <- function(genes, species) {
+
+  # check that the genes vector seems reasonable
+  if (!is(genes, "character")) {
+    stop("genes is not a character vector")
+  }
+  genes <- unique(genes)
+  if (length(genes) < 5) {
+    stop("too few genes")
+  }
+  if (length(genes) > 5000) {
+    stop("too many genes")
+  }
+
+  # retrieve markers
+  markers_tbl <- clustermole_markers(species = species)
+  markers_list <- split(x = markers_tbl$gene, f = markers_tbl$celltype_full)
+  celltypes_tbl <-
+    markers_tbl %>%
+    dplyr::select(-dplyr::starts_with("gene")) %>%
+    dplyr::distinct()
+
+  # check that input genes overlap species genes
+  all_genes <- unique(markers_tbl$gene)
+  genes <- intersect(genes, all_genes)
+  if (length(genes) < 5) {
+    stop("the genes do not appear to correspond to the given species")
+  }
+
+  # run the enrichment analysis
+  overlaps_mat <-
+    sapply(markers_list, function(celltype_genes) {
+      n_overlap <- length(intersect(genes, celltype_genes))
+      n_query <- length(genes)
+      n_celltype <- length(celltype_genes)
+      n_all <- length(all_genes)
+      # phyper(success-in-sample, success-in-bg, fail-in-bg, sample-size)
+      p_val <- phyper(n_overlap - 1, n_celltype, n_all - n_celltype, n_query, lower.tail = FALSE)
+      c("overlap" = n_overlap, "p_value" = p_val, "fdr" = 1)
+    })
+  overlaps_mat <- t(overlaps_mat)
+  overlaps_mat[, "fdr"] <- p.adjust(overlaps_mat[, "p_value"], method = "fdr")
+
+  # clean up the enrichment table
+  overlaps_tbl <- tibble::as_tibble(overlaps_mat, rownames = "celltype_full")
+  overlaps_tbl <- dplyr::inner_join(celltypes_tbl, overlaps_tbl, by = "celltype_full")
+  overlaps_tbl <- dplyr::arrange(overlaps_tbl, .data$fdr, .data$p_value, .data$celltype_full)
+  overlaps_tbl
 }
 
 #' Perform cell type enrichment for a given gene expression matrix
 #'
-#' @param expr_mat expression matrix (logCPMs, logFPKMs, or logTPMs)
-#' @param species species ("hs" for human or "mm" for mouse)
+#' @param expr_mat Expression matrix (logCPMs, logFPKMs, or logTPMs) with genes as rows.
+#' @param species Species: "hs" for human or "mm" for mouse.
 #'
-#' @return a data frame of enrichment results
+#' @return A data frame of enrichment results.
+#' @import methods
 #' @import dplyr
 #' @importFrom tibble as_tibble
 #' @importFrom tidyr gather
@@ -38,37 +121,55 @@ clustermole_enrichment <- function(expr_mat, species) {
   expr_mat <- expr_mat[rowMeans(expr_mat) > 0, ]
 
   # retrieve markers
-  markers_tbl <- clustermole_markers()
+  markers_tbl <- clustermole_markers(species = species)
+  markers_list <- split(x = markers_tbl$gene, f = markers_tbl$celltype_full)
   celltypes_tbl <-
     markers_tbl %>%
-    select(.data$db, .data$species, .data$organ, .data$celltype, .data$celltype_long, .data$n_genes) %>%
-    distinct()
-
-  # create a markers list for gene set enrichment
-  if (species == "hs") {
-    markers_list <- split(x = markers_tbl$gene_h, f = markers_tbl$celltype_long)
-  }
-  if (species == "mm") {
-    markers_list <- split(x = markers_tbl$gene_m, f = markers_tbl$celltype_long)
-  }
+    dplyr::select(-dplyr::starts_with("gene")) %>%
+    dplyr::distinct()
 
   # run the actual enrichment analysis
-  gsva_mat <- gsva(
+  gsva_mat <- GSVA::gsva(
     expr = expr_mat, gset.idx.list = markers_list,
-    method = "gsva", kcdf = "Gaussian", verbose = FALSE
+    method = "gsva", kcdf = "Gaussian", parallel.sz = 1, verbose = FALSE
   )
 
   # clean up the enrichment table
   gsva_tbl <-
     gsva_mat %>%
     round(8) %>%
-    as_tibble(rownames = "celltype_long") %>%
-    gather(key = "cluster", value = "score", -.data$celltype_long) %>%
-    select(.data$cluster, .data$celltype_long, .data$score) %>%
-    group_by(.data$cluster) %>%
-    top_n(n = 50) %>%
-    ungroup() %>%
-    inner_join(celltypes_tbl, by = "celltype_long") %>%
-    arrange(.data$cluster, desc(.data$score))
+    tibble::as_tibble(rownames = "celltype_full") %>%
+    tidyr::gather(key = "cluster", value = "score", -.data$celltype_full) %>%
+    dplyr::select(.data$cluster, .data$celltype_full, .data$score) %>%
+    dplyr::group_by(.data$cluster) %>%
+    dplyr::top_n(n = 50) %>%
+    dplyr::ungroup() %>%
+    dplyr::inner_join(celltypes_tbl, by = "celltype_full") %>%
+    dplyr::arrange(.data$cluster, desc(.data$score))
   gsva_tbl
 }
+
+#' Read a GMT file into a data frame
+#'
+#' @param file A connection object or a character string (can be a URL).
+#' @param geneset_label Column name for gene sets (first column of the GMT file) in the output data frame.
+#' @param gene_label Column name for genes (variable columns of the GMT file) in the output data frame.
+#'
+#' @return A data frame with gene sets as the first column and genes as the second column (one gene per row).
+#' @import utils
+#' @importFrom tibble enframe
+#' @importFrom tidyr unnest
+#' @export
+#'
+#' @examples
+#' gmt <- "http://software.broadinstitute.org/gsea/msigdb/supplemental/scsig.all.v1.0.symbols.gmt"
+#' gmt_tbl <- read_gmt(gmt)
+#' head(gmt_tbl)
+read_gmt <- function(file, geneset_label = "celltype", gene_label = "gene") {
+  gmt_split <- strsplit(readLines(file), "\t")
+  gmt_list <- lapply(gmt_split, tail, -2)
+  names(gmt_list) <- sapply(gmt_split, head, 1)
+  gmt_df <- tibble::enframe(gmt_list, name = geneset_label, value = gene_label)
+  gmt_df <- tidyr::unnest(gmt_df, gene_label)
+  gmt_df
+}
diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/README.md b/README.md
@@ -1,4 +1,7 @@
-# ClusterMole
+# clustermole: blindly digging for cell types in scRNA-seq clusters
+
+[![Travis Build Status](https://travis-ci.org/igordot/clustermole.svg?branch=master)](https://travis-ci.org/igordot/clustermole)
+[![codecov](https://codecov.io/gh/igordot/clustermole/branch/master/graph/badge.svg)](https://codecov.io/gh/igordot/clustermole)
 
 > See, children, the misguided Mole.  
 > He lives down in a deep, dark hole;  
@@ -15,29 +18,33 @@
 >
 > -- Oliver Herford
 
-Cell type annotation of single-cell RNA sequencing (scRNA-seq) data typically requires a reference dataset, but finding an appropriate one may be challenging.
-ClusterMole is an R package that provides a collection of cell type markers for thousands of human and mouse cell populations sourced from a variety of databases.
+A typical computational pipeline to process single-cell RNA sequencing (scRNA-seq) data involves clustering of cells. Assignment of cell type labels to those clusters is often a time-consuming process that involves manual inspection of the cluster marker genes complemented with a detailed literature search. This is especially challenging if you are not familiar with all the captured subpopulations or have unexpected contaminants. `clustermole` is an R package that provides a comprehensive meta collection of cell identity markers for thousands of human and mouse cell types sourced from a variety of databases as well as methods to query them.
 
-Install ClusterMole:
+Install clustermole (development version):
 
 ```r
 BiocManager::install("igordot/clustermole", update = FALSE)
 ```
 
+Load clustermole:
+
+```r
+library(clustermole)
+```
+
 Retrieve a table of all cell type markers:
 
 ```r
-markers_tbl = clustermole_markers()
-head(markers_tbl)
+clustermole_markers(species)
 ```
 
-See a summary of the available cell types:
+Perform cell type overrepresentation analysis for a given set of genes:
 
 ```r
-markers_tbl %>% distinct(celltype, organ, db)
+clustermole_overlaps(genes, species)
 ```
 
-Perform cell type enrichment for a given gene expression matrix:
+Perform cell type enrichment for a given full gene expression matrix:
 
 ```r
 clustermole_enrichment(expr_mat, species)

diff --git a/cran-comments.md b/cran-comments.md
@@ -0,0 +1,19 @@
+## Resubmission
+
+This is a resubmission. In this version I have:
+
+* Fixed LICENSE based on the CRAN template.
+
+
+## Test environments
+
+* local OS X install, R 3.6.1
+* travis-ci: R 3.4, R-release, R-devel
+* win-builder: R-devel
+* rchk: ubuntu-rchk platform on R-hub
+
+## R CMD check results
+
+0 errors | 0 warnings | 0 notes
+
+* This is a new release.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		This package was submitted to CRAN on 2020-01-14.
		Once it is accepted, delete this file and tag the release (commit f7d2e89092).
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# clustermole 1.0.0

		* Initial CRAN submission.