From 0fcd86002c0633509569d312bd47cf28b77f9a66 Mon Sep 17 00:00:00 2001
From: igor <6363505+igordot@users.noreply.github.com>
Date: Mon, 6 May 2024 17:20:47 -0400
Subject: [PATCH] Clean up clustermole_overlaps()

---
 R/overlaps.R                   | 17 +++++++++--------
 tests/testthat/test-overlaps.R |  2 +-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/R/overlaps.R b/R/overlaps.R
index c4695a8..c793a30 100644
--- a/R/overlaps.R
+++ b/R/overlaps.R
@@ -18,17 +18,16 @@
 #' my_overlaps <- clustermole_overlaps(genes = my_genes, species = "hs")
 #' head(my_overlaps)
 clustermole_overlaps <- function(genes, species) {
-
   # check that the genes vector seems reasonable
   if (!is(genes, "character")) {
-    stop("genes is not a character vector")
+    stop("`genes` is not a character vector")
   }
-  genes <- unique(genes)
+  genes <- unique(sort(genes))
   if (length(genes) < 5) {
-    stop("too few genes")
+    stop("input should be at least 5 genes")
   }
   if (length(genes) > 5000) {
-    stop("too many genes")
+    stop("input should be no more than 5,000 genes")
   }
 
   # retrieve markers
@@ -39,11 +38,13 @@ clustermole_overlaps <- function(genes, species) {
     dplyr::select(-dplyr::starts_with("gene")) %>%
     dplyr::distinct()
 
-  # check that input genes overlap species genes
+  # check that input genes overlap marker genes for the given species
   all_genes <- unique(markers_tbl$gene)
+  input_genes <- genes
   genes <- intersect(genes, all_genes)
-  if (length(genes) < 5) {
-    stop("the genes do not appear to correspond to the given species")
+  if (length(genes) < (length(input_genes) * 0.25)) {
+    problematic_genes <- setdiff(input_genes, genes)
+    stop("many input genes are not known (possibly wrong species): ", toString(problematic_genes))
   }
 
   # run the enrichment analysis
diff --git a/tests/testthat/test-overlaps.R b/tests/testthat/test-overlaps.R
index 1d52601..82617ba 100644
--- a/tests/testthat/test-overlaps.R
+++ b/tests/testthat/test-overlaps.R
@@ -9,7 +9,7 @@ gene_names <- sample(gene_names)
 test_that("clustermole_overlaps() wrong input", {
   expect_error(clustermole_overlaps(gene_names[1:3], species = "hs"))
   expect_error(clustermole_overlaps(gene_names[1:10000], species = "hs"))
-  expect_error(clustermole_overlaps(c(gene_names[1:3], "X", "Y", "Z"), species = "hs"))
+  expect_error(clustermole_overlaps(c(gene_names[1:3], "Z1", "Z2", "Z3", "Z4"), species = "hs"))
   expect_error(clustermole_overlaps(as.list(gene_names[1:10]), species = "hs"))
 })