fix mm9 and mm10 rGREAT support, handle R/pycisTarget exceptions of l…

…arge overlap between query and background
epigen · Jul 5, 2024 · 03b64cd · 03b64cd
1 parent e01b8ee
commit 03b64cd
Show file tree

Hide file tree

Showing 7 changed files with 75 additions and 44 deletions.
diff --git a/README.md b/README.md
@@ -160,8 +160,8 @@ Detailed specifications can be found here [./config/README.md](./config/README.m
 We provide four example queries across all tools with four different databases:
 - three are region sets from a [LOLA Vignette](http://code.databio.org/LOLA/articles/usingLOLACore.html). Download the example data by following the instructions below.
 - one is a preranked gene-score set derived from the GDS289 [fgsea R package example data](https://github.com/ctlab/fgsea/blob/master/inst/extdata/GDS289.tsv) (`score=-log10(p-value) * sign(LFC)`).
-- the total runtime was ~45 minutes on an HPC with 1 core and 32GB RAM per job.
-- note: we are using a hg38 database for pycistarget, because the respective hg19 database is not compatible with the current version.
+- the total runtime was ~23 minutes on an HPC with 1 core and 32GB RAM per job.
+- note: we are using a hg38 database for pycistarget, because the respective hg19 database is not compatible with the current pycisTarget version.
 
 Follow these steps to run the complete analysis:
 1. Download all necessary data (query and resources)

diff --git a/config/config.yaml b/config/config.yaml
@@ -22,7 +22,7 @@ genome: 'hg38'
 #     Enrichr (https://maayanlab.cloud/Enrichr/#libraries)
 # JSON example content: { "MyDB_Term1": ["geneA","geneB","geneC"],"MyDB_Term2": ["geneX","geneY","geneZ"]}
 local_databases:
-    My_GMT_db: "path/to/My_GMT_db.gmt"
+    MSigDB_H_C2_CGP: "path/to/a/GMT_db.gmt"
 
 ## LOLA compatible region set databases
 # loaded using loadRegionDB() (https://code.databio.org/LOLA/reference/loadRegionDB.html)

diff --git a/workflow/envs/region_enrichment_analysis.yaml b/workflow/envs/region_enrichment_analysis.yaml
@@ -11,3 +11,9 @@ dependencies:
   - bioconductor-rgreat=2.4.0
   - r-data.table=1.15.2
   - bioconductor-rtracklayer=1.62.0
+  - bioconductor-org.mm.eg.db=3.18.0
+  - bioconductor-org.hs.eg.db=3.18.0
+  - bioconductor-txdb.mmusculus.ucsc.mm10.knowngene=3.10.0
+  - bioconductor-txdb.hsapiens.ucsc.hg19.knowngene=3.2.2
+  - bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.18.0
+  - bioconductor-txdb.mmusculus.ucsc.mm9.knowngene=3.2.2
diff --git a/workflow/rules/enrichment_analysis.smk b/workflow/rules/enrichment_analysis.smk
@@ -89,23 +89,27 @@ rule region_motif_enrichment_analysis_pycisTarget:
         "logs/rules/region_enrichment_analysis_pycisTarget_{region_set}_{database}.log"
     shell:
         """
-        pycistarget cistarget \
-            --cistarget_db_fname {input.ctx_db} \
-            --bed_fname {input.regions} \
-            --output_folder $(dirname {output.motif_hdf5}) \
-            --fr_overlap_w_ctx_db {params.fraction_overlap_w_cistarget_database} \
-            --auc_threshold {params.auc_threshold} \
-            --nes_threshold {params.nes_threshold} \
-            --rank_threshold {params.rank_threshold} \
-            --path_to_motif_annotations {input.motif2tf} \
-            --annotation_version {params.annotation_version} \
-            --annotations_to_use {params.annotations_to_use} \
-            --motif_similarity_fdr {params.motif_similarity_fdr} \
-            --orthologous_identity_threshold {params.orthologous_identity_threshold} \
-            --species {params.species} \
-            --name {wildcards.region_set} \
-            --output_mode 'hdf5' \
-            --write_html
+        {{
+            pycistarget cistarget \
+                --cistarget_db_fname {input.ctx_db} \
+                --bed_fname {input.regions} \
+                --output_folder $(dirname {output.motif_hdf5}) \
+                --fr_overlap_w_ctx_db {params.fraction_overlap_w_cistarget_database} \
+                --auc_threshold {params.auc_threshold} \
+                --nes_threshold {params.nes_threshold} \
+                --rank_threshold {params.rank_threshold} \
+                --path_to_motif_annotations {input.motif2tf} \
+                --annotation_version {params.annotation_version} \
+                --annotations_to_use {params.annotations_to_use} \
+                --motif_similarity_fdr {params.motif_similarity_fdr} \
+                --orthologous_identity_threshold {params.orthologous_identity_threshold} \
+                --species {params.species} \
+                --name {wildcards.region_set} \
+                --output_mode 'hdf5' \
+                --write_html
+        }} || {{ 
+            echo "An error occurred during the region TFBS motif enrichment analysis using pycisTarget"; touch {output.motif_hdf5} {output.motif_html}; exit 0;
+        }}
         """
 
 # postprocess results from pycisTarget

diff --git a/workflow/scripts/gene_enrichment_analysis_RcisTarget.R b/workflow/scripts/gene_enrichment_analysis_RcisTarget.R
@@ -51,31 +51,46 @@ ranking_df <- getRanking(motifRankings)
 # subset gene list for supported genes
 geneSets[[gene_set_name]] <- intersect(colnames(ranking_df), geneSets[[gene_set_name]])
 
+# quit early if there is no overlap
+if (length(geneSets[[gene_set_name]])==0){
+    print("No overlap between ranking database and query genes.")
+    file.create(result_path)
+    quit(save = "no", status = 0)
+}
+
 # load the motif to TF annotation
 motifAnnot <- importAnnotations(motif2tf_path, motifsInRanking = ranking_df$features)
 
 ###### RcisTarget
 
-# run RcisTarget
-motifEnrichmentTable_wGenes <- cisTarget(geneSets = geneSets,
-                                         motifRankings = motifRankings,
-                                         motifAnnot = motifAnnot,
-                                         motifAnnot_highConfCat = c(rcistarget_params[["motifAnnot_highConfCat"]]),
-                                         motifAnnot_lowConfCat = c(rcistarget_params[["motifAnnot_lowConfCat"]]),
-                                         highlightTFs = NULL,
-                                         nesThreshold = rcistarget_params[["nesThreshold"]],
-                                         aucMaxRank = rcistarget_params[["aucMaxRank_factor"]] * ncol(motifRankings),
-                                         geneErnMethod = rcistarget_params[["geneErnMethod"]], 
-                                         geneErnMaxRank = rcistarget_params[["geneErnMaxRank"]],
-                                         nCores = cores_n,
-                                         verbose = TRUE
-                                        )
-
-# format result table
-motifEnrichmentTable_wGenes$description <- sapply(motifEnrichmentTable_wGenes$TF_highConf, process_genes)
-motifEnrichmentTable_wGenes$description <- paste0(motifEnrichmentTable_wGenes$motif, " (",motifEnrichmentTable_wGenes$description,")")
-# motifEnrichmentTable_wGenes$description <- paste0(motifEnrichmentTable_wGenes$motif, " (",motifEnrichmentTable_wGenes$TF_highConf,")")
-motifEnrichmentTable_wGenes$name <- gene_set_name
-
-# save result table
-fwrite(as.data.frame(motifEnrichmentTable_wGenes), file=file.path(result_path), row.names=FALSE) #quote=FALSE
+# run RcisTarget with try/catch exception handling
+tryCatch({
+    motifEnrichmentTable_wGenes <- cisTarget(geneSets = geneSets,
+                                             motifRankings = motifRankings,
+                                             motifAnnot = motifAnnot,
+                                             motifAnnot_highConfCat = c(rcistarget_params[["motifAnnot_highConfCat"]]),
+                                             motifAnnot_lowConfCat = c(rcistarget_params[["motifAnnot_lowConfCat"]]),
+                                             highlightTFs = NULL,
+                                             nesThreshold = rcistarget_params[["nesThreshold"]],
+                                             aucMaxRank = rcistarget_params[["aucMaxRank_factor"]] * ncol(motifRankings),
+                                             geneErnMethod = rcistarget_params[["geneErnMethod"]], 
+                                             geneErnMaxRank = rcistarget_params[["geneErnMaxRank"]],
+                                             nCores = cores_n,
+                                             verbose = TRUE
+                                            )
+
+    # format result table
+    motifEnrichmentTable_wGenes$description <- sapply(motifEnrichmentTable_wGenes$TF_highConf, process_genes)
+    motifEnrichmentTable_wGenes$description <- paste0(motifEnrichmentTable_wGenes$motif, " (",motifEnrichmentTable_wGenes$description,")")
+    motifEnrichmentTable_wGenes$name <- gene_set_name
+
+    # save result table
+    fwrite(as.data.frame(motifEnrichmentTable_wGenes), file=file.path(result_path), row.names=FALSE) #quote=FALSE
+}, error = function(e) {
+    print("An error occurred during the cisTarget analysis.")
+    overlap_percentage <- round(length(intersect(geneSets[[gene_set_name]], background)) / length(background) * 100, 2)
+    print(paste("Overlap between query and background gene set might be too high with ", overlap_percentage,"%."))
+    print(e)
+    file.create(result_path)
+    quit(save = "no", status = 0)
+})
diff --git a/workflow/scripts/overview_plot.R b/workflow/scripts/overview_plot.R
@@ -136,7 +136,7 @@ pheatmap(adjp_df,
          cellwidth=10,
          cellheight=10,
          filename=adjp_hm_path,
-         breaks=seq(0, max(adjp_df), length.out=200),
+         breaks= if (max(adjp_df)==0) seq(0, 1, length.out=200) else seq(0, max(adjp_df), length.out=200),
          color=colorRampPalette(c("white", "red"))(200)
         )
 

diff --git a/workflow/scripts/process_results_pycisTarget.py b/workflow/scripts/process_results_pycisTarget.py
@@ -1,5 +1,6 @@
 
 # libraries
+import os
 import pycistarget
 from pycistarget.input_output import read_hdf5
 
@@ -15,6 +16,11 @@
 region_set_name = snakemake.wildcards["region_set"]
 term_col = snakemake.config["pycistarget_parameters"]["annotations_to_use"][0]
 
+# quit early if file is empty
+if os.path.getsize(motif_hdf5_path) == 0:
+    open(motif_csv_path, 'w').close()
+    quit()
+
 # load pycisTarget results from hdf5
 results = read_hdf5(motif_hdf5_path)