Skip to content

Commit

Permalink
fix mm9 and mm10 rGREAT support, handle R/pycisTarget exceptions of l…
Browse files Browse the repository at this point in the history
…arge overlap between query and background
  • Loading branch information
sreichl committed Jul 5, 2024
1 parent e01b8ee commit 03b64cd
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 44 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,8 @@ Detailed specifications can be found here [./config/README.md](./config/README.m
We provide four example queries across all tools with four different databases:
- three are region sets from a [LOLA Vignette](http://code.databio.org/LOLA/articles/usingLOLACore.html). Download the example data by following the instructions below.
- one is a preranked gene-score set derived from the GDS289 [fgsea R package example data](https://github.com/ctlab/fgsea/blob/master/inst/extdata/GDS289.tsv) (`score=-log10(p-value) * sign(LFC)`).
- the total runtime was ~45 minutes on an HPC with 1 core and 32GB RAM per job.
- note: we are using a hg38 database for pycistarget, because the respective hg19 database is not compatible with the current version.
- the total runtime was ~23 minutes on an HPC with 1 core and 32GB RAM per job.
- note: we are using a hg38 database for pycistarget, because the respective hg19 database is not compatible with the current pycisTarget version.

Follow these steps to run the complete analysis:
1. Download all necessary data (query and resources)
Expand Down
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ genome: 'hg38'
# Enrichr (https://maayanlab.cloud/Enrichr/#libraries)
# JSON example content: { "MyDB_Term1": ["geneA","geneB","geneC"],"MyDB_Term2": ["geneX","geneY","geneZ"]}
local_databases:
My_GMT_db: "path/to/My_GMT_db.gmt"
MSigDB_H_C2_CGP: "path/to/a/GMT_db.gmt"

## LOLA compatible region set databases
# loaded using loadRegionDB() (https://code.databio.org/LOLA/reference/loadRegionDB.html)
Expand Down
6 changes: 6 additions & 0 deletions workflow/envs/region_enrichment_analysis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,9 @@ dependencies:
- bioconductor-rgreat=2.4.0
- r-data.table=1.15.2
- bioconductor-rtracklayer=1.62.0
- bioconductor-org.mm.eg.db=3.18.0
- bioconductor-org.hs.eg.db=3.18.0
- bioconductor-txdb.mmusculus.ucsc.mm10.knowngene=3.10.0
- bioconductor-txdb.hsapiens.ucsc.hg19.knowngene=3.2.2
- bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.18.0
- bioconductor-txdb.mmusculus.ucsc.mm9.knowngene=3.2.2
38 changes: 21 additions & 17 deletions workflow/rules/enrichment_analysis.smk
Original file line number Diff line number Diff line change
Expand Up @@ -89,23 +89,27 @@ rule region_motif_enrichment_analysis_pycisTarget:
"logs/rules/region_enrichment_analysis_pycisTarget_{region_set}_{database}.log"
shell:
"""
pycistarget cistarget \
--cistarget_db_fname {input.ctx_db} \
--bed_fname {input.regions} \
--output_folder $(dirname {output.motif_hdf5}) \
--fr_overlap_w_ctx_db {params.fraction_overlap_w_cistarget_database} \
--auc_threshold {params.auc_threshold} \
--nes_threshold {params.nes_threshold} \
--rank_threshold {params.rank_threshold} \
--path_to_motif_annotations {input.motif2tf} \
--annotation_version {params.annotation_version} \
--annotations_to_use {params.annotations_to_use} \
--motif_similarity_fdr {params.motif_similarity_fdr} \
--orthologous_identity_threshold {params.orthologous_identity_threshold} \
--species {params.species} \
--name {wildcards.region_set} \
--output_mode 'hdf5' \
--write_html
{{
pycistarget cistarget \
--cistarget_db_fname {input.ctx_db} \
--bed_fname {input.regions} \
--output_folder $(dirname {output.motif_hdf5}) \
--fr_overlap_w_ctx_db {params.fraction_overlap_w_cistarget_database} \
--auc_threshold {params.auc_threshold} \
--nes_threshold {params.nes_threshold} \
--rank_threshold {params.rank_threshold} \
--path_to_motif_annotations {input.motif2tf} \
--annotation_version {params.annotation_version} \
--annotations_to_use {params.annotations_to_use} \
--motif_similarity_fdr {params.motif_similarity_fdr} \
--orthologous_identity_threshold {params.orthologous_identity_threshold} \
--species {params.species} \
--name {wildcards.region_set} \
--output_mode 'hdf5' \
--write_html
}} || {{
echo "An error occurred during the region TFBS motif enrichment analysis using pycisTarget"; touch {output.motif_hdf5} {output.motif_html}; exit 0;
}}
"""

# postprocess results from pycisTarget
Expand Down
61 changes: 38 additions & 23 deletions workflow/scripts/gene_enrichment_analysis_RcisTarget.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,31 +51,46 @@ ranking_df <- getRanking(motifRankings)
# subset gene list for supported genes
geneSets[[gene_set_name]] <- intersect(colnames(ranking_df), geneSets[[gene_set_name]])

# quit early if there is no overlap
if (length(geneSets[[gene_set_name]])==0){
print("No overlap between ranking database and query genes.")
file.create(result_path)
quit(save = "no", status = 0)
}

# load the motif to TF annotation
motifAnnot <- importAnnotations(motif2tf_path, motifsInRanking = ranking_df$features)

###### RcisTarget

# run RcisTarget
motifEnrichmentTable_wGenes <- cisTarget(geneSets = geneSets,
motifRankings = motifRankings,
motifAnnot = motifAnnot,
motifAnnot_highConfCat = c(rcistarget_params[["motifAnnot_highConfCat"]]),
motifAnnot_lowConfCat = c(rcistarget_params[["motifAnnot_lowConfCat"]]),
highlightTFs = NULL,
nesThreshold = rcistarget_params[["nesThreshold"]],
aucMaxRank = rcistarget_params[["aucMaxRank_factor"]] * ncol(motifRankings),
geneErnMethod = rcistarget_params[["geneErnMethod"]],
geneErnMaxRank = rcistarget_params[["geneErnMaxRank"]],
nCores = cores_n,
verbose = TRUE
)

# format result table
motifEnrichmentTable_wGenes$description <- sapply(motifEnrichmentTable_wGenes$TF_highConf, process_genes)
motifEnrichmentTable_wGenes$description <- paste0(motifEnrichmentTable_wGenes$motif, " (",motifEnrichmentTable_wGenes$description,")")
# motifEnrichmentTable_wGenes$description <- paste0(motifEnrichmentTable_wGenes$motif, " (",motifEnrichmentTable_wGenes$TF_highConf,")")
motifEnrichmentTable_wGenes$name <- gene_set_name

# save result table
fwrite(as.data.frame(motifEnrichmentTable_wGenes), file=file.path(result_path), row.names=FALSE) #quote=FALSE
# run RcisTarget with try/catch exception handling
tryCatch({
motifEnrichmentTable_wGenes <- cisTarget(geneSets = geneSets,
motifRankings = motifRankings,
motifAnnot = motifAnnot,
motifAnnot_highConfCat = c(rcistarget_params[["motifAnnot_highConfCat"]]),
motifAnnot_lowConfCat = c(rcistarget_params[["motifAnnot_lowConfCat"]]),
highlightTFs = NULL,
nesThreshold = rcistarget_params[["nesThreshold"]],
aucMaxRank = rcistarget_params[["aucMaxRank_factor"]] * ncol(motifRankings),
geneErnMethod = rcistarget_params[["geneErnMethod"]],
geneErnMaxRank = rcistarget_params[["geneErnMaxRank"]],
nCores = cores_n,
verbose = TRUE
)

# format result table
motifEnrichmentTable_wGenes$description <- sapply(motifEnrichmentTable_wGenes$TF_highConf, process_genes)
motifEnrichmentTable_wGenes$description <- paste0(motifEnrichmentTable_wGenes$motif, " (",motifEnrichmentTable_wGenes$description,")")
motifEnrichmentTable_wGenes$name <- gene_set_name

# save result table
fwrite(as.data.frame(motifEnrichmentTable_wGenes), file=file.path(result_path), row.names=FALSE) #quote=FALSE
}, error = function(e) {
print("An error occurred during the cisTarget analysis.")
overlap_percentage <- round(length(intersect(geneSets[[gene_set_name]], background)) / length(background) * 100, 2)
print(paste("Overlap between query and background gene set might be too high with ", overlap_percentage,"%."))
print(e)
file.create(result_path)
quit(save = "no", status = 0)
})
2 changes: 1 addition & 1 deletion workflow/scripts/overview_plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ pheatmap(adjp_df,
cellwidth=10,
cellheight=10,
filename=adjp_hm_path,
breaks=seq(0, max(adjp_df), length.out=200),
breaks= if (max(adjp_df)==0) seq(0, 1, length.out=200) else seq(0, max(adjp_df), length.out=200),
color=colorRampPalette(c("white", "red"))(200)
)

Expand Down
6 changes: 6 additions & 0 deletions workflow/scripts/process_results_pycisTarget.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

# libraries
import os
import pycistarget
from pycistarget.input_output import read_hdf5

Expand All @@ -15,6 +16,11 @@
region_set_name = snakemake.wildcards["region_set"]
term_col = snakemake.config["pycistarget_parameters"]["annotations_to_use"][0]

# quit early if file is empty
if os.path.getsize(motif_hdf5_path) == 0:
open(motif_csv_path, 'w').close()
quit()

# load pycisTarget results from hdf5
results = read_hdf5(motif_hdf5_path)

Expand Down

0 comments on commit 03b64cd

Please sign in to comment.