Added more functions

neurogenomics · Oct 22, 2024 · 4e9c315 · 4e9c315
1 parent 3560809
commit 4e9c315
Show file tree

Hide file tree

Showing 19 changed files with 579 additions and 74 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,8 @@
 # Generated by roxygen2: do not edit by hand
 
-export(sc_cell_type_de)
+export(DGE_analysis)
+export(power_analysis)
+export(within_data_correlation)
 importFrom(EnsDb.Hsapiens.v79,EnsDb.Hsapiens.v79)
 importFrom(Matrix,rowSums)
 importFrom(SingleCellExperiment,colData)
@@ -36,7 +38,11 @@ importFrom(ggplot2,geom_point)
 importFrom(ggplot2,ggplot)
 importFrom(ggplot2,ggsave)
 importFrom(ggplot2,ggtitle)
+importFrom(ggplot2,guide_legend)
+importFrom(ggplot2,guides)
 importFrom(ggplot2,labs)
+importFrom(ggplot2,scale_alpha)
+importFrom(ggplot2,scale_colour_brewer)
 importFrom(ggplot2,scale_colour_manual)
 importFrom(ggplot2,scale_fill_gradient2)
 importFrom(ggplot2,scale_shape_manual)

diff --git a/R/sc_cell_type_de.R → R/DGE_analysis.R b/R/sc_cell_type_de.R → R/DGE_analysis.R
@@ -70,11 +70,11 @@
 #'          .N,by=.(celltype,chromosome_name)]
 #'}
 
-sc_cell_type_de <- function(SCE, design, pseudobulk_ID, celltype_ID, y=NULL,
-                            region="single_region", coef=NULL, control=NULL,
-                            pval_adjust_method = "BH", adj_pval=0.05,
-                            folder="sc.cell.type.de.graphs/",
-                            rmv_zero_count_genes=TRUE, verbose=F){
+DGE_analysis <- function(SCE, design, pseudobulk_ID, celltype_ID, y=NULL,
+                         region="single_region", coef=NULL, control=NULL,
+                         pval_adjust_method = "BH", adj_pval=0.05,
+                         folder="sc.cell.type.de.graphs/",
+                         rmv_zero_count_genes=TRUE, verbose=F){
 
     # need to load SCE if a directory is passed
     if(class(SCE)[1]=="character"){
@@ -135,8 +135,8 @@ sc_cell_type_de <- function(SCE, design, pseudobulk_ID, celltype_ID, y=NULL,
 
     # run edgeR LRT DE analysis
     celltype_de <-
-        de_analysis(pb_dat,formula,y_name=y,y_contin,coef,control,
-                        pval_adjust_method, adj_pval, verbose)
+        differential_expression(pb_dat,formula,y_name=y,y_contin,coef,control,
+                                pval_adjust_method, adj_pval, verbose)
     # get sig DEGs for each
     celltype_DEGs <- lapply(celltype_de, function(x) x[x$adj_pval<adj_pval,])
 

diff --git a/R/correlation_boxplots.r b/R/correlation_boxplots.r
@@ -0,0 +1,236 @@
+# Define global variables
+utils::globalVariables(c("alpha"))
+
+#' Obtain box plots for the correlations of all celltypes, and the mean correlations at a specified cutoff p-value
+
+#' @importFrom ggplot2 geom_boxplot ggplot labs facet_wrap theme aes element_text element_blank scale_colour_brewer scale_alpha guides guide_legend
+#' @importFrom cowplot theme_cowplot
+
+#' @param corrMats (named) list of correlation matrices for each celltype with the final element being the mean correlation matrix, all at specified p-value
+#' @param numRealDatasets total number of *real* datasets (most likely the number of studies, but sometimes a study may be split e.g. into 2 brain regions, so in this case it would be the number of studies plus 1)
+#' @param pval the cut-off p-value which was used to select DEGs
+#' @param alphaval (alpha) transparency of the non-mean boxplots
+#' @param numPerms number of random permutations of the dataset used to select significant DEGs from
+#' @param numSubsets number of pairs of random subsets of the dataset used to select significant DEGs from
+#' @param sexDEGs true if DEGs come from sex chromosomes, else false
+#' @param fontsize_yaxislabels font size for axis labels in plot
+#' @param fontsize_yaxisticks font size for axis tick labels in plot
+#' @param fontsize_title font size for plot title
+#' @param fontsize_legendlabels font size for legend labels in plot
+#' @param fontsize_legendtitle font size for legend title in plot
+#' @param fontsize_facet_labels font size for facet labels
+
+#' @return box plots for correlation matrices at a certain p-value cut-off, sorted by celltype and then type of correlation
+
+correlation_boxplots <- function(corrMats,
+                                 numRealDatasets,
+                                 pval,
+                                 alphaval=0.25,
+                                 numPerms=5,
+                                 numSubsets=5,
+                                 sexDEGs=FALSE,
+                                 fontsize_yaxislabels=12,
+                                 fontsize_yaxisticks=9,
+                                 fontsize_title=14,
+                                 fontsize_legendlabels=9,
+                                 fontsize_legendtitle=9,
+                                 fontsize_facet_labels=9){
+
+    # check input parameters are fine
+    if(class(corrMats)!="list"){
+        stop("Error: corrMats should be a list of matrices")
+    }
+    if(floor(numRealDatasets)!=numRealDatasets){
+        stop("Error: numRealDatasets should be an integer specifying the number of real datasets (see description)")
+    }
+    if(!is.numeric(pval) | pval <= 0 | pval > 1){
+        stop("Error: pval should be a (positive) number between 0 and 1")
+    }
+    if(floor(numPerms)!=numPerms){
+        stop("Error: numPerms should be an integer specifying the number of random permutations of Tsai (see description)")
+    }
+    if(floor(numSubsets)!=numSubsets){
+        stop("Error: numSubsets should be an integer specifying the number of subsets of Tsai (see description)")
+    }
+    if(class(sexDEGs)!="logical"){
+        stop("Error: sexDEGs should be TRUE (if DEGs are chosen only from sex chromosomes) or FALSE")
+    }
+    if(class(fontsize_yaxislabels)!="numeric"){
+        stop("Error: fontsize_yaxislabels should be numerical.")
+    }else{
+        if(fontsize_yaxislabels-floor(fontsize_yaxislabels)!=0|fontsize_yaxislabels<0){
+            stop("Error: fontsize_yaxislabels should be a positive integer.")
+        }
+    }
+    if(class(fontsize_yaxisticks)!="numeric"){
+        stop("Error: fontsize_yaxisticks should be numerical.")
+    }else{
+        if(fontsize_yaxisticks-floor(fontsize_yaxisticks)!=0|fontsize_yaxisticks<0){
+            stop("Error: fontsize_yaxisticks should be a positive integer.")
+        }
+    }
+    if(class(fontsize_title)!="numeric"){
+        stop("Error: fontsize_title should be numerical.")
+    }else{
+        if(fontsize_title-floor(fontsize_title)!=0|fontsize_title<0){
+            stop("Error: fontsize_title should be a positive integer.")
+        }
+    }
+    if(class(fontsize_legendlabels)!="numeric"){
+        stop("Error: fontsize_legendlabels should be numerical.")
+    }else{
+        if(fontsize_legendlabels-floor(fontsize_legendlabels)!=0|fontsize_legendlabels<0){
+            stop("Error: fontsize_legendlabels should be a positive integer.")
+        }
+    }
+    if(class(fontsize_legendtitle)!="numeric"){
+        stop("Error: fontsize_legendtitle should be numerical.")
+    }else{
+        if(fontsize_legendtitle-floor(fontsize_legendtitle)!=0|fontsize_legendtitle<0){
+            stop("Error: fontsize_legendtitle should be a positive integer.")
+        }
+    }
+    if(class(fontsize_facet_labels)!="numeric"){
+        stop("Error: fontsize_facet_labels should be numerical.")
+    }else{
+        if(fontsize_facet_labels-floor(fontsize_facet_labels)!=0|fontsize_facet_labels<0){
+            stop("Error: fontsize_facet_labels should be a positive integer.")
+        }
+    }
+
+    # midCor submatrix limits
+    midCorLim <- numPerms + numRealDatasets
+    # list to hold results
+    corrOuts <- c()
+    # index
+    j <- 1
+
+    # get lists with all correlations
+    for(corrMat in corrMats){
+        # specify submatrices with upper/middle/lower bounds
+        lower <- corrMat[1:numPerms,1:numPerms]
+        middle <- corrMat[(numPerms+1):midCorLim,(numPerms+1):midCorLim]
+        upper <- corrMat[(midCorLim+1):(midCorLim+numSubsets),(midCorLim+1):(midCorLim+numSubsets)]
+        # convert each one to a list and remove "1" (selfcorrelation)
+        lower <- unique(unlist(as.list(lower)))
+        lower <- lower[-c(1)]
+        middle <- unique(unlist(as.list(middle)))
+        middle <- middle[-c(1)]
+        upper <- unique(unlist(as.list(upper)))
+        upper <- upper[-c(1)]
+        # store in list
+        corrOuts[[j]] <- list(lower,middle,upper)
+        names(corrOuts)[[j]] <- names(corrMats)[[j]]
+        # increment
+        j <- j+1
+    }
+
+    # store in dataframe
+    i <- 1
+    # empty dataframe
+    df <- data.frame()
+    # fill dataframe
+    for(out in corrOuts){
+        # define variables
+        var1 <- replicate(length(out[[1]])+length(out[[2]])+length(out[[3]]), names(corrOuts)[[i]]) #celltype
+        var2 <- c(replicate(length(out[[1]]),"Random Permutations"),replicate(length(out[[2]]),"Between Study"),replicate(length(out[[3]]),"Within-study subsamples"))
+        val <- unlist(out)
+        # put in dataframe
+        df_new <- data.frame(var1)
+        df_new$var2 <- var2
+        df_new$val <- val
+        # join
+        df <- rbind(df,df_new)
+        i <- i+1
+    }
+
+    df$alpha <- replicate(dim(df)[[1]],alphaval)
+    df$alpha <- ifelse(df$var1=="Mean", 1, ifelse(df$var1!="Mean",alphaval,alphaval))
+    unique_alphas <- df[!duplicated(df[,c("var1")]),]$alpha
+
+    # box plot
+    if(pval == 1){
+        if(sexDEGs == FALSE){
+            fig.plot <- ggplot(df,
+                aes(x=factor(var1,levels=c("Mean","Astro","Endo","Micro","Oligo")),y=val))+
+                geom_boxplot(outlier.shape=NA,aes(fill=factor(var1,levels=c("Mean","Astro","Endo","Micro","Oligo")),alpha=alpha),width=5)+
+                theme_cowplot()+
+                scale_colour_brewer(palette = "Set1")+
+                labs(y="Correlation", x="Type", fill="Cell Type",title=paste0("Using all DEGs (from all chromosomes)"))+
+                facet_wrap(factor(df$var2,levels=c("Random Permutations","Between Study","Within-study subsamples")), scales="fixed")+
+                theme(axis.title.x=element_blank(),
+                      axis.text.x=element_blank(),
+                      axis.ticks.x=element_blank(),
+                      axis.title.y = element_text(size = fontsize_yaxislabels),
+                      axis.text.y = element_text(size = fontsize_yaxisticks),
+                      plot.title = element_text(size = fontsize_title),
+                      legend.text = element_text(size = fontsize_legendlabels),
+                      legend.title = element_text(size = fontsize_legendtitle),
+                      strip.text = element_text(size = fontsize_facet_labels))+
+                scale_alpha(guide = 'none')+
+                guides(fill=guide_legend(override.aes = list(alpha=unique_alphas)))
+        }else{
+            fig.plot <- ggplot(df,
+                aes(x=factor(var1,levels=c("Mean","Astro","Endo","Micro","Oligo")),y=val))+
+                geom_boxplot(outlier.shape=NA,aes(fill=factor(var1,levels=c("Mean","Astro","Endo","Micro","Oligo")),alpha=alpha),width=5)+
+                theme_cowplot()+
+                scale_colour_brewer(palette = "Set1")+
+                labs(y="Correlation", x="Type", fill="Cell Type",title=paste0("Using all DEGs (from sex chromosomes)"))+
+                facet_wrap(factor(df$var2,levels=c("Random Permutations","Between Study","Within-study subsamples")), scales="fixed")+
+                theme(axis.title.x=element_blank(),
+                      axis.text.x=element_blank(),
+                      axis.ticks.x=element_blank(),
+                      axis.title.y = element_text(size = fontsize_yaxislabels),
+                      axis.text.y = element_text(size = fontsize_yaxisticks),
+                      plot.title = element_text(size = fontsize_title),
+                      legend.text = element_text(size = fontsize_legendlabels),
+                      legend.title = element_text(size = fontsize_legendtitle),
+                      strip.text = element_text(size = fontsize_facet_labels))+
+                scale_alpha(guide = 'none')+
+                guides(fill=guide_legend(override.aes = list(alpha=unique_alphas)))
+        }
+    }else{
+        if(sexDEGs == FALSE){
+            fig.plot <- ggplot(df,
+                aes(x=factor(var1,levels=c("Mean","Astro","Endo","Micro","Oligo")),y=val))+
+                geom_boxplot(outlier.shape=NA,aes(fill=factor(var1,levels=c("Mean","Astro","Endo","Micro","Oligo")),alpha=alpha),width=5)+
+                theme_cowplot()+
+                scale_colour_brewer(palette = "Set1")+
+                labs(y="Correlation", x="Type", fill="Cell Type",title=paste0("DEGs selected at a ",pval*100,"% cut-off (from all chromosomes)"))+
+                facet_wrap(factor(df$var2,levels=c("Random Permutations","Between Study","Within-study subsamples")), scales="fixed")+
+                theme(axis.title.x=element_blank(),
+                      axis.text.x=element_blank(),
+                      axis.ticks.x=element_blank(),
+                      axis.title.y = element_text(size = fontsize_yaxislabels),
+                      axis.text.y = element_text(size = fontsize_yaxisticks),
+                      plot.title = element_text(size = fontsize_title),
+                      legend.text = element_text(size = fontsize_legendlabels),
+                      legend.title = element_text(size = fontsize_legendtitle),
+                      strip.text = element_text(size = fontsize_facet_labels))+
+                scale_alpha(guide = 'none')+
+                guides(fill=guide_legend(override.aes = list(alpha=unique_alphas)))
+        }else{
+            fig.plot <- ggplot(df,
+                aes(x=factor(var1,levels=c("Mean","Astro","Endo","Micro","Oligo")),y=val))+
+                geom_boxplot(outlier.shape=NA,aes(fill=factor(var1,levels=c("Mean","Astro","Endo","Micro","Oligo")),alpha=alpha),width=5)+
+                theme_cowplot()+
+                scale_colour_brewer(palette = "Set1")+
+                labs(y="Correlation", x="Type", fill="Cell Type",title=paste0("DEGs selected at a ",pval*100,"% cut-off (from sex chromosomes)"))+
+                facet_wrap(factor(df$var2,levels=c("Random Permutations","Between Study","Within-study subsamples")), scales="fixed")+
+                theme(axis.title.x=element_blank(),
+                      axis.text.x=element_blank(),
+                      axis.ticks.x=element_blank(),
+                      axis.title.y = element_text(size = fontsize_yaxislabels),
+                      axis.text.y = element_text(size = fontsize_yaxisticks),
+                      plot.title = element_text(size = fontsize_title),
+                      legend.text = element_text(size = fontsize_legendlabels),
+                      legend.title = element_text(size = fontsize_legendtitle),
+                      strip.text = element_text(size = fontsize_facet_labels))+
+                scale_alpha(guide = 'none')+
+                guides(fill=guide_legend(override.aes = list(alpha=unique_alphas)))
+        }
+    }
+
+    return(fig.plot)
+
+}
diff --git a/R/de_analysis.R → R/differential_expression.R b/R/de_analysis.R → R/differential_expression.R
@@ -18,8 +18,8 @@
 #' @return a list containing differential expression data (a dataframe) for each cell type. The dataframe contains:
 #' log fold change (logFC), log counts per million (logCPM), log ratio (LR), p-value (PValue), adjusted p-value (adj_pval)
 
-de_analysis <- function(pb_dat,formula,y_name,y_contin,coef, control,
-                            pval_adjust_method, adj_pval,verbose){
+differential_expression <- function(pb_dat,formula,y_name,y_contin,coef, control,
+                                    pval_adjust_method, adj_pval,verbose){
     # use Likelihood ratio test from edgeR, found to have best perf in a recent benchmark: https://www.biorxiv.org/content/10.1101/2021.03.12.435024v1.full
     # other option for edgeR is quasi-likelihood F-tests
     # run each cell type through

diff --git a/R/downsampling_DEanalysis.r b/R/downsampling_DEanalysis.r
@@ -68,7 +68,7 @@ downsampling_DEanalysis <- function(data,
     # check if DE analysis output present already in output_path
     if(!"DEout.RData" %in% list.files(output_path)){
         # run and save DE analysis
-        assign("DEout", sc_cell_type_de(data, design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff))
+        assign("DEout", DGE_analysis(data, design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff))
         save(DEout,file=paste0(output_path,"/DEout.RData"))
     }else{
         load(paste0(output_path,"/DEout.RData"))
@@ -115,7 +115,7 @@ downsampling_DEanalysis <- function(data,
                 dir.create(path,showWarnings=FALSE)
                 setwd(path)
                 # ensure sexID isnt "Sex", has to be lower case (change this in SCE if needed)
-                assign(paste0("DEout_",toString(value)), sc_cell_type_de(samples[[j]], design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff))
+                assign(paste0("DEout_",toString(value)), DGE_analysis(samples[[j]], design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff))
                 # save output
                 save(list=eval(paste0("DEout_",toString(value))),file=paste0("DEout",toString(value),"_",j,".RData"))
                 # get number of TP DEGs
@@ -170,7 +170,7 @@ downsampling_DEanalysis <- function(data,
                 dir.create(path,showWarnings=FALSE)
                 setwd(path)
                 # ensure sexID isnt "Sex", has to be lower case (change this in SCE if needed)
-                assign(paste0("DEout_",toString(value)), sc_cell_type_de(cells[[j]], design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff))
+                assign(paste0("DEout_",toString(value)), DGE_analysis(cells[[j]], design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff))
                 # save output
                 save(list=eval(paste0("DEout_",toString(value))),file=paste0("DEout",toString(value),"_",j,".RData"))
                 # get number of TP DEGs

diff --git a/R/downsampling_corrplots.r b/R/downsampling_corrplots.r
@@ -63,7 +63,7 @@ downsampling_corrplots <- function(data,
     # check if DE analysis output present already in output_path
     if(!"DEout.RData" %in% list.files(output_path)){
         # run and save DE analysis
-        assign("DEout", sc_cell_type_de(data, design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff))
+        assign("DEout", DGE_analysis(data, design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff))
         save(DEout,file=paste0(output_path,"/DEout.RData"))
     }else{
         load(paste0(output_path,"/DEout.RData"))

diff --git a/R/plot_de_analysis.R b/R/plot_de_analysis.R
@@ -1,7 +1,7 @@
 # Define global variables
 utils::globalVariables(c("deg_direction",".I","adj_pval","celltype","i.deg_direction","phenotype","gene_name","x.hgnc_symbol",".N","N","num_cells","prop","N_prop","colour_ident","."))
 
-#' Create differential expression analysis plots. Run by sc_cell_type_de()
+#' Create differential expression analysis plots. Run by DGE_analysis()
 
 #' @importFrom EnsDb.Hsapiens.v79 EnsDb.Hsapiens.v79
 #' @importFrom data.table rbindlist setnames as.data.table setkey data.table setorder :=