TCGA_correlations_BRCA.Rmd

---
title: "Genes best correlating with the selected gene"
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
always_allow_html: yes
output:
  pdf_document:
    toc: no
  html_document:
    theme: united
    toc: yes
---

```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is', fig.width = 10, fig.height = 6) #out.width=700, 
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
library(dplyr)
options(stringsAsFactors = FALSE)
```

```{r libraries, include=FALSE}
library(openxlsx)
library(writexl)
library(MDmisc)
library(org.Hs.eg.db)
library(KEGG.db)
library(TCGA2STAT)
library(dplyr)
library(knitr)
library(sva)
# library(clusterProfiler)
# library(pathview)
# devtools::install_github("mdozmorov/enrichR")
# library(enrichR)
source("https://raw.githubusercontent.com/mdozmorov/enrichR/master/R/api_wrapper.R")
source("https://raw.githubusercontent.com/mdozmorov/RNA-seq/master/calcTPM.R")
library(enrichR)
library(annotables)
# Append gene length
grch37 <- grch37 %>% mutate(Length = abs(end - start))
# Remove non-canonical chromosome names
grch37 <- grch37[ !(grepl("_", grch37$chr) | grepl("GL", grch37$chr)), ] %>% as.data.frame()
grch37 <- grch37[, c("symbol", "description", "Length")]
# grch37 <- grch37[ complete.cases(grch37) , ]
grch37 <- grch37[ !duplicated(grch37), ]
```

```{r functions}
# A function to load TCGA data, from remote repository, or a local R object
load_data <- function(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE) {
  FILE = paste0(data_dir, "/mtx_", disease, "_", data.type, "_", type, ".rda") # R object with data
  if (all(file.exists(FILE), !(force_reload))) {
    # If the data has been previously saved, load it
    load(file = FILE)
  } else {
    # If no saved data exists, get it from the remote source
    mtx <- getTCGA(disease = disease, data.type = data.type, type = type, clinical = TRUE)
    save(file = FILE, list = c("mtx")) # Save it
  }
  return(mtx)
}

# A function to get data overview
summarize_data <- function(mtx = mtx) {
  print(paste0("Dimensions of expression matrix, genex X patients: ", paste(dim(mtx$dat), collapse = " ")))
  print(paste0("Dimensions of clinical matrix, patients X parameters: ", paste(dim(mtx$clinical), collapse = " ")))
  print(paste0("Dimensions of merged matrix, patients X parameters + genes: ", paste(dim(mtx$merged.dat), collapse = " ")))
  print("Head of the merged matrix")
  print(mtx$merged.dat[1:5, 1:10])
  print("Head of the clinical matrix")
  print(mtx$clinical[1:5, 1:7])
  print("List of clinical values, and frequency of each variable: ")
  clin_vars <- apply(mtx$clinical, 2, function(x) length(table(x[ !(is.na(x) & x != "" )]))) %>% as.data.frame()
  # Filter clinical variables to have at least 2, but no more than 10 categories,
  # And they are not dates
  clin_vars <- clin_vars[ as.numeric(clin_vars$.) > 1 & as.numeric(clin_vars$.) < 10 & !grepl("years|days|date|vital|OS|RFS|TIME|sample_type", rownames(clin_vars), perl = TRUE) , , drop = FALSE]
  print(kable(clin_vars))
  return(rownames(clin_vars))
}

```

```{r settings}
system("mkdir -p data")
system("mkdir -p results")
# Path where the downloaded data is stored
data_dir = "/Users/mdozmorov/Documents/Data/GenomeRunner/TCGAsurvival/data" # Mac
# data_dir = "F:/Data/GenomeRunner/TCGAsurvival/data" # Windows

# Selected genes
precalculated  <- FALSE
selected_genes <- c("SPHK2") # If nothing precalculated - use one of the genes
method         <- "" # If correlation with the selected_gene is measured, method is empty
# If precalculated, use precalculated values
# precalculated  <- TRUE 
# selected_genes <- "interferon_signature" 
# method         <- "NMF" # Which dimensionaliry reduction results to use, from NMF, PCA, FA

# Data type
data.type = "RNASeq2" ; type = "" 
# data.type = "2018_pub"; type = "mrna" # Neuroblastoma

# Expression cutoff to select a particular range of expression of the selected gene.
# To use all expression, use "0" expression cutoff and "TRUE" top_expression (Default)
expression_cutoff <- 0   # From 0 to 1, percent cutoff of expression of the selected gene
top_expression    <- TRUE # Whether to take top (TRUE) of bottom (FALSE) expression 

# All cancers with RNASeq2 data
# cancer = c("ACC", "BLCA", "HNSC" , "CESC", "CHOL", "COAD", "COADREAD", "DLBC", "ESCA", "GBM", "GBMLGG", "HNSC", "KICH", "KIPAN", "KIRC", "KIRP", "LGG", "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS")
# fileNameIn <- (paste0("data/All_expression_", data.type, "_", type, ".Rda")) # Save expression data
# fileNameOut <- paste0("results/All_correlation_", selected_genes, "_", data.type, "_", type, ".Rda") # Save correlation data
# fileNameRes <- paste0("results/All_results_", selected_genes, "_", data.type, "_", type, ".xlsx") # Save results
# Or, several cancers
cancer = c("BRCA")
# cancer = "nbl_target"  # Neuroblastoma

# Correlation type
corr_type    <- "pearson"
# Correlation cutoffs
corr_cutoff  <- 0.2
p_val_cutoff   <- 0.05 # Regular p-value cutoff
p_adj_cutoff   <- 0.3 # FDR cutoff
min_kegg_genes <- 20 # Minimum number of genes to run enrichment analysis on
max_kegg_genes <- 2000 # Maximum number of genes to run enrichment analysis on
up_dn_separate <- FALSE # Whether to run KEGG separately on up- and downregulated genes. FALSE - do not distinguish directionality
ntable         <- 15 # Number of genes to output in a DEG table

# Save results
fileNameIn <- (paste0("data/Expression_", paste(cancer, collapse = "_"), ".Rda"))  # Save expression data
fileNameOut <- paste0("data/Correlation_", selected_genes, "_", paste(cancer, collapse = "_"), ".Rda") # Save correlation data
fileNameRes <- paste0("results/Results_", selected_genes, "_", paste(cancer, collapse = "_"), "_PAM50.xlsx")
```

# Add PAM50 classification

```{r}
mtx <- load_data(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE)
# BRCA-specific - replace original annotations with XENA
mtx$clinical <- read.csv("data.TCGA/XENA_classification.csv", row.names = 1)

clinical_annotations <- summarize_data(mtx = mtx)
# Prepare expression data
expr <- mtx$merged.dat[ , 4:ncol(mtx$merged.dat)] %>% as.matrix
# Filter out low expressed genes
# Should be more than 90% of non-zero values
# ff <- genefilter::pOverA(p = 0.9, A = 0, na.rm = TRUE) 
# expr <- expr[, apply(expr, 2, ff)] 
expr <- data.frame(AffyID = mtx$merged.dat$bcr, expr, stringsAsFactors = FALSE)
# Prepare clinical data
clin <- mtx$merged.dat[, 1:3]
colnames(clin)[1] <- "AffyID"
# Full clinical information
clin_full <- mtx$clinical
# Match to the order of small clinical annitation
clin_full <- clin_full[rownames(clin_full) %in% clin$AffyID, ]
clin_full <- clin_full[match(clin$AffyID, rownames(clin_full)), ]
# Sanity check
all.equal(expr$AffyID, rownames(clin_full))
```

# Correlation in each PAM50 subgroup

```{r correlations}
sheet <- list()
# For each PAM50Call_RNAseq annotation
for (annotation in unique(clin_full$PAM50Call_RNAseq[!is.na(clin_full$PAM50Call_RNAseq)])) { 
  all_expression <- expr[ expr$AffyID %in% rownames(clin_full)[clin_full$PAM50Call_RNAseq == annotation], ] # Subset to the current PAM50 annotation
  all_expression$AffyID <- NULL # Remove ID column
  all_expression <- t(all_expression) # Transpose
  all_corrs <- vector(mode = "numeric", length = nrow(all_expression))
  all_pvals <- vector(mode = "numeric", length = nrow(all_expression))
 for (i in 1:nrow(all_expression)) {
    # Calculate the correlation
    cors <- Hmisc::rcorr(all_expression[ rownames(all_expression) == selected_genes, ],
                         all_expression[ i, ], type = corr_type)
    all_corrs[i] <- cors[[1]][1, 2]
    all_pvals[i] <- cors[[3]][1, 2]
  }
  
  correlations <- data.frame(hgnc = rownames(all_expression), corr = all_corrs, pval = all_pvals)
  # Remove genes for which correlation cannot be calculated
  correlations <- correlations[complete.cases(correlations), ]
  # Sort in decreasing order
  correlations <- correlations[ order(correlations$corr, decreasing = TRUE), ]
  # Save correlation results
  sheet <- c(sheet, list(correlations))
  names(sheet)[length(sheet)] <- annotation
}
write_xlsx(sheet, path = fileNameRes)
```