-
Notifications
You must be signed in to change notification settings - Fork 42
/
TCGA_correlations_BRCA.Rmd
196 lines (176 loc) · 8.3 KB
/
TCGA_correlations_BRCA.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
---
title: "Genes best correlating with the selected gene"
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
always_allow_html: yes
output:
pdf_document:
toc: no
html_document:
theme: united
toc: yes
---
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is', fig.width = 10, fig.height = 6) #out.width=700,
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
library(dplyr)
options(stringsAsFactors = FALSE)
```
```{r libraries, include=FALSE}
library(openxlsx)
library(writexl)
library(MDmisc)
library(org.Hs.eg.db)
library(KEGG.db)
library(TCGA2STAT)
library(dplyr)
library(knitr)
library(sva)
# library(clusterProfiler)
# library(pathview)
# devtools::install_github("mdozmorov/enrichR")
# library(enrichR)
source("https://raw.githubusercontent.com/mdozmorov/enrichR/master/R/api_wrapper.R")
source("https://raw.githubusercontent.com/mdozmorov/RNA-seq/master/calcTPM.R")
library(enrichR)
library(annotables)
# Append gene length
grch37 <- grch37 %>% mutate(Length = abs(end - start))
# Remove non-canonical chromosome names
grch37 <- grch37[ !(grepl("_", grch37$chr) | grepl("GL", grch37$chr)), ] %>% as.data.frame()
grch37 <- grch37[, c("symbol", "description", "Length")]
# grch37 <- grch37[ complete.cases(grch37) , ]
grch37 <- grch37[ !duplicated(grch37), ]
```
```{r functions}
# A function to load TCGA data, from remote repository, or a local R object
load_data <- function(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE) {
FILE = paste0(data_dir, "/mtx_", disease, "_", data.type, "_", type, ".rda") # R object with data
if (all(file.exists(FILE), !(force_reload))) {
# If the data has been previously saved, load it
load(file = FILE)
} else {
# If no saved data exists, get it from the remote source
mtx <- getTCGA(disease = disease, data.type = data.type, type = type, clinical = TRUE)
save(file = FILE, list = c("mtx")) # Save it
}
return(mtx)
}
# A function to get data overview
summarize_data <- function(mtx = mtx) {
print(paste0("Dimensions of expression matrix, genex X patients: ", paste(dim(mtx$dat), collapse = " ")))
print(paste0("Dimensions of clinical matrix, patients X parameters: ", paste(dim(mtx$clinical), collapse = " ")))
print(paste0("Dimensions of merged matrix, patients X parameters + genes: ", paste(dim(mtx$merged.dat), collapse = " ")))
print("Head of the merged matrix")
print(mtx$merged.dat[1:5, 1:10])
print("Head of the clinical matrix")
print(mtx$clinical[1:5, 1:7])
print("List of clinical values, and frequency of each variable: ")
clin_vars <- apply(mtx$clinical, 2, function(x) length(table(x[ !(is.na(x) & x != "" )]))) %>% as.data.frame()
# Filter clinical variables to have at least 2, but no more than 10 categories,
# And they are not dates
clin_vars <- clin_vars[ as.numeric(clin_vars$.) > 1 & as.numeric(clin_vars$.) < 10 & !grepl("years|days|date|vital|OS|RFS|TIME|sample_type", rownames(clin_vars), perl = TRUE) , , drop = FALSE]
print(kable(clin_vars))
return(rownames(clin_vars))
}
```
```{r settings}
system("mkdir -p data")
system("mkdir -p results")
# Path where the downloaded data is stored
data_dir = "/Users/mdozmorov/Documents/Data/GenomeRunner/TCGAsurvival/data" # Mac
# data_dir = "F:/Data/GenomeRunner/TCGAsurvival/data" # Windows
# Selected genes
precalculated <- FALSE
selected_genes <- c("SPHK2") # If nothing precalculated - use one of the genes
method <- "" # If correlation with the selected_gene is measured, method is empty
# If precalculated, use precalculated values
# precalculated <- TRUE
# selected_genes <- "interferon_signature"
# method <- "NMF" # Which dimensionaliry reduction results to use, from NMF, PCA, FA
# Data type
data.type = "RNASeq2" ; type = ""
# data.type = "2018_pub"; type = "mrna" # Neuroblastoma
# Expression cutoff to select a particular range of expression of the selected gene.
# To use all expression, use "0" expression cutoff and "TRUE" top_expression (Default)
expression_cutoff <- 0 # From 0 to 1, percent cutoff of expression of the selected gene
top_expression <- TRUE # Whether to take top (TRUE) of bottom (FALSE) expression
# All cancers with RNASeq2 data
# cancer = c("ACC", "BLCA", "HNSC" , "CESC", "CHOL", "COAD", "COADREAD", "DLBC", "ESCA", "GBM", "GBMLGG", "HNSC", "KICH", "KIPAN", "KIRC", "KIRP", "LGG", "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS")
# fileNameIn <- (paste0("data/All_expression_", data.type, "_", type, ".Rda")) # Save expression data
# fileNameOut <- paste0("results/All_correlation_", selected_genes, "_", data.type, "_", type, ".Rda") # Save correlation data
# fileNameRes <- paste0("results/All_results_", selected_genes, "_", data.type, "_", type, ".xlsx") # Save results
# Or, several cancers
cancer = c("BRCA")
# cancer = "nbl_target" # Neuroblastoma
# Correlation type
corr_type <- "pearson"
# Correlation cutoffs
corr_cutoff <- 0.2
p_val_cutoff <- 0.05 # Regular p-value cutoff
p_adj_cutoff <- 0.3 # FDR cutoff
min_kegg_genes <- 20 # Minimum number of genes to run enrichment analysis on
max_kegg_genes <- 2000 # Maximum number of genes to run enrichment analysis on
up_dn_separate <- FALSE # Whether to run KEGG separately on up- and downregulated genes. FALSE - do not distinguish directionality
ntable <- 15 # Number of genes to output in a DEG table
# Save results
fileNameIn <- (paste0("data/Expression_", paste(cancer, collapse = "_"), ".Rda")) # Save expression data
fileNameOut <- paste0("data/Correlation_", selected_genes, "_", paste(cancer, collapse = "_"), ".Rda") # Save correlation data
fileNameRes <- paste0("results/Results_", selected_genes, "_", paste(cancer, collapse = "_"), "_PAM50.xlsx")
```
# Add PAM50 classification
```{r}
mtx <- load_data(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE)
# BRCA-specific - replace original annotations with XENA
mtx$clinical <- read.csv("data.TCGA/XENA_classification.csv", row.names = 1)
clinical_annotations <- summarize_data(mtx = mtx)
# Prepare expression data
expr <- mtx$merged.dat[ , 4:ncol(mtx$merged.dat)] %>% as.matrix
# Filter out low expressed genes
# Should be more than 90% of non-zero values
# ff <- genefilter::pOverA(p = 0.9, A = 0, na.rm = TRUE)
# expr <- expr[, apply(expr, 2, ff)]
expr <- data.frame(AffyID = mtx$merged.dat$bcr, expr, stringsAsFactors = FALSE)
# Prepare clinical data
clin <- mtx$merged.dat[, 1:3]
colnames(clin)[1] <- "AffyID"
# Full clinical information
clin_full <- mtx$clinical
# Match to the order of small clinical annitation
clin_full <- clin_full[rownames(clin_full) %in% clin$AffyID, ]
clin_full <- clin_full[match(clin$AffyID, rownames(clin_full)), ]
# Sanity check
all.equal(expr$AffyID, rownames(clin_full))
```
# Correlation in each PAM50 subgroup
```{r correlations}
sheet <- list()
# For each PAM50Call_RNAseq annotation
for (annotation in unique(clin_full$PAM50Call_RNAseq[!is.na(clin_full$PAM50Call_RNAseq)])) {
all_expression <- expr[ expr$AffyID %in% rownames(clin_full)[clin_full$PAM50Call_RNAseq == annotation], ] # Subset to the current PAM50 annotation
all_expression$AffyID <- NULL # Remove ID column
all_expression <- t(all_expression) # Transpose
all_corrs <- vector(mode = "numeric", length = nrow(all_expression))
all_pvals <- vector(mode = "numeric", length = nrow(all_expression))
for (i in 1:nrow(all_expression)) {
# Calculate the correlation
cors <- Hmisc::rcorr(all_expression[ rownames(all_expression) == selected_genes, ],
all_expression[ i, ], type = corr_type)
all_corrs[i] <- cors[[1]][1, 2]
all_pvals[i] <- cors[[3]][1, 2]
}
correlations <- data.frame(hgnc = rownames(all_expression), corr = all_corrs, pval = all_pvals)
# Remove genes for which correlation cannot be calculated
correlations <- correlations[complete.cases(correlations), ]
# Sort in decreasing order
correlations <- correlations[ order(correlations$corr, decreasing = TRUE), ]
# Save correlation results
sheet <- c(sheet, list(correlations))
names(sheet)[length(sheet)] <- annotation
}
write_xlsx(sheet, path = fileNameRes)
```