-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3_NormalizationDiscretization.R
185 lines (155 loc) · 5.35 KB
/
3_NormalizationDiscretization.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# Title: Discretizing the gene expression values
# Author: Olivia Williamson, Natalie Goulett, Gabriel Odom
# Created: 2024-04-18
# Edited: 2024-04-28
# We do not normalize by treatment group. We want to treat all columns as the
# same group because this is a preprocessing step.
# We only use DGEList for raw count data.
# Explanation: https://support.bioconductor.org/p/p133964/#p134019
#install and load edgeR
BiocManager::edgeR
library(edgeR)
# Importing data set files (with gene symbols)
GSE154041 <- read.csv(file = "./Data_Clean/GSE154041_wsymbols_20240410.csv")
GSE186332 <- read.csv(file = "./Data_Clean/GSE186332_wsymbols_20240410.csv")
###### Data Set 1 ############################################################
# GSE186332 - raw counts, converting to normalized counts through TMM method
# Creating DGEList object
test <- DGEList(GSE186332[-2], annotation.columns = "Gene_name")
# **convert something to factors to get CPM to work?
test_TMM <- calcNormFactors(test)
# Convert reads to Counts Per Million (CPM)
test_cpm <- cpm(test_TMM, log = TRUE)
head(test_cpm)
# Check for a relatively normal distribution
hist(test_cpm)
# Transpose matrix to scale, transpose back to long format
test_z <- t(
scale(
t(
test_cpm
)
)
)
head(test_z)
hist(test_z)
# There are no hugo gene symbols for 596 of the 25040 genes
rownames(test_z) <- GSE186332$Gene_name
# Cut these to over-, non-, or under-expressed genes
###### Data Set 2 ############################################################
# GSE154041 - TMM Normalized expression values obtained from GEO
# Removing ensemble column
GSE154041_df <- GSE154041[-2]
# Converting to matrix
GSE154041_matrix <- as.matrix(GSE154041_df[-1])
# moving first column to row names - obtain just the values in columns
rownames(GSE154041_matrix) <- GSE154041_df$Gene_name
head(GSE154041_matrix)
# log base 2 (x+1) transformation
GSE154041_log <- log2(GSE154041_matrix + 1)
GSE154041Log2T_mat <- t(
scale(
t(
GSE154041_log
)
)
)
hist(GSE154041Log2T_mat)
hist(rnorm(n = 1113990))
# Compared the histogram of our log 2 transformed data set against rnorm
# add more comments/explanation here **lol
# test_z is the transformed version of GSE186332.
# we will read these into the RemoveBatchEffects script.
# Save transformed files
write.csv(
test_z,
file = "./Data_Clean/GSE186332_Transformed.csv",
row.names = TRUE
)
write.csv(
GSE154041Log2T_mat,
file = "./Data_Clean/GSE154041_Transformed.csv",
row.names = TRUE
)
GSE154041_Transformed <- read.csv(
file = "./Data_Clean/GSE154041_Transformed.csv"
)
GSE186332_Transformed <- read.csv(
file = "./Data_Clean/GSE186332_Transformed.csv"
)
intersection(GSE154041_Transformed, GSE186332_Transformed)
###### Decompose Normalized Data to remove batch effects ######################
# **are we doing anything with this section or should it be moved?
# do you want to keep these links?:
# https://academic.oup.com/bib/article/14/4/469/191565
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3046121/
# https://www.biostars.org/p/266507/
# removeBatchEffects() in Limma::
# because one data set has raw counts, the other TMM normalized (CPM), we can't
# not using ComBatSeq because one data set is not in raw counts
# after removing BE, data may not come from normal distribution (mean != 0, sd
# != 1)
library(limma)
# Merge by gene symbol
intersect(GSE154041Log2T_mat, test_TMM)
removeBatchEffect()
###### Discretizing Values ####################################################
# We need to be able to specify what threshold to use because genomic data is
# not normally distributed.
# calculate the normal probabilities assoc. with threshold z = -1 and z = 1
pnorm(-1)
# 0.1587
qnorm(0.1587)
# GSE154041
discretized_genes <- function(normalized_expr_mat,
thresh_low = -1,
thresh_high = 1){
# Turn normalized gene expr. values into 3 discrete categories
# browser()
# Check for valid threshold values
if (is.na(thresh_low) | is.na(thresh_high)) {
stop("NA values in thresholds are not allowed.")
}
# Apply cut-offs
is_low <- normalized_expr_mat < thresh_low
is_high <- normalized_expr_mat > thresh_high
is_none <- !(is_low | is_high)
# Replace z-scores w/ categories
normalized_expr_mat[is_none] <- 0
normalized_expr_mat[is_low] <- -1
normalized_expr_mat[is_high] <- 1
normalized_expr_mat
}
# Test
set.seed(1235)
test <- rnorm(15)
discretized_genes(test)
discretized_genes(
test, thresh_low = qnorm(0.1587), thresh_high = qnorm(1-0.1587)
)
# Test on non-normal data
test2 <- rt(30, df = 3)
# quantile function returns named vector
testThresh <- c(
low = quantile(test2, 0.1587, names = FALSE),
high = quantile(test2, 1 - 0.1587, names = FALSE)
)
discretized_genes(
test2, thresh_low = testThresh["low"], thresh_high = testThresh["high"]
)
# Test that our error function is working
discretized_genes(
test2, thresh_low = NA, thresh_high = testThresh["high"]
)
discretized_genes(
test2, thresh_low = testThresh["Low"], thresh_high = testThresh["high"]
)
discrete_counts <- apply(
standardized.counts,
MARGIN = 2,
discretize_genes
)
# Double Check work
discrete_counts[1:5,1:5]
#
standardized.counts[1:5,1:5]