-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_DownloadData.R
133 lines (117 loc) · 4.91 KB
/
1_DownloadData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Title: Download Data from GEO
# Author: Olivia Williamson & Natalie Goulett
# Created: 2024-04-04
# Edited: 2024-04-28
#We will acquire data from two NCBI datasets for our Bayesian network analysis -
# GSE186332 and GSE154041.
# GSE154041's genes are named using ensemble IDs whereas GSE186332 contains NCBI
# Gene names. We will convert them both to Hugo gene symbols in the following
# R script (step 2).
# GSE154041 read counts were normalized with the TMM normalization method and
# subsequently filtered(?)** for reads depth as count per million (CPM) using
# the cpm function implemented in edgeR. GSE186332 contains raw count data, so
# we will normalize it in the normalization R script (step 3).
# More info on the data are available at
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE154041 and
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi.
# Install and load the following packages
# install.packages("BiocManager")
# BiocManager::install("GEOQuery")
# BiocManager::install("Biobase")
# BiocManager::install("biomaRt")
# install.packages("tidyverse")
# install.packages("hgnc")
library(BiocManager)
library(GEOquery)
library(Biobase)
library(biomaRt)
library(tidyverse)
library("hgnc")
########################## READING IN RAW COUNTS ###############################
####### GSE186332
# Gene features (obs. count) - 25,132 obs
# Samples - 57
# treatment column - "GSE186332_pheno_names_to_keep$treatment_protocol_ch1.1"
# Treatment - Selinexor doses (twice weekly) preoperatively (Arm A; n = 8
# patients). Patients not undergoing surgery received 50 mg/m2 (Arm B, n = 24),
# or 60 mg (Arm C, n = 14) twice weekly, or 80 mg once weekly (Arm D; n = 30).
# Phenotype info available - tissue type; trial arm; experimental set
# (Discovery, validation); patient response (responder, non-responder);
# treatment protocol (keep 1.1-1.6) including drug type, dose, frequency;
# Primary endpoint was 6-month progression-free survival rate (PFS6).
# Gene name format: NCBI ID, which is Entrez ID. Convert to Symbol format using:
# https://support.bioconductor.org/p/125380/
# Reference Genome: hg38
GSE186332 <- getGEO("GSE186332")
GSE186332_data <- GSE186332[[1]]
# gets formal class ExpressionSet`
GSE186332_data
names(pData(GSE186332_data))
head(pData(GSE186332_data))
GSE186332_pheno_names_to_keep <- c(
"title", "characteristics_ch1.1", "characteristics_ch1.2",
"characteristics_ch1.3", "treatment_protocol_ch1.1",
"treatment_protocol_ch1.3", "treatment_protocol_ch1.4",
"treatment_protocol_ch1.5", "treatment_protocl_ch1.6",
"experimental set:ch1", "patient response:ch1", "trial arm:ch1"
)
GSE186332_raw <- getGEOSuppFiles("GSE186332")
# data was downloaded to top level directory. It was manually moved to Data/.
GSE186332_raw
GSE186332_long_df <- read.delim(
"GSE186332/GSE186332_gbm_rawcounts.tsv.gz"
)
# use version GPL24676, which is the datafile for GSE186332.
head(GSE186332_long_df)
# **create a vector of ensemble IDs from homo sapiens set and then ___?
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mapping <- getBM(attributes = c())
# Genes from the GSE186332 dataset are named using NCBI gene IDs, which are
# identical to Entrez IDs. Here we use the `AnnotationHub` and `HGNC` packages
# to add Hugo gene symbols to the dataset.
import_hgnc_dataset()
head(tbl)
GSE186332Final <- GSE186332_long_df %>%
rename("entrez_id" = "NCBIGeneID")
hgnc_tbl <- import_hgnc_dataset() %>%
select("entrez_id", "ensembl_gene_id", "hgnc_id", "symbol")
hgnc_tbl %>% as.data.frame()
didthiswork <- merge(hgnc_tbl, GSE186332Final, by = "entrez_id")
###### GSE154041
# 74 Samples - 10902 observations (filtered)
# treatment column - "GSE154041_pheno_names_to_keep$treatment:ch1"
# Treatments - Regorafenib or Lomustine
# Phenotype info available - Patient ID, treatment type, and survival (in
# months).
# Reference Genome - HG 38
# Data are have been normalized using TMM method.
GSE154041 <- getGEO("GSE154041")
class(GSE154041)
length(GSE154041)
names(GSE154041)
GSE154041_data <- GSE154041[[1]]
names(pData(GSE154041_data))
head(pData(GSE154041_data))
names(pData(GSE154041))
GSE154041_pheno_names_to_keep <- c(
"title", "characteristics_ch1", "characteristics_ch1.1", "description",
"overall survival (months):ch1"
)
GSE154041_raw <- getGEOSuppFiles("GSE154041")
GSE154041_raw
GSE154041_long_df <- read.delim(
"GSE154041/GSE154041_samples_expression_table_filtered.txt.gz"
)
head(GSE154041_long_df)
# GSE154041 genes are named using ensemble IDs whereas GSE186332 contains NCBI
# Gene names.
# rename the ensemble ID column to match format of hgnc table(?)**
GSE154041Final <- GSE154041_long_df %>%
rename("ensembl_gene_id" = "Ensembl_geneid")
# Add hgnc table names to GSE154041
didthiswork2 <- merge(
hgnc_tbl,
GSE154041Final,
by = "ensembl_gene_id",
.before = "TRUE"
)