-
Notifications
You must be signed in to change notification settings - Fork 2
/
Preprocessing.Rmd
113 lines (88 loc) · 3.85 KB
/
Preprocessing.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
---
title: "Preprocessing"
author: Saw Simeon, Nuttapat Anuwongcharoen, Watshara Shoombuatong, Aijaz Ahmad Malik,
  Virapong Prachayasittikul, Jarl E. S. Wikberg and Chanin Nantasenamat
date: "June 7, 2016"
output: pdf_document
---
## Import data
Import the bioactivity data obtained from the ChEMBL database as a data frame. Keep only the IC50 measurements that are reported with an exact (`=`) relation and in nM units.
```{r, warning = FALSE, message = FALSE, error = FALSE, tidy = TRUE}
# Load the raw spreadsheet and narrow it down to the columns that the
# rest of the document works with.
library(readxl)
df_raw <- suppressWarnings(read_excel("Human_AChE.xlsx"))
selected_columns <- c(
  "CMPD_CHEMBLID", "CANONICAL_SMILES", "STANDARD_TYPE", "RELATION",
  "STANDARD_VALUE", "STANDARD_UNITS", "PROTEIN_ACCESSION", "PREF_NAME",
  "PUBMED_ID", "JOURNAL", "YEAR", "VOLUME", "ISSUE", "FIRST_PAGE",
  "MOLWEIGHT", "ALOGP", "PSA", "NUM_RO5_VIOLATIONS"
)
df <- df_raw[, selected_columns]

# Apply the three row filters one after another. Rows where the tested
# column is NA are dropped at each step, exactly as subset() would do.
is_ic50 <- df$STANDARD_TYPE == "IC50"
df_1 <- df[is_ic50 & !is.na(is_ic50), ]

is_exact <- df_1$RELATION == "="
df_2 <- df_1[is_exact & !is.na(is_exact), ]

is_nanomolar <- df_2$STANDARD_UNITS == "nM"
df_4 <- df_2[is_nanomolar & !is.na(is_nanomolar), ]

# Report how many rows survive each filtering stage.
row_counts <- c(
  paste0("Bioactivity data points: ", nrow(df)),
  paste0("IC50 Bioactivity data points: ", nrow(df_1)),
  paste0("Omit IC50 with lesser/greater than symbol: ", nrow(df_2)),
  paste0("Final number of data points: ", nrow(df_4))
)
print(row_counts)
```
## Remove redundant rows
```{r, tidy= TRUE, warning = FALSE, message = FALSE, error = FALSE}
# Collapse compounds that appear more than once in df_4 (same SMILES).
# Compounds measured once are kept as they are. Compounds measured several
# times are replaced by the mean of their values, but only when the
# standard deviation of those values is below 2; otherwise the compound is
# discarded.
smiles <- df_4$CANONICAL_SMILES
ic50_nm <- df_4$STANDARD_VALUE
# NOTE(review): mean()/sd() below presumably expect STANDARD_VALUE to have
# been read as numeric -- confirm against the spreadsheet.

# Every SMILES string that occurs in more than one row.
duplicated_smiles <- as.character(unique(smiles[duplicated(smiles)]))

# Per-compound mean and standard deviation, computed without growing a data
# frame inside a loop. The reversed order reproduces the row order that the
# previous prepend-in-a-loop implementation produced.
summary_order <- rev(duplicated_smiles)
replicates <- lapply(summary_order, function(s) ic50_nm[which(smiles == s)])
replicate_summary <- data.frame(
  CANONICAL_SMILES = summary_order,
  STANDARD_VALUE = vapply(replicates, mean, numeric(1)),
  SD = vapply(replicates, sd, numeric(1)),
  stringsAsFactors = FALSE
)

# Keep only compounds whose replicate measurements agree (sd < 2). Rows with
# an undefined sd are dropped as well. Explicit indexing also works when
# there are no duplicated compounds at all.
is_consistent <- !is.na(replicate_summary$SD) & replicate_summary$SD < 2
keep <- replicate_summary[is_consistent, ]

# Compounds that were measured exactly once.
non_redundant <- df_4[
  !(df_4$CANONICAL_SMILES %in% duplicated_smiles),
  c("CANONICAL_SMILES", "STANDARD_VALUE")
]

# Averaged values for the consistent replicated compounds.
cleaned_redundant <- data.frame(
  CANONICAL_SMILES = keep$CANONICAL_SMILES,
  STANDARD_VALUE = keep$STANDARD_VALUE,
  stringsAsFactors = FALSE
)

curated_data <- rbind(non_redundant, cleaned_redundant)
curated_data_na_removed <- na.omit(curated_data)

# Summary of the curation steps.
summary_lines <- c(
  paste0("number of non-redundant compounds: ", nrow(non_redundant)),
  paste0("number of unique redundant compounds: ", length(duplicated_smiles)),
  paste0("Compounds left that are kept", " which has a sd of lower than two: ",
         nrow(keep)),
  paste0("Combined data points: ", nrow(curated_data)),
  paste0("number of curated data points where missing smiles are removed: ",
         nrow(curated_data_na_removed))
)
print(summary_lines)
```
## Save the resulting data frame as an Excel file
```{r, tidy = TRUE, warning = FALSE, message = FALSE, error = FALSE}
# Attach the curated IC50 value to one metadata row per compound and write
# the result to an Excel file.
library(xlsx)

curated_smiles <- curated_data_na_removed$CANONICAL_SMILES

# One metadata row per curated compound: the first row of `df` carrying that
# SMILES, with the STANDARD_TYPE column removed.
# NOTE(review): `df` is the unfiltered table, so the first matching row may
# not be an IC50 / "=" / nM record -- confirm whether df_4 was intended.
new_df <- df[df$CANONICAL_SMILES %in% curated_smiles, ]
get_unique <- new_df[!duplicated(new_df$CANONICAL_SMILES), ]
new_df_2 <- get_unique[, !(names(get_unique) %in% "STANDARD_TYPE")]

# The metadata rows are in `df` order while the curated values are in
# `curated_data_na_removed` order, so look each value up by SMILES instead
# of relying on position. Binding the two by position would pair values
# with the wrong compounds.
value_index <- match(new_df_2$CANONICAL_SMILES, curated_smiles)
STANDARD_VALUE <- curated_data_na_removed$STANDARD_VALUE[value_index]

# The curated value goes in the first column. The original STANDARD_VALUE
# column of the metadata row is left in place, as before.
new_df_3 <- cbind(STANDARD_VALUE, new_df_2)
write.xlsx(new_df_3, file = "Preprocessed_Human_AChE.xlsx", sheetName = "Sheet1", row.names = FALSE)
```