#######################################################################################
### INITIAL SET-UP
#######################################################################################
# Open required package libraries
if(!require(tidyverse)) install.packages("tidyverse", repos = "http://cran.us.r-project.org")
if(!require(dplyr)) install.packages("dplyr", repos = "http://cran.us.r-project.org")
if(!require(data.table)) install.packages("data.table", repos = "http://cran.us.r-project.org")
if(!require(lubridate)) install.packages("lubridate", repos = "http://cran.us.r-project.org")
if(!require(stringr)) install.packages("stringr", repos = "http://cran.us.r-project.org")
if(!require(ggplot2)) install.packages("ggplot2", repos = "http://cran.us.r-project.org")
if(!require(ggdendro)) install.packages("ggdendro", repos = "http://cran.us.r-project.org")
if(!require(dendextend)) install.packages("dendextend", repos = "http://cran.us.r-project.org")
if(!require(knitr)) install.packages("knitr", repos = "http://cran.us.r-project.org")
if(!require(kableExtra)) install.packages("kableExtra", repos = "http://cran.us.r-project.org")
if(!require(scales)) install.packages("scales", repos = "http://cran.us.r-project.org")
if(!require(matrixStats)) install.packages("matrixStats", repos = "http://cran.us.r-project.org")
if(!require(RColorBrewer)) install.packages("RColorBrewer", repos = "http://cran.us.r-project.org")
if(!require(xfun)) install.packages("xfun", repos = "http://cran.us.r-project.org")
if(!require(caret)) install.packages("caret", dependencies = c("Depends", "Suggests"), repos = "http://cran.us.r-project.org")
# Set all numeric outputs to 3 digits (unless otherwise specified in code chunks)
options(digits = 3)
# Download data-set containing both features and outcome data
data_url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
# Download data-set description as a text file (wdbc.names)
names_url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names"
# download data and save to data.frame
wdbc_data <- fread(data_url)
# add column names based on information provided in WDBC.names text file
column_names <- c("id", "diagnosis", "radius_m", "texture_m", "perimeter_m", "area_m", "smoothness_m", "compactness_m",
"concavity_m", "concave_points_m", "symmetry_m", "fractal_dim_m", "radius_se", "texture_se", "perimeter_se",
"area_se", "smoothness_se", "compactness_se", "concavity_se", "concave_points_se", "symmetry_se",
"fractal_dim_se", "radius_w", "texture_w", "perimeter_w", "area_w", "smoothness_w", "compactness_w",
"concavity_w", "concave_points_w", "symmetry_w", "fractal_dim_w")
colnames(wdbc_data) <- column_names
# Create plot theme to apply to ggplot2 element text throughout report
plot_theme <- theme(plot.caption = element_text(size = 12, face = "italic"), axis.title = element_text(size = 12))
#######################################################################################
### EXPLORATORY ANALYSIS
#######################################################################################
### EXPLORE FEATURES
# Extract unique features from column names object using stringr functions
features <- column_names[-c(1,2)] %>% str_replace("[^_]+$", "") %>% unique() %>% str_replace_all("_", " ") %>% str_to_title()
# Create data.frame with descriptions for each of the unique features
features_list <- data.frame(Feature = features, Description =
c("Mean of distances from center to points on the perimeter of individual nuclei",
"Variance (standard deviation) of grey-scale intensitities in the component pixels",
"Perimeter length of each nucleus",
"Area as measured by counting pixels within each nucleus",
"Local variation in radius lengths",
"Combination of perimeter and area using the formula: (perimeter^2 / area - 1.0)",
"Number and severity of concavities (indentations) in the nuclear contour",
"Number of concavities in the nuclear contour",
"Symmetry of the nuclei as measured by length differences between lines perpendicular to the major axis and the cell boundary",
"Fractal dimension based on the 'coastline approximation' - 1.0"))
# Create table showing summary data for the mean scores for each of the ten features
summary(wdbc_data[,3:12])
# Create table showing summary data for the worst scores for each of the ten features
summary(wdbc_data[,23:32])
# Create table showing summary data for the standard error scores for each of the ten features
summary(wdbc_data[,13:22])
# Plot and facet wrap density plots for each feature by diagnosis
wdbc_data %>% select(-id) %>%
gather("feature", "value", -diagnosis) %>%
ggplot(aes(value, fill = diagnosis)) +
geom_density(alpha = 0.5) +
xlab("Feature values") +
ylab("Density") +
theme(legend.position = "top",
axis.text.x = element_blank(), axis.text.y = element_blank(),
legend.title=element_blank()) +
scale_fill_discrete(labels = c("Benign", "Malignant")) +
facet_wrap(~ feature, scales = "free", ncol = 3)
### INITIAL DATA TIDY
# Remove id column
wdbc_data <- wdbc_data[,-1]
# Convert diagnosis to a factor
wdbc_data$diagnosis <- as.factor(wdbc_data$diagnosis)
# Generate features matrix and outcome (diagnosis) list
wdbc <- list(x = as.matrix(wdbc_data[,-1]), y = wdbc_data$diagnosis)
### CREATE TRAIN/TEST DATA-SETS
# Split into train and test sets (80:20)
set.seed(200, sample.kind = "Rounding")
test_index <- createDataPartition(wdbc$y, times = 1, p = 0.2, list = FALSE)
train <- list(x = wdbc$x[-test_index,], y = wdbc$y[-test_index])
test <- list(x = wdbc$x[test_index,], y = wdbc$y[test_index])
### NORMALISE TRAIN DATA
# Centre the train$x data around zero by subtracting the column mean
train_c <- sweep(train$x, 2, colMeans(train$x))
# Scale the train$x data by dividing by the column standard deviation
train_s <- sweep(train_c, 2, colSds(train$x), FUN = "/")
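# Sanity check (illustrative addition, not part of the original analysis):
# after centring and scaling, every column should have mean ~0 and sd ~1
range(colMeans(train_s))  # both values should be close to 0
range(colSds(train_s))    # both values should be close to 1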
### DISTANCE BETWEEN SAMPLES
# Use dist function to calculate distance between each sample
d_samples <- dist(train_s)
# Average distance between all samples
ave_dist_samples <- round(mean(as.matrix(d_samples)),2)
# Average distance between benign samples
distb_b_samples <- round(mean(as.matrix(d_samples)[train$y=="B", train$y=="B"]),2)
# Average distance between malignant samples
distm_m_samples <- round(mean(as.matrix(d_samples)[train$y=="M", train$y=="M"]),2)
# Distance between benign and malignant samples
distb_m_samples <- round(mean(as.matrix(d_samples)[train$y=="M", train$y=="B"]),2)
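# Collect the distance summaries into a single table for easier comparison
# (illustrative addition)
data.frame(Comparison = c("All samples", "Benign vs benign",
                          "Malignant vs malignant", "Benign vs malignant"),
           `Mean distance` = c(ave_dist_samples, distb_b_samples,
                               distm_m_samples, distb_m_samples),
           check.names = FALSE)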
# Create heatmap of distance between samples
heatmap(as.matrix(d_samples), symm = T, revC = T,
col = brewer.pal(4, "Blues"),
ColSideColors = ifelse(train$y=="B", "green", "red"),
RowSideColors = ifelse(train$y=="B", "green", "red"),
labRow = NA, labCol = NA)
### VARIANCE BETWEEN FEATURES
# Identifying zero variance predictors
nzv <- nearZeroVar(train_s, saveMetrics= TRUE)
# Hierarchical clustering of features
h <- hclust(dist(t(train_s)))
# Create dendrogram structure from h object
dend <- as.dendrogram(h) %>% set("branches_k_color", k = 7) %>%
set("labels_cex", 0.5) %>%
set("labels_colors", k = 7)
# Create ggdend object using dendextend package to visualise with the ggplot2 package
ggd <- as.ggdend(dend)
# Plot dendrogram from ggd object using ggplot2 package
ggplot(ggd, horiz = TRUE, theme = plot_theme) + labs(x = "Features", y = "Distance")
### CORRELATION BETWEEN FEATURES
# Identifying correlated predictors
trainCor <- abs(cor(train_s))
cutoff <- 0.9
corr_index <- findCorrelation(trainCor, cutoff = cutoff, names = FALSE)
corr <- findCorrelation(trainCor, cutoff = cutoff, names = TRUE)
# Create heatmap of correlation between features
heatmap(as.matrix(trainCor), col = brewer.pal(9, "RdPu"), labCol = NA)
### PRINCIPAL COMPONENT ANALYSIS
# Save principal component analysis in pca object
pca <- prcomp(train_s)
# Calculate variance scores per principal component
pca.var <- pca$sdev^2
pca.var.per <- pca.var/sum(pca.var)
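# Optional scree plot (illustrative addition): proportion of total variance
# explained by each of the first ten principal components
barplot(pca.var.per[1:10], names.arg = paste0("PC", 1:10),
        xlab = "Principal component", ylab = "Proportion of variance")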
# Create table showing the first 10 Principal Components
summary(pca)$importance[,1:10]
# Create box plot to show top 10 PCs by diagnosis
data.frame(pca$x[,1:10], Diagnosis = train$y) %>%
gather(key = "PC", value = "value", -Diagnosis) %>%
ggplot(aes(PC, value, fill = Diagnosis)) +
geom_boxplot() +
scale_fill_discrete(name="Diagnosis",
breaks=c("B", "M"),
labels=c("Benign", "Malignant"))
# Create scatter plot of PC1 and PC2 by diagnosis
data.frame(pca$x[,1:2], Diagnosis = train$y) %>%
ggplot(aes(PC1, PC2, color = Diagnosis)) +
geom_point() +
stat_ellipse() +
xlab(paste("PC1: ", percent(pca.var.per[1],0.1))) +
ylab(paste("PC2: ", percent(pca.var.per[2],0.1))) +
scale_color_discrete(name="Diagnosis",
breaks=c("B", "M"),
labels=c("Benign", "Malignant"))
#######################################################################################
### METHODS
#######################################################################################
### NORMALISE TEST DATA
# Centre the test$x data around zero by subtracting the train$x column mean
test_c <- sweep(test$x, 2, colMeans(train$x))
# Scale the test$x data by dividing by the train$x column standard deviation
test_s <- sweep(test_c, 2, colSds(train$x), FUN = "/")
# Create empty data frame to capture key performance measures in results table
model_results <- data.frame(Method = character(),
Accuracy = double(),
Sensitivity = double(),
Specificity = double(),
F1 = double(),
FNR = double(),
FPR = double())
# Define train control parameters for appropriate models
fitControl <- trainControl(method = "repeatedcv",
number = 10, # 10-fold cross-validation
repeats = 10, # repeat each cross-validation 10 times
classProbs = TRUE, # class probabilities computed
returnResamp = "final", # only save the final resampled summary metrics
savePredictions = "final") # only save the final predictions for each resample
### RANDOM SAMPLING MODEL
# Probability for random sampling set to 0.5, i.e. no weighting between different outcomes
p <- 0.5
# Set seed to enable result to be reproduced
set.seed(3, sample.kind = "Rounding")
# Sample outcomes matching length and factor levels from the test set
random_pred <- sample(c("B", "M"), length(test_index), prob = c(1-p, p), replace = TRUE) %>%
factor(levels = levels(test$y))
# Store confusion matrix in 'random' object
random <- confusionMatrix(random_pred, test$y, positive = "M")
# Add model results to model_results data frame
model_results[1, ] <- c("Random sample",
format(round(random$overall["Accuracy"],2), nsmall = 2),
format(round(random$byClass["Sensitivity"],2), nsmall = 2),
format(round(random$byClass["Specificity"],2), nsmall = 2),
format(round(random$byClass["F1"],2), nsmall = 2),
percent(1-(random$byClass["Sensitivity"])),
percent(1-(random$byClass["Specificity"])))
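# The seven-line pattern above is repeated for every model below. A small
# helper (illustrative sketch, not used by the original script) could
# condense each block to a single call:
summarise_cm <- function(method, cm) {
  c(method,
    format(round(cm$overall["Accuracy"], 2), nsmall = 2),
    format(round(cm$byClass["Sensitivity"], 2), nsmall = 2),
    format(round(cm$byClass["Specificity"], 2), nsmall = 2),
    format(round(cm$byClass["F1"], 2), nsmall = 2),
    percent(1 - cm$byClass["Sensitivity"]),
    percent(1 - cm$byClass["Specificity"]))
}
# e.g. model_results[1, ] <- summarise_cm("Random sample", random)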
### WEIGHTED RANDOM SAMPLING MODEL
# Probability for random sampling set to match the prevalence of benign and malignant samples in the train set
p <- mean(train$y=="M")
# Set seed to enable result to be reproduced
set.seed(3, sample.kind = "Rounding")
# Sample outcomes matching length and factor levels from the test set
weighted_random_pred <- sample(c("B", "M"), length(test_index), prob = c(1-p, p), replace = TRUE) %>%
factor(levels = levels(test$y))
# Store confusion matrix in 'weighted_random' object
weighted_random <- confusionMatrix(weighted_random_pred, test$y, positive = "M")
# Add model results to model_results data frame
model_results[2, ] <- c("Weighted random sample",
format(round(weighted_random$overall["Accuracy"],2), nsmall = 2),
format(round(weighted_random$byClass["Sensitivity"],2), nsmall = 2),
format(round(weighted_random$byClass["Specificity"],2), nsmall = 2),
format(round(weighted_random$byClass["F1"],2), nsmall = 2),
percent(1-(weighted_random$byClass["Sensitivity"])),
percent(1-(weighted_random$byClass["Specificity"])))
#######################################################################################
### UNSUPERVISED LEARNING (K-MEANS CLUSTERING)
#######################################################################################
# Build k-means prediction function
predict_kmeans <- function(x, k) {
# extract cluster centres
centres <- k$centers
# calculate distance from data-points to the cluster centres
distances <- sapply(1:nrow(x), function(i){
apply(centres, 1, function(y) dist(rbind(x[i,], y)))
})
# select cluster with min distance to centre
max.col(-t(distances))
}
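# Quick check of predict_kmeans on toy data (hypothetical example, added for
# illustration): for well-separated clusters, points assigned by kmeans itself
# should map back to the same clusters
set.seed(1)
toy <- rbind(matrix(rnorm(20, mean = 0), ncol = 2),
             matrix(rnorm(20, mean = 5), ncol = 2))
k_toy <- kmeans(toy, centers = 2)
all(predict_kmeans(toy, k_toy) == k_toy$cluster)  # expect TRUE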
### K-MEANS MODEL USING FULL DATA
# Define x as normalised train data-set, train_s
x <- train_s
# Set seed to enable result to be reproduced
set.seed(25, sample.kind = "Rounding")
# Predict outcome using kmeans model with k=2 and 25 random sets
k <- kmeans(x, centers = 2, nstart = 25)
kmeans_pred <- factor(ifelse(predict_kmeans(test_s, k) == 1, "M", "B"))
# Store confusion matrix in 'kmeans_results_1' object
kmeans_results_1 <- confusionMatrix(kmeans_pred, test$y, positive = "M")
# Add model results to model_results data frame
model_results[3, ] <- c("K-means clustering",
format(round(kmeans_results_1$overall["Accuracy"],2), nsmall = 2),
format(round(kmeans_results_1$byClass["Sensitivity"],2), nsmall = 2),
format(round(kmeans_results_1$byClass["Specificity"],2), nsmall = 2),
format(round(kmeans_results_1$byClass["F1"],2), nsmall = 2),
percent(1-(kmeans_results_1$byClass["Sensitivity"])),
percent(1-(kmeans_results_1$byClass["Specificity"])))
### K-MEANS MODEL USING SELECTED DATA
# K-means model with feature selection based on correlation coefficient score
# Define x_select as the normalised train data-set, train_s, with the features
# flagged in corr_index (correlation coefficient above the 0.9 cutoff) removed
x_select <- train_s[,-c(corr_index)]
# Apply the same feature selection to the test data-set
test_s_select <- test_s[,-c(corr_index)]
# Set seed to enable result to be reproduced
set.seed(25, sample.kind = "Rounding")
# Predict outcome using kmeans model with k=2 and 25 random sets
k <- kmeans(x_select, centers = 2, nstart = 25)
kmeans_pred_2 <- factor(ifelse(predict_kmeans(test_s_select, k) == 1, "M", "B"))
# Store confusion matrix in 'kmeans_results_2' object
kmeans_results_2 <- confusionMatrix(kmeans_pred_2, test$y, positive = "M")
# Add model results to model_results data frame
model_results[4, ] <- c("K-means (without highly correlated features)",
format(round(kmeans_results_2$overall["Accuracy"],2), nsmall = 2),
format(round(kmeans_results_2$byClass["Sensitivity"],2), nsmall = 2),
format(round(kmeans_results_2$byClass["Specificity"],2), nsmall = 2),
format(round(kmeans_results_2$byClass["F1"],2), nsmall = 2),
percent(1-(kmeans_results_2$byClass["Sensitivity"])),
percent(1-(kmeans_results_2$byClass["Specificity"])))
#######################################################################################
### SUPERVISED LEARNING - GENERATIVE MODELS
#######################################################################################
### NAIVE BAYES MODEL
# Set seed to enable result to be reproduced
set.seed(28, sample.kind = "Rounding")
# Use caret package to train and then predict outcomes using normalised train/test data-sets
train_nb <- train(train_s, train$y,
method = "nb",
tuneGrid = expand.grid(usekernel = c(FALSE, TRUE), fL = c(0, 1), adjust = c(0, 1)),
trControl = fitControl)
nb_pred <- predict(train_nb, test_s)
# Store confusion matrix in 'nb_results' object
nb_results <- confusionMatrix(nb_pred, test$y, positive = "M")
# Add model results to model_results data frame
model_results[5, ] <- c("Naive Bayes",
format(round(nb_results$overall["Accuracy"],2), nsmall = 2),
format(round(nb_results$byClass["Sensitivity"],2), nsmall = 2),
format(round(nb_results$byClass["Specificity"],2), nsmall = 2),
format(round(nb_results$byClass["F1"],2), nsmall = 2),
percent(1-(nb_results$byClass["Sensitivity"])),
percent(1-(nb_results$byClass["Specificity"])))
### LINEAR DISCRIMINANT ANALYSIS (LDA) MODEL
# Set seed to enable result to be reproduced
set.seed(30, sample.kind = "Rounding")
# Use caret package to train and then predict outcomes using normalised train/test data-sets
train_lda <- train(train_s, train$y,
method = "lda",
trControl = fitControl)
lda_pred <- predict(train_lda, test_s)
# Store confusion matrix in 'lda_results' object
lda_results <- confusionMatrix(lda_pred, test$y, positive = "M")
# Add results to model_results data frame
model_results[6, ] <- c("Linear Discriminant Analysis",
format(round(lda_results$overall["Accuracy"],2), nsmall = 2),
format(round(lda_results$byClass["Sensitivity"],2), nsmall = 2),
format(round(lda_results$byClass["Specificity"],2), nsmall = 2),
format(round(lda_results$byClass["F1"],2), nsmall = 2),
percent(1-(lda_results$byClass["Sensitivity"])),
percent(1-(lda_results$byClass["Specificity"])))
### QUADRATIC DISCRIMINANT ANALYSIS (QDA) MODEL
# Set seed to enable result to be reproduced
set.seed(32, sample.kind = "Rounding")
# Use caret package to train and then predict outcomes using normalised train/test data-sets
train_qda <- train(train_s, train$y,
method = "qda",
trControl = fitControl)
qda_pred <- predict(train_qda, test_s)
# Store confusion matrix in 'qda_results' object
qda_results <- confusionMatrix(qda_pred, test$y, positive = "M")
# Add results to model_results data frame
model_results[7, ] <- c("Quadratic Discriminant Analysis",
format(round(qda_results$overall["Accuracy"],2), nsmall = 2),
format(round(qda_results$byClass["Sensitivity"],2), nsmall = 2),
format(round(qda_results$byClass["Specificity"],2), nsmall = 2),
format(round(qda_results$byClass["F1"],2), nsmall = 2),
percent(1-(qda_results$byClass["Sensitivity"])),
percent(1-(qda_results$byClass["Specificity"])))
### QUADRATIC DISCRIMINANT ANALYSIS (QDA) WITH PCA MODEL
# Set seed to enable result to be reproduced
set.seed(32, sample.kind = "Rounding")
# Use caret package to train and then predict outcomes using non-normalised train/test data-sets
train_qda_pca <- train(train$x, train$y,
method = "qda",
trControl = fitControl,
# Pre-processing function to centre, scale and apply pca
preProcess = c("center", "scale", "pca"))
qda_pca_pred <- predict(train_qda_pca, test$x)
# Store confusion matrix in 'qda_pca_results' object
qda_pca_results <- confusionMatrix(qda_pca_pred, test$y, positive = "M")
# Add results to model_results data frame
model_results[8, ] <- c("Quadratic Discriminant Analysis (with PCA)",
format(round(qda_pca_results$overall["Accuracy"],2), nsmall = 2),
format(round(qda_pca_results$byClass["Sensitivity"],2), nsmall = 2),
format(round(qda_pca_results$byClass["Specificity"],2), nsmall = 2),
format(round(qda_pca_results$byClass["F1"],2), nsmall = 2),
percent(1-(qda_pca_results$byClass["Sensitivity"])),
percent(1-(qda_pca_results$byClass["Specificity"])))
#######################################################################################
### SUPERVISED LEARNING - DISCRIMINATIVE MODELS
#######################################################################################
### LOGISTIC REGRESSION MODEL
# Set seed to enable result to be reproduced
set.seed(36, sample.kind = "Rounding")
# Use caret package to train and then predict outcomes using normalised train/test data-sets
train_glm <- train(train_s, train$y,
method = "glm",
trControl = fitControl)
glm_pred <- predict(train_glm, test_s)
# Store confusion matrix in 'glm_results' object
glm_results <- confusionMatrix(glm_pred, test$y, positive = "M")
# Add results to model_results data frame
model_results[9, ] <- c("Logistic regression",
format(round(glm_results$overall["Accuracy"],2), nsmall = 2),
format(round(glm_results$byClass["Sensitivity"],2), nsmall = 2),
format(round(glm_results$byClass["Specificity"],2), nsmall = 2),
format(round(glm_results$byClass["F1"],2), nsmall = 2),
percent(1-(glm_results$byClass["Sensitivity"])),
percent(1-(glm_results$byClass["Specificity"])))
### LOGISTIC REGRESSION WITH PCA MODEL
# Set seed to enable result to be reproduced
set.seed(36, sample.kind = "Rounding")
# Use caret package to train and then predict outcomes using non-normalised train/test data-sets
train_glm_pca <- train(train$x, train$y,
method = "glm",
trControl = fitControl,
# Pre-processing function to centre, scale and apply pca
preProcess = c("center", "scale", "pca"))
glm_pca_pred <- predict(train_glm_pca, test$x)
# Store confusion matrix in 'glm_pca_results' object
glm_pca_results <- confusionMatrix(glm_pca_pred, test$y, positive = "M")
# Add results to model_results data frame
model_results[10, ] <- c("Logistic regression (with PCA)",
format(round(glm_pca_results$overall["Accuracy"],2), nsmall = 2),
format(round(glm_pca_results$byClass["Sensitivity"],2), nsmall = 2),
format(round(glm_pca_results$byClass["Specificity"],2), nsmall = 2),
format(round(glm_pca_results$byClass["F1"],2), nsmall = 2),
percent(1-(glm_pca_results$byClass["Sensitivity"])),
percent(1-(glm_pca_results$byClass["Specificity"])))
### K-NEAREST NEIGHBOUR (KNN) MODEL
# Set seed to enable result to be reproduced
set.seed(5, sample.kind = "Rounding")
# Use caret package to train and then predict outcomes using normalised train/test data-sets
train_knn <- train(train_s, train$y,
method = "knn",
tuneGrid = data.frame(k = seq(1, 30, 2)),
trControl = fitControl)
knn_pred <- predict(train_knn, test_s)
# Store confusion matrix in 'knn_results' object
knn_results <- confusionMatrix(knn_pred, test$y, positive = "M")
# Add results to model_results data frame
model_results[11, ] <- c("K Nearest Neighbour",
format(round(knn_results$overall["Accuracy"],2), nsmall = 2),
format(round(knn_results$byClass["Sensitivity"],2), nsmall = 2),
format(round(knn_results$byClass["Specificity"],2), nsmall = 2),
format(round(knn_results$byClass["F1"],2), nsmall = 2),
percent(1-(knn_results$byClass["Sensitivity"])),
percent(1-(knn_results$byClass["Specificity"])))
### RANDOM FOREST (RF) MODEL
# Set seed to enable result to be reproduced
set.seed(7, sample.kind = "Rounding")
# Use caret package to train and then predict outcomes using normalised train/test data-sets
train_rf <- train(train_s, train$y,
method = "rf",
tuneGrid = data.frame(mtry = seq(3, 15, 2)),
importance = TRUE,
trControl = fitControl)
rf_pred <- predict(train_rf, test_s)
# Store confusion matrix in 'rf_results' object
rf_results <- confusionMatrix(rf_pred, test$y, positive = "M")
# Add results to model_results data frame
model_results[12, ] <- c("Random Forest",
format(round(rf_results$overall["Accuracy"],2), nsmall = 2),
format(round(rf_results$byClass["Sensitivity"],2), nsmall = 2),
format(round(rf_results$byClass["Specificity"],2), nsmall = 2),
format(round(rf_results$byClass["F1"],2), nsmall = 2),
percent(1-(rf_results$byClass["Sensitivity"])),
percent(1-(rf_results$byClass["Specificity"])))
### NEURAL NETWORK MODEL
# Set seed to enable result to be reproduced
set.seed(9, sample.kind = "Rounding")
# Use caret package to train and then predict outcomes using normalised train/test data-sets
train_nn <- train(train_s, train$y,
method = "nnet",
trace = FALSE,
trControl = fitControl)
nn_pred <- predict(train_nn, test_s)
# Store confusion matrix in 'nn_results' object
nn_results <- confusionMatrix(nn_pred, test$y, positive = "M")
# Add results to model_results data frame
model_results[13, ] <- c("Neural Network",
format(round(nn_results$overall["Accuracy"],2), nsmall = 2),
format(round(nn_results$byClass["Sensitivity"],2), nsmall = 2),
format(round(nn_results$byClass["Specificity"],2), nsmall = 2),
format(round(nn_results$byClass["F1"],2), nsmall = 2),
percent(1-(nn_results$byClass["Sensitivity"])),
percent(1-(nn_results$byClass["Specificity"])))
### ENSEMBLE MODEL
# Build ensemble model with all supervised models tested
ensemble <- cbind(glm = ifelse(glm_pred == "B", 0, 1),
                  glm_pca = ifelse(glm_pca_pred == "B", 0, 1),
                  nb = ifelse(nb_pred == "B", 0, 1),
                  lda = ifelse(lda_pred == "B", 0, 1),
                  qda = ifelse(qda_pred == "B", 0, 1),
                  qda_pca = ifelse(qda_pca_pred == "B", 0, 1),
                  rf = ifelse(rf_pred == "B", 0, 1),
                  knn = ifelse(knn_pred == "B", 0, 1),
                  nn = ifelse(nn_pred == "B", 0, 1))
# Predict final classification based on majority vote across all models, using the rowMeans function
ensemble_preds <- as.factor(ifelse(rowMeans(ensemble) < 0.5, "B", "M"))
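# Vote tally (illustrative addition): rowSums counts, for each test sample,
# how many of the nine models predicted malignant; five or more votes
# pushes rowMeans above 0.5 and yields an "M" classification
table(malignant_votes = rowSums(ensemble))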
# Store confusion matrix in 'ensemble_results' object
ensemble_results <- confusionMatrix(ensemble_preds, test$y, positive = "M")
# Add results to model_results data frame
model_results[14, ] <- c("Ensemble",
format(round(ensemble_results$overall["Accuracy"],2), nsmall = 2),
format(round(ensemble_results$byClass["Sensitivity"],2), nsmall = 2),
format(round(ensemble_results$byClass["Specificity"],2), nsmall = 2),
format(round(ensemble_results$byClass["F1"],2), nsmall = 2),
percent(1-(ensemble_results$byClass["Sensitivity"])),
percent(1-(ensemble_results$byClass["Specificity"])))
#######################################################################################
### RESULTS
#######################################################################################
### OVERALL PERFORMANCE MEASURES
# Print table of results from each model
model_results
### TUNING NAIVE BAYES MODEL
# Plot accuracy during cross-validation for tuning laplace smoothing (0/1), usekernel (true/false) and adjust (0/1)
plot(train_nb,
ylab = "Accuracy (repeated cross-validation)",
xlab = "Laplace correction")
### TUNING NEAREST NEIGHBOURS MODEL
# Plot accuracy during cross-validation for each value of k (number of neighbours) tuned
plot(train_knn,
ylab = "Accuracy (repeated cross-validation)",
xlab = "# Neighbours (k)")
### TUNING RANDOM FOREST MODEL
# Change number of digits to 4 to aid reading the y-axis of the plot
options(digits = 4)
# Plot accuracy during cross-validation for each value of mtry (number of randomly selected predictors) tuned
plot(train_rf,
ylab = "Accuracy (repeated cross-validation)",
xlab = "# Randomly selected predictors (mtry)")
### COMPARE MODEL PERFORMANCE DURING CROSS-VALIDATION
# Generate character string of models to be included in resampling analysis
modelNames <- c("Naive Bayes", "Linear Discriminant Analysis", "Quadratic Discriminant Analysis", "Quadratic Discriminant Analysis with PCA", "Logistic Regression", "Logistic Regression with PCA", "K Nearest Neighbours", "Random Forest", "Neural Network")
# Generate list of results of cross-validation of training sets with each of these models
modelTrains <- list(train_nb, train_lda, train_qda, train_qda_pca, train_glm, train_glm_pca, train_knn, train_rf, train_nn)
# Use the resamples function to collect and compare each model's performance during cross-validation
model_compare <- resamples(modelTrains, modelNames = modelNames)
# Plot dot plots to show accuracy and kappa scores for each model
dotplot(model_compare)
### COMPARE VARIABLE IMPORTANCE FOR NEAREST NEIGHBOUR, RANDOM FOREST, NEURAL NETWORK AND LOGISTIC REGRESSION MODELS
# Create varImp objects for each of the models
varimp_knn <- (varImp(train_knn))
varimp_rf <- (varImp(train_rf))
varimp_nn <- (varImp(train_nn))
varimp_glm <- (varImp(train_glm))
# Define the number of variables to include in plots ranked by importance
top <- 10
# Generate variable importance plots for each of the varImp objects
plot(varimp_knn, top = top)
plot(varimp_rf, top = top)
plot(varimp_nn, top = top)
plot(varimp_glm, top = top)