-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproject2.Rmd
266 lines (218 loc) · 9.14 KB
/
project2.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
---
title: "560 Data Mining Course Project 2"
author: "Jing Zhang"
date: "2/23/2021"
output:
word_document: default
html_document:
df_print: paged
pdf_document: default
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```
```{r}
require(mlbench)
library(tidyverse)
```
BreastCancer has 699 observations on 11 variables, one being a character variable, 9 being ordered or nominal, and 1 target class.
### Data preparation
Load Data
```{r}
# load the data set
data(BreastCancer)
# some algorithms don't like missing values, so remove rows with missing values
BreastCancer <- na.omit(BreastCancer)
# remove the unique identifier, which is useless and would confuse the machine learning algorithms
BreastCancer <- BreastCancer %>% dplyr::select(-Id) #why need to specify the package name to work?it's due to the package overlapping, MASS has select fun.
summary(BreastCancer)
#look at the class
#split(names(BreastCancer), sapply(BreastCancer, function(x)(class(x))))
```
Split data
```{r}
# partition the data set for 80% training and 20% evaluation
set.seed(2)
ind <- sample(2, nrow(BreastCancer), replace = TRUE, prob=c(0.8, 0.2))
train <- BreastCancer[ind==1,]
valid <- BreastCancer[ind==2,]
```
### Create multiple models using different classifiers/algorithms
1. decision tree
```{r}
library(rpart)
library(rpart.plot)
x.rp <- rpart(Class ~ ., data=train)
#plot(x.rp, main="Decision tree created using rpart") #?????
prp(x.rp, type = 1, extra = 1, split.font = 1, varlen = -10)
#prediction
# predict classes for the evaluation data set
x.rp.pred <- predict(x.rp, type="class", newdata=valid) # to ensemble
# score the evaluation data set (extract the probabilities)
x.rp.prob <- predict(x.rp, type="prob", newdata=valid)
table(x.rp.pred,valid$Class)
# Leave-1-Out Cross Validation (LOOCV)
#ans <- numeric(nrow(BreastCancer))
#for (i in 1:nrow(BreastCancer)) {
# mytree <- rpart(Class ~ ., BreastCancer[-i,])
# mytree.pred <- predict(mytree,BreastCancer[i,],type="class")
# ans[i] <- mytree.pred
#}
#ans <- factor(ans,labels=levels(BreastCancer$Class))
#table(ans,BreastCancer$Class)
#ans benign malignant
# benign 430 20
# malignant 14 219
```
2. conditional inference trees
```{r}
require(party)
x.ct <- ctree(Class ~ ., data=train)
plot(x.ct, main="Decision tree created using condition inference trees")
x.ct.pred <- predict(x.ct, newdata=valid) #ensemble
x.ct.prob <- 1- unlist(treeresponse(x.ct, valid), use.names=F)[seq(1,nrow(valid)*2,2)]
table(x.ct.pred,valid$Class)
```
3. random forest : an implementation of the random forest and bagging ensemble algorithms utilizing conditional inference trees as base learners.
```{r}
x.cf <- cforest(Class ~ ., data=train, control = cforest_unbiased(mtry = 9)) #?cforest_unbiased, bagging? #500 trees
x.cf.pred <- predict(x.cf, newdata=valid)
x.cf.prob <- 1- unlist(treeresponse(x.cf, valid), use.names=F)[seq(1,nrow(valid)*2,2)]
table(x.cf.pred,valid$Class)
```
4. bagging (bootstrap aggregating)
```{r}
# create model using bagging (bootstrap aggregating)
require(ipred)
x.ip <- bagging(Class ~ ., data=train) #Bagging classification trees with 25 bootstrap replications
x.ip.pred <- predict(x.ip, newdata=valid)
x.ip.prob <- predict(x.ip, type="prob", newdata=valid)
table(x.ip.pred,valid$Class)
```
5. svm
```{r}
require(e1071)
# svm requires tuning
x.svm.tune <- tune(svm, Class~., data = train,
ranges = list(gamma = 2^(-8:1), cost = 2^(0:4)),
tunecontrol = tune.control(sampling = "fix")) #why use these number to intialize ranges?
# display the tuning results (in text format)
x.svm.tune #note the gamma and cost
# If the tuning results are on the margin of the parameters (e.g., gamma = 2^-8),
# then widen the parameters.
# I manually copied the cost and gamma from console messages above to parameters below.
x.svm <- svm(Class~., data = train, cost=1, gamma=0.03125, probability = TRUE) #
x.svm.pred <- predict(x.svm, type="class", newdata=valid) #ensemble; only give the class
x.svm.prob <- predict(x.svm, type="prob", newdata=valid, probability = TRUE) # has to include probability = TRUE while type="prob" is not needed
#t <- attr(x.svm.prob, "probabilities") # only give the probabilities
table(x.svm.pred,valid$Class)
```
6. naive bayes
```{r}
library(klaR)
x.nb <- NaiveBayes(Class~., data = train)
x.nb.pred <- predict(x.nb,valid)$class #ensemble
x.nb.prob <- predict(x.nb,valid)$posterior
table(x.nb.pred,valid$Class)
```
7. neural network
```{r}
library(nnet)
x.nn <- nnet(Class~., data = train,size=1) #size? everytime, result changes, hidden layer, nodes?
x.nn.pred <- predict(x.nn,valid,type="class")
x.nn.prob <- predict(x.nn,valid,type="raw") #is this the probability of "malignant"? yes?
table(x.nn.pred,valid$Class)
```
8. Quadratic Discriminant Analysis
```{r}
library(MASS)
library(dplyr)
train.num <- train %>% dplyr::select(-Class) %>% mutate_if(is.factor,as.character)%>% mutate_if(is.character,as.numeric)
train.num$Class <- train$Class
valid.num <- valid%>%dplyr::select(-Class) %>% mutate_if(is.factor,as.character)%>% mutate_if(is.character,as.numeric)
valid.num$Class <- valid$Class
x.qda <- qda(Class~., data = train.num) #qda, formula, right hand is non-factor
x.qda.pred <- predict(x.qda, valid.num)$class
x.qda.prob <- predict(x.qda, valid.num)$posterior #is posterior?
table(x.qda.pred,valid.num$Class)
```
9. Regularised Discriminant Analysis
```{r}
library(klaR)
x.rda <- rda(Class~., data = train)
x.rda.pred <- predict(x.rda, valid)$class
x.rda.prob <- predict(x.rda, valid)$posterior
table(x.rda.pred,valid$Class)
```
### Plot ROC curves to compare the performance of the individual classifiers.
```{r}
#load the ROCR package which draws the ROC curves
require(ROCR)
# 1.rptree
# create an ROCR prediction object from rpart() probabilities
x.rp.prob.rocr <- prediction(x.rp.prob[,2], BreastCancer[ind == 2,'Class'])
#x.rp.prob.rocr #Slot "n.pos": check and see if it identifies the positive class; compare with summary(valid$Class)
# prepare an ROCR performance object for ROC curve (tpr=true positive rate, fpr=false positive rate)
x.rp.perf <- performance(x.rp.prob.rocr, "tpr","fpr")
# 2.ctree
x.ct.prob.rocr <- prediction(x.ct.prob, BreastCancer[ind == 2,'Class'])
x.ct.perf <- performance(x.ct.prob.rocr, "tpr","fpr")
# 3.cforest
x.cf.prob.rocr <- prediction(x.cf.prob, BreastCancer[ind == 2,'Class'])
x.cf.perf <- performance(x.cf.prob.rocr, "tpr","fpr")
# 4.bagging
x.ip.prob.rocr <- prediction(x.ip.prob[,2], BreastCancer[ind == 2,'Class'])
x.ip.perf <- performance(x.ip.prob.rocr, "tpr","fpr")
# 5.svm
x.svm.prob.rocr <- prediction(attr(x.svm.prob, "probabilities")[,2], BreastCancer[ind == 2,'Class'])
x.svm.perf <- performance(x.svm.prob.rocr, "tpr","fpr")
# 6.nb ### calculate the prob
x.nb.prob.rocr <- prediction(x.nb.prob[,2], BreastCancer[ind == 2,'Class'])
x.nb.perf <- performance(x.nb.prob.rocr, "tpr","fpr")
# 7.nn
x.nn.prob.rocr <- prediction(x.nn.prob, BreastCancer[ind == 2,'Class'])
x.nn.perf <- performance(x.nn.prob.rocr, "tpr","fpr")
# 8.qda
x.qda.prob.rocr <- prediction(x.qda.prob[,2], BreastCancer[ind == 2,'Class'])
x.qda.perf <- performance(x.qda.prob.rocr, "tpr","fpr")
# 9.rda
x.rda.prob.rocr <- prediction(x.rda.prob[,2], BreastCancer[ind == 2,'Class'])
x.rda.perf <- performance(x.rda.prob.rocr, "tpr","fpr")
```
```{r}
####### plot
# Output the plot to a PNG file for display on web. To draw to the screen,
# comment this line out.
#png(filename="roc_curve_models1.png", width=700, height=700)
#par(mfrow=c(1,2))
plot(x.rp.perf, col=2, main="ROC curves comparing classification performance \n of 9 machine learning models") #
legend(0.6, 0.6, c('rpart', 'ctree', 'cforest','bagging','svm'), 2:6)# Draw a legend.
plot(x.ct.perf, col=3, add=TRUE)# add=TRUE draws on the existing chart #has to be run together.
plot(x.cf.perf, col=4, add=TRUE)
plot(x.ip.perf, col=5, add=TRUE)
plot(x.svm.perf, col=6, add=TRUE)
# Close and save the PNG file.
#dev.off()
#png(filename="roc_curve_models2.png", width=700, height=700)
plot(x.nb.perf, col=7, main="ROC curves comparing classification performance \n of the other 4 machine learning models")
legend(0.6, 0.6, c('naive bayes', 'neural network', 'qda','rda'), 7:10)
plot(x.nn.perf, col=8, add=TRUE)
plot(x.qda.perf, col=9, add=TRUE)
plot(x.rda.perf, col=10, add=TRUE)
#dev.off()
```
### Ensemble: combine all the nine classifiers and generate the final prediction based on the majority rule.
```{r}
classifier <- data.frame(cbind(x.rp.pred, x.ct.pred, x.cf.pred, x.ip.pred, x.svm.pred, x.nb.pred,x.nn.pred,x.qda.pred,x.rda.pred))
names(classifier) <-c('recursive.tree','conditional.inference.tree','random.forest','bootstrap','svm','naive.bayes','neutral.network','qda','rda')
levels(classifier$neutral.network) =c('1','2')
classifier <-classifier%>% sapply(FUN = function(x)(ifelse(x=='1',0,1)))
classifier<- addmargins(classifier, margin = 2) # table/arragy, margin =2 aggregate by col
classifier <- data.frame(classifier)
classifier$predition <- ifelse(classifier$Sum >=5, 'malignant','benign')
head(classifier)
table(classifier$predition, valid$Class)
#confusion matrix
library(caret)
confusionMatrix(as.factor(classifier$predition), valid$Class, positive = 'malignant')
```