# life_expectancy.R
library(rminer)
library(ggplot2)
library(kknn)
library(ggpubr)
library(corrplot)
# Load the raw data set.
# stringsAsFactors = TRUE keeps text columns as factors on every R version
# (since R 4.0.0 read.csv() returns character columns by default); the numeric
# recoding of Country below relies on factor level codes.
lifeexp.df <- read.csv("Life Expectancy Data.csv", stringsAsFactors = TRUE)
# dataset description
str(lifeexp.df)
summary(lifeexp.df) # here we can see NAs
# DATA PREPARATION
## Country Factor to Numerical
# as.numeric() on a factor returns its integer level codes; applied directly to
# a character vector it would return NA for every row (with a warning), which
# would later make na.omit() drop the whole data set.
lifeexp.df$Country <- as.numeric(as.factor(lifeexp.df$Country))
## IMPUTATION
# Positions of the columns holding at least one missing value.
# The result stays NULL when no column contains an NA.
nacol <- local({
  has_na <- vapply(
    lifeexp.df,
    function(column) any(is.na(column)),
    logical(1)
  )
  if (any(has_na)) unname(which(has_na)) else NULL
})
# 1st method: case deletion
# Drop every row that has a missing value in any column.
lifeexp.na.del <- na.omit(lifeexp.df)
summary(lifeexp.na.del)
# 2nd method: imputation by mode
# Every column listed in nacol gets its NAs replaced by the most frequent
# observed value of that column in the original (non-imputed) data.
lifeexp.imp.mode <- lifeexp.df
for (i in nacol) {
  # table() skips NAs by default. which.max() returns the *position* of the
  # highest count inside the table, not the value itself, so the modal value
  # is recovered from the name of that table entry.
  counts <- table(lifeexp.df[[i]])
  mode.value <- names(counts)[which.max(counts)]
  if (is.numeric(lifeexp.df[[i]])) {
    mode.value <- as.numeric(mode.value)
  }
  lifeexp.imp.mode <- imputation(
    "value", lifeexp.imp.mode, i,
    Value = mode.value
  )
}
summary(lifeexp.imp.mode)
# 3rd method: imputation by hotdeck
# Fold over the NA columns in order: each imputation() call receives the data
# frame produced by the previous one, starting from the original data.
lifeexp.imp.hotdeck <- Reduce(
  function(partial, col.idx) imputation("hotdeck", partial, col.idx),
  nacol,
  lifeexp.df
)
summary(lifeexp.imp.hotdeck)
# imputation comparison
# For the first (at most) four columns with missing values, overlay the density
# of that column under each of the three NA-handling strategies.
# head() replaces nacol[1:4] so that fewer than four NA columns do not produce
# NA indexes (which would make the [[i]] look-ups fail).
plots <- lapply(head(nacol, 4), function(i) {
  # One long data frame: the column values plus a label naming the strategy.
  densities <- rbind(
    data.frame(
      v = lifeexp.na.del[[i]], Method = "NAs deleted",
      stringsAsFactors = FALSE
    ),
    data.frame(
      v = lifeexp.imp.mode[[i]], Method = "Mode",
      stringsAsFactors = FALSE
    ),
    data.frame(
      v = lifeexp.imp.hotdeck[[i]], Method = "Hotdeck",
      stringsAsFactors = FALSE
    )
  )
  ggplot(densities, aes(v, fill = Method)) +
    geom_density(alpha = 0.4) +
    xlab(colnames(lifeexp.df)[i])
})
# 2 x 2 grid of the density plots sharing a single legend at the bottom.
ggarrange(
  plotlist = plots, ncol = 2, nrow = 2,
  common.legend = TRUE, legend = "bottom"
)
# we keep hotdeck version
lifeexp <- lifeexp.imp.hotdeck
## EDA
# Correlation matrix of every column except Status, shown as the upper
# triangle of a circle-style correlation plot.
kept.cols <- setdiff(names(lifeexp), "Status")
correlation <- cor(lifeexp[, kept.cols, drop = FALSE])
corrplot(correlation, type = "upper", method = "circle")
# MODEL
# Column positions kept from the original script; neither variable is
# referenced by the code below.
inputs <- c(1:2, 4:ncol(lifeexp))
dvar <- 3
# Holdout - Random Forest
# Split the rows with ratio 2/3 and a fixed seed, then fit on the training part.
H <- holdout(lifeexp$Life.expectancy, ratio = 2 / 3, seed = 12345)
summary(H)
model1 <- fit(
  Life.expectancy ~ .,
  lifeexp[H$tr, ],
  model = "randomForest"
)
# Crossvalidation - Random Forest
# 10 groups over the full data set, using fit/predict as the learner functions.
model2 <- crossvaldata(
  Life.expectancy ~ ., lifeexp, fit, predict,
  ngroup = 10, seed = 123, model = "randomForest", task = "reg"
)
# EVALUATION
# Holdout
# Predict the held-out rows with the holdout model and compare with the
# observed target values of those same rows.
test.rows <- H$ts
pred1 <- predict(model1, lifeexp[test.rows, ])
target1 <- lifeexp$Life.expectancy[test.rows]
mgraph(
  target1, pred1,
  graph = "RSC", Grid = 10,
  main = "Random Forest - Holdout 1/3"
)
mmetric(target1, pred1, metric = "ALL")
# 10-fold cross-validation
# cv.fit holds the k-fold predictions for the full data set.
pred2 <- model2$cv.fit
mgraph(
  lifeexp$Life.expectancy, pred2,
  graph = "RSC", Grid = 10,
  main = "Random Forest - 10-fold Cross Validation"
)
mmetric(lifeexp$Life.expectancy, pred2, metric = "ALL")
# Shared validation setup for both mining() calls below.
kfold.setup <- c("kfold", 3, 42)
# mining for randomForest, external 3-fold, 20 Runs (=60 fitted models)
model3.mining <- mining(
  Life.expectancy ~ ., lifeexp,
  model = "randomForest", method = kfold.setup, Runs = 20
)
# three metrics: MAE, RMSE and R2
m <- mmetric(model3.mining, metric = c("MAE", "RMSE", "R2"))
print(m) # show metrics for each run
# mining for standard multiple linear regression
model4.mining <- mining(
  Life.expectancy ~ ., lifeexp,
  model = "mr", method = kfold.setup, Runs = 20
)
# list of minings, in the same order as the legend labels
L <- list(model3.mining, model4.mining)
mgraph(
  L,
  graph = "REC", leg = c("randomForest", "mr"),
  main = "REC curve", xval = 10
)