-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload-data.R
109 lines (99 loc) · 2.59 KB
/
load-data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
################################################################
# Mineria de Datos: Preprocesamiento y Clasificacion #
# #
# FILE: load-data.R #
# #
# (C) Cristian González Guerrero #
################################################################
# Load required libraries
library(utils)
library(stats)
# Load the complete dataset.
# stringsAsFactors is pinned explicitly: its default changed from TRUE to
# FALSE in R 4.0.0, and the subset.factor* helpers in this file select
# columns by their factor class.
dataset <- read.csv("data/datcom2016.csv", stringsAsFactors = TRUE)
# Rename the first 50 columns to X1..X50
names(dataset)[1:50] <- paste0("X", 1:50)
# Load the 10 folds.
# split_up[[k]] holds fold k: dataset.test(k) returns it as-is and
# dataset.train(k) row-binds all the other folds.
split_up <- vector("list", 10)
for (k in 1:10) {
  filename <- paste0("data/datcom2016-10-", k, "-fold", ".csv")
  # stringsAsFactors is pinned explicitly: its default changed from TRUE to
  # FALSE in R 4.0.0, and the subset.factor* helpers rely on factor columns.
  x <- read.csv(filename, stringsAsFactors = TRUE)
  # Use column `X` as the row names, then drop the first column
  # (presumably that same `X` column -- confirm against the CSV files).
  rownames(x) <- x$X
  x <- x[, -1]
  # Rename the first 50 remaining columns to X1..X50
  names(x)[1:50] <- paste0("X", 1:50)
  split_up[[k]] <- x
}
# Training partition for fold `i`: every fold stored in the global `split_up`
# list except the i-th, stacked row-wise into a single data frame.
#   i: index of the fold to leave out.
dataset.train <- function(i) {
  training_folds <- split_up[-i]
  do.call("rbind", training_folds)
}
# Test partition for fold `i`: the i-th element of the global `split_up` list,
# returned unchanged.
#   i: index of the fold to return.
dataset.test <- function(i) {
  held_out <- split_up[[i]]
  held_out
}
# Create some useful sets: fold 1 is held out for testing and the remaining
# folds form the training partition.
# The label column is dropped by name rather than by position: the first 50
# columns are the X1..X50 features, so `class` is not column 50 and a
# positional `-50` would remove feature X50 while keeping the label.
dataset.tra <- dataset.train(1)      # Training set
dataset.tra.cl <- dataset.tra$class  # |-> Class
dataset.tra.dt <- dataset.tra[       # \-> Data
  , names(dataset.tra) != "class", drop = FALSE
]
dataset.tst <- dataset.test(1)       # Test set
dataset.tst.cl <- dataset.tst$class  # |-> Class
dataset.tst.dt <- dataset.tst[       # \-> Data
  , names(dataset.tst) != "class", drop = FALSE
]
# Separate data into numeric and factor (useful for visualization)

# Select every column of `df` that is not a factor.
#   df: a data frame.
# Returns a data frame (even when zero or one column is selected) holding the
# non-factor columns in their original order. A non-factor `class` column is
# kept as well.
# The column test uses vapply(is.factor) instead of comparing
# sapply(df, class) with "factor": sapply yields a list or matrix when a
# column carries more than one class (e.g. ordered factors), which breaks the
# column mask.
subset.numeric.dt <- function(df) {
  is_factor_col <- vapply(df, is.factor, logical(1))
  df[, !is_factor_col, drop = FALSE]
}
# Extract the `class` column of `df` (the labels) as a plain vector/factor.
#   df: a data frame.
# Returns df$class, which is NULL when no such column exists.
subset.numeric.cl <- function(df) {
  df$class
}
# Non-factor columns of `df` (as selected by subset.numeric.dt) with the
# `class` column of `df` appended as the last column, named `class`.
#   df: a data frame.
subset.numeric <- function(df) {
  non_factor_part <- subset.numeric.dt(df)
  labels <- df$class
  cbind(non_factor_part, class = labels)
}
# Select the factor columns of `df`, excluding the `class` label column.
#   df: a data frame.
# Returns a data frame (even when zero or one column is selected) with the
# factor columns other than `class`, in their original order.
# `class` is excluded by name, so the function also works when `class` is not
# a factor or is absent; a non-standard-evaluation `select = -class` would
# fail in that case because `class` would resolve to the base function.
# vapply(is.factor) replaces a sapply(df, class) comparison, which yields a
# list or matrix when a column carries more than one class.
subset.factor.dt <- function(df) {
  is_factor_col <- vapply(df, is.factor, logical(1))
  keep <- is_factor_col & names(df) != "class"
  df[, keep, drop = FALSE]
}
# Labels are extracted the same way as in the "numeric" family: alias of
# subset.numeric.cl.
subset.factor.cl <- subset.numeric.cl
# Select every factor column of `df` (a factor `class` column included).
#   df: a data frame.
# Returns a data frame (even when zero or one column is selected) with the
# factor columns in their original order.
# vapply(is.factor) replaces a sapply(df, class) comparison, which yields a
# list or matrix when a column carries more than one class (e.g. ordered
# factors) and then breaks the column mask.
subset.factor <- function(df) {
  is_factor_col <- vapply(df, is.factor, logical(1))
  df[, is_factor_col, drop = FALSE]
}
# Select the columns of `df` whose class is exactly "numeric" (integer and
# factor columns are excluded).
#   df: a data frame.
# Returns a data frame (even when zero or one column is selected) with the
# matching columns in their original order.
# The per-column test goes through vapply so the mask is always a logical
# vector with one entry per column; sapply(df, class) would turn into a list
# or matrix when a column carries more than one class (e.g. POSIXct).
subset.num.dt <- function(df) {
  is_numeric_col <- vapply(
    df,
    function(col) identical(class(col), "numeric"),
    logical(1)
  )
  df[, is_numeric_col, drop = FALSE]
}
# Labels are extracted the same way as in the "numeric" family: alias of
# subset.numeric.cl.
subset.num.cl <- subset.numeric.cl
# Columns of `df` selected by subset.num.dt with the `class` column of `df`
# appended as the last column, named `class`.
#   df: a data frame.
subset.num <- function(df) {
  numeric_part <- subset.num.dt(df)
  labels <- df$class
  cbind(numeric_part, class = labels)
}
# Select the columns of `df` whose class is exactly "integer" (factor columns
# are excluded even though they are stored as integers).
#   df: a data frame.
# Returns a data frame (even when zero or one column is selected) with the
# matching columns in their original order.
# The per-column test goes through vapply so the mask is always a logical
# vector with one entry per column; sapply(df, class) would turn into a list
# or matrix when a column carries more than one class.
subset.int.dt <- function(df) {
  is_integer_col <- vapply(
    df,
    function(col) identical(class(col), "integer"),
    logical(1)
  )
  df[, is_integer_col, drop = FALSE]
}
# Labels are extracted the same way as in the "numeric" family: alias of
# subset.numeric.cl.
subset.int.cl <- subset.numeric.cl
# Columns of `df` selected by subset.int.dt with the `class` column of `df`
# appended as the last column, named `class`.
#   df: a data frame.
subset.int <- function(df) {
  integer_part <- subset.int.dt(df)
  labels <- df$class
  cbind(integer_part, class = labels)
}
# Remove temporary variables left in the workspace by the fold-loading loop
# (`k` is the loop index, `filename` and `x` are its per-iteration scratch).
rm(filename, x, k)