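# unlabelled_data_training.R
#
# Build a llama algorithm-selection model from the results of four maximum
# common subgraph solvers (clique, k\u2193, McSplit, McSplit\u2193) on the SIP and MCS
# instance sets, using the unlabelled feature set (get_features(labelled = FALSE)),
# then produce summary plots and per-family runtime tables.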
library(parallelMap)
library(llama)
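# Set up a 64-worker socket cluster for llama's parallelised training and
# evaluation, and load llama on every worker.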
parallelStartSocket(64)
parallelLibrary("llama")
# Read the data
names <- c("ID", "nodes", "time", "size")
classes <- c("character", "numeric", "numeric", "numeric")
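# Each solver has one result file per instance set (SIP and MCS); read both and
# stack them into a single data frame of (ID, nodes, time, size) rows.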
read_data <- function(algorithm) {
sip <- read.csv(paste0("results/", algorithm, ".sip.csv"), header = FALSE,
colClasses = classes, col.names = names)
mcs <- read.csv(paste0("results/", algorithm, ".mcs.csv"), header = FALSE,
colClasses = classes, col.names = names)
rbind(sip, mcs)
}
clique <- read_data("clique")
kdown <- read_data("kdown")
mcsplit <- read_data("mcsplit")
mcsplitdown <- read_data("mcsplitdown")
# Construct the feature data frame
source("common.R")
features <- get_features(labelled = FALSE)
# Check the clique dataset
clique_features <- merge(clique, features, by = "ID")
# Are there any instances that are too big to be solved?
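# (The clique encoding works on an association graph with
# pattern.vertices * target.vertices vertices, so products of 16000 or more were
# presumably skipped as too large.)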
clique_features$ID[clique_features$pattern.vertices *
clique_features$target.vertices >= 16000]
# What instances are missing?
small_features <- subset(features, pattern.vertices * target.vertices < 16000)
small_features$ID[!(small_features$ID %in% clique$ID)]
# Check if the answers match
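# On every instance solved within the 1e6 ms timeout, mcsplit and clique should
# report the same solution size.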
answers <- data.frame(ID = features[1])
answers <- merge(answers, kdown[kdown$time < 1e6, c("ID", "size")], by = "ID",
all.x = TRUE)
colnames(answers) <- c("ID", "kdown")
answers <- merge(answers, mcsplit[mcsplit$time < 1e6, c("ID", "size")],
by = "ID", all.x = TRUE)
colnames(answers) <- c("ID", "kdown", "mcsplit")
answers <- merge(answers, clique[clique$time < 1e6, c("ID", "size")],
by = "ID", all.x = TRUE)
colnames(answers) <- c("ID", "kdown", "mcsplit", "clique")
answers$equal <- answers$mcsplit == answers$clique
all(answers[complete.cases(answers), "equal"])
# Construct the performance (running time) data frame
performance <- data.frame(ID = features[1])
performance <- merge(performance, clique[, c("ID", "time")], by = "ID",
all.x = TRUE)
colnames(performance) <- c("ID", "clique")
performance <- merge(performance, kdown[, c("ID", "time")], by = "ID",
all.x = TRUE)
colnames(performance) <- c("ID", "clique", "kdown")
performance <- merge(performance, mcsplit[, c("ID", "time")], by = "ID",
all.x = TRUE)
colnames(performance) <- c("ID", "clique", "kdown", "mcsplit")
performance <- merge(performance, mcsplitdown[, c("ID", "time")], by = "ID",
all.x = TRUE)
rm("clique", "kdown", "mcsplit", "mcsplitdown")
colnames(performance) <- c("ID", "clique", "kdown", "mcsplit", "mcsplitdown")
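# Clique was not run on the instances flagged as too large above; treat those
# missing runs as timeouts, cap every runtime at the 1e6 ms timeout, and keep
# only the instances that at least one solver finished.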
performance$clique[is.na(performance$clique)] <- 1e6
performance$clique <- pmin(performance$clique, 1e6)
performance$kdown <- pmin(performance$kdown, 1e6)
performance$mcsplit <- pmin(performance$mcsplit, 1e6)
performance$mcsplitdown <- pmin(performance$mcsplitdown, 1e6)
performance <- performance[performance$clique < 1e6 |
performance$kdown < 1e6 |
performance$mcsplit < 1e6 |
performance$mcsplitdown < 1e6, ]
performance <- performance[order(performance$ID), ]
features <- features[features$ID %in% performance$ID, ]
# Construct the success data frame
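# An algorithm "succeeds" on an instance iff it finished within the timeout.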
success <- performance
success$clique <- success$clique < 1e6
success$kdown <- success$kdown < 1e6
success$mcsplit <- success$mcsplit < 1e6
success$mcsplitdown <- success$mcsplitdown < 1e6
costs <- read.csv("results/costs.csv", header = FALSE)
colnames(costs) <- c("ID", "group1")
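# Feature computation costs: a single cost group covering all features, with
# per-instance values taken from costs.csv.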
data <- input(features, performance, successes = success,
              costs = list(groups = list(group1 = colnames(features)[-1]),
                           values = costs))
rm("features", "success")  # performance is reused below for the per-family tables
saveRDS(data, "models/unlabelled_data.rds")
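# Train a random forest classifier as the per-instance algorithm selector,
# evaluated with stratified cross-validation folds.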
model <- classify(makeLearner("classif.randomForest"),
cvFolds(data, stratify = TRUE))
saveRDS(model, "models/unlabelled.rds")
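# Optional evaluation sketch: llama's built-in measures could be used to compare
# the selector against the virtual best and single best solvers, e.g.
# mean(parscores(data, model))                    # selector, PAR10 by default
# mean(parscores(data, vbs))                      # virtual best solver
# mean(parscores(data, singleBest))               # single best solver
# mean(misclassificationPenalties(data, model))
# mean(successes(data, model))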
parallelStop()
# Plots
times <- subset(data$data, T, data$performance)
times$vbs <- apply(times, 1, min)
cols <- gray(seq(1, 0, length.out = 255))
labels <- c("clique", sprintf("k\u2193"), "McSplit",
sprintf("McSplit\u2193"), "VBS")
# Log runtimes by solver and instance
image(log10(t(as.matrix(times[, -5]))), axes = F, col = cols)
axis(1, labels = labels[-5], at = seq(0, 1, 1 / (length(data$performance) - 1)),
las = 2)
# Per-solver rank on each instance: white = ranked first (fastest), black = last
# (ties between timed-out runs make the picture look odd)
image(apply(times[, -5], 1, rank), axes = F, col = cols)
axis(1, labels = labels[-5], at = seq(0, 1, 1 / (length(data$performance) - 1)),
     las = 2)
# Tables for best algorithms
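# Run one of the following lines at a time to restrict the table to a single
# instance family.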
times <- performance[grep("data/sip-instances/images-CVIU11", performance$ID), ]
times <- performance[grep("data/sip-instances/images-PR15", performance$ID), ]
times <- performance[grep("data/sip-instances/largerGraphs", performance$ID), ]
times <- performance[grep("data/sip-instances/LV", performance$ID), ]
times <- performance[grep("data/sip-instances/meshes-CVIU11", performance$ID), ]
times <- performance[grep("data/sip-instances/phase", performance$ID), ]
times <- performance[grep("data/sip-instances/scalefree", performance$ID), ]
times <- performance[grep("data/sip-instances/si", performance$ID), ]
times <- performance[grep("data/mcs-instances", performance$ID), ]
times$vbs <- apply(times[, -1], 1, min)  # drop the character ID column before taking row minima
# How many times is each algorithm the best?
length(which(times$clique <= times$kdown & times$clique <= times$mcsplit &
times$clique <= times$mcsplitdown))
length(which(times$kdown <= times$clique & times$kdown <= times$mcsplit &
times$kdown <= times$mcsplitdown))
length(which(times$mcsplit <= times$clique & times$mcsplit <= times$kdown &
times$mcsplit <= times$mcsplitdown))
length(which(times$mcsplitdown <= times$clique &
times$mcsplitdown <= times$kdown &
times$mcsplitdown <= times$mcsplit))
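# Summary of the instances on which no algorithm is strictly faster than all the
# others (mostly ties, e.g. where everything timed out).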
summary(times[!(times$clique < times$kdown & times$clique < times$mcsplit &
times$clique < times$mcsplitdown) &
!(times$kdown < times$clique & times$kdown < times$mcsplit &
times$kdown < times$mcsplitdown) &
!(times$mcsplit < times$clique & times$mcsplit < times$kdown &
times$mcsplit < times$mcsplitdown) &
!(times$mcsplitdown < times$clique &
times$mcsplitdown < times$kdown &
times$mcsplitdown < times$mcsplit), ])
library(lattice)
library(latticeExtra)
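# Empirical CDFs of the runtimes of each solver and of the virtual best solver.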
ecdfplot(~ clique + kdown + mcsplit + mcsplitdown + vbs, data = times,
auto.key = list(space = "right", text = labels), xlab = "Runtime (ms)")
# Heatmaps for pattern/target features
features <- subset(data$data, T, data$features)
n_features <- normalize(features)
graph_feature_names <- c("vertices", "edges", "loops", "mean degree",
"max degree", "SD of degrees", "density", "connected",
"mean distance", "max distance", "distance \u2265 2",
"distance \u2265 3", "distance \u2265 4")
full_feature_names <- c(paste("pattern", graph_feature_names),
paste("target", graph_feature_names),
c("vertices ratio", "edges ratio", "mean degree ratio",
"max degree ratio", "density ratio",
"mean distance ratio", "max distance ratio"))
par(mar = c(1, 10, 1, 1))
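# Normalised feature values: instances along the x-axis, features along the
# y-axis.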
image(as.matrix(n_features$features), axes = F, col = cols)
axis(2, labels = full_feature_names,
at = seq(0, 1, 1 / (length(data$features) - 1)), las = 2)