-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathkmeans.r
143 lines (105 loc) · 3.76 KB
/
kmeans.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# K-means-Clustering
# This script performs outlier detection, feature scaling, k-means clustering, and evaluation of the resulting model.
##### MADE BY ######
#1st Objective: k-means clustering
#Created by: Paraskevi Sifnaiou
# 20/11/19
#####################
###### Step 1: Data import ######
# NOTE(review): setwd() with a hard-coded Windows path makes the script
# non-portable and changes the session's working directory as a side effect.
# The relative file name passed to read_xlsx() below is resolved against it.
setwd("C:/Users/")
# Import libraries.
# Functions used further down and the package that provides them:
#   readxl    -> read_xlsx()
#   dplyr     -> %>% and filter()
#   fpc       -> plotcluster()
#   MASS      -> parcoord()
#   flexclust -> randIndex()
#   NbClust   -> NbClust()
# NOTE(review): no caret function appears to be called in this script.
# The attach order matters: when two packages export the same name, the one
# attached later masks the earlier one, so do not reorder these calls casually.
library(readxl)
library(dplyr)
library(fpc)
library(MASS)
library(caret)
library(flexclust)
library(NbClust)
# Read the workbook into `original` (presumably one row per wine sample;
# confirm against the file).
original <- read_xlsx("Whitewine.xlsx")
# Quick visual check of the raw data: one box per column.
boxplot(original)
####Step 2: Remove the extreme outliers#####
# Show Min., 1st Qu., Median, Mean, 3rd Qu. and Max. for residual sugar
summary(original$`residual sugar`)

# Flag the values of a numeric vector that lie outside its Tukey fences.
#   values: numeric vector to screen.
#   k:      multiplier applied to the interquartile range (default 1.5).
# Returns a logical vector as long as `values`: TRUE where a value is below
# Q1 - k * IQR or above Q3 + k * IQR, FALSE where it lies on or inside the
# fences, and NA where the value itself is NA.
is_iqr_outlier <- function(values, k = 1.5) {
  quartiles <- quantile(values, probs = c(0.25, 0.75),
                        na.rm = TRUE, names = FALSE)
  margin <- k * (quartiles[2] - quartiles[1])
  values < quartiles[1] - margin | values > quartiles[2] + margin
}

# Columns screened for outliers. The fences are always computed on the
# original data, so the order of the columns does not change which rows are
# kept.
outlier_columns <- c("residual sugar", "free sulfur dioxide",
                     "total sulfur dioxide")
outlier_flags <- lapply(original[outlier_columns], is_iqr_outlier)

# Rows flagged per column, stacked column by column. A wine that is extreme on
# several columns appears once for every column it is flagged on.
outliers <- do.call(
  rbind,
  unname(lapply(outlier_flags, function(flag) original[which(flag), ]))
)

# Keep only the rows that are not flagged on any screened column. This is the
# exact complement of the flags above, so a value lying exactly on a fence is
# retained instead of disappearing from both `outliers` and `no_outliers`.
# Rows with a missing value in a screened column are dropped (which() skips
# NA), unless another column already flags them as outliers.
flagged_any <- Reduce(`|`, outlier_flags)
no_outliers <- original[which(!flagged_any), ]

# Visual check of the cleaned data: one box per column.
boxplot(no_outliers)
#Scaling
# Standardise every column except the 12th to zero mean and unit standard
# deviation (the defaults of scale()), so that no single variable dominates
# the Euclidean distances used by k-means.
# NOTE(review): column 12 is presumably `quality`, which is used later only
# for evaluation; confirm against the workbook layout.
wine_stand <- scale(no_outliers[-12])
# Summarise the scaled matrix. The previous code referenced `scale_wine`,
# an object that is never created, which stopped the script here.
summary(wine_stand)
#NbClust()
# Choose the number of clusters by majority vote across NbClust's validity
# indices, evaluated for k = 2..8 on k-means partitions. `index` is left at
# its default.
# k-means starts are random, so fix the seed for reproducibility.
set.seed(1234)
nc <- NbClust(wine_stand,
              min.nc = 2, max.nc = 8,
              method = "kmeans")
# The first row of Best.nc holds the cluster count proposed by each index.
# The element name is spelled out in full instead of relying on `$` partial
# matching.
proposed_k <- nc$Best.nc[1, ]
# Bar chart of the votes. The title reports how many indices actually cast a
# vote (table() ignores NA), rather than a hard-coded number.
barplot(table(proposed_k),
        xlab = "Number of Clusters",
        ylab = "Number of Criteria",
        main = sprintf("Number of Clusters Chosen by %d Criteria",
                       sum(!is.na(proposed_k))))
# Elbow plot: total within-cluster sum of squares for k = 1..9.
# The candidate values are visited in increasing order, so the k-means fits
# draw from the random number stream in the same sequence as a plain loop.
ws <- vapply(
  1:9,
  function(k) sum(kmeans(wine_stand, centers = k)$withinss),
  numeric(1)
)
plot(
  1:9,
  ws,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)
# ---- k-means fits for k = 2..5 and their evaluation ----
# No seed is set in this section, so every fit depends on the state of the
# random number generator at this point; reordering the kmeans() calls changes
# the results. nstart is left at its default, i.e. a single random start.
#kmeans=2
fit.km2 <- kmeans(wine_stand,2)
# Plot the observations using the k = 2 cluster labels.
plotcluster(wine_stand, fit.km2$cluster)
#Evaluation for k=2
# Cross-tabulation: rows are wine quality scores, columns are cluster ids.
confuse <- table(no_outliers$quality,fit.km2$cluster)
confuse
#MASS plot
# Parallel-coordinates plot of the scaled variables; the cluster ids are
# passed as the second argument (line colour).
parcoord(wine_stand, fit.km2$cluster)
#kmeans=3
fit.km3 <- kmeans(wine_stand , 3)
# Print the fitted object.
fit.km3
#Evaluation for k=3
# NOTE(review): unlike the other tables, confuse3 is stored but never printed
# or used afterwards in this script.
confuse3 <- table(no_outliers$quality,fit.km3$cluster)
#kmeans=4
fit.km4 <- kmeans(wine_stand, 4)
#Evaluation for k=4
table(no_outliers$quality,fit.km4$cluster)
#kmeans=5
fit.km5 <- kmeans(wine_stand, 5)
#Evaluation for k=5
table(no_outliers$quality,fit.km5$cluster)
plotcluster(wine_stand,fit.km5$cluster)
#Evaluation with ARI for k=2
# Agreement between quality scores and the k = 2 clusters, computed from the
# cross-tabulation above. Only k = 2 is evaluated this way.
randIndex(confuse)
#NbClust() with Manhattan distance
# Repeat the cluster-count search with Manhattan distance, all indices and
# k = 2..5. The result is stored; nothing further is done with it here.
clusters_manhattan <- NbClust(wine_stand,distance="manhattan",min.nc=2,max.nc=5,method="kmeans",index="all")