-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstatistical-modelling-2b.R
93 lines (73 loc) · 2.58 KB
/
statistical-modelling-2b.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# library imports
library(mclust)
library(dplyr)
# dataset is taken from https://www.kaggle.com/zynicide/wine-reviews
# The winemag-data-130k-v2.csv dataset
# This dataset has the following columns in this order
# country, description, designation, points, price, province, region_1, region_2,
# taster_name, taster_twitter_handle, title, variety, winery
# Load the raw reviews; the path is relative to the working directory.
wine_df <- read.csv("data/winemag-data-130k-v2.csv")

# Keep only the modelling variables, in this order: country, points, price
# (X = {country, price}, Y = points).
# Everything else is dropped: the "X" column (presumably the CSV's unnamed row
# index), the free-text / reviewer fields (description, designation,
# taster_name, taster_twitter_handle, title), and province, region_1,
# region_2, winery and variety, which might be useful in the future but are
# ignored for now.
wine_df <- wine_df[c("country", "points", "price")]

# View() opens a data viewer, so only call it in an interactive session; this
# lets the script also run unattended (e.g. under Rscript).
if (interactive()) {
  View(wine_df)
}

# Q2-b: restrict to US wines.
# Additionally keep only prices below 100 so as to avoid outliers.
# filter() also discards rows where the condition is NA (missing country or
# price); na.omit() then removes rows with any remaining missing value.
wine_us <- dplyr::filter(wine_df, country == "US" & price < 100)
wine_us <- na.omit(wine_us)

# country is constant after the filter, so drop it and fix the column order to
# (price, points) for plotting and clustering.
wine_us <- wine_us[c("price", "points")]

if (interactive()) {
  View(wine_us)
}
# Earlier experiments with transformed variables, kept for reference
# (not required):
# wine_us$price <- log(wine_us$price)
# wine_us$points <- wine_us$points / wine_us$price

# Scatter plot of points (y) vs price (x) for wine in the US.
plot(wine_us)

# When the data has more rows than mclust.options("subset") (2000 by default),
# Mclust() initialises EM from a hierarchical clustering of a random subsample.
# Fix the seed so the selected model is the same on every run.
set.seed(42)

# Let Mclust pick the number of components and the covariance model by BIC.
fit1 <- Mclust(wine_us)
print(fit1)
summary(fit1)
plot(fit1, what = "classification")
plot(fit1, what = "BIC")

# BIC table, component means and mixing proportions of the selected model.
fit1$BIC
fit1$parameters$mean
fit1$parameters$pro
# Jittered copies of the data. Both the jitter and (for large data) Mclust's
# initial subsample are random, so seed the RNG to make this section
# repeatable.
set.seed(42)

# Variant 1: original scale plus Gaussian jitter (sd = 0.1).
# The noise is drawn separately for every column. A single vector of length
# nrow(wine_us) would be recycled across columns, giving price and points the
# exact same perturbation and smearing every point along a diagonal.
toy_data <- wine_us
toy_data[] <- lapply(
  wine_us,
  function(column) column + rnorm(n = length(column), sd = 0.1)
)
plot(toy_data)

fit_toy <- Mclust(toy_data)
summary(fit_toy)
plot(fit_toy, what = "classification")

# Variant 2: standardise each column (mean 0, sd 1), then add the same amount
# of independent jitter. scale() works column-wise on the data frame and
# returns a numeric matrix, so no apply() is needed.
toy_data2 <- scale(wine_us)
toy_data2 <- toy_data2 +
  matrix(rnorm(n = length(toy_data2), sd = 0.1), nrow = nrow(toy_data2))
plot(toy_data2)

# Fit the standardised variant (the original fitted toy_data again, which made
# fit1_1 a duplicate of fit_toy and left toy_data2 unused).
fit1_1 <- Mclust(toy_data2)
plot(fit1_1, what = "classification")

# Component means and mixing proportions, in standardised units.
fit1_1$parameters$mean
fit1_1$parameters$pro
# When the data has more rows than mclust.options("subset") (2000 by default),
# mclustBIC() and Mclust() initialise EM from a random subsample. Fix the seed
# so the BIC table and the fits below are the same on every run.
set.seed(42)

# BIC for every candidate covariance model and number of components.
# NOTE: the variable name BIC shadows stats::BIC as a value (calls to BIC()
# still resolve to the function); the name is kept as-is.
BIC <- mclustBIC(wine_us)
plot(BIC)
summary(BIC)

# Fits with a fixed number of components G and a fixed covariance model.
# "VVV": ellipsoidal, varying volume, shape and orientation.
fit2 <- Mclust(wine_us, G = 7, modelNames = "VVV")
print(fit2)
summary(fit2)
plot(fit2, what = "classification")
plot(fit2, what = "uncertainty")

fit3 <- Mclust(wine_us, G = 8, modelNames = "VVV")
print(fit3)
summary(fit3)
plot(fit3, what = "classification")
plot(fit3, what = "uncertainty")

# "VVI": diagonal covariance, varying volume and shape.
fit4 <- Mclust(wine_us, G = 9, modelNames = "VVI")
print(fit4)
summary(fit4)
plot(fit4, what = "classification")
plot(fit4, what = "uncertainty")

# Same family as fit2/fit3 but with 6 components and a prior on the
# parameters; priorControl() with no arguments requests mclust's default prior.
fit5 <- Mclust(wine_us, G = 6, modelNames = "VVV", prior = priorControl())
print(fit5)
summary(fit5)
plot(fit5, what = "classification")
plot(fit5, what = "uncertainty")