# Recreating Kosinski's Analysis
# From the paper 'Mining Big Data to Extract Patterns and Predict Real-Life Outcomes'
# Install necessary packages
install.packages("foreign", dependencies=TRUE)
install.packages("Rcmdr", dependencies=TRUE)
install.packages("psych", dependencies=TRUE)
install.packages("Matrix", dependencies=TRUE) # From Kosinsky article
install.packages("irlba", dependencies=TRUE) # From Kosinsky article
install.packages("topicmodels", dependencies=TRUE) # From Kosinsky article
install.packages("ROCR", dependencies=TRUE) # From Kosinsky article
# Load packages
library(foreign)
library(Rcmdr)
library(psych)
library(Matrix) # From Kosinski article
library(irlba) # From Kosinski article
library(topicmodels) # From Kosinski article
library(ROCR) # From Kosinski article
# Description of files used in the analysis
# users.csv: contains psychodemographic user profiles. It has 110,728 rows
# (excluding the row holding column names) and nine columns: anonymized user ID,
# gender ("0" for male and "1" for female), age, political views ("0" for Democrat and "1" for Republican),
# and five scores on a 100-item-long International Personality Item Pool questionnaire measuring the five-factor
# (i.e., Openness, Conscientiousness, Extroversion, Agreeableness, and Neuroticism) model of personality
# likes.csv: contains anonymized IDs and names of n_L = 1,580,284 Facebook Likes.
# It has two columns: ID and name.
# users-likes.csv: contains the associations between users and their Likes, stored as user-Like pairs.
# It has n_u-L = 10,612,326 rows and two columns: user ID and Like ID. The existence of a user-Like
# pair implies that the given user had the corresponding Like on their profile.
# Set working directory to where you downloaded the original files
setwd("D:/Monash/EDF4604 Research project/01 Research Project/EDF4604_Research_Project")
# Load the files into R using read.csv
users <- read.csv("users.csv")
likes <- read.csv("likes.csv")
ul <- read.csv("users-likes.csv")
# Inspect the dimensions of the data
# Inspect head of data
# Inspect tail of data
# Kosinski only does this for ul
dim(users)
head(users)
tail(users)
dim(likes)
head(likes)
tail(likes)
dim(ul)
head(ul)
tail(ul)
# Constructing a user-Like matrix
# The matrix is named M
# First we need to match users with their Likes
ul$user_row <- match(ul$userid, users$userid)
ul$like_row <- match(ul$likeid, likes$likeid)
# Use the pointers to rows in the 'users' and 'likes' objects
# to build a user-Like matrix
require(Matrix)
M <- sparseMatrix(i = ul$user_row, j = ul$like_row, x = 1)
# Set row names of the matrix to contain the IDs of the respective users
# Set column names to contain the names of the respective Likes
# Display dimensions of Matrix M
rownames(M) <- users$userid
colnames(M) <- likes$name
dim(M)
# Objects ul and likes are not necessary for further steps and can be removed
# Uncomment following line to remove the objects
# rm(ul, likes)
# Trimming the User-Like Matrix
# Remove the least frequent data points
# Thresholds are 50 minimum likes per user and minimum 150 users per
# like to reduce the time required for further analysis
repeat {
  i <- sum(dim(M))
  M <- M[rowSums(M) >= 50, colSums(M) >= 150]
  if (sum(dim(M)) == i) break
}
users <- users[match(rownames(M), users$userid),]
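# Optional sanity check (not in the original script): dimensions of the trimmed matrix
dim(M)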
# Extracting patterns from big data sets
# Singular value decomposition (SVD)
# Latent Dirichlet allocation (LDA)
# Reducing the dimensionality of the User-Like matrix using SVD and LDA
# The package used in this section (irlba) was loaded at the start
# Parameter nv = 5 corresponds to k, or the number of SVD dimensions to be extracted
# SVD
set.seed(seed = 68)
Msvd <- irlba(M, nv = 5)
u <- Msvd$u
v <- Msvd$v
# Note: We do not center the data prior to conducting the SVD to maintain the sparse
# format of Matrix M
# Centering could be achieved by using the following command:
# M <- scale(M, scale = F)
# Scree plot representing the singular values of the consecutive
# SVD dimensions (matrix SIGMA) can be displayed using:
plot(Msvd$d)
# Use varimax function to rotate the SVD dimensions
# The following code produces V_rot and U_rot, the
# varimax rotated equivalents of matrices U and V:
v_rot <- unclass(varimax(Msvd$v)$loadings)
u_rot <- as.matrix(M %*% v_rot)
# LDA
# The topicmodels package was installed and loaded at the beginning
# Set the Dirichlet distribution parameters to alpha = 10 and delta = 0.1
# As with SVD, preset R's random number generator to seed = 68
Mlda <- LDA(M, k = 5, method = "Gibbs",
            control = list(alpha = 10, delta = .1, seed = 68))
gamma <- Mlda@gamma
beta <- exp(Mlda@beta)
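# Optional check (not in the original script): gamma holds users x clusters scores,
# beta (after exp) holds clusters x Likes scores
dim(gamma)
dim(beta)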
# The following code computes the log-likelihoods across different values of k
# The 'for' loop cycles through increasing values of i, trains the LDA model while
# setting k = i, and then extracts the model's log-likelihood and its degrees
# of freedom using the logLik function. The results are saved in the lg object.
# Consider widening the range of k's to be tested, but be aware that this code can
# take a very long time to run depending on the computer it is run on
lg <- list()
for (i in 2:5) {
  Mlda <- LDA(M, k = i, method = "Gibbs",
              control = list(alpha = 10, delta = .1, seed = 68))
  lg[[i]] <- logLik(Mlda)
}
plot(2:5, unlist(lg))
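# Optional (not in the original script): identify the k with the highest log-likelihood
# which.max(unlist(lg)) + 1  # +1 because the loop starts at k = 2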
# Interpreting clusters and dimensions
# For simplicity, we will use k = 5 LDA clusters and k = 5 SVD dimensions
# extracted from Matrix M, but you should experiment with different
# values of k
# LDA clusters
# Start by computing the correlations between user scores on LDA clusters
# stored in R object 'gamma' and psychodemographic user traits
# The code excludes the first column of 'users' because it contains user IDs
cor(gamma, users[, -1], use = "pairwise")
# Investigate which Likes are most representative of the k = 5 LDA clusters
# The strength of the relationship is stored in R object 'beta'
# The following code extracts the 10 Likes most strongly associated
# with each of the clusters
# First we create an empty list 'top' to store the results
# Next we start a 'for' loop assigning consecutive values from 1 to 5 to i
# Inside the loop, we use the 'order' function to order the Likes in ascending
# order based on their scores on the i-th LDA cluster. Next we use the 'tail'
# function to extract the indexes of the last 10 Likes (i.e. the Likes with
# the highest LDA scores) and save their names as the i-th element of list 'top'
top <- list()
for (i in 1:5) {
  f <- order(beta[i, ])
  temp <- tail(f, n = 10)
  top[[i]] <- colnames(M)[temp]
}
top
# SVD Dimensions
cor(u_rot, users[, -1], use = "pairwise")
# 'For' loop from LDA adapted for SVD
top <- list()
bottom <- list()
for (i in 1:5) {
  f <- order(v_rot[, i])
  temp <- tail(f, n = 10)
  top[[i]] <- colnames(M)[temp]
  temp <- head(f, n = 10)
  bottom[[i]] <- colnames(M)[temp]
}
# Extract the indexes of the Likes with extreme scores
# (here, 'f' holds the ordering for the last dimension processed in the loop)
colnames(M)[tail(f, n = 10)]
colnames(M)[head(f, n = 10)]
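# Optional (not in the original script): display the top and bottom Likes
# for all five SVD dimensions, mirroring the 'top' printout in the LDA section
top
bottom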
# Predicting real life outcomes
# Cross-validation
# Predicting real-life outcomes with Facebook likes
# Build a prediction model based on the SVD dimensions extracted
# from the user-Like Matrix M. Start by assigning a random number to
# each user in the sample as a way of splitting users into 10 independent
# subsets (or folds) that will be used for cross-validation
folds <- sample(1:10, size = nrow(users), replace = T)
# The users with fold = 1 are assigned to the test subset, and the remaining
# users are assigned to the training subset. The following code produces a
# logical vector 'test' that takes the value of TRUE when object folds equals
# 1 and FALSE otherwise
test <- folds == 1
# Logical vectors can be used to extract desired elements from R objects, such
# as other vectors or matrices. For example, the command mean(users$age[test])
# can be used to compute the average age of the users in the test subset, as run below.
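# Illustrative check using the command described above
mean(users$age[test])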
# In the following step, we extract k = 50 SVD dimensions from the training
# subset of the user-Like data (i.e. M[!test, ]). The varimax-rotated Like SVD
# scores (v_rot) are then used to compute user SVD scores for the entire sample.
# Because the SVD dimensions are extracted from the training data only, the SVD
# scores of the users in the test subset preserve the independence of the results
# obtained from the training and test subsets
Msvd <- irlba(M[!test,], nv = 50)
v_rot <- unclass(varimax(Msvd$v)$loadings)
u_rot <- as.data.frame(as.matrix(M %*% v_rot))
# Next apply the function 'glm' to build regression models predicting the variables
# 'ope' (Openness) and gender from the user SVD scores in the training subset
# A logistic regression model can be obtained by specifying the parameter
# family = "binomial"
fit_o <- glm(users$ope ~ ., data = u_rot, subset = !test)
fit_g <- glm(users$gender ~ ., data = u_rot, subset = !test, family = "binomial")
# Next estimate the predictions for the testing sample using the function 'predict'
# Predictions are produced using the models developed in the previous step
pred_o <- predict(fit_o, u_rot[test,])
pred_g <- predict(fit_g, u_rot[test,], type = "response")
# Finally, estimate the accuracy of the predictions for this particular cross-
# validation fold. The accuracy of the linear predictions can be expressed
# as a Pearson product-moment correlation
cor(users$ope[test], pred_o)
# For dichotomous variables such as gender, report prediction accuracy using
# the area under the receiver-operating characteristic curve coefficient (AUC)
# which can be computed using the ROCR library
temp <- prediction(pred_g, users$gender[test])
performance(temp, "auc")@y.values
# So far we have estimated prediction performance based on all k = 50 SVD dimensions
# Next we will investigate the relationship between the cross-validated prediction
# accuracy and the number k of SVD dimensions. In the following code we first
# define a set 'ks' containing the values of k that we want to test. We then create
# an empty list 'rs' that is used to store the results. Then we start a 'for' loop that
# reruns the code surrounded by curly brackets while changing the value of i to consecutive
# elements stored in ks.
set.seed(seed = 68)
ks <- c(2:10,15,20,30,40,50)
rs <- list()
for (i in ks) {
  v_rot <- unclass(varimax(Msvd$v[, 1:i])$loadings)
  u_rot <- as.data.frame(as.matrix(M %*% v_rot))
  fit_o <- glm(users$ope ~ ., data = u_rot, subset = !test)
  pred_o <- predict(fit_o, u_rot[test, ])
  rs[[as.character(i)]] <- cor(users$ope[test], pred_o)
}
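# Optional (not in the original script): visualize accuracy as a function of k
plot(ks, unlist(rs), type = "b",
     xlab = "Number of SVD dimensions (k)", ylab = "Prediction accuracy (r)")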
# Finally compute cross-validated predictions of Openness for the entire sample
# We build models and compute the predicted values using each of the 10 folds of data
set.seed(seed = 68)
pred_o <- rep(NA, nrow(users))
for (i in 1:10) {
  test <- folds == i
  Msvd <- irlba(M[!test, ], nv = 50)
  v_rot <- unclass(varimax(Msvd$v)$loadings)
  u_rot <- as.data.frame(as.matrix(M %*% v_rot))
  fit_o <- glm(users$ope ~ ., data = u_rot, subset = !test)
  pred_o[test] <- predict(fit_o, u_rot[test, ])
}
cor(users$ope, pred_o)
# Prediction models based on LDA cluster memberships
# The following models took several hours to run
# Uncomment the next 3 lines if you want to run them again
# Mlda <- LDA(M[!test, ], control=list(alpha=1, delta=.1, seed=68), k=50, method="Gibbs")
# temp<-posterior(Mlda, M)
# gamma<-as.data.frame(temp$topics)
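# Once 'gamma' is available it can replace the SVD scores in the models above.
# A sketch following the same pattern (assumes the commented lines were run):
# fit_o <- glm(users$ope ~ ., data = gamma, subset = !test)
# pred_o <- predict(fit_o, gamma[test, ])
# cor(users$ope[test], pred_o)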