-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCode.R
103 lines (86 loc) · 4.57 KB
/
Code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# ---- Setup: packages, S3 access and data load --------------------------------
# Install aws.s3 only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("aws.s3", quietly = TRUE)) {
  install.packages("aws.s3")
}
library(SparkR)
library("aws.s3")

# SECURITY: credentials are intentionally not written in this file. Provide
# AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY through the environment (for
# example ~/.Renviron or the cluster configuration) before running the script.
# Existing environment values are never overwritten here.
aws_credential_vars <- c("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
missing_credentials <-
  aws_credential_vars[!nzchar(Sys.getenv(aws_credential_vars))]
if (length(missing_credentials) > 0) {
  warning(
    "AWS credential variable(s) not set in the environment: ",
    paste(missing_credentials, collapse = ", "),
    call. = FALSE, immediate. = TRUE
  )
}
# Fall back to the region the script was written for only when none is set.
if (!nzchar(Sys.getenv("AWS_DEFAULT_REGION"))) {
  Sys.setenv("AWS_DEFAULT_REGION" = "us-east-1")
}

# Sanity checks on S3 access: query the account's buckets and the project
# bucket before downloading anything.
bucketlist()
get_bucket("yelpdatasetchallengebigdataproject")

# Download the review CSV; the object is converted from raw bytes to text
# and parsed into the yelpData data frame used by the rest of the script.
obj <- get_object("s3://yelpdatasetchallengebigdataproject/yelp_review.csv")
yelpData <- read.csv(text = rawToChar(obj))
# ---- Initial exploration and column selection --------------------------------
# NOTE(review): display() is not defined in this file; presumably it is the
# notebook environment's table viewer - confirm before running elsewhere.
display(yelpData) # preview of the dataset
sum(is.na(yelpData)) # total count of NA cells (0 means nothing is missing)
nrow(yelpData) # number of rows in the dataset
str(yelpData) # structure of the dataset
names(yelpData) # column names

# Keep only the columns the rest of the script refers to by name. They were
# previously picked by position (4, 6, 7, 8, 9), which silently selects the
# wrong data if the CSV layout ever changes.
yelpData <- yelpData[, c("stars", "text", "useful", "funny", "cool")]
head(yelpData)
display(yelpData) # confirm that the intended columns were extracted

# Visual check for relationships between variables. Only numeric columns are
# passed: the free-text review column carries no meaning in a scatter matrix.
numeric_cols <- vapply(yelpData, is.numeric, logical(1))
pairs(yelpData[, numeric_cols, drop = FALSE])

# Add textLength: the number of characters in each review text.
yelpData$textLength <- nchar(as.character(yelpData$text))
display(yelpData) # confirm that the new column is present and populated
# ---- Sentiment scoring -------------------------------------------------------
# pacman is only used through pacman::, so check for it with
# requireNamespace() instead of attaching it via require().
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# Installs the current GitHub versions of lexicon and sentimentr and loads
# them.
pacman::p_load_current_gh("trinker/lexicon", "trinker/sentimentr")

# Make sure the review text is character (it may have been read as factor).
yelpData$text <- as.character(yelpData$text)

# Split into sentences explicitly before scoring; passing raw text makes
# sentiment_by() redo this step internally and emit a warning about it.
review_sentences <- get_sentences(yelpData$text)
yelp <- sentiment_by(review_sentences) # one averaged score per review

# Guard against a silent misalignment between scores and reviews.
stopifnot(nrow(yelp) == nrow(yelpData))
yelpData$textSentimentScore <- yelp$ave_sentiment
# ---- Correlation between the numeric variables -------------------------------
# Spearman correlation over every numeric column (i.e. everything except the
# review text), selected by type rather than by hard-coded positions.
numeric_cols <- vapply(yelpData, is.numeric, logical(1))
correlations <- cor(
  yelpData[, numeric_cols, drop = FALSE],
  use = "pairwise.complete.obs",
  method = "spearman"
)

# Install corrplot only when it is missing.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
# Correlation plot with variables ordered by hierarchical clustering.
corrplot.mixed(correlations, order = "hclust", tl.col = "black")

# attach(yelpData) was removed: every call in this script passes its data
# frame explicitly, and attach() would expose a stale copy of the columns.

# Box plot of sentiment score by star rating.
# Author's observation: higher star ratings go with more positive reviews,
# so sentiment score and stars move in the same direction.
boxplot(
  textSentimentScore ~ stars,
  data = yelpData,
  main = "Sentimental Score Vs Stars",
  xlab = "Stars",
  ylab = "Sentimental Score"
)
# ---- Train/test split and linear regression ----------------------------------
# 75% of the rows go to training, the rest to testing. The seed is fixed so
# the split is reproducible.
set.seed(2017)
# seq_len() is safe for an empty data frame (1:0 would yield c(1, 0)).
# floor() makes explicit the truncation that sample() applies to the size.
# sample() is called as base::sample, presumably because SparkR (attached at
# the top of the script) masks it - confirm.
i <- base::sample(
  seq_len(nrow(yelpData)),
  floor(nrow(yelpData) * 0.75),
  replace = FALSE
)
train_yelpData <- yelpData[i, ]
test_yelpData <- yelpData[-i, ]

# Linear regression of stars on the vote counts, review length and sentiment.
lm1 <- lm(
  stars ~ useful + funny + cool + textLength + textSentimentScore,
  data = train_yelpData
)
summary(lm1) # coefficients and fit statistics

# Predict ratings for the held-out rows and score the model by the
# correlation between predicted and actual stars.
predictions <- predict(lm1, newdata = test_yelpData)
correlationAccuracyLinearRegression <- cor(predictions, test_yelpData$stars)
paste(
  "The correlation accuracy for linear regression model is: ",
  correlationAccuracyLinearRegression * 100, "%"
)
# ---- Linear model diagnostics ------------------------------------------------
# (The dplyr install that used to sit here was dropped: nothing in this
# script loads or calls dplyr.)

# Standard lm diagnostic panels in a 2 x 2 grid, used to judge whether the
# linear model assumptions hold. The previous graphics settings are saved so
# they can be restored afterwards.
old_par <- par(mfrow = c(2, 2))
plot(lm1)

# stars against each of the five predictors, plus one observed-vs-fitted
# panel, in a 2 x 3 grid.
par(mfrow = c(2, 3))
plot(
  stars ~ useful + funny + cool + textLength + textSentimentScore,
  data = yelpData
)
# abline(lm1) cannot be used here: lm1 has several predictors, so abline()
# would draw only the intercept and first slope on the last panel. The fit is
# shown instead as observed vs fitted values with a y = x reference line.
plot(
  fitted(lm1), model.response(model.frame(lm1)),
  xlab = "Fitted stars", ylab = "Observed stars",
  main = "Observed vs fitted"
)
abline(a = 0, b = 1, col = "red")

# Restore the layout so later plots are not squeezed into a grid cell.
par(old_par)
# ---- Decision tree -----------------------------------------------------------
# Install tree only when it is missing.
if (!requireNamespace("tree", quietly = TRUE)) {
  install.packages("tree")
}
library(tree) # library for decision trees

# Use the same predictors as the other models. The previous `stars ~ .` also
# pulled in the raw review text column, which is not a usable predictor.
decisionTreeModel <- tree(
  stars ~ useful + funny + cool + textLength + textSentimentScore,
  data = train_yelpData
)
decisionPred <- predict(decisionTreeModel, test_yelpData) # held-out rows

# Score by the correlation between predicted and actual stars.
correlationAccuracyOfDecisionTree <- cor(decisionPred, test_yelpData$stars)
# The message is kept on one line; it used to contain a stray line break.
paste(
  "The correlation accuracy of decision tree is: ",
  correlationAccuracyOfDecisionTree * 100, "%"
)
# ---- Gradient boosting -------------------------------------------------------
# Install gbm only when it is missing.
if (!requireNamespace("gbm", quietly = TRUE)) {
  install.packages("gbm")
}
library(gbm)

# Fixed seed so the fit is reproducible.
set.seed(2017)
# Boosted regression (gaussian loss) with 5000 trees of interaction depth 4,
# using the same predictors as the linear model.
boost <- gbm(
  stars ~ useful + funny + cool + textLength + textSentimentScore,
  data = train_yelpData,
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4
)
summary(boost)

# Predict the held-out rows using all 5000 trees and score by the correlation
# between predicted and actual stars.
decisi <- predict(boost, newdata = test_yelpData, n.trees = 5000)
correlationAccuracyForBoosting <- cor(decisi, test_yelpData$stars)
# The message is kept on one line; it used to contain a stray line break.
paste(
  "The correlation accuracy for boosting is: ",
  correlationAccuracyForBoosting * 100, "%"
)
# ---- Random forest -----------------------------------------------------------
# Install randomForest only when it is missing.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Fixed seed so the forest is reproducible.
set.seed(2017)
# Random forest on the same predictors as the other models.
randomForestModel <- randomForest(
  stars ~ useful + funny + cool + textLength + textSentimentScore,
  data = train_yelpData
)

# Predict the held-out rows and score by the correlation between predicted
# and actual stars.
randomfor <- predict(randomForestModel, test_yelpData)
correlationAccuracyForRandomForest <- cor(randomfor, test_yelpData$stars)
# The message is kept on one line; it used to contain a stray line break.
paste(
  "The correlation accuracy for Random forest is: ",
  correlationAccuracyForRandomForest * 100, "%"
)