###############################################################################
# IMDb Web Scraping Script
###############################################################################
#
# This script performs two steps:
#
# 1. Scrape IMDb movie ids from the IMDb advanced search pages
#    - The release-date range is 2017-12-31 to 2019-01-01, 100 movies per page
#    - Results are stored in 'all_movies.csv'
#
# 2. Scrape reviews for each movie
#    - For each movie id, scrape the top 25 reviews (sorted by helpfulness)
#    - Results are stored in 'all_reviews.csv'
#
###############################################################################
if (!require("rvest")) install.packages("rvest", quiet=TRUE) ; require("rvest")
if (!require("XML")) install.packages("XML", quiet=TRUE) ; require("XML")
if (!require("stringr")) install.packages("stringr", quiet=TRUE) ; require("stringr")
if (!require("tidyverse")) install.packages("tidyverse", quiet=TRUE) ; require("tidyverse")
setwd("C:/Users/eviriyakovithya/Documents/GitHub/SMAGroup3")
# Helper: extract a film's IMDb rating from a search-result node ######
getReviewScore <- function(x){
  stars_html <- html_nodes(x,'.ratings-imdb-rating')
  stars <- html_text(stars_html)
  stars <- str_extract(stars[1], "\\d\\.\\d")
  return(stars)
}
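# e.g. getReviewScore(film_node) returns the rating as a string ("7.5"),
# or NA if the node has no rating element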
# Initialize dataframe to store the movie ids ######
all_movies <- data.frame(titles=character(),
                         imdb_ratings=character(),  # str_extract() returns character
                         movie_id=character(),
                         stringsAsFactors=FALSE)
# Scrape movie info into the all_movies dataframe ######
# output column names: titles, imdb_ratings, movie_id
# loop through 144 result pages (100 movies each, about 14,400 movies in total)
for (i in 1:144){
  # start = 1, 101, 201, ... selects successive pages of 100 results
  url <- paste0("https://www.imdb.com/search/title?title_type=feature&release_date=2017-12-31,2019-01-01&count=100&start=", (i-1)*100 + 1, "&ref_=adv_nxt")
  url <- URLdecode(url)
  webpage <- read_html(url)
  # get the list of all films on this page (up to 100)
  films_html <- html_nodes(webpage,'.mode-advanced')
  # print progress
  print(i)
  # pause 5 seconds per page to avoid overloading the server
  Sys.sleep(5)
  # loop over all films on this page
  for (k in seq_along(films_html)){
    # extract movie title
    titles_html <- html_nodes(films_html[[k]],'.lister-item-header a')
    titles <- html_text(titles_html)
    # extract movie average rating (stars) using the helper defined above
    imdb_ratings <- getReviewScore(films_html[[k]])
    # extract the IMDb movie id from the first link's href
    href_html <- html_nodes(films_html[[k]],'a') %>% html_attr('href')
    movie_id <- strsplit(href_html[[1]],"/")[[1]][3]
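    # e.g. an href of "/title/tt1234567/?ref_=adv_li_tt" (illustrative id)
    # splits into c("", "title", "tt1234567", ...); element 3 is the id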
    # append to dataframe
    this_movie <- data.frame(titles, imdb_ratings, movie_id, stringsAsFactors=FALSE)
    all_movies <- rbind(all_movies, this_movie)
  }
  # periodically save the file every 1000 entries
  if (nrow(all_movies) %% 1000 == 0){write.csv(all_movies,'all_movies.csv')}
}
# export to csv
write.csv(all_movies,'all_movies.csv')
# read the all_movies.csv file back in
all_movies <- read.csv(file="C:/Users/eviriyakovithya/Documents/GitHub/SMAGroup3/all_movies.csv", header=TRUE, sep=",")
# Scrape movie reviews using the movie id as key, into the all_reviews dataframe ######
# output column names: id, comment_titles, ratings, comments_body
all_reviews <- data.frame(id=character(),
                          comment_titles=character(),
                          ratings=character(),
                          comments_body=character(),
                          stringsAsFactors=FALSE)
# collect reviews for every film into the all_reviews dataframe
all_ids <- as.character(all_movies$movie_id)
# loop through all movie ids
for (id in all_ids){
  # this URL returns reviews sorted by 'helpfulness of the review', 25 reviews per movie
  url <- paste0("https://www.imdb.com/title/",id,"/reviews?ref_=tt_urv")
  url <- URLdecode(url)
  webpage <- read_html(url)
  # pause 5 seconds between requests to avoid overloading the server
  Sys.sleep(5)
  # some movies do not have any reviews; check for "0 Reviews" and skip those
  check_review <- html_nodes(webpage,'.article')[[1]] %>% html_text()
  zero_review <- str_detect(check_review,"0 Reviews")
  if (!zero_review){
    films_html <- html_nodes(webpage,'.lister-item-content')
    for (k in seq_along(films_html)){
      # extract review title
      comment_titles_html <- html_nodes(films_html[[k]],'.title')
      comment_titles <- html_text(comment_titles_html)
      # collapse newlines to spaces and trim surrounding whitespace
      comment_titles <- str_trim(gsub("\r?\n|\r", " ", comment_titles))
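      # e.g. " Great\nmovie!\n" -> "Great movie!"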
      # extract comment rating
      ratings_html <- html_nodes(films_html[[k]],'.ipl-ratings-bar')
      ratings <- html_text(ratings_html)
      ratings <- str_extract(ratings, "(\\d)+")
      if (identical(ratings, character(0))){ratings <- "0"} # replace missing rating with "0"
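      # e.g. a ratings bar reading "8/10" yields "8" (the first run of digits)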
      # extract review content
      comments_body_html <- html_nodes(films_html[[k]],'.show-more__control')
      comments_body <- html_text(comments_body_html)
      comments_body <- str_trim(gsub("\r?\n|\r", " ", comments_body))
      # combine into a dataframe and append
      this_review <- data.frame(id, comment_titles, ratings, comments_body, stringsAsFactors=FALSE)
      all_reviews <- rbind(all_reviews, this_review)
    }
    # record the last movie id processed (and its position), in case the script crashes
    write.csv(c(id, match(id, all_ids)),'last_id.csv')
  }
  # periodically save the file (when the review count hits an exact multiple of 500)
  if (nrow(all_reviews) %% 500 == 0){write.csv(all_reviews,'all_reviews.csv')}
}
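# Illustrative sketch of how a crashed run could be resumed using the
# 'last_id.csv' checkpoint written above (assumes the file exists):
# last <- read.csv('last_id.csv', stringsAsFactors=FALSE)
# resume_index <- as.integer(last$x[2])  # position of the last id processed
# all_ids <- all_ids[(resume_index + 1):length(all_ids)]  # then rerun the loop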
# export to csv
write.csv(all_reviews,'./IMDb_Sentiment_Analysis/Data/all_reviews_movie_0-1866.csv')