-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmovielens_quiz.R
74 lines (51 loc) · 1.68 KB
/
movielens_quiz.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# MovieLens quiz
# Q1: How many rows and columns are there in the edx dataset?
# The dimension vector holds the row count first, then the column count.
dim(edx)
# Number of rows: first element of the dimension vector.
dim(edx)[1L]
# Number of columns: second element of the dimension vector.
dim(edx)[2L]
# Q2: How many zeros were given as ratings in the edx dataset?
# Tabulate every rating value first to see which ones occur at all.
edx %>%
  group_by(rating) %>%
  dplyr::count()
# No movies have a rating of 0. Movies are rated from 0.5 to 5.0 in 0.5
# increments. The number of 0s can still be confirmed directly:
edx %>%
  filter(rating == 0) %>%
  summarise(n = n())
# How many ratings were 3s?
edx %>%
  filter(rating == 3) %>%
  summarise(n = n())
# How many different movies are in the dataset?
length(unique(edx$movieId))
# How many different users are in the dataset?
length(unique(edx$userId))
# How many movie ratings are in each of the following genres in the edx dataset?
# Each call counts the rows of edx whose genres string matches the pattern.
# Drama:
sum(str_detect(string = edx$genres, pattern = "Drama"))
# Comedy:
sum(str_detect(string = edx$genres, pattern = "Comedy"))
# Thriller:
sum(str_detect(string = edx$genres, pattern = "Thriller"))
# Romance:
sum(str_detect(string = edx$genres, pattern = "Romance"))
# Recommended code: the same str_detect() counts for all four genres at once.
genres <- c("Drama", "Comedy", "Thriller", "Romance")
# vapply() pins the result to exactly one integer per genre (named by genre),
# whereas sapply() picks its return type from whatever the input produces.
vapply(genres, function(g) {
  sum(str_detect(edx$genres, g))
}, integer(1))
# Which movie has the greatest number of ratings?
# Group on movieId as well as title, matching the recommended code below, so
# that distinct movies that happen to share a title are not merged into one
# count. ungroup() drops the movieId grouping that tally() leaves behind.
edx %>%
  group_by(movieId, title) %>%
  tally() %>%
  ungroup() %>%
  arrange(desc(n))
# Recommended code: the same count written with summarize(). summarize() only
# removes the last grouping level, so ungroup() is needed to return a plain,
# ungrouped table before sorting.
edx %>%
  group_by(movieId, title) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count))
# What are the most common ratings, from most to least?
# sort() is ascending by default; decreasing = TRUE puts the most frequent
# rating first, which is the order the question asks for.
sort(table(edx$rating), decreasing = TRUE)
hist(edx$rating)
# Recommended code: the five most frequent ratings, most common first.
# top_n() is told to rank on count explicitly instead of silently falling back
# to the last column of the table.
edx %>%
  group_by(rating) %>%
  summarize(count = n()) %>%
  top_n(5, count) %>%
  arrange(desc(count))
# True or False: in general, half-star ratings are less common than full-star
# ratings. Count the ratings at each rating value and draw the counts as a line.
edx %>%
  group_by(rating) %>%
  summarise(count = n()) %>%
  ggplot(mapping = aes(rating, count)) +
  geom_line()