-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_accumulation.R
467 lines (384 loc) · 19.2 KB
/
1_accumulation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
library(tidyverse)
library(lubridate)
library(ggsci)
library(reshape2)
library(RColorBrewer)
# We use here the tidy version (saved as RData object to save space)
# While cleaning the dataset we excluded stories with less than 10 words,
# and stories published in AO3 in the year 2020
load("data/dataset_HP_AO3.RData")
# check languages - 90% is English
# harry %>%
# group_by(language) %>%
# summarise(n = n_distinct(index)) %>%
# mutate(prop = prop.table(n)) %>%
# arrange(desc(n))
# we exclude stories that are not in English, obtaining a total of N=196726 stories
harry <- harry %>%
filter(language == "English")
# Backdated stories --------------------------------------------------------------
# column "year_fix" will assign to backdated stories the backdated year, while not
# backdated ones will keep the year assigned based on "link". This is useful for
# queries on themes within the HP fandom beyond AO3, since stories were published
# outside AO3 before being imported and having articulated tags added to them.
# New reliable date range starting from 2000
harry <- harry %>%
mutate(year_fix = year(date)) %>%
mutate(backdated = ifelse(date_year >= (year_fix + 1), TRUE, FALSE)) %>%
mutate(year_fix = ifelse(backdated==FALSE, date_year, year_fix))
# Aggregate synonyms ------------------------------------------------------
# remember this is a quite conservative estimate of accumulation, if we do not
# consider synonyms it would be higher.
# load the synonyms files
characters_syn <- read_csv("data/characters_synonyms.csv")
relationships_syn <- read_csv("data/relationships_synonyms.csv")
freeforms_syn <- read_csv("data/freeforms_synonyms.csv")
common_freeform_syn <- read_csv("data/common_freeforms_synonyms.csv")
merge_freeforms_syn <- bind_rows(freeforms_syn, common_freeform_syn)
# transform the synonyms files in a tidy format
char_dictionary <- characters_syn %>%
pivot_longer(-canonical_tag, names_to = "n_syn", values_to = "synonym", values_drop_na = TRUE)
ship_dictionary <- relationships_syn %>%
pivot_longer(-canonical_tag, names_to = "n_syn", values_to = "synonym", values_drop_na = TRUE)
free_dictionary <- merge_freeforms_syn %>%
pivot_longer(-canonical_tag, names_to = "n_syn", values_to = "synonym", values_drop_na = TRUE)
# aggregate synonyms
harry_char <- harry %>%
filter(tag_type=="characters") %>%
left_join(char_dictionary, by = c("tag" ="synonym")) %>%
mutate(canonical_tag = ifelse(is.na(canonical_tag), tag, canonical_tag))
harry_ships <- harry %>%
filter(tag_type=="relationships") %>%
left_join(ship_dictionary, by = c("tag" ="synonym")) %>%
mutate(canonical_tag = ifelse(is.na(canonical_tag), tag, canonical_tag))
harry_free <- harry %>%
filter(tag_type=="freeforms") %>%
left_join(free_dictionary, by = c("tag" ="synonym")) %>%
mutate(canonical_tag = ifelse(is.na(canonical_tag), tag, canonical_tag))
# build dataset including aggregated synonyms (includes only character, relationship, and freeform tags)
harry_aggr <- bind_rows(harry_char, harry_ships, harry_free)
# Optional: count the number of synonyms that have been aggregated and identify them
# harry_aggr %>%
# group_by(tag_type) %>%
# summarise(n = length(n_syn[!is.na(n_syn)]))
#
# count_syn_tot <- harry_aggr %>% # group by canonical tag
# count(canonical_tag, tag_type, n_syn, sort = TRUE) %>%
# drop_na() %>%
# group_by(canonical_tag) %>%
# summarise(tot = sum(n))
#
# count_syn <- harry_aggr %>% # show single synonyms
# count(canonical_tag, tag_type, n_syn, sort = TRUE) %>%
# drop_na()
#
# freeforms_syn[132,50]
# Accumulation analysis ---------------------------------------------------
# start from 2002 because 2000 and 2001 have only 21 and 55 stories
harry_aggr <- harry_aggr %>%
filter(between(year_fix, 2002, 2019))
# total number of stories by year:
harry_aggr %>%
group_by(year_fix) %>%
summarise(count = n_distinct(index)) %>%
ggplot(aes(x = year_fix, y = count)) +
geom_col() +
scale_x_continuous(breaks= c(2002,2006,2010,2014,2018)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
labs(x = "Year", y = "Number of stories") +
theme_bw() +
ggsave("analysis/plots/1_1_number_of_stories.pdf", width = 5, height = 5)
# total number of different tags by year
harry_aggr %>%
group_by(year_fix, tag_type) %>%
summarise(count = n_distinct(canonical_tag)) %>%
ggplot(aes(x = year_fix, y = count)) +
geom_point() +
geom_line() +
scale_color_jco() +
scale_x_continuous(breaks= c(2002,2006,2010,2014,2018)) +
facet_wrap(vars(tag_type), scales = "free") +
labs(x = "Year", y = "Count") +
theme_bw() +
ggsave("analysis/plots/1_2_accumulation_totals.pdf", width = 12, height = 5)
## SPECIFIC TAGS ANALYSIS
# For each character, or group of characters, we can show that the number of
# freeforms and relationships tags increase, ans there is more diversity in stories.
# Here an example for Harry and freeforms tags with data extracted from the "linked-potter"
# knowledge base.
# Harry:
# tagOf_harry <- read_csv("data/tagOf_harry.csv", quote="")
#
# tagOf_harry <- tagOf_harry %>%
# mutate(tagOf = "Harry Potter") %>%
# filter(tagOf_first_row != "Harry Potter") %>%
# rename(isTaggedAs = tagOf_first_row)
#
# harry_aggr_tagOf <- harry_aggr %>%
# filter(tag %in% tagOf_harry$isTaggedAs)
#
# freeform_groups <- read_csv("data/freeform_groups_harry.csv")
#
# syn_count_tagOf <- harry_aggr_tagOf %>%
# left_join(freeform_groups, by = "canonical_tag") %>%
# group_by(year_fix) %>%
# mutate(n_stories_y = n_distinct(index)) %>%
# count(year_fix, tag, tag_type, canonical_tag, freeform_group, n_stories_y, n_syn) %>%
# group_by(year_fix, canonical_tag, freeform_group, n_stories_y) %>%
# summarise(tot = sum(n))
#
# syn_count_tagOf %>%
# ggplot(aes(x = year_fix, y = tot, fill = canonical_tag)) +
# geom_bar(position="fill", stat="identity") +
# scale_fill_igv() +
# labs(x = "Year", y = "Proportion of stories", title = "Harry Potter") +
# scale_x_continuous(breaks= c(2002,2006,2010,2014,2018)) +
# theme_bw()
# With the same logic we can do it for all other characters. Most of this material can go in
# supplementary material. One to show in the main manuscript is the relationships of the three
# characters, Harry, Hermione, and Draco.
# NOTE: Federico will do an interactive version of this.
# HARRY:
ships_harry <- read_csv("data/ships_harry.csv", quote="")
ships_harry <- ships_harry %>%
mutate(hasParticipant = "Harry Potter") %>%
filter(hasParticipant_first_row != "Harry Potter") %>%
rename(participatesTo = hasParticipant_first_row)
harry_aggr_ships <- harry_aggr %>%
filter(tag %in% ships_harry$participatesTo)
# count stories per year
syn_count_harry_ships <- harry_aggr_ships %>%
group_by(year_fix) %>%
mutate(n_stories_y = n_distinct(index)) %>%
count(year_fix, tag, tag_type, canonical_tag, n_stories_y, n_syn) %>%
group_by(year_fix, canonical_tag, n_stories_y) %>%
summarise(tot = sum(n))%>%
mutate(Character = "Harry", avg_count = tot/n_stories_y) %>%
filter(tot >= 10)
# DRACO
ships_draco <- read_csv("data/ships_draco.csv", quote="")
ships_draco <- ships_draco %>%
mutate(hasParticipant = "Draco Malfoy") %>%
filter(hasParticipant_first_row != "Draco Malfoy") %>%
rename(participatesTo = hasParticipant_first_row)
harry_aggr_draco_ships <- harry_aggr %>%
filter(tag %in% ships_draco$participatesTo)
# count stories per year
syn_count_draco_ships <- harry_aggr_draco_ships %>%
group_by(year_fix) %>%
mutate(n_stories_y = n_distinct(index)) %>%
count(year_fix, tag, tag_type, canonical_tag, n_stories_y, n_syn) %>%
group_by(year_fix, canonical_tag, n_stories_y) %>%
summarise(tot = sum(n)) %>%
mutate(Character = "Draco", avg_count = tot/n_stories_y) %>%
filter(tot >= 10)
# HERMIONE:
ships_hermione <- read_csv("data/ships_hermione.csv", quote="")
ships_hermione <- ships_hermione %>%
mutate(hasParticipant = "Hermione Granger") %>%
filter(hasParticipant_first_row != "Hermione Granger") %>%
rename(participatesTo = hasParticipant_first_row)
harry_aggr_hermione_ships <- harry_aggr %>%
filter(tag %in% ships_hermione$participatesTo)
# count stories per year
syn_count_hermione_ships <- harry_aggr_hermione_ships %>%
group_by(year_fix) %>%
mutate(n_stories_y = n_distinct(index)) %>%
count(year_fix, tag, tag_type, canonical_tag, n_stories_y, n_syn) %>%
group_by(year_fix, canonical_tag, n_stories_y) %>%
summarise(tot = sum(n)) %>%
mutate(Character = "Hermione", avg_count = tot/n_stories_y) %>%
filter(tot >= 10)
# PUT TOGETHER:
combine_ships <- bind_rows(syn_count_harry_ships, syn_count_hermione_ships, syn_count_draco_ships) %>%
arrange(Character, year_fix, desc(tot))
# generate a colour palette for each character
n_colors <- length(unique(combine_ships$canonical_tag[combine_ships$Character == "Draco"]))
n_colors2 <- length(unique(combine_ships$canonical_tag[combine_ships$Character == "Harry"]))
n_colors3 <- length(unique(combine_ships$canonical_tag[combine_ships$Character == "Hermione"]))
mypal <- colorRampPalette(brewer.pal(9, "RdPu"))(n_colors)[order(sample(1:n_colors, n_colors))] # for legibility, shuffle palette order
mypal2 <- colorRampPalette(brewer.pal(9, "BuGn"))(n_colors2)[order(sample(1:n_colors2, n_colors2))]
mypal3 <- colorRampPalette(brewer.pal(9, "YlOrBr"))(n_colors3)[order(sample(1:n_colors3, n_colors3))]
combine_ships %>%
ggplot(aes(x = year_fix, y = tot, fill = interaction(as.factor(canonical_tag), Character))) +
geom_bar(stat = "identity", position = "fill") +
scale_x_continuous(breaks= c(2002,2006,2010,2014,2018)) +
scale_fill_manual(values = c(mypal, mypal2, mypal3)) +
labs(x = "Year", y = "Proportion of stories") +
theme_bw() +
theme(legend.position = "none") +
ggsave("analysis/plots/1_3_accumulation_relationships_growth.pdf", width = 8, height = 5)
## -----------------------------------------------------------------------
# Calculate number of tags for each story
harry_aggr <- harry_aggr %>%
group_by(index, hits, kudos, bookmarks, year_fix) %>%
summarise(characters_n = n_distinct(canonical_tag[tag_type == "characters"]), # get the number of canonical "character" tags for the story
relationships_n = n_distinct(canonical_tag[tag_type == "relationships"]),
freeforms_n = n_distinct(canonical_tag[tag_type == "freeforms"]))
# Optional: check how many stories don't use tags for each category -- for some years it can reach 30-50%
# harry_aggr %>%
# group_by(year_fix) %>%
# summarise(zero_char = n_distinct(index[characters_n == 0]), prop_char = zero_char / n_distinct(index),
# zero_ship = n_distinct(index[relationships_n == 0]), prop_ship = zero_ship / n_distinct(index),
# zero_free = n_distinct(index[freeforms_n == 0]), prop_free = zero_free / n_distinct(index),)
# Calculate percentiles
harry_aggr <- harry_aggr %>%
group_by(year_fix) %>% # calculate per year to be sure to have stories from all years for each percentile
mutate(hits_p = ntile(hits, 100)) %>%
mutate(kudos_p = ntile(kudos, 100)) %>%
mutate(bookmarks_p = ntile(bookmarks, 100))
# Optional: check the number of NAs per year for each metric
# Number of NAs is 1.7~10% for hits, 1~19% for kudos, 16~54% for bookmarks
# check_na <- harry %>%
# group_by(year_fix) %>%
# distinct(index, .keep_all = TRUE) %>%
# summarise(unique_stories = n(),
# NA_hits = sum(is.na(hits)), prop_NA_hits = NA_hits / unique_stories,
# NA_kudos = sum(is.na(kudos)), prop_NA_kudos = NA_kudos / unique_stories,
# NA_bookmarks = sum(is.na(bookmarks)), prop_NA_bookmarks = NA_bookmarks / unique_stories)
## Count unique tags per percentile interval
# Specify the metric to be used in group_by (e.g. here kudos) and percentiles intervals
# of same length for all clusters (e.g. here 10)
# KUDOS
# low kudos
lower_percentile <- 1
higher_percentile <- 10
harry_low <- harry_aggr %>%
filter(between(kudos_p, lower_percentile, higher_percentile)) %>%
group_by(year_fix, kudos_p) %>%
mutate(n_stories_y_p_char = n_distinct(index[characters_n != 0]), # for every year, for each percentile, calculate number of stories having tags of a certain category
n_stories_y_p_ships = n_distinct(index[relationships_n != 0]),
n_stories_y_p_free = n_distinct(index[freeforms_n != 0])) %>%
group_by(year_fix, kudos_p, n_stories_y_p_char, n_stories_y_p_ships, n_stories_y_p_free) %>%
summarise(char_tags_count = sum(characters_n), ships_tags_count = sum(relationships_n), free_tags_count = sum(freeforms_n),
min(kudos), max(kudos)) %>%
mutate(Characters = char_tags_count/n_stories_y_p_char, Relationships = ships_tags_count/n_stories_y_p_ships,
Freeforms = free_tags_count/n_stories_y_p_free) %>%
pivot_longer(Characters:Freeforms, names_to = "tag_type", values_to = "count")
# mid kudos
lower_percentile <- 46
higher_percentile <- 55
harry_mid <- harry_aggr %>%
filter(between(kudos_p, lower_percentile, higher_percentile)) %>%
group_by(year_fix, kudos_p) %>%
mutate(n_stories_y_p_char = n_distinct(index[characters_n != 0]), # for every year, for each percentile, calculate number of stories having tags of a certain category
n_stories_y_p_ships = n_distinct(index[relationships_n != 0]),
n_stories_y_p_free = n_distinct(index[freeforms_n != 0])) %>%
group_by(year_fix, kudos_p, n_stories_y_p_char, n_stories_y_p_ships, n_stories_y_p_free) %>%
summarise(char_tags_count = sum(characters_n), ships_tags_count = sum(relationships_n), free_tags_count = sum(freeforms_n),
min(kudos), max(kudos)) %>%
mutate(Characters = char_tags_count/n_stories_y_p_char, Relationships = ships_tags_count/n_stories_y_p_ships,
Freeforms = free_tags_count/n_stories_y_p_free) %>%
pivot_longer(Characters:Freeforms, names_to = "tag_type", values_to = "count")
# high kudos
lower_percentile <- 91
higher_percentile <- 100
harry_high <- harry_aggr %>%
filter(between(kudos_p, lower_percentile, higher_percentile)) %>%
group_by(year_fix, kudos_p) %>%
mutate(n_stories_y_p_char = n_distinct(index[characters_n != 0]), # for every year, for each percentile, calculate number of stories having tags of a certain category
n_stories_y_p_ships = n_distinct(index[relationships_n != 0]),
n_stories_y_p_free = n_distinct(index[freeforms_n != 0])) %>%
group_by(year_fix, kudos_p, n_stories_y_p_char, n_stories_y_p_ships, n_stories_y_p_free) %>%
summarise(char_tags_count = sum(characters_n), ships_tags_count = sum(relationships_n), free_tags_count = sum(freeforms_n),
min(kudos), max(kudos)) %>%
mutate(Characters = char_tags_count/n_stories_y_p_char, Relationships = ships_tags_count/n_stories_y_p_ships,
Freeforms = free_tags_count/n_stories_y_p_free) %>%
pivot_longer(Characters:Freeforms, names_to = "tag_type", values_to = "count")
# PROPER STAT MODEL HERE:
# Compare linear and exponential fitting.
cumulative <- bind_rows(harry_high, harry_mid, harry_low) %>%
add_column(Rank = as_factor(rep(c("High kudos (91-100th percentile)",
"Mid kudos (46-55th percentile)",
"Low kudos (1-10th percentile)"), each = 540)))
# replace below High/Mid/Low kudos:
trend_to_fit <- cumulative %>%
filter(Rank == "Low kudos (1-10th percentile)", tag_type == "Freeforms")
y <- trend_to_fit$count
x <- trend_to_fit$year_fix
AIC(lm(y ~ x))
AIC(lm(y ~ x + I(x^2)))
trend_to_fit <- cumulative %>%
filter(Rank == "Low kudos (1-10th percentile)", tag_type == "Characters")
y <- trend_to_fit$count
x <- trend_to_fit$year_fix
AIC(lm(y ~ x))
AIC(lm(y ~ x + I(x^2)))
trend_to_fit <- cumulative %>%
filter(Rank == "Low kudos (1-10th percentile)", tag_type == "Relationships")
y <- trend_to_fit$count
x <- trend_to_fit$year_fix
AIC(lm(y ~ x))
AIC(lm(y ~ x + I(x^2)))
# compare slopes of percentiles
cumulative %>%
ggplot(aes(x = year_fix, y = count, colour = Rank)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x + I(x^2)) + # according to the test, it is better an exponential fit.
scale_color_jco() +
scale_x_continuous(breaks= c(2002,2006,2010,2014,2018)) +
facet_wrap(vars(tag_type), scales = "free") +
labs(x = "Year", y = "Average count") +
theme_bw() +
ggsave("analysis/plots/1_4_accumulation_percentiles.pdf", width = 15, height = 5)
# Linear model on slopes --------------------------------------------------
### Calculate slope on all percentiles
# using linear model
# on KUDOS
# function for linear model
get_slope <- function(my_distr){
# prepare variables for linear model
x <- my_distr$year_fix
y <- my_distr$count
# generate model
test <- lm(y ~ x)
# get slope
return(test$coefficients[2])
}
# moving window of 10 percentiles
characters_slope <- numeric()
relationships_slope <- numeric()
freeforms_slope <- numeric()
percentiles <- 1:91 # this depends on the size of the percentiles we use
for(my_percentile in percentiles){
lower_percentile <- my_percentile
higher_percentile <- my_percentile+9 # this depends on the size of the percentiles we use
harry_perc <- harry_aggr %>%
filter(between(kudos_p, lower_percentile, higher_percentile)) %>%
group_by(year_fix, kudos_p) %>%
mutate(n_stories_y_p_char = n_distinct(index[characters_n != 0]), # for every year, for each percentile, calculate number of stories having tags of a certain category
n_stories_y_p_ships = n_distinct(index[relationships_n != 0]),
n_stories_y_p_free = n_distinct(index[freeforms_n != 0])) %>%
group_by(year_fix, kudos_p, n_stories_y_p_char, n_stories_y_p_ships, n_stories_y_p_free) %>%
summarise(char_tags_count = sum(characters_n), ships_tags_count = sum(relationships_n), free_tags_count = sum(freeforms_n),
min(kudos), max(kudos)) %>%
mutate(Characters = char_tags_count/n_stories_y_p_char, Relationships = ships_tags_count/n_stories_y_p_ships,
Freeforms = free_tags_count/n_stories_y_p_free) %>%
pivot_longer(Characters:Freeforms, names_to = "tag_type", values_to = "count")
# get Kudos distribution per characters
my_distr <- harry_perc[which(harry_perc$tag_type == "Characters"),]
# get slope
characters_slope[my_percentile] <- get_slope(my_distr)
# get Kudos distribution per relationships
my_distr <- harry_perc[which(harry_perc$tag_type == "Relationships"),]
# get slope
relationships_slope[my_percentile] <- get_slope(my_distr)
# get Kudos distribution per freeforms
my_distr <- harry_perc[which(harry_perc$tag_type == "Freeforms"),]
# get slope
freeforms_slope[my_percentile] <- get_slope(my_distr)
print(my_percentile)
}
# plot
my_df <- data.frame(percentiles = c(percentiles), Freeforms = freeforms_slope, Characters = characters_slope, Relationships = relationships_slope)
my_df_melt <- melt(my_df, id.vars = "percentiles")
names(my_df_melt) <- c("percentiles", "Tag", "linear_model_slope")
ggplot(my_df_melt, mapping = aes(x = percentiles, y = linear_model_slope)) +
geom_line(aes(color = Tag), size = 1.5) +
scale_color_aaas() +
scale_x_continuous(breaks = seq(1,91,by=10)) +
ylim(c(0, 0.65)) +
theme_bw() +
labs(y = "Slope of the trend between 2002 and 2019", x = "Lower bound of 10-percentile window") +
ggsave("analysis/plots/1_5_accumulation_all_percentiles.pdf", width = 6, height = 5)
# all of the above can be done using bookmarks or hits as metric