-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path0_extract_data_medline.R
72 lines (65 loc) · 3.2 KB
/
0_extract_data_medline.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# 0_extract_data_medline.R
# extract data from the MEDLINE dump downloaded from PubMed
# just get date field, don't need anything else
# date received comes from PLOS ONE system (e.g., our audit paper is 2017-06-22)
# no idea if it accounts for time zone
# April 2019
library(dplyr)
library(stringr)
library(tidyr)
# Weekday labels in Monday-first order; used further down as the factor levels
# of the `weekday` column.
wdays = c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
# Years of MEDLINE dumps to read.
years = 2007:2018 # ignore first year because too few results and add a lot of noise
# Origin of the weekly windows (`time` is counted in whole weeks from this date).
# Two origins were tried, to check whether moving the window changes the results:
#   as.Date('07/08/2006', '%d/%m/%Y')  # a Monday, so the window starts on Monday
#   as.Date('12/08/2006', '%d/%m/%Y')  # a Saturday, so the window starts on Saturday
# The Monday value used to be assigned and then immediately overwritten, so the
# Saturday origin is the only one that was ever in effect. It is kept as the
# single assignment; swap in the Monday line above to re-run the sensitivity check.
earliest.date = as.Date('12/08/2006', '%d/%m/%Y')
# All possible Xmas dates from Xmas eve to boxing day, 2000 to 2018.
# Built once (it does not depend on the year being read) and with every year
# explicitly paired with each of the three days instead of relying on recycling.
xmas.dates = as.Date(paste0(rep(2000:2018, each = 3), c('-12-24', '-12-25', '-12-26')))

# Read one year's MEDLINE dump and return one row per 'received' date.
#
# year: a year for which 'data/pubmed_medline<year>.txt' exists.
# Returns a data frame with columns:
#   date    - Date the paper was received (submitted)
#   xmas    - TRUE if the date is one of `xmas.dates`
#   weekday - factor with levels `wdays` (Monday first)
# Stops with read.table's error if the file cannot be read.
read.received.dates = function(year) {
  infile = paste0('data/pubmed_medline', year, '.txt')
  # Each line is split on '-', so the field tag lands in V1 and the text after
  # the first '-' in V2.
  # NOTE(review): presumably the lines of interest look like
  # 'PHST- 2017/06/22 00:00 [received]' - confirm against a dump.
  # could also use ,'AD ' for address, but has line breaks
  read.table(infile, sep = '-', header = FALSE, fill = TRUE, quote = '', stringsAsFactors = FALSE) %>%
    filter(V1 %in% c('PHST')) %>% # just dates
    filter(str_detect(pattern = 'received', string = V2)) %>% # just received date
    rename('date' = 'V2') %>%
    mutate(date = str_replace_all(pattern = ' 00:00 \\[received\\]', replacement = '', string = date), # date received (submitted)
           date = as.Date(date, '%Y/%m/%d'),
           xmas = date %in% xmas.dates,
           # weekdays() returns names in the session's locale, which would turn
           # every value into NA against the English `wdays` levels on a
           # non-English machine. POSIXlt's wday is numeric (0 = Sunday), so
           # shift it to 1 = Monday ... 7 = Sunday and index `wdays` instead.
           weekday = wdays[(as.POSIXlt(date)$wday + 6) %% 7 + 1],
           weekday = factor(weekday, levels = wdays)) %>%
    select(-starts_with('V')) # drop the remaining raw columns
}

# Read every year and stack the results in one step, rather than growing `all`
# with rbind() on each iteration (which re-copies everything read so far).
all = bind_rows(lapply(years, read.received.dates))
# Add the calendar year taken from the received date itself, then keep only
# rows whose year is after 1969 (this removes one clear error).
all$year = as.numeric(format(all$date, '%Y'))
all = filter(all, year > 1969)
## now calculate stats on grouped data (cannot do above because years overlap)
# annual stats: number of submissions per year, split by Xmas period (TRUE/FALSE)
results.xmas = all %>%
  group_by(year, xmas) %>%
  summarise(count = n())
# annual stats: number of submissions per year, split by day of the week
results = all %>%
  group_by(year, weekday) %>%
  summarise(count = n())
# weekly stats: number of submissions per seven-day window, split by day of the week.
# `time` is the number of whole weeks since earliest.date (better than using %U).
weekly = all %>%
  mutate(time = floor(as.numeric(difftime(date, earliest.date, units = 'days')) / 7)) %>%
  group_by(time, weekday) %>%
  summarise(count = n())
## calculate annual percentages
# Add, for every row of a per-year count table, the yearly total `N` (sum of
# `count` within the year) and `percent`, the row's share of that total.
# counts: a table with columns `year` and `count`.
# Returns the same rows with `N` and `percent` added, ungrouped.
add.annual.percent = function(counts) {
  counts %>%
    group_by(year) %>%
    mutate(N = sum(count),
           percent = 100 * (count / N)) %>%
    ungroup()
}
# Xmas period versus rest of the year
results.xmas = add.annual.percent(results.xmas)
# day of the week
results = add.annual.percent(results)
## weekly counts of weekday ('No') and weekend ('Yes') submissions for time-trend analysis
# Result has one row per `time` (week) with the two counts in columns `No` and `Yes`.
for.trend = weekly %>%
  mutate(weekend = ifelse(weekday %in% c('Saturday','Sunday'), 'Yes', 'No'), # flag weekend days
         weekend = factor(weekend, levels = c('No','Yes'))) %>%
  group_by(time, weekend) %>%
  summarise(N = sum(count)) %>% # total weekday and weekend submissions per week
  ungroup() %>%
  spread(key = weekend, value = N) %>% # long to wide: one column per weekend level
  mutate(No = ifelse(is.na(No), 0, No), # a week with no rows for a level means zero submissions
         Yes = ifelse(is.na(Yes), 0, Yes))
# Write the analysis-ready objects to a single .RData file for the later scripts
objects.to.save = c('weekly', 'results', 'results.xmas', 'years', 'for.trend', 'earliest.date')
save(list = objects.to.save, file = 'data/PLOSAnalysisReady.RData')