This repository has been archived by the owner on Dec 26, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_covid_mobility.R
102 lines (87 loc) · 2.89 KB
/
parse_covid_mobility.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Download all of Google's COVID Mobility Reports and Parse the Data
# See this Page : https://www.google.com/covid19/mobility/
# Set a Folder Here and Source the Script
# Directory that receives the downloaded PDFs. Keep the trailing slash: file
# paths below are built by plain concatenation with paste0().
folder <- '~/Desktop/Covid_Mobility/'
# Standardize State Names: the report file names use '_' where state.name
# has whitespace.
states <- gsub('\\s', '_', state.name)
# Make sure the target folder exists before downloading into it; an already
# existing folder is left alone (showWarnings = FALSE silences that case).
dir.create(folder, recursive = TRUE, showWarnings = FALSE)
# Download all of the Reports (one PDF per state, report dated 2020-03-29)
for (state in states) {
  url <- paste0('https://www.gstatic.com/covid19/mobility/2020-03-29_US_', state, '_Mobility_Report_en.pdf')
  destfile <- paste0(folder, state, '_Mobility_Report_en.pdf')
  # mode = 'wb': PDFs are binary files; without it Windows performs a text
  # transfer that rewrites line endings and corrupts the document.
  download.file(url, destfile, mode = 'wb')
}
# Parse the Text from the PDF
library(pdftools)
library(stringr)
# Parse Page
# Parse the text of one county page of a Mobility Report.
#
# pg: text of a single PDF page (one element of the pdf_text() result).
#
# Returns a data.frame with character columns County, Retail_Recreation,
# Grocery_Pharma, Parks, Transit, Workplace and Residential. It has two rows
# when element 11 of the token vector is a non-empty string (a second county
# on the page) and one row otherwise. Values are the raw text pieces; fields
# that cannot be located come back as NA.
parse_pg <- function(pg) {
  # Strip all whitespace so the chart labels below become fixed tokens.
  a <- str_remove_all(pg, '\\s')
  # Cut the text at the repeated date-axis labels and at the two rows of
  # chart titles (each title may be followed by any number of '*').
  a <- unlist(str_split(a, 'SunFeb16SunMar8SunMar29SunFeb16SunMar8SunMar29SunFeb16SunMar8SunMar29'))
  a <- unlist(str_split(a, 'Retail&recreation\\**Grocery&pharmacy\\**Parks\\**'))
  a <- unlist(str_split(a, 'Transitstations\\**Workplace\\**Residential\\**'))
  # Drop the y-axis tick labels (first occurrence in each piece).
  a <- str_remove(a, '\\+80\\%\\+80\\%\\+80\\%\\+40\\%\\+40\\%\\+40\\%BaselineBaselineBaseline-40\\%-40\\%-40\\%-80\\%-80\\%-80\\%')
  a <- unlist(str_split(a, 'Notenoughdataforthisdate'))
  a <- unlist(str_split(a, 'comparedtobaseline'))

  # A second county, when present, repeats the same layout ten positions
  # further on, starting at element 11. a[11] is NA when the page produced
  # fewer than 11 pieces; comparing NA with '' inside if() would abort with
  # "missing value where TRUE/FALSE needed", so test for NA first and treat
  # such a page as holding a single county.
  has_second <- !is.na(a[11]) && a[11] != ''
  offsets <- if (has_second) c(0, 10) else 0

  # Positions 5, 6 and 10 of each county block are not used.
  data.frame(stringsAsFactors = FALSE,
    County = a[1 + offsets],
    Retail_Recreation = a[2 + offsets],
    Grocery_Pharma = a[3 + offsets],
    Parks = a[4 + offsets],
    Transit = a[7 + offsets],
    Workplace = a[8 + offsets],
    Residential = a[9 + offsets]
  )
}
# Parse Document
# Parse every county page of one state report.
#
# file: path to a downloaded Mobility Report PDF.
#
# Returns the parse_pg() results for page 3 through the second-to-last page,
# row-bound into one data.frame. A document with fewer than four pages has no
# such pages and yields a zero-row data.frame with the same columns.
parse_pdf <- function(file) {
  txt <- pdf_text(file)
  # Pages 3 .. (n - 1). Built with seq_len() rather than 3:(n - 1), because
  # the colon form counts backwards when n < 4 and would select pages that
  # are out of range or outside the intended span.
  n_county_pages <- max(length(txt) - 3L, 0L)
  pages <- txt[seq_len(n_county_pages) + 2L]
  if (length(pages) == 0) {
    return(data.frame(stringsAsFactors = FALSE,
      County = character(0),
      Retail_Recreation = character(0),
      Grocery_Pharma = character(0),
      Parks = character(0),
      Transit = character(0),
      Workplace = character(0),
      Residential = character(0)
    ))
  }
  do.call(rbind, lapply(pages, parse_pg))
}
# Parse All of the States in The Downloaded Directory
# Adding State as a Variable to the Data.frames
library(dplyr)
library(magrittr)
# Run parse_pdf() on each state's downloaded report and tag every resulting
# row with the (underscore-separated) state name it came from.
all_data <- lapply(states, function(st) {
  report <- paste0(folder, st, '_Mobility_Report_en.pdf')
  mutate(parse_pdf(report), State = st)
})
# Stack the per-state data frames into one table.
us_data <- bind_rows(all_data)
# Clean the Final Dataframe a bit for Mapping
# Convert percent strings to proportions.
#
# var: character vector; the first '%' in each element is removed, the
#      remainder is parsed with as.numeric() and divided by 100.
to_num <- function(var) {
  without_pct <- str_remove(var, '\\%')
  as.numeric(without_pct) / 100
}
us_data <- us_data %>%
  mutate(
    # Page parsing strips all whitespace, so county names arrive run
    # together; put a space back at every lowercase-to-uppercase boundary.
    County = gsub("([a-z])([A-Z])", "\\1 \\2", County),
    # Replace every underscore, not just the first, so the conversion stays
    # correct for names with more than two words.
    State = str_replace_all(State, '_', ' '),
    Retail_Recreation = to_num(Retail_Recreation),
    Grocery_Pharma = to_num(Grocery_Pharma),
    Parks = to_num(Parks),
    Transit = to_num(Transit),
    Workplace = to_num(Workplace),
    Residential = to_num(Residential)
  )
# save() does not create missing directories; make sure 'data' exists
# (relative to the working directory) so the parsed result is not lost.
dir.create('data', showWarnings = FALSE)
save(us_data, file = 'data/google_mobility.rda')