###"Internet speeds in GA"
###Introduction
#This is the code that goes along with the 6/12/2019 AJC story
#"Internet far slower in Georgia than reported."
### Library
#The analysis requires a couple of out-of-R steps to work,
#and the to_dataflow_string function below does the heavy lifting for those.
#That function takes an sf MultiPolygon geometry and turns it into the kind of string that
#Google BigQuery will accept as a GEOGRAPHY. There are also two SQL queries. The big_Q query is heavily
#cribbed from Georgia Bullen of Simply Secure.
library(bigrquery)
library(tidyverse)
library(viridis)
library(lubridate)
library(sf)
library(tidycensus)
library(ggmap)
library(geojson)
library(geojsonsf)
project <- "NAME_OF_PROJECT"
# Turn an n x 2 coordinate matrix into a vector of "[ x, y ]" strings
stringify = function(x){
  z <- apply(x, 1, paste, collapse = ", ")
  w <- str_c("[ ", z, " ]")
  return(w)
}
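# A quick illustration of stringify (hypothetical coordinates, not from the analysis):
# stringify(matrix(c(-84.39, 33.75, -84.40, 33.76), ncol = 2, byrow = TRUE))
# returns c("[ -84.39, 33.75 ]", "[ -84.4, 33.76 ]")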
# Convert an sf object with GEOID + MultiPolygon geometry into a two-column CSV
# (district, Values), where Values is a GeoJSON MultiPolygon string that
# BigQuery can load into a GEOGRAPHY column.
to_dataflow_string = function(dataflow_sf, file_name){
  require(sf)
  require(jsonlite)
  library(tools)
  ln_list <- nrow(dataflow_sf)
  dataflow_list <- vector(mode = "list", length = ln_list)
  for(i in 1:ln_list){
    num_polys <- length(dataflow_sf[i,]$geometry[[1]])
    if(num_polys == 1){
      # Some geometries nest one level deeper than others; fall back if needed
      val <- tryCatch(stringify(dataflow_sf[i,]$geometry[[1]][[1]][[1]]),
                      error = function(e){return("e")})
      if(length(val) == 1){
        val <- stringify(dataflow_sf[i,]$geometry[[1]][[1]])
      }
      dataflow_list[[i]] <- list(
        "district" = dataflow_sf[i,]$GEOID,
        "Values" = val %>% str_c(collapse = ", ") %>%
          c("{\"type\":\"MultiPolygon\",\"coordinates\":[ [ [ ", ., " ] ] ]}") %>%
          str_c(collapse = "")
      )
    }else{
      # MultiPolygons with several parts get one row per part
      for(j in 1:num_polys){
        val <- tryCatch(stringify(dataflow_sf[i,]$geometry[[1]][[j]][[1]]),
                        error = function(e){return("e")})
        if(length(val) == 1){
          val <- stringify(dataflow_sf[i,]$geometry[[1]][[j]])
        }
        curr_list <- list(
          "district" = dataflow_sf[i,]$GEOID,
          "Values" = val %>% str_c(collapse = ", ") %>%
            c("{\"type\":\"MultiPolygon\",\"coordinates\":[ [ [ ", ., " ] ] ]}") %>%
            str_c(collapse = "")
        )
        dataflow_list <- append(dataflow_list, list(curr_list), i - 1)
      }
    }
  }
  # Keep only the complete two-element entries and write them out headerless
  lengths <- lapply(dataflow_list, length)
  dataflow_list_short <- dataflow_list[which(lengths == 2)] %>%
    do.call("rbind", .) %>% data.frame %>% mutate_all(unlist)
  write_csv(dataflow_list_short, file_name, col_names = FALSE)
}
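### A minimal sketch of what to_dataflow_string produces, using a hypothetical
### one-square "district" (not part of the original analysis)
toy_sf <- st_sf(
  GEOID = "13001",
  geometry = st_sfc(st_multipolygon(list(list(matrix(
    c(0, 0, 1, 0, 1, 1, 0, 1, 0, 0), ncol = 2, byrow = TRUE
  )))))
)
to_dataflow_string(toy_sf, "toy_geo.csv")
# Each row of toy_geo.csv is a GEOID followed by a GeoJSON MultiPolygon string,
# which BigQuery parses into the GEOGRAPHY column named "Values" below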
point_graph_BQ <- "#standardSQL
SELECT
APPROX_QUANTILES(ml_download_Mbps, 101) [SAFE_ORDINAL(51)] AS med_dl_Mbps,
APPROX_QUANTILES(ml_download_Mbps, 101) [SAFE_ORDINAL(26)] AS qr_1_dl_Mbps,
APPROX_QUANTILES(ml_download_Mbps, 101) [SAFE_ORDINAL(76)] AS qr_3_dl_Mbps,
SUM(ml_dl_count_tests) as county_tests,
COUNT(DISTINCT ip) AS ip_count,
partition_date,
long,
lat
FROM
(
SELECT
COUNT(test_id) AS ml_dl_count_tests,
connection_spec.client_ip as ip,
APPROX_QUANTILES(
8 * SAFE_DIVIDE(
web100_log_entry.snap.HCThruOctetsAcked,
(
web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd
)
),
101 IGNORE NULLS
) [SAFE_ORDINAL(51)] AS ml_download_Mbps,
partition_date,
connection_spec.client_geolocation.longitude AS long,
connection_spec.client_geolocation.latitude AS lat
FROM
`measurement-lab.release.ndt_downloads` tests,
`NAME_OF_PROJECT.NAME_OF_BUCKET.NAME_OF_DATASET` counties
WHERE
connection_spec.server_geolocation.country_name = 'United States'
AND (connection_spec.server_geolocation.region = 'Georgia' OR
connection_spec.server_geolocation.region = 'GA')
AND ST_WITHIN(
ST_GeogPoint(
connection_spec.client_geolocation.longitude,
connection_spec.client_geolocation.latitude
),
counties.Values
)
GROUP BY
connection_spec.client_ip,
long,
lat,
partition_date
)
GROUP BY
long,
lat,
partition_date
"
big_Q <- "#standardSQL
WITH counties AS (
SELECT
county_name AS name,
county_geom AS geom,
geo_id as geo_id
FROM
`NAME_OF_PROJECT.NAME_OF_BUCKET.NAME_OF_DATASET`
WHERE
state_fips_code = 'STATE_CODE'
),
mlab_dl AS (
SELECT
APPROX_QUANTILES(ml_download_Mbps, 101) [SAFE_ORDINAL(51)] AS med_dl_Mbps,
SUM(ml_dl_count_tests) as county_tests,
partition_date,
FORMAT(
'%s_%d',
['jun', 'dec'] [ORDINAL(CAST(CEIL(EXTRACT(MONTH FROM partition_date) / 6) AS INT64))],
EXTRACT(
YEAR
FROM
partition_date
)
) as time_period,
geo_id
FROM
(
SELECT
COUNT(test_id) AS ml_dl_count_tests,
APPROX_QUANTILES(
8 * SAFE_DIVIDE(
web100_log_entry.snap.HCThruOctetsAcked,
(
web100_log_entry.snap.SndLimTimeRwin + web100_log_entry.snap.SndLimTimeCwnd + web100_log_entry.snap.SndLimTimeSnd
)
),
101 IGNORE NULLS
) [SAFE_ORDINAL(51)] AS ml_download_Mbps,
partition_date,
geo_id
FROM
`measurement-lab.release.ndt_downloads` tests,
`NAME_OF_PROJECT.NAME_OF_BUCKET.NAME_OF_DATASET` counties
WHERE
connection_spec.server_geolocation.country_name = 'United States'
AND (connection_spec.server_geolocation.region = 'STATE_NAME' OR
connection_spec.server_geolocation.region = 'STATE_ABBREVIATION')
AND ST_WITHIN(
ST_GeogPoint(
connection_spec.client_geolocation.longitude,
connection_spec.client_geolocation.latitude
),
counties.Values
)
GROUP BY
connection_spec.client_ip,
geo_id,
partition_date
)
GROUP BY
geo_id,
partition_date
),
mlab_ul AS (
SELECT
APPROX_QUANTILES(ml_upload_Mbps, 101) [SAFE_ORDINAL(51)] AS med_ul_Mbps,
partition_date,
geo_id
FROM
(
SELECT
COUNT(test_id) AS ml_ul_count_tests,
APPROX_QUANTILES(
8 * SAFE_DIVIDE(
web100_log_entry.snap.HCThruOctetsReceived,
web100_log_entry.snap.Duration
),
101 IGNORE NULLS
) [SAFE_ORDINAL(51)] AS ml_upload_Mbps,
partition_date,
geo_id
FROM
`measurement-lab.release.ndt_uploads` tests,
`NAME_OF_PROJECT.NAME_OF_BUCKET.NAME_OF_DATASET` counties
WHERE
connection_spec.server_geolocation.country_name = 'United States'
AND (connection_spec.server_geolocation.region = 'STATE_NAME' OR
connection_spec.server_geolocation.region = 'STATE_ABBREVIATION')
AND ST_WITHIN(
ST_GeogPoint(
connection_spec.client_geolocation.longitude,
connection_spec.client_geolocation.latitude
),
counties.Values
)
GROUP BY
connection_spec.client_ip,
geo_id,
partition_date
)
GROUP BY
geo_id,
partition_date
),
fccdata AS (
SELECT *, 'dec_2014' AS time_period FROM `NAME_OF_PROJECT.NAME_OF_FCC_BUCKET.NAME_OF_DATASET_DEC_2014`
UNION ALL SELECT *, 'dec_2015' AS time_period FROM `NAME_OF_PROJECT.NAME_OF_FCC_BUCKET.NAME_OF_DATASET_DEC_2015`
UNION ALL SELECT *, 'dec_2016' AS time_period FROM `NAME_OF_PROJECT.NAME_OF_FCC_BUCKET.NAME_OF_DATASET_DEC_2016`
UNION ALL SELECT *, 'jun_2015' AS time_period FROM `NAME_OF_PROJECT.NAME_OF_FCC_BUCKET.NAME_OF_DATASET_JUN_2015`
UNION ALL SELECT *, 'jun_2016' AS time_period FROM `NAME_OF_PROJECT.NAME_OF_FCC_BUCKET.NAME_OF_DATASET_JUN_2016`
UNION ALL SELECT *, 'jun_2017' AS time_period FROM `NAME_OF_PROJECT.NAME_OF_FCC_BUCKET.NAME_OF_DATASET_JUN_2017`
UNION ALL SELECT *, 'dec_2017' AS time_period FROM `NAME_OF_PROJECT.NAME_OF_FCC_BUCKET.NAME_OF_DATASET_DEC_2017`
),
fcc_providerMedians AS (
SELECT
time_period,
APPROX_QUANTILES(CAST(Max_Advertised_Downstream_Speed__mbps_ AS FLOAT64), 101)[SAFE_ORDINAL(51)] AS advertised_down,
APPROX_QUANTILES(CAST(Max_Advertised_Upstream_Speed__mbps_ AS FLOAT64), 101)[SAFE_ORDINAL(51)] AS advertised_up,
MAX(CAST(Max_Advertised_Downstream_Speed__mbps_ AS FLOAT64)) AS advertised_down_max,
SUBSTR(Census_Block_FIPS_Code, 0, 5) as geo_id
FROM fccdata
WHERE Consumer = '1'
GROUP BY geo_id, time_period, FRN
),
fcc_groups AS (
SELECT
fccdata.time_period,
COUNT(DISTINCT FRN) AS reg_provider_count,
APPROX_QUANTILES(fcc_providerMedians.advertised_down, 101)[SAFE_ORDINAL(51)] AS advertised_down,
APPROX_QUANTILES(fcc_providerMedians.advertised_up, 101)[SAFE_ORDINAL(51)] AS advertised_up,
MAX(CAST(Max_Advertised_Downstream_Speed__mbps_ AS FLOAT64)) AS advertised_down_max,
SUBSTR(Census_Block_FIPS_Code, 0, 5) as geo_id
FROM fccdata JOIN fcc_providerMedians
ON SUBSTR(fccdata.Census_Block_FIPS_Code, 0, 5) = fcc_providerMedians.geo_id AND fccdata.time_period = fcc_providerMedians.time_period
WHERE Consumer = '1'
GROUP BY geo_id, time_period
),
fcc_timeslices AS (
SELECT
time_period,
reg_provider_count,
advertised_down,
advertised_up,
advertised_down_max,
geo_id
FROM fcc_groups
)
SELECT
partition_date,
time_period,
geo_id,
med_dl_Mbps,
med_ul_Mbps,
county_tests,
reg_provider_count,
advertised_down,
advertised_up,
advertised_down_max
FROM
(mlab_dl JOIN mlab_ul USING (geo_id, partition_date) JOIN fcc_timeslices USING (geo_id, time_period))"
### R code
#This section can be run right out of the box once all dependencies have been installed.
#It downloads shapefiles with tigris and gets them into the right format for BigQuery.
#The shapefile download takes a minute or two because it covers the entire country.
#Feel free to swap in the tigris or tidycensus call for whatever shapefiles you're interested in.
#You can also set the state in the SQL queries above. After running this section,
#you'll have to leave R and work with Google Cloud.
###This loads the state shapefiles via tigris
D <- tigris::states() %>% st_as_sf
###This CSV is what you'll upload to the Google bucket for BigQuery
df_counties <- D %>% select(GEOID, geometry)
to_dataflow_string(df_counties, "BQ_geo.csv")
###A brief aside in Google BigQuery to get bigrquery to work
#The meat of the analysis is done in Google BigQuery because M-Lab provides a free way to work with their
#large internet speed database on BigQuery. Storing the FCC files and the county files in a Google bucket
#will cost a little bit of money, but it should be less than a dollar for this story.
#Follow the instructions at https://www.measurementlab.net/data/docs/bq/quickstart/ to get started with
#BigQuery and to get connected with the M-Lab datasets. The general steps are installing
#the Google Cloud SDK and signing up on the M-Lab message board to get access to their tables,
#so that the BigQuery charges that would otherwise fall to you are covered by them.
#Follow the instructions at https://cloud.google.com/storage/docs/creating-buckets to create a Google bucket
#for your data. Upload the file "BQ_geo.csv" to the bucket. This gives BigQuery the shapefiles
#it will perform spatial joins on.
#In BigQuery (at https://console.cloud.google.com/bigquery), create a dataset and name it whatever you like.
#I named mine GA_internet. The next step is to import BQ_geo.csv into your dataset in BigQuery as a table by
#following the instructions here: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage.
#In the "Create table" section, add two fields to the Schema. This lets BigQuery know what form the data
#you're uploading is in. Name the first "district" and the second "Values." The second name is important
#because it matches the SQL queries you'll run later. Set the Type for the first to "STRING" and the second
#to "GEOGRAPHY." The table should appear under your dataset. Finally, update the query point_graph_BQ above
#to reflect the names of the datasets you uploaded and the geographic region you're interested in.
#For generality, I'm showing a map of the entire U.S. below. Update project below to your project name.
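#If you'd rather script this step than click through the console, a bq command
#along these lines should work (dataset, table, and bucket names here follow the
#placeholders used in the queries above):
# bq load --source_format=CSV \
#   NAME_OF_BUCKET.NAME_OF_DATASET gs://YOUR_BUCKET/BQ_geo.csv \
#   district:STRING,Values:GEOGRAPHY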
#Having done all that, the following section should run properly. This section uses the point_graph_BQ
#query to calculate the median-of-medians internet speed for every location IP addresses are mapped
#to. The first median of the median-of-medians calculation is done in BigQuery because it's much faster,
#and for large areas that often have hundreds of millions of tests, doing it locally just isn't possible.
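###A toy sketch of the median-of-medians idea (made-up numbers, not M-Lab data):
###a single IP running many tests can't dominate a location's summary speed
toy_tests <- tibble(
  ip = c(rep("a", 100), "b", "c"), # IP "a" ran 100 fast tests
  mbps = c(rep(90, 100), 2, 3)
)
toy_tests %>% group_by(ip) %>%
  summarise(per_ip_median = median(mbps)) %>%
  summarise(med_of_med = median(per_ip_median))
# med_of_med comes out to 3 Mbps, where a pooled median over all 102 tests
# would be 90 Mbps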
point_data <- query_exec(point_graph_BQ, project = project, use_legacy_sql = FALSE, max_pages = Inf)
# Second stage of the median-of-medians: one summary value per lat/long
point_data_sf <- point_data %>%
  group_by(long, lat) %>%
  summarise(med = median(med_dl_Mbps), counts = sum(county_tests), ips = sum(ip_count)) %>%
  st_as_sf(coords = c("long", "lat"))
cuts <- c(0, .2, 4, 10, 25, 1000)
colors_v <- viridis(7, option = "magma")
point_data_sf_GA <- point_data_sf %>% na.omit %>%
  mutate(speed_mlab_c = cut(med, cuts)) %>%
  mutate(
    speed_color = case_when(
      speed_mlab_c == "(0,0.2]" ~ colors_v[1],
      speed_mlab_c == "(0.2,4]" ~ colors_v[2],
      speed_mlab_c == "(4,10]" ~ colors_v[3],
      speed_mlab_c == "(10,25]" ~ colors_v[4],
      speed_mlab_c == "(25,1e+03]" ~ colors_v[5]
    )
  ) %>%
  mutate(
    speed_labels = case_when(
      speed_mlab_c == "(0,0.2]" ~ "0 to .2 Mbps",
      speed_mlab_c == "(0.2,4]" ~ ".2 to 4 Mbps",
      speed_mlab_c == "(4,10]" ~ "4 to 10 Mbps",
      speed_mlab_c == "(10,25]" ~ "10 to 25 Mbps",
      speed_mlab_c == "(25,1e+03]" ~ "Over 25 Mbps"
    )
  )
###R mapping
#This section takes the data we've gotten and crunched, and turns it into a map.
#Because the underlying data is technically point data (latitude/longitude data), we have a couple of
#choices for how to aggregate it. In the GA story, the approach we took was to create a Voronoi
#tessellation of the latitude/longitude points and assign to each Voronoi polygon the internet speed of
#the point that created it. Voronoi polygons (https://en.wikipedia.org/wiki/Voronoi_diagram) are a simple
#way of turning a set of points in a space into a set of polygons that partition that space. In particular,
#the Voronoi polygon created by a point is the area that's closer to that point than to any other point.
#Other choices could have been census tracts or counties, but Voronoi polygons struck a good balance
#between showing the variations in speed that indicate _some_ people have fast internet and the widespread
#slower speeds that show the ISP data reported to the FCC overestimates the state of internet speeds in GA.
#The toy demo immediately below shows st_voronoi in isolation; the section after it calculates the
#Voronoi polygons for the speed data and makes a map out of them.
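###A minimal st_voronoi illustration on five made-up points (not part of the
###analysis), just to show the tessellation step by itself
set.seed(1)
toy_pts <- st_sfc(lapply(1:5, function(i) st_point(runif(2))))
toy_tess <- st_voronoi(st_union(toy_pts)) %>% st_collection_extract()
# plot(toy_tess); plot(toy_pts, add = TRUE, col = "red")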
# Outline of the U.S. to clip the tessellation to; st_buffer(dist = 0) cleans
# up any invalid geometry
USA_poly <- tigris::states() %>% st_as_sf %>% st_union %>% st_buffer(dist = 0)
USA_poly <- st_transform(USA_poly, 2163)
# Build the Voronoi polygons from the test locations
voronoi_tess <- st_voronoi(do.call(c, point_data_sf_GA$geometry)) %>%
  st_collection_extract() %>% st_sf %>% st_buffer(dist = 0)
st_crs(voronoi_tess) <- st_crs(USA_poly)
st_crs(point_data_sf_GA) <- st_crs(USA_poly)
# Clip the tessellation to the U.S. outline
voronoi_tess <- st_intersection(st_cast(voronoi_tess), USA_poly)
voronoi_tess %>% st_join(point_data_sf_GA) %>%
  ggplot(aes(color = speed_color, fill = speed_color)) +
  geom_sf() +
  scale_color_manual(values = c("#14274f", "#6c406e", "#b65f77", "#eb9074", "#ffd17d"),
                     labels = c("0 to .2 Mbps", ".2 to 4 Mbps", "4 to 10 Mbps", "10 to 25 Mbps", "Over 25 Mbps"),
                     name = "Average download speed") +
  scale_fill_manual(values = c("#14274f", "#6c406e", "#b65f77", "#eb9074", "#ffd17d"),
                    labels = c("0 to .2 Mbps", ".2 to 4 Mbps", "4 to 10 Mbps", "10 to 25 Mbps", "Over 25 Mbps"),
                    name = "Average download speed") +
  theme(
    axis.line = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_line(color = "#ebebe5", size = 0.2),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "#f5f5f2", color = NA),
    panel.background = element_rect(fill = "#f5f5f2", color = NA),
    legend.background = element_rect(fill = "#f5f5f2", color = NA),
    panel.border = element_blank()
  ) +
  labs(title = "")
### FCC comparison
#This section compares FCC Form 477 data with the M-Lab data. Twice a year, ISPs are required to submit a
#Form 477 to the FCC detailing the maximum speed they make available in each census block they serve in the
#U.S. That data is compiled and made publicly available online at
#https://www.fcc.gov/general/broadband-deployment-data-fcc-form-477. Each dataset is about 400 MB. To run
#the code below, download all of the FCC releases, ranging from December 2014 to December 2017, and upload
#them to your Google bucket. Import them into separate tables in your BigQuery dataset with names that you
#choose. Then update big_Q above to reflect the names of your FCC tables, and you should be good to go.
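#If you'd rather script the FCC uploads too, a bq command along these lines
#should work for each release (the gs:// path and schema file are hypothetical
#placeholders; big_Q above compares fields like Consumer as strings, so an
#explicit all-STRING schema is safer than --autodetect):
# bq load --source_format=CSV --skip_leading_rows=1 \
#   NAME_OF_FCC_BUCKET.NAME_OF_DATASET_DEC_2017 gs://YOUR_BUCKET/fcc_477_dec_2017.csv \
#   ./fcc_477_schema.json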
D_state_county <- query_exec(big_Q, project = project, use_legacy_sql = FALSE, max_pages = Inf)
# Statewide medians for the latest period; "13" is Georgia's state FIPS prefix
D_state_county %>% filter(time_period == "dec_2017" & str_sub(geo_id, 1, 2) == "13") %>%
  summarise(fcc_dl = median(advertised_down), dl = median(med_dl_Mbps)) #25 fcc, 6.26 mlab
# Note: the filter below assumes a county GEOID column has been joined onto
# point_data_sf_GA (for example, by a spatial join against df_counties)
point_data_sf_GA %>% filter(str_sub(county, 1, 2) == "13") %>%
  summarise(sum(counts), sum(ips), median(med))
point_data_sf_GA %>% pull(counts) %>% sum