---
title: "Visualizations"
author: "Adam Shelton"
output: github_document
---
```{r setup, include=FALSE}
library(tidyverse)
library(readxl)
library(ggplot2)
library(here)
library(scales)
library(stringr)
library(eply)
library(ggraph)
library(igraph)
library(maps)
library(maptools)
library(ggmap)
library(sf)
library(mapproj)
library(treemapify)
library(Cairo)
source("../../../../theme.R")
windows.options(antialias = "cleartype")
options(device = Cairo::CairoWin)
knitr::opts_chunk$set(echo = TRUE, fig.width = 10, fig.height = 7, dev = "svg", dpi = 300, fig.pos = "H", tidy.opts = list(width.cutoff = 60), tidy = TRUE)
```
## MPI Processing
```{r mpi-1, message=FALSE, warning=FALSE}
mpi_data = read_csv(here("output_data", "mpi_trials.csv"))
# create line plot with 1/x smoothing
mpi_data %>% subset(hosts == 1) %>% ggplot(aes(x = nodes, y = proc_time/60)) +
geom_smooth(color = color_pal(1, "cool"), size = 1.75, method = "lm", formula = y ~ I(1 / x)) +
scale_x_continuous(breaks = c(1, 2, 4, 8, 16)) +
labs(title = "Parallelization has Diminishing Returns", x = "MPI Nodes", y = "Average Running Time (min)") + theme_master(base_size = 22) + theme(panel.grid.minor.x = element_blank())
```
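
The diminishing returns above can be made concrete as parallel efficiency, i.e. the speedup over the single-node baseline divided by the node count. A minimal sketch, reusing the `nodes`, `hosts`, and `proc_time` columns from the chunk above:
```{r mpi-efficiency, message=FALSE, warning=FALSE}
# mean speedup and parallel efficiency relative to the one-node baseline
baseline = mpi_data %>% filter(hosts == 1, nodes == 1) %>% pull(proc_time) %>% mean()
mpi_data %>%
  filter(hosts == 1) %>%
  group_by(nodes) %>%
  summarise(mean_time = mean(proc_time)) %>%
  mutate(speedup = baseline / mean_time, efficiency = speedup / nodes)
```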
```{r mpi-cost, message=FALSE, warning=FALSE}
# calculate the price of each MPI trial: two CPUs per node, billed at $0.031611 per CPU-hour, with running time converted to hours
mpi_data$total_price = 2 * mpi_data$nodes * (mpi_data$proc_time / 3600) * 0.031611
# create line plot
mpi_data %>% subset(hosts == 1) %>% ggplot(aes(x = nodes, y = total_price)) +
geom_smooth(color = color_pal(1, "discrete"), size = 1.75, method = "loess", span = 0.5) +
scale_x_continuous(breaks = c(1, 2, 4, 8, 16)) +
scale_y_continuous(limits = c(0,0.02)) +
labs(title = "Parallelization is More Costly", x = "MPI Nodes", y = "Mean Total Processing Cost (USD)") + theme_master(base_size = 22) + theme(panel.grid.minor.x = element_blank())
```
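
Given the pricing assumption in the comment above, the cheapest configuration can be read off directly; a quick sketch over the `total_price` column just computed:
```{r mpi-cost-check, message=FALSE, warning=FALSE}
# rank single-host node counts by mean total cost, cheapest first
mpi_data %>%
  filter(hosts == 1) %>%
  group_by(nodes) %>%
  summarise(mean_cost = mean(total_price)) %>%
  arrange(mean_cost)
```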
```{r mpi-2, message=FALSE, warning=FALSE}
# get means and standard deviations for each number of hosts
mpi_means = mpi_data %>%
  filter(nodes == 16, hosts %in% c(1, 2, 4)) %>%
  group_by(hosts) %>%
  summarise(pc_sd = sd(proc_time), proc_time = mean(proc_time)) %>%
  ungroup()
# plot a column chart with error bars
mpi_means %>% ggplot(aes(x = factor(hosts), y = proc_time)) +
geom_col(aes(fill = factor(hosts))) +
geom_errorbar(aes(ymin=proc_time-pc_sd, ymax=proc_time+pc_sd), width=.2,
position=position_dodge(.9)) +
scale_fill_manual(values = color_pal(3)) +
labs(title = "Number of MPI Hosts Affects Performance", x = "MPI Hosts", y = "Average Running Time (s)") + theme_master(base_size = 22) + hide_x_gridlines + hide_legend
```
## Top Tags
```{r top-tags, message=FALSE, warning=FALSE}
# arrange tags by the number of posts for each one
tt_data = read_csv(here("output_data", "top_tags.csv"))
tt_data$count = as.numeric(tt_data$count)
tt_data = tt_data %>% arrange(-count)
tt_data$language = tt_data$language %>% factor()
# take the top six tags and plot them as a column chart
tt_data %>% .[1:6,] %>% ggplot(aes(x = reorder(language, -count), y = (count / 10^6))) +
geom_col(aes(fill = language)) +
scale_fill_manual(values = color_pal(6)) +
labs(title = "Top Six Tags in StackOverflow Posts", x = "Tag", y = "Number of Posts (in millions)") + theme_master(base_size = 22) + hide_x_gridlines + hide_legend
```
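
How dominant these six tags are can be checked against the rest of the tag table; a one-line sketch over the same `tt_data` (this measures the share among tags in the file, not all posts site-wide):
```{r top-tags-share, message=FALSE, warning=FALSE}
# share of counted posts carried by the six most common tags
sum(tt_data$count[1:6], na.rm = TRUE) / sum(tt_data$count, na.rm = TRUE)
```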
## Questions by Year
```{r ques-year, message=FALSE, warning=FALSE}
# remove quotation marks and whitespace from each question
ques_year_data = read_csv(here("output_data", "top_questions.csv"))
ques_year_data$question = ques_year_data$question %>% str_remove_all("\"") %>% str_remove_all("\\\\") %>% unquote(deep = TRUE) %>% str_trim()
ques_year_data = ques_year_data %>% filter(year != 2012) # remove weird results for 2012
# plot flipped axis column plot with questions as labels on the plot
ques_year_data %>% ggplot(aes(x = factor(year), y = count)) +
geom_col(aes(fill = year)) +
scale_fill_gradientn(colors = color_pal(2)) +
geom_label(aes(x = 1:length(year), y = 110, label = question), alpha = 0.9, color = "black", family = "Pragati Narrow", size = 6, hjust = 0) +
labs(title = "Top Questions on StackOverflow by Year", x = "Year", y = "Number of Answers") +
theme_master(base_size = 22) +
hide_y_gridlines +
hide_legend +
coord_flip()
```
## Two-Grams
```{r two-grams, message=FALSE, warning=FALSE}
two_grams_data = read_csv(here("output_data", "twograms.csv")) %>% arrange(-count)
# build a graph from the top 150 bigrams
bigram_graph <- two_grams_data[1:150,] %>%
graph_from_data_frame()
remove_axes <- theme(
axis.text = element_blank(),
axis.line = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
axis.title = element_blank()
)
# plot a network graph of these top 150 bigrams, highlighting the top six tags
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(color = "grey") +
geom_node_point(color = "grey") +
geom_node_text(aes(label = name, color = name %in% tt_data$language[1:6], size = name %in% tt_data$language[1:6]), nudge_x = 0, nudge_y = 0, repel = TRUE, family = "Pragati Narrow") +
scale_color_manual(values = c("#000000", color_pal(1, "discrete"))) +
scale_size_manual(values = c(5, 8)) +
labs(title = "Connections Between Tags in StackOverflow Posts") +
theme_master(base_size = 22) +
remove_axes +
hide_legend
```
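
`graph_from_data_frame()` treats the first two columns of `two_grams_data` as edge endpoints, so node connectivity can be summarised directly; a sketch listing the most connected tags in the plotted graph:
```{r two-grams-degree, message=FALSE, warning=FALSE}
# tags with the most bigram connections, by degree
igraph::degree(bigram_graph) %>% sort(decreasing = TRUE) %>% head(10)
```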
## User Activities
```{r user-act, message=FALSE, warning=FALSE}
user_act_data = read_csv(here("output_data", "user_ac_out.csv"))
# keep only rows whose user ID parses as a number
user_act_data = user_act_data %>% filter(!is.na(suppressWarnings(as.numeric(user_id))))
# density plot of user activity
ggplot(user_act_data, aes(x = count)) + geom_density(color = color_pal(1, "cool"), size = 1.75) + labs(title = "Most Users Have Very Little Account Activity", x = "Account Interactions", y = "Density") + theme_master(base_size = 22)
```
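
Because the activity distribution is so heavily right-skewed, a log-scaled axis makes the tail of highly active accounts visible; the same density on a log10 x-axis, as a sketch:
```{r user-act-log, message=FALSE, warning=FALSE}
# same density with a log10 x-axis to expose the long right tail
# note: zero counts (if any) are dropped by the log scale
ggplot(user_act_data, aes(x = count)) +
  geom_density(color = color_pal(1, "cool"), size = 1.75) +
  scale_x_log10(labels = comma) +
  labs(title = "Account Activity on a Log Scale", x = "Account Interactions (log scale)", y = "Density") +
  theme_master(base_size = 22)
```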
```{r usr-act-tmap, message=FALSE, warning=FALSE}
aggr_ua_data = user_act_data$count %>% table() %>% as_tibble()
names(aggr_ua_data) = c("act_count", "num_obs")
aggr_ua_data$act_count = as.numeric(aggr_ua_data$act_count)
aggr_ua_data$perc = aggr_ua_data$num_obs/sum(aggr_ua_data$num_obs)
# plot treemap of user activity data
ggplot(aggr_ua_data, aes(area = as.numeric(num_obs), fill = as.numeric(act_count), label = act_count)) +
geom_treemap() +
geom_treemap_text(colour = "white", place = "centre",
grow = TRUE) +
scale_fill_gradientn(trans = "log10", colors = color_pal(5, type = "continuous")) +
labs(title = "The Majority of StackOverflow Accounts Have Very Little Account Activity", fill = "Account\nInteractions") +
theme_master()
```
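
The `perc` column computed above also gives a cumulative view of how concentrated activity is at the lowest levels; a short sketch:
```{r usr-act-cumshare, message=FALSE, warning=FALSE}
# cumulative share of accounts at the lowest activity levels
aggr_ua_data %>%
  arrange(act_count) %>%
  mutate(cum_perc = cumsum(perc)) %>%
  head(5)
```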
## User Locations
```{r user-loc, message=FALSE, warning=FALSE}
user_loc_data = read_csv(here("output_data", "users_gold_badge_locations.csv"))
world_map = map_data("world")
# plot world map with a 2d density plot (heatmap) of user locations
ggplot() +
geom_map(data = world_map, map = world_map,
aes(x = long, y = lat, group = group, map_id=region),
fill = "white", colour = "#7f7f7f", size=0.5) +
  stat_density2d(data = user_loc_data, aes(x = lon, y = lat, fill = stat(level)), geom = "polygon", alpha = 0.75) +
  scale_fill_gradientn(colors = color_pal(5, "continuous", reverse = TRUE)) +
  theme_map() +
  theme(legend.position = "right") +
  labs(title = "Global Distribution of StackOverflow Users", fill = "Density") +
  coord_map("albers", 0, 0)
```