---
title: "Analysis of UCPD Incident Reports"
author: "Adam Shelton"
output: github_document
---
```{r setup, include=FALSE}
library(tidyverse)
library(lubridate)
library(stringr)
library(here)
library(ggcorrplot)
library(cowplot)
library(tidytext)
library(topicmodels)
library(igraph)
library(ggraph)
library(parallel)
library(wordcloud)
library(knitr)
library(kableExtra)
source("../../functions.R")
set.seed(60615)
source("../../theme.R")
windows.options(antialias = "cleartype")
options(device = Cairo::CairoWin)
knitr::opts_chunk$set(echo = TRUE, fig.width = 10, fig.height = 7, dev = "svg", dpi = 300, fig.pos = "H", tidy.opts = list(width.cutoff = 60), tidy = TRUE)
```
## Getting Started
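The helper functions below normalize the scraped `Occurred` date ranges and free-text dispositions before the incident data is loaded and cleaned.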
```{r getting-started, message=FALSE, warning=FALSE, cache=TRUE}
# Split an Occurred date range on "to" and return either the start or the end
# of the range, borrowing missing date parts from the other side when an
# endpoint is only a bare time
process_date_range = Vectorize(function(d_range, start = TRUE) {
  intervals = str_split(d_range, "to") %>% unlist() %>% str_trim()
  if (length(intervals) == 2) {
    beginning = intervals[1] %>% str_squish() %>% str_split(" ") %>% unlist()
    if (!start & !str_detect(intervals[2], "/")) {
      # The end is a bare time, so borrow the date from the start of the range
      return(paste(beginning[1], intervals[2], sep = " "))
    } else if (!start) {
      return(intervals[2])
    }
  }
  if (length(intervals) == 3) {
    # The split landed mid-string, so reassemble the pieces around the middle chunk
    middle = intervals[2] %>% str_squish() %>% str_split(" ") %>% unlist()
    if (start) {
      return(paste(intervals[1], middle[2], middle[3], sep = " "))
    }
    return(paste(middle[1], intervals[3], sep = " "))
  }
  if (start) {
    return(intervals[1])
  }
  return(NA)
})
# Parse a date-time string, falling back to a date-only parse when the
# "m/d/y h:m" format fails
extract_date = Vectorize(function(date_char) {
  attempt = mdy_hm(date_char)
  if (is.na(attempt) & !is.na(date_char)) {
    if (str_detect(date_char, " ")) {
      date_char = date_char %>% str_split(" ") %>% unlist() %>% .[1]
    }
    attempt = mdy(date_char) %>% as_datetime()
  }
  return(attempt)
})
# Collapse the free-text Disposition field into a small set of consistent categories
clean_disposition = function(disp_vector) {
  cpd_arrest = str_detect(disp_vector, "CPD") & str_detect(disp_vector, "Arrest")
  cpd_alone = str_detect(disp_vector, "CPD") & !str_detect(disp_vector, "Arrest")
  arrest_alone = str_detect(disp_vector, "Arrest") & !str_detect(disp_vector, "CPD")
  disp_vector[cpd_arrest] = "CPD Arrest"
  disp_vector[cpd_alone] = "CPD Involved"
  disp_vector[arrest_alone] = "Arrest"
  closed = str_detect(disp_vector, "Closed") | str_detect(disp_vector, "closed")
  disp_vector[closed] = "Closed"
  disp_vector[str_detect(disp_vector, "Clear")] = "Exceptionally Cleared"
  return(disp_vector)
}
# Read all scraped incident CSVs, normalize missing-value markers, parse the
# reported and occurred timestamps, clean dispositions, and lump infrequent
# Disposition values together (fix_nas and collapse_to_other come from functions.R)
ucpd_data = list.files(getwd(), pattern = "ucpd_incident_data_scraped_*") %>%
  read_csv() %>%
  fix_nas(c(":", "VOID", "Void", "void", "n/a", "N/A", "na", "NA", "No reports this date", "None")) %>%
  mutate(
    Reported = Reported %>% mdy_hm(),
    Start = Occurred %>% process_date_range() %>% extract_date() %>% as_datetime(),
    End = Occurred %>% process_date_range(FALSE) %>% extract_date() %>% as_datetime(),
    Disposition = Disposition %>% clean_disposition()
  ) %>%
  drop_na(Reported) %>%
  collapse_to_other(10, Disposition) %>%
  rename("Comments" = `Comments / Nature of Fire`)
```
## Text Analysis
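The comments field is tokenized into single words and bigrams, with stop words removed, to surface the most common terms across reports.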
```{r tokenizing}
# Treat each report's comment as one line of text
report_lines = ucpd_data %>% select(Comments) %>% rename("line" = Comments)
# Unigrams, with stop words removed
report_words = report_lines %>% unnest_tokens(word, line) %>% anti_join(stop_words)
# Bigrams, dropping any pair that contains a stop word
report_bigrams = report_lines %>%
  unnest_tokens(bigram, line, token = "ngrams", n = 2, collapse = FALSE) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
word_count = report_words %>% count(word, sort = TRUE)
bigram_count = report_bigrams %>% count(word1, word2, sort = TRUE)
word_count %>% top_n(15) %>% kable(caption = "15 Most Common Words")
bigram_count %>% top_n(15) %>% kable(caption = "15 Most Common Bigrams")
```
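The setup chunk loads `topicmodels`, which is not otherwise used in this section. A minimal sketch of how the same tokenized comments could feed a topic model follows; the `report_id` column, `k = 4`, and the chunk itself are illustrative assumptions, not part of the original analysis.
```{r lda-sketch, eval=FALSE}
# Build a document-term matrix with one document per report (a hypothetical
# report_id is added for illustration), then fit a small LDA model
report_dtm = ucpd_data %>%
  mutate(report_id = row_number()) %>%
  unnest_tokens(word, Comments) %>%
  anti_join(stop_words) %>%
  count(report_id, word) %>%
  cast_dtm(report_id, word, n)
report_lda = LDA(report_dtm, k = 4, control = list(seed = 60615))
# Top ten words per topic, ranked by per-topic word probability (beta)
tidy(report_lda, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(10, beta)
```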
```{r bigram-graph}
bigram_tops = bigram_count %>% top_n(150)
bigram_graph = bigram_tops %>% graph_from_data_frame()
# Look up each node's word count in the graph's vertex order, so the size and
# color vectors passed to geom_node_text below line up with the nodes they describe
graph_data = tibble(word = names(V(bigram_graph))) %>% left_join(word_count, by = "word")
remove_axes = theme(
axis.text = element_blank(),
axis.line = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
axis.title = element_blank()
)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(color = "grey") +
  geom_node_point(color = "grey") +
  geom_node_text(aes(label = name, size = graph_data$n, color = graph_data$n), repel = TRUE, family = "Pragati Narrow") +
  scale_color_gradientn(colors = c("#000000", color_pal(1, "discrete"))) +
  scale_size_continuous(range = c(4, 10)) +
  labs(title = "Connections Between the 150 Most Common Word Pairings", subtitle = "In UCPD reports from July 2010 to November 2019") +
  theme_master(base_size = 22) +
  remove_axes +
  hide_legend
```
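The `wordcloud` package is also loaded in setup but unused above. A minimal sketch of a frequency word cloud from the same counts; the 100-word cap and ordering are illustrative assumptions.
```{r wordcloud-sketch, eval=FALSE}
# Most common words, scaled by frequency and plotted most-frequent first
word_count %>% with(wordcloud(word, n, max.words = 100, random.order = FALSE))
```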