close #11 - quick word cloud of swear words of reddit comments
MarcinKosinski committed Sep 18, 2018
1 parent 70dab82 commit c730a87
Showing 4 changed files with 303 additions and 3 deletions.
10 changes: 7 additions & 3 deletions DESCRIPTION
@@ -1,20 +1,24 @@
 Package: sweary
 Type: Package
 Title: Database of Swear Words in R
-Version: 0.0.0.9000
+Version: 0.0.0.9001
 Authors@R: person("Patrik", "Drhlik", email = "patrik.drhlik@gmail.com",
     role = c("aut", "cre"))
 Description: The package tries to offer an extensive list of swear
     words from different languages, cherry-picked by native speakers.
     It can then be used in various text analyses.
 URL: https://github.com/pdrhlik/sweary
 BugReports: https://github.com/pdrhlik/sweary/issues
-Depends: R (>= 2.10)
+Depends: R (>= 3.4.0)
 License: MIT
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 6.0.1
 Suggests:
     testthat,
     dplyr,
-    purrr
+    purrr,
+    RedditExtractoR,
+    stringi,
+    wordcloud2
+Remotes: lchiffon/wordcloud2
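The new `Remotes` field tells `devtools`/`remotes` to fetch the `wordcloud2` dependency from its GitHub repository rather than CRAN when the package is installed. A minimal sketch of the install step, assuming the `remotes` package is available (`dependencies = TRUE` pulls in `Suggests` packages as well):

```r
# Install sweary from GitHub; the Remotes field redirects the
# wordcloud2 dependency to the lchiffon/wordcloud2 repository
remotes::install_github("pdrhlik/sweary", dependencies = TRUE)

# Or grab the GitHub build of wordcloud2 directly
remotes::install_github("lchiffon/wordcloud2")
```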
1 change: 1 addition & 0 deletions vignettes/README.md
@@ -0,0 +1 @@
`ggsci.css` was borrowed from the [`ggsci`](https://github.com/road2stat/ggsci/tree/master/vignettes) package.
215 changes: 215 additions & 0 deletions vignettes/ggsci.css
@@ -0,0 +1,215 @@
body {
background-color: #fff;
margin: 1em auto;
max-width: 800px;
overflow: visible;
padding-left: 2em;
padding-right: 2em;
font-family: "Palatino Linotype", "Book Antiqua", Palatino, serif;
font-size: 16px;
line-height: 1.35;
}

#header {
text-align: center;
}

#TOC {
clear: both;
margin: 0 0 10px 0px;
padding: 15px;
width: 770px;
border: 1px solid #CCCCCC;
border-radius: 5px;

background-color: #f6f6f6;
font-size: 16px;
line-height: 1.5;
}
#TOC .toctitle {
font-weight: bold;
font-size: 16px;
margin-left: 5px;
}

#TOC ul {
padding-left: 50px;
margin-left: -1.5em;
margin-top: 5px;
margin-bottom: 5px;
}
#TOC ul ul {
margin-left: -2em;
}
#TOC li {
line-height: 20px;
}

table {
margin: 1em auto;
border-width: 1px;
border-color: #DDDDDD;
border-style: outset;
border-collapse: collapse;
}
table th {
border-width: 2px;
padding: 5px;
border-style: inset;
}
table td {
border-width: 1px;
border-style: inset;
line-height: 18px;
padding: 5px 5px;
}
table, table th, table td {
border-left-style: none;
border-right-style: none;
}
table thead, table tr.even {
background-color: #f7f7f7;
}

p {
margin: 0.5em 0;
}

blockquote {
background-color: #f6f6f6;
padding: 0.25em 0.75em;
}

hr {
border-style: solid;
border: none;
border-top: 1px solid #777;
margin: 28px 0;
}

dl {
margin-left: 0;
}
dl dd {
margin-bottom: 13px;
margin-left: 13px;
}
dl dt {
font-weight: bold;
}

ul {
margin-top: 0;
}
ul li {
list-style: circle outside;
}
ul ul {
margin-bottom: 0;
}

pre, code {
background-color: #f7f7f7;
border-radius: 5px;
color: #333;
white-space: pre-wrap; /* Wrap long lines */
}
pre {
border-radius: 5px;
margin: 5px 0px 10px 0px;
padding: 10px;
}
pre:not([class]) {
background-color: #f7f7f7;
}

code {
font-family: "Lucida Console", Monaco, Consolas, 'Courier New', monospace;
font-size: 85%;
}
p > code, li > code {
padding: 2px 0px;
}

div.figure {
text-align: center;
}
img {
background-color: #FFFFFF;
padding: 2px;
border: 1px solid #DDDDDD;
border-radius: 3px;
border: 1px solid #CCCCCC;
margin: 0 5px;
}

h1 {
margin-top: 25px;
font-size: 35px;
line-height: 40px;
}

h2 {
border-bottom: 2px solid #f7f7f7;
padding-top: 10px;
padding-bottom: 2px;
font-size: 145%;
}

h3 {
border-bottom: 1px solid #f7f7f7;
padding-top: 10px;
font-size: 120%;
}

h4 {
margin-left: 8px;
font-size: 105%;
}

em {
font-style: normal;
}

emph {
font-style: oblique;
}

h5, h6 {
border-bottom: 1px solid #ccc;
font-size: 105%;
}

a {
color: #123d79;
text-decoration: none;
}
a:hover {
  color: #007CC3;
}
a:visited {
  color: #581858;
}
a:visited:hover {
  color: #007CC3;
}

/* code highlight theme: highlight.js - tomorrow
http://jmblog.github.io/color-themes-for-highlightjs/tomorrow/
*/

code > span.kw { color: #4271ae; } /* Keyword */
code > span.dt { color: #c82829; } /* DataType */
code > span.dv { color: #f5871f; } /* DecVal (decimal values) */
code > span.bn { color: #718c00; } /* BaseN */
code > span.fl { color: #718c00; } /* Float */
code > span.ch { color: #718c00; } /* Char */
code > span.st { color: #718c00; } /* String */
code > span.co { color: #8e908c; } /* Comment */
code > span.ot { color: #4d4d4c; } /* OtherToken */
code > span.al { color: #ff0000; } /* AlertToken */
code > span.fu { color: #4271ae; } /* Function calls */
code > span.er { color: #a61717; } /* ErrorTok */

/* centering images */
img {
display: block;
margin: 0 auto;
}
80 changes: 80 additions & 0 deletions vignettes/word_cloud.Rmd
@@ -0,0 +1,80 @@
---
title: Word cloud of swear words from Reddit comments
author: Related to Trump
output:
  html_document:
    mathjax: default
    fig_caption: true
    toc: true
    number_sections: true
    css: ggsci.css
vignette: >
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteIndexEntry{Word cloud of swear words from Reddit comments}
---
```{r include = FALSE}
library(knitr)
# Widen console output; options() must be called on its own,
# not passed as an argument to opts_chunk$set()
options(width = 150)
opts_chunk$set(
  comment = "",
  fig.width = 12,
  message = FALSE,
  warning = FALSE,
  tidy.opts = list(
    keep.blank.line = TRUE,
    width.cutoff = 150
  ),
  eval = TRUE
)
```


```{r}
library(sweary)
library(dplyr)

# How many swear words does sweary ship per language?
swear_words %>%
  count(language)
```



```{r}
library(RedditExtractoR)

# Pull r/politics comment threads matching "trump";
# cn_threshold keeps only threads with at least 10 comments
reddit_comments <-
  get_reddit(
    search_terms = 'trump',
    subreddit = 'politics',
    cn_threshold = 10
  )
dim(reddit_comments)
```

```{r}
# Optional: keep only the 1000 highest-scored comments
# top_1000_scored <-
#   reddit_comments %>%
#   dplyr::arrange(desc(comment_score)) %>%
#   head(1000)
```
```{r}
# Split comments into words, lowercase them and keep only
# those that match sweary's English swear words
reddit_swear_words <-
  # top_1000_scored$comment %>%
  reddit_comments$comment %>%
  stringi::stri_extract_all_words() %>%
  unlist() %>%
  tolower() %>%
  data.frame(word = ., stringsAsFactors = FALSE) %>%
  inner_join(
    swear_words %>% filter(language == 'en'),
    by = 'word'
  )

reddit_swear_words %>%
  count(word)
```

```{r}
# wordcloud2 expects a data frame whose first two columns
# are the words and their frequencies
reddit_swear_words %>%
  count(word) %>%
  rename(freq = n) %>%
  wordcloud2()
```
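Since `wordcloud2()` returns an htmlwidget rather than a static plot, the cloud is interactive in the knitted vignette. Outside knitr you could also write it to a standalone page — a minimal sketch assuming the `htmlwidgets` package; the file name here is just an example:

```r
library(dplyr)

# wordcloud2() returns an htmlwidget; saveWidget() writes it
# out as a self-contained HTML page you can open in a browser
wc <-
  reddit_swear_words %>%
  count(word) %>%
  rename(freq = n) %>%
  wordcloud2::wordcloud2()

htmlwidgets::saveWidget(wc, "swear_cloud.html", selfcontained = TRUE)
```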

