close #11 - quick word cloud of swear words from reddit comments
1 parent 70dab82 · commit c730a87
Showing 4 changed files with 303 additions and 3 deletions.
@@ -1,20 +1,24 @@
 Package: sweary
 Type: Package
 Title: Database of Swear Words in R
-Version: 0.0.0.9000
+Version: 0.0.0.9001
 Authors@R: person("Patrik", "Drhlik", email = "patrik.drhlik@gmail.com",
     role = c("aut", "cre"))
 Description: The packages tries to offer an extensive list of swear
     words from different languages, cherry picked by native speakers.
     It should then be used in different text analyses.
 URL: https://github.com/pdrhlik/sweary
 BugReports: https://github.com/pdrhlik/sweary/issues
-Depends: R (>= 2.10)
+Depends: R (>= 3.4.0)
 License: MIT
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 6.0.1
 Suggests:
     testthat,
     dplyr,
-    purrr
+    purrr,
+    RedditExtractoR,
+    stringi,
+    wordcloud2
+Remotes: lchiffon/wordcloud2
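
The commit adds RedditExtractoR, stringi and wordcloud2 to Suggests and introduces a Remotes: field so that wordcloud2 comes from GitHub (lchiffon/wordcloud2) rather than CRAN. As a rough illustration only (these install calls are not part of the commit), a contributor could satisfy the new suggested dependencies like this:

```r
# Not part of the commit: one way to install the newly suggested packages,
# honouring the Remotes: field (wordcloud2 from lchiffon/wordcloud2 on GitHub).
install.packages(c("RedditExtractoR", "stringi"))
remotes::install_github("lchiffon/wordcloud2")
```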
@@ -0,0 +1 @@
`ggsci.css` was borrowed from [`ggsci`](https://github.com/road2stat/ggsci/tree/master/vignettes) package
@@ -0,0 +1,215 @@
body {
  background-color: #fff;
  margin: 1em auto;
  max-width: 800px;
  overflow: visible;
  padding-left: 2em;
  padding-right: 2em;
  font-family: "Palatino Linotype", "Book Antiqua", Palatino, serif;
  font-size: 16px;
  line-height: 1.35;
}

#header {
  text-align: center;
}

#TOC {
  clear: both;
  margin: 0 0 10px 0px;
  padding: 15px;
  width: 770px;
  border: 1px solid #CCCCCC;
  border-radius: 5px;

  background-color: #f6f6f6;
  font-size: 16px;
  line-height: 1.5;
}
#TOC .toctitle {
  font-weight: bold;
  font-size: 16px;
  margin-left: 5px;
}

#TOC ul {
  padding-left: 50px;
  margin-left: -1.5em;
  margin-top: 5px;
  margin-bottom: 5px;
}
#TOC ul ul {
  margin-left: -2em;
}
#TOC li {
  line-height: 20px;
}

table {
  margin: 1em auto;
  border-width: 1px;
  border-color: #DDDDDD;
  border-style: outset;
  border-collapse: collapse;
}
table th {
  border-width: 2px;
  padding: 5px;
  border-style: inset;
}
table td {
  border-width: 1px;
  border-style: inset;
  line-height: 18px;
  padding: 5px 5px;
}
table, table th, table td {
  border-left-style: none;
  border-right-style: none;
}
table thead, table tr.even {
  background-color: #f7f7f7;
}

p {
  margin: 0.5em 0;
}

blockquote {
  background-color: #f6f6f6;
  padding: 0.25em 0.75em;
}

hr {
  border-style: solid;
  border: none;
  border-top: 1px solid #777;
  margin: 28px 0;
}

dl {
  margin-left: 0;
}
dl dd {
  margin-bottom: 13px;
  margin-left: 13px;
}
dl dt {
  font-weight: bold;
}

ul {
  margin-top: 0;
}
ul li {
  list-style: circle outside;
}
ul ul {
  margin-bottom: 0;
}

pre, code {
  background-color: #f7f7f7;
  border-radius: 5px;
  color: #333;
  white-space: pre-wrap; /* Wrap long lines */
}
pre {
  border-radius: 5px;
  margin: 5px 0px 10px 0px;
  padding: 10px;
}
pre:not([class]) {
  background-color: #f7f7f7;
}

code {
  font-family: "Lucida Console", Monaco, Consolas, 'Courier New', monospace;
  font-size: 85%;
}
p > code, li > code {
  padding: 2px 0px;
}

div.figure {
  text-align: center;
}
img {
  background-color: #FFFFFF;
  padding: 2px;
  border: 1px solid #DDDDDD;
  border-radius: 3px;
  border: 1px solid #CCCCCC;
  margin: 0 5px;
}

h1 {
  margin-top: 25px;
  font-size: 35px;
  line-height: 40px;
}

h2 {
  border-bottom: 2px solid #f7f7f7;
  padding-top: 10px;
  padding-bottom: 2px;
  font-size: 145%;
}

h3 {
  border-bottom: 1px solid #f7f7f7;
  padding-top: 10px;
  font-size: 120%;
}

h4 {
  margin-left: 8px;
  font-size: 105%;
}

em {
  font-style: normal;
}

emph {
  font-style: oblique;
}

h5, h6 {
  border-bottom: 1px solid #ccc;
  font-size: 105%;
}

a {
  color: #123d79;
  text-decoration: none;
}
a:hover {
  color: #007CC3; }
a:visited {
  color: #581858; }
a:visited:hover {
  color: #007CC3; }

/* code highlight theme: highlight.js - tomorrow
   http://jmblog.github.io/color-themes-for-highlightjs/tomorrow/
*/

code > span.kw { color: #4271ae; } /* Keyword */
code > span.dt { color: #c82829; } /* DataType */
code > span.dv { color: #f5871f; } /* DecVal (decimal values) */
code > span.bn { color: #718c00; } /* BaseN */
code > span.fl { color: #718c00; } /* Float */
code > span.ch { color: #718c00; } /* Char */
code > span.st { color: #718c00; } /* String */
code > span.co { color: #8e908c; } /* Comment */
code > span.ot { color: #4d4d4c; } /* OtherToken */
code > span.al { color: #ff0000; } /* AlertToken */
code > span.fu { color: #4271ae; } /* Function calls */
code > span.er { color: #a61717; } /* ErrorTok */

/* centering images */
img {
  display: block;
  margin: 0 auto;
}
@@ -0,0 +1,80 @@
---
title: Word cloud of swear-words from reddit comments
author: Related to Trump
output:
  html_document:
    mathjax: default
    fig_caption: true
    toc: true
    section_numbering: true
    css: ggsci.css
vignette: >
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteIndexEntry{Word cloud of swear-words from reddit comments}
---
```{r include = FALSE}
library(knitr)
# Global chunk options for the vignette.
opts_chunk$set(
  comment = "",
  fig.width = 12,
  message = FALSE,
  warning = FALSE,
  tidy.opts = list(
    keep.blank.line = TRUE,
    width.cutoff = 150
  ),
  eval = TRUE
)
# Widen console output; this belongs outside opts_chunk$set().
options(width = 150)
```

```{r}
library(sweary)
library(dplyr)

swear_words %>%
  count(language)
```

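The chunks below only ever use the English entries of `swear_words`. As an illustrative helper that is not part of the original commit, that subset could be pulled out once and reused:

```{r}
# Illustrative only (not in the original commit): the English subset that the
# inner_join further down relies on.
en_swear_words <- swear_words %>%
  filter(language == "en")
nrow(en_swear_words)
```
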
```{r}
library(RedditExtractoR)

reddit_comments <- get_reddit(
  search_terms = "trump",
  subreddit = "politics",
  cn_threshold = 10
)
dim(reddit_comments)
```

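`get_reddit()` scrapes live comment threads, so the result changes between knits and the call can be slow. A minimal caching sketch (not part of the commit; the cache file name is an arbitrary assumption) keeps the vignette reproducible:

```{r}
# Illustrative only: cache the reddit pull so re-knitting the vignette does not
# hit the API every time. "reddit_comments.rds" is an arbitrary file name.
cache_file <- "reddit_comments.rds"
if (file.exists(cache_file)) {
  reddit_comments <- readRDS(cache_file)
} else {
  reddit_comments <- get_reddit(
    search_terms = "trump",
    subreddit = "politics",
    cn_threshold = 10
  )
  saveRDS(reddit_comments, cache_file)
}
```
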
```{r}
# top_1000_scored <-
#   reddit_comments %>%
#   dplyr::arrange(desc(comment_score)) %>%
#   head(1000)
```
```{r}
reddit_swear_words <-
  # top_1000_scored$comment %>%
  reddit_comments$comment %>%
  stringi::stri_extract_all_words() %>%
  unlist() %>%
  tolower() %>%
  data.frame(word = ., stringsAsFactors = FALSE) %>%
  inner_join(
    swear_words %>% filter(language == "en")
  )

reddit_swear_words %>%
  count(word)
```

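A possible refinement, sketched here only and not part of the commit: weight each swear word by the score of the comment it came from instead of a plain occurrence count, using the `comment_score` column that `get_reddit()` returns.

```{r}
# Sketch only (not in the original commit): score-weighted swear-word counts.
words_per_comment <- stringi::stri_extract_all_words(reddit_comments$comment)

scored_words <- data.frame(
  word  = tolower(unlist(words_per_comment)),
  score = rep(reddit_comments$comment_score, lengths(words_per_comment)),
  stringsAsFactors = FALSE
)

scored_words %>%
  inner_join(swear_words %>% filter(language == "en"), by = "word") %>%
  group_by(word) %>%
  summarise(freq = sum(score))
```
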
```{r}
# wordcloud2 is suggested in DESCRIPTION but was not loaded above.
library(wordcloud2)

reddit_swear_words %>%
  count(word) %>%
  rename(freq = n) %>%
  wordcloud2()
```
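
`wordcloud2()` returns an htmlwidget, so the cloud can also be written out as a standalone page. A minimal sketch, not part of the commit; the output file name is an assumption:

```{r eval = FALSE}
# Sketch only: save the word cloud as a self-contained HTML file.
cloud <- reddit_swear_words %>%
  count(word) %>%
  rename(freq = n) %>%
  wordcloud2()

htmlwidgets::saveWidget(cloud, "swear-word-cloud.html", selfcontained = TRUE)
```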