Skip to content

Commit

Permalink
Added data. Finished pkg_down build
Browse files Browse the repository at this point in the history
  • Loading branch information
bryanwhiting committed Aug 31, 2021
1 parent 06647d0 commit a992dfe
Show file tree
Hide file tree
Showing 30 changed files with 269 additions and 113 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: generalconference
Type: Package
Title: General Conference text corpus and web scrapers
Version: 0.1.0
Version: 0.2.0
Author: Bryan Whiting
Maintainer: Bryan Whiting <yourself@somewhere.net>
Description: "General conference is the worldwide gathering of The Church of
Expand Down Expand Up @@ -29,6 +29,7 @@ Imports:
glue,
purrr,
stringr,
readr,
rvest,
tictoc,
tidyr,
Expand All @@ -39,6 +40,7 @@ Depends:
furrr,
purrr,
stringr,
readr,
rvest,
tictoc,
tidyr,
Expand Down
48 changes: 48 additions & 0 deletions R/genconf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#' General Conference Data
#'
#' A dataset containing all general conference talks back to 1971.
#'
#' @format genconf: A 4-level nested data frame with nestings for conference, session, talk, and paragraph.
#' \enumerate{
#' \item \strong{genconf} A data frame with one row per conference (year + month)
#' \describe{
#' \item{year}{Conference year}
#' \item{month}{Conference month}
#' \item{sessions}{List dataframe with one row per session.}
#' }
#'
#' \item \strong{sessions} A data frame with one row per session (Saturday AM, PM, etc.)
#' \describe{
#' \item{session_name}{Name of the session}
#' \item{session_id}{Identifier of the session}
#' \item{session_url}{Suffix URL path to session (not the full URL)}
#' \item{talks}{List of dataframes, one row per talk in that session}
#' }
#'
#'
#'
#' \item \strong{talks} A data frame with one row per talk
#' \describe{
#' \item{talk_urls}{Stub urls for talk.}
#' \item{talk_session_id}{Talk index within session}
#' \item{url}{Full url path to talk.}
#' \item{title1}{Title.}
#' \item{author1}{Author Name (typically, might be missing)}
#' \item{author2}{Author Role (typically, might be missing)}
#' \item{kicker1}{Talk kicker}
#' \item{paragraphs}{List of dataframes, one row per paragraph in that talk}
#' }
#'
#' \item \strong{paragraphs} A data frame with one row per paragraph in talk
#' \describe{
#' \item{section_num}{If talk has sections, this would be the section number. Newer talks are more likely to have sections.}
#' \item{p_num}{Paragraph number}
#' \item{p_id}{Paragraph html tag (can be used to generate a url deep link). Might not be in order with p_num due to edge-case talks that use #p1-#p4 for title, author, kicker, etc.}
#' \item{is_header}{If a talk contains sections, those sections have headers. Header content will be a few words.}
#' \item{paragraph}{Text of talk. <sup></sup> html tags (superscripts/footnotes) have been stripped out.}
#' }
#' }
#' @source \url{https://www.churchofjesuschrist.org/study/general-conference}
# genconf %>% select(sessions) %>% unnest(sessions) %>% select(talks) %>% unnest(talks) %>% select(paragraphs) %>% unnest(paragraphs)
# https://stackoverflow.com/questions/38095578/documenting-a-list-of-data-frames-with-roxygen2
"genconf"
56 changes: 0 additions & 56 deletions R/scrape_talk.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,62 +153,6 @@ extract_metadata <- function(rv_doc){
)
}



# DEPRECATED 2021-08-30
#' extract_metadata <- function(rv_doc) {
#' #' Extract title, author, and kicker from a url and return as a row in a
#' #' dataframe.
#'
#' # the .body-block contains the speech text. But the #p anchors
#' # can be wrong.
#' # returns list of p1, p2, p3... for new talks and p2, p3, p4 for old talks
#' p_bodies <- rv_doc %>%
#' html_elements(".body-block p") %>%
#' html_attr("id")
#'
#' # Explaining !("p1" %in% p_bodies):
#' # Sometimes, the first paragraph isn't p1
#' # e.g., "https://www.churchofjesuschrist.org/study/general-conference/2019/04/27homer"
#' # First paragraph is "p20", then 2nd is "p1".
#' if ("p1" %in% p_bodies) {
#' # In new talks, #p1 is the paragraph text
#' elements <- c("#title1", "#author1", "#author2", "#kicker1")
#' map_dfc(elements, ~ extract_element(rv_doc = rv_doc, element = .)) %>%
#' rename_all(~ str_replace(., fixed("#"), "")) %>%
#' return()
#' } else if (p_bodies[1] == "p5"){
#' # Some talks start at p5, and p1-4 are the title, author, author and kicker
#' # url <- "https://www.churchofjesuschrist.org/study/liahona/2020/11/15cook?lang=eng"
#' # rv_doc <- read_html(url)
#' elements <- c("#p1", "#p2", "#p3", "#p4")
#' df <- map_dfc(elements, ~ extract_element(rv_doc = rv_doc, element = .)) %>%
#' rename_all(~ str_replace(., fixed("#"), "")) %>%
#' rename(title1 = p1,
#' author1 = p2,
#' author2 = p3,
#' kicker1 = p4)
#' } else {
#' # In older talks, #p1 is the author block
#' elements <- c("#title1", "#author1", "#author2", "#kicker1", "#p1")
#' df <- map_dfc(elements, ~ extract_element(rv_doc = rv_doc, element = .)) %>%
#' rename_all(~ str_replace(., fixed("#"), ""))
#'
#' if (is.na(df$author1)) {
#' df$author1 <- df$p1
#' } else {
#' url <- extract_url_from_rv_doc(rv_doc)
#' message(
#' "#p1 not in .body-block p: ", url,
#' "\nPulled #p1 for metadata but author1 is not null."
#' )
#' }
#' df %>%
#' select(-p1) %>%
#' return()
#' }
#' }

#' Scrape general conference talk
#'
#' @param url general conference https
Expand Down
7 changes: 4 additions & 3 deletions R/scrape_talk_urls.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ parse_path_for_name <- function(path) {
}

#' Parse Session URLs
#'
#' Take a vector of session hrefs, use the first value
#' as the session name, then the other values as the
#' session talks
Expand All @@ -39,6 +40,7 @@ parse_session_urls <- function(session_hrefs) {
}

#' Scrape HTML doc map from Conference URL
#'
#' Given a year and a month, pull the entire .doc-map class
#' object from the Conference URL. This will be parsed
#' by downstream objects
Expand Down Expand Up @@ -90,9 +92,8 @@ extract_session_hrefs <- function(html_docmap, session_id) {
#' For a given year-month conference, return a nested tibble of all sessions
#' with a tibble-column containing the dataframes
#'
#'
#' @param year
#' @param month
#' @param year Year of the conference
#' @param month Month of the conference
#'
#' @return tibble
#' @export
Expand Down
9 changes: 9 additions & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,16 @@ template:
params:
bootswatch: cerulean



reference:
- title: Datasets
desc: >
General conference datasets. `data("genconf")`
- contents:
- "genconf"


- title: Scrape
desc: >
Scrape general conference talks urls and contents. Any function with
Expand Down
Binary file added data/genconf.rda
Binary file not shown.
Binary file added data/sessions/201904.rds
Binary file not shown.
Binary file added data/sessions/201910.rds
Binary file not shown.
Binary file added data/sessions/202004.rds
Binary file not shown.
Binary file added data/sessions/202010.rds
Binary file not shown.
Binary file added data/sessions/202104.rds
Binary file not shown.
2 changes: 1 addition & 1 deletion docs/404.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion docs/LICENSE-text.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion docs/articles/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion docs/articles/introduction.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion docs/authors.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion docs/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion docs/pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ articles:
how-to-scrape: how-to-scrape.html
introduction: introduction.html
new-sessions: new-sessions.html
last_built: 2021-08-30T21:52Z
last_built: 2021-08-31T00:11Z

58 changes: 44 additions & 14 deletions docs/reference/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/extract_body_paragraphs_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/extract_url_from_rv_doc.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit a992dfe

Please sign in to comment.