Added data. Finished pkg_down build

bryanwhiting · Aug 31, 2021 · a992dfe · a992dfe
1 parent 06647d0
commit a992dfe
Show file tree

Hide file tree

Showing 30 changed files with 269 additions and 113 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: generalconference
 Type: Package
 Title: General Conference text corpus and web scrapers
-Version: 0.1.0
+Version: 0.2.0
 Author: Bryan Whiting
 Maintainer: Bryan Whiting <yourself@somewhere.net>
 Description: "General conference is the worldwide gathering of The Church of 
@@ -29,6 +29,7 @@ Imports:
     glue,
     purrr,
     stringr,
+    readr,
     rvest,
     tictoc,
     tidyr,
@@ -39,6 +40,7 @@ Depends:
     furrr,
     purrr,
     stringr,
+    readr,
     rvest,
     tictoc,
     tidyr,

diff --git a/R/genconf.R b/R/genconf.R
@@ -0,0 +1,48 @@
+#' General Conference Data
+#'
+#' A dataset containing all general conference talks back to 1971.
+#'
+#' @format genconf: A 4-level nested data frame with nestings for conference, session, talk, and paragraph.
+#' \enumerate{
+#' \item \strong{genconf} A data frame with one row per conference (year + month)
+#' \describe{
+#'   \item{year}{Conference year}
+#'   \item{month}{Conference month}
+#'   \item{sessions}{List-column of data frames, one row per session.}
+#' }
+#'
+#' \item \strong{sessions} A data frame with one row per session (Saturday AM, PM, etc.)
+#' \describe{
+#'   \item{session_name}{Session name}
+#'   \item{session_id}{Session identifier}
+#'   \item{session_url}{Suffix URL path to session (not full url)}
+#'   \item{talks}{List of dataframes, one row per talk in that session}
+#' }
+#'
+#'
+#'
+#' \item \strong{talks} A data frame with one row per talk
+#' \describe{
+#'   \item{talk_urls}{Stub urls for talk.}
+#'   \item{talk_session_id}{Talk index within session}
+#'   \item{url}{Full url path to talk.}
+#'   \item{title1}{Title.}
+#'   \item{author1}{Author Name (typically, might be missing)}
+#'   \item{author2}{Author Role (typically, might be missing)}
+#'   \item{kicker1}{Talk kicker}
+#'   \item{paragraphs}{List of dataframes, one row per paragraph in that talk}
+#' }
+#'
+#' \item \strong{paragraphs} A data frame with one row per paragraph in talk
+#' \describe{
+#'   \item{section_num}{If talk has sections, this would be the section number. Newer talks are more likely to have sections.}
+#'   \item{p_num}{Paragraph number}
+#'   \item{p_id}{Paragraph html tag (can be used to generate a url deep link). Might not be in order with p_num due to edge-case talks that use #p1-#p4 for title, author, kicker, etc.}
+#'   \item{is_header}{If a talk contains sections, those sections have headers. Header content will be a few words.}
+#'   \item{paragraph}{Text of talk. <sup></sup> html tags (superscripts/footnotes) have been stripped out.}
+#' }
+#' }
+#' @source \url{https://www.churchofjesuschrist.org/study/general-conference}
+# genconf %>% select(sessions) %>% unnest(sessions) %>% select(talks) %>% unnest(talks) %>% select(paragraphs) %>% unnest(paragraphs)
+# https://stackoverflow.com/questions/38095578/documenting-a-list-of-data-frames-with-roxygen2
+"genconf"
diff --git a/R/scrape_talk.R b/R/scrape_talk.R
@@ -153,62 +153,6 @@ extract_metadata <- function(rv_doc){
   )
 }
 
-
-
-# DEPRECATED 2021-08-30
-#' extract_metadata <- function(rv_doc) {
-#'   #' Extract title, author, and kicker from a url and return as a row in a
-#'   #' dataframe.
-#'
-#'   # the .body-block contains the speech text. But the #p anchors
-#'   # can be wrong.
-#'   # returns list of p1, p2, p3... for new talks and p2, p3, p4 for old talks
-#'   p_bodies <- rv_doc %>%
-#'     html_elements(".body-block p") %>%
-#'     html_attr("id")
-#'
-#'   # Explaining !("p1" %in% p_bodies):
-#'   #  Sometimes, the first paragraph isn't p1
-#'   #  e.g., "https://www.churchofjesuschrist.org/study/general-conference/2019/04/27homer"
-#'   #  First paragraph is "p20", then 2nd is "p1".
-#'   if ("p1" %in% p_bodies) {
-#'     # In new talks, #p1 is the paragraph text
-#'     elements <- c("#title1", "#author1", "#author2", "#kicker1")
-#'     map_dfc(elements, ~ extract_element(rv_doc = rv_doc, element = .)) %>%
-#'       rename_all(~ str_replace(., fixed("#"), "")) %>%
-#'       return()
-#'   } else if (p_bodies[1] == "p5"){
-#'     # Some talks start at p5, and p1-4 are the title, author, author and kicker
-#'     # url <- "https://www.churchofjesuschrist.org/study/liahona/2020/11/15cook?lang=eng"
-#'     # rv_doc <- read_html(url)
-#'     elements <- c("#p1", "#p2", "#p3", "#p4")
-#'     df <- map_dfc(elements, ~ extract_element(rv_doc = rv_doc, element = .)) %>%
-#'       rename_all(~ str_replace(., fixed("#"), "")) %>%
-#'       rename(title1 = p1,
-#'              author1 = p2,
-#'              author2 = p3,
-#'              kicker1 = p4)
-#'   } else {
-#'     # In older talks, #p1 is the author block
-#'     elements <- c("#title1", "#author1", "#author2", "#kicker1", "#p1")
-#'     df <- map_dfc(elements, ~ extract_element(rv_doc = rv_doc, element = .)) %>%
-#'       rename_all(~ str_replace(., fixed("#"), ""))
-#'
-#'     if (is.na(df$author1)) {
-#'       df$author1 <- df$p1
-#'     } else {
-#'       url <- extract_url_from_rv_doc(rv_doc)
-#'       message(
-#'         "#p1 not in .body-block p: ", url,
-#'         "\nPulled #p1 for metadata but author1 is not null."
-#'       )
-#'     }
-#'     df %>%
-#'       select(-p1) %>%
-#'       return()
-#'   }
-#' }
-
 #' Scrape general conference talk
 #'
 #' @param url general conference https

diff --git a/R/scrape_talk_urls.R b/R/scrape_talk_urls.R
@@ -15,6 +15,7 @@ parse_path_for_name <- function(path) {
 }
 
 #' Parse Session URLs
+#'
 #' Take a vector of session hrefs, use the first value
 #' as the session name, then the other values as the
 #' session talks
@@ -39,6 +40,7 @@ parse_session_urls <- function(session_hrefs) {
 }
 
 #' Scrape HTML doc map from Conference URL
+#'
 #' Given a year and a month, pull the entire .doc-map class
 #' object from the Conference URL. This will be parsed
 #' by downstream objects
@@ -90,9 +92,8 @@ extract_session_hrefs <- function(html_docmap, session_id) {
 #' For a given year-month conference, return a nested tibble of all sessions
 #' with a tibble-column containing the dataframes
 #'
-#'
-#' @param year
-#' @param month
+#' @param year Conference year
+#' @param month Conference month
 #'
 #' @return tibble
 #' @export

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -3,7 +3,16 @@ template:
   params:
     bootswatch: cerulean
 
+
+
 reference:
+- title: Datasets
+  desc: >
+    General conference datasets. `data("genconf")`
+- contents:
+  - "genconf"
+
+
 - title: Scrape
   desc: >
     Scrape general conference talks urls and contents. Any function with

diff --git a/data/genconf.rda b/data/genconf.rda
diff --git a/data/sessions/201904.rds b/data/sessions/201904.rds
diff --git a/data/sessions/201910.rds b/data/sessions/201910.rds
diff --git a/data/sessions/202004.rds b/data/sessions/202004.rds
diff --git a/data/sessions/202010.rds b/data/sessions/202010.rds
diff --git a/data/sessions/202104.rds b/data/sessions/202104.rds
diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html
diff --git a/docs/articles/index.html b/docs/articles/index.html
diff --git a/docs/articles/introduction.html b/docs/articles/introduction.html
diff --git a/docs/authors.html b/docs/authors.html
diff --git a/docs/index.html b/docs/index.html
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
@@ -5,5 +5,5 @@ articles:
   how-to-scrape: how-to-scrape.html
   introduction: introduction.html
   new-sessions: new-sessions.html
-last_built: 2021-08-30T21:52Z
+last_built: 2021-08-31T00:11Z
 
diff --git a/docs/reference/index.html b/docs/reference/index.html
diff --git a/man/extract_body_paragraphs_df.Rd b/man/extract_body_paragraphs_df.Rd
diff --git a/man/extract_url_from_rv_doc.Rd b/man/extract_url_from_rv_doc.Rd