diff --git a/.gitignore b/.gitignore index 2947144..ee09efe 100755 --- a/.gitignore +++ b/.gitignore @@ -38,4 +38,3 @@ vignettes/*.pdf # R Environment Variables .Renviron inst/doc -docs diff --git a/docs/LICENSE.html b/docs/LICENSE.html new file mode 100644 index 0000000..8668c3f --- /dev/null +++ b/docs/LICENSE.html @@ -0,0 +1,169 @@ + + + + + + + + +MIT License • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+ +

Copyright (c) 2021 Bryan Whiting

+

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

+

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

+

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+
+ +
+ + + +
+ + + + +
+ + + + + + + + diff --git a/docs/articles/developers.html b/docs/articles/developers.html new file mode 100644 index 0000000..ea62791 --- /dev/null +++ b/docs/articles/developers.html @@ -0,0 +1,234 @@ + + + + + + + +Developer Notes • generalconference + + + + + + + + + + +
+
+ + + + +
+
+ + + + +
+

+Developer notes

+

If you’re new to package development, this could be helpful to read.

+
+
+

+Initializing a package

+ +

Starting a new package/one-time commands.

+
+devtools::create('generalconference')
+devtools::install()
+usethis::use_mit_license("My Name")
+# specify packages you want to use
+usethis::use_package('dplyr')
+usethis::use_package('rvest')
+# builds data-raw/ folder
+usethis::use_data_raw()
+
+
+

+Workflow

+ +
+# build functions in R and save, builds a test file
+usethis::use_r(name="new_func")
+usethis::use_test()
+
+# Add new package to DESCRIPTION as necessary
+usethis::use_package('xxxx')
+
+# Once function is written, load it. You'll run `load_all()` multiple times.
+devtools::load_all()
+devtools::test()
+devtools::check()   # checks package
+
+
+

+Build documentation

+
    +
  • First, update _pkgdown.yml with documents
  • +
  • Second, run the following steps
  • +
+
+# Add new documentation
+usethis::use_vignette('introduction') # add a vignette
+
+# (optional, one-off steps) Build individual files
+devtools::run_examples()    # runs the examples in the package documentation
+devtools::build_vignettes() # builds the vignettes
+pkgdown::build_articles()   # builds the pkgdown articles
+pkgdown::build_reference()  # edit reference in _pkgdown.yml reference: section
+
+# Prepare the package
+devtools::document()   # generates NAMESPACE from documentation. Exports functions.
+covr::report()         # run the coverage test
+devtools::test()       # run unit tests
+devtools::check()      # check the package
+devtools::build()      # build the package
+pkgdown::build_site()  # Build the r package documentation
+
+
+

+Nested data

+

The data are nested to minimize redundancy, but they can easily be unnested.

+

Example of nested data:

+
+library(dplyr)
+#> 
+#> Attaching package: 'dplyr'
+#> The following objects are masked from 'package:stats':
+#> 
+#>     filter, lag
+#> The following objects are masked from 'package:base':
+#> 
+#>     intersect, setdiff, setequal, union
+mtcars %>%
+  select(mpg, disp, am, vs) %>%
+  tidyr::nest(data = c(vs, c(mpg, disp)))
+#> # A tibble: 2 × 2
+#>      am data             
+#>   <dbl> <list>           
+#> 1     1 <tibble [13 × 3]>
+#> 2     0 <tibble [19 × 3]>
+
+
+

+Docker

+

All the packages and command line tools are available using the docker container below.

+
docker pull bryanwhiting/r_env:latest
+
+
+

+Github API

+

The GitHub command-line tool (gh) makes it easy to manage issues from the terminal.

+
gh issue create
+gh issue create --title "Some title of a new bug"
+gh issue create --label "bug"
+gh issue list
+gh issue view 4 
+# Todo: parse this and re-submit
+gh issue view 4 --json body
+
+
+ + + +
+ + + + +
+ + + + + + diff --git a/docs/articles/developers_files/header-attrs-2.10/header-attrs.js b/docs/articles/developers_files/header-attrs-2.10/header-attrs.js new file mode 100644 index 0000000..dd57d92 --- /dev/null +++ b/docs/articles/developers_files/header-attrs-2.10/header-attrs.js @@ -0,0 +1,12 @@ +// Pandoc 2.9 adds attributes on both header and div. We remove the former (to +// be compatible with the behavior of Pandoc < 2.8). +document.addEventListener('DOMContentLoaded', function(e) { + var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); + var i, h, a; + for (i = 0; i < hs.length; i++) { + h = hs[i]; + if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 + a = h.attributes; + while (a.length > 0) h.removeAttribute(a[0].name); + } +}); diff --git a/docs/articles/example-analysis.html b/docs/articles/example-analysis.html new file mode 100644 index 0000000..1d3ba40 --- /dev/null +++ b/docs/articles/example-analysis.html @@ -0,0 +1,199 @@ + + + + + + + +Example Analysis • generalconference + + + + + + + + + + +
+
+ + + + +
+
+ + + + +
+

+Load the data and analyze

+
+library(generalconference)
+#> Loading required package: dplyr
+#> 
+#> Attaching package: 'dplyr'
+#> The following objects are masked from 'package:stats':
+#> 
+#>     filter, lag
+#> The following objects are masked from 'package:base':
+#> 
+#>     intersect, setdiff, setequal, union
+#> Loading required package: glue
+#> 
+#> Attaching package: 'glue'
+#> The following object is masked from 'package:dplyr':
+#> 
+#>     collapse
+#> Loading required package: furrr
+#> Loading required package: future
+#> Loading required package: purrr
+#> Loading required package: stringr
+#> Loading required package: readr
+#> Loading required package: rvest
+#> 
+#> Attaching package: 'rvest'
+#> The following object is masked from 'package:readr':
+#> 
+#>     guess_encoding
+#> Loading required package: tictoc
+#> Loading required package: tidyr
+#> Loading required package: xml2
+library(dplyr)
+data("genconf")
+head(genconf)
+#> # A tibble: 6 × 3
+#>    year month sessions        
+#>   <dbl> <dbl> <list>          
+#> 1  1971     4 <tibble [7 × 4]>
+#> 2  1971    10 <tibble [7 × 4]>
+#> 3  1972     4 <tibble [7 × 4]>
+#> 4  1972    10 <tibble [7 × 4]>
+#> 5  1973     4 <tibble [7 × 4]>
+#> 6  1973    10 <tibble [7 × 4]>
+
+df <- genconf
+

How many conferences have there been since 1971?

+
+df %>%
+  count()
+#> # A tibble: 1 × 1
+#>       n
+#>   <int>
+#> 1   101
+

How many sessions have there been?

+
+df %>%
+  unnest(sessions) %>%
+  count()
+#> # A tibble: 1 × 1
+#>       n
+#>   <int>
+#> 1   613
+

How many talks have there been since 1971?

+
+df %>%
+  unnest(sessions) %>%
+  unnest(talks) %>%
+  count()
+#> # A tibble: 1 × 1
+#>       n
+#>   <int>
+#> 1  3883
+
+
+ + + +
+ + + + +
+ + + + + + diff --git a/docs/articles/example-analysis_files/header-attrs-2.10/header-attrs.js b/docs/articles/example-analysis_files/header-attrs-2.10/header-attrs.js new file mode 100644 index 0000000..dd57d92 --- /dev/null +++ b/docs/articles/example-analysis_files/header-attrs-2.10/header-attrs.js @@ -0,0 +1,12 @@ +// Pandoc 2.9 adds attributes on both header and div. We remove the former (to +// be compatible with the behavior of Pandoc < 2.8). +document.addEventListener('DOMContentLoaded', function(e) { + var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); + var i, h, a; + for (i = 0; i < hs.length; i++) { + h = hs[i]; + if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 + a = h.attributes; + while (a.length > 0) h.removeAttribute(a[0].name); + } +}); diff --git a/docs/articles/how-to-scrape.html b/docs/articles/how-to-scrape.html new file mode 100644 index 0000000..87533ea --- /dev/null +++ b/docs/articles/how-to-scrape.html @@ -0,0 +1,564 @@ + + + + + + + +How To Scrape General Conference Talks • generalconference + + + + + + + + + + +
+
+ + + + +
+
+ + + + +
+

+Inspecting XML nodes

+
+library(rvest)
+library(dplyr)
+#> 
+#> Attaching package: 'dplyr'
+#> The following objects are masked from 'package:stats':
+#> 
+#>     filter, lag
+#> The following objects are masked from 'package:base':
+#> 
+#>     intersect, setdiff, setequal, union
+library(xml2)
+
+rv_doc <- rvest::read_html("https://www.churchofjesuschrist.org/study/liahona/2020/11/15cook?lang=eng")
+rv_doc %>%
+  html_elements(".body-block") %>%
+  xml2::html_structure()
+#> [[1]]
+#> <div.body-block>
+#>   <p#p5 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>   <p#p6 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p7 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p8 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p9 [data-aid]>
+#>     {text}
+#>   <p#p42 [data-aid]>
+#>     {text}
+#>   <p#p10 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p43 [data-aid]>
+#>     {text}
+#>   <p#p44 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>   <p#p11 [data-aid]>
+#>     {text}
+#>     <span.page-break [data-page]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p12 [data-aid]>
+#>     <em>
+#>       {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p13 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p14 [data-aid]>
+#>     <em>
+#>       {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p15 [data-aid]>
+#>     {text}
+#>   <p#p16 [data-aid]>
+#>     {text}
+#>     <a.scripture-ref [href]>
+#>       {text}
+#>     {text}
+#>   <p#p17 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p18 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p19 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>   <p#p38 [data-aid]>
+#>     {text}
+#>   <p#p39 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p20 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>   <p#p21 [data-aid]>
+#>     {text}
+#>     <span.page-break [data-page]>
+#>     {text}
+#>   <p#p22 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>   <p#p23 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p24 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>     <em>
+#>       {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>   <p#p40 [data-aid]>
+#>     {text}
+#>   <p#p41 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p25 [data-aid]>
+#>     {text}
+#>     <a.scripture-ref [href]>
+#>       {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p26 [data-aid]>
+#>     {text}
+#>   <p#p27 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p28 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>   <p#p29 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>   <p#p30 [data-aid]>
+#>     {text}
+#>     <a.scripture-ref [href]>
+#>       {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p31 [data-aid]>
+#>     {text}
+#>     <a.scripture-ref [href]>
+#>       {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p32 [data-aid]>
+#>     {text}
+#>   <p#p33 [data-aid]>
+#>     {text}
+#>     <span.page-break [data-page]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p34 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>     {text}
+#>   <p#p35 [data-aid]>
+#>     {text}
+#>   <p#p36 [data-aid]>
+#>     {text}
+#>     <a.note-ref [to, href, data-scroll-id]>
+#>       <sup.marker>
+#>         {text}
+#>   <p#p37 [data-aid]>
+#>     {text}
+

Explore node 1:

+
+rv_doc %>%
+  html_elements(".body-block") %>%
+  xml2::xml_child(1)
+#> {html_node}
+#> <p data-aid="144618637" id="p5">
+#> [1] <a class="note-ref" to="[object Object]" href="#note1" data-scroll-id="no ...
+

Explore node 2:

+
+rv_doc %>%
+  html_elements(".body-block") %>%
+  xml2::xml_child(2)
+#> {html_node}
+#> <p data-aid="144618639" id="p6">
+#> [1] <a class="note-ref" to="[object Object]" href="#note2" data-scroll-id="no ...
+
+rv_doc %>%
+  html_elements(".body-block") %>%
+  xml_contents()
+#> {xml_nodeset (40)}
+#>  [1] <p data-aid="144618637" id="p5">Righteousness and unity are profoundly s ...
+#>  [2] <p data-aid="144618639" id="p6">As a young man not of our faith, General ...
+#>  [3] <p data-aid="144618644" id="p7">In 1872, General Kane, his talented wife ...
+#>  [4] <p data-aid="144618648" id="p8">During the trip they stayed in Fillmore  ...
+#>  [5] <p data-aid="144618651" id="p9">Elizabeth wrote that as Matilda was prep ...
+#>  [6] <p data-aid="144618656" id="p42">Matilda’s son’s reply was, “She said ‘T ...
+#>  [7] <p data-aid="144618659" id="p10">Elizabeth asked, “Will she really do th ...
+#>  [8] <p data-aid="144618665" id="p43">Matilda’s son answered, “Mother will se ...
+#>  [9] <p data-aid="144618668" id="p44">And so she did, and “they ate with perf ...
+#> [10] <p data-aid="144618672" id="p11">As leaders, we are not under the illusi ...
+#> [11] <p data-aid="144618676" id="p12"><em>Righteousness</em> is a broad, comp ...
+#> [12] <p data-aid="144618679" id="p13">Being righteous is not dependent on eac ...
+#> [13] <p data-aid="144618685" id="p14"><em>Unity</em> is also a broad, compreh ...
+#> [14] <p data-aid="144618690" id="p15">The context for my message is the contr ...
+#> [15] <p data-aid="144618696" id="p16">It has been 200 years since the Father  ...
+#> [16] <p data-aid="144618701" id="p17">The historical record we read in 4 Neph ...
+#> [17] <p data-aid="144618706" id="p18">With respect to unity, 4 Nephi reads, “ ...
+#> [18] <p data-aid="144618710" id="p19">Unfortunately, 4 Nephi then describes a ...
+#> [19] <p data-aid="144618715" id="p38">“But O my son, how can a people like th ...
+#> [20] <p data-aid="144618720" id="p39">“How can we expect that God will stay h ...
+#> ...
+
+rv_doc %>%
+  html_elements(".body-block p")
+#> {xml_nodeset (40)}
+#>  [1] <p data-aid="144618637" id="p5">Righteousness and unity are profoundly s ...
+#>  [2] <p data-aid="144618639" id="p6">As a young man not of our faith, General ...
+#>  [3] <p data-aid="144618644" id="p7">In 1872, General Kane, his talented wife ...
+#>  [4] <p data-aid="144618648" id="p8">During the trip they stayed in Fillmore  ...
+#>  [5] <p data-aid="144618651" id="p9">Elizabeth wrote that as Matilda was prep ...
+#>  [6] <p data-aid="144618656" id="p42">Matilda’s son’s reply was, “She said ‘T ...
+#>  [7] <p data-aid="144618659" id="p10">Elizabeth asked, “Will she really do th ...
+#>  [8] <p data-aid="144618665" id="p43">Matilda’s son answered, “Mother will se ...
+#>  [9] <p data-aid="144618668" id="p44">And so she did, and “they ate with perf ...
+#> [10] <p data-aid="144618672" id="p11">As leaders, we are not under the illusi ...
+#> [11] <p data-aid="144618676" id="p12"><em>Righteousness</em> is a broad, comp ...
+#> [12] <p data-aid="144618679" id="p13">Being righteous is not dependent on eac ...
+#> [13] <p data-aid="144618685" id="p14"><em>Unity</em> is also a broad, compreh ...
+#> [14] <p data-aid="144618690" id="p15">The context for my message is the contr ...
+#> [15] <p data-aid="144618696" id="p16">It has been 200 years since the Father  ...
+#> [16] <p data-aid="144618701" id="p17">The historical record we read in 4 Neph ...
+#> [17] <p data-aid="144618706" id="p18">With respect to unity, 4 Nephi reads, “ ...
+#> [18] <p data-aid="144618710" id="p19">Unfortunately, 4 Nephi then describes a ...
+#> [19] <p data-aid="144618715" id="p38">“But O my son, how can a people like th ...
+#> [20] <p data-aid="144618720" id="p39">“How can we expect that God will stay h ...
+#> ...
+
+rv_doc %>%
+  html_elements(".body-block") %>%
+  html_children()
+#> {xml_nodeset (40)}
+#>  [1] <p data-aid="144618637" id="p5">Righteousness and unity are profoundly s ...
+#>  [2] <p data-aid="144618639" id="p6">As a young man not of our faith, General ...
+#>  [3] <p data-aid="144618644" id="p7">In 1872, General Kane, his talented wife ...
+#>  [4] <p data-aid="144618648" id="p8">During the trip they stayed in Fillmore  ...
+#>  [5] <p data-aid="144618651" id="p9">Elizabeth wrote that as Matilda was prep ...
+#>  [6] <p data-aid="144618656" id="p42">Matilda’s son’s reply was, “She said ‘T ...
+#>  [7] <p data-aid="144618659" id="p10">Elizabeth asked, “Will she really do th ...
+#>  [8] <p data-aid="144618665" id="p43">Matilda’s son answered, “Mother will se ...
+#>  [9] <p data-aid="144618668" id="p44">And so she did, and “they ate with perf ...
+#> [10] <p data-aid="144618672" id="p11">As leaders, we are not under the illusi ...
+#> [11] <p data-aid="144618676" id="p12"><em>Righteousness</em> is a broad, comp ...
+#> [12] <p data-aid="144618679" id="p13">Being righteous is not dependent on eac ...
+#> [13] <p data-aid="144618685" id="p14"><em>Unity</em> is also a broad, compreh ...
+#> [14] <p data-aid="144618690" id="p15">The context for my message is the contr ...
+#> [15] <p data-aid="144618696" id="p16">It has been 200 years since the Father  ...
+#> [16] <p data-aid="144618701" id="p17">The historical record we read in 4 Neph ...
+#> [17] <p data-aid="144618706" id="p18">With respect to unity, 4 Nephi reads, “ ...
+#> [18] <p data-aid="144618710" id="p19">Unfortunately, 4 Nephi then describes a ...
+#> [19] <p data-aid="144618715" id="p38">“But O my son, how can a people like th ...
+#> [20] <p data-aid="144618720" id="p39">“How can we expect that God will stay h ...
+#> ...
+
+rv_doc %>%
+  html_elements("header")
+#> {xml_nodeset (7)}
+#> [1] <header class="panelHeader-2k7Jd backToAll-1PgB6"><a class="backText-1xON ...
+#> [2] <header class="panelHeader-2k7Jd contentHead-3F0ox"><button class="sc-1g7 ...
+#> [3] <header class="bookmarkHeader-2Bn20"><span class="bookmarkManagerTitle-1U ...
+#> [4] <header class="downloadHead-3O2wO">Downloads</header>
+#> [5] <header class="settingsHead-3iDND">Footnotes</header>
+#> [6] <header class="settingsHead-3iDND">Theme</header>
+#> [7] <header><span class="page-break" data-page="18"></span><div class="bvqtyr ...
+
+rv_doc %>%
+  html_elements(".body") %>%
+  html_elements("header") %>%
+  html_text2()
+#> [1] "Hearts Knit in Righteousness and Unity\n\nBy Elder Quentin L. Cook\n\nOf the Quorum of the Twelve Apostles\n\nAt this 200-year hinge point in our Church history, let us commit ourselves to live righteously and be united as never before."
+

Get specific paragraph by id:

+
+rv_doc %>%
+  html_elements("#p5")
+#> {xml_nodeset (1)}
+#> [1] <p data-aid="144618637" id="p5">Righteousness and unity are profoundly si ...
+

Get multiple things at the same time (headers and paragraphs):

+
+rv_doc %>%
+  html_elements(".body-block h2, .body-block p")
+#> {xml_nodeset (40)}
+#>  [1] <p data-aid="144618637" id="p5">Righteousness and unity are profoundly s ...
+#>  [2] <p data-aid="144618639" id="p6">As a young man not of our faith, General ...
+#>  [3] <p data-aid="144618644" id="p7">In 1872, General Kane, his talented wife ...
+#>  [4] <p data-aid="144618648" id="p8">During the trip they stayed in Fillmore  ...
+#>  [5] <p data-aid="144618651" id="p9">Elizabeth wrote that as Matilda was prep ...
+#>  [6] <p data-aid="144618656" id="p42">Matilda’s son’s reply was, “She said ‘T ...
+#>  [7] <p data-aid="144618659" id="p10">Elizabeth asked, “Will she really do th ...
+#>  [8] <p data-aid="144618665" id="p43">Matilda’s son answered, “Mother will se ...
+#>  [9] <p data-aid="144618668" id="p44">And so she did, and “they ate with perf ...
+#> [10] <p data-aid="144618672" id="p11">As leaders, we are not under the illusi ...
+#> [11] <p data-aid="144618676" id="p12"><em>Righteousness</em> is a broad, comp ...
+#> [12] <p data-aid="144618679" id="p13">Being righteous is not dependent on eac ...
+#> [13] <p data-aid="144618685" id="p14"><em>Unity</em> is also a broad, compreh ...
+#> [14] <p data-aid="144618690" id="p15">The context for my message is the contr ...
+#> [15] <p data-aid="144618696" id="p16">It has been 200 years since the Father  ...
+#> [16] <p data-aid="144618701" id="p17">The historical record we read in 4 Neph ...
+#> [17] <p data-aid="144618706" id="p18">With respect to unity, 4 Nephi reads, “ ...
+#> [18] <p data-aid="144618710" id="p19">Unfortunately, 4 Nephi then describes a ...
+#> [19] <p data-aid="144618715" id="p38">“But O my son, how can a people like th ...
+#> [20] <p data-aid="144618720" id="p39">“How can we expect that God will stay h ...
+#> ...
+
+
+

+Scratch code

+
+header_ids <- rv_doc %>%
+  html_elements(".body-block h2") %>%
+  html_attr("id")
+
+p_ids <- rv_doc %>%
+  html_elements(".body-block p") %>%
+  html_element("#p1")
+
+xm_contents <- rv_doc %>%
+  html_elements(".body-block") %>%
+  xml_contents()
+
+rv_doc %>%
+  html_elements(".body-block") %>%
+  # html_children() %>%
+  xml_child(1) %>%
+  xml_contents() %>%
+  html_elements("p")
+#> {xml_nodeset (0)}
+
+xm_contents %>%
+  xml_child(1) %>%
+  html_text()
+#> [1] "1"
+

Scrape metadata for url

+
+rv_doc %>%
+  html_elements("head") %>%
+  html_elements("meta")
+#> {xml_nodeset (10)}
+#>  [1] <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n
+#>  [2] <meta charset="utf-8">\n
+#>  [3] <meta name="viewport" content="width=device-width,initial-scale=1">\n
+#>  [4] <meta data-react-helmet="true" name="Search.doc-aid" content="144618619">\n
+#>  [5] <meta data-react-helmet="true" name="title" content="Hearts Knit in Righ ...
+#>  [6] <meta data-react-helmet="true" name="description" content="Elder Cook en ...
+#>  [7] <meta data-react-helmet="true" property="og:image" content="https://medi ...
+#>  [8] <meta data-react-helmet="true" property="og:title" content="Hearts Knit  ...
+#>  [9] <meta data-react-helmet="true" property="og:type" content="website">\n
+#> [10] <meta data-react-helmet="true" property="og:url" content="https://www.ch ...
+
+
+ + + +
+ + + + +
+ + + + + + diff --git a/docs/articles/how-to-scrape_files/header-attrs-2.10/header-attrs.js b/docs/articles/how-to-scrape_files/header-attrs-2.10/header-attrs.js new file mode 100644 index 0000000..dd57d92 --- /dev/null +++ b/docs/articles/how-to-scrape_files/header-attrs-2.10/header-attrs.js @@ -0,0 +1,12 @@ +// Pandoc 2.9 adds attributes on both header and div. We remove the former (to +// be compatible with the behavior of Pandoc < 2.8). +document.addEventListener('DOMContentLoaded', function(e) { + var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); + var i, h, a; + for (i = 0; i < hs.length; i++) { + h = hs[i]; + if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 + a = h.attributes; + while (a.length > 0) h.removeAttribute(a[0].name); + } +}); diff --git a/docs/articles/new-sessions.html b/docs/articles/new-sessions.html new file mode 100644 index 0000000..123bb3a --- /dev/null +++ b/docs/articles/new-sessions.html @@ -0,0 +1,182 @@ + + + + + + + +Scraping Sessions • generalconference + + + + + + + + + + +
+
+ + + + +
+
+ + + + +
+library(generalconference)
+#> Loading required package: dplyr
+#> 
+#> Attaching package: 'dplyr'
+#> The following objects are masked from 'package:stats':
+#> 
+#>     filter, lag
+#> The following objects are masked from 'package:base':
+#> 
+#>     intersect, setdiff, setequal, union
+#> Loading required package: glue
+#> 
+#> Attaching package: 'glue'
+#> The following object is masked from 'package:dplyr':
+#> 
+#>     collapse
+#> Loading required package: furrr
+#> Loading required package: future
+#> Loading required package: purrr
+#> Loading required package: stringr
+#> Loading required package: readr
+#> Loading required package: rvest
+#> 
+#> Attaching package: 'rvest'
+#> The following object is masked from 'package:readr':
+#> 
+#>     guess_encoding
+#> Loading required package: tictoc
+#> Loading required package: tidyr
+#> Loading required package: xml2
+

Use the following code to download a session one-off:

+
+# Define the file path
+year = 2021
+month = 4
+mo_str = "04"
+path=glue("/home/rstudio/generalconference/data/sessions/{year}{mo_str}.rds")
+
+generalconference::scrape_conference_talks(year, month, path)
+
+# Read the dataframe in
+df_conf <- readr::read_rds(path)
+df_conf %>%
+  unnest(sessions) %>%
+  unnest(talks)
+#> # A tibble: 37 × 13
+#>     year month session_name  session_id session_url   talk_urls  talk_session_id
+#>    <dbl> <dbl> <chr>              <int> <chr>         <chr>                <int>
+#>  1  2021     4 Saturday Mor…          1 /study/gener… /study/ge…               1
+#>  2  2021     4 Saturday Mor…          1 /study/gener… /study/ge…               2
+#>  3  2021     4 Saturday Mor…          1 /study/gener… /study/ge…               3
+#>  4  2021     4 Saturday Mor…          1 /study/gener… /study/ge…               4
+#>  5  2021     4 Saturday Mor…          1 /study/gener… /study/ge…               5
+#>  6  2021     4 Saturday Mor…          1 /study/gener… /study/ge…               6
+#>  7  2021     4 Saturday Mor…          1 /study/gener… /study/ge…               7
+#>  8  2021     4 Saturday Aft…          2 /study/gener… /study/ge…               1
+#>  9  2021     4 Saturday Aft…          2 /study/gener… /study/ge…               2
+#> 10  2021     4 Saturday Aft…          2 /study/gener… /study/ge…               3
+#> # … with 27 more rows, and 6 more variables: url <chr>, title1 <chr>,
+#> #   author1 <chr>, author2 <chr>, kicker1 <chr>, paragraphs <list>
+
+ + + +
+ + + + +
+ + + + + + diff --git a/docs/articles/new-sessions_files/header-attrs-2.10/header-attrs.js b/docs/articles/new-sessions_files/header-attrs-2.10/header-attrs.js new file mode 100644 index 0000000..dd57d92 --- /dev/null +++ b/docs/articles/new-sessions_files/header-attrs-2.10/header-attrs.js @@ -0,0 +1,12 @@ +// Pandoc 2.9 adds attributes on both header and div. We remove the former (to +// be compatible with the behavior of Pandoc < 2.8). +document.addEventListener('DOMContentLoaded', function(e) { + var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); + var i, h, a; + for (i = 0; i < hs.length; i++) { + h = hs[i]; + if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 + a = h.attributes; + while (a.length > 0) h.removeAttribute(a[0].name); + } +}); diff --git a/docs/news/index.html b/docs/news/index.html new file mode 100644 index 0000000..cfa47da --- /dev/null +++ b/docs/news/index.html @@ -0,0 +1,168 @@ + + + + + + + + +Changelog • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

+generalconference 0.1.0

+
    +
  • Added a NEWS.md file to track changes to the package.
  • +
+
+
+ + + +
+ + + +
+ + + + + + + + diff --git a/docs/reference/extract_body_paragraphs_df.html b/docs/reference/extract_body_paragraphs_df.html new file mode 100644 index 0000000..261ead5 --- /dev/null +++ b/docs/reference/extract_body_paragraphs_df.html @@ -0,0 +1,180 @@ + + + + + + + + +Produce paragraphs — extract_body_paragraphs_df • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

Produce paragraphs

+
+ +
extract_body_paragraphs_df(rv_doc)
+ +

Arguments

+ + + + + + +
rv_doc

rvest::read_html() document

+ +

Value

+ +

dataframe with paragraphs

+ +
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/extract_element.html b/docs/reference/extract_element.html new file mode 100644 index 0000000..27f442c --- /dev/null +++ b/docs/reference/extract_element.html @@ -0,0 +1,184 @@ + + + + + + + + +Extract html document elements — extract_element • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

rvest::read_html("https://www.churchofjesuschrist.org/study/general-conference/1971/04/kingdom-of-god?lang=eng")

+
+ +
extract_element(rv_doc, element)
+ +

Arguments

+ + + + + + + + + + +
rv_doc

rvest::read_html() document

element

class you want to extract (use Selector Gadget)

+ +

Value

+ +

dataframe

+ +
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/extract_metadata.html b/docs/reference/extract_metadata.html new file mode 100644 index 0000000..b90f0d3 --- /dev/null +++ b/docs/reference/extract_metadata.html @@ -0,0 +1,172 @@ + + + + + + + + +Extract title, author, and kicker from a url and return as a row in a +dataframe. — extract_metadata • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

Extract title, author, and kicker from a url and return as a row in a +dataframe.

+
+ +
extract_metadata(html_document, url)
+ + + +
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/extract_session_hrefs.html b/docs/reference/extract_session_hrefs.html new file mode 100644 index 0000000..4edb514 --- /dev/null +++ b/docs/reference/extract_session_hrefs.html @@ -0,0 +1,194 @@ + + + + + + + + +Extract Session hrefs — extract_session_hrefs • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

Extract Session hrefs

+
+ +
extract_session_hrefs(html_docmap, session_id)
+ +

Arguments

+ + + + + + + + + + +
html_docmap

An rvest docmap scrape from +scrape_conference_html_doc_map()

session_id

Integer for session you want to extract

+ +

Value

+ +

hrefs for the session, which includes the Session href in addition +to the talk refs.

+ +

Examples

+
scrape_conference_html_doc_map(2019, 4) %>% + extract_session_hrefs(session_id = 1) %>% + parse_session_urls() +
#> # A tibble: 1 × 3 +#> session_name session_url session_talk_ur… +#> <chr> <chr> <list> +#> 1 Saturday Morning Session /study/general-conference/2019/04/s… <tibble [6 × 2]>
+
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/extract_url_from_rv_doc.html b/docs/reference/extract_url_from_rv_doc.html new file mode 100644 index 0000000..90dd334 --- /dev/null +++ b/docs/reference/extract_url_from_rv_doc.html @@ -0,0 +1,180 @@ + + + + + + + + +Extract url from rv_doc — extract_url_from_rv_doc • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

Extract url from rv_doc

+
+ +
extract_url_from_rv_doc(rv_doc)
+ +

Arguments

+ + + + + + +
rv_doc

rvest::read_html() object

+ +

Value

+ +

string

+ +
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/genconf.html b/docs/reference/genconf.html new file mode 100644 index 0000000..3898af1 --- /dev/null +++ b/docs/reference/genconf.html @@ -0,0 +1,209 @@ + + + + + + + + +General Conference Data — genconf • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

A dataset containing all general conference talks back to 1971.

+
+ +
genconf
+ + +

Format

+ +

genconf: A 4-level nested data frame with nestings for conference, session, talk, and paragraph.

    +
  1. genconf A data frame with one row per conference (year + month)

    +
    year

    Conference year

    +
    month

    Conference month

    +
    sessions

    List dataframe with one row per session.

    + +
  2. +
  3. sessions A data frame with one row per session (Saturday AM, PM, etc.)

    +
    session_name

    Name of the session (e.g. "Saturday Morning Session")

    +
    session_id

    Integer index of the session within the conference

    +
    session_url

    Suffix URL path to session (not full url)

    +
    talks

    List of dataframes, one row per talk in that session

    + +
  4. +
  5. talks A data frame with one row per talk

    +
    talk_urls

    Stub urls for talk.

    +
    talk_session_id

    Talk index within session

    +
    url

    Full url path to talk.

    +
    title1

    Title.

    +
    author1

    Author Name (typically, might be missing)

    +
    author2

    Author Role (typically, might be missing)

    +
    kicker1

    Talk kicker

    +
    paragraphs

    List of dataframes, one row per paragraph in that talk

    + +
  6. +
  7. paragraphs A data frame with one row per paragraph in talk

    +
    section_num

    If talk has sections, this would be the section number. Newer talks are more likely to have sections.

    +
    p_num

    Paragraph number

    +
    p_id

    Paragraph html tag (can be used to generate a url deep link). Might not be in order with p_num due to edge-case talks that use #p1-#p4 for title, author, kicker, etc.

    +
    is_header

    If a talk contains sections, those sections have headers. Header content will be a few words.

    +
    paragraph

    Text of talk. <sup></sup> html tags (superscripts/footnotes) have been stripped out.

    + +
  8. +
+ +

Source

+ +

https://www.churchofjesuschrist.org/study/general-conference

+ +
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/parse_path_for_name.html b/docs/reference/parse_path_for_name.html new file mode 100644 index 0000000..bf93b7e --- /dev/null +++ b/docs/reference/parse_path_for_name.html @@ -0,0 +1,183 @@ + + + + + + + + +Parse path for name — parse_path_for_name • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

Parse path for name

+
+ +
parse_path_for_name(path)
+ +

Arguments

+ + + + + + +
path

file path with no extension

+ +

Value

+ +

Title-case string (each word capitalized, hyphens replaced by spaces)

+ +

Examples

+
parse_path_for_name(path = "/path/to/hello-world-") +
#> [1] "Hello World"
+
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/parse_session_urls.html b/docs/reference/parse_session_urls.html new file mode 100644 index 0000000..552d535 --- /dev/null +++ b/docs/reference/parse_session_urls.html @@ -0,0 +1,184 @@ + + + + + + + + +Parse Session URLs — parse_session_urls • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

Take a vector of session hrefs, use the first value +as the session name, then the other values as the +session talks

+
+ +
parse_session_urls(session_hrefs)
+ +

Arguments

+ + + + + + +
session_hrefs

vector

+ +

Value

+ +

nested tibble

+ +
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/parse_url.html b/docs/reference/parse_url.html new file mode 100644 index 0000000..2f6db19 --- /dev/null +++ b/docs/reference/parse_url.html @@ -0,0 +1,184 @@ + + + + + + + + +Scrape an individual General Conference URL — parse_url • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

Helps scrape a talk

+
+ +
parse_url(url)
+ +

Arguments

+ + + + + + +
url

raw url

+ +

Value

+ +

string

+

Details

+ +

See unit tests for edge case urls. +Simple string extractor. Removes ?lang=eng and other stuff

+ +
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/scrape_conference_html_doc_map.html b/docs/reference/scrape_conference_html_doc_map.html new file mode 100644 index 0000000..52b0f05 --- /dev/null +++ b/docs/reference/scrape_conference_html_doc_map.html @@ -0,0 +1,193 @@ + + + + + + + + +Scrape HTML doc map from Conference URL — scrape_conference_html_doc_map • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

Given a year and a month, pull the entire .doc-map class +object from the Conference URL. This will be parsed +by downstream objects

+
+ +
scrape_conference_html_doc_map(year, month)
+ +

Arguments

+ + + + + + + + + + +
year

Year (integer)

month

Month (integer)

+ +

Value

+ +

Rvest object

+ +

Examples

+
scrape_conference_html_doc_map(2017, 4) +scrape_conference_html_doc_map(1971, 10) +scrape_conference_html_doc_map(1985, 10) +
+
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/scrape_conference_talks.html b/docs/reference/scrape_conference_talks.html new file mode 100644 index 0000000..16a2213 --- /dev/null +++ b/docs/reference/scrape_conference_talks.html @@ -0,0 +1,184 @@ + + + + + + + + +Scrapes all conference talks for a sessions — scrape_conference_talks • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

For one-off sessions or debugging, see new-sessions.Rmd.

+
+ +
scrape_conference_talks(year, month, path, loop_method = 1)
+ +

Arguments

+ + + + + + + + + + +
year

Year

month

Month

+ +

Value

+ +

Writes out session to /data/sessions/<year><month>.rds

+ +
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/scrape_conference_urls.html b/docs/reference/scrape_conference_urls.html new file mode 100644 index 0000000..4b8518e --- /dev/null +++ b/docs/reference/scrape_conference_urls.html @@ -0,0 +1,204 @@ + + + + + + + + +Main function to scrape all conference talk urls +For a given year-month conference, return a nested tibble of all sessions +with a tibble-column containing the dataframes — scrape_conference_urls • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

Main function to scrape all conference talk urls +For a given year-month conference, return a nested tibble of all sessions +with a tibble-column containing the dataframes

+
+ +
scrape_conference_urls(year, month)
+ +

Arguments

+ + + + + + + + + + +
year

year

month

month

+ +

Value

+ +

tibble

+ +

Examples

+
scrape_conference_urls(2019, 10) +
#> # A tibble: 1 × 3 +#> year month sessions +#> <dbl> <dbl> <list> +#> 1 2019 10 <tibble [5 × 4]>
scrape_conference_urls(1971, 4) +
#> # A tibble: 1 × 3 +#> year month sessions +#> <dbl> <dbl> <list> +#> 1 1971 4 <tibble [7 × 4]>
+
+ +
+ + + +
+ + + + + + + + diff --git a/docs/reference/scrape_talk.html b/docs/reference/scrape_talk.html new file mode 100644 index 0000000..35ace33 --- /dev/null +++ b/docs/reference/scrape_talk.html @@ -0,0 +1,180 @@ + + + + + + + + +Scrape general conference talk — scrape_talk • generalconference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +
+

Scrape general conference talk

+
+ +
scrape_talk(url)
+ +

Arguments

+ + + + + + +
url

Full https URL of a general conference talk

+ +

Value

+ +

dataframe

+ +
+ +
+ + + +
+ + + + + + + +