From e9f34d6aad109adaba71eb2588e133e2bd4b8791 Mon Sep 17 00:00:00 2001 From: Jerid Francom Date: Fri, 16 Aug 2024 14:35:13 -0400 Subject: [PATCH] remove get_talkbank_data from this release Since the TBDBr package is not on CRAN it cannot feature in the package. --- DESCRIPTION | 6 - NAMESPACE | 12 - NEWS.md | 3 +- R/get_talkbank_data.R | 212 ------------------ _pkgdown.yml | 1 - man/get_talkbank_data.Rd | 55 ----- .../getPathTrees-9ec606-POST.json | 11 - tests/testthat/test-get_talkbank_data.R | 138 ------------ 8 files changed, 2 insertions(+), 436 deletions(-) delete mode 100644 R/get_talkbank_data.R delete mode 100644 man/get_talkbank_data.Rd delete mode 100644 tests/testthat/sla2.talkbank.org-1515/getPathTrees-9ec606-POST.json delete mode 100644 tests/testthat/test-get_talkbank_data.R diff --git a/DESCRIPTION b/DESCRIPTION index f749990..a63fe62 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,6 @@ Depends: R (>= 4.1) Imports: dplyr, - fs, ggplot2, glue, gutenbergr, @@ -28,21 +27,16 @@ Imports: Matrix, openai, purrr, - R.utils, readr, rlang, stringr, - TBDBr, tibble, - tidyr, tidytext Suggests: httptest, rmarkdown, testthat (>= 3.0.0), webshot2 -Remotes: - TalkBank/TBDBr Config/testthat/edition: 3 Encoding: UTF-8 Language: en-US diff --git a/NAMESPACE b/NAMESPACE index 0496f3d..0216c5e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,6 @@ export(create_data_origin) export(find_outliers) export(get_archive_data) export(get_gutenberg_data) -export(get_talkbank_data) export(write_gg) export(write_kbl) export(write_obj) @@ -16,20 +15,10 @@ import(dplyr) import(gutenbergr) import(readr) importFrom(Matrix,rowSums) -importFrom(R.utils,withTimeout) -importFrom(TBDBr,getParticipants) -importFrom(TBDBr,getTokens) -importFrom(TBDBr,getTranscripts) -importFrom(TBDBr,getUtterances) importFrom(dplyr,count) -importFrom(dplyr,everything) importFrom(dplyr,filter) importFrom(dplyr,mutate) importFrom(dplyr,select) -importFrom(fs,dir_create) -importFrom(fs,dir_exists) -importFrom(fs,file_exists) -importFrom(fs,path) importFrom(ggplot2,ggsave) importFrom(glue,glue) importFrom(knitr,opts_current) @@ -43,7 +32,6 @@ importFrom(rlang,ensym) importFrom(stats,quantile) importFrom(stringr,str_trunc) importFrom(tibble,tibble) -importFrom(tidyr,unnest) importFrom(tidytext,cast_sparse) importFrom(tools,file_ext) importFrom(utils,download.file) diff --git a/NEWS.md b/NEWS.md index 002f962..e7dc7d9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,6 @@ - Adds `create_data_dictionary()` to create a data dictionary for a dataset. - Adds `find_outliers()` to identify observations in a data frame that have outliers for a given variable. - Adds `get_gutenberg_data()` to download a dataset from Project Gutenberg. -- Adds `get_talkbank_data()` to download a dataset from TalkBank. - Adds `add_pkg_to_bib()` to add a package to the bibliography. - Adds `write_*()` functions to aid in publishing results to a variety of formats. - `write_gg()` writes a ggplot object to a file. @@ -13,6 +12,8 @@ - `write_obj()` writes an R object to a file. - Adds `find_outliers()` to identify observations in a data frame that has outliers for a given variable. +Notes: `get_talkbank_data()` is postponed for a future release. + # qtkit 0.10.0 - Initial CRAN submission. diff --git a/R/get_talkbank_data.R b/R/get_talkbank_data.R deleted file mode 100644 index c519118..0000000 --- a/R/get_talkbank_data.R +++ /dev/null @@ -1,212 +0,0 @@ -#' Download TalkBank Corpus Data -#' -#' Downloads the utterances, transcripts, participants, tokens, and token -#' types data from the TalkBank database and saves it to disk in the specified -#' target directory. -#' -#' @param corpus_name The name of the TalkBank corpus to download data from. -#' @param corpus_path The path to the TalkBank corpus to download data from. -#' @param target_dir The directory to save the downloaded data to. -#' @param force If `TRUE`, the data will be downloaded even if it already -#' exists on disk. -#' @param confirmed If `TRUE`, the user has confirmed that they have permission -#' to use the data. -#' If `FALSE`, the function will prompt the user to confirm permission. -#' Setting this to `TRUE` is useful for reproducible workflows. -#' @param timeout The maximum time in seconds to wait for the data to be -#' downloaded. Default is 60 seconds. -#' -#' @return A message indicating whether the data was acquired or already -#' existed on disk, writes the data files to disk in the specified target -#' directory. -#' -#' @examples -#' \dontrun{ -#' data_dir <- file.path(tempdir(), "data") -#' -#' get_talkbank_data( -#' corpus_name = "ca", -#' corpus_path = c("ca", "Nahuatl"), -#' target_dir = data_dir, -#' confirmed = TRUE) -#' } -#' -#' @importFrom dplyr everything -#' @importFrom fs dir_exists dir_create file_exists path -#' @importFrom R.utils withTimeout -#' @importFrom readr write_csv -#' @importFrom TBDBr getUtterances getTranscripts getParticipants getTokens -#' @importFrom tidyr unnest -#' -#' @export -get_talkbank_data <- - function( - corpus_name, - corpus_path, - target_dir, - force = FALSE, - confirmed = FALSE, - timeout = 60) { - - # Confirm permission to use the data - confirmed <- confirm_if_needed(confirmed) - if (!confirmed) { - return(message("Aborted.")) - } - - # Validate input parameters - validate_inputs_gtd(corpus_name, corpus_path, target_dir, force, confirmed) - - # Check if the target directory exists, if not, create it - if (!file.exists(target_dir)) dir.create(target_dir, recursive = TRUE) - - # Set up file paths names - utterances_file <- file.path(target_dir, "utterances.csv") - transcripts_file <- file.path(target_dir, "transcripts.csv") - participants_file <- file.path(target_dir, "participants.csv") - tokens_file <- file.path(target_dir, "tokens.csv") - token_types_file <- file.path(target_dir, "token_types.csv") - - # Check if the file doesn't exist or force is TRUE - if (!file.exists(utterances_file) || force) { - message("Acquiring utterances...") - write_to_disk( - TBDBr::getUtterances, - corpus_name, - corpus_path, - utterances_file, - timeout) - } else { - message("Already acquired: ", utterances_file) - } - - if (!file.exists(transcripts_file) || force) { - message("Acquiring transcripts...") - write_to_disk( - TBDBr::getTranscripts, - corpus_name, - corpus_path, - transcripts_file, - timeout) - } else { - message("Already acquired: ", transcripts_file) - } - - if (!file.exists(participants_file) || force) { - message("Acquiring participants...") - write_to_disk( - TBDBr::getParticipants, - corpus_name, - corpus_path, - participants_file, - timeout) - } else { - message("Already acquired: ", participants_file) - } - - if (!file.exists(tokens_file) || force) { - message("Acquiring tokens...") - write_to_disk( - TBDBr::getTokens, - corpus_name, - corpus_path, - tokens_file, - timeout) - } else { - message("Already acquired: ", tokens_file) - } - - if (!file.exists(token_types_file) || force) { - message("Acquiring token types...") - write_to_disk( - TBDBr::getTokenTypes, - corpus_name, - corpus_path, - token_types_file, - timeout) - } else { - message("Already acquired: ", token_types_file) - } - - message("Acquisition complete.") - message("Use `force = TRUE` to re-acquire.") -} - -# Helper functions - -suppress_output <- function(func, ...) { - # Create a text connection to capture output - temp_output <- character() - tc <- textConnection("temp_output", "w", local = TRUE) - - # Redirect output to the text connection - sink(tc) - sink(tc, type = "message") - - # Run the function and capture the result - result <- tryCatch({ - func(...) - }, finally = { - # Restore normal output - sink(type = "message") - sink() - - # Close the text connection - close(tc) - }) - - return(result) -} - -write_to_disk <- - function( - func, - corpus_name, - corpus_path, - file_path, - timeout) { - tryCatch({ - results <- R.utils::withTimeout({ - suppress_output(func, corpus_name, corpus_path) - }, timeout = timeout, onTimeout = "silent") - results <- results |> tidyr::unnest(cols = dplyr::everything()) - readr::write_csv(results, file_path) - message("Acquired: ", file_path) - return(invisible(NULL)) - }, error = function(e) { - if (grepl("Operation was aborted by an application callback", e)) { - msg <- paste( - "The function", - deparse(substitute(func)), - "timed out at", - timeout, - "seconds. Change the `timeout` variable for a larger timeout.") - message(msg) - } else { - message("An error occurred while acquiring ", file_path, ".") - } - return(invisible(NULL)) - }) -} - -validate_inputs_gtd <- - function( - corpus_name, - corpus_path, - target_dir, - force, - confirmed) { - # Check if TBDBr path is valid - if (!TBDBr::validPath(c(corpus_name, corpus_path))) { - stop("Invalid path to TalkBank Database. - Check `corpus_name` and `corpus_path` again.") - } - # Check for valid force and confirmed values - if (!(force %in% c(TRUE, FALSE))) { - stop("`force` must be a logical value.") - } - if (!(confirmed %in% c(TRUE, FALSE))) { - stop("`confirmed` must be a logical value.") - } -} -utils::globalVariables(c("confirm_if_needed")) diff --git a/_pkgdown.yml b/_pkgdown.yml index 245c324..91e5809 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -47,7 +47,6 @@ reference: - create_data_origin - get_archive_data - get_gutenberg_data - - get_talkbank_data - title: 'Publishing functions' desc: 'Functions to aid in publishing research' contents: diff --git a/man/get_talkbank_data.Rd b/man/get_talkbank_data.Rd deleted file mode 100644 index 69d6d79..0000000 --- a/man/get_talkbank_data.Rd +++ /dev/null @@ -1,55 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_talkbank_data.R -\name{get_talkbank_data} -\alias{get_talkbank_data} -\title{Download TalkBank Corpus Data} -\usage{ -get_talkbank_data( - corpus_name, - corpus_path, - target_dir, - force = FALSE, - confirmed = FALSE, - timeout = 60 -) -} -\arguments{ -\item{corpus_name}{The name of the TalkBank corpus to download data from.} - -\item{corpus_path}{The path to the TalkBank corpus to download data from.} - -\item{target_dir}{The directory to save the downloaded data to.} - -\item{force}{If \code{TRUE}, the data will be downloaded even if it already -exists on disk.} - -\item{confirmed}{If \code{TRUE}, the user has confirmed that they have permission -to use the data. -If \code{FALSE}, the function will prompt the user to confirm permission. -Setting this to \code{TRUE} is useful for reproducible workflows.} - -\item{timeout}{The maximum time in seconds to wait for the data to be -downloaded. Default is 60 seconds.} -} -\value{ -A message indicating whether the data was acquired or already -existed on disk, writes the data files to disk in the specified target -directory. -} -\description{ -Downloads the utterances, transcripts, participants, tokens, and token -types data from the TalkBank database and saves it to disk in the specified -target directory. -} -\examples{ -\dontrun{ -data_dir <- file.path(tempdir(), "data") - -get_talkbank_data( - corpus_name = "ca", - corpus_path = c("ca", "Nahuatl"), - target_dir = data_dir, - confirmed = TRUE) -} - -} diff --git a/tests/testthat/sla2.talkbank.org-1515/getPathTrees-9ec606-POST.json b/tests/testthat/sla2.talkbank.org-1515/getPathTrees-9ec606-POST.json deleted file mode 100644 index 61dd338..0000000 --- a/tests/testthat/sla2.talkbank.org-1515/getPathTrees-9ec606-POST.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "respMsg": { - "ca": { - "ca": { - "Nahuatl": { - "nahuatl": null - } - } - } - } -} diff --git a/tests/testthat/test-get_talkbank_data.R b/tests/testthat/test-get_talkbank_data.R deleted file mode 100644 index 7c66444..0000000 --- a/tests/testthat/test-get_talkbank_data.R +++ /dev/null @@ -1,138 +0,0 @@ - -# Test get_talkbank_data -------------------------------------- - -# Create a temporary directory for the test -temp_dir <- tempdir() - -# Create variables -valid_name <- "ca" -valid_path <- c("ca", "Nahuatl") -valid_dir <- file.path(temp_dir, "data") - -invalid_name <- "nonexistent_corpus" -invalid_path <- c("nonexistent_corpus", "nonexistent_language") -invalid_dir <- "nonexistent_directory" - -# Helper functions -# validate_inputs_gtd ------ - -# valid inputs -with_mock_api({ - test_that("valid inputs", { - # Test with the mocked API response - expect_silent(validate_inputs_gtd( - valid_name, valid_path, valid_dir, FALSE, TRUE)) - }) -}) - -# invalid corpus name -with_mock_api({ - test_that("invalid corpus name", { - expect_error(validate_inputs_gtd( - invalid_name, valid_path, valid_dir, FALSE, TRUE)) - }) -}) - -# invalid corpus path -with_mock_api({ - test_that("invalid corpus path", { - expect_error(validate_inputs_gtd( - valid_name, invalid_path, valid_dir, FALSE, TRUE)) - }) -}) - -# invalid boolean force -with_mock_api({ - test_that("invalid boolean force", { - expect_error(validate_inputs_gtd( - valid_name, valid_path, valid_dir, NULL, TRUE)) - }) -}) - -# invalid boolean confirmed -with_mock_api({ - test_that("invalid boolean confirmed", { - expect_error(validate_inputs_gtd( - valid_name, valid_path, valid_dir, FALSE, NULL)) - }) -}) - -# write_to_disk function ------ -# Mock functions and variables -mock_func_success <- function(corpus_name, corpus_path) { - return(data.frame(a = 1, b = 2)) -} - -mock_func_timeout <- function(corpus_name, corpus_path) { - start_time <- Sys.time() - print("Starting timeout test\n") - while (as.numeric(Sys.time() - start_time, units = "secs") < 3) { - # Busy-wait for 3 seconds - } - return(data.frame(a = 1, b = 2)) -} - -mock_func_error <- function(corpus_name, corpus_path) { - stop("mock error") -} - -# Tests -test_that("write_to_disk successfully writes data to disk", { - target_dir <- file.path(temp_dir, "test_data") - dir.create(target_dir, recursive = TRUE, showWarnings = FALSE) - file_path <- file.path(target_dir, "test.csv") - result <- - write_to_disk( - mock_func_success, - corpus_name, - corpus_path, - file_path, - timeout = 5) - expect_null(result) - expect_true(file.exists(file_path)) - expect_equal(read.csv(file_path), data.frame(a = 1, b = 2)) - unlink(target_dir, recursive = TRUE, force = TRUE) -}) - -test_that("write_to_disk handles timeout correctly", { - # Relies on internet connection - skip_on_cran() - corpus_name <- "ca" - corpus_path <- c("ca", "CABNC") - file_path <- file.path(temp_dir, "test.csv") - result <- - write_to_disk(getTokens, corpus_name, corpus_path, file_path, timeout = 1) - expect_null(result) - expect_message(write_to_disk( - getTokens, corpus_name, corpus_path, file_path, timeout = 1), - regex = "timed out at 1 seconds") -}) - -test_that("write_to_disk does not write to disk on function error", { - corpus_name <- "ca" - corpus_path <- c("ca", "CABNC") - file_path <- file.path(temp_dir, "test.csv") - write_to_disk( - mock_func_error, corpus_name, corpus_path, file_path, timeout = 5) - expect_false(file.exists(file_path)) -}) - -test_that("write_to_disk handles empty input data correctly", { - corpus_name <- "ca" - corpus_path <- c("ca", "CABNC") - file_path <- file.path(temp_dir, "test.csv") - mock_func_empty <- function(corpus_name, corpus_path) { - return(data.frame()) - } - result <- - write_to_disk( - mock_func_empty, corpus_name, corpus_path, file_path, timeout = 5) - expect_true(is.null(result)) - expect_true(file.exists(file_path)) - expect_error(read.csv(file_path), "no lines available in input") - file.remove(file_path) -}) - -# Clean up -# Remove the temporary directory -unlink(temp_dir)