updates calc_type_metrics
francojc committed Aug 3, 2024
1 parent b50dd2c commit ed52c53
Showing 18 changed files with 659 additions and 22 deletions.
23 changes: 12 additions & 11 deletions .Rbuildignore
@@ -1,16 +1,17 @@
^_pkgdown\.yml$
^docs$
^pkgdown$
^.DS_Store$
^.github$
^\.github$
^\.Rproj\.user$
^cran-comments\.md$
^CRAN-SUBMISSION$
^data-raw$
^doc$
^docs$
^Meta$
^README\.Rmd$
^cran-comments\.md$
^qtkit\.Rproj$
^\.Rproj\.user$
^qtkit\.code-workspace$
^.github$
^LICENSE\.md$
^pkgdown$
^qtkit\.code-workspace$
^qtkit\.Rproj$
^README\.Rmd$
^vignettes/articles$
^.DS_Store$
^CRAN-SUBMISSION$
^\.github$
9 changes: 7 additions & 2 deletions DESCRIPTION
@@ -6,7 +6,7 @@ Authors@R:
role = c("aut", "cre", "cph"),
comment = c(ORCID = "0000-0001-5972-6330"))
Description: Support package for the textbook "An Introduction to
Quantitative Text Analysis for Linguists: Reproducible Research using
Quantitative Text Analysis for Linguists: Reproducible Research Using
R" (Francom, 2024) <doi:10.4324/9781003393764> (available only after
August 12, 2024). Includes functions to acquire, clean, and analyze text
data as well as functions to document and share the results of text
@@ -18,9 +18,14 @@ BugReports: https://github.com/qtalr/qtkit/issues
Depends:
R (>= 3.6)
Imports:
dplyr,
ggplot2,
kableExtra,
knitr
knitr,
Matrix,
rlang,
tibble,
tidytext
Suggests:
rmarkdown,
testthat (>= 3.0.0),
8 changes: 8 additions & 0 deletions NAMESPACE
@@ -1,16 +1,24 @@
# Generated by roxygen2: do not edit by hand

export(add_pkg_to_bib)
export(calc_assoc_metrics)
export(calc_type_metrics)
export(create_data_origin)
export(find_outliers)
export(get_archive_data)
export(write_gg)
export(write_kbl)
export(write_obj)
importFrom(Matrix,rowSums)
importFrom(dplyr,count)
importFrom(ggplot2,ggsave)
importFrom(knitr,opts_current)
importFrom(knitr,write_bib)
importFrom(rlang,as_string)
importFrom(rlang,ensym)
importFrom(stats,quantile)
importFrom(tibble,tibble)
importFrom(tidytext,cast_sparse)
importFrom(tools,file_ext)
importFrom(utils,download.file)
importFrom(utils,untar)
4 changes: 2 additions & 2 deletions R/add_pkg_to_bib.R
@@ -7,11 +7,11 @@
#' @param bib_file The name of the BibTeX file to write to.
#' @return NULL
#' @importFrom knitr write_bib
#' @export
#' @examples
#' my_bib_file <- tempfile(fileext = ".bib")
#' add_pkg_to_bib("dplyr", my_bib_file)
#' readLines(my_bib_file) |> cat(sep = "\n")
add_pkg_to_bib <- function(pkg_name, bib_file = "packages.bib") {
#' @export
add_pkg_to_bib <- function(pkg_name, bib_file = "packages.bib") {
knitr::write_bib(c(.packages(), pkg_name), file = bib_file)
}
124 changes: 124 additions & 0 deletions R/calc_assoc_metrics.R
@@ -0,0 +1,124 @@
#' Calculate Association Metrics for Bigrams
#'
#' This function calculates various association metrics
#' (PMI, Dice's Coefficient, G-score) for bigrams in a given corpus.
#'
#' @param data A data frame containing the corpus.
#' @param doc_index Column in 'data' which represents the document index.
#' @param token_index Column in 'data' which represents the token index.
#' @param type Column in 'data' which represents the tokens or terms.
#' @param association A character vector specifying which metrics to calculate.
#' Can be any combination of 'pmi', 'dice_coeff', 'g_score', or 'all'.
#' Default is 'all'.
#' @param verbose A logical value indicating whether to keep the intermediate
#' probability columns. Default is FALSE.
#'
#' @return A data frame with one row per bigram and columns for each
#' calculated metric.
#'
#' @examples
#' data <- data.frame(
#' doc_index = c(1, 1, 1, 2),
#' token_index = c(1, 2, 3, 1),
#' type = c("word1", "word2", "word3", "word2")
#' )
#' calc_assoc_metrics(data, doc_index, token_index, type)
#'
#' @export
# FIX: replace this toy dataset with a dataset from an R package
calc_assoc_metrics <- function(data, doc_index, token_index, type,
association = "all", verbose = FALSE) {
doc_index <- deparse(substitute(doc_index))
token_index <- deparse(substitute(token_index))
type <- deparse(substitute(type))

validate_assoc_metrics_input(data, doc_index, token_index, type, association)

bigram_probs <-
calculate_bigram_probabilities(data, doc_index, token_index, type)

metrics <- calculate_metrics(bigram_probs, association)

if (!verbose) {
metrics <- metrics[, !colnames(metrics) %in% c("p_xy", "p_x", "p_y")]
}

return(metrics)
}

# Helper function to validate input
validate_assoc_metrics_input <-
function(data, doc_index, token_index, type, association) {
if (!is.data.frame(data)) {
stop("The argument 'data' must be a data frame.")
}

required_cols <- c(doc_index, token_index, type)
if (!all(required_cols %in% names(data))) {
stop("All specified columns must exist in 'data'.")
}

valid_associations <- c("all", "pmi", "dice_coeff", "g_score")
if (!is.character(association) ||
!all(association %in% valid_associations)) { # nolint
stop("Invalid 'association' argument.")
}
}

# Helper function to calculate bigram probabilities
calculate_bigram_probabilities <-
function(data, doc_index, token_index, type) {
# Sort data by document and token
data <- data[order(data[[doc_index]], data[[token_index]]), ]

# Create bigrams
x <- data[[type]]
y <- c(x[-1], NA)
bigrams <- data.frame(x = x, y = y)
bigrams <- bigrams[!is.na(bigrams$y), ]

# Count bigrams
bigram_counts <- table(bigrams)
total_bigrams <- sum(bigram_counts)

# Calculate probabilities
p_xy <- as.data.frame(bigram_counts / total_bigrams)
colnames(p_xy) <- c("x", "y", "p_xy")

# Calculate unigram probabilities
p_x <- as.data.frame(table(x) / length(x))
colnames(p_x) <- c("x", "p_x")
p_y <- as.data.frame(table(y) / length(y))
colnames(p_y) <- c("y", "p_y")

# Merge probabilities
result <- merge(p_xy, p_x, by = "x", all.x = TRUE)
result <- merge(result, p_y, by = "y", all.x = TRUE)

# Convert to numeric
result$p_xy <- as.numeric(result$p_xy)
result$p_x <- as.numeric(result$p_x)
result$p_y <- as.numeric(result$p_y)

return(result)
}

calculate_metrics <-
function(bigram_probs, association) {
metrics <- bigram_probs

if ("all" %in% association || "pmi" %in% association) {
metrics$pmi <- log(metrics$p_xy / (metrics$p_x * metrics$p_y))
}

if ("all" %in% association || "dice_coeff" %in% association) {
metrics$dice_coeff <- 2 * metrics$p_xy / (metrics$p_x + metrics$p_y)
}

if ("all" %in% association || "g_score" %in% association) {
metrics$g_score <- 2 * log(metrics$p_xy) -
log(metrics$p_x) - log(metrics$p_y)
}

return(metrics)
}
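
As a point of reference, here is a minimal worked example (not part of this commit) of the three formulas implemented in calculate_metrics(), using hand-picked, hypothetical probabilities for a single bigram:

p_xy <- 0.02  # joint probability of the bigram (hypothetical)
p_x  <- 0.05  # probability of the first type (hypothetical)
p_y  <- 0.06  # probability of the second type (hypothetical)

log(p_xy / (p_x * p_y))               # pmi        ~  1.90
2 * p_xy / (p_x + p_y)                # dice_coeff ~  0.36
2 * log(p_xy) - log(p_x) - log(p_y)   # g_score    ~ -2.02
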
32 changes: 32 additions & 0 deletions R/calc_df.R
@@ -0,0 +1,32 @@
#' Calculate Document Frequency (DF)
#'
#' This function is for internal use only.
#'
#' This function calculates the Document Frequency 'DF' for each type
#' (e.g., term, lemma) in a term-document matrix (TDM). It is intended to be
#' used within the package by the `calc_dispersion_metrics` function.
#'
#' @param tdm A term-document matrix (TDM) where each row represents a type
#' and each column represents a document.
#'
#' @return A numeric vector containing the Document Frequency 'DF' for each
#' type in the TDM.
#'
#' @keywords internal
#'
#' @importFrom Matrix rowSums
# TODO: Implement the 'calc_df' function
# identify an R dataset to use for testing
calc_df <- function(tdm) {
# Check if tdm is a 'dgCMatrix' object
if (!inherits(tdm, "dgCMatrix")) {
stop("The argument 'tdm' must be a 'dgCMatrix' object.")
}
# Check if the matrix has both rows and columns
if (nrow(tdm) == 0 || ncol(tdm) == 0) {
stop("The matrix 'tdm' must have both rows and columns.")
}
df <- Matrix::rowSums(tdm > 0)

return(df)
}
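
As a rough sketch (not part of this commit, with made-up counts), a suitable 'dgCMatrix' input can be built with the Matrix package, and the document frequency is simply the number of documents in which each type occurs:

library(Matrix)
tdm <- Matrix::sparseMatrix(
  i = c(1, 1, 2, 3),   # row (type) indices
  j = c(1, 2, 2, 3),   # column (document) indices
  x = c(3, 1, 2, 5),   # raw frequencies
  dims = c(3, 3),
  dimnames = list(c("cat", "dog", "fish"), paste0("doc", 1:3))
)
Matrix::rowSums(tdm > 0)   # DF: cat = 2, dog = 1, fish = 1
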
51 changes: 51 additions & 0 deletions R/calc_dp.R
@@ -0,0 +1,51 @@
#' Calculate Gries' Deviation of Proportions (DP)
#'
#' This function is for internal use only.
#'
#' This function calculates the Deviation of Proportions (DP)
#' based on Gries' Deviation of Proportions method. It is intended to be used
#' within the package, particularly by the `calc_dispersion_metrics` function.
#'
#' @param tdm_normalized A normalized term-document matrix (TDM) where each
#' row represents a type and each column represents a document. The values
#' should be the proportions of each type's frequency to its total frequency
#' across all documents.
#'
#' @param corpus_parts A numeric vector containing the proportions of each
#' document in the corpus, which is used to calculate the Deviation of
#' Proportions (DP).
#'
#' @return A numeric vector containing the Deviation of Proportions (DP) for
#' each type in the TDM.
#'
#' @keywords internal
#'
#' @importFrom Matrix rowSums
calc_dp <- function(tdm_normalized, corpus_parts) {
# Check if tdm_normalized is a 'dgCMatrix' object
if (!inherits(tdm_normalized, "dgCMatrix")) {
stop("The argument 'tdm_normalized' must be a 'dgCMatrix' object.")
}
# Check if the matrix has both rows and columns
if (nrow(tdm_normalized) == 0 || ncol(tdm_normalized) == 0) {
stop("The matrix 'tdm_normalized' must have both rows and columns.")
}

# Check if corpus_parts is a numeric vector
if (!is.numeric(corpus_parts)) {
stop("The argument 'corpus_parts' must be a numeric vector.")
}

# Check if length of corpus_parts matches the number of columns in tdm_normalized
if (length(corpus_parts) != ncol(tdm_normalized)) {
stop("The length of 'corpus_parts' must match the number of columns in 'tdm_normalized'.")
}

diffs <- t(apply(tdm_normalized, 1, function(row) {
abs(row - corpus_parts)
}))
dp <- Matrix::rowSums(diffs) / 2
dp_norm <- dp / (1 - min(corpus_parts))

return(dp_norm)
}
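
A small illustrative sketch (not part of this commit, with made-up counts) of the two inputs calc_dp() expects and of the DP arithmetic; an evenly dispersed type ends up with a lower DP than a bursty one:

library(Matrix)
tdm <- Matrix::sparseMatrix(
  i = c(1, 1, 1, 2), j = c(1, 2, 3, 1), x = c(2, 2, 2, 3),
  dims = c(2, 3), dimnames = list(c("the", "cat"), paste0("doc", 1:3))
)
tdm_normalized <- tdm / Matrix::rowSums(tdm)      # each type's row sums to 1
corpus_parts <- Matrix::colSums(tdm) / sum(tdm)   # share of tokens per document
diffs <- t(apply(tdm_normalized, 1, function(row) abs(row - corpus_parts)))
dp <- Matrix::rowSums(diffs) / 2                  # dp: "the" ~ 0.22, "cat" ~ 0.44
dp / (1 - min(corpus_parts))                      # normalized, as in calc_dp()
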
33 changes: 33 additions & 0 deletions R/calc_idf.R
@@ -0,0 +1,33 @@
#' Calculate Inverse Document Frequency (IDF)
#'
#' This function is for internal use only.
#'
#' This function calculates the Inverse Document Frequency 'IDF'
#' for each type (e.g., term, lemma) in a term-document matrix (TDM).
#' It is intended to be used within the package by the
#' `calc_dispersion_metrics` function.
#'
#' @param tdm A term-document matrix (TDM) where each row represents a
#' type and each column represents a document.
#'
#' @return A numeric vector containing the Inverse Document Frequency
#' 'IDF' for each type in the TDM.
#'
#' @keywords internal
#'
#' @importFrom Matrix rowSums
# TODO: Implement the 'calc_idf' function
calc_idf <- function(tdm) {
# Check if tdm is a 'dgCMatrix' object
if (!inherits(tdm, "dgCMatrix")) {
stop("The argument 'tdm' must be a 'dgCMatrix' object.")
}
# Check if the matrix has both rows and columns
if (nrow(tdm) == 0 || ncol(tdm) == 0) {
stop("The matrix 'tdm' must have both rows and columns.")
}
df <- Matrix::rowSums(tdm > 0)
idf <- log(ncol(tdm) / df)

return(idf)
}
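
And a matching sketch (again not part of this commit, with made-up counts) of the IDF this function returns; a type that occurs in every document gets an IDF of 0:

library(Matrix)
tdm <- Matrix::sparseMatrix(
  i = c(1, 1, 1, 2), j = c(1, 2, 3, 1), x = c(2, 1, 1, 5),
  dims = c(2, 3), dimnames = list(c("the", "linguistics"), paste0("doc", 1:3))
)
df <- Matrix::rowSums(tdm > 0)   # "the" = 3 docs, "linguistics" = 1 doc
log(ncol(tdm) / df)              # IDF: "the" = 0, "linguistics" = log(3) ~ 1.1
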
