updates calc_type_metrics
francojc committed Aug 3, 2024
1 parent b50dd2c commit ed52c53
Showing 18 changed files with 659 additions and 22 deletions.
23 changes: 12 additions & 11 deletions .Rbuildignore
@@ -1,16 +1,17 @@
^_pkgdown\.yml$
^docs$
^pkgdown$
^.DS_Store$
^.github$
^\.github$
^\.Rproj\.user$
^cran-comments\.md$
^CRAN-SUBMISSION$
^data-raw$
^doc$
^docs$
^Meta$
^README\.Rmd$
^cran-comments\.md$
^qtkit\.Rproj$
^\.Rproj\.user$
^qtkit\.code-workspace$
^.github$
^LICENSE\.md$
^pkgdown$
^qtkit\.code-workspace$
^qtkit\.Rproj$
^README\.Rmd$
^vignettes/articles$
^.DS_Store$
^CRAN-SUBMISSION$
^\.github$
9 changes: 7 additions & 2 deletions DESCRIPTION
@@ -6,7 +6,7 @@ Authors@R:
role = c("aut", "cre", "cph"),
comment = c(ORCID = "0000-0001-5972-6330"))
Description: Support package for the textbook "An Introduction to
Quantitative Text Analysis for Linguists: Reproducible Research using
Quantitative Text Analysis for Linguists: Reproducible Research Using
R" (Francom, 2024) <doi:10.4324/9781003393764> (available only after
August 12, 2024). Includes functions to acquire, clean, and analyze text
data as well as functions to document and share the results of text
@@ -18,9 +18,14 @@ BugReports: https://github.com/qtalr/qtkit/issues
Depends:
R (>= 3.6)
Imports:
dplyr,
ggplot2,
kableExtra,
knitr
knitr,
Matrix,
rlang,
tibble,
tidytext
Suggests:
rmarkdown,
testthat (>= 3.0.0),
8 changes: 8 additions & 0 deletions NAMESPACE
@@ -1,16 +1,24 @@
# Generated by roxygen2: do not edit by hand

export(add_pkg_to_bib)
export(calc_assoc_metrics)
export(calc_type_metrics)
export(create_data_origin)
export(find_outliers)
export(get_archive_data)
export(write_gg)
export(write_kbl)
export(write_obj)
importFrom(Matrix,rowSums)
importFrom(dplyr,count)
importFrom(ggplot2,ggsave)
importFrom(knitr,opts_current)
importFrom(knitr,write_bib)
importFrom(rlang,as_string)
importFrom(rlang,ensym)
importFrom(stats,quantile)
importFrom(tibble,tibble)
importFrom(tidytext,cast_sparse)
importFrom(tools,file_ext)
importFrom(utils,download.file)
importFrom(utils,untar)
4 changes: 2 additions & 2 deletions R/add_pkg_to_bib.R
@@ -7,11 +7,11 @@
#' @param bib_file The name of the BibTeX file to write to.
#' @return NULL
#' @importFrom knitr write_bib
#' @export
#' @examples
#' my_bib_file <- tempfile(fileext = ".bib")
#' add_pkg_to_bib("dplyr", my_bib_file)
#' readLines(my_bib_file) |> cat(sep = "\n")
add_pkg_to_bib <- function(pkg_name, bib_file = "packages.bib") {
#' @export
add_pkg_to_bib <- function(pkg_name, bib_file = "packages.bib") {
knitr::write_bib(c(.packages(), pkg_name), file = bib_file)
}
124 changes: 124 additions & 0 deletions R/calc_assoc_metrics.R
@@ -0,0 +1,124 @@
#' Calculate Association Metrics for Bigrams
#'
#' This function calculates various association metrics
#' (PMI, Dice's Coefficient, G-score) for bigrams in a given corpus.
#'
#' @param data A data frame containing the corpus.
#' @param doc_index Column in 'data' which represents the document index.
#' @param token_index Column in 'data' which represents the token index.
#' @param type Column in 'data' which represents the tokens or terms.
#' @param association A character vector specifying which metrics to calculate.
#' Can be any combination of 'pmi', 'dice_coeff', 'g_score', or 'all'.
#' Default is 'all'.
#' @param verbose A logical value indicating whether to keep the intermediate
#' probability columns. Default is FALSE.
#'
#' @return A data frame with one row per bigram and columns for each
#' calculated metric.
#'
#' @examples
#' data <- data.frame(
#' doc_index = c(1, 1, 1, 2),
#' token_index = c(1, 2, 3, 1),
#' type = c("word1", "word2", "word3", "word2")
#' )
#' calc_assoc_metrics(data, doc_index, token_index, type)
#'
#' @export
# FIX: replace this toy dataset with a dataset from an R package
calc_assoc_metrics <- function(data, doc_index, token_index, type,
association = "all", verbose = FALSE) {
doc_index <- deparse(substitute(doc_index))
token_index <- deparse(substitute(token_index))
type <- deparse(substitute(type))

validate_assoc_metrics_input(data, doc_index, token_index, type, association)

bigram_probs <-
calculate_bigram_probabilities(data, doc_index, token_index, type)

metrics <- calculate_metrics(bigram_probs, association)

if (!verbose) {
metrics <- metrics[, !colnames(metrics) %in% c("p_xy", "p_x", "p_y")]
}

return(metrics)
}

# Helper function to validate input
validate_assoc_metrics_input <-
function(data, doc_index, token_index, type, association) {
if (!is.data.frame(data)) {
stop("The argument 'data' must be a data frame.")
}

required_cols <- c(doc_index, token_index, type)
if (!all(required_cols %in% names(data))) {
stop("All specified columns must exist in 'data'.")
}

valid_associations <- c("all", "pmi", "dice_coeff", "g_score")
if (!is.character(association) ||
!all(association %in% valid_associations)) { # nolint
stop("Invalid 'association' argument.")
}
}

# Helper function to calculate bigram probabilities
calculate_bigram_probabilities <-
function(data, doc_index, token_index, type) {
# Sort data by document and token
data <- data[order(data[[doc_index]], data[[token_index]]), ]

# Create bigrams
x <- data[[type]]
y <- c(x[-1], NA)
bigrams <- data.frame(x = x, y = y)
bigrams <- bigrams[!is.na(bigrams$y), ]

# Count bigrams
bigram_counts <- table(bigrams)
total_bigrams <- sum(bigram_counts)

# Calculate probabilities
p_xy <- as.data.frame(bigram_counts / total_bigrams)
colnames(p_xy) <- c("x", "y", "p_xy")

# Calculate unigram probabilities
p_x <- as.data.frame(table(x) / length(x))
colnames(p_x) <- c("x", "p_x")
p_y <- as.data.frame(table(y) / length(y))
colnames(p_y) <- c("y", "p_y")

# Merge probabilities
result <- merge(p_xy, p_x, by = "x", all.x = TRUE)
result <- merge(result, p_y, by = "y", all.x = TRUE)

# Convert to numeric
result$p_xy <- as.numeric(result$p_xy)
result$p_x <- as.numeric(result$p_x)
result$p_y <- as.numeric(result$p_y)

return(result)
}

calculate_metrics <-
function(bigram_probs, association) {
metrics <- bigram_probs

if ("all" %in% association || "pmi" %in% association) {
metrics$pmi <- log(metrics$p_xy / (metrics$p_x * metrics$p_y))
}

if ("all" %in% association || "dice_coeff" %in% association) {
metrics$dice_coeff <- 2 * metrics$p_xy / (metrics$p_x + metrics$p_y)
}

if ("all" %in% association || "g_score" %in% association) {
metrics$g_score <- 2 * log(metrics$p_xy) -
log(metrics$p_x) - log(metrics$p_y)
}

return(metrics)
}
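
As a point of reference, here is a minimal worked example (not part of this commit) of the three formulas implemented in calculate_metrics(), using hand-picked, hypothetical probabilities for a single bigram:

p_xy <- 0.02  # joint probability of the bigram (hypothetical)
p_x  <- 0.05  # probability of the first type (hypothetical)
p_y  <- 0.06  # probability of the second type (hypothetical)

log(p_xy / (p_x * p_y))               # pmi        ~  1.90
2 * p_xy / (p_x + p_y)                # dice_coeff ~  0.36
2 * log(p_xy) - log(p_x) - log(p_y)   # g_score    ~ -2.02
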
32 changes: 32 additions & 0 deletions R/calc_df.R
@@ -0,0 +1,32 @@
#' Calculate Document Frequency (DF)
#'
#' This function is for internal use only.
#'
#' This function calculates the Document Frequency 'DF' for each type
#' (e.g., term, lemma) in a term-document matrix (TDM). It is intended to be
#' used within the package by the `calc_dispersion_metrics` function.
#'
#' @param tdm A term-document matrix (TDM) where each row represents a type
#' and each column represents a document.
#'
#' @return A numeric vector containing the Document Frequency 'DF' for each
#' type in the TDM.
#'
#' @keywords internal
#'
#' @importFrom Matrix rowSums
# TODO: Implement the 'calc_df' function
# identify an R dataset to use for testing
calc_df <- function(tdm) {
# Check if tdm is a 'dgCMatrix' object
if (!inherits(tdm, "dgCMatrix")) {
stop("The argument 'tdm' must be a 'dgCMatrix' object.")
}
# Check if the matrix has both rows and columns
if (nrow(tdm) == 0 || ncol(tdm) == 0) {
stop("The matrix 'tdm' must have both rows and columns.")
}
df <- Matrix::rowSums(tdm > 0)

return(df)
}
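
As a rough sketch (not part of this commit, with made-up counts), a suitable 'dgCMatrix' input can be built with the Matrix package, and the document frequency is simply the number of documents in which each type occurs:

library(Matrix)
tdm <- Matrix::sparseMatrix(
  i = c(1, 1, 2, 3),   # row (type) indices
  j = c(1, 2, 2, 3),   # column (document) indices
  x = c(3, 1, 2, 5),   # raw frequencies
  dims = c(3, 3),
  dimnames = list(c("cat", "dog", "fish"), paste0("doc", 1:3))
)
Matrix::rowSums(tdm > 0)   # DF: cat = 2, dog = 1, fish = 1
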
51 changes: 51 additions & 0 deletions R/calc_dp.R
@@ -0,0 +1,51 @@
#' Calculate Gries' Deviation of Proportions (DP)
#'
#' This function is for internal use only.
#'
#' This function calculates the Deviation of Proportions (DP)
#' based on Gries' Deviation of Proportions method. It is intended to be used
#' within the package, particularly by the `calc_dispersion_metrics` function.
#'
#' @param tdm_normalized A normalized term-document matrix (TDM) where each
#' row represents a type and each column represents a document. The values
#' should be the proportions of each type's frequency to its total frequency
#' across all documents.
#'
#' @param corpus_parts A numeric vector containing the proportions of each
#' document in the corpus, which is used to calculate the Deviation of
#' Proportions (DP).
#'
#' @return A numeric vector containing the Deviation of Proportions (DP) for
#' each type in the TDM.
#'
#' @keywords internal
#'
#' @importFrom Matrix rowSums
calc_dp <- function(tdm_normalized, corpus_parts) {
# Check if tdm_normalized is a 'dgCMatrix' object
if (!inherits(tdm_normalized, "dgCMatrix")) {
stop("The argument 'tdm_normalized' must be a 'dgCMatrix' object.")
}
# Check if the matrix has both rows and columns
if (nrow(tdm_normalized) == 0 || ncol(tdm_normalized) == 0) {
stop("The matrix 'tdm_normalized' must have both rows and columns.")
}

# Check if corpus_parts is a numeric vector
if (!is.numeric(corpus_parts)) {
stop("The argument 'corpus_parts' must be a numeric vector.")
}

# Check if length of corpus_parts matches the number of columns in tdm_normalized
if (length(corpus_parts) != ncol(tdm_normalized)) {
stop("The length of 'corpus_parts' must match the number of columns in 'tdm_normalized'.")
}

diffs <- t(apply(tdm_normalized, 1, function(row) {
abs(row - corpus_parts)
}))
dp <- Matrix::rowSums(diffs) / 2
dp_norm <- dp / (1 - min(corpus_parts))

return(dp_norm)
}
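
A small illustrative sketch (not part of this commit, with made-up counts) of the two inputs calc_dp() expects and of the DP arithmetic; an evenly dispersed type ends up with a lower DP than a bursty one:

library(Matrix)
tdm <- Matrix::sparseMatrix(
  i = c(1, 1, 1, 2), j = c(1, 2, 3, 1), x = c(2, 2, 2, 3),
  dims = c(2, 3), dimnames = list(c("the", "cat"), paste0("doc", 1:3))
)
tdm_normalized <- tdm / Matrix::rowSums(tdm)      # each type's row sums to 1
corpus_parts <- Matrix::colSums(tdm) / sum(tdm)   # share of tokens per document
diffs <- t(apply(tdm_normalized, 1, function(row) abs(row - corpus_parts)))
dp <- Matrix::rowSums(diffs) / 2                  # dp: "the" ~ 0.22, "cat" ~ 0.44
dp / (1 - min(corpus_parts))                      # normalized, as in calc_dp()
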
33 changes: 33 additions & 0 deletions R/calc_idf.R
@@ -0,0 +1,33 @@
#' Calculate Inverse Document Frequency (IDF)
#'
#' This function is for internal use only.
#'
#' This function calculates the Inverse Document Frequency 'IDF'
#' for each type (e.g., term, lemma) in a term-document matrix (TDM).
#' It is intended to be used within the package by the
#' `calc_dispersion_metrics` function.
#'
#' @param tdm A term-document matrix (TDM) where each row represents a
#' type and each column represents a document.
#'
#' @return A numeric vector containing the Inverse Document Frequency
#' 'IDF' for each type in the TDM.
#'
#' @keywords internal
#'
#' @importFrom Matrix rowSums
# TODO: Implement the 'calc_idf' function
calc_idf <- function(tdm) {
# Check if tdm is a 'dgCMatrix' object
if (!inherits(tdm, "dgCMatrix")) {
stop("The argument 'tdm' must be a 'dgCMatrix' object.")
}
# Check if the matrix has both rows and columns
if (nrow(tdm) == 0 || ncol(tdm) == 0) {
stop("The matrix 'tdm' must have both rows and columns.")
}
df <- Matrix::rowSums(tdm > 0)
idf <- log(ncol(tdm) / df)

return(idf)
}
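
And a matching sketch (again not part of this commit, with made-up counts) of the IDF this function returns; a type that occurs in every document gets an IDF of 0:

library(Matrix)
tdm <- Matrix::sparseMatrix(
  i = c(1, 1, 1, 2), j = c(1, 2, 3, 1), x = c(2, 1, 1, 5),
  dims = c(2, 3), dimnames = list(c("the", "linguistics"), paste0("doc", 1:3))
)
df <- Matrix::rowSums(tdm > 0)   # "the" = 3 docs, "linguistics" = 1 doc
log(ncol(tdm) / df)              # IDF: "the" = 0, "linguistics" = log(3) ~ 1.1
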
