-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
18 changed files
with
659 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,17 @@ | ||
^_pkgdown\.yml$ | ||
^docs$ | ||
^pkgdown$ | ||
^.DS_Store$ | ||
^.github$ | ||
^\.github$ | ||
^\.Rproj\.user$ | ||
^cran-comments\.md$ | ||
^CRAN-SUBMISSION$ | ||
^data-raw$ | ||
^doc$ | ||
^docs$ | ||
^Meta$ | ||
^README\.Rmd$ | ||
^cran-comments\.md$ | ||
^qtkit\.Rproj$ | ||
^\.Rproj\.user$ | ||
^qtkit\.code-workspace$ | ||
^.github$ | ||
^LICENSE\.md$ | ||
^pkgdown$ | ||
^qtkit\.code-workspace$ | ||
^qtkit\.Rproj$ | ||
^README\.Rmd$ | ||
^vignettes/articles$ | ||
^.DS_Store$ | ||
^CRAN-SUBMISSION$ | ||
^\.github$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#' Calculate Association Metrics for Bigrams | ||
#' | ||
#' This function calculates various association metrics | ||
#' (PMI, Dice's Coefficient, G-score) for bigrams in a given corpus. | ||
#' | ||
#' @param data A data frame containing the corpus. | ||
#' @param doc_index Column in 'data' which represents the document index. | ||
#' @param token_index Column in 'data' which represents the token index. | ||
#' @param type Column in 'data' which represents the tokens or terms. | ||
#' @param association A character vector specifying which metrics to calculate. | ||
#' Can be any combination of 'pmi', 'dice_coeff', 'g_score', or 'all'. | ||
#' Default is 'all'. | ||
#' @param verbose A logical value indicating whether to keep the intermediate | ||
#' probability columns. Default is FALSE. | ||
#' | ||
#' @return A data frame with one row per bigram and columns for each | ||
#' calculated metric. | ||
#' | ||
#' @examples | ||
#' data <- data.frame( | ||
#' doc_index = c(1, 1, 1, 2), | ||
#' token_index = c(1, 2, 3, 1), | ||
#' type = c("word1", "word2", "word3", "word2") | ||
#' ) | ||
#' calc_assoc_metrics(data, doc_index, token_index, type) | ||
#' | ||
#' @export | ||
# FIX: replace this toy dataset with a dataset from an R package | ||
calc_assoc_metrics <- function(data, doc_index, token_index, type, | ||
association = "all", verbose = FALSE) { | ||
doc_index <- deparse(substitute(doc_index)) | ||
token_index <- deparse(substitute(token_index)) | ||
type <- deparse(substitute(type)) | ||
|
||
validate_assoc_metrics_input(data, doc_index, token_index, type, association) | ||
|
||
bigram_probs <- | ||
calculate_bigram_probabilities(data, doc_index, token_index, type) | ||
|
||
metrics <- calculate_metrics(bigram_probs, association) | ||
|
||
if (!verbose) { | ||
metrics <- metrics[, !colnames(metrics) %in% c("p_xy", "p_x", "p_y")] | ||
} | ||
|
||
return(metrics) | ||
} | ||
|
||
# Helper function to validate input | ||
validate_assoc_metrics_input <- | ||
function(data, doc_index, token_index, type, association) { | ||
if (!is.data.frame(data)) { | ||
stop("The argument 'data' must be a data frame.") | ||
} | ||
|
||
required_cols <- c(doc_index, token_index, type) | ||
if (!all(required_cols %in% names(data))) { | ||
stop("All specified columns must exist in 'data'.") | ||
} | ||
|
||
valid_associations <- c("all", "pmi", "dice_coeff", "g_score") | ||
if (!is.character(association) || | ||
!all(association %in% valid_associations)) { # nolint | ||
stop("Invalid 'association' argument.") | ||
} | ||
} | ||
|
||
# Helper function to calculate bigram probabilities | ||
calculate_bigram_probabilities <- | ||
function(data, doc_index, token_index, type) { | ||
# Sort data by document and token | ||
data <- data[order(data[[doc_index]], data[[token_index]]), ] | ||
|
||
# Create bigrams | ||
x <- data[[type]] | ||
y <- c(x[-1], NA) | ||
bigrams <- data.frame(x = x, y = y) | ||
bigrams <- bigrams[!is.na(bigrams$y), ] | ||
|
||
# Count bigrams | ||
bigram_counts <- table(bigrams) | ||
total_bigrams <- sum(bigram_counts) | ||
|
||
# Calculate probabilities | ||
p_xy <- as.data.frame(bigram_counts / total_bigrams) | ||
colnames(p_xy) <- c("x", "y", "p_xy") | ||
|
||
# Calculate unigram probabilities | ||
p_x <- as.data.frame(table(x) / length(x)) | ||
colnames(p_x) <- c("x", "p_x") | ||
p_y <- as.data.frame(table(y) / length(y)) | ||
colnames(p_y) <- c("y", "p_y") | ||
|
||
# Merge probabilities | ||
result <- merge(p_xy, p_x, by = "x", all.x = TRUE) | ||
result <- merge(result, p_y, by = "y", all.x = TRUE) | ||
|
||
# Convert to numeric | ||
result$p_xy <- as.numeric(result$p_xy) | ||
result$p_x <- as.numeric(result$p_x) | ||
result$p_y <- as.numeric(result$p_y) | ||
|
||
return(result) | ||
} | ||
|
||
calculate_metrics <- | ||
function(bigram_probs, association) { | ||
metrics <- bigram_probs | ||
|
||
if ("all" %in% association || "pmi" %in% association) { | ||
metrics$pmi <- log(metrics$p_xy / (metrics$p_x * metrics$p_y)) | ||
} | ||
|
||
if ("all" %in% association || "dice_coeff" %in% association) { | ||
metrics$dice_coeff <- 2 * metrics$p_xy / (metrics$p_x + metrics$p_y) | ||
} | ||
|
||
if ("all" %in% association || "g_score" %in% association) { | ||
metrics$g_score <- 2 * log(metrics$p_xy) - | ||
log(metrics$p_x) - log(metrics$p_y) | ||
} | ||
|
||
return(metrics) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#' Calculate Document Frequency (DF) | ||
#' | ||
#' This function is for internal use only. | ||
#' | ||
#' This function that calculates the Document Frequency 'DF' for each type | ||
#' (e.g., term, lemma) in a term-document matrix (TDM). It is intended to be | ||
#' used within the package by the `calc_dispersion_metrics` function. | ||
#' | ||
#' @param tdm A term-document matrix (TDM) where each row represents a type | ||
#' and each column represents a document. | ||
#' | ||
#' @return A numeric vector containing the Document Frequency 'DF' for each | ||
#' type in the TDM. | ||
#' | ||
#' @keywords internal | ||
#' | ||
#' @importFrom Matrix rowSums | ||
# TODO: Implement the 'calc_df' function | ||
# identify an R dataset to use for testing | ||
calc_df <- function(tdm) { | ||
# Check if tdm is a 'dgCMatrix' object | ||
if (!inherits(tdm, "dgCMatrix")) { | ||
stop("The argument 'tdm' must be a 'dgCMatrix' object.") | ||
} | ||
# Check if the matrix has both rows and columns | ||
if (nrow(tdm) == 0 || ncol(tdm) == 0) { | ||
stop("The matrix 'tdm' must have both rows and columns.") | ||
} | ||
df <- Matrix::rowSums(tdm > 0) | ||
|
||
return(df) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#' Calculate Gries' Deviation of Proportions (DP) | ||
#' | ||
#' This function is for internal use only. | ||
#' | ||
#' This function that calculates the Deviation of Proportions (DP) | ||
#' based on Gries' Deviation of Proportions method. It is intended to be used | ||
#' within the package, particularly by the `calc_dispersion_metrics` function. | ||
#' | ||
#' @param tdm_normalized A normalized term-document matrix (TDM) where each | ||
#' row represents a type and each column represents a document. The values | ||
#' should be the proportions of each type's frequency to its total frequency | ||
#' across all documents. | ||
#' | ||
#' @param corpus_parts A numeric vector containing the proportions of each | ||
#' document in the corpus, which is used to calculate the Deviation of | ||
#' Proportions (DP). | ||
#' | ||
#' @return A numeric vector containing the Deviation of Proportions (DP) for | ||
#' each type in the TDM. | ||
#' | ||
#' @keywords internal | ||
#' | ||
#' @importFrom Matrix rowSums | ||
calc_dp <- function(tdm_normalized, corpus_parts) { | ||
# Check if tdm_normalized is a 'dgCMatrix' object | ||
if (!inherits(tdm_normalized, "dgCMatrix")) { | ||
stop("The argument 'tdm_normalized' must be a 'dgCMatrix' object.") | ||
} | ||
# Check if the matrix has both rows and columns | ||
if (nrow(tdm_normalized) == 0 || ncol(tdm_normalized) == 0) { | ||
stop("The matrix 'tdm_normalized' must have both rows and columns.") | ||
} | ||
|
||
# Check if corpus_parts is a numeric vector | ||
if (!is.numeric(corpus_parts)) { | ||
stop("The argument 'corpus_parts' must be a numeric vector.") | ||
} | ||
|
||
# Check if length of corpus_parts matches the number of columns in tdm_normalized | ||
if (length(corpus_parts) != ncol(tdm_normalized)) { | ||
stop("The length of 'corpus_parts' must match the number of columns in 'tdm_normalized'.") | ||
} | ||
|
||
diffs <- t(apply(tdm_normalized, 1, function(row) { | ||
abs(row - corpus_parts) | ||
})) | ||
dp <- Matrix::rowSums(diffs) / 2 | ||
dp_norm <- dp / (1 - min(corpus_parts)) | ||
|
||
return(dp_norm) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#' Calculate Inverse Document Frequency (IDF) | ||
#' | ||
#' This function is for internal use only. | ||
#' | ||
#' This function that calculates the Inverse Document Frequency 'IDF' | ||
#' for each type (e.g., term, lemma) in a term-document matrix (TDM). | ||
#' It is intended to be used within the package by the | ||
#' `calc_dispersion_metrics` function. | ||
#' | ||
#' @param tdm A term-document matrix (TDM) where each row represents a | ||
#' type and each column represents a document. | ||
#' | ||
#' @return A numeric vector containing the Inverse Document Frequency | ||
#' 'DF' for each type in the TDM. | ||
#' | ||
#' @keywords internal | ||
#' | ||
#' @importFrom Matrix rowSums | ||
# TODO: Implement the 'calc_idf' function | ||
calc_idf <- function(tdm) { | ||
# Check if tdm is a 'dgCMatrix' object | ||
if (!inherits(tdm, "dgCMatrix")) { | ||
stop("The argument 'tdm' must be a 'dgCMatrix' object.") | ||
} | ||
# Check if the matrix has both rows and columns | ||
if (nrow(tdm) == 0 || ncol(tdm) == 0) { | ||
stop("The matrix 'tdm' must have both rows and columns.") | ||
} | ||
df <- Matrix::rowSums(tdm > 0) | ||
idf <- log(ncol(tdm) / df) | ||
|
||
return(idf) | ||
} |
Oops, something went wrong.