[0.1.0]

psychbruce · Jul 17, 2023 · c79313b · c79313b
1 parent 5d9df58
commit c79313b
Show file tree

Hide file tree

Showing 9 changed files with 101 additions and 27 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,5 +1,6 @@
 data-raw
 research
+logo
 ^.*\.pdf$
 ^.*\.docx$
 ^.Rhistory$

diff --git a/.gitignore b/.gitignore
@@ -46,6 +46,7 @@ vignettes/*.pdf
 # Others
 data-raw
 research
+logo
 ignore
 docs
 pkgdown

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: FMAT
 Title: The Fill-Mask Association Test
-Version: 0.0.9
-Date: 2023-05-01
+Version: 0.1.0
+Date: 2023-07-08
 Authors@R:
     c(person(given = "Han-Wu-Shuang",
              family = "Bao",
@@ -11,9 +11,9 @@ Authors@R:
 Author: Han-Wu-Shuang Bao [aut, cre]
 Maintainer: Han-Wu-Shuang Bao <baohws@foxmail.com>
 Description:
-    The Fill-Mask Association Test ('FMAT') is a versatile, probability-based
-    method that employs Masked Language Models ('BERT') to
-    measure conceptual associations (e.g., attitudes, biases, stereotypes)
+    The Fill-Mask Association Test ('FMAT') is an integrative, versatile,
+    and probability-based method that employs Masked Language Models ('BERT')
+    to measure conceptual associations (e.g., attitudes, biases, stereotypes)
     in natural language.
 License: GPL-3
 Encoding: UTF-8
@@ -23,7 +23,7 @@ SystemRequirements: Python (>= 3.6.0)
 Depends: R (>= 4.0.0)
 Imports:
   PsychWordVec, reticulate, text,
-  data.table, stringr, forcats,
-  glue, cli, purrr, plyr, parallel
-Suggests: bruceR, nlme, car, knitr, rmarkdown
+  data.table, stringr, forcats, psych,
+  glue, cli, purrr, plyr, tidyr
+Suggests: bruceR, nlme, car, knitr, dplyr, parallel, rmarkdown
 RoxygenNote: 7.2.3
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,7 @@ export(FMAT_load)
 export(FMAT_query)
 export(FMAT_query_bind)
 export(FMAT_run)
+export(LPR_reliability)
 export(cc)
 import(data.table)
 import(stringr)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 **Please check the [latest news (change log)](https://psychbruce.github.io/FMAT/news/index.html) and keep this package updated.**
 
+# FMAT 0.1.0 (Jul 2023)
+
+-   Added `LPR_reliability()`; renamed the `prop` column returned by `FMAT_run()` to `prob`; fixed bugs and improved functions.
+
 # FMAT 0.0.9 (May 2023)
 
 -   Initial public release on [GitHub](https://github.com/psychbruce/FMAT).

diff --git a/R/FMAT.R b/R/FMAT.R
@@ -418,7 +418,7 @@ FMAT_query_bind = function(...) {
 #'   \item{\code{token}: actual token to be filled in the blank mask
 #'   (a note "out-of-vocabulary" will be added
 #'   if the original word is not found in the model vocabulary).}
-#'   \item{\code{prop}: (raw) conditional probability of the unmasked token
+#'   \item{\code{prob}: (raw) conditional probability of the unmasked token
 #'   given the provided context, estimated by the masked language model.
 #'
 #'   * It is NOT SUGGESTED to directly interpret the raw probabilities
@@ -498,8 +498,8 @@ FMAT_run = function(
 
     uncased = str_detect(model, "uncased|albert")
     prefix.u2581 = str_detect(model, "xlm-roberta|albert")
-    prefix.u0120 = str_detect(model, "roberta") & !str_detect(model, "xlm")
-    mask.lower = str_detect(model, "roberta")
+    prefix.u0120 = str_detect(model, "roberta|bertweet-large") & !str_detect(model, "xlm")
+    mask.lower = str_detect(model, "roberta|bertweet")
 
     unmask = function(d) {
       if("TARGET" %in% names(d))
@@ -524,7 +524,7 @@ FMAT_run = function(
           oov=="",  # no extra output from python
           res$token_str,
           paste(res$token_str, "(out-of-vocabulary)")),
-        prop = res$score
+        prob = res$score
       ))
     }
 
@@ -616,15 +616,16 @@ warning_oov = function(data) {
 #' \code{\link{FMAT_run}}
 #'
 #' @export
-summary.fmat = function(object,
-                        mask.pair=TRUE,
-                        target.pair=TRUE,
-                        attrib.pair=TRUE,
-                        warning=TRUE,
-                        ...) {
+summary.fmat = function(
+    object,
+    mask.pair=TRUE,
+    target.pair=TRUE,
+    attrib.pair=TRUE,
+    warning=TRUE,
+    ...) {
   if(warning) warning_oov(object)
   type = attr(object, "type")
-  M_word = T_word = A_word = MASK = TARGET = ATTRIB = prop = LPR = NULL
+  M_word = T_word = A_word = MASK = TARGET = ATTRIB = prob = LPR = NULL
 
   if(mask.pair) {
     gvars = c("model", "query", "M_pair",
@@ -634,7 +635,7 @@ summary.fmat = function(object,
     dt = object[, .(
       MASK = paste(MASK[1], "-", MASK[2]),
       M_word = paste(M_word[1], "-", M_word[2]),
-      LPR = log(prop[1]) - log(prop[2])
+      LPR = log(prob[1]) - log(prob[2])
     ), keyby = grouping.vars]
     dt$MASK = as_factor(dt$MASK)
     dt$M_word = as_factor(dt$M_word)
@@ -643,11 +644,11 @@ summary.fmat = function(object,
     dvars = c("model", "query", "MASK", "M_word",
               "TARGET", "T_pair", "T_word",
               "ATTRIB", "A_pair", "A_word",
-              "prop")
+              "prob")
     dt.vars = intersect(names(object), dvars)
     dt = object[, dt.vars, with=FALSE]
-    dt$LPR = log(dt$prop)
-    dt$prop = NULL
+    dt$LPR = log(dt$prob)
+    dt$prob = NULL
   }
 
   if(type=="MT") {
@@ -692,3 +693,45 @@ summary.fmat = function(object,
 }
 
 
+#' Reliability analysis (Cronbach's \eqn{\alpha}) of LPR.
+#'
+#' @param fmat A data.table returned from \code{\link{summary.fmat}}.
+#' @param item Reliability of multiple \code{"query"} (default),
+#' \code{"T_word"}, or \code{"A_word"}.
+#' @param by Variable(s) to split data by.
+#' Options can be \code{"model"}, \code{"TARGET"}, \code{"ATTRIB"},
+#' or any combination of them.
+#'
+#' @return
+#' A data.table with columns \code{n.obs}, \code{k.items}, and \code{alpha}
+#' (raw Cronbach's \eqn{\alpha}), one row per level of \code{by}.
+#'
+#' @export
+LPR_reliability = function(
+    fmat,
+    item=c("query", "T_word", "A_word"),
+    by=NULL) {
+  item = match.arg(item)
+  alphas = plyr::ddply(fmat, by, function(x) {
+    x = as.data.frame(x)
+    x[[item]] = as.numeric(x[[item]])
+    x = x[setdiff(names(x), c("T_pair", "A_pair"))]
+    # spread LPR into one column per level of `item` (wide input for psych::alpha)
+    x = tidyr::pivot_wider(
+      x,
+      names_from = tidyr::all_of(item),  # `item` holds a column name as a string
+      names_glue = paste0("LPR.{", item, "}"),
+      values_from = "LPR")
+    # base-R column subset: 'dplyr' is only in Suggests, so avoid dplyr::select()
+    lpr = x[startsWith(names(x), "LPR.")]
+    alpha = suppressWarnings(suppressMessages(
+      psych::alpha(lpr, delete=FALSE, warnings=FALSE)))
+    data.frame(n.obs = nrow(x),
+               k.items = alpha$nvar,
+               alpha = alpha$total$raw_alpha)
+  })
+  # without `by`, drop the first column of the ddply() output (presumably a placeholder group id; verify)
+  if(is.null(by)) alphas[[1]] = NULL
+  return(as.data.table(alphas))
+}
+
diff --git a/README.md b/README.md
@@ -1,8 +1,8 @@
 # FMAT <img src="man/figures/logo.png" align="right" height="160"/>
 
-😷 The Fill-Mask Association Test.
+😷 The Fill-Mask Association Test (掩码填空联系测验).
 
-The *Fill-Mask Association Test* (FMAT) is a versatile, probability-based method that employs Masked Language Models ([BERT](https://arxiv.org/abs/1810.04805)) to measure conceptual associations (e.g., attitudes, biases, stereotypes) in natural language.
+The *Fill-Mask Association Test* (FMAT) is an integrative, versatile, and probability-based method that employs Masked Language Models ([BERT](https://arxiv.org/abs/1810.04805)) to measure conceptual associations (e.g., attitudes, biases, stereotypes) in natural language.
 
 <!-- badges: start -->
 
@@ -22,7 +22,7 @@ Homepage: [psychbruce.github.io](https://psychbruce.github.io)
 
 ## Citation
 
--   Bao, H.-W.-S. (2023). *Using AI language models to unmask society and culture: The Fill-Mask Association Test (FMAT)*. Manuscript in preparation.
+-   Bao, H.-W.-S. (2023). *The Fill-Mask Association Test (FMAT): Using AI language models to understand society and culture* [Manuscript in preparation].
 
 ## Installation
 

diff --git a/man/FMAT_run.Rd b/man/FMAT_run.Rd
diff --git a/man/LPR_reliability.Rd b/man/LPR_reliability.Rd