From e9adda42c47daffb54fbab63ec47cd2b185be9e8 Mon Sep 17 00:00:00 2001 From: Bruce Date: Sun, 19 May 2024 06:09:52 +0100 Subject: [PATCH] [2024.5] --- DESCRIPTION | 4 +- NAMESPACE | 1 + NEWS.md | 7 ++ R/FMAT.R | 233 +++++++++++++++++++++++++++++++++++-------- README.md | 219 +++++++++++++++++++++------------------- man/BERT_download.Rd | 4 + man/BERT_info.Rd | 35 +++++++ man/BERT_vocab.Rd | 30 +++++- man/FMAT_load.Rd | 7 +- man/FMAT_run.Rd | 12 ++- man/ICC_models.Rd | 2 +- 11 files changed, 395 insertions(+), 159 deletions(-) create mode 100644 man/BERT_info.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 9dece8d..ba85246 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: FMAT Title: The Fill-Mask Association Test -Version: 2024.4 -Date: 2024-04-29 +Version: 2024.5 +Date: 2024-05-15 Authors@R: c(person(given = "Han-Wu-Shuang", family = "Bao", diff --git a/NAMESPACE b/NAMESPACE index 3cf9e03..1740be6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ S3method(summary,fmat) export(.) export(BERT_download) +export(BERT_info) export(BERT_vocab) export(FMAT_load) export(FMAT_query) diff --git a/NEWS.md b/NEWS.md index 69f7fa9..1089148 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,12 @@ **Please check the [latest news (change log)](https://psychbruce.github.io/FMAT/news/index.html) and keep this package updated.** +# FMAT 2024.5 + +- Added `BERT_info()`. +- Added `add.tokens` and `add.method` parameters for `BERT_vocab()` and `FMAT_run()`: an *experimental* feature that adds new tokens (e.g., out-of-vocabulary words, compound words, or even phrases) as [MASK] options. Validation of this novel practice is still in progress (one of my ongoing projects), so please use it only at your own risk until the validation work is published. +- All functions except `BERT_download()` now import local model files only, without automatically downloading models. Users must first use `BERT_download()` to download models. +- Deprecating `FMAT_load()`: it is better to use `FMAT_run()` directly. # FMAT 2024.4 - Added `BERT_vocab()` and `ICC_models()`.
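A minimal sketch of the new 2024.5 workflow described above (the model names and new words simply reuse the examples from the updated documentation; nothing is downloaded or modified beyond what `BERT_download()` normally does):

``` r
library(FMAT)

models = c("bert-base-uncased", "bert-base-cased")

BERT_download(models)  # download first: other functions now only read local files
BERT_info(models)      # file size, vocab size, embedding dims, [MASK] token

# Experimental: temporarily add out-of-vocabulary words/phrases as [MASK] options
BERT_vocab(models, c("individualism", "artificial intelligence"),
           add.tokens = TRUE, add.method = "sum")
```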
diff --git a/R/FMAT.R b/R/FMAT.R index 6475121..87de170 100644 --- a/R/FMAT.R +++ b/R/FMAT.R @@ -42,7 +42,7 @@ } -#### Basic #### +#### Utils #### #' @importFrom PsychWordVec cc @@ -97,7 +97,7 @@ gpu_to_device = function(gpu) { } -transformers_init = function() { +transformers_init = function(print.info=TRUE) { FMAT.ver = as.character(utils::packageVersion("FMAT")) reticulate.ver = as.character(utils::packageVersion("reticulate")) reticulate::py_capture_output({ @@ -120,22 +120,90 @@ transformers_init = function() { transformers = reticulate::import("transformers") tf.ver = transformers$`__version__` }) - cli::cli_alert_info(cli::col_blue("Device Info: + if(print.info) { + cli::cli_alert_info(cli::col_blue("Device Info: + + R Packages: + FMAT {FMAT.ver} + reticulate {reticulate.ver} + + Python Packages: + transformers {tf.ver} + torch {torch.ver} + + NVIDIA GPU CUDA Support: + CUDA Enabled: {torch.cuda} + CUDA Version: {cuda.ver} + {gpu.info} + ")) + } + return(transformers) +} + - R Packages: - FMAT {FMAT.ver} - reticulate {reticulate.ver} +fill_mask_init = function(transformers, model, device=-1L) { + config = transformers$AutoConfig$from_pretrained(model, local_files_only=TRUE) + fill_mask = transformers$pipeline("fill-mask", model=model, config=config, + model_kwargs=list(local_files_only=TRUE), + device=device) + return(fill_mask) +} - Python Packages: - transformers {tf.ver} - torch {torch.ver} - NVIDIA GPU CUDA Support: - CUDA Enabled: {torch.cuda} - CUDA Version: {cuda.ver} - {gpu.info} - ")) - return(transformers) +add_tokens = function( + fill_mask, tokens, + method = c("sum", "mean"), + verbose.in = TRUE, + verbose.out = TRUE +) { + # encode new tokens from subwords + method = match.arg(method) + vocab = fill_mask$tokenizer$get_vocab() + embed = fill_mask$model$get_input_embeddings()$weight$data + tlist = lapply(tokens, function(token) { + encode = fill_mask$tokenizer$encode(token) + encode = encode[c(-1, -length(encode))] + decode = fill_mask$tokenizer$decode(encode) + if(length(encode)==1) { + token.embed = embed[encode] + } else { + if(method=="sum") + token.embed = embed[encode]$sum(0L) + if(method=="mean") + token.embed = embed[encode]$mean(0L) + } + return(list( + token = token, + encode = encode, + decode = decode, + token.raw = sapply(encode, function(id) vocab[vocab==id]), + token.embed = token.embed + )) + }) + names(tlist) = tokens + + # add new tokens to the tokenizer vocabulary + fill_mask$tokenizer$add_tokens(tokens) + + # initialize random embeddings for the new tokens + fill_mask$model$resize_token_embeddings(length(fill_mask$tokenizer)) + + # reset new embeddings to (sum or mean) subword token embeddings + vocab.new = fill_mask$tokenizer$get_vocab() + embed.new = fill_mask$model$get_input_embeddings()$weight$data + for(t in tlist) { + if(is.null(vocab[[t$token]])) { + embed.new[vocab.new[[t$token]]] = t$token.embed + subwords = paste(names(t$token.raw), collapse=", ") + if(verbose.out) + cli::cli_alert_success("Added token {.val {t$token}}: {t$decode} = {method}_embed({subwords})") + } else { + if(verbose.in) + cli::cli_alert_success("{t$token}: already in vocab (token id = {t$encode})") + } + } + + return(fill_mask) } @@ -147,7 +215,7 @@ find_cached_models = function(cache.folder) { paste(paste0(sprintf("%.0f", file.size(models.file) / 1024^2), " MB"), collapse=" / ") }) models.name = str_replace_all(str_remove(models.name, "^models--"), "--", "/") - models.info = data.frame(Size=models.size, row.names=models.name) + models.info = data.frame(size=models.size, 
row.names=models.name) } else { models.info = NULL } @@ -169,6 +237,8 @@ find_cached_models = function(cache.folder) { #' No return value. #' #' @seealso +#' [`BERT_info`] +#' #' [`BERT_vocab`] #' #' [`FMAT_load`] @@ -179,6 +249,8 @@ find_cached_models = function(cache.folder) { #' BERT_download(models) #' #' BERT_download() # check downloaded models +#' +#' BERT_info() # information of all downloaded models #' } #' #' @export @@ -207,43 +279,111 @@ BERT_download = function(models=NULL) { } +#' Get basic information of BERT models. +#' +#' @inheritParams BERT_download +#' +#' @return +#' A data.table of model name, model file size, +#' vocabulary size (of word/token embeddings), +#' embedding dimensions (of word/token embeddings), +#' and \[MASK\] token. +#' +#' @seealso +#' [`BERT_download`] +#' +#' [`BERT_vocab`] +#' +#' @examples +#' \dontrun{ +#' models = c("bert-base-uncased", "bert-base-cased") +#' BERT_info(models) +#' +#' BERT_info() # information of all downloaded models +#' } +#' +#' @export +BERT_info = function(models=NULL) { + transformers = transformers_init(print.info=FALSE) + cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/") + local.models = find_cached_models(cache.folder) + dm = data.table(model=row.names(local.models), size=local.models$size) + model = NULL + if(!is.null(models)) { + dm = dm[model %in% models] + dm$model = factor(dm$model, levels=models) + dm = dm[order(model)] + } else { + dm$model = as.factor(dm$model) + } + dm$size = str_remove(dm$size, " ") + dm = cbind(dm, rbindlist(lapply(dm$model, function(model) { + tokenizer = transformers$AutoTokenizer$from_pretrained(model, local_files_only=TRUE) + model.obj = transformers$AutoModel$from_pretrained(model, local_files_only=TRUE) + word.embeddings = model.obj$embeddings$word_embeddings$weight$data$shape + data.table(vocab = word.embeddings[0], + dims = word.embeddings[1], + mask = tokenizer$mask_token) + }))) + return(dm) +} + + #' Check if mask words are in the model vocabulary. #' #' @inheritParams BERT_download #' @param mask.words Option words filling in the mask. +#' @param add.tokens Add new tokens (for out-of-vocabulary words or even phrases) to model vocabulary? +#' Defaults to `FALSE`. It only temporarily adds tokens for tasks but does not change the raw model file. +#' @param add.method Method used to produce the token embeddings of new added tokens. +#' Can be `"sum"` (default) or `"mean"` of subword token embeddings. #' #' @return -#' A data.table of model names, mask words, and real tokens (replaced if out of vocabulary). +#' A data.table of model name, mask word, real token (replaced if out of vocabulary), +#' and token id (0~N). 
#' #' @seealso #' [`BERT_download`] #' +#' [`BERT_info`] +#' +#' [`FMAT_run`] +#' #' @examples #' \dontrun{ #' models = c("bert-base-uncased", "bert-base-cased") +#' BERT_info(models) +#' #' BERT_vocab(models, c("bruce", "Bruce")) -#' BERT_vocab(models, 1800:2024) +#' +#' BERT_vocab(models, 2020:2025) # some are out-of-vocabulary +#' BERT_vocab(models, 2020:2025, add.tokens=TRUE) # add vocab +#' +#' BERT_vocab(models, +#' c("individualism", "artificial intelligence"), +#' add.tokens=TRUE) #' } #' #' @export -BERT_vocab = function(models, mask.words) { - transformers = transformers_init() - cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/") - cli::cli_text("Loading models from {.path {cache.folder}} ...") - cat("\n") - +BERT_vocab = function( + models, mask.words, + add.tokens = FALSE, + add.method = c("sum", "mean") +) { + transformers = transformers_init(print.info=FALSE) mask.words = as.character(mask.words) maps = rbindlist(lapply(models, function(model) { reticulate::py_capture_output({ - fill_mask = transformers$pipeline("fill-mask", model=model) + fill_mask = fill_mask_init(transformers, model) + if(add.tokens) fill_mask = add_tokens(fill_mask, mask.words, add.method, verbose.in=FALSE) vocab = fill_mask$tokenizer$get_vocab() ids = vocab[mask.words] map = rbindlist(lapply(mask.words, function(mask) { id = as.integer(fill_mask$get_target_ids(mask)) token = names(vocab[vocab==id]) if(is.null(ids[[mask]])) token = paste(token, "(out-of-vocabulary)") - data.table(model=as_factor(model), M_word=as_factor(mask), token=token) + data.table(model=as_factor(model), M_word=as_factor(mask), token=token, token.id=id) })) }) return(map) @@ -258,13 +398,12 @@ BERT_vocab = function(models, mask.words) { #### FMAT #### -#' (Down)Load BERT models (useless for GPU). +#' \[Deprecated\] Load BERT models (useless for GPU). #' #' Load BERT models from local cache folder "%USERPROFILE%/.cache/huggingface". -#' Models that have not been downloaded can also -#' be automatically downloaded (but *silently*). #' For [GPU Acceleration](https://psychbruce.github.io/FMAT/#guidance-for-gpu-acceleration), -#' please directly use [`FMAT_run`] instead. +#' please directly use [`FMAT_run`]. +#' In general, [`FMAT_run`] is always preferred than [`FMAT_load`]. #' #' @inheritParams BERT_download #' @@ -296,7 +435,7 @@ FMAT_load = function(models) { fms = lapply(models, function(model) { t0 = Sys.time() reticulate::py_capture_output({ - fill_mask = transformers$pipeline("fill-mask", model=model) + fill_mask = fill_mask_init(transformers, model) }) cli::cli_alert_success("{model} ({dtime(t0)})") return(list(model.name=model, fill.mask=fill_mask)) @@ -588,6 +727,7 @@ FMAT_query_bind = function(...) { #' raw probability estimates between using CPU and GPU, #' but these differences would have little impact on main results. #' +#' @inheritParams BERT_vocab #' @param models Options: #' - A character vector of model names at #' [HuggingFace](https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers). @@ -632,7 +772,9 @@ FMAT_query_bind = function(...) 
{ #' @seealso #' [`BERT_download`] #' -#' [`FMAT_load`] +#' [`BERT_vocab`] +#' +#' [`FMAT_load`] (deprecated) #' #' [`FMAT_query`] #' @@ -672,6 +814,8 @@ FMAT_run = function( models, data, gpu, + add.tokens = FALSE, + add.method = c("sum", "mean"), file = NULL, progress = TRUE, warning = TRUE, @@ -711,7 +855,7 @@ FMAT_run = function( ## ---- One Run Begin ---- ## if(is.character(model)) { reticulate::py_capture_output({ - fill_mask = transformers$pipeline("fill-mask", model=model, device=device) + fill_mask = fill_mask_init(transformers, model, device) }) } else { fill_mask = model$fill.mask @@ -726,7 +870,6 @@ FMAT_run = function( uncased = str_detect(model, "uncased|albert") prefix.u2581 = str_detect(model, "xlm-roberta|albert") prefix.u0120 = str_detect(model, "roberta|bertweet-large") & !str_detect(model, "xlm") - mask.lower = str_detect(model, "roberta|bertweet") # .mask (final mask target words) data = mutate(data, .mask = as.character(M_word)) @@ -737,8 +880,12 @@ FMAT_run = function( if(prefix.u0120) data = mutate(data, .mask = ifelse(!str_detect(.query, "^\\[MASK\\]"), paste0("\u0120", .mask), .mask)) - if(mask.lower) - data = mutate(data, .query = str_replace(.query, "\\[MASK\\]", "")) + mask.token = fill_mask$tokenizer$mask_token + if(mask.token!="[MASK]") + data = mutate(data, .query = str_replace(.query, "\\[MASK\\]", mask.token)) + + # add tokens for out-of-vocabulary words + if(add.tokens) fill_mask = add_tokens(fill_mask, unique(data$.mask), add.method, verbose.in=FALSE) # unmask (single version) [DEPRECATED] # unmask_each = function(d) { @@ -891,10 +1038,10 @@ warning_oov = function(data) { #' @export summary.fmat = function( object, - mask.pair=TRUE, - target.pair=TRUE, - attrib.pair=TRUE, - warning=TRUE, + mask.pair = TRUE, + target.pair = TRUE, + attrib.pair = TRUE, + warning = TRUE, ...) { if(warning) warning_oov(object) type = attr(object, "type") @@ -991,7 +1138,7 @@ summary.fmat = function( } -#' Intraclass correlation coefficient (ICC) of language models. +#' Intraclass correlation coefficient (ICC) of BERT models. #' #' Interrater agreement of log probabilities (treated as "ratings"/rows) #' among BERT language models (treated as "raters"/columns), @@ -1037,8 +1184,8 @@ ICC_models = function(data, type="agreement", unit="average") { #' @export LPR_reliability = function( fmat, - item=c("query", "T_word", "A_word"), - by=NULL) { + item = c("query", "T_word", "A_word"), + by = NULL) { item = match.arg(item) alphas = plyr::ddply(fmat, by, function(x) { x = as.data.frame(x) diff --git a/README.md b/README.md index 333820e..02681fa 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ The *Fill-Mask Association Test* (FMAT) is an integrative and probability-based method using [BERT Models] to measure conceptual associations (e.g., attitudes, biases, stereotypes, social norms, cultural values) as *propositions* in natural language ([Bao, 2024, *JPSP*](https://psychbruce.github.io/FMAT/#citation)). 
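To make the "probability-based" idea concrete before the setup instructions: the FMAT fills contrasting option words into the masked position of a propositional query and compares their estimated probabilities. An illustrative sketch only (this is not the package's internal code, and the probability values are made up):

``` r
# Hypothetical fill-mask probabilities for the proposition "[MASK] is a doctor."
p_he  = 0.052   # P([MASK] = "He")
p_she = 0.013   # P([MASK] = "She")

# A log probability ratio (LPR) quantifies the association expressed by the proposition
LPR = log(p_he) - log(p_she)   # positive values indicate the model favors "He"
```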
-⚠️ *Please update this package to version ≥ 2024.4 for faster and more robust functionality.* +⚠️ *Please update this package to version ≥ 2024.5 for faster and more robust functionality.* ![](https://psychbruce.github.io/img/FMAT-Workflow.png) @@ -48,20 +48,14 @@ devtools::install_github("psychbruce/FMAT", force=TRUE) ### (2) Python Environment and Packages -#### Step 1 - Install [Anaconda](https://www.anaconda.com/download) (a recommended package manager which automatically installs Python, Python IDEs like Spyder, and a large list of necessary [Python package dependencies](https://docs.anaconda.com/free/anaconda/pkg-docs/)). -#### Step 2 - Specify the Python interpreter in RStudio. > RStudio → Tools → Global/Project Options\ > → Python → Select → **Conda Environments**\ > → Choose **".../Anaconda3/python.exe"** -#### Step 3 - Install the "[transformers](https://huggingface.co/docs/transformers/installation)" and "[torch](https://pytorch.org/get-started/locally/)" Python packages.\ (Windows Command / Anaconda Prompt / RStudio Terminal) @@ -71,9 +65,7 @@ pip install transformers torch See [Guidance for GPU Acceleration] for installation guidance if you have an NVIDIA GPU device on your PC and want to use GPU to accelerate the pipeline. -#### Alternative Approach - -(Not suggested) Besides the pip/conda installation in the *Conda Environment*, you might instead create and use a *Virtual Environment* (see R code below with the `reticulate` package), but then you need to specify the Python interpreter as **"\~/.virtualenvs/r-reticulate/Scripts/python.exe"** in RStudio. +Alternative approach (NOT suggested): Besides the pip/conda installation in the *Conda Environment*, you might instead create and use a *Virtual Environment* (see R code below with the `reticulate` package), but then you need to specify the Python interpreter as **"\~/.virtualenvs/r-reticulate/Scripts/python.exe"** in RStudio. ``` r ## DON'T RUN THIS UNLESS YOU PREFER VIRTUAL ENVIRONMENT library(reticulate) virtualenv_create() virtualenv_install(packages=c("transformers", "torch")) ## Guidance for FMAT -### FMAT Step 1: Query Design +### Step 1: Download BERT Models -Design queries that conceptually represent the constructs you would measure (see [Bao, 2024, *JPSP*](https://psychbruce.github.io/FMAT/#citation) for how to design queries). +Use `BERT_download()` to download [BERT models]. Model files are permanently saved to your local folder "%USERPROFILE%/.cache/huggingface". A full list of BERT-family models is available at [Hugging Face](https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers). -Use `FMAT_query()` and/or `FMAT_query_bind()` to prepare a `data.table` of queries. - -### FMAT Step 2: Model Loading +### Step 2: Design FMAT Queries -Use `BERT_download()` and `FMAT_load()` to (down)load [BERT models]. Model files are permanently saved to your local folder "%USERPROFILE%/.cache/huggingface". A full list of BERT-family models are available at [Hugging Face](https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers). +Design queries that conceptually represent the constructs you would measure (see [Bao, 2024, *JPSP*](https://psychbruce.github.io/FMAT/#citation) for how to design queries). -If you want to use GPU (see [Guidance for GPU Acceleration]), please skip to [FMAT Step 3: Model Processing] and directly use `FMAT_run()` without `FMAT_load()`. +Use `FMAT_query()` and/or `FMAT_query_bind()` to prepare a `data.table` of queries.
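For example, a minimal Step 2 sketch (the query wording and MASK labels are purely illustrative; check `?FMAT_query` for the exact argument format and for adding target/attribute words):

``` r
library(FMAT)

# One propositional query with two contrasting [MASK] options
query = FMAT_query("[MASK] is a doctor.",
                   MASK = .(Male = "He", Female = "She"))
```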
-### FMAT Step 3: Model Processing +### Step 3: Run FMAT Use `FMAT_run()` to get raw data (probability estimates) for further analysis. -Several steps of pre-processing have been included in the function for easier use (see `FMAT_run()` for details). +Several steps of preprocessing have been included in the function for easier use (see `FMAT_run()` for details). - For BERT variants using `` rather than `[MASK]` as the mask token, the input query will be *automatically* modified so that users can always use `[MASK]` in query design. - For some BERT variants, special prefix characters such as `\u0120` and `\u2581` will be *automatically* added to match the whole words (rather than subwords) for `[MASK]`. @@ -167,7 +157,7 @@ If you are new to [BERT](https://arxiv.org/abs/1810.04805), these references can ``` r library(FMAT) -model.names = c( +models = c( "bert-base-uncased", "bert-base-cased", "bert-large-uncased", @@ -181,15 +171,18 @@ model.names = c( "vinai/bertweet-base", "vinai/bertweet-large" ) -BERT_download(model.names) +BERT_download(models) ``` -``` +``` {style="height: 500px"} ℹ Device Info: -Python Environment: -Package Version -transformers 4.38.2 +R Packages: +FMAT 2024.5 +reticulate 1.36.1 + +Python Packages: +transformers 4.40.2 torch 2.2.1+cu121 NVIDIA GPU CUDA Support: @@ -198,148 +191,143 @@ CUDA Version: 12.1 GPU (Device): NVIDIA GeForce RTX 2050 -── Downloading model "bert-base-uncased" ─────────────────────────────────────────── +── Downloading model "bert-base-uncased" ────────────────────────────────────────── → (1) Downloading configuration... -config.json: 100%|██████████| 570/570 [00:00<00:00, 113kB/s] +config.json: 100%|██████████| 570/570 [00:00<00:00, 114kB/s] → (2) Downloading tokenizer... -tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00 + 1: bert-base-uncased 420MB 30522 768 [MASK] + 2: bert-base-cased 416MB 28996 768 [MASK] + 3: bert-large-uncased 1283MB 30522 1024 [MASK] + 4: bert-large-cased 1277MB 28996 1024 [MASK] + 5: distilbert-base-uncased 256MB 30522 768 [MASK] + 6: distilbert-base-cased 251MB 28996 768 [MASK] + 7: albert-base-v1 45MB 30000 128 [MASK] + 8: albert-base-v2 45MB 30000 128 [MASK] + 9: roberta-base 476MB 50265 768 +10: distilroberta-base 316MB 50265 768 +11: vinai/bertweet-base 517MB 64001 768 +12: vinai/bertweet-large 1356MB 50265 1024 +``` + +(Tested 2024-05-16 on the developer's computer: HP Probook 450 G10 Notebook PC) ## Related Packages diff --git a/man/BERT_download.Rd b/man/BERT_download.Rd index e95372b..39d06e9 100644 --- a/man/BERT_download.Rd +++ b/man/BERT_download.Rd @@ -22,10 +22,14 @@ models = c("bert-base-uncased", "bert-base-cased") BERT_download(models) BERT_download() # check downloaded models + +BERT_info() # information of all downloaded models } } \seealso{ +\code{\link{BERT_info}} + \code{\link{BERT_vocab}} \code{\link{FMAT_load}} diff --git a/man/BERT_info.Rd b/man/BERT_info.Rd new file mode 100644 index 0000000..46fbc76 --- /dev/null +++ b/man/BERT_info.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FMAT.R +\name{BERT_info} +\alias{BERT_info} +\title{Get basic information of BERT models.} +\usage{ +BERT_info(models = NULL) +} +\arguments{ +\item{models}{Model names at +\href{https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers}{HuggingFace}.} +} +\value{ +A data.table of model name, model file size, +vocabulary size (of word/token embeddings), +embedding dimensions (of word/token embeddings), +and [MASK] token. 
+} +\description{ +Get basic information of BERT models. +} +\examples{ +\dontrun{ +models = c("bert-base-uncased", "bert-base-cased") +BERT_info(models) + +BERT_info() # information of all downloaded models +} + +} +\seealso{ +\code{\link{BERT_download}} + +\code{\link{BERT_vocab}} +} diff --git a/man/BERT_vocab.Rd b/man/BERT_vocab.Rd index 830702b..e06e1f4 100644 --- a/man/BERT_vocab.Rd +++ b/man/BERT_vocab.Rd @@ -4,16 +4,28 @@ \alias{BERT_vocab} \title{Check if mask words are in the model vocabulary.} \usage{ -BERT_vocab(models, mask.words) +BERT_vocab( + models, + mask.words, + add.tokens = FALSE, + add.method = c("sum", "mean") +) } \arguments{ \item{models}{Model names at \href{https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers}{HuggingFace}.} \item{mask.words}{Option words filling in the mask.} + +\item{add.tokens}{Add new tokens (for out-of-vocabulary words or even phrases) to model vocabulary? +Defaults to \code{FALSE}. It only temporarily adds tokens for tasks but does not change the raw model file.} + +\item{add.method}{Method used to produce the token embeddings of new added tokens. +Can be \code{"sum"} (default) or \code{"mean"} of subword token embeddings.} } \value{ -A data.table of model names, mask words, and real tokens (replaced if out of vocabulary). +A data.table of model name, mask word, real token (replaced if out of vocabulary), +and token id (0~N). } \description{ Check if mask words are in the model vocabulary. @@ -21,11 +33,23 @@ Check if mask words are in the model vocabulary. \examples{ \dontrun{ models = c("bert-base-uncased", "bert-base-cased") +BERT_info(models) + BERT_vocab(models, c("bruce", "Bruce")) -BERT_vocab(models, 1800:2024) + +BERT_vocab(models, 2020:2025) # some are out-of-vocabulary +BERT_vocab(models, 2020:2025, add.tokens=TRUE) # add vocab + +BERT_vocab(models, + c("individualism", "artificial intelligence"), + add.tokens=TRUE) } } \seealso{ \code{\link{BERT_download}} + +\code{\link{BERT_info}} + +\code{\link{FMAT_run}} } diff --git a/man/FMAT_load.Rd b/man/FMAT_load.Rd index 7d2e15c..b714001 100644 --- a/man/FMAT_load.Rd +++ b/man/FMAT_load.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/FMAT.R \name{FMAT_load} \alias{FMAT_load} -\title{(Down)Load BERT models (useless for GPU).} +\title{[Deprecated] Load BERT models (useless for GPU).} \usage{ FMAT_load(models) } @@ -17,10 +17,9 @@ You will need to \emph{rerun} this function if you \emph{restart} the R session. } \description{ Load BERT models from local cache folder "\%USERPROFILE\%/.cache/huggingface". -Models that have not been downloaded can also -be automatically downloaded (but \emph{silently}). For \href{https://psychbruce.github.io/FMAT/#guidance-for-gpu-acceleration}{GPU Acceleration}, -please directly use \code{\link{FMAT_run}} instead. +please directly use \code{\link{FMAT_run}}. +In general, \code{\link{FMAT_run}} is always preferred than \code{\link{FMAT_load}}. } \examples{ \dontrun{ diff --git a/man/FMAT_run.Rd b/man/FMAT_run.Rd index f543bac..a4645ba 100644 --- a/man/FMAT_run.Rd +++ b/man/FMAT_run.Rd @@ -8,6 +8,8 @@ FMAT_run( models, data, gpu, + add.tokens = FALSE, + add.method = c("sum", "mean"), file = NULL, progress = TRUE, warning = TRUE, @@ -49,6 +51,12 @@ which defines the device (e.g., on which the pipeline will be allocated. }} +\item{add.tokens}{Add new tokens (for out-of-vocabulary words or even phrases) to model vocabulary? +Defaults to \code{FALSE}. 
It only temporarily adds tokens for tasks but does not change the raw model file.} + +\item{add.method}{Method used to produce the token embeddings of new added tokens. +Can be \code{"sum"} (default) or \code{"mean"} of subword token embeddings.} + \item{file}{File name of \code{.RData} to save the returned data.} \item{progress}{Show a progress bar? Defaults to \code{TRUE}.} @@ -129,7 +137,9 @@ summary(data2) \seealso{ \code{\link{BERT_download}} -\code{\link{FMAT_load}} +\code{\link{BERT_vocab}} + +\code{\link{FMAT_load}} (deprecated) \code{\link{FMAT_query}} diff --git a/man/ICC_models.Rd b/man/ICC_models.Rd index 6ad1ea7..2f557c7 100644 --- a/man/ICC_models.Rd +++ b/man/ICC_models.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/FMAT.R \name{ICC_models} \alias{ICC_models} -\title{Intraclass correlation coefficient (ICC) of language models.} +\title{Intraclass correlation coefficient (ICC) of BERT models.} \usage{ ICC_models(data, type = "agreement", unit = "average") }
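To close the loop on how `ICC_models()` fits the workflow, a hedged end-to-end sketch (it assumes `query` was prepared with `FMAT_query()` and that `ICC_models()` accepts the raw `FMAT_run()` output; please verify against `?FMAT_run` and `?ICC_models` before relying on it):

``` r
library(FMAT)

models = c("bert-base-uncased", "bert-base-cased")

data = FMAT_run(models, query)   # raw probability estimates per model and query
summary(data)                    # log probability ratios (see ?summary.fmat)
ICC_models(data)                 # interrater agreement of log probabilities across models
```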