From e9adda42c47daffb54fbab63ec47cd2b185be9e8 Mon Sep 17 00:00:00 2001 From: Bruce Date: Sun, 19 May 2024 06:09:52 +0100 Subject: [PATCH] [2024.5] --- DESCRIPTION | 4 +- NAMESPACE | 1 + NEWS.md | 7 ++ R/FMAT.R | 233 +++++++++++++++++++++++++++++++++++-------- README.md | 219 +++++++++++++++++++++------------------- man/BERT_download.Rd | 4 + man/BERT_info.Rd | 35 +++++++ man/BERT_vocab.Rd | 30 +++++- man/FMAT_load.Rd | 7 +- man/FMAT_run.Rd | 12 ++- man/ICC_models.Rd | 2 +- 11 files changed, 395 insertions(+), 159 deletions(-) create mode 100644 man/BERT_info.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 9dece8d..ba85246 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: FMAT Title: The Fill-Mask Association Test -Version: 2024.4 -Date: 2024-04-29 +Version: 2024.5 +Date: 2024-05-15 Authors@R: c(person(given = "Han-Wu-Shuang", family = "Bao", diff --git a/NAMESPACE b/NAMESPACE index 3cf9e03..1740be6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ S3method(summary,fmat) export(.) export(BERT_download) +export(BERT_info) export(BERT_vocab) export(FMAT_load) export(FMAT_query) diff --git a/NEWS.md b/NEWS.md index 69f7fa9..1089148 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,12 @@ **Please check the [latest news (change log)](https://psychbruce.github.io/FMAT/news/index.html) and keep this package updated.** +# FMAT 2024.5 + +- Added `BERT_info()`. +- Added `add.tokens` and `add.method` parameters for `BERT_vocab()` and `FMAT_run()`: an *experimental* feature that adds new tokens (e.g., out-of-vocabulary words, compound words, or even phrases) as [MASK] options. Validation of this novel practice is still in progress (one of my ongoing projects), so please use it only at your own risk until the validation work is published. +- All functions except `BERT_download()` now import local model files only, without automatically downloading models. Users must first use `BERT_download()` to download models. +- Deprecating `FMAT_load()`: it is better to use `FMAT_run()` directly. # FMAT 2024.4 - Added `BERT_vocab()` and `ICC_models()`.
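A minimal sketch of the new 2024.5 workflow described above (the model names and new words simply reuse the examples from the updated documentation; nothing is downloaded or modified beyond what `BERT_download()` normally does):

``` r
library(FMAT)

models = c("bert-base-uncased", "bert-base-cased")

BERT_download(models)  # download first: other functions now only read local files
BERT_info(models)      # file size, vocab size, embedding dims, [MASK] token

# Experimental: temporarily add out-of-vocabulary words/phrases as [MASK] options
BERT_vocab(models, c("individualism", "artificial intelligence"),
           add.tokens = TRUE, add.method = "sum")
```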
diff --git a/R/FMAT.R b/R/FMAT.R index 6475121..87de170 100644 --- a/R/FMAT.R +++ b/R/FMAT.R @@ -42,7 +42,7 @@ } -#### Basic #### +#### Utils #### #' @importFrom PsychWordVec cc @@ -97,7 +97,7 @@ gpu_to_device = function(gpu) { } -transformers_init = function() { +transformers_init = function(print.info=TRUE) { FMAT.ver = as.character(utils::packageVersion("FMAT")) reticulate.ver = as.character(utils::packageVersion("reticulate")) reticulate::py_capture_output({ @@ -120,22 +120,90 @@ transformers_init = function() { transformers = reticulate::import("transformers") tf.ver = transformers$`__version__` }) - cli::cli_alert_info(cli::col_blue("Device Info: + if(print.info) { + cli::cli_alert_info(cli::col_blue("Device Info: + + R Packages: + FMAT {FMAT.ver} + reticulate {reticulate.ver} + + Python Packages: + transformers {tf.ver} + torch {torch.ver} + + NVIDIA GPU CUDA Support: + CUDA Enabled: {torch.cuda} + CUDA Version: {cuda.ver} + {gpu.info} + ")) + } + return(transformers) +} + - R Packages: - FMAT {FMAT.ver} - reticulate {reticulate.ver} +fill_mask_init = function(transformers, model, device=-1L) { + config = transformers$AutoConfig$from_pretrained(model, local_files_only=TRUE) + fill_mask = transformers$pipeline("fill-mask", model=model, config=config, + model_kwargs=list(local_files_only=TRUE), + device=device) + return(fill_mask) +} - Python Packages: - transformers {tf.ver} - torch {torch.ver} - NVIDIA GPU CUDA Support: - CUDA Enabled: {torch.cuda} - CUDA Version: {cuda.ver} - {gpu.info} - ")) - return(transformers) +add_tokens = function( + fill_mask, tokens, + method = c("sum", "mean"), + verbose.in = TRUE, + verbose.out = TRUE +) { + # encode new tokens from subwords + method = match.arg(method) + vocab = fill_mask$tokenizer$get_vocab() + embed = fill_mask$model$get_input_embeddings()$weight$data + tlist = lapply(tokens, function(token) { + encode = fill_mask$tokenizer$encode(token) + encode = encode[c(-1, -length(encode))] + decode = fill_mask$tokenizer$decode(encode) + if(length(encode)==1) { + token.embed = embed[encode] + } else { + if(method=="sum") + token.embed = embed[encode]$sum(0L) + if(method=="mean") + token.embed = embed[encode]$mean(0L) + } + return(list( + token = token, + encode = encode, + decode = decode, + token.raw = sapply(encode, function(id) vocab[vocab==id]), + token.embed = token.embed + )) + }) + names(tlist) = tokens + + # add new tokens to the tokenizer vocabulary + fill_mask$tokenizer$add_tokens(tokens) + + # initialize random embeddings for the new tokens + fill_mask$model$resize_token_embeddings(length(fill_mask$tokenizer)) + + # reset new embeddings to (sum or mean) subword token embeddings + vocab.new = fill_mask$tokenizer$get_vocab() + embed.new = fill_mask$model$get_input_embeddings()$weight$data + for(t in tlist) { + if(is.null(vocab[[t$token]])) { + embed.new[vocab.new[[t$token]]] = t$token.embed + subwords = paste(names(t$token.raw), collapse=", ") + if(verbose.out) + cli::cli_alert_success("Added token {.val {t$token}}: {t$decode} = {method}_embed({subwords})") + } else { + if(verbose.in) + cli::cli_alert_success("{t$token}: already in vocab (token id = {t$encode})") + } + } + + return(fill_mask) } @@ -147,7 +215,7 @@ find_cached_models = function(cache.folder) { paste(paste0(sprintf("%.0f", file.size(models.file) / 1024^2), " MB"), collapse=" / ") }) models.name = str_replace_all(str_remove(models.name, "^models--"), "--", "/") - models.info = data.frame(Size=models.size, row.names=models.name) + models.info = data.frame(size=models.size, 
row.names=models.name) } else { models.info = NULL } @@ -169,6 +237,8 @@ find_cached_models = function(cache.folder) { #' No return value. #' #' @seealso +#' [`BERT_info`] +#' #' [`BERT_vocab`] #' #' [`FMAT_load`] @@ -179,6 +249,8 @@ find_cached_models = function(cache.folder) { #' BERT_download(models) #' #' BERT_download() # check downloaded models +#' +#' BERT_info() # information of all downloaded models #' } #' #' @export @@ -207,43 +279,111 @@ BERT_download = function(models=NULL) { } +#' Get basic information of BERT models. +#' +#' @inheritParams BERT_download +#' +#' @return +#' A data.table of model name, model file size, +#' vocabulary size (of word/token embeddings), +#' embedding dimensions (of word/token embeddings), +#' and \[MASK\] token. +#' +#' @seealso +#' [`BERT_download`] +#' +#' [`BERT_vocab`] +#' +#' @examples +#' \dontrun{ +#' models = c("bert-base-uncased", "bert-base-cased") +#' BERT_info(models) +#' +#' BERT_info() # information of all downloaded models +#' } +#' +#' @export +BERT_info = function(models=NULL) { + transformers = transformers_init(print.info=FALSE) + cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/") + local.models = find_cached_models(cache.folder) + dm = data.table(model=row.names(local.models), size=local.models$size) + model = NULL + if(!is.null(models)) { + dm = dm[model %in% models] + dm$model = factor(dm$model, levels=models) + dm = dm[order(model)] + } else { + dm$model = as.factor(dm$model) + } + dm$size = str_remove(dm$size, " ") + dm = cbind(dm, rbindlist(lapply(dm$model, function(model) { + tokenizer = transformers$AutoTokenizer$from_pretrained(model, local_files_only=TRUE) + model.obj = transformers$AutoModel$from_pretrained(model, local_files_only=TRUE) + word.embeddings = model.obj$embeddings$word_embeddings$weight$data$shape + data.table(vocab = word.embeddings[0], + dims = word.embeddings[1], + mask = tokenizer$mask_token) + }))) + return(dm) +} + + #' Check if mask words are in the model vocabulary. #' #' @inheritParams BERT_download #' @param mask.words Option words filling in the mask. +#' @param add.tokens Add new tokens (for out-of-vocabulary words or even phrases) to model vocabulary? +#' Defaults to `FALSE`. It only temporarily adds tokens for tasks but does not change the raw model file. +#' @param add.method Method used to produce the token embeddings of new added tokens. +#' Can be `"sum"` (default) or `"mean"` of subword token embeddings. #' #' @return -#' A data.table of model names, mask words, and real tokens (replaced if out of vocabulary). +#' A data.table of model name, mask word, real token (replaced if out of vocabulary), +#' and token id (0~N). 
#' #' @seealso #' [`BERT_download`] #' +#' [`BERT_info`] +#' +#' [`FMAT_run`] +#' #' @examples #' \dontrun{ #' models = c("bert-base-uncased", "bert-base-cased") +#' BERT_info(models) +#' #' BERT_vocab(models, c("bruce", "Bruce")) -#' BERT_vocab(models, 1800:2024) +#' +#' BERT_vocab(models, 2020:2025) # some are out-of-vocabulary +#' BERT_vocab(models, 2020:2025, add.tokens=TRUE) # add vocab +#' +#' BERT_vocab(models, +#' c("individualism", "artificial intelligence"), +#' add.tokens=TRUE) #' } #' #' @export -BERT_vocab = function(models, mask.words) { - transformers = transformers_init() - cache.folder = str_replace_all(transformers$TRANSFORMERS_CACHE, "\\\\", "/") - cli::cli_text("Loading models from {.path {cache.folder}} ...") - cat("\n") - +BERT_vocab = function( + models, mask.words, + add.tokens = FALSE, + add.method = c("sum", "mean") +) { + transformers = transformers_init(print.info=FALSE) mask.words = as.character(mask.words) maps = rbindlist(lapply(models, function(model) { reticulate::py_capture_output({ - fill_mask = transformers$pipeline("fill-mask", model=model) + fill_mask = fill_mask_init(transformers, model) + if(add.tokens) fill_mask = add_tokens(fill_mask, mask.words, add.method, verbose.in=FALSE) vocab = fill_mask$tokenizer$get_vocab() ids = vocab[mask.words] map = rbindlist(lapply(mask.words, function(mask) { id = as.integer(fill_mask$get_target_ids(mask)) token = names(vocab[vocab==id]) if(is.null(ids[[mask]])) token = paste(token, "(out-of-vocabulary)") - data.table(model=as_factor(model), M_word=as_factor(mask), token=token) + data.table(model=as_factor(model), M_word=as_factor(mask), token=token, token.id=id) })) }) return(map) @@ -258,13 +398,12 @@ BERT_vocab = function(models, mask.words) { #### FMAT #### -#' (Down)Load BERT models (useless for GPU). +#' \[Deprecated\] Load BERT models (useless for GPU). #' #' Load BERT models from local cache folder "%USERPROFILE%/.cache/huggingface". -#' Models that have not been downloaded can also -#' be automatically downloaded (but *silently*). #' For [GPU Acceleration](https://psychbruce.github.io/FMAT/#guidance-for-gpu-acceleration), -#' please directly use [`FMAT_run`] instead. +#' please directly use [`FMAT_run`]. +#' In general, [`FMAT_run`] is always preferred than [`FMAT_load`]. #' #' @inheritParams BERT_download #' @@ -296,7 +435,7 @@ FMAT_load = function(models) { fms = lapply(models, function(model) { t0 = Sys.time() reticulate::py_capture_output({ - fill_mask = transformers$pipeline("fill-mask", model=model) + fill_mask = fill_mask_init(transformers, model) }) cli::cli_alert_success("{model} ({dtime(t0)})") return(list(model.name=model, fill.mask=fill_mask)) @@ -588,6 +727,7 @@ FMAT_query_bind = function(...) { #' raw probability estimates between using CPU and GPU, #' but these differences would have little impact on main results. #' +#' @inheritParams BERT_vocab #' @param models Options: #' - A character vector of model names at #' [HuggingFace](https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers). @@ -632,7 +772,9 @@ FMAT_query_bind = function(...) 
{ #' @seealso #' [`BERT_download`] #' -#' [`FMAT_load`] +#' [`BERT_vocab`] +#' +#' [`FMAT_load`] (deprecated) #' #' [`FMAT_query`] #' @@ -672,6 +814,8 @@ FMAT_run = function( models, data, gpu, + add.tokens = FALSE, + add.method = c("sum", "mean"), file = NULL, progress = TRUE, warning = TRUE, @@ -711,7 +855,7 @@ FMAT_run = function( ## ---- One Run Begin ---- ## if(is.character(model)) { reticulate::py_capture_output({ - fill_mask = transformers$pipeline("fill-mask", model=model, device=device) + fill_mask = fill_mask_init(transformers, model, device) }) } else { fill_mask = model$fill.mask @@ -726,7 +870,6 @@ FMAT_run = function( uncased = str_detect(model, "uncased|albert") prefix.u2581 = str_detect(model, "xlm-roberta|albert") prefix.u0120 = str_detect(model, "roberta|bertweet-large") & !str_detect(model, "xlm") - mask.lower = str_detect(model, "roberta|bertweet") # .mask (final mask target words) data = mutate(data, .mask = as.character(M_word)) @@ -737,8 +880,12 @@ FMAT_run = function( if(prefix.u0120) data = mutate(data, .mask = ifelse(!str_detect(.query, "^\\[MASK\\]"), paste0("\u0120", .mask), .mask)) - if(mask.lower) - data = mutate(data, .query = str_replace(.query, "\\[MASK\\]", "")) + mask.token = fill_mask$tokenizer$mask_token + if(mask.token!="[MASK]") + data = mutate(data, .query = str_replace(.query, "\\[MASK\\]", mask.token)) + + # add tokens for out-of-vocabulary words + if(add.tokens) fill_mask = add_tokens(fill_mask, unique(data$.mask), add.method, verbose.in=FALSE) # unmask (single version) [DEPRECATED] # unmask_each = function(d) { @@ -891,10 +1038,10 @@ warning_oov = function(data) { #' @export summary.fmat = function( object, - mask.pair=TRUE, - target.pair=TRUE, - attrib.pair=TRUE, - warning=TRUE, + mask.pair = TRUE, + target.pair = TRUE, + attrib.pair = TRUE, + warning = TRUE, ...) { if(warning) warning_oov(object) type = attr(object, "type") @@ -991,7 +1138,7 @@ summary.fmat = function( } -#' Intraclass correlation coefficient (ICC) of language models. +#' Intraclass correlation coefficient (ICC) of BERT models. #' #' Interrater agreement of log probabilities (treated as "ratings"/rows) #' among BERT language models (treated as "raters"/columns), @@ -1037,8 +1184,8 @@ ICC_models = function(data, type="agreement", unit="average") { #' @export LPR_reliability = function( fmat, - item=c("query", "T_word", "A_word"), - by=NULL) { + item = c("query", "T_word", "A_word"), + by = NULL) { item = match.arg(item) alphas = plyr::ddply(fmat, by, function(x) { x = as.data.frame(x) diff --git a/README.md b/README.md index 333820e..02681fa 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ The *Fill-Mask Association Test* (FMAT) is an integrative and probability-based method using [BERT Models] to measure conceptual associations (e.g., attitudes, biases, stereotypes, social norms, cultural values) as *propositions* in natural language ([Bao, 2024, *JPSP*](https://psychbruce.github.io/FMAT/#citation)). 
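To make the "probability-based" idea concrete before the setup instructions: the FMAT fills contrasting option words into the masked position of a propositional query and compares their estimated probabilities. An illustrative sketch only (this is not the package's internal code, and the probability values are made up):

``` r
# Hypothetical fill-mask probabilities for the proposition "[MASK] is a doctor."
p_he  = 0.052   # P([MASK] = "He")
p_she = 0.013   # P([MASK] = "She")

# A log probability ratio (LPR) quantifies the association expressed by the proposition
LPR = log(p_he) - log(p_she)   # positive values indicate the model favors "He"
```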
-⚠️ *Please update this package to version ≥ 2024.4 for faster and more robust functionality.* +⚠️ *Please update this package to version ≥ 2024.5 for faster and more robust functionality.* ![](https://psychbruce.github.io/img/FMAT-Workflow.png) @@ -48,20 +48,14 @@ devtools::install_github("psychbruce/FMAT", force=TRUE) ### (2) Python Environment and Packages -#### Step 1 - Install [Anaconda](https://www.anaconda.com/download) (a recommended package manager which automatically installs Python, Python IDEs like Spyder, and a large list of necessary [Python package dependencies](https://docs.anaconda.com/free/anaconda/pkg-docs/)). -#### Step 2 - Specify the Python interpreter in RStudio. > RStudio → Tools → Global/Project Options\ > → Python → Select → **Conda Environments**\ > → Choose **".../Anaconda3/python.exe"** -#### Step 3 - Install the "[transformers](https://huggingface.co/docs/transformers/installation)" and "[torch](https://pytorch.org/get-started/locally/)" Python packages.\ (Windows Command / Anaconda Prompt / RStudio Terminal) @@ -71,9 +65,7 @@ pip install transformers torch See [Guidance for GPU Acceleration] for installation guidance if you have an NVIDIA GPU device on your PC and want to use GPU to accelerate the pipeline. -#### Alternative Approach - -(Not suggested) Besides the pip/conda installation in the *Conda Environment*, you might instead create and use a *Virtual Environment* (see R code below with the `reticulate` package), but then you need to specify the Python interpreter as **"\~/.virtualenvs/r-reticulate/Scripts/python.exe"** in RStudio. +Alternative approach (NOT suggested): Besides the pip/conda installation in the *Conda Environment*, you might instead create and use a *Virtual Environment* (see R code below with the `reticulate` package), but then you need to specify the Python interpreter as **"\~/.virtualenvs/r-reticulate/Scripts/python.exe"** in RStudio. ``` r ## DON'T RUN THIS UNLESS YOU PREFER VIRTUAL ENVIRONMENT library(reticulate) virtualenv_create() virtualenv_install(packages=c("transformers", "torch")) ## Guidance for FMAT -### FMAT Step 1: Query Design +### Step 1: Download BERT Models -Design queries that conceptually represent the constructs you would measure (see [Bao, 2024, *JPSP*](https://psychbruce.github.io/FMAT/#citation) for how to design queries). +Use `BERT_download()` to download [BERT models]. Model files are permanently saved to your local folder "%USERPROFILE%/.cache/huggingface". A full list of BERT-family models is available at [Hugging Face](https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers). -Use `FMAT_query()` and/or `FMAT_query_bind()` to prepare a `data.table` of queries. - -### FMAT Step 2: Model Loading +### Step 2: Design FMAT Queries -Use `BERT_download()` and `FMAT_load()` to (down)load [BERT models]. Model files are permanently saved to your local folder "%USERPROFILE%/.cache/huggingface". A full list of BERT-family models are available at [Hugging Face](https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers). +Design queries that conceptually represent the constructs you would measure (see [Bao, 2024, *JPSP*](https://psychbruce.github.io/FMAT/#citation) for how to design queries). -If you want to use GPU (see [Guidance for GPU Acceleration]), please skip to [FMAT Step 3: Model Processing] and directly use `FMAT_run()` without `FMAT_load()`. +Use `FMAT_query()` and/or `FMAT_query_bind()` to prepare a `data.table` of queries.
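For example, a minimal Step 2 sketch (the query wording and MASK labels are purely illustrative; check `?FMAT_query` for the exact argument format and for adding target/attribute words):

``` r
library(FMAT)

# One propositional query with two contrasting [MASK] options
query = FMAT_query("[MASK] is a doctor.",
                   MASK = .(Male = "He", Female = "She"))
```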
-### FMAT Step 3: Model Processing +### Step 3: Run FMAT Use `FMAT_run()` to get raw data (probability estimates) for further analysis. -Several steps of pre-processing have been included in the function for easier use (see `FMAT_run()` for details). +Several steps of preprocessing have been included in the function for easier use (see `FMAT_run()` for details). - For BERT variants using `` rather than `[MASK]` as the mask token, the input query will be *automatically* modified so that users can always use `[MASK]` in query design. - For some BERT variants, special prefix characters such as `\u0120` and `\u2581` will be *automatically* added to match the whole words (rather than subwords) for `[MASK]`. @@ -167,7 +157,7 @@ If you are new to [BERT](https://arxiv.org/abs/1810.04805), these references can ``` r library(FMAT) -model.names = c( +models = c( "bert-base-uncased", "bert-base-cased", "bert-large-uncased", @@ -181,15 +171,18 @@ model.names = c( "vinai/bertweet-base", "vinai/bertweet-large" ) -BERT_download(model.names) +BERT_download(models) ``` -``` +``` {style="height: 500px"} ℹ Device Info: -Python Environment: -Package Version -transformers 4.38.2 +R Packages: +FMAT 2024.5 +reticulate 1.36.1 + +Python Packages: +transformers 4.40.2 torch 2.2.1+cu121 NVIDIA GPU CUDA Support: @@ -198,148 +191,143 @@ CUDA Version: 12.1 GPU (Device): NVIDIA GeForce RTX 2050 -── Downloading model "bert-base-uncased" ─────────────────────────────────────────── +── Downloading model "bert-base-uncased" ────────────────────────────────────────── → (1) Downloading configuration... -config.json: 100%|██████████| 570/570 [00:00<00:00, 113kB/s] +config.json: 100%|██████████| 570/570 [00:00<00:00, 114kB/s] → (2) Downloading tokenizer... -tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00 + 1: bert-base-uncased 420MB 30522 768 [MASK] + 2: bert-base-cased 416MB 28996 768 [MASK] + 3: bert-large-uncased 1283MB 30522 1024 [MASK] + 4: bert-large-cased 1277MB 28996 1024 [MASK] + 5: distilbert-base-uncased 256MB 30522 768 [MASK] + 6: distilbert-base-cased 251MB 28996 768 [MASK] + 7: albert-base-v1 45MB 30000 128 [MASK] + 8: albert-base-v2 45MB 30000 128 [MASK] + 9: roberta-base 476MB 50265 768 +10: distilroberta-base 316MB 50265 768 +11: vinai/bertweet-base 517MB 64001 768 +12: vinai/bertweet-large 1356MB 50265 1024 +``` + +(Tested 2024-05-16 on the developer's computer: HP Probook 450 G10 Notebook PC) ## Related Packages diff --git a/man/BERT_download.Rd b/man/BERT_download.Rd index e95372b..39d06e9 100644 --- a/man/BERT_download.Rd +++ b/man/BERT_download.Rd @@ -22,10 +22,14 @@ models = c("bert-base-uncased", "bert-base-cased") BERT_download(models) BERT_download() # check downloaded models + +BERT_info() # information of all downloaded models } } \seealso{ +\code{\link{BERT_info}} + \code{\link{BERT_vocab}} \code{\link{FMAT_load}} diff --git a/man/BERT_info.Rd b/man/BERT_info.Rd new file mode 100644 index 0000000..46fbc76 --- /dev/null +++ b/man/BERT_info.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FMAT.R +\name{BERT_info} +\alias{BERT_info} +\title{Get basic information of BERT models.} +\usage{ +BERT_info(models = NULL) +} +\arguments{ +\item{models}{Model names at +\href{https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers}{HuggingFace}.} +} +\value{ +A data.table of model name, model file size, +vocabulary size (of word/token embeddings), +embedding dimensions (of word/token embeddings), +and [MASK] token. 
+} +\description{ +Get basic information of BERT models. +} +\examples{ +\dontrun{ +models = c("bert-base-uncased", "bert-base-cased") +BERT_info(models) + +BERT_info() # information of all downloaded models +} + +} +\seealso{ +\code{\link{BERT_download}} + +\code{\link{BERT_vocab}} +} diff --git a/man/BERT_vocab.Rd b/man/BERT_vocab.Rd index 830702b..e06e1f4 100644 --- a/man/BERT_vocab.Rd +++ b/man/BERT_vocab.Rd @@ -4,16 +4,28 @@ \alias{BERT_vocab} \title{Check if mask words are in the model vocabulary.} \usage{ -BERT_vocab(models, mask.words) +BERT_vocab( + models, + mask.words, + add.tokens = FALSE, + add.method = c("sum", "mean") +) } \arguments{ \item{models}{Model names at \href{https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers}{HuggingFace}.} \item{mask.words}{Option words filling in the mask.} + +\item{add.tokens}{Add new tokens (for out-of-vocabulary words or even phrases) to model vocabulary? +Defaults to \code{FALSE}. It only temporarily adds tokens for tasks but does not change the raw model file.} + +\item{add.method}{Method used to produce the token embeddings of new added tokens. +Can be \code{"sum"} (default) or \code{"mean"} of subword token embeddings.} } \value{ -A data.table of model names, mask words, and real tokens (replaced if out of vocabulary). +A data.table of model name, mask word, real token (replaced if out of vocabulary), +and token id (0~N). } \description{ Check if mask words are in the model vocabulary. @@ -21,11 +33,23 @@ Check if mask words are in the model vocabulary. \examples{ \dontrun{ models = c("bert-base-uncased", "bert-base-cased") +BERT_info(models) + BERT_vocab(models, c("bruce", "Bruce")) -BERT_vocab(models, 1800:2024) + +BERT_vocab(models, 2020:2025) # some are out-of-vocabulary +BERT_vocab(models, 2020:2025, add.tokens=TRUE) # add vocab + +BERT_vocab(models, + c("individualism", "artificial intelligence"), + add.tokens=TRUE) } } \seealso{ \code{\link{BERT_download}} + +\code{\link{BERT_info}} + +\code{\link{FMAT_run}} } diff --git a/man/FMAT_load.Rd b/man/FMAT_load.Rd index 7d2e15c..b714001 100644 --- a/man/FMAT_load.Rd +++ b/man/FMAT_load.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/FMAT.R \name{FMAT_load} \alias{FMAT_load} -\title{(Down)Load BERT models (useless for GPU).} +\title{[Deprecated] Load BERT models (useless for GPU).} \usage{ FMAT_load(models) } @@ -17,10 +17,9 @@ You will need to \emph{rerun} this function if you \emph{restart} the R session. } \description{ Load BERT models from local cache folder "\%USERPROFILE\%/.cache/huggingface". -Models that have not been downloaded can also -be automatically downloaded (but \emph{silently}). For \href{https://psychbruce.github.io/FMAT/#guidance-for-gpu-acceleration}{GPU Acceleration}, -please directly use \code{\link{FMAT_run}} instead. +please directly use \code{\link{FMAT_run}}. +In general, \code{\link{FMAT_run}} is always preferred than \code{\link{FMAT_load}}. } \examples{ \dontrun{ diff --git a/man/FMAT_run.Rd b/man/FMAT_run.Rd index f543bac..a4645ba 100644 --- a/man/FMAT_run.Rd +++ b/man/FMAT_run.Rd @@ -8,6 +8,8 @@ FMAT_run( models, data, gpu, + add.tokens = FALSE, + add.method = c("sum", "mean"), file = NULL, progress = TRUE, warning = TRUE, @@ -49,6 +51,12 @@ which defines the device (e.g., on which the pipeline will be allocated. }} +\item{add.tokens}{Add new tokens (for out-of-vocabulary words or even phrases) to model vocabulary? +Defaults to \code{FALSE}. 
It only temporarily adds tokens for tasks but does not change the raw model file.} + +\item{add.method}{Method used to produce the token embeddings of new added tokens. +Can be \code{"sum"} (default) or \code{"mean"} of subword token embeddings.} + \item{file}{File name of \code{.RData} to save the returned data.} \item{progress}{Show a progress bar? Defaults to \code{TRUE}.} @@ -129,7 +137,9 @@ summary(data2) \seealso{ \code{\link{BERT_download}} -\code{\link{FMAT_load}} +\code{\link{BERT_vocab}} + +\code{\link{FMAT_load}} (deprecated) \code{\link{FMAT_query}} diff --git a/man/ICC_models.Rd b/man/ICC_models.Rd index 6ad1ea7..2f557c7 100644 --- a/man/ICC_models.Rd +++ b/man/ICC_models.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/FMAT.R \name{ICC_models} \alias{ICC_models} -\title{Intraclass correlation coefficient (ICC) of language models.} +\title{Intraclass correlation coefficient (ICC) of BERT models.} \usage{ ICC_models(data, type = "agreement", unit = "average") }
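To close the loop on how `ICC_models()` fits the workflow, a hedged end-to-end sketch (it assumes `query` was prepared with `FMAT_query()` and that `ICC_models()` accepts the raw `FMAT_run()` output; please verify against `?FMAT_run` and `?ICC_models` before relying on it):

``` r
library(FMAT)

models = c("bert-base-uncased", "bert-base-cased")

data = FMAT_run(models, query)   # raw probability estimates per model and query
summary(data)                    # log probability ratios (see ?summary.fmat)
ICC_models(data)                 # interrater agreement of log probabilities across models
```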