diff --git a/R/freq_table.R b/R/freq_table.R index 5bf93150..75db07b9 100644 --- a/R/freq_table.R +++ b/R/freq_table.R @@ -12,6 +12,7 @@ if (getRversion() >= '2.15.1') #' @param byvar character: name of variable to tabulate. Use Standard evaluation. #' @param digits numeric: number of decimal places to display. Default is 1. #' @param na.rm logical: report NA values in frequencies. Default is FALSE. +#' @param freq_var_name character: name for frequency variable. Default is "n" #' #' @return data.table with frequencies. #' @export @@ -26,33 +27,45 @@ if (getRversion() >= '2.15.1') freq_table <- function(x, byvar, digits = 1, - na.rm = FALSE) { + na.rm = FALSE, + freq_var_name = "n") { x_name <- as.character(substitute(x)) if (!is.data.frame(x)) { cli::cli_abort("Argument {.arg x} ({.field {x_name}}) must be a data frame") } + if (isFALSE(is.data.table(x))) { + x <- qDT(x) + } + - fq <- qtab(x[[byvar]], na.exclude = na.rm) - ft <- data.frame(joyn = names(fq), - n = as.numeric(fq)) + fq <- qtab(x[, ..byvar], na.exclude = na.rm, dnn = byvar) + + ft <- fq |> + as.data.table() |> + setnames("N", "n") |> + # filter zeros + fsubset(n > 0) N <- fsum(ft$n) ft <- ft |> ftransform(percent = paste0(round(n / N * 100, digits), "%")) # add row with totals - ft <- rowbind(ft, data.table(joyn = "total", - n = N, - percent = "100%")) |> - # filter zeros - fsubset(n > 0) + total_row <- rep("total", length(byvar)) |> + as.list() |> + as.data.table() |> + setnames(new = byvar) |> + ftransform(n = N, + percent = "100%") - setrename(ft, joyn = byvar, .nse = FALSE) + ft <- rowbind(ft, total_row) + setrename(ft, + n = freq_var_name, + .nse = FALSE) } - #' Report frequencies from attributes in report var #' #' @param x dataframe from [joyn_workhorse] diff --git a/R/is_id.R b/R/is_id.R index a1f77086..c8e3b80a 100644 --- a/R/is_id.R +++ b/R/is_id.R @@ -35,40 +35,42 @@ if (getRversion() >= '2.15.1') #' is_id(y1, by = "id") is_id <- function(dt, by, - verbose = getOption("joyn.verbose"), + verbose = getOption("joyn.verbose", default = FALSE), return_report = FALSE) { - # make sure it is data.table - if (!(is.data.table(dt))) { + # Ensure dt is a data.table + if (!is.data.table(dt)) { dt <- as.data.table(dt) - } else { - dt <- data.table::copy(dt) } - # count - m <- dt[, .(copies =.N), by = mget(by)] - is_id <- m[, mean(copies)] == 1 + # Check for duplicates + is_id <- !(anyDuplicated(dt, by = by) > 0) if (verbose) { - - cli::cli_h3("Duplicates in terms of {.code {by}}") - - d <- freq_table(m, "copies") - print(d[]) - - cli::cli_rule(right = "End of {.field is_id()} report") - + if (is_id) { + cli::cli_alert_success("No duplicates found by {.code {by}}") + } else { + cli::cli_alert_warning("Duplicates found by: {.code {by}}") + } } - if (isFALSE(return_report)) { - - return(is_id) + if (return_report) { + # Return the duplicated rows if requested + if (verbose) cli::cli_h3("Duplicates in terms of {.code {by}}") - } else { + d <- freq_table(x = dt, + byvar = by, + freq_var_name = "copies") - return(m) + if (verbose) { + d |> + fsubset(copies > 1) |> + print() + } + if (verbose) cli::cli_rule(right = "End of {.field is_id()} report") + return(invisible(d)) + } else { + return(is_id) } - } - diff --git a/R/possible_ids.R b/R/possible_ids.R index 86cea94a..faec30b3 100644 --- a/R/possible_ids.R +++ b/R/possible_ids.R @@ -1,16 +1,45 @@ #' Find possible unique identifies of data frame #' -#' Identify possible variables uniquely identifying x +#' Identify possible combinations of variables that uniquely identifying dt #' #' @param dt data frame -#' @param exclude character: Exclude variables to be selected as identifiers. It -#' could be either the name of the variables of one type of the variable -#' prefixed by "_". For instance, "_numeric" or "_character". +#' @param vars character: A vector of variable names to consider for identifying unique combinations. +#' @param exclude character: Names of variables to exclude from analysis #' @param include character: Name of variable to be included, that might belong #' to the group excluded in the `exclude` +#' @param exclude_classes character: classes to exclude from analysis (e.g., +#' "numeric", "integer", "date") +#' @param include_classes character: classes to include in the analysis (e.g., +#' "numeric", "integer", "date") +#' @param min_combination_size numeric: Min number of combinations. Default is +#' 1, so all combinations. +#' @param max_combination_size numeric. Max number of combinations. Default is +#' 5. If there is a combinations of identifiers larger than +#' `max_combination_size`, they won't be found +#' @param max_processing_time numeric: Max time to process in seconds. After +#' that, it returns what it found. +#' @param max_numb_possible_ids numeric: Max number of possible IDs to find. See +#' details. +#' @param get_all logical: get all possible combinations based on the parameters +#' above. #' @param verbose logical: If FALSE no message will be displayed. Default is #' TRUE #' +#' @section Number of possible IDs: +#' +#' The number of possible IDs in a dataframe could be very large. This is why, +#' `possible_ids()` makes use of heuristics to return something useful without +#' wasting the time of the user. In addition, we provide multiple parameter so +#' that the user can fine tune their search for possible IDs easily and +#' quickly. +#' +#' Say for instance that you have a dataframe with 10 variables. Testing every +#' possible pair of variables will give you 90 possible unique identifiers for +#' this dataframe. If you want to test all the possible IDs, you will have to +#' test more 5000 combinations. If the dataframe has many rows, it may take a +#' while. +#' +#' #' @return list with possible identifiers #' @export #' @@ -22,177 +51,397 @@ #' x = c(16, 12, NA, NA, 15)) #' possible_ids(x4) possible_ids <- function(dt, - exclude = NULL, - include = NULL, - verbose = getOption("possible_ids.verbose")) { - - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Check inputs --------- - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - + vars = NULL, + exclude = NULL, + include = NULL, + exclude_classes = NULL, + include_classes = NULL, + verbose = getOption("possible_ids.verbose", + default = FALSE), + min_combination_size = 1, + max_combination_size = 5, + max_processing_time = 60, # in seconds + max_numb_possible_ids = 100, + get_all = FALSE) { + + # defenses --------- + # Ensure dt is a data.table if (!is.data.frame(dt)) { stop("data must be a data frame") } + if (!is.data.table(dt)) { + dt <- as.data.table(dt) + } + + # Get variable + # Vars -------- + + if (is.null(vars)) { + vars <- names(dt) |> + copy() + } else { + + # check if all vars are in dt + missing_vars <- setdiff(vars, names(dt)) + + if (length(missing_vars) > 0) { + cli::cli_abort("The following variables are not in the data table: {.strongVar {missing_vars}}") + } - if (is.data.table(dt)) { - dt <- as.data.frame(dt) + # check at least 2 vars are provided + + if (length(vars) < 2) { + cli::cli_abort("Can't make combinations with a single var: {.strongVar {vars}}") + } + + # exclude should not be used + if (!(is.null(exclude) & is.null(exclude_classes))) { + exclude <- NULL + exclude_classes <- NULL + cli::cli_alert_danger("Args {.strongArg `exclude`} and {.strongArg `exclude_classes`} not available when using {.strongArg `vars`}") + } + + } + + # Exclude and include ------- + + ## classes ---------- + vars <- filter_by_class(dt = dt, + vars = vars, + include_classes = include_classes, + exclude_classes = exclude_classes) + + ## var names -------- + vars <- filter_by_name(vars, include, exclude, verbose) + + ## no duplicated vars ------------- + if (anyDuplicated(vars)) { + dupvars <- vars[duplicated(vars)] |> + unique() + cli::cli_abort("vars {.strongVar {dupvars}} are duplicated.") } + if (verbose) { + cli::cli_alert_info("Variables to test: {.strongVar {vars}}") + } - if (is.null(exclude) && !is.null(include)) { + if (length(vars) == 0) { if (verbose) { - cli::cli_alert_warning("Since {.code exclude} is NULL, {.code include} - directive does not make sense. Ignored.", - wrap = TRUE) + cli::cli_alert_danger("No variables available after applying + include/exclude filters.") } - warning("inconsistent use of `include`") + return(NULL) # should this be an error? } - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ## exclude variables from check ------ + # Unique values --------- - vars <- names(dt) + # Sort variables by number of unique values (ascending order) + unique_counts <- vapply(dt[, ..vars], fnunique, numeric(1)) + vars <- vars[order(unique_counts)] + unique_counts <- unique_counts[order(unique_counts)] + n_rows <- fnrow(dt) + init_index <- 0 - ### Exclude variable according to their type --------- - if (!is.null(include)) { - # Find position of variable to include - ii <- which(names(dt) %in% include) + # Initialize list to store possible IDs + possible_ids_list <- vector("list", max_numb_possible_ids) - } else { + checked_ids <- vars |> + copy() - ii <- NULL + if (min_combination_size == 1) { + unique_ids <- vars[unique_counts == n_rows] + # Add individual unique variables + init_index <- length(unique_ids) + if (init_index > 0) { + possible_ids_list[1:init_index] <- as.list(unique_ids) + if (verbose) { + cli::cli_alert_info("Found unique identifiers: {.code {unique_ids}}") + } + if (!get_all) { + ret_list <- store_checked_ids(checked_ids, + possible_ids_list) + return(ret_list) + } + # Remove unique identifiers from vars to reduce combinations + vars <- setdiff(vars, unique_ids) + if (length(vars) == 0) { + # All variables are unique identifiers + ret_list <- store_checked_ids(checked_ids, + possible_ids_list) + return(ret_list) + } + unique_counts <- unique_counts[vars] + } } - ### Exclude variable by name --------- - if (!is.null(exclude)) { + # combinations ----------- - if (any(grepl("^_", exclude))) { + # Start testing combinations + start_time <- Sys.time() + min_size <- max(min_combination_size, 2) + max_size <- min(length(vars), max_combination_size) + elapsed_time <- 0 - type_ex <- exclude[grepl("^_", exclude)] - vars_ex <- exclude[!grepl("^_", exclude)] + # where there is only one variable or not enough vars to combine + if (min_size > max_size) { + if (verbose) { + cli::cli_alert_warning( + "Can't make combinations of {.field {vars}} if the min number of + combinations is {min_size} and the max is {max_size}") + } - type_ex <- match.arg(type_ex, c("_character", "_numeric")) + cli::cli_abort("No unique identifier found.") + } - # find variable that meet criteria and exclude them, making sure to include - # the variables of the user. - ex <- gsub("^_", "", type_ex) - FUN <- paste0("is.", ex) + j <- init_index + 1 + for (comb_size in min_size:max_size) { - n_cols <- unlist(lapply(dt, FUN)) - n_cols[ii] <- FALSE + # make sure length of vars is >= comb_size + if (length(vars) < comb_size) { + next + # or break + } - # Exclude variables by name + combos <- utils::combn(vars, comb_size, simplify = FALSE) - if (length(vars_ex) > 0) { - ex <-which(names(dt) %in% vars_ex) - n_cols[ex] <- TRUE - } + # Prune combinations where the product of unique counts is less + # than n_rows + combos_to_keep <- vapply(combos, + \(combo) { + prod(unique_counts[combo]) >= n_rows + }, + logical(1)) - vars <- names(dt)[!n_cols] + combos <- combos[combos_to_keep] - } else { - vars <- vars[!(vars %in% exclude)] + # Estimate processing time and prune combinations + est_times <- vapply(combos, + \(combo) { + estimate_combination_time(n_rows, + unique_counts[combo]) + }, + numeric(1)) - if (identical(vars, names(dt))) { - if (verbose) { - cli::cli_alert_warning("Variable {.field {exclude}} is not available in data frame. - Nothing is excluded.", wrap = TRUE) + if (verbose) { + cli::cli_progress_bar( + format = "combs of {cli::pb_extra$comb_size} vars: {cli::pb_bar} {cli::pb_percent} | ETA: {cli::pb_eta} | {cli::pb_current}/{cli::pb_total}", + extra = list(comb_size = comb_size), + total = length(combos)) + } + + for (combo in combos) { + # Check if the combination uniquely identifies the data + if (is_id(dt, by = combo, verbose = FALSE)) { + # This is inefficient... it is copying every time... + # I need to think better on how to do it. + possible_ids_list[[j]] <- combo + + j <- j + 1 + + if (j > max_numb_possible_ids) { + if (verbose) { + cli::cli_alert_warning( + "Max number of possible IDs ({max_numb_possible_ids}) reached. + You may modify it in argument {.arg max_numb_possible_ids}") + } + ret_list <- store_checked_ids(checked_ids = checked_ids, + possible_ids = possible_ids_list) + return(ret_list) } + if (!get_all) { - warning("inconsistenty use of `exclude`") + ret_list <- store_checked_ids(checked_ids = checked_ids, + possible_ids = possible_ids_list) + return(ret_list) + } + # Remove variables in the current combo from vars to + # avoid redundant checks + vars <- setdiff(vars, combo) + unique_counts <- unique_counts[vars] + + # Break since we found a minimal unique key of size i + if (!get_all) break } + # Check processing time + elapsed_time <- as.numeric(difftime(Sys.time(), + start_time, + units = "secs")) + if (elapsed_time > max_processing_time) { + if (verbose) { + mxt_msg <- "Maximum processing time exceeded. + modify {.arg max_processing_time} argument to increse time. + Stopping search." + cli::cli_alert_warning(mxt_msg) + } + break + } + if (verbose) cli::cli_progress_update() + } + # Break if all variables are used + if (length(vars) == 0 || elapsed_time > max_processing_time) { + break } } - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ## check all names are unieuq -------- - dup_var <- duplicated(vars) + if (length(remove_null(possible_ids_list)) == 0) { + if (verbose) { + cli::cli_alert_warning("No unique identifier found.") + } + } - if (any(dup_var)) { + # ----------------------------- # + # Return #### + # ----------------------------- # - dvars <- vars[dup_var] + ret_list <- store_checked_ids(checked_ids = checked_ids, + possible_ids = possible_ids_list) - msg <- "column names must be unique" - hint <- "try changing the names using {.fun make.names}" - problem <- "{.var {dvars}} {?is/are} duplicated" - cli::cli_abort(c( - msg, - i = hint, - x = problem - )) + return(ret_list) +} - } +filter_by_class <- function(dt, vars, include_classes, exclude_classes) { + # Compute the primary class of each variable + vars_class <- vapply(dt, function(x) class(x)[1], character(1)) + names(vars_class) <- vars # Ensure names are preserved - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Find duplicates --------- - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Apply 'include_classes' filter + if (!is.null(include_classes)) { + vars <- vars[vars_class[vars] %in% include_classes] + } - duplicates <- is_id(dt, by = vars, verbose = FALSE) - if (duplicates) { - if (verbose) { - cli::cli_alert_success("There are no duplicates in data frame") - } - } else { - if (verbose) { - cli::cli_alert_warning("Data has duplicates. returning NULL") + # Apply 'exclude_classes' filter + if (!is.null(exclude_classes)) { + vars <- vars[!(vars_class[vars] %in% exclude_classes)] + } + vars +} + +filter_by_name <- function(vars, include, exclude, verbose) { + # Apply 'exclude' filter + if (!is.null(exclude)) { + wno_exc <- which(!exclude %in% vars) # which not excluded + if (length(wno_exc) > 0 & verbose) { + no_exc <- exclude[wno_exc] + cli::cli_alert_warning("var{?s} {.var {no_exc}} not found in dataframe") } - is_id(dt, by = vars, verbose = TRUE) - return(NULL) + vars <- setdiff(vars, exclude) } + # Apply 'include' filter - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Find ids --------- - #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - nvars <- length(vars) + c(vars, + setdiff(include, vars)) - found <- FALSE - i = 0 - while(i < nvars && found == FALSE) { - i = i + 1 - cm <- utils::combn(vars, m = i) +} - lcm <- dim(cm)[2] # number of combinations of size j - selected_vars <- vector(length = lcm) - for (j in 1:lcm) { - tvars <- cm[, j] # testing vars - selected_vars[j] <- is_id(dt, by = tvars, verbose = FALSE) - } - sv <- which(selected_vars) +# Function to estimate processing time based on unique counts +estimate_combination_time <- function(n_rows, unique_counts) { + # Simple estimation function + # Time is proportional to (product of unique counts) / n_rows + # Adjust the constant factor based on empirical observations + est_time <- (prod(unique_counts) / n_rows) * 0.0001 + return(est_time) +} - if (length(sv) > 0) { - if (length(sv) == 1 && i > 1) { +remove_null <- \(x) { + y <- vapply(x, \(.) !is.null(.), logical(1)) + x[y] +} - lv <- list(V1 = cm[, sv]) - } else if (i == 1) { +#' store checked variables as possible ids +#' +#' This function processes a list of possible IDs by removing any `NULL` entries, +#' storing a set of checked variables as an attribute and in the specified environment, +#' and then returning the updated list of possible IDs. +#' +#' @param checked_ids A vector of variable names that have been checked as possible IDs. +#' @param possible_ids A list containing potential identifiers. This list may contain `NULL` values, which will be removed by the function. +#' @param env An environment where the `checked_ids` will be stored. The default is `.joynenv`. +#' +#' @return A list of possible IDs with `NULL` values removed, and the `checked_ids` stored as an attribute. +#' +#' +#' @keywords internal +store_checked_ids <- function(checked_ids, + possible_ids, + env = .joynenv) { - ee <- as.data.frame(t(cm[, sv])) - lv <- lapply(ee, unique) + # Remove null from possible ids + possible_ids <- remove_null(possible_ids) - } else { + # Store checked_ids in environment + rlang::env_poke(env = env, + nm = "checked_ids", + value = checked_ids) - ee <- as.data.frame(cm[, sv]) - lv <- lapply(ee, unique) + # Store attribute + attr(possible_ids, + "checked_ids") <- checked_ids - } + # Return + return(possible_ids) - found <- TRUE +} + +#' Create variables that uniquely identify rows in a data table +#' +#' This function generates unique identifier columns for a given number of rows, based on the specified number of identifier variables. +#' +#' @param n_rows An integer specifying the number of rows in the data table for which unique identifiers need to be generated. +#' @param n_ids An integer specifying the number of identifiers to be created. If `n_ids` is 1, a simple sequence of unique IDs is created. If greater than 1, a combination of IDs is generated. +#' @param prefix A character string specifying the prefix for the identifier variable names (default is `"id"`). +#' +#' @return A named list where each element is a vector representing a unique identifier column. The number of elements in the list corresponds to the number of identifier variables (`n_ids`). The length of each element is equal to `n_rows`. +#' +#' +#' @keywords internal +create_ids <- function(n_rows, n_ids, prefix = "id") { + + vars <- vector("list", + n_ids) + + # If n_ids is 1, simply generate a sequence of IDs + if (n_ids == 1) { + vars[[1]] <- seq_len(n_rows) + names(vars)[1] <- paste0(prefix, 1) + + return(vars) + } else { + # Get max unique values each variable can have + max_vals <- ceiling(n_rows^(1 / n_ids)) + + # Generate a sequence of unique identifiers + + all_ids <- expand.grid(rep(list(seq_len(max_vals)), + n_ids)) + + #collapse::fnrow faster? + + if (fnrow(all_ids) > n_rows) { + # Randomly sample the unique combinations + all_ids <- all_ids[sample(fnrow(all_ids), + n_rows), ] } - } - if (verbose) { - cli::cli_alert("we found {length(lv)} possible id{?s}") - } + # Store each unique identifier in the vars list + for (i in seq_len(n_ids)) { + vars[[i]] <- all_ids[[i]] + } - return(lv) + names(vars) <- paste0(prefix, + seq_len(n_ids)) + + return(vars) + } } + diff --git a/R/zzz.R b/R/zzz.R index 49d3b773..f222b498 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -141,6 +141,13 @@ set_joyn_options <- function(..., } +# ------------------------------------ +# -- Define global variables -- +# ------------------------------------ + +utils::globalVariables(c("..byvar", + "..vars")) + # ------------------------------------------------------------------------------------------ # Define custom .strong {cli} classes to emphasize messages subcomponents # --- to be used when creating/storing {joyn} messages diff --git a/man/create_ids.Rd b/man/create_ids.Rd new file mode 100644 index 00000000..2197685a --- /dev/null +++ b/man/create_ids.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/possible_ids.R +\name{create_ids} +\alias{create_ids} +\title{Create variables that uniquely identify rows in a data table} +\usage{ +create_ids(n_rows, n_ids, prefix = "id") +} +\arguments{ +\item{n_rows}{An integer specifying the number of rows in the data table for which unique identifiers need to be generated.} + +\item{n_ids}{An integer specifying the number of identifiers to be created. If \code{n_ids} is 1, a simple sequence of unique IDs is created. If greater than 1, a combination of IDs is generated.} + +\item{prefix}{A character string specifying the prefix for the identifier variable names (default is \code{"id"}).} +} +\value{ +A named list where each element is a vector representing a unique identifier column. The number of elements in the list corresponds to the number of identifier variables (\code{n_ids}). The length of each element is equal to \code{n_rows}. +} +\description{ +This function generates unique identifier columns for a given number of rows, based on the specified number of identifier variables. +} +\keyword{internal} diff --git a/man/freq_table.Rd b/man/freq_table.Rd index 6be47f0c..9ed5cc34 100644 --- a/man/freq_table.Rd +++ b/man/freq_table.Rd @@ -4,7 +4,7 @@ \alias{freq_table} \title{Tabulate simple frequencies} \usage{ -freq_table(x, byvar, digits = 1, na.rm = FALSE) +freq_table(x, byvar, digits = 1, na.rm = FALSE, freq_var_name = "n") } \arguments{ \item{x}{data frame} @@ -14,6 +14,8 @@ freq_table(x, byvar, digits = 1, na.rm = FALSE) \item{digits}{numeric: number of decimal places to display. Default is 1.} \item{na.rm}{logical: report NA values in frequencies. Default is FALSE.} + +\item{freq_var_name}{character: name for frequency variable. Default is "n"} } \value{ data.table with frequencies. diff --git a/man/is_id.Rd b/man/is_id.Rd index fce181e9..2e5e2051 100644 --- a/man/is_id.Rd +++ b/man/is_id.Rd @@ -4,7 +4,12 @@ \alias{is_id} \title{Check if dt is uniquely identified by \code{by} variable} \usage{ -is_id(dt, by, verbose = getOption("joyn.verbose"), return_report = FALSE) +is_id( + dt, + by, + verbose = getOption("joyn.verbose", default = FALSE), + return_report = FALSE +) } \arguments{ \item{dt}{either right of left table} diff --git a/man/possible_ids.Rd b/man/possible_ids.Rd index 3d284b31..484e261c 100644 --- a/man/possible_ids.Rd +++ b/man/possible_ids.Rd @@ -6,30 +6,76 @@ \usage{ possible_ids( dt, + vars = NULL, exclude = NULL, include = NULL, - verbose = getOption("possible_ids.verbose") + exclude_classes = NULL, + include_classes = NULL, + verbose = getOption("possible_ids.verbose", default = FALSE), + min_combination_size = 1, + max_combination_size = 5, + max_processing_time = 60, + max_numb_possible_ids = 100, + get_all = FALSE ) } \arguments{ \item{dt}{data frame} -\item{exclude}{character: Exclude variables to be selected as identifiers. It -could be either the name of the variables of one type of the variable -prefixed by "_". For instance, "_numeric" or "_character".} +\item{vars}{character: A vector of variable names to consider for identifying unique combinations.} + +\item{exclude}{character: Names of variables to exclude from analysis} \item{include}{character: Name of variable to be included, that might belong to the group excluded in the \code{exclude}} +\item{exclude_classes}{character: classes to exclude from analysis (e.g., +"numeric", "integer", "date")} + +\item{include_classes}{character: classes to include in the analysis (e.g., +"numeric", "integer", "date")} + \item{verbose}{logical: If FALSE no message will be displayed. Default is TRUE} + +\item{min_combination_size}{numeric: Min number of combinations. Default is +1, so all combinations.} + +\item{max_combination_size}{numeric. Max number of combinations. Default is +5. If there is a combinations of identifiers larger than +\code{max_combination_size}, they won't be found} + +\item{max_processing_time}{numeric: Max time to process in seconds. After +that, it returns what it found.} + +\item{max_numb_possible_ids}{numeric: Max number of possible IDs to find. See +details.} + +\item{get_all}{logical: get all possible combinations based on the parameters +above.} } \value{ list with possible identifiers } \description{ -Identify possible variables uniquely identifying x +Identify possible combinations of variables that uniquely identifying dt +} +\section{Number of possible IDs}{ + + +The number of possible IDs in a dataframe could be very large. This is why, +\code{possible_ids()} makes use of heuristics to return something useful without +wasting the time of the user. In addition, we provide multiple parameter so +that the user can fine tune their search for possible IDs easily and +quickly. + +Say for instance that you have a dataframe with 10 variables. Testing every +possible pair of variables will give you 90 possible unique identifiers for +this dataframe. If you want to test all the possible IDs, you will have to +test more 5000 combinations. If the dataframe has many rows, it may take a +while. } + \examples{ library(data.table) x4 = data.table(id1 = c(1, 1, 2, 3, 3), diff --git a/man/store_checked_ids.Rd b/man/store_checked_ids.Rd new file mode 100644 index 00000000..b0471724 --- /dev/null +++ b/man/store_checked_ids.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/possible_ids.R +\name{store_checked_ids} +\alias{store_checked_ids} +\title{store checked variables as possible ids} +\usage{ +store_checked_ids(checked_ids, possible_ids, env = .joynenv) +} +\arguments{ +\item{checked_ids}{A vector of variable names that have been checked as possible IDs.} + +\item{possible_ids}{A list containing potential identifiers. This list may contain \code{NULL} values, which will be removed by the function.} + +\item{env}{An environment where the \code{checked_ids} will be stored. The default is \code{.joynenv}.} +} +\value{ +A list of possible IDs with \code{NULL} values removed, and the \code{checked_ids} stored as an attribute. +} +\description{ +This function processes a list of possible IDs by removing any \code{NULL} entries, +storing a set of checked variables as an attribute and in the specified environment, +and then returning the updated list of possible IDs. +} +\keyword{internal} diff --git a/tests/testthat/test-freq_table.R b/tests/testthat/test-freq_table.R index f579699b..d5467b71 100644 --- a/tests/testthat/test-freq_table.R +++ b/tests/testthat/test-freq_table.R @@ -67,9 +67,7 @@ test_that("correct totals", { tr <- nrow(y4) - j <- freq_table(y4, "id2") - j <- freq_table(y4, "id2") - j |> + freq_table(y4, "id2") |> fsubset(id2 == "total") |> fselect(n) |> reg_elem() |> diff --git a/tests/testthat/test-is_id.R b/tests/testthat/test-is_id.R index 3205ccdd..332c5a2a 100644 --- a/tests/testthat/test-is_id.R +++ b/tests/testthat/test-is_id.R @@ -75,10 +75,13 @@ test_that("returns correct report table", { id = c("c", "b", "c", "a"), y = c(11L, 15L, 18L, 20L) ) - j <- is_id(y, by = "id", return_report = TRUE) + j <- is_id(y, by = "id", return_report = TRUE) |> + roworder(by = "id") - r <- data.table(id = c("c", "b", "a"), - copies = c(2L, 1L, 1L)) + r <- data.table(id = c("c", "b", "a", "total"), + copies = c(2L, 1L, 1L, 4L), + percent = c("50%", "25%", "25%", "100%")) |> + roworder(by = "id") expect_equal(j, r) diff --git a/tests/testthat/test-possible_ids.R b/tests/testthat/test-possible_ids.R index ccf68847..4d9041ec 100644 --- a/tests/testthat/test-possible_ids.R +++ b/tests/testthat/test-possible_ids.R @@ -1,3 +1,5 @@ + +# PREPARATION #### withr::local_options(joyn.verbose = FALSE) library(data.table) # options(possible_ids.verbose = FALSE) @@ -38,12 +40,234 @@ y4 = data.table(id = c(1, 2, 5, 6, 3), y = c(11L, 15L, 20L, 13L, 10L), x = c(16:20)) +# Auxiliary data: Big data table-------------------- + +# Set seed for reproducibility +set.seed(123) + +# Number of rows and variables +n_rows <- 1e4 # 10,000 rows +n_vars <- 50 # Total variables + +# Initialize an empty data.table +dt_large <- data.table(id = 1:n_rows) + +## Manually create three variables that uniquely identify the data #### +dt_large[, unique_id1 := rep(1:10, each = 1000)] # 1000 unique values repeated 100 times +dt_large[, unique_id2 := sample(letters, n_rows, replace = TRUE)] # Random character variable +dt_large[, unique_id3 := sample(1:1000, n_rows, replace = TRUE)] # Random integer + +# Function to generate random data +generate_random_data <- function(n, type) { + switch(type, + "numeric_int" = sample(1:1e6, n, replace = TRUE), + "numeric_double" = rnorm(n), + "character" = replicate(n, paste0(sample(letters, 5, replace = TRUE), collapse = "")), + "factor" = factor(sample(letters[1:10], n, replace = TRUE)), + "logical" = sample(c(TRUE, FALSE), n, replace = TRUE), + "date" = as.Date("2000-01-01") + sample(0:3650, n, replace = TRUE), + "datetime" = as.POSIXct("2000-01-01") + sample(0:(3650*24*60*60), n, replace = TRUE) + ) +} + +# Variable types and counts +var_types <- c("numeric_int", "numeric_double", "character", "factor", "logical", "date", "datetime") +vars_per_type <- c(10, 10, 10, 10, 5, 3, 2) # Total should sum to 50 + +# Generate variables and add to the data.table +var_count <- 0 +for (i in seq_along(var_types)) { + type <- var_types[i] + n_vars_type <- vars_per_type[i] + for (j in 1:n_vars_type) { + var_count <- var_count + 1 + var_name <- paste0(type, "_", j) + dt_large[, (var_name) := generate_random_data(n_rows, type)] + } +} + +## Introduce duplicates in some columns that are NOT the unique identifiers #### +# For example, we can duplicate the first 100 rows in the "numeric_int_1" and "character_1" columns +# dt_large <- rbind(dt_large, dt_large[1:100, .(numeric_int_1, character_1)]) + +# Shuffle the data to avoid ordered data +dt_large <- dt_large[sample(.N)] + + + +# dt_large[, id := .I] +dt <- copy(dt_large) + + +possible_ids( + dt = dt_large, + exclude_classes = c("numeric"), + verbose = TRUE +) + +possible_ids( + dt = dt_large, + exclude_classes = c("numeric"), + exclude = "id", + verbose = TRUE +) + +uniq_vars <- grep("unique_id", names(dt_large), value = TRUE) +pids <- possible_ids( + dt = dt_large, + #exclude_classes = c("logical", "date", "datetime", "numeric"), + exclude = "id", + #vars = uniq_vars, + verbose = TRUE, + min_combination_size = 3, + # max_combination_size = 3, + max_processing_time = 240, + get_all = TRUE +) + +possible_ids( + dt = dt_large, + verbose = TRUE +) + +## Remove the 'id' column to simulate data without a clear unique identifier #### +dt_large[, id := NULL] + +possible_ids_list <- possible_ids( + dt = dt_large, + exclude_classes = c("logical", "date", "datetime"), # Exclude some types for efficiency + verbose = TRUE +) +possible_ids_list + +possible_ids_list <- possible_ids( + dt = dt_large, + exclude_classes = c("logical", "date", "datetime", "numeric"), # Exclude some types for efficiency + max_processing_time = 120, + verbose = TRUE +) +possible_ids_list + + + +# TESTS #### +## Test create ids --------------------- #### + +test_that("create_ids works as intended", { + df_test <- data.table(a = 1:50) + + # with a single id + res <- as.data.frame(create_ids(n_rows = 50, + n_ids = 1)) + + nrow(unique(res)) |> + expect_equal(50) + + class(res) |> + expect_equal("data.frame") + + + # ret class + res <- create_ids(n_rows = 50, + n_ids = 1) + + class(res) |> + expect_equal("list") + + length(res) |> + expect_equal(1) + + # prefix + create_ids(n_rows = 40, n_ids = 4, prefix = "unique_id") |> + names() |> + expect_equal(paste0("unique_id", 1:4)) + + # with more than an id + vars <- c("var1", "var2", "var3") + + dt <- df_test |> copy() + + res <- dt[, (vars) := create_ids(.N, + n_ids = 3)] + + nrow(res[, .N, + + by = vars][N > 1]) |> + expect_equal(0) + + res <- df_test[, .(as.data.table( + create_ids(.N, n_ids = 3) + ))] + + nrow(res[, .N, + + by = c("id1", "id2", "id3")][N > 1]) |> + expect_equal(0) + + dt <- dt_large |> copy() + + new_ids <- as.data.table(create_ids(nrow(dt), + n_ids = 3, + prefix = "new_id")) + # Bind the new IDs to the existing data.table + dt <- cbind(dt, new_ids) + + +}) + + + +## Test possible_ids ------------------- #### test_that("convert to data.table", { xx1 <- as.data.frame(x1) expect_equal(possible_ids(x1), possible_ids(xx1)) }) + +test_that("store checked ids", { + + res <- possible_ids(x4, + get_all = TRUE) + + attributes(res)$checked_ids |> + expect_setequal(names(x4)) + + .joynenv$checked_ids |> + expect_setequal(names(x4)) + + # with get_all = FALSE + res <- possible_ids(x4) + + attributes(res)$checked_ids |> + expect_setequal(names(x4)) + + .joynenv$checked_ids |> + expect_setequal(names(x4)) + + # with large dt -get_all FALSE + res <- possible_ids(dt, + vars = paste0("numeric_int_", 1:4)) + + attributes(res)$checked_ids |> + expect_setequal(paste0("numeric_int_", 1:4)) + + .joynenv$checked_ids |> + expect_setequal(paste0("numeric_int_", 1:4)) + + # with large dt -get_all TRUE + res <- possible_ids(dt, + vars = paste0("numeric_int_", 1:4), + get_all = TRUE) + + attributes(res)$checked_ids |> + expect_setequal(paste0("numeric_int_", 1:4)) + + .joynenv$checked_ids |> + expect_setequal(paste0("numeric_int_", 1:4)) + +}) + test_that("error if not dataframe", { m1 <- as.matrix(x1) @@ -51,36 +275,319 @@ test_that("error if not dataframe", { }) -test_that("inconsistent user of `include`", { +test_that("vars provided by user", { + + # single var -raise error + possible_ids(x4, + vars = c("t")) |> + expect_error() + + # one or more vars not included in dt + possible_ids(x4, + vars = c("id3", "id2")) |> + expect_error() + + possible_ids(dt, + vars = c("id", "id3", "id2")) |> + expect_error() + + possible_ids(dt, + vars = c("id", "numeric_int_1", "character_1"), + verbose = TRUE) |> + expect_no_error() + + ids_dt <- possible_ids(dt) + + vars <- c("id", "numeric_double_1", "numeric_double_2") + + use_ids_dt <- possible_ids(dt, + vars = vars) + + all(sapply(use_ids_dt, + function(x) { x %in% ids_dt })) |> + expect_equal(TRUE) + + all(unlist(use_ids_dt) %in% vars) |> + expect_equal(TRUE) + + + # Check if the combination of unique_id1, unique_id2, and unique_id3 uniquely identifies rows + vars <- c("id", "unique_id2", "unique_id3") + + res <- dt[, .N, + by = vars][N > 1] |> + nrow() + + # --if it does, possible_ids should return those vars - expect_warning(possible_ids(x1, - include = "x")) + if (res == 0) { # if no duplicate rows + possible_ids(dt, + vars = vars, + min_combination_size = 3) |> + unlist() |> + expect_setequal(vars) + } }) +test_that("relationship include and vars", { + + possible_ids(x4, + vars = c("id1", "id2"), + include = c("t")) |> + expect_no_error() + + possible_ids(dt, + vars = c("id", "unique_id2"), + include = c("id", "factor_1", "factor_2")) |> + expect_no_error() + + possible_ids(x4, + vars = NULL, + include = c("t", "x")) |> + expect_no_error() + + possible_ids(x4, + vars = c("t", "x"), + include = NULL) |> + expect_no_error() + + # test checked vars are at least `vars` plus those in `include` + res <- possible_ids(x4, + vars = c("id1", "id2", "t"), + include = "t") + + checked_ids <- attributes(res)$checked_ids + + checked_ids |> + expect_setequal(c("id1", "id2", "t")) + + res <- possible_ids(dt, + vars = c("logical_1", "logical_2", "factor_1", "factor_2"), + include = "unique_id1") + + checked_ids <- attributes(res)$checked_ids + + checked_ids |> + expect_setequal(c("logical_1", "logical_2", "factor_1", "factor_2", "unique_id1")) + + +}) + +test_that("relationship exclude and vars", { + + possible_ids(x4, + vars = c("t", "x"), + exclude_classes = "character") |> + expect_message() + + possible_ids(x4, + vars = c("id1", "x"), + exclude = "x") |> + expect_message() + + possible_ids(dt, + vars = paste0("character_", 1:10), + exclude = c("character_1", "character_2")) |> + expect_message() + }) + +# test_that("inconsistent use of `include`", { +# +# # expect_warning(possible_ids(x1, +# # include = "x")) +# # +# # possible_ids(x1, +# # include = c("id", "x")) |> +# # expect_no_error() +# +# }) + test_that("exclude and include", { dd <- possible_ids(x3, - exclude = "_numeric", + exclude_classes = c("numeric", "integer"), include = "foo") - expect_equal(c("V1", "V2"), names(dd)) + + expect_equal(unlist(dd), + c("id", "foo")) + + ## Test combination between include class and exclude vars #### + + res <- possible_ids(dt, + get_all = TRUE, + include_classes = c("integer"), + exclude = paste0("numeric_int_", 1:5)) + + checked_vars <- attributes(res)$checked_ids + + any( + paste0("numeric_int_", 1:5) %in% checked_vars + ) |> + expect_equal(FALSE) + + all( + paste0("numeric_int_", 6:10) %in% checked_vars + ) |> + expect_equal(TRUE) + + ## Test combination between include vars and exclude class #### + res <- possible_ids(dt, + include = c("numeric_double_1", + "numeric_double_2"), + exclude_classes = "numeric") + + checked_vars <- attributes(res)$checked_ids + + all( + paste0("numeric_double_", 1:2) %in% checked_vars + ) |> + expect_equal(TRUE) + + any( + paste0("numeric_double_", 3:10) %in% checked_vars + ) |> + expect_equal(FALSE) + + res_ids <- possible_ids(x2, + include = "x", + exclude_classes = "numeric") + + res_ids |> + unlist() |> + expect_equal(c("t", "x")) + + res_ids <- possible_ids(x3, + include = "id", + exclude_classes = "character") + + res_ids |> + unlist() |> + expect_setequal(c("foo", "id", "v")) + + # alert if include and exclude same class #### + possible_ids(dt, + include_classes = "numeric", + exclude_classes = "numeric") |> + expect_message() + + # alert if include and exclude same vars #### + possible_ids(dt, + include = c("id", "unique_id1"), + exclude = c("id", "unique_id1")) |> + expect_message() + + res <- possible_ids(dt, + exclude_classes = c("integer"), + include = c("numeric_int_1")) + + attributes(res)$checked_ids |> + expect_setequal(setdiff(names(dt), c(paste0("numeric_int_", 2:10), + "id", + "unique_id1", "unique_id3"))) }) -test_that("get NULL when duplicates", { +test_that("get length 0 -error", { - expect_null(possible_ids(x1, - exclude = "_numeric", + expect_error(possible_ids(x1, + exclude_classes = c("numeric", "integer"), include = "t")) }) +test_that("get all works", { + + # no error + possible_ids(x4, + get_all = TRUE) |> + expect_no_error() + + # get all with user selected vars + possible_ids(x4, + vars = c("id1", "t"), + get_all = TRUE) |> + expect_no_error() + + # get all with max number of combinations + possible_ids(x4, + max_combination_size = 3, + get_all = TRUE) |> + expect_no_error() + + # check get all combs + possible_ids(x3, + get_all = TRUE) |> + unlist() |> + expect_setequal(c("id", "v", "foo")) + + + +}) + + +test_that("Max combination size", { + + res <- possible_ids(dt, + vars = c( "unique_id1", "unique_id2", "unique_id3", + "character_1", "character_2", "character_3", "character_4"), + max_combination_size = 5) + + + sapply(res, function(sublist) { + length(sublist) <= 3}) |> + all() |> + expect_true() + + res <- possible_ids(x1, + get_all = TRUE, + max_combination_size = 2) + + sapply(res, function(sublist) { + length(sublist) <= 2}) |> + all() |> + expect_true() + +}) + +test_that("Min combination size", { + + res <- possible_ids(x4, + min_combination_size = 1, + get_all = FALSE) |> + unlist() + + expect_true( + length(res) >= 1) + + res <- possible_ids(dt, + min_combination_size = 3, + get_all = FALSE) |> + unlist() + + expect_true(length(res) >= 3) + + + possible_ids(x4, + #min_combination_size = 1, + max_combination_size = 1) |> + expect_error() + + + possible_ids(x4, + min_combination_size = 3, + max_combination_size = 2) |> + expect_error() + +}) + test_that("Exclude nothing", { + p1 <- possible_ids(x1) + p2 <- possible_ids(x1, exclude = "rer") - expect_warning(possible_ids(x1, - exclude = "rer")) + expect_equal(p1, p2) }) @@ -91,9 +598,10 @@ test_that("Exclude type and variable", { xx4[, id2 := as.character(id2)] dd <- possible_ids(xx4, - exclude = c("_character", "x")) + exclude_classes = c("character"), + exclude = "x") - expect_equal(c("id1", "t"), dd$V1) + expect_equal(c("id1", "t"), unlist(dd)) }) @@ -104,7 +612,7 @@ test_that("Exclude more than one variable", { dd <- possible_ids(x4, exclude = c("id2", "x")) - expect_equal(c("id1", "t"), dd$V1) + expect_equal(c("id1", "t"), unlist(dd)) }) @@ -117,4 +625,139 @@ test_that("duplicated names", { }) +test_that("identifies ids", { + + vars <- c("var1", "var2", "var3") + dt[, (vars) := create_ids(.N, n_ids = 3)] + + possible_ids(dt, + vars = vars) |> + unlist() |> + expect_equal(vars) + + df_test <- as.data.frame( + create_ids(n_rows = 50, + n_ids = 3) + ) + + possible_ids(df_test, + vars = c("id1", "id2"), + include = "id3") |> + unlist() |> + expect_equal(c("id1", "id2", "id3")) + + possible_ids(df_test, + exclude_classes = "integer", + include = c("id1", "id2", "id3")) |> + unlist() |> + expect_equal(c("id1", "id2", "id3")) + +}) +# Auxiliary data: Big data table-------------------- + +# Set seed for reproducibility +set.seed(123) + +# Number of rows and variables +n_rows <- 1e4 # 10,000 rows +n_vars <- 50 # Total variables + +# Initialize an empty data.table +dt_large <- data.table(id = 1:n_rows) + +## Manually create three variables that uniquely identify the data #### +dt_large[, unique_id1 := rep(1:10, each = 1000)] # 1000 unique values repeated 100 times +dt_large[, unique_id2 := sample(letters, n_rows, replace = TRUE)] # Random character variable +dt_large[, unique_id3 := sample(1:1000, n_rows, replace = TRUE)] # Random integer + +# Function to generate random data +generate_random_data <- function(n, type) { + switch(type, + "numeric_int" = sample(1:1e6, n, replace = TRUE), + "numeric_double" = rnorm(n), + "character" = replicate(n, paste0(sample(letters, 5, replace = TRUE), collapse = "")), + "factor" = factor(sample(letters[1:10], n, replace = TRUE)), + "logical" = sample(c(TRUE, FALSE), n, replace = TRUE), + "date" = as.Date("2000-01-01") + sample(0:3650, n, replace = TRUE), + "datetime" = as.POSIXct("2000-01-01") + sample(0:(3650*24*60*60), n, replace = TRUE) + ) +} + +# Variable types and counts +var_types <- c("numeric_int", "numeric_double", "character", "factor", "logical", "date", "datetime") +vars_per_type <- c(10, 10, 10, 10, 5, 3, 2) # Total should sum to 50 + +# Generate variables and add to the data.table +var_count <- 0 +for (i in seq_along(var_types)) { + type <- var_types[i] + n_vars_type <- vars_per_type[i] + for (j in 1:n_vars_type) { + var_count <- var_count + 1 + var_name <- paste0(type, "_", j) + dt_large[, (var_name) := generate_random_data(n_rows, type)] + } +} + +## Introduce duplicates in some columns that are NOT the unique identifiers #### +# For example, we can duplicate the first 100 rows in the "numeric_int_1" and "character_1" columns +# dt_large <- rbind(dt_large, dt_large[1:100, .(numeric_int_1, character_1)]) + +# Shuffle the data to avoid ordered data +dt_large <- dt_large[sample(.N)] + + + +# dt_large[, id := .I] +dt <- copy(dt_large) + + +possible_ids( + dt = dt_large, + exclude_classes = c("numeric"), + verbose = TRUE +) + +possible_ids( + dt = dt_large, + exclude_classes = c("numeric"), + exclude = "id", + verbose = TRUE +) + +uniq_vars <- grep("unique_id", names(dt_large), value = TRUE) +pids <- possible_ids( + dt = dt_large, + #exclude_classes = c("logical", "date", "datetime", "numeric"), + exclude = "id", + #vars = uniq_vars, + verbose = TRUE, + min_combination_size = 3, + # max_combination_size = 3, + max_processing_time = 240, + get_all = TRUE +) + +possible_ids( + dt = dt_large, + verbose = TRUE +) + +## Remove the 'id' column to simulate data without a clear unique identifier #### +dt_large[, id := NULL] + +possible_ids_list <- possible_ids( + dt = dt_large, + exclude_classes = c("logical", "date", "datetime"), # Exclude some types for efficiency + verbose = TRUE +) +possible_ids_list + +possible_ids_list <- possible_ids( + dt = dt_large, + exclude_classes = c("logical", "date", "datetime", "numeric"), # Exclude some types for efficiency + max_processing_time = 120, + verbose = TRUE +) +possible_ids_list diff --git a/vignettes/aux-functions.Rmd b/vignettes/aux-functions.Rmd index 2eab35de..a668bb74 100644 --- a/vignettes/aux-functions.Rmd +++ b/vignettes/aux-functions.Rmd @@ -68,10 +68,10 @@ possible_ids(dt = x1, possible_ids(dt = x1, exclude = "_character") -# Identify possible unique identifiers, excluding character variables but considering variable z +# Identify possible unique identifiers, excluding character variables but considering variable c1 possible_ids(dt = x1, - exclude = "_character", - include = "z") + exclude_classes = "character", + include = "c1") ```