1_DVEP.Rmd

---
title: "DVEP Data Analysis"
author: "Gustavo Santos Paiva Laender Moura"
date: "`r format(Sys.Date(), '%d de %B de %Y')`"
output:
  html_document
PID: REDCap 1958
project: Effect of Eclipta prostrata (L.) L. (Asteraceae) on bioelectrical impedance
  phase angle in adults with grade I obesity (DVEP)
---

# DATA WRANGLING

## Getting started with R

```{r, results = 'hide', message = FALSE, warning = FALSE}
# 1. Getting started with R
## Clear existing data and graphics
## NOTE(review): rm(list = ls()) only affects interactive runs; it does not
## detach packages or reset options.

rm(list = ls())   # remove every object bound in the current environment
graphics.off()    # close all open graphics devices
cat("\014")  # Print a form feed character, which RStudio treats as "clear console"

## Load necessary libraries
library(tidyverse)
library(readxl)
library(lubridate)
library(stringr)
library(purrr)
library(gt)
library(jmv)
library(skimr)
```

## Read CSV data files (Tidyverse)

```{r, results = 'hide', message = FALSE, warning = FALSE}
# 2. Read codebooks (Excel / CSV) and raw data (CSV)

## Codebooks: both Excel codebooks are sorted by `index` right after import
codebook_dvep <- read_excel(
  "Codebooks/codebook_dvep.xlsx",
  col_names = TRUE,
  col_types = NULL,
  na = c("", "NA", "NI", "UNK", "NASK", "ASKU", "INV"),
  trim_ws = TRUE,
  skip = 0,        # lines skipped before the header
  n_max = Inf,     # no cap on the number of rows read
  guess_max = 1000
) |>
  arrange(index)

codebook_bia <- read_excel(
  "Codebooks/codebook_bia.xlsx",
  col_names = TRUE,
  col_types = NULL,
  na = c("", "NA", "NI", "UNK", "NASK", "ASKU", "INV"),
  trim_ws = TRUE,
  skip = 0,        # lines skipped before the header
  n_max = Inf,     # no cap on the number of rows read
  guess_max = 1000
) |>
  arrange(index)

## Only the columns from form_name_en through V3 are kept
codebook_structure <- read_csv("Codebooks/codebook_structure.csv", col_names = TRUE) |>
  select(form_name_en:V3)

codebook_ncit <- read_csv("Codebooks/codebook_ncit.csv", col_names = TRUE)

## Main dataset: every readr option is spelled out explicitly
data <- read_csv(
  "Data/data_dvep.csv",
  col_names = TRUE,
  col_types = NULL,
  col_select = NULL,
  id = NULL,
  locale = default_locale(),
  na = c("", "NA", "NI", "UNK", "NASK", "ASKU", "INV"),
  quote = "\"",
  comment = "",
  trim_ws = TRUE,
  skip = 0,        # lines skipped before the header
  n_max = Inf,     # no cap on the number of rows read
  guess_max = 1000,
  name_repair = "unique",
  num_threads = readr_threads(),
  progress = show_progress(),
  show_col_types = TRUE,
  skip_empty_rows = TRUE,
  lazy = should_read_lazy()
)

## Bioimpedance exports
data_bia_D3 <- read_csv("Data/data_bia_D3.csv", col_names = TRUE)

data_bia_D1 <- read_csv("Data/data_bia_D1.csv", col_names = TRUE)
```

## Remove identifying data from record_id

```{r, results = 'hide', message = FALSE, warning = FALSE}
# 3. Remove identifying data from record_id
# Only the first two characters of each identifier column are kept.
data[["record_id"]] <- substr(data[["record_id"]], start = 1, stop = 2)
data_bia_D3[["File"]] <- substr(data_bia_D3[["File"]], start = 1, stop = 2)
data_bia_D1[["File"]] <- substr(data_bia_D1[["File"]], start = 1, stop = 2)
```

## Renaming variables

```{r, results = 'hide', message = FALSE, warning = FALSE}
# 4. Renaming variables
# Imported columns are paired with codebook rows BY POSITION, so every
# codebook must contain exactly one row per imported column, in column order.
# The checks below turn a mismatch into an explicit error.

# 4.1 data
stopifnot(
  "codebook_dvep must have exactly one row per column of `data`" =
    length(codebook_dvep$variable) == ncol(data)
)
rename_data <- setNames(object = colnames(data), nm = codebook_dvep$variable)
data <- data |>
  rename(!!!rename_data)

rm(rename_data)

# 4.2 bia
stopifnot(
  "codebook_bia must have exactly one row per column of `data_bia_D3`" =
    length(codebook_bia$variable) == ncol(data_bia_D3)
)
# The mapping is built from the D3 column names and reused for D1; rename()
# looks the old names up by name in each table.
rename_bia <- setNames(object = colnames(data_bia_D3), nm = codebook_bia$variable)

data_bia_D3 <- data_bia_D3 |>
  rename(!!!rename_bia)

data_bia_D1 <- data_bia_D1 |>
  rename(!!!rename_bia)

rm(rename_bia)
```

## record_id as.integer

```{r, results = 'hide', message = FALSE, warning = FALSE}
# 5. record_id as.integer
# Coerce the identifier column to integer in each of the three tables
data <- data |>
  mutate(record_id = as.integer(record_id))
data_bia_D3 <- data_bia_D3 |>
  mutate(record_id = as.integer(record_id))
data_bia_D1 <- data_bia_D1 |>
  mutate(record_id = as.integer(record_id))
```

## Assign labels to variables

```{r, results = 'hide', message = FALSE, warning = FALSE}
# 6. Assign labels to variables with base R attr()
# Every codebook variable gets its `label_pt` entry stored as a "label"
# attribute; cur_column() gives the name of the column being processed.
# Only `data` and `data_bia_D3` are labelled in this chunk.
data <- mutate(
  data,
  across(
    .cols = all_of(codebook_dvep$variable),
    .fns = \(column) {
      is_current <- codebook_dvep$variable == cur_column()
      attr(column, "label") <- codebook_dvep$label_pt[is_current]
      column
    }
  )
)

data_bia_D3 <- mutate(
  data_bia_D3,
  across(
    .cols = all_of(codebook_bia$variable),
    .fns = \(column) {
      is_current <- codebook_bia$variable == cur_column()
      attr(column, "label") <- codebook_bia$label_pt[is_current]
      column
    }
  )
)
```

## Creating functions

#### filter_variables()

`filter_variables(visit = c("eleg", "V1", "V2", "V3"), include_repeating = NULL, form_name = NULL, filter_included = TRUE)`

Arguments:

-   `visit`: A vector of visit names to filter (default: all visits)

-   `include_repeating = NULL`

    -   If NULL, includes all variables

    -   If 0, does not include repeating variables

    -   If 1, only shows repeating variables

-   `form_name = NULL`: The name of the form to filter on (NULL means no filtering by form)

-   `filter_included = TRUE`: Whether to filter by the 'included' column (default: TRUE)

```{r, results = 'hide', message = FALSE, warning = FALSE}
# filter_variables(): names of the codebook variables matching the criteria.
#
# Arguments:
#   visit             Character vector of visit columns of `codebook_dvep`
#                     ("eleg", "V1", "V2", "V3"). A variable is kept when the
#                     row sum over the selected columns is greater than 0.
#   include_repeating NULL for no restriction; otherwise the value (0 or 1)
#                     that `repeating_instrument` must take.
#   form_name         NULL for no restriction; otherwise one or more form
#                     names matched against `form_name_en`.
#   filter_included   When TRUE, keep only rows whose `included` equals 1.
#
# Returns the `variable` column of the matching rows.
# Reads the global `codebook_dvep`; stops on an invalid visit or form name.
filter_variables <- function(
        visit = c("eleg", "V1", "V2", "V3"),
        include_repeating = NULL,
        form_name = NULL,
        filter_included = TRUE
        ) {

  # Validate the 'visit' input: every requested visit must be a known one
  valid_visits <- c("eleg", "V1", "V2", "V3")
  if (!all(visit %in% valid_visits)) {
    stop("Invalid visit name. Choose from 'eleg', 'V1', 'V2', or 'V3'.")
  }

  # Validate the 'form_name' input against the known form names
  valid_form_names <- c(
    "eleg", "demographic", "whoqol", "dass", "ecap", "measures", "bp_limb", "bp",
    "bia", "handgrip", "eliminations", "evs", "alcohol", "tobacco", "diet_recall",
    "intake", "dates", "allocation", "conditions", "drugs", "old.drugs", "history",
    "symptoms", "phy.exam", "labs", "ecg", "compliance", "events", "medical",
    "followup", "conclusion"
  )
  if (!is.null(form_name) && !all(form_name %in% valid_form_names)) {
    stop("Invalid form_name. Choose from: ", paste(valid_form_names, collapse = ", "))
  }

  # Visit criterion: at least one selected visit column is greater than 0
  filtered_codebook <- codebook_dvep |>
    filter(rowSums(across(all_of(visit))) > 0)

  # `.data$` pins a name to a codebook column and `.env$` to a function
  # argument, so a codebook column that happens to share an argument's name
  # cannot silently take its place.
  if (filter_included) {
    filtered_codebook <- filtered_codebook |>
      filter(.data$included == 1)
  }

  if (!is.null(include_repeating)) {
    filtered_codebook <- filtered_codebook |>
      filter(.data$repeating_instrument %in% .env$include_repeating)
  }

  # %in% (not ==) so that a vector of several form names is matched as a set
  # instead of being recycled element by element against the rows
  if (!is.null(form_name)) {
    filtered_codebook <- filtered_codebook |>
      filter(.data$form_name_en %in% .env$form_name)
  }

  filtered_codebook$variable
}

```

#### filter_data()

`filter_data <- function( visit = c("eleg", "V1", "V2", "V3"), include_repeating = NULL, form_name = NULL)`

Arguments:

-   `visit`: A vector of visit names to filter (default: all visits)

-   `include_repeating = NULL`

    -   If NULL, includes all variables

    -   If 0, does not include repeating variables

    -   If 1, only shows repeating variables

-   `form_name = NULL`: The name of the form to filter on. **Will only work for repeating instruments.**

```{r, results = 'hide', message = FALSE, warning = FALSE}

## 7.2 filter_data() based on visit, repeating instrument and form
# Returns the rows of the global `data` for the requested visits, keeping the
# four key columns (record_id, event_name, repeat_instrument, repeat_instance)
# plus the variables selected by filter_variables().
#
# Arguments:
#   visit             Visit names ("eleg", "V1", "V2", "V3").
#   include_repeating NULL = all rows; 0 = rows with an empty/NA
#                     repeat_instrument; 1 = rows with a non-empty one.
#   form_name         NULL, or form name(s) translated to the
#                     `repeat_instrument` values listed below.
#
# In the result, NA in repeat_instrument is replaced by "" and repeat_instance
# is set to NA wherever repeat_instrument is "".
filter_data <- function(
        visit = c("eleg", "V1", "V2", "V3"),
        include_repeating = NULL,
        form_name = NULL) {

  # filter_variables() validates `visit` and `form_name` (and stops on invalid
  # input), so it runs before either of them is used as a lookup key
  filtered_vars <- filter_variables(visit, include_repeating, form_name)

  # Visit name -> event_name value
  event_lookup <- c(
    "eleg" = "eleg_arm_1",
    "V1" = "1visit_arm_1",
    "V2" = "2visit_arm_1",
    "V3" = "3visit_arm_1"
  )
  mapped_visits <- unname(event_lookup[visit])

  # Form name -> repeat_instrument value
  instrument_lookup <- c(
    "eleg" = "elegibilidade",
    "demographic" = "dados_demogrficos",
    "whoqol" = "questionrio_qualidade_de_vida",
    "dass" = "escore_de_depresso_ansiedade_e_estresse",
    "ecap" = "escala_de_compulso_alimentar",
    "measures" = "antropometria",
    "bp_limb" = "presso_arterial_determinao_do_membro_de_referncia",
    "bp" = "presso_arterial",
    "bia" = "impedncia_bioeltrica_corporal",
    "handgrip" = "fora_de_preenso_palmar",
    "eliminations" = "avaliao_nutricional",
    # "allergies" = "alergia_alimentar",
    "evs" = "exercise_vital_sign",
    "alcohol" = "consumo_alcool",
    "tobacco" = "consumo_tabaco",
    "diet_recall" = "recordatrio_alimentar",
    "intake" = "avaliao_da_ingesto_alimentar",
    "dates" = "datas_importantes",
    "allocation" = "nmero_do_participante",
    "conditions" = "comorbidades",
    "drugs" = "medicamentos_de_uso_habitual",
    "old.drugs" = "medicamentos_prvios",
    "history" = "antecedentes_pessoais",
    "symptoms" = "sintomas",
    "phy.exam" = "exame_fsico",
    "labs" = "exames_laboratoriais",
    "ecg" = "eletrocardiograma",
    "compliance" = "adeso",
    "events" = "eventos_adversos",
    "medical" = "avaliao_mdica",
    "followup" = "contato_semanal",
    "conclusion" = "concluso"
    # "annex" = "anexos"
  )

  mapped_form_name <- NULL
  if (!is.null(form_name)) {
    mapped_form_name <- unname(instrument_lookup[form_name])
    # Names without a mapping are passed through unchanged
    unmapped <- is.na(mapped_form_name)
    mapped_form_name[unmapped] <- form_name[unmapped]
  }

  filtered_data <- data |>
    filter(event_name %in% mapped_visits)

  if (!is.null(include_repeating) && include_repeating == 0) {
    filtered_data <- filtered_data |>
      filter(is.na(repeat_instrument) | repeat_instrument == "")
  }
  if (!is.null(include_repeating) && include_repeating == 1) {
    filtered_data <- filtered_data |>
      filter(!is.na(repeat_instrument) & repeat_instrument != "")
  }

  # %in% (not ==) so that several forms can be requested at once without the
  # comparison being recycled row by row
  if (!is.null(mapped_form_name)) {
    filtered_data <- filtered_data |>
      filter(repeat_instrument %in% .env$mapped_form_name)
  }

  filtered_data |>
    select(record_id, event_name, repeat_instrument, repeat_instance, all_of(filtered_vars)) |>
    mutate(
      repeat_instrument = ifelse(is.na(repeat_instrument), "", repeat_instrument),
      repeat_instance = ifelse(repeat_instrument == "", NA, repeat_instance)
    )
}
```

#### filter_codebook()

`filter_codebook(form_name = c(...), included = 1)`

Arguments

-   `form_name`

-   `included = 1`: defaults to 1, filtering variables by `included` column. If set to 0, will include all variables

```{r, results = 'hide', message = FALSE, warning = FALSE}
## 7.3 filter_codebook()
# Returns the rows of the global `codebook_dvep` belonging to the given forms.
#
# Arguments:
#   form_name  One or more form names (default: every known form); matched
#              against the `form_name_en` column.
#   included   1 (default) keeps only rows whose `included` column equals 1;
#              any other value keeps all rows of the selected forms.
#
# Stops with "Invalid form name" when an unknown form is requested.
filter_codebook <- function(form_name = c(
    "eleg", "tcle", "demographic", "whoqol", "dass", "ecap", "measures",
  "bp_limb", "bp", "bia", "handgrip", "eliminations", "allergies",
  "evs", "alcohol", "tobacco", "diet_recall", "intake", "dates",
  "allocation", "conditions", "drugs", "old.drugs", "history",
  "symptoms", "phy.exam", "labs", "ecg", "compliance", "events",
  "medical", "followup", "conclusion", "annex"),
  included = 1) {

    # The default of `form_name` is the list of valid forms; reading it from
    # the function's own formals keeps a single copy of that list.
    valid_forms <- eval(formals(sys.function())[["form_name"]])

    # Ensure input is valid
    if (!all(form_name %in% valid_forms)) {
      stop("Invalid form name")
    }

    # `.env$` refers to the function argument and `.data$` to the codebook
    # column; this matters because `included` is the name of both.
    codebook_form <- codebook_dvep |>
        filter(.data$form_name_en %in% .env$form_name)

    if (included == 1) {
        codebook_form <- codebook_form |>
            filter(.data$included == 1)
    }

    codebook_form
}
```

#### convert_col_type()

`convert_col_type(data, codebook = codebook_dvep)`

Arguments:

-   `data`: dataframe to apply the function

-   `codebook = codebook_dvep`: codebook source. Defaults to `codebook_dvep`

Tips:

-   as.factor(): categorical data where the label (e.g., "6 cápsulas ao dia") is more meaningful than numeric code.
-   binary data (0, Não \| 1, Sim):
    -   Use as.factor() if the "label" (Não or Sim) is important.
    -   Use as.numeric(as.character()) if you're performing mathematical operations (e.g., calculating proportions, averages).
-   For ordinal data (1, Ruim \| 2, Regular \| 3, Boa \| 4, Excelente): use as.factor() with ordered levels (ordered()) if you need to preserve the ranking.

```{r, results = 'hide', message = FALSE, warning = FALSE}
## 7.4 convert_col_type()
# Converts every column of `data` that is listed in `codebook$variable` to the
# type given by the one-letter code in `codebook$col_types`.
#
# Arguments:
#   data      Data frame / tibble to convert.
#   codebook  Codebook with `variable` and `col_types` columns
#             (defaults to `codebook_dvep`).
#
# Returns the converted data. Columns absent from the codebook, or whose type
# code is missing or not listed below, are returned unchanged.
convert_col_type <- function(data, codebook = codebook_dvep) {
    # Nested function to convert a single column
    convert_column <- function(column, type) {
        # No usable type code: leave the column as it is
        if (length(type) != 1L || is.na(type)) {
            return(column)
        }
        switch(type,
           f = as.factor(column),                      # Factor
           o = as.factor(column),                      # Factor (unordered)
           c = as.character(column),                   # Character
           d = as.numeric(column),                     # Numeric
           i = as.integer(column),                     # Integer
           k = lubridate::ymd(column),                 # Date (YYYY-MM-DD)
           t = lubridate::ymd_hms(column),             # Date-Time (YYYY-MM-DD HH:MM:SS)
           h = lubridate::hms(column),                 # Time only (HH:MM:SS)
           n = as.numeric(as.character(column)),       # Coerce to Numeric
           l = as.logical(column),                     # Logical
           D = as.Date(column, format = "%Y-%m-%d"),   # Date with specified format
           T = as.POSIXct(column, format = "%Y-%m-%d %H:%M:%S"),  # Date-Time
           column  # Default (no change)
        )
    }

    # Only variables present in both the data and the codebook are processed
    shared_vars <- intersect(colnames(data), codebook$variable)

    # match() yields the first codebook row of each variable, so a duplicated
    # codebook entry cannot hand switch() a vector of type codes
    type_codes <- codebook$col_types[match(shared_vars, codebook$variable)]

    # The pipeline is the last expression, so the result is returned visibly
    data |>
        mutate(
            across(
                .cols = all_of(shared_vars),
                .fns = \(column) convert_column(
                    column,
                    type_codes[match(cur_column(), shared_vars)]
                )
            )
        )
}
# NOTE(review): the result of this call is not assigned, so `data` itself stays
# unconverted; the call only checks that the conversion runs without error.
invisible(convert_col_type(data))
```

#### label_variables()

`label_variables(data, codebook, language = "pt")`

Arguments:

-   `data`: dataframe to which function will be applied

-   `codebook`: source codebook

-   `language = "pt"`: defaults to portuguese (`"pt"`); set `"en"` for english

```{r, results = 'hide', message = FALSE, warning = FALSE}
# label_variables(): attach codebook labels to the columns of `data`.
#
# Arguments:
#   data      Data frame / tibble whose columns receive a "label" attribute.
#   codebook  Data frame with a `variable` column and `label_pt` / `label_en`.
#   language  "en" selects `label_en`; any other value selects `label_pt`.
#
# Returns `data` with labels set on every column named in the codebook.
label_variables <- function(data, codebook, language = "pt") {
  # `language` is a single value, so a plain if/else is enough
  label_column <- if (identical(language, "en")) "label_en" else "label_pt"

  codebook_vars <- codebook$variable
  codebook_labels <- codebook[[label_column]]

  # Variables present in both the data and the codebook
  common_vars <- intersect(names(data), codebook_vars)

  # match() gives exactly one codebook row per variable (the first match) and
  # ignores NA entries in `variable`; a logical `==` mask would carry NA rows
  # and duplicated rows into the label.
  first_row <- match(common_vars, codebook_vars)

  for (i in seq_along(common_vars)) {
    attr(data[[common_vars[i]]], "label") <- codebook_labels[first_row[i]]
  }

  data
}
```

label_variables2()

```{r, eval=FALSE, echo=FALSE}
# label_variables2(): stricter variant of label_variables().
# Stops when the label column is missing from the codebook or when no data
# column appears in the codebook; warns (and sets no label) for a variable
# that does not have exactly one codebook row. Codebook variable names are
# trimmed of surrounding whitespace before matching.
label_variables2 <- function(data, codebook, language = "pt") {
  # "en" -> label_en, anything else -> label_pt
  label_column <- ifelse(language == "en", "label_en", "label_pt")

  if (!is.element(label_column, names(codebook))) {
    stop(paste("Codebook does not contain the column:", label_column))
  }

  var_names <- trimws(codebook$variable)
  var_labels <- codebook[[label_column]]

  shared <- intersect(names(data), var_names)
  if (length(shared) == 0) {
    stop("No matching variables found between the data and the codebook.")
  }

  for (current in shared) {
    hits <- which(var_names == current)
    if (length(hits) == 1) {
      # Exactly one codebook row: safe to attach its label
      attr(data[[current]], "label") <- var_labels[hits]
    } else {
      warning(paste("Variable", current, "does not have a unique label in the codebook."))
    }
  }

  return(data)
}


```

#### label_choices()

`label_choices(data, codebook = codebook_dvep)`

Arguments:

-   `data`: dataframe to which function will be applied

-   `codebook = codebook_dvep`: source codebook (defaults to `codebook_dvep`)

```{r, results = 'hide', message = FALSE, warning = FALSE}
## 7.5 label_choices()
# Replaces raw codes by their labels for the factor-like variables of `data`.
#
# Arguments:
#   data      Data frame / tibble to recode.
#   codebook  Codebook with `variable`, `col_types` and `choices` columns
#             (defaults to `codebook_dvep`). `choices` is parsed as
#             "code, label | code, label | ...".
#
# Returns `data` with the selected columns recoded; values that match no code
# of the variable become "Unmatched".
label_choices <- function(data, codebook = codebook_dvep) {
  # Ensure required libraries are installed
  if (!requireNamespace("dplyr") || !requireNamespace("tidyr") ||
      !requireNamespace("stringr") || !requireNamespace("purrr")) {
    stop("Required libraries: dplyr, tidyr, stringr, purrr")
  }

  # 1. Filter codebook for relevant variables with col_types in "f" or "o"
  selected_codebook <- codebook |>
    filter(
      variable %in% colnames(data) &  # Variables present in data
      col_types %in% c("f", "o") &    # col_types in "f" or "o"
      # A variable without a `choices` entry has nothing to parse; skipping
      # it leaves its values untouched instead of building an NA-only lookup
      !is.na(choices)
    )

  # 2. Parse the `choices` column
  parsed_choices <- selected_codebook |>
    rowwise() |>
    mutate(
      parsed = list(
        str_split(choices, " \\| ") |>   # Split choices on " | "
          unlist() |>
          map(~ str_split_fixed(.x, ", ", 2) |>  # Split on the first ", " into code and label
            as_tibble(.name_repair = "unique") |>  # Ensure unique column names
            setNames(c("raw_value", "label"))    # Name columns
          ) |>
          bind_rows()  # Combine into a tibble
      )
    ) |>
    select(variable, parsed) |>
    unnest(parsed)  # Expand parsed choices into rows

  # 3. Create one named lookup vector (code -> label) per variable
  lookup_tables <- parsed_choices |>
    group_by(variable) |>
    summarize(
      lookup = list(setNames(label, raw_value)), .groups = "drop"
    ) |>
    deframe()

  # 4. Replace raw values with labels in data, using "Unmatched" for unmatched values
  for (column_name in names(lookup_tables)) {
    if (column_name %in% colnames(data)) {  # Ensure column exists in data
      data[[column_name]] <- recode(        # Apply recode using the lookup table
        data[[column_name]],
        !!!lookup_tables[[column_name]],
        .default = "Unmatched"             # Placeholder for values without a matching code
      )
    }
  }

  # Explicitly return the modified data
  return(data)
}
```

## Bioimpedance data

#### Data from first/third visit

Applies to participants who completed the intervention

```{r, results = 'hide', message = FALSE, warning = FALSE}
# 8. Wrangling Bioimpedance data
## 8.1 D3 DATA (contains data from first and third visits for participants who completed the intervention)
### Keep rows with a phase angle, split the timestamp into date and time, and
### keep the codebook variables flagged as included
data_bia_D3_filtered <- data_bia_D3 |>
    filter(!is.na(phaseangle)) |>
    mutate(
        date = as.Date(timestamp),           # Extract the date
        time = format(timestamp, "%H:%M:%S") # Extract the time
    ) |>
    select(
        # Subset the variable names BEFORE handing them to all_of(); `%in% 1`
        # treats a missing `included` flag as "not included"
        all_of(codebook_bia$variable[codebook_bia$included %in% 1]),
        date, time
    ) |>
    relocate(record_id, date, time, phaseangle, raverage, xcaverage, weight:w_ecwbytbw) |>
    arrange(record_id, date, time)

### Group by record_id and date and obtain mean of multiple measurements from the same day
data_bia_D3_filtered <- data_bia_D3_filtered |>
    group_by(record_id, date) |>
    summarise(
        across(c(phaseangle:m_tohimaginary), \(x) mean(x, na.rm = TRUE)),
        .groups = "drop"
    ) |>
    group_by(record_id) |> # Add coding for visit number
    mutate(
        # A participant with a single date gets visit 1 (the first matching
        # case wins); dates strictly between the earliest and latest get NA
        visit = case_when(
            date == min(date) ~ 1,  # Assign 1 to the earliest date
            date == max(date) ~ 3,  # Assign 3 to the latest date
            TRUE ~ NA_real_         # Default to NA for unexpected cases
        ),
        .after = record_id
    )
# The result stays grouped by record_id.
```

#### Data from first visit

Applies to participants who did not complete the intervention

```{r, results = 'hide', message = FALSE, warning = FALSE}
## 8.2 D1 DATA (data from the first visit for participants who did not complete the intervention)
## Keep rows with a phase angle, split the timestamp into date and time, and
## keep the codebook variables flagged as included
data_bia_D1_filtered <- data_bia_D1 |>
    filter(!is.na(phaseangle)) |>
    mutate(
        date = as.Date(timestamp),           # Extract the date
        time = format(timestamp, "%H:%M:%S") # Extract the time
    ) |>
    select(
        # Subset the variable names BEFORE handing them to all_of(); `%in% 1`
        # treats a missing `included` flag as "not included"
        all_of(codebook_bia$variable[codebook_bia$included %in% 1]),
        date, time
    ) |>
    relocate(record_id, date, time, phaseangle, raverage, xcaverage, weight:w_ecwbytbw) |>
    arrange(record_id, date, time)

### Group by record_id and date and obtain mean of multiple measurements from the same day
data_bia_D1_filtered <- data_bia_D1_filtered |>
    group_by(record_id, date) |>
    summarise(
        across(c(phaseangle:m_tohimaginary), \(x) mean(x, na.rm = TRUE)),
        .groups = "drop"
    ) |>
    group_by(record_id) |> # Add coding for visit number
    mutate(
        # A participant with a single date gets visit 1 (the first matching
        # case wins); dates strictly between the earliest and latest get NA
        visit = case_when(
            date == min(date) ~ 1,  # Assign 1 to the earliest date
            date == max(date) ~ 3,  # Assign 3 to the latest date
            TRUE ~ NA_real_         # Default to NA for unexpected cases
        ),
        .after = record_id
    )
### Selecting BIA data from D1 not present in D3
### NOTE(review): 1:75 presumably spans every enrolled record_id - confirm
data_bia_D1_filtered <- data_bia_D1_filtered |>
    filter(
     record_id %in% setdiff(1:75, data_bia_D3_filtered$record_id)
    )
```

#### Merging to single tibble

```{r, results = 'hide', message = FALSE, warning = FALSE}
## 8.3 MERGE D1/D3 BIA data into a single tibble
# Stack both tables, drop the record_id grouping, store `visit` as integer,
# sort by participant and visit, then attach the codebook labels.
data_bia <- list(data_bia_D1_filtered, data_bia_D3_filtered) |>
    bind_rows() |>
    ungroup() |>
    mutate(visit = as.integer(visit)) |>
    arrange(record_id, visit) |>
    label_variables(codebook = codebook_bia)
```

#### Drop intermediate tibbles

```{r, results = 'hide', message = FALSE, warning = FALSE}
# 8.4. DROP intermediate tibbles
# The raw and filtered BIA tables are no longer needed once `data_bia` exists
rm(
    data_bia_D1,
    data_bia_D1_filtered,
    data_bia_D3,
    data_bia_D3_filtered
)
```

## Wrangling DVEP REDCap data

#### Adding NCIT labels

```{r, results = 'hide', message = FALSE, warning = FALSE}
## 9.2 Repeating instruments
### 9.2.1. Concatenating NCIT labels (from codebook_ncit) to NCIT codes
#### Conditions (comorbidities)
# The `descriptive` text of each NCIT code is joined in and placed right
# after the code column; labels are attached as the final step.
I21_conditions_R <- filter_data(
    visit = "eleg",
    include_repeating = 1,
    form_name = "conditions"
) |>
    left_join(
        select(codebook_ncit, ncit_code, descriptive),
        by = join_by(common_comorbidities == ncit_code)
    ) |>
    relocate(descriptive, .after = common_comorbidities) |>
    label_variables(codebook = codebook_dvep)

#### Drugs in regular use
I22_drugs_R <- filter_data(
    visit = "eleg",
    include_repeating = 1,
    form_name = "drugs"
) |>
    left_join(
        select(codebook_ncit, ncit_code, descriptive),
        by = join_by(drugs_sql == ncit_code)
    ) |>
    relocate(descriptive, .after = drugs_sql) |>
    label_variables(codebook = codebook_dvep)

# # 1.3 Previous drugs
# I23_old_drugs_R <- filter_data("eleg",1,"old.drugs") |> 
#     left_join(
#         codebook_ncit |> select(ncit_code, descriptive),
#         join_by(common_previous_medications == ncit_code)
#     )|>
#     relocate(
#         descriptive, .after = common_previous_medications
#     )
# 
# # 1.4 Past medical conditions
# I24_old_conditions_R <- filter_data("eleg",1,"history") |> 
#     left_join(
#         codebook_ncit |> select(ncit_code, descriptive),
#         join_by(common_medical_history == ncit_code)
#     )|>
#     relocate(
#         descriptive, .after = common_medical_history
#     )


# Most common comorbidities
# I21_conditions_R |> 
#     group_by(common_comorbidities, descriptive) |> 
#     count(common_comorbidities, sort = TRUE, name = "frequency") |> 
#     mutate(percentage = round((frequency/75 * 100),1)) |> 
#     view()

# NCIT    Condition   
# C3117	Hipertensão	                18	24         *1
# C26696	Ansiedade	                16	21.3        
# C37967	Hipercolesterolemia         16	21.3       *2
# C37971	Hipertrigliceridemia        13	17.3       *2
# C113101	Resistência à insulina      11	14.7       *3
# C26800	Hipotireoidismo             9	12         
# C89715	Enxaqueca	                8	10.7
# C114667	SOP                     	7	9.3
# C26747	DM2                     	7	9.3        *3
 
# Most common drugs
# ```{r, eval = FALSE}
# I22_drugs_R |> 
#     group_by(drugs_sql, descriptive) |> 
#     count(drugs_sql, sort = TRUE, name = "frequency") |> 
#     mutate(percentage = round((frequency/75 * 100),1)) |> 
#     view()
```

#### Exclusive variables from Eleg/D1 `data_d1_exclusive`

```{r, results = 'hide', message = FALSE, warning = FALSE}
## 9.1 Exclusive variables from Eleg/D1 (`data_d1_exclusive`) to be replicated to d2 and d3
# Eligibility event: allocation, completion status and intervention duration
# (conclusion_date minus intervention_start_date, as a number)
eleg_exclusive <- filter_data(visit = "eleg", include_repeating = 0) |>
    mutate(
        intervention_duration = as.numeric(conclusion_date - intervention_start_date)
    ) |>
    select(
        record_id,
        allocation_group,
        completed_intervention,
        intervention_duration,
        non_completion_reason,
        age,
        sex
    )

# First visit: columns from `race` through `income_level`
# codebook_dvep$choices[codebook_dvep$variable == "race"]
# [1] "c41260, Asiático | c41261, Branco origem europeia | c128994, Branco origem América do Sul | c16352, Negro | c17998, desconhecido | c17649, Outro"
visit_1_exclusive <- filter_data(visit = "V1", include_repeating = 0) |>
    select(record_id, race:income_level) |>
    # Code c41261 is folded into c128994
    mutate(race = if_else(race == "c41261", "c128994", race))

data_d1_exclusive <- left_join(
    eleg_exclusive,
    visit_1_exclusive,
    by = join_by(record_id)
)

rm(eleg_exclusive, visit_1_exclusive)
```

### Repeating instruments

#### Creating relevant binary variables\*

Hypertension

```{r, results = 'hide', message = FALSE, warning = FALSE}
### 9.3.1 HYPERTENSION
#### Record IDs whose comorbidity code matches C3117
hypertension_conditions <- I21_conditions_R |>
    filter(str_detect(common_comorbidities, "C3117")) |>
    pull(record_id)

#### Record IDs whose drug code matches one of the listed codes
hypertension_drugs <- I22_drugs_R |>
    filter(
        str_detect(
            drugs_sql,
            "C66869|C29098|C61635|C47640_2|C28836|C29254|C62027|C62027_2"
        )
    ) |>
    pull(record_id)

#### hypertension = 1 when the record appears in either list, otherwise 0
data_d1_exclusive <- data_d1_exclusive |>
    mutate(
        hypertension = if_else(
            record_id %in% hypertension_conditions |
                record_id %in% hypertension_drugs,
            1,
            0
        )
    )

rm(hypertension_conditions, hypertension_drugs)
```

Dyslipidemia

```{r, results = 'hide', message = FALSE, warning = FALSE}
### 9.3.2 DYSLIPIDEMIA
#### Record IDs whose comorbidity code matches C37967 or C37971
dyslipidemia_conditions <- I21_conditions_R |>
    filter(str_detect(common_comorbidities, "C37967|C37971")) |>
    pull(record_id)

#### Record IDs whose drug code matches one of the listed codes
dyslipidemia_drugs <- I22_drugs_R |>
    filter(
        str_detect(
            drugs_sql,
            "C29454|C66523_2|C47529|C61527|C87471"
        )
    ) |>
    pull(record_id)

#### dyslipidemia = 1 when the record appears in either list, otherwise 0
data_d1_exclusive <- data_d1_exclusive |>
    mutate(
        dyslipidemia = if_else(
            record_id %in% dyslipidemia_conditions |
                record_id %in% dyslipidemia_drugs,
            1,
            0
        )
    )

rm(dyslipidemia_conditions, dyslipidemia_drugs)
```

Insulin resistance

```{r, results = 'hide', message = FALSE, warning = FALSE}
### 9.3.3 INSULIN RESISTANCE
#### Record IDs whose comorbidity code matches C113101 or C26747
insulin_conditions <- I21_conditions_R |>
    filter(str_detect(common_comorbidities, "C113101|C26747")) |>
    pull(record_id)

#### Record IDs whose drug code matches one of the listed codes
insulin_drugs <- I22_drugs_R |>
    filter(
        str_detect(
            drugs_sql,
            "C61612|C61612_2|C87618|C180533"
        )
    ) |>
    pull(record_id)

#### insulin = 1 when the record appears in either list, otherwise 0
data_d1_exclusive <- data_d1_exclusive |>
    mutate(
        insulin = if_else(
            record_id %in% insulin_conditions |
                record_id %in% insulin_drugs,
            1,
            0
        )
    )

rm(insulin_conditions, insulin_drugs)
```

Drugs that might induce weight loss

```{r, results = 'hide', message = FALSE, warning = FALSE}
### 9.3.4 DRUGS THAT MIGHT INDUCE WEIGHT LOSS
#### Extract record IDs
# The id vector has its own name on purpose: a vector called `drugs_w_loss`
# is shadowed inside mutate() by the column of the same name whenever this
# chunk is re-run on a table that already contains that column.
drugs_w_loss_ids <- I22_drugs_R |>
    filter(
        str_detect(
            drugs_sql,
            "C61939|C62012|C506_1|C1278_2|C1278_1|C1278_3|C47764_1|C47764_2|C61680"
        )
    ) |>
    pull(record_id)

#### drugs_w_loss = 1 when the record uses one of the listed drugs, otherwise 0
data_d1_exclusive <- data_d1_exclusive |>
    mutate(
        drugs_w_loss = if_else(record_id %in% drugs_w_loss_ids, 1, 0)
    )

rm(drugs_w_loss_ids)
```

Drugs that might induce weight gain

```{r, results = 'hide', message = FALSE, warning = FALSE}
### 9.3.5 DRUGS THAT MIGHT INDUCE WEIGHT GAIN
#### Extract record IDs
# The id vector has its own name on purpose: a vector called `drugs_w_gain`
# is shadowed inside mutate() by the column of the same name whenever this
# chunk is re-run on a table that already contains that column.
drugs_w_gain_ids <- I22_drugs_R |>
    filter(
        str_detect(
            drugs_sql,
            "C61879|C62005|C61917_2|C29416|C29536_2"
        )
    ) |>
    pull(record_id)

#### drugs_w_gain = 1 when the record uses one of the listed drugs, otherwise 0
data_d1_exclusive <- data_d1_exclusive |>
    mutate(
        drugs_w_gain = if_else(record_id %in% drugs_w_gain_ids, 1, 0)
    )

rm(drugs_w_gain_ids)

```

\*Relevant binary variables:

-   Hypertension present if: C3117 Hipertensão C66869 Losartana C29098 Hidroclorotiazida C61635 Anlodipino C47640_2 Olmesartana C28836 Atenolol C29254 Metoprolol C62027 Enalapril 10 mg C62027_2 Enalapril 20 mg

-   Dyslipidemia present if: C37967 Hipercolesterolemia C37971 Hipertrigliceridemia C29454 Sinvastatina C66523_2 Rosuvastatina C47529 Ezetimiba C61527 Atorvastatina C87471 Ciprofibrato

-   Insulin resistance present if: C113101 Resistência insulínica C26747 DM2 C61612 Metformina 500 mg C61612_2 Metformina 850 mg C87618 Gliclazida 30 mg C180533 Empagliflozin/Linagliptin

-   Drugs that might induce weight loss C61939 Sertralina C62012 Bupropiona C506_1 Fluoxetina C1278_2 Venlafaxina 75 mg C1278_1 Venlafaxina 37,5 mg C1278_3 Venlafaxina 150 mg C47764_1 Topiramato 25 mg C47764_2 Topiramato 50 mg C61680 Citalopram 20 mg

-   Drugs that might induce weight gain C61879 Paroxetina 20 mg C62005 Amitriptilina 25 mg C61917_2 Quetiapina 50 mg C29416 Risperidona 2 mg C29536_2 Ácido Valpróico 250 mg "C61879\|C62005\|C61917_2\|C29416\|C29536_2"

###### Wrapping up `data_d1_exclusive`

```{r, results = 'hide', message = FALSE, warning = FALSE}

# Recode choices, convert column types, turn the five derived flags into
# factors, and attach the variable labels as the last step.
data_d1_exclusive <- data_d1_exclusive |>
    label_choices(codebook_dvep) |>
    convert_col_type(codebook_dvep) |>
    mutate(
        across(
            c(hypertension, dyslipidemia, insulin, drugs_w_loss, drugs_w_gain),
            as.factor
        )
    ) |>
    label_variables(codebook_dvep)

```

#### Lab exams

```{r, results = 'hide', message = FALSE, warning = FALSE}
# Lab results from the repeating "labs" instrument of visits 1 to 3.
# `visit` is the position of event_name in the list below (1, 2 or 3), stored
# as a double and placed right after record_id; unmatched events give NA.
I27_labs_R <- filter_data(
    visit = c("V1", "V2", "V3"),
    include_repeating = 1,
    form_name = "labs"
) |>
    mutate(
        visit = as.numeric(
            match(event_name, c("1visit_arm_1", "2visit_arm_1", "3visit_arm_1"))
        ),
        .after = record_id
    ) |>
    select(
        -event_name,
        -repeat_instrument,
        -repeat_instance,
        -labs_checked_results_yn
    )

```

#### Compliance

```{r, results = 'hide', message = FALSE, warning = FALSE}
# Compliance rows for visit 2 (rows where `cp_compliance_complete == 2`).
compliance_V2 <- data |>
    select(record_id, event_name, filter_variables("V2", 1, "compliance")) |>
    filter(event_name == "2visit_arm_1" & cp_compliance_complete == 2)

# Compliance rows for visit 3.
# NOTE(review): the second argument of filter_variables() is left empty here
# but is 1 for visit 2 -- confirm this is intended.
compliance_V3 <- data |>
    select(record_id, event_name, filter_variables("V3", , "compliance")) |>
    filter(event_name == "3visit_arm_1" & cp_compliance_complete == 2)

# Stack both visits, then attach the per-participant dates once:
#   - intervention start / conclusion dates from the eligibility event
#   - the visit-2 evaluation date (stored as `evaluation_date_2` for both visits)
I29_compliance <- bind_rows(compliance_V2, compliance_V3) |>
    left_join(
        data |>
            filter(
                event_name == "eleg_arm_1" & !is.na(intervention_start_date)
            ) |>
            select(record_id, intervention_start_date, conclusion_date),
        by = join_by(record_id)
    ) |>
    left_join(
        data |>
            filter(event_name == "2visit_arm_1" & !is.na(evaluation_date)) |>
            select(record_id, evaluation_date),
        by = join_by(record_id)
    ) |>
    rename(evaluation_date_2 = evaluation_date) |>
    mutate(record_id = as.integer(record_id)) |>
    mutate(
        # Map the event name to its visit number (2 or 3); NA otherwise
        visit = c(2, 3)[match(event_name, c("2visit_arm_1", "3visit_arm_1"))],
        .after = record_id
    ) |>
    arrange(record_id, visit) |>
    select(
        record_id, visit,
        intervention_start_date, evaluation_date_2, conclusion_date,
        cp_taking_as_directed_yn,
        cp_schedule, cp_schedule_other,
        cp_missed_dose_yn, cp_missed_dose_count,
        cp_discontinued_yn, cp_discontinued_n_days,
        cp_discontinued_reason_other,
        cp_ran_out_of_drug_yn, cp_ran_out_reason,
        cp_perceived_improvement_yn, cp_perceived_improvement,
        cp_medication_confidence_scale,
        cp_self_reported_compliance_rate
    ) |>
    convert_col_type() |>
    label_variables(codebook_dvep) |>
    label_choices(codebook_dvep)

# The per-visit intermediates are no longer needed
rm(compliance_V2, compliance_V3)

```

#### Adverse events

```{r, results = 'hide', message = FALSE, warning = FALSE}
## 9.9 Adverse events
# Keep only rows where `cp_adverse_event_this_cycle_yn == 1`, add a numeric
# `visit` column after `record_id`, drop the bookkeeping columns, and apply
# the codebook's variable and choice labels.
I30_events_R <- filter_data(, 1, "events") |>
    filter(cp_adverse_event_this_cycle_yn == 1) |>
    mutate(
        # Position in the event vector is the visit number; NA for other events
        visit = as.numeric(
            match(
                event_name,
                c("1visit_arm_1", "2visit_arm_1", "3visit_arm_1")
            )
        ),
        .after = record_id
    ) |>
    select(
        -c(
            event_name,
            repeat_instrument,
            cp_additional_adverse_events_yn,
            cp_adverse_event_this_cycle_yn
        )
    ) |>
    label_variables(codebook_dvep) |>
    label_choices(codebook_dvep)
```

### Non-repeating instruments

#### Data common to V1 and V3 (`d1d3`)

```{r, results = 'hide', message = FALSE, warning = FALSE}

### Non-repeating data common to V1 and V3 (`d1d3`)
# `handgrip` is the mean of the right- and left-hand handgrip strength means.
# If only one hand was measured that value is used as-is; if neither was
# measured the result is NA.
# `visit` is the numeric visit (1 or 3) derived from the event name.
d1d3 <- filter_data(c("V1", "V3"), 0) |>
    mutate(
        handgrip = case_when(
            # Neither hand measured
            is.na(handgrip_right_mean) & is.na(handgrip_left_mean) ~ NA_real_,
            # Only the right hand measured
            is.na(handgrip_left_mean) ~ handgrip_right_mean,
            # Only the left hand measured
            is.na(handgrip_right_mean) ~ handgrip_left_mean,
            # Both hands measured
            TRUE ~ rowMeans(
                cbind(handgrip_right_mean, handgrip_left_mean),
                na.rm = TRUE
            )
        )
    ) |>
    mutate(
        visit = c(1, 3)[match(event_name, c("1visit_arm_1", "3visit_arm_1"))]
    ) |>
    select(
        record_id, visit,
        # 4. whoqol
        whoqol_score_overall,
        # 5. dass, 6. ecap
        dass_score_depression:ecap_score,
        # 7. measures
        height, weight, abdomen, arm, bmi,
        # 9. bp
        mean_bp_mean,
        # 10. bia
        time_fasted_food, time_fasted_liquid, resistance, reactance, phase_angle,
        # 11. handgrip
        handgrip,
        # 12. eliminations
        # 14. evs
        evs_score,
        # 15. alcohol
        alcohol_dose, alcohol_significant,
        # 16. tobacco
        smoke_history, pack_years,
        # 18. intake
        carbs_kcal, protein_kcal, fat_kcal,
        # 31. medical
        drugs_dose_change_yn, drugs_dose_change_notes,
        intervention_prevention_reason_yn,
        specify_intervention_prevention_reasons,
        intervention_delivered_yn,
        explain_intervention_not_delivered
    )
```

#### Data from the second visit (`d2`)

```{r, results = 'hide', message = FALSE, warning = FALSE}
## 9.5 Non-repeating data from the second visit (`d2`)
# Same layout as `d1d3`, restricted to the variables selected below for visit 2.
d2 <- filter_data("V2", 0) |>
    mutate(
        # 2 for the visit-2 event, NA for anything else
        visit = if_else(event_name == "2visit_arm_1", 2, NA_real_)
    ) |>
    select(
        record_id, visit,
        # 7. measures
        height, weight, abdomen, arm, bmi,
        # 9. bp
        mean_bp_mean,
        # 12. eliminations
        # 14. evs
        evs_score,
        # 31. medical
        drugs_dose_change_yn, drugs_dose_change_notes,
        intervention_prevention_reason_yn,
        specify_intervention_prevention_reasons,
        intervention_delivered_yn,
        explain_intervention_not_delivered
    )
```

### Joining

#### Bind rows from `d1d3` and `d2`: `data_filtered`

```{r, results = 'hide', message = FALSE, warning = FALSE}
## 9.6 Bind rows for non-repeating variables from D1/D2/D3
# Stack the visit 1/3 and visit 2 tables, use integer keys, sort by
# participant and visit, then apply the column-type conversion.
data_filtered <- bind_rows(d1d3, d2) |>
    mutate(across(c(record_id, visit), as.integer)) |>
    arrange(record_id, visit) |>
    convert_col_type()

# The per-visit intermediates are no longer needed
rm(d1d3, d2)
```

#### Left_joins

##### BIA data to `data_filtered`

```{r, results = 'hide', message = FALSE, warning = FALSE}
## 9.7. Left_join BIA data to `data_filtered`
# NOTE(review): `weight`, `height` and `bmi` are selected here and were also
# selected into `data_filtered` from d1d3/d2, so the join presumably returns
# them with dplyr's default `.x`/`.y` suffixes -- confirm this is intended.
data_filtered <- left_join(
    data_filtered,
    select(
        data_bia,
        record_id, visit,
        phaseangle, raverage, xcaverage,
        weight, height, waist, pal, bmi,
        fmi, ffmi, vat,
        w_tbw, w_ecw
    ),
    by = join_by(record_id, visit)
)

```

##### Compliance data to `data_filtered`

```{r, results = 'hide', message = FALSE, warning = FALSE}
# Attach the compliance columns to each participant/visit row; rows without a
# matching compliance record get NA in those columns.
data_filtered <- left_join(
    data_filtered,
    I29_compliance,
    by = join_by(record_id, visit)
)

```

##### `data_d1_exclusive` to `data_filtered`

```{r, results = 'hide', message = FALSE, warning = FALSE}
# Repeat the visit-1-only columns on every visit row of the same participant.
# A right join keeps every row of `data_filtered` while placing the
# `data_d1_exclusive` columns first; `visit` is then moved next to `record_id`.
data_filtered <- right_join(
    data_d1_exclusive,
    data_filtered,
    by = join_by(record_id)
) |>
    relocate(visit, .after = record_id)

```

##### Labs to `data_filtered`

```{r, results = 'hide', message = FALSE, warning = FALSE}
# Attach lab results per participant/visit, label variables from both
# codebooks (DVEP first, then BIA), and store `visit` as an integer.
data_filtered <- data_filtered |>
    left_join(I27_labs_R, by = join_by(record_id, visit)) |>
    label_variables(codebook_dvep) |>
    label_variables(codebook_bia) |>
    mutate(visit = as.integer(visit))
```

### Supertibble (`data_instruments`)

Creates a supertibble with one tibble for each instrument

```{r, results = 'hide', message = FALSE, warning = FALSE}
# Instrument (form) names as listed in the codebook.
form_names <- unique(codebook_dvep$form_name_en)
# Drop the second form.
# NOTE(review): this is a positional index -- confirm which form it is meant
# to exclude and that the codebook order is stable.
form_names <- form_names[-2]

# One character vector of variable names per instrument, named
# "I<nn>_<form_name>" where <nn> is the zero-padded position in `form_names`.
instruments <- setNames(
    lapply(form_names, function(form_name) {
        filter_codebook(form_name, 0)$variable
    }),
    sprintf("I%02d_%s", seq_along(form_names), form_names)
)

# These variables are excluded from the missing-data check below, so a row
# that only has values in them is treated as empty.
always_present_vars <- c("record_id", "event_name", "repeat_instrument", "repeat_instance")

# Build one tibble per instrument, keeping only rows where at least one of the
# instrument's own variables is not NA. `lapply()` over the named list keeps
# the instrument names on the result.
data_instruments <- lapply(instruments, function(selected_vars) {
    # Variables that decide whether a row carries data for this instrument
    vars_to_check <- setdiff(selected_vars, always_present_vars)

    data |>
        # Columns belonging to the current instrument
        select(all_of(selected_vars)) |>
        # `if_any()` replaces the deprecated `cur_data()` idiom; with no
        # variables to check it is FALSE for every row, so nothing is kept
        # (same result as the previous rowSums(...) > 0 test).
        filter(if_any(all_of(vars_to_check), ~ !is.na(.x)))
})

rm(always_present_vars)
rm(form_names)
rm(instruments)

# Optional: save each tibble in the global environment as its own object.
# `list2env()` turns every element of `data_instruments` into an object named
# after the instrument.
#list2env(data_instruments, .GlobalEnv)

```

### Export tibbles to the `Output/Baseline/Instruments` folder

```{r, results = 'hide', message = FALSE, warning = FALSE}
# Folder that receives one CSV per instrument plus the combined tables
output_dir <- 'Output/Baseline/Instruments'

# write_csv() does not create missing folders, so make sure the target exists
# (no-op when it is already there).
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

# Iterate over `data_instruments`
for (instr_name in names(data_instruments)) {
    # Create the file path for the current instrument
    file_path <- file.path(output_dir, paste0(instr_name, ".csv"))

    # Write the tibble to a CSV file
    write_csv(data_instruments[[instr_name]], file_path)

    # Print a message confirming the export
    message("Exported: ", file_path)
}

# Additional tibbles
write_csv(data_bia, file.path(output_dir, "data_bia.csv"))
write_csv(data_d1_exclusive, file.path(output_dir, "data_d1_exclusive.csv"))
write_csv(data_filtered, file.path(output_dir, "data_filtered.csv"))

# Remove the helper variables created by this chunk
rm(output_dir)
rm(file_path)
rm(instr_name)
```

------------------------------------------------------------------------

# DATA ANALYSIS

## Simplifying Environment

```{r}
# Bundle the four codebooks into one tibble: `name` holds the key and `data`
# is a list-column with the corresponding codebook.
codebooks <- tibble(
    name = c("bia", "dvep", "ncit", "structure"),
    data = list(codebook_bia, codebook_dvep, codebook_ncit, codebook_structure)
)

# Name the list-column elements so a codebook can be pulled by key,
# e.g. codebooks$data[["dvep"]]
codebooks$data <- setNames(codebooks$data, codebooks$name)

# The standalone codebook objects are now redundant
rm(list = c("codebook_bia", "codebook_dvep", "codebook_ncit", "codebook_structure"))


```

### Pull individual codebooks

You can pull each individual codebook by:

-   codebooks\$data[["bia"]]
-   codebooks\$data[["dvep"]]
-   codebooks\$data[["ncit"]]
-   codebooks\$data[["structure"]]

## Functions

### summarize_numerical()

`summarize_numerical(data, group_col = NULL, use_labels = TRUE)`

Arguments:

-   `data`: dataframe
-   `group_col = NULL`: grouping column; defaults to NULL
-   `use_labels = TRUE`: if `TRUE`, uses variable labels instead of variable names

```{r, results = 'hide', message = FALSE, warning = FALSE, eval=FALSE}
# Summarise every numeric column of `data` as "mean (lower–upper)", where the
# bounds are mean -/+ 1.96 * sd / sqrt(n non-missing), all rounded to 1 decimal.
#
# Arguments:
#   data       Data frame to summarise.
#   group_col  Optional character vector of grouping column name(s); NULL
#              (default) summarises the whole data frame.
#   use_labels If TRUE, report each variable by its "label" attribute when it
#              has one; otherwise by its column name.
#
# Returns a long table with the grouping column(s) (if any), `Variable` and
# `Value`, one row per variable (and group). When `data` has no numeric column
# to summarise, a zero-row table with those columns is returned.
summarize_numerical <- function(data, group_col = NULL, use_labels = TRUE) {
  # Display name per column: the label attribute, or the column name as fallback
  variable_labels <- vapply(names(data), function(var) {
    label <- attr(data[[var]], "label")
    if (is.null(label) || !use_labels) var else as.character(label)[1L]
  }, character(1))

  # Numeric columns to summarise. Grouping columns are excluded: across()
  # cannot select them inside a grouped summarise(), so a numeric grouping
  # column would otherwise raise an error.
  is_numeric_col <- vapply(data, is.numeric, logical(1))
  numeric_cols <- setdiff(names(data)[is_numeric_col], group_col)

  # pivot_longer() refuses an empty column selection; return an empty result
  if (length(numeric_cols) == 0) {
    return(bind_cols(
      as_tibble(data[0, group_col, drop = FALSE]),
      tibble(Variable = character(), Value = character())
    ))
  }

  # Format one column as "mean (lower–upper)"
  format_mean_ci <- function(x) {
    center <- mean(x, na.rm = TRUE)
    half_width <- 1.96 * sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))
    paste0(
      round(center, 1),
      " (",
      round(center - half_width, 1),
      "–",
      round(center + half_width, 1),
      ")"
    )
  }

  if (!is.null(group_col)) {
    data <- data %>% group_by(across(all_of(group_col)))
  }

  data %>%
    summarise(across(all_of(numeric_cols), format_mean_ci, .names = "{.col}")) %>%
    pivot_longer(all_of(numeric_cols), names_to = "Variable", values_to = "Value") %>%
    # Replace variable names with labels/names (plain, unnamed character column)
    mutate(Variable = unname(variable_labels[Variable]))
}
```

------------------------------------------------------------------------

### summarize_categorical()

`summarize_categorical(data, group_col = NULL, use_labels = TRUE)`

Arguments:

-   `data`: dataframe
-   `group_col = NULL`: grouping column; defaults to NULL
-   `use_labels = TRUE`: if `TRUE`, uses variable labels instead of variable names

```{r, results = 'hide', message = FALSE, warning = FALSE}
# Tabulate every character/factor column of `data`: one row per level with its
# frequency and percentage (rounded to 1 decimal).
#
# Arguments:
#   data       Data frame to summarise.
#   group_col  Optional character vector of grouping column name(s). When
#              given, counts and percentages are computed within each group.
#   use_labels If TRUE, report each variable by its "label" attribute when it
#              has one; otherwise by its column name.
#
# Returns a table with `Variable`, `Level`, `Freq`, `Percent`, followed by the
# grouping column(s) if any. Unused factor levels are kept (`.drop = FALSE`).
# When there is no categorical column to summarise, a zero-row table with the
# four leading columns is returned.
summarize_categorical <- function(data, group_col = NULL, use_labels = TRUE) {
  # Display name per column: the label attribute, or the column name as fallback
  variable_labels <- vapply(names(data), function(var) {
    label <- attr(data[[var]], "label")
    if (is.null(label) || !use_labels) var else as.character(label)[1L]
  }, character(1))

  # Categorical columns to tabulate. The grouping column(s) are excluded:
  # tabulating a column within its own groups only yields 100% rows and leaves
  # the group column NA after binding.
  is_categorical_col <- vapply(
    data,
    function(column) is.character(column) || is.factor(column),
    logical(1)
  )
  categorical_cols <- setdiff(names(data)[is_categorical_col], group_col)

  # Nothing to tabulate: return the expected columns instead of failing in
  # the final select()
  if (length(categorical_cols) == 0) {
    return(tibble(
      Variable = character(),
      Level = character(),
      Freq = integer(),
      Percent = numeric()
    ))
  }

  if (!is.null(group_col)) {
    data <- data %>% group_by(across(all_of(group_col)), .drop = FALSE)
  }

  # Frequency table for a single column (within groups when `data` is grouped)
  tabulate_column <- function(col) {
    data %>%
      count(!!sym(col), .drop = FALSE, name = "Freq") %>%
      mutate(
        Percent = round(100 * Freq / sum(Freq), 1),
        Variable = unname(variable_labels[col]) # Use labels or names
      ) %>%
      rename(Level = !!sym(col)) %>%
      ungroup()
  }

  lapply(categorical_cols, tabulate_column) %>%
    bind_rows() %>%
    # Arrange columns for consistency
    select(Variable, Level, Freq, Percent, everything())
}

```

------------------------------------------------------------------------

### compare_groups()

```{r, results = 'hide', message = FALSE, warning = FALSE, eval=FALSE}
# Compare every column of `data` across the groups in `group_col` and return
# the results as a gt table.
#
# Numeric columns are compared with t.test(); factor/character columns with
# chisq.test(), switching to fisher.test() when any expected count is below 5.
# Columns of other types are skipped.
#
# Arguments:
#   data       Data frame holding the variables and the grouping column.
#   group_col  Name (single string) of the grouping column.
#   use_labels If TRUE, report each variable by its "label" attribute when it
#              has one; otherwise by its column name.
#
# A test that raises an error (e.g. an all-missing variable, or a grouping
# column that does not have exactly two levels for the t-test) does not abort
# the table: the variable is reported with NA statistic and p-value, and a
# warning carrying the original error message is issued.
compare_groups <- function(data, group_col = "allocation_group", use_labels = TRUE) {
  stopifnot(
    is.data.frame(data),
    is.character(group_col),
    length(group_col) == 1L,
    group_col %in% names(data)
  )

  # Display name per column: the label attribute, or the column name as fallback
  variable_labels <- vapply(names(data), function(var) {
    label <- attr(data[[var]], "label")
    if (is.null(label) || !use_labels) var else as.character(label)[1L]
  }, character(1))

  groups <- data[[group_col]]

  # Run a test; on error warn (naming the variable) and return NULL
  run_test <- function(test_fun, var) {
    tryCatch(
      test_fun(),
      error = function(e) {
        warning(
          "compare_groups(): test failed for `", var, "`: ", conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
  }

  # Extract one numeric field from a test result; NA when the test failed
  field_or_na <- function(fit, field) {
    if (is.null(fit)) NA_real_ else as.numeric(fit[[field]])[1L]
  }

  # One-row result for a variable
  result_row <- function(var, test_name, statistic, p_value) {
    data.frame(
      Variable = unname(variable_labels[var]), # Use label or name
      Test = test_name,
      Statistic = round(statistic, 2),
      P_value = round(p_value, 4),
      stringsAsFactors = FALSE
    )
  }

  # Test a single variable against the grouping column; NULL if not testable
  compare_one <- function(var) {
    values <- data[[var]]

    if (is.numeric(values)) {
      fit <- run_test(function() t.test(values ~ groups), var)
      return(result_row(
        var, "t-test",
        field_or_na(fit, "statistic"), field_or_na(fit, "p.value")
      ))
    }

    if (is.factor(values) || is.character(values)) {
      contingency_table <- table(values, groups)

      # chisq.test() is run once, inside the error handler. Its small-count
      # warning is suppressed because that case is handled by switching to
      # Fisher's exact test just below.
      chi_fit <- run_test(
        function() suppressWarnings(chisq.test(contingency_table)),
        var
      )

      if (!is.null(chi_fit) && any(chi_fit$expected < 5)) {
        fisher_fit <- run_test(function() fisher.test(contingency_table), var)
        # No test statistic is reported for Fisher's exact test
        return(result_row(
          var, "Fisher's exact test",
          NA_real_, field_or_na(fisher_fit, "p.value")
        ))
      }

      return(result_row(
        var, "Chi-squared test",
        field_or_na(chi_fit, "statistic"), field_or_na(chi_fit, "p.value")
      ))
    }

    NULL
  }

  # Empty frame fixes the column layout even when no variable is testable
  empty_results <- data.frame(
    Variable = character(),
    Test = character(),
    Statistic = numeric(),
    P_value = numeric(),
    stringsAsFactors = FALSE
  )

  # Build all rows first and bind once (instead of growing with rbind in a loop)
  rows <- lapply(setdiff(names(data), group_col), compare_one)
  rows <- Filter(Negate(is.null), rows)
  results <- do.call(rbind, c(list(empty_results), rows))
  rownames(results) <- NULL

  # Convert the results data.frame into a gt table for better visualization
  results %>%
    gt() %>%
    tab_header(
      title = "Hypothesis Test Results",
      subtitle = paste("Comparison of", group_col)
    ) %>%
    cols_label(
      Variable = "Variable",
      Test = "Test Type",
      Statistic = "Test Statistic",
      P_value = "P-value"
    )
}
```

1.  **Extract Variable Labels**:
    -   Fetches the `label` attribute for each variable.
    -   If no label exists or if `use_labels = FALSE`, defaults to the variable name.
2.  **Prepare Results Storage**:
    -   An empty `data.frame` is created to store test results, including the variable name/label, test type, test statistic, and p-value.
3.  **Loop Through Variables**:
    -   **Numeric Variables**:
        -   A Welch two-sample t-test (the `t.test()` default) is run to compare means between groups.
        -   Test results (t-statistic and p-value) are appended to the results `data.frame`.
    -   **Categorical Variables**:
        -   A contingency table is created for the variable and the grouping column.
        -   If expected counts in the table are too small, Fisher's exact test is used.
        -   Otherwise, a chi-squared test is performed.
        -   Results are appended to the results `data.frame`.
4.  **Error Handling**:
    -   The chi-squared test may fail for certain edge cases (e.g., empty levels). Errors are caught and handled gracefully by returning `NA` for the statistic and p-value.
5.  **Output as a `gt` Table**:
    -   The `results` `data.frame` is converted to a nicely formatted `gt` table with a title and custom column labels.

Example Usage

```{r, eval=FALSE}
# Run the function and display results
# Every column of `data_d1_exclusive` is compared across the default grouping
# column ("allocation_group"); the return value is a gt table.
test_results <- compare_groups(data_d1_exclusive)

# Print the results
print(test_results)
```