document all standards

pachadotdev · Nov 23, 2024 · af36b88 · af36b88
1 parent bcfc745
commit af36b88
Show file tree

Hide file tree

Showing 20 changed files with 148 additions and 5 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -26,7 +26,8 @@ Suggests:
     knitr,
     rmarkdown,
     testthat (>= 3.0.0),
-    tidyr
+    tidyr,
+    units
 Depends: R(>= 3.5.0)
 Description: Fast and user-friendly estimation of generalized linear models with
     multiple fixed effects and cluster the standard errors. The method to obtain

diff --git a/NAMESPACE b/NAMESPACE
@@ -58,6 +58,7 @@ importFrom(dplyr,select)
 importFrom(dplyr,summarise)
 importFrom(dplyr,ungroup)
 importFrom(dplyr,vars)
+importFrom(dplyr,where)
 importFrom(generics,augment)
 importFrom(generics,glance)
 importFrom(generics,tidy)

diff --git a/R/apes.R b/R/apes.R
@@ -5,14 +5,19 @@
 #' @srrstats {G2.3b} Uses `tolower()` to handle potential case sensitivity issues.
 #' @srrstats {G2.13} Validates that the input data contains no missing values.
 #' @srrstats {G2.14a} Issues errors when handling missing data is required but unsupported.
-#' @srrstats {G2.14b} Provides default warnings or messages when missing data is ignored.
+#' @srrstats {G2.14b} Provides clear error messages when the data structure is incompatible with the model requirements.
 #' @srrstats {G3.1a} Allows arbitrarily specified covariance methods for flexibility in inference.
 #' @srrstats {G5.2a} Produces unique and meaningful error, warning, and message outputs for diagnostics.
 #' @srrstats {RE5.0} Considers relationships between input data size and computational efficiency.
 #' @srrstats {G5.4a} Includes tests against trivial cases or alternative implementations to ensure algorithm correctness.
 #' @noRd
 NULL
 
+#' NA_standards
+#' @srrstatsNA {G2.14} Missing observations are dropped, otherwise providing imputation methods would bias the estimation (i.e., replacing all missing values with the median).
+#' @noRd
+NULL
+
 #' @title Compute average partial effects after fitting binary choice models
 #'  with a 1,2,3-way error component
 #'

diff --git a/R/autoplot.r b/R/autoplot.r
@@ -15,6 +15,12 @@ ggplot2::autoplot
 #' @noRd
 NULL
 
+#' NA_standards
+#' @srrstatsNA {RE6.2} Considering that the data tends to be very large, it made more sense to add a method to plot the coefficients instead of millions of predicted data points.
+#' @srrstatsNA {RE6.3} We plot the estimated coefficients without the fixed effects. Plotting millions of points would only add visual clutter and not provide any additional information.
+#' @noRd
+NULL
+
 #' @title Autoplot method for feglm objects
 #'
 #' @description Extracts the estimated coefficients and their confidence

diff --git a/R/bias_corr.R b/R/bias_corr.R
@@ -3,14 +3,19 @@
 #' @srrstats {G2.1a} Ensures input objects are of the expected class (`feglm`).
 #' @srrstats {G2.3a} Validates string arguments like `panel_structure` using `match.arg()` for predefined values.
 #' @srrstats {G2.14a} Provides errors for missing or invalid inputs, such as non-`feglm` objects.
-#' @srrstats {G2.14b} Provides clear error messages when exceeding supported fixed-effect dimensions.
+#' @srrstats {G2.14b} Provides clear error messages when the data structure is incompatible with the model requirements.
 #' @srrstats {G3.1a} Supports structured panels (`classic` or `network`) for analyzing fixed effects.
 #' @srrstats {RE5.0} Efficient handling of computational scaling for large panels through fixed-effect groupings.
 #' @srrstats {G5.2a} Produces unique and informative error messages for all stopping conditions.
 #' @srrstats {G5.4a} Includes logical checks for computational edge cases, such as unsupported models.
 #' @noRd
 NULL
 
+#' NA_standards
+#' @srrstatsNA {G2.14} Missing observations are dropped, otherwise providing imputation methods would bias the estimation (i.e., replacing all missing values with the median).
+#' @noRd
+NULL
+
 #' @title Asymptotic bias correction after fitting binary choice models with a
 #'  1,2,3-way error component
 #'

diff --git a/R/capybara-package.R b/R/capybara-package.R
@@ -15,13 +15,26 @@
 #'  in this implementation compare to base R.
 #' @srrstats {G1.6} To keep dependencies minimal, we compare against base R in
 #'  the tests. An alternative would be to compare against alpaca.
+#' @srrstats {RE4.12} The link and inverse link functions are written in C++
+#'  to use those with the Armadillo library. This is in the file
+#'  `src/05_glm_fit.cpp`.
 #' @noRd
 NULL
 
 #' NA_standards
+#' @srrstatsNA {G2.6} Only some model parameters can be unidimensional. To fit
+#'  a regression we need at least two observations and two variables.
 #' @srrstatsNA {G5.6b} No randomness is needed for the fixed effects
 #'  estimation. With the model slopes, recovering the fixed effects is a
 #'  deterministic process.
+#' @srrstatsNA {G2.9} Conversion of variables from factor to character is not
+#'  conducted and the original input data is not modified.
+#' @srrstatsNA {G2.12} `data.frame`-like tabular objects which have list
+#'  columns cannot be used as input data. This behaviour should be tested.
+#' @srrstatsNA {G2.14c} Missing data is not replaced with imputed values.
+#' @srrstatsNA {G2.14c} Replacing data with imputed values would bias the estimation.
+#'  This is not done in the package, and it is left to the user to decide
+#'  when processing the data.
 #' @srrstatsNA {RE7.0a} No cross-validation implemented in this package.
 #' @noRd
 NULL
@@ -45,7 +58,7 @@ NULL
 #'
 #' @name capybara-package
 #' @importFrom dplyr across all_of filter group_by mutate pull select summarise
-#'  ungroup vars
+#'  ungroup vars where
 #' @importFrom Formula Formula
 #' @importFrom ggplot2 ggplot aes geom_point geom_errorbar labs theme_minimal
 #'  coord_flip autoplot

diff --git a/R/feglm.R b/R/feglm.R
@@ -34,6 +34,9 @@
 #' @srrstats {RE1.4} Implements diagnostic checks to verify the assumptions of independence and homoscedasticity, essential for valid inference.
 #' @srrstats {RE2.0} Labels all regression outputs, such as coefficients and standard errors, to ensure clarity and interpretability.
 #' @srrstats {RE2.4} Quantifies uncertainty in regression coefficients using confidence intervals.
+#' @srrstats {RE2.4a} Rejects perfect collinearity between independent variables.
+#' @srrstats {RE2.4b} Rejects perfect collinearity between dependent and independent variables.
+#' @srrstats {RE4.0} This returns a model-type object that is essentially a list with specific components and attributes.
 #' @srrstats {RE4.1} Identifies outliers and influential data points that may unduly impact regression results, offering visualization tools.
 #' @srrstats {RE4.6} Includes standard metrics such as R-squared and RMSE to help users evaluate model performance.
 #' @srrstats {RE4.7} Tests sensitivity to hyperparameter choices in regularized or complex regression models.

diff --git a/R/feglm_control.R b/R/feglm_control.R
@@ -5,12 +5,19 @@
 #' @srrstats {G2.1a} Ensures the proper data types for arguments (e.g., logical for `trace`, integer for `iter_max`).
 #' @srrstats {G2.3a} Uses argument validation to ensure appropriate ranges for critical parameters (e.g., `iter_max` and `limit` >= 1).
 #' @srrstats {G2.14a} Provides informative error messages when tolerance levels or iteration counts are invalid.
+#' @srrstats {G2.14b} Provides clear error messages when the data structure is incompatible with the model requirements.
 #' @srrstats {G5.2a} Produces unique and descriptive error messages for all validation checks.
+#' @srrstats {RE3.0} If the deviance difference between 2 iterations is not less than tolerance after the max number of iterations, it prints a convergence warning.
 #' @srrstats {RE5.0} Supports control over algorithmic complexity, such as dropping perfectly separated observations (`drop_pc`) and optional matrix storage (`keep_mx`).
 #' @srrstats {G5.4a} Includes robust edge case handling, such as enforcing positive tolerance and iteration counts.
 #' @noRd
 NULL
 
+#' NA_standards
+#' @srrstatsNA {G2.14} Missing observations are dropped, otherwise providing imputation methods would bias the estimation (i.e., replacing all missing values with the median).
+#' @noRd
+NULL
+
 #' @title Set \code{feglm} Control Parameters
 #'
 #' @description Set and change parameters used for fitting \code{\link{feglm}}.

diff --git a/R/feglm_helpers.R b/R/feglm_helpers.R
@@ -9,11 +9,23 @@
 #' @srrstats {G2.4c} Ensures numeric inputs (e.g., convergence thresholds, tolerances) are within acceptable ranges to avoid unexpected results.
 #' @srrstats {G2.4d} Verifies the structure and completeness of input data, including the absence of missing values and correct dimensionality for matrices.
 #' @srrstats {G2.4e} Issues warnings when deprecated or redundant arguments are used, encouraging users to adopt updated practices while maintaining backward compatibility.
+#' @srrstats {G2.7} The input accepts data frames, tibbles and data table objects, from which it creates the design matrix.
+#' @srrstats {G2.8} The pre-processing for all main functions (e.g., `feglm`, `felm`, `fepois`, `fenegbin`) is the same. The helper functions discard unusable observations depending on the link function, and then create the design matrix.
+#' @srrstats {G2.10} For data frames, tibbles and data tables the column-extraction operations are consistent.
+#' @srrstats {G2.11} `data.frame`-like tabular objects which can have atypical columns (i.e., `vector`) do not error without reason.
 #' @srrstats {G2.13} Checks for and handles missing data in input datasets.
 #' @srrstats {G2.14a} Issues informative errors for invalid inputs, such as incorrect link functions or missing data.
+#' @srrstats {G2.14b} Provides clear error messages when the data structure is incompatible with the model requirements.
+#' @srrstats {G2.15} The functions check for unusable observations (i.e., one column has an NA), and these are discarded before creating the design matrix.
+#' @srrstats {G2.16} `NaN`, `Inf` and `-Inf` cannot be used for the design matrix, and all observations with these values are removed.
 #' @srrstats {G5.2a} Ensures that all error and warning messages are unique and descriptive.
-#' @srrstats {RE5.0} Supports internal optimizations, including centering variables and reducing computational redundancy.
 #' @srrstats {G5.4a} Includes tests for edge cases, such as binary and continuous response variables, and validates all input arguments.
+#' @srrstats {RE4.4} The model is specified using a formula object, or a character-type object convertible to a formula, which is then used to create the design matrix.
+#' @srrstats {RE4.5} Fitted models have an nobs element that can be called with `nobs()`.
+#' @srrstats {RE4.8} The response variable is checked and some observations are dropped if the response is not compatible with the link (i.e., negative values and log-link).
+#' @srrstats {RE4.12} The `check_data_()` function drops observations that are not usable with the link function or that do not contribute to the log-likelihood.
+#' @srrstats {RE4.13} Observations with a dependent variable that is incompatible with the link function are removed.
+#' @srrstats {RE5.0} Supports internal optimizations, including centering variables and reducing computational redundancy.
 #' @srrstats {RE5.1} Implements computational safeguards for iterative processes, such as weight validation and convergence checks.
 #' @srrstats {RE5.2} Provides utilities for scalable and efficient computation of GLM derivatives and score matrices.
 #' @noRd
@@ -191,6 +203,14 @@ update_formula_ <- function(formula) {
   formula
 }
 
+#' @title Column types
+#' @description Returns the primary (first) class of each column of a data
+#'  frame. A column can carry several classes (e.g., `c("ordered", "factor")`
+#'  or `c("POSIXct", "POSIXt")`), so only the first element of `class()` is
+#'  kept to guarantee exactly one string per column.
+#' @param data Data frame
+#' @return Unnamed character vector with one element per column of `data`
+#' @noRd
+col_types <- function(data) {
+  # class() may return more than one string; vapply() with character(1L)
+  # would error in that case, so keep only the first class of each column
+  vapply(data, function(x) class(x)[1L], character(1L), USE.NAMES = FALSE)
+}
+
 #' @title Model frame
 #' @description Creates model frame for GLM/NegBin models
 #' @param data Data frame
@@ -206,6 +226,13 @@ model_frame_ <- function(data, formula, weights) {
 
   data <- na.omit(data)
 
+  # if any column is of type "units", convert it to numeric
+  types <- col_types(data)
+  if (any(types == "units")) {
+    # use a mutate to transform each unit-type column to numeric
+    data <- mutate(data, across(where(~"units" %in% types), as.numeric))
+  }
+
   nobs_na <- nobs_full - nrow(data)
   nobs_full <- nrow(data)
 

diff --git a/R/felm.R b/R/felm.R
@@ -34,6 +34,9 @@
 #' @srrstats {RE1.4} Implements diagnostic checks to verify the assumptions of independence and homoscedasticity, essential for valid inference.
 #' @srrstats {RE2.0} Labels all regression outputs, such as coefficients and standard errors, to ensure clarity and interpretability.
 #' @srrstats {RE2.4} Quantifies uncertainty in regression coefficients using confidence intervals.
+#' @srrstats {RE2.4a} Rejects perfect collinearity between independent variables.
+#' @srrstats {RE2.4b} Rejects perfect collinearity between dependent and independent variables.
+#' @srrstats {RE4.0} This returns a model-type object that is essentially a list with specific components and attributes.
 #' @srrstats {RE4.1} Identifies outliers and influential data points that may unduly impact regression results, offering visualization tools.
 #' @srrstats {RE4.6} Includes standard metrics such as R-squared and RMSE to help users evaluate model performance.
 #' @srrstats {RE4.7} Tests sensitivity to hyperparameter choices in regularized or complex regression models.

diff --git a/R/fenegbin.R b/R/fenegbin.R
@@ -36,6 +36,9 @@
 #' @srrstats {RE1.4} Implements diagnostic checks to verify the assumptions of independence and homoscedasticity, essential for valid inference.
 #' @srrstats {RE2.0} Labels all regression outputs, such as coefficients and standard errors, to ensure clarity and interpretability.
 #' @srrstats {RE2.4} Quantifies uncertainty in regression coefficients using confidence intervals.
+#' @srrstats {RE2.4a} Rejects perfect collinearity between independent variables.
+#' @srrstats {RE2.4b} Rejects perfect collinearity between dependent and independent variables.
+#' @srrstats {RE4.0} This returns a model-type object that is essentially a list with specific components and attributes.
 #' @srrstats {RE4.1} Identifies outliers and influential data points that may unduly impact regression results, offering visualization tools.
 #' @srrstats {RE4.6} Includes standard metrics such as R-squared and RMSE to help users evaluate model performance.
 #' @srrstats {RE4.7} Tests sensitivity to hyperparameter choices in regularized or complex regression models.

diff --git a/R/fepoisson.R b/R/fepoisson.R
@@ -33,6 +33,9 @@
 #' @srrstats {RE1.4} Implements diagnostic checks to verify the assumptions of independence and homoscedasticity, essential for valid inference.
 #' @srrstats {RE2.0} Labels all regression outputs, such as coefficients and standard errors, to ensure clarity and interpretability.
 #' @srrstats {RE2.4} Quantifies uncertainty in regression coefficients using confidence intervals.
+#' @srrstats {RE2.4a} Rejects perfect collinearity between independent variables.
+#' @srrstats {RE2.4b} Rejects perfect collinearity between dependent and independent variables.
+#' @srrstats {RE4.0} This returns a model-type object that is essentially a list with specific components and attributes.
 #' @srrstats {RE4.1} Identifies outliers and influential data points that may unduly impact regression results, offering visualization tools.
 #' @srrstats {RE4.6} Includes standard metrics such as R-squared and RMSE to help users evaluate model performance.
 #' @srrstats {RE4.7} Tests sensitivity to hyperparameter choices in regularized or complex regression models.

diff --git a/R/generics_augment.R b/R/generics_augment.R
@@ -10,6 +10,8 @@ generics::augment
 #' @srrstats {G3.1c} Supports additional columns in the output for confidence intervals if requested.
 #' @srrstats {G3.3} Handles the addition of multiple model outputs (`.fitted`, `.residuals`) to the data.
 #' @srrstats {G5.1} Provides robust error handling for missing or invalid input objects.
+#' @srrstats {RE4.10} The residuals are returned in a tidy data frame following the `broom` convention.
+#' @srrstats {RE4.11} The deviance and null deviance are returned in a tidy data frame following the `broom` convention.
 #' @srrstats {RE5.0} Optimized for integration with downstream analysis workflows.
 #' @srrstats {RE5.1} Maintains computational efficiency when augmenting large datasets.
 #' @srrstats {RE5.3} Supports additional data input (`newdata`) to enhance flexibility.

diff --git a/R/generics_coef.R b/R/generics_coef.R
@@ -6,6 +6,7 @@
 #' @srrstats {G3.1c} Provides access to summary statistics (`cm`) where applicable.
 #' @srrstats {G5.1} Includes robust error handling for unsupported or invalid input objects.
 #' @srrstats {G5.4a} Includes tests for extracting coefficients from simple and complex model objects.
+#' @srrstats {RE4.2} Returns coefficients via a standard method for feglm-type objects and derived classes (i.e., felm, apes, etc).
 #' @srrstats {RE5.0} Enables seamless integration with downstream analysis workflows.
 #' @srrstats {RE5.2} Maintains computational efficiency in coefficient extraction.
 #' @noRd

diff --git a/R/generics_predict.R b/R/generics_predict.R
@@ -8,6 +8,8 @@
 #' @srrstats {G3.4a} Includes an option for type-specific predictions (e.g., `link` vs. `response`).
 #' @srrstats {G5.2a} Tests include validation of predictions against known values and edge cases.
 #' @srrstats {G5.4a} Outputs predictions in a format compatible with standard R workflows.
+#' @srrstats {RE4.9} The predicted values for the model data or new data are returned as a vector with `predict()`.
+#' @srrstats {RE4.16} The fixed effects are passed to the `predict()` function to add the group-specific effects to the predictions.
 #' @srrstats {RE5.0} Ensures computational efficiency in handling both `feglm` and `felm` prediction workflows.
 #' @srrstats {RE5.2} Integrates seamlessly with user-provided data for generating predictions.
 #' @srrstats {RE5.3} Provides predictable and consistent output types for downstream analysis.

diff --git a/R/generics_print.R b/R/generics_print.R
@@ -5,6 +5,7 @@
 #' @srrstats {G3.3} Includes well-structured significance indicators (`***`, `**`, `*`, `.`) for coefficient p-values.
 #' @srrstats {G5.2a} Outputs are formatted for clarity, with aligned columns and headers.
 #' @srrstats {G5.4a} Validates consistency of printed summaries across model types, ensuring uniform presentation.
+#' @srrstats {RE4.17} Specific default `print()` method for summaries and coefficients.
 #' @srrstats {RE5.0} Reduces cyclomatic complexity by modularizing summary and print methods.
 #' @srrstats {RE5.2} Facilitates easy interpretation of model summaries, including pseudo R-squared, deviance, and fixed-effects estimates.
 #' @srrstats {RE5.3} Designed for extensibility to accommodate additional model types or summary elements.

diff --git a/R/generics_summary.R b/R/generics_summary.R
@@ -6,6 +6,8 @@
 #' @srrstats {G5.2a} Outputs include well-structured coefficient matrices with appropriate column headers and row names.
 #' @srrstats {RE2.1} Summary methods ensure compatibility with standard statistical workflows by providing model evaluation metrics.
 #' @srrstats {RE2.2} Custom handling of model-specific details like Poisson pseudo R-squared and Negative Binomial `theta` values.
+#' @srrstats {RE4.11} The deviance, null deviance, R-squared and adjusted R-squared are returned in the summaries.
+#' @srrstats {RE4.18} Implemented `summary()` functions specific for GLMs and LMs (i.e., it shows R2 for LMs and pseudo R2 for Poisson models).
 #' @srrstats {RE5.0} Reduces cyclomatic complexity through modular functions for computing summary components.
 #' @srrstats {RE5.2} Facilitates interpretability of models by providing a unified and clear summary output format.
 #' @noRd

diff --git a/tests/testthat/test-deterministic.R b/tests/testthat/test-deterministic.R
@@ -4,6 +4,8 @@
 #' @srrstats {RE5.1} Confirms that the function provides meaningful error messages for invalid input.
 #' @srrstats {RE5.2} Verifies that the model throws an error when dependent columns are included in the formula.
 #' @srrstats {RE5.4} Checks robustness against deterministic linear relationships in the design matrix.
+#' @srrstats {RE7.0} Exact relationships return a collinearity error.
+#' @srrstats {RE7.0a} Perfectly noiseless input data is rejected, we have the `solve()` function for that.
 #' @noRd
 NULL