From 0c6d4a7a3f95dfd58a62b351f2270f44daeec845 Mon Sep 17 00:00:00 2001 From: marcos Date: Sun, 15 Dec 2024 09:28:35 -0300 Subject: [PATCH] new independent_variables --- src/linear_regression.jl | 131 +++++++++++++++++++---------------- src/regression_parameters.jl | 121 ++++---------------------------- 2 files changed, 86 insertions(+), 166 deletions(-) diff --git a/src/linear_regression.jl b/src/linear_regression.jl index 8bb155c..a192164 100644 --- a/src/linear_regression.jl +++ b/src/linear_regression.jl @@ -1,74 +1,48 @@ -function _create_matrix_formulas!(matrix_formulas::Vector{MatrixTerm}, x_term_list::Vector{MixTerm}) - @inbounds for (k, x) ∈ enumerate(x_term_list) - matrix_formulas[k] = MatrixTerm(β0 + x) - end -end - -function _create_matrix_formulas!(matrix_formulas::Vector{MatrixTerm}, x_term_list::Vector{MixTerm}, q_term::AbstractTerm...) - @inbounds for (k, x) ∈ enumerate(x_term_list) - matrix_formulas[k] = MatrixTerm(β0 + x + sum(q_term)) - end -end - -function _fit_regression!(fitted_models::Vector{TableRegressionModel}, +function _fit_regression!( + fitted_models::Vector{TableRegressionModel}, y_term_list::Vector{AbstractTerm}, - matrix_formulas::Vector{MatrixTerm}, - cols::NamedTuple -) - - # Dictionary to store model column data for each y term - model_cols = Dict{AbstractTerm,Union{Vector{<:Real},Nothing}}() - - # Dictionary to store model matrix data for each matrix term - model_matrix = Dict{MatrixTerm,Union{Matrix{<:Real},Nothing}}() + model_matrix::Dict{MatrixTerm,Matrix{Float64}}, + cols::NamedTuple) - # Precompute model columns for each y term in y_term_list using modelcols function - @inbounds for y ∈ y_term_list - model_cols[y] = try - modelcols(y, cols) - catch nothing - end - end + model_dependent_variable = Dict{AbstractTerm,Union{AbstractVector,Nothing}}() - # Precompute model matrices for each x term (matrix formulas) using modelmatrix function - @inbounds for x ∈ matrix_formulas - model_matrix[x] = try - modelmatrix(x, cols) - catch 
nothing + for yt in y_term_list + model_dependent_variable[yt] = try + modelcols(yt, cols) + catch + nothing end end - # Loop over all combinations of y terms and matrix terms - @inbounds for (y, x) ∈ Iterators.product(y_term_list, matrix_formulas) - Y = model_cols[y] # Extract precomputed columns for y - X = model_matrix[x] # Extract precomputed matrix for x + for y in y_term_list + Y = model_dependent_variable[y] - # Skip if either Y or X is missing (nothing) - if Y === nothing || X === nothing + if Y === nothing continue end - try - # Perform linear regression using GLM.fit - fitted_model = GLM.fit(LinearModel, X, Y) + for (mt, X) in model_matrix + try + # Perform linear regression using GLM.fit + fitted_model = GLM.fit(LinearModel, X, Y) - # Create a formula combining y and x terms - formula = FormulaTerm(y, x) + # Create a formula combining y and x terms + formula = FormulaTerm(y, mt) - # Create a ModelFrame object to store schema and columns for the model - mf = ModelFrame(formula, emptySchema, cols, LinearModel) + # Create a ModelFrame object to store schema and columns for the model + mf = ModelFrame(formula, emptySchema, cols, LinearModel) - # Create a ModelMatrix object based on the formula - mm = ModelMatrix(X, asgn(formula)) + # Create a ModelMatrix object based on the formula + mm = ModelMatrix(X, asgn(formula)) - # Store the fitted model along with its associated data in TableRegressionModel - push!(fitted_models, TableRegressionModel(fitted_model, mf, mm)) - catch - # Handle any errors silently during model fitting + # Store the fitted model along with its associated data in TableRegressionModel + push!(fitted_models, TableRegressionModel(fitted_model, mf, mm)) + catch + # Handle any errors silently during model fitting + end end end end - """ regression(y::Symbol, x::Symbol, data::AbstractDataFrame, q::Symbol...) 
@@ -134,7 +108,7 @@ best_models = criteria_table(models, :adjr2, :rmse) ``` """ function regression(y::Symbol, x::Symbol, data::AbstractDataFrame, q::Symbol...) - + # remove empty values from the data new_data = dropmissing(data[:, [y, x, q...]]) n, k = size(new_data) @@ -154,21 +128,58 @@ function regression(y::Symbol, x::Symbol, data::AbstractDataFrame, q::Symbol...) # Convert the DataFrame to a column table (named tuple of vectors) cols = columntable(new_data) + # create terms to perform the linear regression y_term = concrete_term(term(y), cols, ContinuousTerm) x_term = concrete_term(term(x), cols, ContinuousTerm) - q_term = [concrete_term(term(terms), cols, CategoricalTerm) for terms ∈ q] + q_terms = [concrete_term(term(qq), cols, CategoricalTerm) for qq in q] y_term_list = _dependent_variable(y_term, x_term) - x_term_list = _independent_variable(x_term) + model_matrix = _independent_variable(x_term, cols, q_terms...) + + fitted_models = Vector{TableRegressionModel}() + + _fit_regression!(fitted_models, y_term_list, model_matrix, cols) + + isempty(fitted_models) && error("Failed to fit any models") + + return fitted_models +end + +function regression(y::Symbol, x1::Symbol, x2::Symbol, data::AbstractDataFrame, q::Symbol...) + # remove empty values from the data + new_data = dropmissing(data[:, [y, x1, x2, q...]]) + + n, k = size(new_data) + + if n < k + 2 + error("There are not enough data points to perform regression. At least $(k + 2) observations are required.") + end + + # Attempt to coerce the y, x1 and x2 columns to the Continuous scitype (e.g., Float64) + try + coerce!(new_data, y => ScientificTypes.Continuous, x1 => ScientificTypes.Continuous, x2 => ScientificTypes.Continuous) + catch + # If coercion fails, an error will be thrown + error("Unable to coerce variables '$(y)', '$(x1)' and '$(x2)' to Continuous.
Please ensure they contain numeric values.") + end + + # Convert the DataFrame to a column table (named tuple of vectors) + cols = columntable(new_data) + + # create terms to perform the linear regression + y_term = concrete_term(term(y), cols, ContinuousTerm) + x1_term = concrete_term(term(x1), cols, ContinuousTerm) + x2_term = concrete_term(term(x2), cols, ContinuousTerm) + q_terms = [concrete_term(term(qq), cols, CategoricalTerm) for qq in q] - matrix_formulas = Vector{MatrixTerm}(undef, length(x_term_list)) + y_term_list = _dependent_variable(y_term) - isempty(q_term) ? _create_matrix_formulas!(matrix_formulas, x_term_list) : _create_matrix_formulas!(matrix_formulas, x_term_list, q_term...) + model_matrix = _independent_variable(x1_term, x2_term, cols, q_terms...) fitted_models = Vector{TableRegressionModel}() - _fit_regression!(fitted_models, y_term_list, matrix_formulas, cols) + _fit_regression!(fitted_models, y_term_list, model_matrix, cols) isempty(fitted_models) && error("Failed to fit any models") diff --git a/src/regression_parameters.jl b/src/regression_parameters.jl index 4730a28..b196007 100644 --- a/src/regression_parameters.jl +++ b/src/regression_parameters.jl @@ -1,24 +1,25 @@ log_minus(y::Real) = log(y - 1.3) - one_by_y(y::Real) = 1 / y one_by_y_minus(y::Real) = 1 / (y - 1.3) one_by_sqrt(y::Real) = 1 / √y one_by_sqrt_minus(y::Real) = 1 / √(y - 1.3) - x_by_sqrt_y(x::Real, y::Real) = x / √y x_by_sqrt_y_minus(x::Real, y::Real) = x / √(y - 1.3) square_x_by_y(x::Real, y::Real) = x^2 / y square_x_by_y_minus(x::Real, y::Real) = x^2 / (y - 1.3) # Generates a list of transformed dependent variable terms.
-function _dependent_variable(y_term::AbstractTerm, x_term::AbstractTerm)::Vector{AbstractTerm} - return [ - y_term, FunctionTerm(log, [y_term], :(log($(y_term)))), +function _dependent_variable(y_term::AbstractTerm, x_term::AbstractTerm) + return AbstractTerm[ + y_term, + FunctionTerm(log, [y_term], :(log($(y_term)))), FunctionTerm(log_minus, [y_term], :(log_minus($(y_term) - 1.3))), - FunctionTerm(log1p, [y_term], :(log1p($(y_term)))), FunctionTerm(one_by_y, [y_term], :(1 / ($(y_term)))), + FunctionTerm(log1p, [y_term], :(log1p($(y_term)))), + FunctionTerm(one_by_y, [y_term], :(1 / ($(y_term)))), FunctionTerm(one_by_y_minus, [y_term], :(1 / ($(y_term) - 1.3))), FunctionTerm(one_by_sqrt, [y_term], :(1 / √($(y_term)))), - FunctionTerm(one_by_sqrt_minus, [y_term], :(1 / √($(y_term) - 1.3))), FunctionTerm(x_by_sqrt_y, [x_term, y_term], :($(x_term) / √$(y_term))), + FunctionTerm(one_by_sqrt_minus, [y_term], :(1 / √($(y_term) - 1.3))), + FunctionTerm(x_by_sqrt_y, [x_term, y_term], :($(x_term) / √$(y_term))), FunctionTerm(x_by_sqrt_y_minus, [x_term, y_term], :($(x_term) / √($(y_term) - 1.3))), FunctionTerm(square_x_by_y, [x_term, y_term], :($(x_term)^2 / $(y_term))), FunctionTerm(square_x_by_y_minus, [x_term, y_term], :($(x_term)^2 / ($(y_term) - 1.3))) @@ -26,108 +27,16 @@ function _dependent_variable(y_term::AbstractTerm, x_term::AbstractTerm)::Vector end # Generates a list of transformed dependent variable term. 
-function _dependent_variable(y_term::AbstractTerm)::Vector{AbstractTerm} - return [ - y_term, FunctionTerm(log, [y_term], :(log($(y_term)))), - FunctionTerm(log_minus, [y_term], :(log_minus($(y_term) - 1.3))), - FunctionTerm(log1p, [y_term], :(log1p($(y_term)))), FunctionTerm(one_by_y, [y_term], :(1 / ($(y_term)))), - FunctionTerm(one_by_y_minus, [y_term], :(1 / ($(y_term) - 1.3))), - FunctionTerm(one_by_sqrt, [y_term], :(1 / √($(y_term)))), - FunctionTerm(one_by_sqrt_minus, [y_term], :(1 / √($(y_term) - 1.3))) - ] -end - -# Generates a list of transformed independent variable terms and their interactions. -function _independent_variable(x_term::AbstractTerm)::Vector{MixTerm} - # Define the six base terms - x2 = FunctionTerm(x -> x^2, [x_term], :($(x_term)^2)) - log_x = FunctionTerm(log, [x_term], :(log($(x_term)))) - log_x2 = FunctionTerm(x -> log(x)^2, [x_term], :(log($(x_term))^2)) - inv_x = FunctionTerm(x -> 1 / x, [x_term], :($(x_term)^-1)) - inv_x2 = FunctionTerm(x -> 1 / x^2, [x_term], :($(x_term)^-2)) - # Use the base terms in combinations - return [ - x_term, - x2, - log_x, - log_x2, - inv_x, - inv_x2, x_term + x2, - x_term + log_x, - x_term + log_x2, - x_term + inv_x, - x_term + inv_x2, x2 + log_x, - x2 + log_x2, - x2 + inv_x, log_x + log_x2, - log_x + inv_x, - log_x + inv_x2, log_x2 + inv_x, - log_x2 + inv_x2, inv_x + inv_x2 - ] -end - -function _generate_combined_terms(x_term::AbstractTerm, y_term::AbstractTerm)::Vector{MixTerm} - - # Define transformations for the x_term variable - # x^2: square of the x_term - # log1p: log(1 + x_term) to prevent log(0) issues - x2 = FunctionTerm(x -> x^2, [x_term], :($(x_term)^2)) - log_x = FunctionTerm(log1p, [x_term], :(log1p($(x_term)))) - - # Collect all x transformations into a list - x_terms = [ - x_term, - x2, - log_x - ] - - # Define transformations for the y_term variable - # y^2: square of the y_term - # log1p: log(1 + y_term) to prevent log(0) issues - y2 = FunctionTerm(y -> y^2, [y_term], :($(y_term)^2)) 
- log_y = FunctionTerm(log1p, [y_term], :(log1p($(y_term)))) - - # Collect all y transformations into a list - y_terms = [ +function _dependent_variable(y_term::AbstractTerm) + return AbstractTerm[ y_term, - y2, - log_y + FunctionTerm(log, [y_term], :(log($(y_term)))), + FunctionTerm(log_minus, [y_term], :(log_minus($(y_term) - 1.3))), + FunctionTerm(log1p, [y_term], :(log1p($(y_term)))), ] - - # Generate all possible sums between x_terms and y_terms - sum_terms = [x + y for x in x_terms for y in y_terms] - - # Generate all possible interactions (products) between x_terms and y_terms - interaction_terms = [x & y for x in x_terms for y in y_terms] - - # Combine the sum_terms and interaction_terms into a new set of terms - combined_terms = [st + it for st in sum_terms for it in interaction_terms] - - # The number of interaction terms generated - n = length(interaction_terms) - - # Loop over x_terms and combine them with pairs of interaction terms - for x in x_terms - for i in 1:n-1 - for j in i+1:n - push!(combined_terms, x + interaction_terms[i] + interaction_terms[j]) - end - end - end - - # Similarly, loop over y_terms and combine them with pairs of interaction terms - for y in y_terms - for i in 1:n-1 - for j in i+1:n - push!(combined_terms, y + interaction_terms[i] + interaction_terms[j]) - end - end - end - - # Return the final vector of all combined terms - return combined_terms end -function _generate_terms(x_term::AbstractTerm, cols::NamedTuple, q_terms::AbstractTerm...) +function _independent_variable(x_term::AbstractTerm, cols::NamedTuple, q_terms::AbstractTerm...) # Define the six base terms x2 = FunctionTerm(x -> x^2, [x_term], :($(x_term)^2)) log_x = FunctionTerm(log, [x_term], :(log($(x_term)))) @@ -199,7 +108,7 @@ function _generate_terms(x_term::AbstractTerm, cols::NamedTuple, q_terms::Abstra end -function _generate_terms(x1_term::AbstractTerm, x2_term::AbstractTerm, cols::NamedTuple, q_terms::AbstractTerm...) 
+function _independent_variable(x1_term::AbstractTerm, x2_term::AbstractTerm, cols::NamedTuple, q_terms::AbstractTerm...) # Define transformations for the x1_term variable x1_2 = FunctionTerm(x -> x^2, [x1_term], :($(x1_term)^2)) log_x1 = FunctionTerm(log, [x1_term], :(log($(x1_term))))