Skip to content

Commit

Permalink
new independent_variables
Browse files Browse the repository at this point in the history
  • Loading branch information
marcosdanieldasilva committed Dec 15, 2024
1 parent 8ce8570 commit 0c6d4a7
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 166 deletions.
131 changes: 71 additions & 60 deletions src/linear_regression.jl
Original file line number Diff line number Diff line change
@@ -1,74 +1,48 @@
"""
    _create_matrix_formulas!(matrix_formulas, x_term_list)

Overwrite `matrix_formulas[k]` with `MatrixTerm(β0 + x)` for the `k`-th entry `x` of
`x_term_list`.

`matrix_formulas` must already hold at least `length(x_term_list)` slots; a shorter
vector raises a `BoundsError`. `β0` is defined outside this function (presumably the
intercept term — confirm against its definition). Returns `nothing`.
"""
function _create_matrix_formulas!(matrix_formulas::Vector{MatrixTerm}, x_term_list::Vector{MixTerm})
    # Bounds checks are kept on purpose: `k` comes from `enumerate`, not from the
    # indices of `matrix_formulas`, so nothing proves the write is in range.
    for (k, x) in enumerate(x_term_list)
        matrix_formulas[k] = MatrixTerm(β0 + x)
    end
    return nothing
end

"""
    _create_matrix_formulas!(matrix_formulas, x_term_list, q_term...)

Overwrite `matrix_formulas[k]` with `MatrixTerm(β0 + x + sum(q_term))` for the `k`-th
entry `x` of `x_term_list`, so every formula carries the same extra `q_term` terms.

`matrix_formulas` must already hold at least `length(x_term_list)` slots; a shorter
vector raises a `BoundsError`. `β0` is defined outside this function (presumably the
intercept term — confirm against its definition). Returns `nothing`.
"""
function _create_matrix_formulas!(matrix_formulas::Vector{MatrixTerm}, x_term_list::Vector{MixTerm}, q_term::AbstractTerm...)
    # The summed extra terms do not depend on `x`, so build them once.
    extra_terms = sum(q_term)
    for (k, x) in enumerate(x_term_list)
        matrix_formulas[k] = MatrixTerm(β0 + x + extra_terms)
    end
    return nothing
end

"""
    _fit_regression!(fitted_models, y_term_list, model_matrix, cols)

Fit a linear model for every pairing of a dependent-variable term in `y_term_list`
with a design matrix in `model_matrix`, appending each successful fit to
`fitted_models`.

# Arguments
- `fitted_models`: output vector; one `TableRegressionModel` is pushed per successful fit.
- `y_term_list`: dependent-variable terms; each is evaluated with `modelcols(y, cols)`.
- `model_matrix`: maps each `MatrixTerm` to its already-built design matrix.
- `cols`: column table the terms are evaluated against.

Fitting is best-effort: a dependent-variable term whose evaluation throws is skipped
entirely, and a single (term, matrix) pairing whose fit throws is skipped on its own.
Both cases are reported through `@debug` only. Returns `nothing`.
"""
function _fit_regression!(
    fitted_models::Vector{TableRegressionModel},
    y_term_list::Vector{AbstractTerm},
    model_matrix::Dict{MatrixTerm,Matrix{Float64}},
    cols::NamedTuple)

    for y in y_term_list
        # Evaluate the (possibly transformed) response once per term; a failure here
        # means this term cannot be used with the given data.
        Y = try
            modelcols(y, cols)
        catch err
            err isa InterruptException && rethrow()
            @debug "Skipping dependent-variable term: evaluation failed" term = y exception = (err, catch_backtrace())
            nothing
        end
        Y === nothing && continue

        for (mt, X) in model_matrix
            try
                # Perform linear regression using GLM.fit
                fitted_model = GLM.fit(LinearModel, X, Y)

                # Create a formula combining y and x terms
                formula = FormulaTerm(y, mt)

                # Create a ModelFrame object to store schema and columns for the model
                mf = ModelFrame(formula, emptySchema, cols, LinearModel)

                # Create a ModelMatrix object based on the formula
                mm = ModelMatrix(X, asgn(formula))

                # Store the fitted model along with its associated data
                push!(fitted_models, TableRegressionModel(fitted_model, mf, mm))
            catch err
                # Keep going: one failed pairing must not abort the remaining fits,
                # but a user interrupt must still propagate.
                err isa InterruptException && rethrow()
                @debug "Skipping model: fit failed" response = y predictors = mt exception = (err, catch_backtrace())
            end
        end
    end
    return nothing
end

"""
regression(y::Symbol, x::Symbol, data::AbstractDataFrame, q::Symbol...)
Expand Down Expand Up @@ -134,7 +108,7 @@ best_models = criteria_table(models, :adjr2, :rmse)
```
"""
function regression(y::Symbol, x::Symbol, data::AbstractDataFrame, q::Symbol...)

# remove empty values from the data
new_data = dropmissing(data[:, [y, x, q...]])

n, k = size(new_data)
Expand All @@ -154,21 +128,58 @@ function regression(y::Symbol, x::Symbol, data::AbstractDataFrame, q::Symbol...)
# Convert the DataFrame to a column table (named tuple of vectors)
cols = columntable(new_data)

# Create the terms used to perform the linear regression
y_term = concrete_term(term(y), cols, ContinuousTerm)
x_term = concrete_term(term(x), cols, ContinuousTerm)
q_term = [concrete_term(term(terms), cols, CategoricalTerm) for terms q]
q_terms = [concrete_term(term(qq), cols, CategoricalTerm) for qq in q]

y_term_list = _dependent_variable(y_term, x_term)

x_term_list = _independent_variable(x_term)
model_matrix = _independent_variable(x_term, cols, q_terms...)

fitted_models = Vector{TableRegressionModel}()

_fit_regression!(fitted_models, y_term_list, model_matrix, cols)

isempty(fitted_models) && error("Failed to fit any models")

return fitted_models
end

function regression(y::Symbol, x1::Symbol, x2::Symbol, data::AbstractDataFrame, q::Symbol...)
# remove empty values from the data
new_data = dropmissing(data[:, [y, x1, x2, q...]])

n, k = size(new_data)

if n < k + 2
error("There are not enough data points to perform regression. At least $(k + 2) observations are required.")
end

#Attempt to coerce the y and x columns to the Continuous scitype (e.g., Float64)
try
coerce!(new_data, y => ScientificTypes.Continuous, x1 => ScientificTypes.Continuous, x2 => ScientificTypes.Continuous)
catch
# If coercion fails, an error will be thrown
error("Unable to coerce variables '$(y)', '$(x1)' and '$(x2)' to Continuous. Please ensure they contain numeric values.")
end

# Convert the DataFrame to a column table (named tuple of vectors)
cols = columntable(new_data)

# Create the terms used to perform the linear regression
y_term = concrete_term(term(y), cols, ContinuousTerm)
x1_term = concrete_term(term(x1), cols, ContinuousTerm)
x2_term = concrete_term(term(x2), cols, ContinuousTerm)
q_terms = [concrete_term(term(qq), cols, CategoricalTerm) for qq in q]

matrix_formulas = Vector{MatrixTerm}(undef, length(x_term_list))
y_term_list = _dependent_variable(y_term)

isempty(q_term) ? _create_matrix_formulas!(matrix_formulas, x_term_list) : _create_matrix_formulas!(matrix_formulas, x_term_list, q_term...)
model_matrix = _independent_variable(x1_term, x2_term, cols, q_terms...)

fitted_models = Vector{TableRegressionModel}()

_fit_regression!(fitted_models, y_term_list, matrix_formulas, cols)
_fit_regression!(fitted_models, y_term_list, model_matrix, cols)

isempty(fitted_models) && error("Failed to fit any models")

Expand Down
121 changes: 15 additions & 106 deletions src/regression_parameters.jl
Original file line number Diff line number Diff line change
@@ -1,133 +1,42 @@
"""
    log_minus(y::Real)

Return the natural logarithm of `y - 1.3`.

The 1.3 offset is fixed (presumably breast height in metres — confirm).
"""
function log_minus(y::Real)
    shifted = y - 1.3
    return log(shifted)
end

"""
    one_by_y(y::Real)

Return the reciprocal `1 / y`.
"""
function one_by_y(y::Real)
    return 1 / y
end

"""
    one_by_y_minus(y::Real)

Return the reciprocal of `y - 1.3`.
"""
function one_by_y_minus(y::Real)
    shifted = y - 1.3
    return 1 / shifted
end
"""
    one_by_sqrt(y::Real)

Return `1 / sqrt(y)`.

Throws a `DomainError` for negative `y`, as `sqrt` does for real arguments.
"""
one_by_sqrt(y::Real) = 1 / sqrt(y)

"""
    one_by_sqrt_minus(y::Real)

Return `1 / sqrt(y - 1.3)`.

Throws a `DomainError` when `y < 1.3`, as `sqrt` does for negative real arguments.
"""
one_by_sqrt_minus(y::Real) = 1 / sqrt(y - 1.3)

"""
    x_by_sqrt_y(x::Real, y::Real)

Return `x / sqrt(y)`.

Throws a `DomainError` for negative `y`, as `sqrt` does for real arguments.
"""
x_by_sqrt_y(x::Real, y::Real) = x / sqrt(y)

"""
    x_by_sqrt_y_minus(x::Real, y::Real)

Return `x / sqrt(y - 1.3)`.

Throws a `DomainError` when `y < 1.3`, as `sqrt` does for negative real arguments.
"""
x_by_sqrt_y_minus(x::Real, y::Real) = x / sqrt(y - 1.3)
"""
    square_x_by_y(x::Real, y::Real)

Return `x^2 / y`.
"""
function square_x_by_y(x::Real, y::Real)
    numerator = x^2
    return numerator / y
end

"""
    square_x_by_y_minus(x::Real, y::Real)

Return `x^2 / (y - 1.3)`.
"""
function square_x_by_y_minus(x::Real, y::Real)
    numerator = x^2
    denominator = y - 1.3
    return numerator / denominator
end

"""
    _dependent_variable(y_term, x_term)

Return a `Vector{AbstractTerm}` holding `y_term` itself followed by eleven
`FunctionTerm` transformations of it: logarithmic, reciprocal, and ratios that
combine `x_term` with `y_term`.

The third argument of each `FunctionTerm` is the expression recorded for that term;
the square-root variants carry a `√` so they are distinguishable from the plain
reciprocal and ratio variants.
"""
function _dependent_variable(y_term::AbstractTerm, x_term::AbstractTerm)
    return AbstractTerm[
        y_term,
        FunctionTerm(log, [y_term], :(log($(y_term)))),
        # NOTE(review): this label nests the 1.3 offset inside `log_minus`, which
        # already subtracts 1.3 — kept unchanged; confirm the intended label.
        FunctionTerm(log_minus, [y_term], :(log_minus($(y_term) - 1.3))),
        FunctionTerm(log1p, [y_term], :(log1p($(y_term)))),
        FunctionTerm(one_by_y, [y_term], :(1 / ($(y_term)))),
        FunctionTerm(one_by_y_minus, [y_term], :(1 / ($(y_term) - 1.3))),
        FunctionTerm(one_by_sqrt, [y_term], :(1 / √($(y_term)))),
        FunctionTerm(one_by_sqrt_minus, [y_term], :(1 / √($(y_term) - 1.3))),
        FunctionTerm(x_by_sqrt_y, [x_term, y_term], :($(x_term) / √($(y_term)))),
        FunctionTerm(x_by_sqrt_y_minus, [x_term, y_term], :($(x_term) / √($(y_term) - 1.3))),
        FunctionTerm(square_x_by_y, [x_term, y_term], :($(x_term)^2 / $(y_term))),
        FunctionTerm(square_x_by_y_minus, [x_term, y_term], :($(x_term)^2 / ($(y_term) - 1.3))),
    ]
end

"""
    _dependent_variable(y_term)

Return a `Vector{AbstractTerm}` holding `y_term` itself followed by seven
`FunctionTerm` transformations of it (logarithmic and reciprocal forms).

The third argument of each `FunctionTerm` is the expression recorded for that term;
the square-root variants carry a `√` so they are distinguishable from the plain
reciprocal variants.
"""
function _dependent_variable(y_term::AbstractTerm)
    return AbstractTerm[
        y_term,
        FunctionTerm(log, [y_term], :(log($(y_term)))),
        # NOTE(review): this label nests the 1.3 offset inside `log_minus`, which
        # already subtracts 1.3 — kept unchanged; confirm the intended label.
        FunctionTerm(log_minus, [y_term], :(log_minus($(y_term) - 1.3))),
        FunctionTerm(log1p, [y_term], :(log1p($(y_term)))),
        FunctionTerm(one_by_y, [y_term], :(1 / ($(y_term)))),
        FunctionTerm(one_by_y_minus, [y_term], :(1 / ($(y_term) - 1.3))),
        FunctionTerm(one_by_sqrt, [y_term], :(1 / √($(y_term)))),
        FunctionTerm(one_by_sqrt_minus, [y_term], :(1 / √($(y_term) - 1.3))),
    ]
end

# Builds the candidate predictor terms for one independent variable: the variable
# and five transformations of it, followed by two-term sums of those six.
# NOTE(review): the sum of the squared and inverse-squared terms is the only pair
# that is not listed — confirm this omission is intentional.
function _independent_variable(x_term::AbstractTerm)::Vector{MixTerm}
    # Five transformations of the base term
    squared = FunctionTerm(x -> x^2, [x_term], :($(x_term)^2))
    logged = FunctionTerm(log, [x_term], :(log($(x_term))))
    logged_sq = FunctionTerm(x -> log(x)^2, [x_term], :(log($(x_term))^2))
    recip = FunctionTerm(x -> 1 / x, [x_term], :($(x_term)^-1))
    recip_sq = FunctionTerm(x -> 1 / x^2, [x_term], :($(x_term)^-2))

    # Single-term candidates, in their original order
    singles = [x_term, squared, logged, logged_sq, recip, recip_sq]

    # Two-term candidates, in their original order
    pairs = [
        x_term + squared,
        x_term + logged,
        x_term + logged_sq,
        x_term + recip,
        x_term + recip_sq,
        squared + logged,
        squared + logged_sq,
        squared + recip,
        logged + logged_sq,
        logged + recip,
        logged + recip_sq,
        logged_sq + recip,
        logged_sq + recip_sq,
        recip + recip_sq,
    ]

    return vcat(singles, pairs)
end

"""
    _generate_combined_terms(x_term, y_term)

Build candidate term combinations from two base terms.

Each base term is expanded to three variants (itself, its square, and `log1p` of it).
The result contains, in order:

1. every pairing of a sum `x + y` with an interaction `x & y`, taken over all variants;
2. every `x` variant plus each unordered pair of distinct interaction terms;
3. every `y` variant plus each unordered pair of distinct interaction terms.
"""
function _generate_combined_terms(x_term::AbstractTerm, y_term::AbstractTerm)::Vector{MixTerm}
    # Variants of x_term; log1p computes log(1 + x), which stays finite at x == 0
    x2 = FunctionTerm(x -> x^2, [x_term], :($(x_term)^2))
    log_x = FunctionTerm(log1p, [x_term], :(log1p($(x_term))))
    x_terms = [x_term, x2, log_x]

    # Variants of y_term, built the same way
    y2 = FunctionTerm(y -> y^2, [y_term], :($(y_term)^2))
    log_y = FunctionTerm(log1p, [y_term], :(log1p($(y_term))))
    y_terms = [y_term, y2, log_y]

    # All sums and all interactions between an x variant and a y variant
    sum_terms = [x + y for x in x_terms for y in y_terms]
    interaction_terms = [x & y for x in x_terms for y in y_terms]

    # Every sum paired with every interaction
    combined_terms = [st + it for st in sum_terms for it in interaction_terms]

    # Each single variant (x variants first, then y variants) combined with every
    # unordered pair of distinct interaction terms.
    n = length(interaction_terms)
    for base in Iterators.flatten((x_terms, y_terms))
        for i in 1:n-1
            for j in i+1:n
                push!(combined_terms, base + interaction_terms[i] + interaction_terms[j])
            end
        end
    end

    return combined_terms
end

function _generate_terms(x_term::AbstractTerm, cols::NamedTuple, q_terms::AbstractTerm...)
function _independent_variable(x_term::AbstractTerm, cols::NamedTuple, q_terms::AbstractTerm...)
# Define the six base terms
x2 = FunctionTerm(x -> x^2, [x_term], :($(x_term)^2))
log_x = FunctionTerm(log, [x_term], :(log($(x_term))))
Expand Down Expand Up @@ -199,7 +108,7 @@ function _generate_terms(x_term::AbstractTerm, cols::NamedTuple, q_terms::Abstra

end

function _generate_terms(x1_term::AbstractTerm, x2_term::AbstractTerm, cols::NamedTuple, q_terms::AbstractTerm...)
function _independent_variable(x1_term::AbstractTerm, x2_term::AbstractTerm, cols::NamedTuple, q_terms::AbstractTerm...)
# Define transformations for the x1_term variable
x1_2 = FunctionTerm(x -> x^2, [x1_term], :($(x1_term)^2))
log_x1 = FunctionTerm(log, [x1_term], :(log($(x1_term))))
Expand Down

0 comments on commit 0c6d4a7

Please sign in to comment.