From 4cf698726a81e2951e4b0c7096e8155a4aeebe6c Mon Sep 17 00:00:00 2001 From: getzze Date: Fri, 12 May 2023 13:22:13 +0100 Subject: [PATCH] add Penalties and Regularized Least Squares format use :cholesky and dropcollinear like in GLM correctly use QR test method :cholesky change types clean after rebase --- README.md | 97 +++- _typos.toml | 1 + docs/src/api.md | 28 +- src/RobustModels.jl | 23 + src/ipod.jl | 1002 ++++++++++++++++++++++++++++++++++ src/linpred.jl | 2 + src/penalties.jl | 826 ++++++++++++++++++++++++++++ src/regularizedpred.jl | 1104 ++++++++++++++++++++++++++++++++++++++ src/robustlinearmodel.jl | 297 ++++++++++ test/data/Animals2.jl | 1 + test/data/starsCYG.jl | 115 ++++ test/ipod.jl | 275 ++++++++++ test/linearfit.jl | 1 - test/penalties.jl | 212 ++++++++ test/robustridge.jl | 1 - test/runtests.jl | 12 +- test/underdetermined.jl | 105 ++++ 17 files changed, 4082 insertions(+), 20 deletions(-) create mode 100644 src/ipod.jl create mode 100644 src/penalties.jl create mode 100644 test/data/starsCYG.jl create mode 100644 test/ipod.jl create mode 100644 test/penalties.jl create mode 100644 test/underdetermined.jl diff --git a/README.md b/README.md index 6d23e8b..a170e79 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ This package implements: * MQuantile regression (e.g. Expectile regression) * Robust Ridge regression (using any of the previous estimator) * Quantile regression using interior point method +* Regularized Least Square regression +* Θ-IPOD regression, possibly with a penalty term ## Installation @@ -55,7 +57,7 @@ julia>] add RobustModels#main ## Usage -The prefered way of performing robust regression is by calling the `rlm` function: +The preferred way of performing robust regression is by calling the `rlm` function: `m = rlm(X, y, MEstimator{TukeyLoss}(); initial_scale=:mad)` @@ -63,6 +65,15 @@ For quantile regression, use `quantreg`: `m = quantreg(X, y; quantile=0.5)` +For Regularized Least Squares and a penalty term, use `rlm`: + +`m = rlm(X, y, L1Penalty(); method=:cgd)` + +For Θ-IPOD regression with outlier detection and a penalty term, use `ipod`: + +`m = ipod(X, y, L2Loss(), SquaredL2Penalty(); method=:auto)` + + For robust version of `mean`, `std`, `var` and `sem` statistics, specify the estimator as first argument. Use the `dims` keyword for computing the statistics along specific dimensions. The following functions are also implemented: `mean_and_std`, `mean_and_var` and `mean_and_sem`. @@ -119,6 +130,16 @@ m9 = rlm(form, data, MEstimator{TukeyLoss}(); initial_scale=:L1, ridgeλ=1.0) ## Quantile regression solved by linear programming interior point method m10 = quantreg(form, data; quantile=0.2) refit!(m10; quantile=0.8) + +## Penalized regression +m11 = rlm(form, data, SquaredL2Penalty(); method=:auto) + +## Θ-IPOD regression with outlier detection +m12 = ipod(form, data, TukeyLoss(); method=:auto) + +## Θ-IPOD regression with outlier detection and a penalty term +m13 = ipod(form, data, L2Loss(), L1Penalty(); method=:ama) + ; # output @@ -136,7 +157,7 @@ With ordinary least square (OLS), the objective function is, from maximum likeli where `yᵢ` are the values of the response variable, `𝒙ᵢ` are the covectors of individual covariates (rows of the model matrix `X`), `𝜷` is the vector of fitted coefficients and `rᵢ` are the individual residuals. 
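
As a quick numerical illustration (a sketch with made-up residuals, assuming the unexported `RobustModels.rho(loss, r)` evaluates the ρ function of a loss at a residual), a single large residual dominates the quadratic OLS objective, while the bounded losses described below cap its contribution:

```julia
using RobustModels: rho, TukeyLoss

r = [-0.2, 0.1, 0.3, 8.0]                     # residuals; the last one is an outlier
ols_objective = sum(abs2, r) / 2              # ≈ 32.1, dominated by the outlier
tukey_objective = sum(rho.(TukeyLoss(), r))   # bounded loss: the outlier's contribution saturates
```
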
-A `RobustLinearModel` solves instead for the following loss function: `L' = Σᵢ ρ(rᵢ)`
+A `RobustLinearModel` solves instead for the following loss function [1]: `L' = Σᵢ ρ(rᵢ)`
(more precisely `L' = Σᵢ ρ(rᵢ/σ)` where `σ` is a (robust) estimate of the standard deviation of the residual).

Several loss functions are implemented:
@@ -150,7 +171,7 @@ Several loss functions are implemented:
- `CauchyLoss`: `ρ(r) = log(1+(r/c)²)`, non-convex estimator, that also corresponds to a Student's-t distribution (with fixed degree of freedom). It suppresses outliers more strongly but it is not sure to converge.
- `GemanLoss`: `ρ(r) = ½ (r/c)²/(1 + (r/c)²)`, non-convex and bounded estimator, it suppresses outliers more strongly.
- `WelschLoss`: `ρ(r) = ½ (1 - exp(-(r/c)²))`, non-convex and bounded estimator, it suppresses outliers more strongly.
-- `TukeyLoss`: `ρ(r) = if r0, τ, 1-τ)` and `τ` is the quantile of interest. `τ=0.5` corresponds to _Least Absolute Deviations_.
+
+This problem is solved exactly using linear programming techniques,
+specifically, interior point methods using the internal API of [Tulip](https://github.com/ds4dm/Tulip.jl).
+The internal API is considered unstable, but it results in a much lighter dependency than
+including the [JuMP](https://github.com/JuliaOpt/JuMP.jl) package with Tulip backend.

### Robust Ridge regression

@@ -192,16 +224,44 @@ By default, all the coefficients (except the intercept) have the same penalty, w
all the feature variables have the same scale. If it is not the case, use a robust estimate of scale
to normalize every column of the model matrix `X` before fitting the regression.

-### Quantile regression

-_Quantile regression_ results from minimizing the following objective function:
-`L = Σᵢ wᵢ|yᵢ - 𝒙ᵢ 𝜷| = Σᵢ wᵢ(rᵢ) |rᵢ|`,
-where `wᵢ = ifelse(rᵢ>0, τ, 1-τ)` and `τ` is the quantile of interest. `τ=0.5` corresponds to _Least Absolute Deviations_.
+### Regularized Least Squares
+
+_Regularized Least Squares_ regression is the solution of the minimization of the following objective function:
+`L = ½ Σᵢ |yᵢ - 𝒙ᵢ 𝜷|² + P(𝜷)`,
+where `P(𝜷)` is a (sparse) penalty on the coefficients.
+
+The following penalty functions are defined:
+ - `NoPenalty`: `cost(𝐱) = 0`, no penalty.
+ - `SquaredL2Penalty`: `cost(𝐱) = λ ½||𝐱||₂²`, also called Ridge.
+ - `L1Penalty`: `cost(𝐱) = λ||𝐱||₁`, also called LASSO.
+ - `ElasticNetPenalty`: `cost(𝐱) = λ l1_ratio ||𝐱||₁ + λ (1 - l1_ratio) ½||𝐱||₂²`.
+ - `EuclideanPenalty`: `cost(𝐱) = λ||𝐱||₂`, non-separable penalty used in Group LASSO.
+
+Different penalties can be applied to different indices of the coefficients, using
+`RangedPenalties(ranges, penalties)`. E.g., `RangedPenalties([2:5], [L1Penalty(1.0)])` defines
+an L1 penalty on every coefficient except the first index, which can correspond to the intercept.
+
+With a penalty, the following solvers are available (instead of the unpenalized solvers listed above):
+ - `:cgd`, Coordinate Gradient Descent [2].
+ - `:fista`, Fast Iterative Shrinkage-Thresholding Algorithm [3].
+ - `:ama`, Alternating Minimization Algorithm [4].
+ - `:admm`, Alternating Direction Method of Multipliers [5].
+
+To use a robust loss function with a penalty, see Θ-IPOD regression.
+
+### Θ-IPOD regression
+
+_Θ-IPOD regression_ (Θ-thresholding based Iterative Procedure for Outlier Detection) results from
+minimizing the following objective function [6]:
+`L = ½ Σᵢ |yᵢ - 𝒙ᵢ 𝜷 - γᵢ|² + P(𝜷) + Q(γ)`,
+where `Q(γ)` is a sparsity-inducing penalty function on the outliers `γ`, so that the problem is not underdetermined. 
+We don't need to know the expression of this penalty function, just that it leads to thresholding using +one of the loss function used by M-Estimators. Then Θ-IPOD is equivalent to solving an M-Estimator. +This problem is solved using an alternating minimization technique, for the outlier detection. +Without penalty, the coefficients are updated at every step using a solver for _Ordinary Least Square_. + +`P(𝜷)` is an optional (sparse) penalty on the coefficients. -This problem is solved exactly using linear programming techniques, -specifically, interior point methods using the internal API of [Tulip](https://github.com/ds4dm/Tulip.jl). -The internal API is considered unstable, but it results in a much lighter dependency than -including the [JuMP](https://github.com/JuliaOpt/JuMP.jl) package with Tulip backend. ## Credits @@ -209,10 +269,15 @@ This package derives from the [RobustLeastSquares](https://github.com/FugroRoame package for the initial implementation, especially for the Conjugate Gradient solver and the definition of the M-Estimator functions. -Credits to the developpers of the [GLM](https://github.com/JuliaStats/GLM.jl) +Credits to the developers of the [GLM](https://github.com/JuliaStats/GLM.jl) and [MixedModels](https://github.com/JuliaStats/MixedModels.jl) packages for implementing the Iteratively Reweighted Least Square algorithm. ## References -- "Robust Statistics: Theory and Methods (with R)", 2nd Edition, 2019, R. Maronna, R. Martin, V. Yohai, M. Salibián-Barrera +[1] "Robust Statistics: Theory and Methods (with R)", 2nd Edition, 2019, R. Maronna, R. Martin, V. Yohai, M. Salibián-Barrera +[2] "Regularization Paths for Generalized Linear Models via Coordinate Descent", 2009, J. Friedman, T. Hastie, R. Tibshirani +[3] "A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems", 2009, A. Beck, M. Teboulle +[4] "Applications of a splitting algorithm to decomposition in convex programming and variational inequalities", 1991, P. Tseng +[5] "Fast Alternating Direction Optimization Methods", 2014, T. Goldstein, B. O'Donoghue, S. Setzer, R. Baraniuk +[6] "Outlier Detection Using Nonconvex Penalized Regression", 2011, Y. She, A.B. Owen diff --git a/_typos.toml b/_typos.toml index 8d35b03..611fc45 100644 --- a/_typos.toml +++ b/_typos.toml @@ -8,3 +8,4 @@ Artic = "Artic" [default.extend-words] qest = "qest" +wrk = "wrk" diff --git a/docs/src/api.md b/docs/src/api.md index 2096196..e43fb77 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -9,6 +9,7 @@ AbstractQuantileEstimator LossFunction RobustLinearModel RobustModels.RobustLinResp +RobustModels.IPODResp GLM.LinPred RobustModels.DensePredCG RobustModels.SparsePredCG @@ -17,7 +18,12 @@ GLM.SparsePredChol GLM.DensePredQR RobustModels.RidgePred RobustModels.AbstractRegularizedPred +RobustModels.CGDPred +RobustModels.FISTAPred +RobustModels.AMAPred +RobustModels.ADMMPred QuantileRegression +IPODRegression ``` ## Constructors for models @@ -30,6 +36,7 @@ fit(::Type{M}, ::Union{AbstractMatrix{T}}, ::AbstractVector{T}) where {T<:Abstra ```@docs rlm quantreg +ipod fit! refit! ``` @@ -67,10 +74,13 @@ StatsAPI.residuals StatsModels.hasintercept hasformula formula +haspenalty +penalty scale tauscale RobustModels.location_variance Estimator +outliers GLM.linpred! RobustModels.pirls! RobustModels.pirls_Sestimate! 
@@ -111,7 +121,17 @@ HardThresholdLoss HampelLoss ``` -## Estimator and Loss functions methods +## Penalty functions +```@docs +NoPenalty +SquaredL2Penalty +EuclideanPenalty +L1Penalty +ElasticNetPenalty +RangedPenalties +``` + +## Estimator, Loss and Penalty functions methods ```@docs RobustModels.rho RobustModels.psi @@ -139,4 +159,10 @@ RobustModels.set_MEstimator RobustModels.update_weight! RobustModels.tau_scale_estimate RobustModels.quantile_weight +RobustModels.cost +RobustModels.proximal! +RobustModels.proximal +RobustModels.isconcrete +RobustModels.concrete! +RobustModels.concrete ``` diff --git a/src/RobustModels.jl b/src/RobustModels.jl index 0df1a61..ea1491d 100644 --- a/src/RobustModels.jl +++ b/src/RobustModels.jl @@ -173,6 +173,18 @@ export LossFunction, GeneralizedQuantileEstimator, ExpectileEstimator, L2Estimator, + PenaltyFunction, + NoPenalty, + SquaredL2Penalty, + L1Penalty, + ElasticNetPenalty, + EuclideanPenalty, + BerhuPenalty, + CappedL1Penalty, + SCADPenalty, + MCPPenalty, + RangedPenalties, + End, DensePredCG, SparsePredCG, RidgePred, @@ -183,6 +195,11 @@ export LossFunction, Estimator, rlm, quantreg, + IPODRegression, + ipod, + outliers, + penalty, + haspenalty, loss, tuning_constant, refit!, @@ -224,6 +241,9 @@ abstract type AbstractMEstimator <: AbstractEstimator end "Generalized M-Quantile estimator" abstract type AbstractQuantileEstimator <: AbstractMEstimator end +"Penalty function" +abstract type PenaltyFunction{T} end + """ AbstractRobustModel @@ -243,17 +263,20 @@ abstract type AbstractRegularizedPred{T} end Base.broadcastable(m::AbstractEstimator) = Ref(m) Base.broadcastable(m::LossFunction) = Ref(m) +Base.broadcastable(m::PenaltyFunction) = Ref(m) include("tools.jl") include("losses.jl") include("estimators.jl") +include("penalties.jl") include("linpred.jl") include("regularizedpred.jl") include("linresp.jl") include("robustlinearmodel.jl") include("univariate.jl") include("quantileregression.jl") +include("ipod.jl") include("deprecated.jl") end # module diff --git a/src/ipod.jl b/src/ipod.jl new file mode 100644 index 0000000..a5265f9 --- /dev/null +++ b/src/ipod.jl @@ -0,0 +1,1002 @@ + +using LinearAlgebra: qr, norm, eigmin, eigmax, rdiv! +using SparseArrays: sparse + + +#################################### +### LinPred methods +#################################### + +function initpred!( + p::LinPred, wt::AbstractVector{T}=T[], σ::Real=one(T); verbose::Bool=false +) where {T<:BlasReal} + return p +end + +solvepred!(p::LinPred, r::AbstractVector{T}) where {T<:BlasReal} = delbeta!(p, r) + +function solvepred!( + p::LinPred, r::AbstractVector{T}, wts::AbstractVector{T} +) where {T<:BlasReal} + return delbeta!(p, r, wts) +end + +updatepred!(p::LinPred, args...; kwargs...) 
= p + +function update_beta!( + p::LinPred, + r::AbstractVector{T}, + wts::AbstractVector{T}=T[], + σ2::T=one(T); # placeholder + verbose::Bool=false, +) where {T<:BlasReal} + if isempty(wts) + return solvepred!(p, r) + else + return solvepred!(p, r, wts) + end +end + +function initpred!( + p::Union{DensePredCG{T},SparsePredCG{T}}, + wt::AbstractVector{T}=T[], + σ::Real=one(T); # placeholder + verbose::Bool=false, +) where {T<:BlasReal} + # Initialize the scratchm1 matrix + if isempty(wt) + scr = transpose(copy!(p.scratchm1, p.X)) + else + scr = transpose(broadcast!(*, p.scratchm1, wt, p.X)) + end + # Initialize the Gram matrix + mul!(p.Σ, scr, p.X) + return p +end + +function solvepred!( + p::Union{DensePredCG{T},SparsePredCG{T}}, r::AbstractVector{T} +) where {T<:BlasReal} + ## Assume that the relevant matrices are pre-computed + # Compute the left-hand side. + mul!(p.scratchbeta, transpose(p.scratchm1), r) + + # Solve the linear system + cg!(p.delbeta, Hermitian(p.Σ, :U), p.scratchbeta) + return p +end + + +#################################### +### IPODResp +#################################### + +""" + IPODResp + +Robust Θ-IPOD linear response structure. + +Solve the following minimization problem: + +```math +\\min \\left\\lVert(\\dfrac{\\mathbf{y} - \\mathbf{X}\\mathbf{\\beta} - \\mathbf{gamma}} +{\\hat{\\sigma}}\right\\rVert^2_2 + P_c\\left(\\dfrac{\\mathbf{\\gamma}}{\\hat{\\sigma}}\\right) +``` + +# Fields + +- `loss`: loss used for the model +- `y`: response vector +- `μ`: mean response vector +- `wts`: prior case weights. Can be of length 0. +- `outliers`: outlier vector subtracted from `y` to form the working response. +- `σ`: current estimate of the scale or dispersion +- `wrky`: working response +- `wrkres`: working residuals + +""" +mutable struct IPODResp{T<:AbstractFloat,V<:AbstractVector{T},L<:LossFunction} <: + RobustResp{T} + "`loss`: loss used for the model" + loss::L + "`y`: response vector" + y::V + "`μ`: mean response vector" + μ::V + "`precision`: prior precision weights. Can be of length 0." + precision::V + "`outliers`: outlier vector subtracted from `y` to form the working response." + outliers::V + "`σ`: current estimate of the scale or dispersion" + σ::T + "`wrky`: working response." + wrky::V + "`wrkres`: working residuals" + wrkres::V + + function IPODResp{T,V,L}( + l::L, y::V, precision::V, outliers::V, σ::Real + ) where {L<:LossFunction,V<:AbstractVector{T}} where {T<:AbstractFloat} + n = length(y) + ll = length(precision) + ll == 0 || + ll == n || + throw(DimensionMismatch("length of precision is $ll, must be $n or 0")) + σ > 0 || throw(ArgumentError("σ must be positive: $σ")) + wrky = y - outliers + return new{T,V,L}( + l, y, zeros(T, n), precision, outliers, convert(T, σ), wrky, copy(wrky) + ) + end +end + +""" + IPODResp(l::L, y::V, precision::V, outliers::V=zeros(eltype(y), length(y)), σ::Real=1) + where {L<:LossFunction, V<:FPVector} + +Initialize the Robust Θ-IPOD linear response structure. + +""" +function IPODResp( + l::L, y::V, precision::V=T[], outliers::V=zeros(T, length(y)), σ::Real=1 +) where {L<:LossFunction,V<:AbstractVector{T}} where {T<:AbstractFloat} + r = IPODResp{T,V,L}(l, y, precision, outliers, T(σ)) + initresp!(r) + return r +end + +""" + IPODResp(l::L, y, wts, outliers=zeros(eltype(y), length(y)), σ::Real=1) where {L<:LossFunction} + +Convert the arguments to float arrays. 
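+
+For example (a sketch, not a doctest): `IPODResp(L2Loss(), 1:4)` stores the response `1:4`
+as a `Float64` vector, with no precision weights and a zero outlier vector.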
+""" +function IPODResp( + l::L, y, precision=eltype(y)[], outliers=zeros(eltype(y), length(y)), σ::Real=1 +) where {L<:LossFunction} + return IPODResp( + l, float(collect(y)), float(collect(precision)), float(collect(outliers)), σ + ) +end + +function Base.getproperty(r::IPODResp, s::Symbol) + if s ∈ (:mu, :η, :eta) + r.μ + elseif s ∈ (:sigma, :scale) + r.σ + elseif s ∈ (:γ, :gamma) + r.outliers + elseif s ∈ (:λ, :lambda) + r.precision + else + getfield(r, s) + end +end + +""" + initresp!(r::IPODResp) + +Initialize the response structure. +""" +function initresp!(r::IPODResp) + # Set working y + broadcast!(-, r.wrky, r.y, r.outliers) + + # Set residual (without offset) + return broadcast!(-, r.wrkres, r.wrky, r.μ) +end + +function update_outliers!(r::IPODResp) + if isempty(r.precision) + λi = one(eltype(r.y)) + @inbounds @simd for i in eachindex(r.y, r.μ, r.wrkres, r.outliers) + # Use threshold to compute outliers + r.outliers[i] = r.σ * threshold(r.loss, (r.y[i] - r.μ[i]) / r.σ, λi) + r.wrky[i] = r.y[i] - r.outliers[i] + r.wrkres[i] = r.wrky[i] - r.μ[i] + end + else + @inbounds @simd for i in eachindex(r.y, r.μ, r.wrkres, r.outliers, r.precision) + λi = r.precision[i] + # Use threshold to compute outliers + r.outliers[i] = r.σ * threshold(r.loss, (r.y[i] - r.μ[i]) / r.σ, λi) + r.wrky[i] = r.y[i] - r.outliers[i] + r.wrkres[i] = r.wrky[i] - r.μ[i] + end + end + return r +end + +function update_residuals!(r::IPODResp) + # update residuals + @. r.wrkres = r.wrky - r.μ + return r +end + +function update_scale!(r::IPODResp) + r.σ = sqrt((sum(abs2, r.wrkres) + dot(r.outliers, r.wrkres)) / length(r.y)) + return r +end + +function outliers_criteria(r::IPODResp{T}, γold::AbstractVector{T}) where {T<:AbstractFloat} + return maximum(abs(γold[i] - r.outliers[i]) / r.σ for i in eachindex(γold, r.outliers)) +end + +""" + dev_criteria(r::IPODResp) + +Deviance part coming from the loss and the response struct. +""" +dev_criteria(r::IPODResp) = sum(abs2, r.wrkres) / r.σ^2 + +""" + outliers(r::IPODResp) + +Returns the vector of the outlier part γ of the response y, so that `(y - γ) | X ~ Normal`. +""" +outliers(r::IPODResp) = r.outliers + + +#################################### +### IPODRegression +#################################### + + +""" + IPODRegression + +Robust regression using the Φ-IPOD algorithm + +## Fields + +* `resp`: the [`IPODResp`](@ref) structure. +* `pred`: the [`IPODPred`](@ref) structure. +* `formula`: either a `FormulaTerm` object or `nothing` +* `wts`: the prior observation weights (can be empty). 
+* `fitdispersion`: if true, the dispersion is estimated otherwise it is kept fixed +* `fitted`: if true, the model was already fitted +""" +mutable struct IPODRegression{ + T<:AbstractFloat, + R<:IPODResp{T}, + P<:Union{LinPred,AbstractRegularizedPred}, + V<:AbstractVector{T}, +} <: AbstractRobustModel{T} + resp::R + pred::P + formula::Union{FormulaTerm,Nothing} + wts::V + fitdispersion::Bool + fitted::Bool +end + +function IPODRegression( + X::AbstractMatrix{T}, + y::AbstractVector{T}, + loss::LossFunction, + penalty::Union{Nothing,PenaltyFunction}=nothing; + method::Symbol=:auto, # :chol, :qr, :cg, :cgd, :fista, :ama, :admm + wts::FPVector=similar(y, 0), + precision::FPVector=similar(y, 0), + fitdispersion::Bool=false, + dropcollinear::Bool=false, + formula::Union{Nothing,FormulaTerm}=nothing, + use_backtracking::Bool=false, + AMA::Bool=false, + A::AbstractMatrix{<:Real}=zeros(T, 0, 0), + b::AbstractVector{<:Real}=zeros(T, 0), + restart::Bool=true, + adapt::Bool=false, + penalty_omit_intercept::Bool=true, +) where {T<:AbstractFloat} + + # Check that X and y have the same number of observations + n, p = size(X) + n == size(y, 1) || throw(DimensionMismatch("number of rows in X and y must match")) + ll = size(wts, 1) + ll in (0, n) || throw(DimensionMismatch("length of wts is $ll, must be 0 or $n.")) + + # Response object + rr = IPODResp(loss, y, precision) + + # Method + nopen_methods = (:auto, :chol, :cholesky, :cg, :qr) + pen_methods = (:auto, :cgd, :fista, :ama, :admm) + if isnothing(penalty) && method ∉ nopen_methods + @warn( + "Incorrect method `:$(method)` without a penalty, should be one of $(nopen_methods)" + ) + method = :auto + elseif !isnothing(penalty) && method ∉ pen_methods + @warn( + "Incorrect method `:$(method)` with a penalty, should be one of $(pen_methods)" + ) + method = :auto + end + + # Predictor without penalty + if isnothing(penalty) + pred = if method === :cg + cgpred(X, dropcollinear) + elseif method === :qr + qrpred(X, dropcollinear) + elseif method in (:chol, :cholesky, :auto) + cholpred(X, dropcollinear) + else + error( + "without penalty, method :$method is not allowed, should be in: $nopen_methods", + ) + end + + # Predictor with penalty + else + # Penalty + new_penalty = try + intercept_col = penalty_omit_intercept ? 
get_intercept_col(X, formula) : nothing + concrete(penalty, p, intercept_col) + catch + error("penalty is not compatible with coefficients size $p: $(penalty)") + end + + # Predictor + pred = if method === :fista + FISTARegPred(X, new_penalty, wts, restart, use_backtracking) + elseif method === :ama + AMARegPred(X, new_penalty, wts, A, b, restart) + elseif method === :admm + ADMMRegPred(X, new_penalty, wts, A, b, restart, adapt) + elseif method in (:cgd, :auto) + CGDRegPred(X, new_penalty, wts) + else + error("with penalty, method :$method is not allowed, should be in: $pen_methods") + end + end + + return IPODRegression(rr, pred, formula, wts, fitdispersion, false) +end + +function Base.getproperty(r::IPODRegression, s::Symbol) + if s ∈ (:beta0, :β) + r.pred.beta0 + elseif s ∈ (:delbeta, :dβ) + r.pred.delbeta + else + getfield(r, s) + end +end + +function Base.show(io::IO, obj::IPODRegression) + msg = "Robust Θ-IPOD regression with $(obj.resp.loss)\n\n" + if hasformula(obj) + msg *= "$(formula(obj))\n\n" + end + msg *= "Coefficients:\n" + return println(io, msg, coeftable(obj)) +end + +#################################### +### Interface +#################################### + +## TODO: use (weighted) matrix rank, with dropcollinear +function StatsAPI.dof(r::IPODRegression) + df = if haspenalty(r) + dof(penalty(r), coef(r)) + else + length(coef(r)) + end + return min(wobs(r), df) +end + +hasformula(m::IPODRegression) = isnothing(m.formula) ? false : true + +function StatsModels.formula(m::IPODRegression) + if !hasformula(m) + throw(ArgumentError("model was fitted without a formula")) + end + return m.formula +end + +StatsAPI.modelmatrix(r::IPODRegression) = r.pred.X + +function StatsAPI.vcov(r::IPODRegression, wt::AbstractVector) + wXt = isempty(wt) ? modelmatrix(r)' : (modelmatrix(r) .* wt)' + return inv(Hermitian(float(Matrix(wXt * modelmatrix(r))))) +end +StatsAPI.vcov(r::IPODRegression) = vcov(r, weights(r)) + +function projectionmatrix(r::IPODRegression, wt::AbstractVector) + X = modelmatrix(r) + wXt = isempty(wt) ? X' : (wt .* X)' + return Hermitian(X * vcov(r, wt) * wXt) +end +projectionmatrix(r::IPODRegression) = projectionmatrix(r, weights(r)) + +# Define the methods, but all `variant` give the same result +for fun in (:vcov, :projectionmatrix, :leverage, :leverage_weights) + @eval begin + $(fun)(m::IPODRegression, variant::Symbol) = $(fun)(m) + end +end + +function GLM.dispersion(r::IPODRegression, sqr::Bool=false) + wts = weights(r) + res = residuals(r) + dofres = dof_residual(r) + + if dofres <= 0 + return convert(eltype(res), Inf) + end + + s = if isempty(wts) + sum(abs2, res) / dofres + else + sum(i -> wts[i] * abs2(res[i]), eachindex(wts, res)) / dofres + end + + return sqr ? s : sqrt(s) +end + +""" + location_variance(r::RobustLinResp, sqr::Bool=false) + +Compute the part of the variance of the coefficients `β` that is due to the encertainty +from the location. If `sqr` is `false`, return the standard deviation instead. 
+ +From Maronna et al., Robust Statistics: Theory and Methods, Equation 4.49 +""" +location_variance(r::IPODRegression, sqr::Bool=false) = dispersion(r, sqr) + +StatsAPI.stderror(r::IPODRegression) = location_variance(r, false) .* sqrt.(abs.(diag(vcov(r)))) + +## Loglikelihood of the full model +## l = Σi log fi = Σi log ( 1/(σ * Z) exp( - ρ(ri/σ) ) = -n (log σ + log Z) - Σi ρ(ri/σ) +fullloglikelihood(r::IPODRegression) = -wobs(r) * (log(r.resp.σ) + log(2 * π) / 2) + +StatsAPI.deviance(r::IPODRegression) = sum(abs2, residuals(r)) / scale(r)^2 + +function StatsAPI.nulldeviance(r::IPODRegression) + y = response(r) + wts = weights(r) + + # Compute location of the null model + μ = if !hasintercept(r) + zero(eltype(y)) + elseif isempty(wts) + mean(y) + else + mean(y, weights(wts)) + end + + # Sum deviance for each observation + dev = 0 + if isempty(wts) + @inbounds for i in eachindex(y) + dev += abs2((y[i] - μ) / scale(r)) + end + else + @inbounds for i in eachindex(y, wts) + dev += wts[i] * abs2((y[i] - μ) / scale(r)) + end + end + return dev +end + +StatsAPI.loglikelihood(r::IPODRegression) = fullloglikelihood(r) - deviance(r) / 2 + +StatsAPI.nullloglikelihood(r::IPODRegression) = fullloglikelihood(r) - nulldeviance(r) / 2 + +StatsAPI.response(r::IPODRegression) = r.resp.y + +StatsAPI.isfitted(r::IPODRegression) = r.fitted + +StatsAPI.fitted(r::IPODRegression) = r.resp.μ + +StatsAPI.residuals(r::IPODRegression) = r.resp.wrkres + +StatsAPI.predict(r::IPODRegression) = fitted(r) + +""" + scale(m::RobustLinearModel, sqr::Bool=false) + +The robust scale estimate used for the robust estimation. + +If `sqr` is `true`, the square of the scale is returned. +""" +scale(r::IPODRegression, sqr::Bool=false) = sqr ? r.resp.σ^2 : r.resp.σ + +StatsAPI.weights(r::IPODRegression) = r.wts + +""" + coef(m::IPODRegression) +The coefficients of the model. +""" +StatsAPI.coef(r::IPODRegression) = coef(r.pred) + +""" + nobs(obj::IPODRegression)::Integer +For linear and generalized linear models, returns the number of elements of the response. +For models with prior weights, return the number of non-zero weights. +""" +function StatsAPI.nobs(r::IPODRegression{T}) where {T} + wts = weights(r) + if !isempty(wts) + ## Suppose that the weights are probability weights + count(!iszero, wts) + else + length(response(r)) + end +end + +""" + wobs(obj::IPODRegression) +For unweighted linear models, equals to ``nobs``, it returns the number of elements of the response. +For models with prior weights, return the sum of the weights. +""" +function wobs(r::IPODRegression{T}) where {T} + wts = weights(r) + if !isempty(wts) + ## Suppose that the weights are probability weights + sum(wts) + else + oftype(sum(one(eltype(wts))), nobs(r)) + end +end + +haspenalty(r::IPODRegression{T,R,P,V}) where {T,R,P<:AbstractRegularizedPred,V} = true + +function penalty(r::IPODRegression{T,R,P,V}) where {T,R,P<:AbstractRegularizedPred,V} + return r.pred.penalty +end + +""" + outliers(r::IPODRegression) + +Returns the vector of the outlier part γ of the response y, so that `(y - γ) | X ~ Normal`. 
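+
+A sketch of typical usage (hypothetical data): after `m = ipod(X, y, TukeyLoss())`,
+`outliers(m)` has one entry per observation, and entries away from zero flag the
+observations detected as outliers.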
+""" +outliers(r::IPODRegression) = outliers(r.resp) + +function update_fields!( + m::IPODRegression{T}; + wts::Union{Nothing,AbstractVector{<:Real}}=nothing, + initial_scale::Union{Nothing,Symbol,Real}=nothing, + σ0::Union{Nothing,Symbol,Real}=initial_scale, + initial_coef::Union{Nothing,AbstractVector{<:Real}}=nothing, + β0::Union{Nothing,AbstractVector{<:Real}}=initial_coef, + initial_outliers::Union{Nothing,AbstractVector{<:Real}}=nothing, + γ0::Union{Nothing,AbstractVector{<:Real}}=initial_outliers, + precision::Union{Nothing,Real,AbstractVector{<:Real}}=nothing, + λ0::Union{Nothing,Real,AbstractVector{<:Real}}=precision, + kwargs..., +) where {T<:AbstractFloat} + + resp = m.resp + pred = m.pred + n = length(response(resp)) + p = length(coef(pred)) + + if !isnothing(wts) + if length(wts) in (0, n) + copy!(m.wts, float(wts)) + else + throw(ArgumentError("wts should be a vector of length 0 or $n: $(length(wts))")) + end + end + if !isnothing(σ0) + # convert to float + if σ0 isa Symbol + σ0 = initialscale(modelmatrix(m), response(m), weights(m), σ0) + else + σ0 = float(σ0) + end + + if σ0 > 0 + resp.σ = σ0 + else + throw(ArgumentError("σ0 should be a positive real: $(σ0))")) + end + end + if !isnothing(γ0) + if length(γ0) == n + copy!(resp.outliers, float(γ0)) + else + throw(ArgumentError("γ0 should be a vector of length $n: $(length(γ0))")) + end + end + if !isnothing(λ0) + if λ0 isa Real + copy!(resp.precision, fill(float(λ0), n)) + elseif length(λ0) in (0, n) + copy!(resp.precision, λ0) + else + throw( + ArgumentError("λ0 should be a real or a vector of length 0 or $n: $(λ0))") + ) + end + end + if !isnothing(β0) + if length(β0) == p + copy!(pred.beta0, float(β0)) + else + throw(ArgumentError("β0 should be a vector of length $p: $(length(β0))")) + end + else + fill!(pred.beta0, zero(eltype(coef(m)))) + end + + ## The predictor must be updated if wts, σ0 or β0 was changed + + # return the rest of the keyword arguments + return kwargs +end + + +#################################### +### Fit IPOD model +#################################### + +""" + ipod(X, y, args...; kwargs...) + +An alias for `fit(IPODRegression, X, y, loss, penalty; kwargs...)`. + +The arguments `X` and `y` can be a `Matrix` and a `Vector` or a `Formula` and a `DataFrame`. +""" +ipod(X, y, args...; kwargs...) = fit(IPODRegression, X, y, args...; kwargs...) + + +""" + fit(::Type{M}, + X::AbstractMatrix{T}, + y::AbstractVector{T}, + loss::LossFunction, + penalty::Union{Nothing,PenaltyFunction}=nothing; + method::Symbol = :auto, # :chol, :cg, :cgd, :fista + dofit::Bool = true, + wts::FPVector = similar(y, 0), + offset::FPVector = similar(y, 0), + fitdispersion::Bool = false, + initial_scale::Union{Symbol, Real}=:mad, + σ0::Union{Symbol, Real}=initial_scale, + initial_coef::AbstractVector=[], + β0::AbstractVector=initial_coef, + correct_leverage::Bool=false + penalty_omit_intercept::Bool=true, + fitargs..., + ) where {M<:IPODRegression, T<:AbstractFloat} + +Create a robust model with the model matrix (or formula) X and response vector (or dataframe) y, +using a robust estimator. + + +# Arguments + +- `X`: the model matrix (it can be dense or sparse) or a formula +- `y`: the response vector or a table (dataframe, namedtuple, ...). +- `loss`: a robust loss function +- `penalty`: a penalty function, or `nothing` if the coefficients are not penalized. 
+ +# Keywords + +- `method::Symbol = :auto`: the method used to solve the linear system, + Without penalty: + - Direct method, Cholesky decomposition: `:chol` (default) + - Direct method, QR decomposition: `:qr` + - Iterative method, Conjugate Gradient: `:cg` + With penalty: + - Coordinate Gradient Descent: `:cgd` (default) + - Fast Iterative Shrinkage-Thresholding Algorithm: `:fista` + - Alternating Minimization Algorithm: `:ama` + - Alternating Direction Method of Multipliers: `:admm` + Use :auto to select the default method based on whether penalty is or is not `nothing`. +- `dofit::Bool = true`: if false, return the model object without fitting; +- `dropmissing::Bool = false`: if true, drop the rows with missing values (and convert to + Non-Missing type). With `dropmissing=true` the number of observations may be smaller + than the size of the input arrays; +- `wts::Vector = similar(y, 0)`: Prior probability weights of observations. + Can be empty (length 0) if no weights are used (default); +- `precision::Vector = similar(y, 0)`: Prior precision weights of observations for outlier + detection. Can be empty (length 0) if all observations have the same precision (default); +- `fitdispersion::Bool = false`: reevaluate the dispersion; +- `contrasts::AbstractDict{Symbol,Any} = Dict{Symbol,Any}()`: a `Dict` mapping term names + (as `Symbol`s) to term types (e.g. `ContinuousTerm`) or contrasts (e.g., `HelmertCoding()`, + `SeqDiffCoding(; levels=["a", "b", "c"])`, etc.). If contrasts are not provided for a variable, + the appropriate term type will be guessed based on the data type from the data column: + any numeric data is assumed to be continuous, and any non-numeric data is assumed to be + categorical (with `DummyCoding()` as the default contrast type); +- `initial_scale::Union{Symbol, Real}=:mad`: the initial scale estimate, for non-convex estimator + it helps to find the global minimum. + Automatic computation using `:mad`, `:L1` or `:extrema` (non-robust). +- `σ0::Union{Nothing, Symbol, Real}=initial_scale`: alias of `initial_scale`; +- `initial_coef::AbstractVector=[]`: the initial coefficients estimate, for non-convex estimator + it helps to find the global minimum. +- `β0::AbstractVector=initial_coef`: alias of `initial_coef`; +- `penalty_omit_intercept::Bool=true`: if true, do not penalize the intercept, + otherwise use the penalty on all the coefficients; +- `fitargs...`: other keyword arguments used to control the convergence of the IRLS algorithm + (see [`pirls!`](@ref)). + +# Output + +the IPODRegression object. 
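+
+# Example
+
+A minimal sketch with made-up data (convergence and exact results depend on the data):
+
+```julia
+X = randn(100, 3)
+y = X * [2.0, 0.0, -1.0] .+ 0.1 .* randn(100)
+y[1:5] .+= 10                       # contaminate a few observations
+m = fit(IPODRegression, X, y, TukeyLoss(), L1Penalty(0.1); method=:fista)
+coef(m)                             # penalized coefficient estimates
+outliers(m)                         # estimated outlier component γ, mostly zeros
+```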
+ +""" +function StatsAPI.fit( + ::Type{M}, + X::AbstractMatrix{T}, + y::AbstractVector{T}, + loss::LossFunction, + penalty::Union{Nothing,PenaltyFunction}=nothing; + method::Symbol=:auto, # :chol, :qr, :cg, :cgd, :fista, :ama, :admm + dofit::Bool=true, + dropmissing::Bool=false, # placeholder + initial_scale::Union{Nothing,Symbol,Real}=:mad, + σ0::Union{Nothing,Symbol,Real}=initial_scale, + wts::FPVector=similar(y, 0), + precision::FPVector=similar(y, 0), + fitdispersion::Bool=false, + dropcollinear::Bool=false, + contrasts::AbstractDict{Symbol,Any}=Dict{Symbol,Any}(), # placeholder + __formula::Union{Nothing,FormulaTerm}=nothing, + use_backtracking::Bool=false, + A::AbstractMatrix{<:Real}=zeros(T, 0, 0), + b::AbstractVector{<:Real}=zeros(T, 0), + restart::Bool=true, + adapt::Bool=false, + penalty_omit_intercept::Bool=true, + fitargs..., +) where {M<:IPODRegression,T<:AbstractFloat} + + m = IPODRegression( + X, + y, + loss, + penalty; + method=method, + wts=wts, + precision=precision, + fitdispersion=fitdispersion, + dropcollinear=dropcollinear, + formula=__formula, + use_backtracking=use_backtracking, + A=A, + b=b, + restart=restart, + adapt=adapt, + penalty_omit_intercept=penalty_omit_intercept, + ) + + # Update fields + fitargs = update_fields!(m; σ0=σ0, fitargs...) + + return dofit ? fit!(m; fitargs...) : m +end + +## Convert from formula-data to modelmatrix-response calling form +## the `fit` method must allow the `wts`, `precision`, `contrasts` and `__formula` keyword arguments +function StatsAPI.fit( + ::Type{M}, + f::FormulaTerm, + data, + args...; + dropmissing::Bool=false, + wts::Union{Nothing,Symbol,FPVector}=nothing, + precision::Union{Nothing,Symbol,FPVector}=nothing, + contrasts::AbstractDict{Symbol,Any}=Dict{Symbol,Any}(), + kwargs..., +) where {M<:IPODRegression} + # Extract arrays from data using formula + f, y, X, extra = modelframe( + f, data, contrasts, dropmissing, M; wts=wts, precision=precision + ) + # Call the `fit` method with arrays + return fit( + M, + X, + y, + args...; + wts=extra.wts, + precision=extra.precision, + contrasts=contrasts, + __formula=f, + kwargs..., + ) +end + + +""" + ipod(X, y, l::LossFunction; λ=1) + +She & Owen (2011) - Outlier Detection Using Nonconvex Penalized Regression +""" +function StatsAPI.fit!( + m::IPODRegression{T,R,P,V}; + correct_leverage::Bool=false, + updatescale::Bool=false, + maxiter::Integer=(P <: FISTARegPred) ? 
1000 : 100, + minstepfac::Real=1e-3, + atol::Real=1e-8, + rtol::Real=1e-7, + verbose::Bool=false, +) where {T,R,P,V} + # Return early if model has the fit flag set + m.fitted && return m + + verbose && println("\nFit with IPOD model: $(m.resp.loss), $(penalty(m))") + + # TODO: check if it works and if it can work + updatescale = false + + # Initialize the response and predictors + init!(m; verbose=verbose) + + # convergence criteria + γold = copy(m.resp.outliers) + Δγ = 0 + + devold = dev_criteria(m) + dev = devold + Δdev = 0 + + ### Loop until convergence + cvg = false + for i in 1:maxiter + ## Update outliers vector + update_outliers!(m.resp) + Δγ = outliers_criteria(m.resp, γold) + + ## Update coefficients + setη!(m; verbose=verbose) + dev = dev_criteria(m) + Δdev = abs(dev - devold) + + # ## Update scale + if updatescale + update_scale!(m.resp) + end + + ## Postupdate (only for ADMM) + updatepred!(m.pred, scale(m); verbose=verbose, force=updatescale) + + # Test for convergence + verbose && println("Iteration: $i, Δoutliers: $(Δγ), Δdev: $(Δdev)") + tol = max(rtol * abs(devold), atol) + if Δγ < atol && Δdev < tol + cvg = true + break + end + copyto!(γold, m.resp.outliers) + @assert isfinite(dev) + devold = dev + end + cvg || throw(ConvergenceException(maxiter)) + m.fitted = true + return m +end + +""" + refit!(m::RobustLinearModel, [y::FPVector]; + wts::Union{Nothing, FPVector} = nothing, + offset::Union{Nothing, FPVector} = nothing, + quantile::Union{Nothing, AbstractFloat} = nothing, + ridgeλ::Union{Nothing, Real} = nothing, + kwargs...) + +Refit the [`RobustLinearModel`](@ref). + +This function assumes that `m` was correctly initialized and the model is refitted with +the new values for the response, weights, offset, quantile and ridge shrinkage. + +Defining a new `quantile` is only possible for [`GeneralizedQuantileEstimator`](@ref). + +Defining a new `ridgeλ` is only possible for [`RidgePred`](@ref) objects. +""" +function refit!(m::IPODRegression, y::FPVector; kwargs...) + r = m.resp + # Check that old and new y have the same number of observations + if size(r.y, 1) != size(y, 1) + mess = + "the new response vector should have the same dimension: " * + "$(size(r.y, 1)) != $(size(y, 1))" + throw(DimensionMismatch(mess)) + end + # Update y + copyto!(r.y, y) + + return refit!(m; kwargs...) +end + +function refit!(m::IPODRegression; method=nothing, kwargs...) + + if !isnothing(method) + @warn( + "the method cannot be changed when refitting," * + " ignore the keyword argument `method=:$(method)`." + ) + end + + # Update fields + kwargs = update_fields!(m; kwargs...) + + m.fitted = false + return fit!(m; kwargs...) 
+end + + +function dev_criteria(m::IPODRegression) + if !haspenalty(m) + # this criteria is not used with no penalty + return 0 + end + l = dev_criteria(m.resp) + l += dev_criteria(m.pred) + return l +end + + +function init!(m::IPODRegression{T}; verbose::Bool=false) where {T<:AbstractFloat} + + resp = m.resp + pred = m.pred + + # reset ∇β + copyto!(pred.delbeta, pred.beta0) + + # res = y - Xβ + mul!(resp.μ, pred.X, pred.beta0) + initresp!(resp) + + # Initialize the predictor + initpred!(pred, m.wts, resp.σ; verbose=verbose) + + return m +end + +function setη!( + m::IPODRegression{T,R,P,V}, linesearch_f::Real=1; verbose::Bool=false +) where {T,R,P<:LinPred,V} + # TODO: add line search + μ = m.resp.μ + + # # dβ = Σ \ (X' * r) + # mul!(dβ, wXt, p.wrkres) + # ldiv!(facΣ, dβ) + update_beta!(m.pred, m.resp.wrkres, m.wts) + + # Install beta + broadcast!(+, m.pred.beta0, m.pred.beta0, m.pred.delbeta) + + # Update linear predictor + mul!(μ, m.pred.X, m.pred.beta0) + + # Update residuals with the new μ + return update_residuals!(m.resp) +end + +function setη!( + m::IPODRegression{T,R,P,V}, linesearch_f::Real=1; verbose::Bool=false +) where {T,R,P<:AbstractRegularizedPred,V} + wts = weights(m) + + μ = m.resp.μ + σ2 = m.resp.σ^2 + wrky = m.resp.wrky + + if m.pred isa CGDRegPred + res = m.resp.wrkres + + # Update coefs and μ + update_βμ!(m.pred, wrky, μ, res, σ2, wts; verbose=verbose) + + else + # Update coefs + update_beta!(m.pred, wrky, wts, σ2; verbose=verbose) + + # Update linear predictor + mul!(μ, m.pred.X, m.pred.beta0) + + # Update residuals with the new μ + update_residuals!(m.resp) + end + + return m +end diff --git a/src/linpred.jl b/src/linpred.jl index fafc9bb..4c3e687 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -2,6 +2,8 @@ import GLM: cholpred ################# +loss_criteria(p::LinPred) = 0 + StatsAPI.modelmatrix(p::LinPred) = p.X function StatsAPI.vcov(p::LinPred, wt::AbstractVector) diff --git a/src/penalties.jl b/src/penalties.jl new file mode 100644 index 0000000..1299338 --- /dev/null +++ b/src/penalties.jl @@ -0,0 +1,826 @@ + + +soft_threshold(x::Real, λ::Real) = (abs(x) <= λ) ? zero(x) : x - sign(x) * λ + + + +isconvex(::PenaltyFunction) = false +isseparable(::PenaltyFunction) = false + +""" + concrete!(p::PenaltyFunction, m::Integer, intercept_index::Union{Nothing,Integer}=nothing) + +Modifies the penalty to make it compatible with the coefficients of size `m`. +For penalties that do not handle indices range, return the unmodified penalty, +otherwise (like for RangedPenalties), modify the ranges so they cover all +indices in the `1:m` range. +Throws an error if the penalty ranges are not compatible with `m` or if `intercept_index` +is defined and `p` is not a RangedPenalties. +""" +function concrete!( + p::PenaltyFunction, n::Integer, intercept_index::Union{Nothing,Integer}=nothing +) + if !isnothing(intercept_index) && (1 <= intercept_index <= n) + error( + "intercept_index $(intercept_index) in 1:$n, cannot change the type " * + "from $(typeof(p)) to RangedPenalties.", + ) + end + return p +end + +""" + concrete(p::PenaltyFunction, m::Integer) + +Returns a penalty that is compatible with the coefficients of size `m`. +For penalties that do not handle indices range, if `intercept_index` is not defined +return a deep copy of the penalty, otherwise return a RangedPenalties excluding this index. +For RangedPenalties, returns a penalty with ranges that cover all the indices in the `1:m` range. +Throws an error if the penalty ranges are not compatible with `m`. 
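+
+For example (a sketch): `concrete(L1Penalty(1.0), 3, 1)` returns a `RangedPenalties` that
+applies the L1 penalty to indices `2:3` and leaves index 1 (the intercept) unpenalized.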
+""" +function concrete( + p::PenaltyFunction, n::Integer, intercept_index::Union{Nothing,Integer}=nothing +) + if !isnothing(intercept_index) && (1 <= intercept_index <= n) + if n == 1 + ranges = [] + penalties = [] + else + ranges = [excludeindex(1:n, intercept_index)] + penalties = [p] + end + return RangedPenalties(ranges, penalties, n) + else + return deepcopy(p) + end +end + + + +function proximal!( + p::PenaltyFunction, + out::AbstractVector, + index::Integer, + x::AbstractVector, + step::AbstractFloat, +) + proximal!(p, view(out, index), view(x, index), step) + return out +end + +function proximal(p::PenaltyFunction, x::AbstractArray, step::AbstractFloat) + return proximal!(p, similar(x), x, step) +end +function proximal(p::PenaltyFunction, x::T, step::AbstractFloat) where {T<:AbstractFloat} + return proximal!(p, ones(T, 1), [x], step)[1] +end + +StatsAPI.dof(p::PenaltyFunction, x::AbstractVector{T}) where {T<:AbstractFloat} = length(x) + +######################################################################## +##### Penalty functions +######################################################################## + +""" + NoPenalty{T<:AbstractFloat} + +No penalty (constant penalty), the proximal operator returns the same vector as input (identity map). +P(x) = 0 +""" +struct NoPenalty{T<:AbstractFloat} <: PenaltyFunction{T} end +NoPenalty(args...; kwargs...) = NoPenalty{Float64}() +cost(::NoPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} = zero(T) +function proximal!( + p::NoPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + return copyto!(out, x) +end + +isconvex(::NoPenalty) = true +isseparable(::NoPenalty) = true + + +""" + SquaredL2Penalty{T<:AbstractFloat} + +Squared L2 penalty, the proximal operator returns a scaled version. +P(x) = λ/2 Σi |xi|² +""" +struct SquaredL2Penalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + nonnegative::Bool + + function SquaredL2Penalty(λ::T; nonnegative::Bool=false) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ should be non-negative: $λ")) + return new{T}(λ, nonnegative) + end +end +function cost(p::SquaredL2Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return p.λ / 2 * sum(abs2, x) +end +function proximal!( + p::SquaredL2Penalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + # return broadcast!(/, out, x, 1 + p.λ * step) + nonnegative = p.nonnegative + a = 1 / (1 + p.λ * step) + @inbounds @simd for i in eachindex(out, x) + out[i] = (nonnegative && x[i] <= 0) ? zero(T) : x[i] * a + end + return out +end + +# Approximate shrinkage of coefficients +function StatsAPI.dof(p::SquaredL2Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return length(x) / (1 + p.λ) +end + +isconvex(::SquaredL2Penalty) = true +isseparable(::SquaredL2Penalty) = true + + +""" + EuclideanPenalty{T<:AbstractFloat} + +Euclidean norm penalty, the proximal operator returns a scaled version. 
+P(x) = λ √(Σi |xi|²) +""" +struct EuclideanPenalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + nonnegative::Bool + + function EuclideanPenalty(λ::T; nonnegative::Bool=false) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ should be non-negative: $λ")) + return new{T}(λ, nonnegative) + end +end +function cost(p::EuclideanPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return p.λ * norm(x, 2) +end +function proximal!( + p::EuclideanPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + # nn = p.nonnegative ? norm(broadcast!(max, out, x, 0), 2) : norm(x, 2) + if p.nonnegative + nn = zero(T) + @inbounds @simd for xi in x + nn += xi > 0 ? xi^2 : zero(T) + end + nn = sqrt(nn) + else + nn = norm(x, 2) + end + return rmul!(copyto!(out, x), (1 - p.λ * step / max(p.λ * step, nn))) +end + +isconvex(::EuclideanPenalty) = true +isseparable(::EuclideanPenalty) = false + + +""" + L1Penalty{T<:AbstractFloat} + +L1 penalty, the proximal operator returns a soft-thresholded value. +P(x) = λ Σi |xi| +""" +struct L1Penalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + nonnegative::Bool + + function L1Penalty(λ::T; nonnegative::Bool=false) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ should be non-negative: $λ")) + return new{T}(λ, nonnegative) + end +end +cost(p::L1Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} = p.λ * sum(abs, x) +function proximal!( + p::L1Penalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + a = p.λ * step + nonnegative = p.nonnegative + + @inbounds @simd for i in eachindex(out, x) + out[i] = (nonnegative && x[i] <= 0) ? zero(T) : soft_threshold(x[i], a) + end + return out +end + +function StatsAPI.dof(p::L1Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return count(!iszero, x) +end + +isconvex(::L1Penalty) = true +isseparable(::L1Penalty) = true + + +""" + ElasticNetPenalty{T<:AbstractFloat} + +ElasticNet penalty, a sum of SquaredL2Penalty and sparse L1Penalty. +P(x) = l1_ratio . λ Σi |xi| + (1 - l1_ratio) . 
λ/2 Σi |xi|² +""" +struct ElasticNetPenalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + l1_ratio::T + nonnegative::Bool + + function ElasticNetPenalty( + λ::T, l1_ratio::AbstractFloat=T(0.5); nonnegative::Bool=false + ) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ should be non-negative: $λ")) + 0 <= l1_ratio <= 1 || + throw(ArgumentError("l1_ratio must be between 0 and 1: $(l1_ratio)")) + return new{T}(λ, l1_ratio, nonnegative) + end +end + +function cost(p::ElasticNetPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return p.λ * (p.l1_ratio * sum(abs, x) + (1 - p.l1_ratio) * sum(abs2, x) / 2) +end + +function proximal!( + p::ElasticNetPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + a = p.λ * step + nonnegative = p.nonnegative + l1_ratio = p.l1_ratio + @inbounds @simd for i in eachindex(out, x) + out[i] = if (nonnegative && x[i] <= 0) + zero(T) + else + (soft_threshold(x[i], l1_ratio * a) / (1 + (1 - l1_ratio) * a)) + end + end + return out +end + +# Approximate shrinkage of coefficients +function StatsAPI.dof( + p::ElasticNetPenalty{T}, x::AbstractVector{T} +) where {T<:AbstractFloat} + return count(!iszero, x) / (1 + p.λ) +end + +isconvex(::ElasticNetPenalty) = true +isseparable(::ElasticNetPenalty) = true + + +""" + BerhuPenalty{T<:AbstractFloat} + +Berhu convex penalty, equivalent to L1Penalty at low values and SquaredL2Penalty at high values +(reverse of Huber loss). +Owen (2006) - A robust hybrid of lasso and ridge regression + +P(x) = λl1_ratio . λ Σi |xi| + (1 - l1_ratio) . λ/2 Σi |xi|² +""" +struct BerhuPenalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + γ::T + nonnegative::Bool + + function BerhuPenalty( + λ::T, γ::AbstractFloat=T(1); nonnegative::Bool=false + ) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ should be non-negative: $λ")) + γ >= 0 || throw(ArgumentError("γ must be non-negative: $(γ)")) + return new{T}(λ, γ, nonnegative) + end +end + +function cost(p::BerhuPenalty{T}, xi::T) where {T<:AbstractFloat} + return abs(xi) <= p.γ ? p.λ * abs(xi) : p.λ * p.γ / 2 * (1 + (xi / p.γ)^2) +end +function cost(p::BerhuPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return sum(xi -> cost(p, xi), x; init=zero(T)) +end + +function proximal!( + p::BerhuPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + a = p.λ * step + γ = p.γ + nonnegative = p.nonnegative + @inbounds @simd for i in eachindex(out, x) + if p.nonnegative && x[i] <= 0 + out[i] = zero(T) + elseif abs(x[i]) <= a + γ + out[i] = soft_threshold(x[i], a) + else + out[i] = γ * x[i] / (a + γ) + end + end + return out +end + +# Approximate coefficient selection +function StatsAPI.dof(p::BerhuPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return count(!iszero, x) +end + +isconvex(::BerhuPenalty) = true +isseparable(::BerhuPenalty) = true + + +""" + SCADPenalty{T<:AbstractFloat} + +SCAD penalty, folded-concave penalty, generalization of the LASSO. 
+Fan & Li (2001) - Variable Selection via Nonconcave Penalized Likelihood and its Oracle Properties + +P(x) = λ ∫_0^|x| dt I(t<=λ) + (γλ - t)_+ / (λ*(γ-1)) I (t>λ) +""" +struct SCADPenalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + γ::T + nonnegative::Bool + + function SCADPenalty( + λ::T, γ::AbstractFloat=T(3.7); nonnegative::Bool=false + ) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ must be non-negative: $λ")) + γ > 2 || throw(ArgumentError("γ must be greater than 2: $(γ)")) + if nonnegative + @warn "`nonnegative` argument is ignored for SCADPenalty." + end + return new{T}(λ, γ, false) + end +end + +function cost(p::SCADPenalty{T}, xi::T) where {T<:AbstractFloat} + if abs(xi) <= p.λ + return p.λ * abs(xi) + elseif abs(x[i]) >= p.λ * p.γ + return p.λ^2 * (p.γ + 1) / 2 + else + return -(xi^2 - 2 * p.λ * p.γ * abs(xi) + p.λ^2) / (2 * (p.γ - 1)) + end +end +function cost(p::SCADPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return sum(xi -> cost(p, xi), x; init=zero(T)) +end + +function convex_approx(p::SCADPenalty{T}, x::T, u::T, s::T) where {T} + return 0.5 * (x - u)^2 + cost(p, x) / s +end + +function proximal!( + p::SCADPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + # Step is non-multiplicative + # Gong et al. (2013) - A General Iterative Shrinkage and Thresholding Algorithm for + # Non-convex Regularized Optimization Problems + λ = p.λ + γ = p.γ + + if step <= 0 + return copy!(out, x) + end + + @inbounds @simd for i in eachindex(out, x) + # if p.nonnegative && x[i] <= 0 + # out[i] = zero(T) + # else + x1 = sign(x[i]) * min(λ, max(0, abs(x[i]) - λ / step)) + x2 = + sign(x[i]) * + min(λ * γ, max(λ, (abs(x[i]) * (γ - 1) - λ * γ / step) / (γ - 2))) + x3 = sign(x[i]) * min(λ * γ, abs(x[i])) + sols = [x1, x2, x3] + ind = argmin(convex_approx.(p, sols, x[i], step)) + out[i] = sols[ind] + end + return out +end + +# Approximate coefficient selection +function StatsAPI.dof(p::SCADPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return count(!iszero, x) +end + +isconvex(::SCADPenalty) = false +isseparable(::SCADPenalty) = true + + +""" + CappedL1Penalty{T<:AbstractFloat} + +Capped L1 penalty, folded-concave penalty, a bridge between the L1Penalty and the Hard thresholding penalty. +For γ -> Inf, the MC+ penalty becomes equivalent to the L1Penalty. +For γ -> 0+, the MC+ penalty tends to the L0 penalty or hard thresholding penalty. + +P(x) = λ min(abs(x), γ) +""" +struct CappedL1Penalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + γ::T + nonnegative::Bool + + function CappedL1Penalty( + λ::T, γ::AbstractFloat=T(3); nonnegative::Bool=false + ) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ must be non-negative: $λ")) + γ > 0 || throw(ArgumentError("γ must be greater than 1: $(γ)")) + if nonnegative + @warn "`nonnegative` argument is ignored for CappedL1Penalty." + end + return new{T}(λ, γ, false) + end +end + +cost(p::CappedL1Penalty{T}, xi::T) where {T<:AbstractFloat} = p.λ * min(abs(xi), p.γ) +function cost(p::CappedL1Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return sum(xi -> cost(p, xi), x; init=zero(T)) +end + +function convex_approx(p::CappedL1Penalty{T}, x::T, u::T, s::T) where {T} + return 0.5 * (x - u)^2 + cost(p, x) / s +end + +function proximal!( + p::CappedL1Penalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + # Step is non-multiplicative + # Gong et al. 
(2013) - A General Iterative Shrinkage and Thresholding Algorithm for + # Non-convex Regularized Optimization Problems + λ = p.λ + γ = p.γ + + if step <= 0 + return copy!(out, x) + end + + @inbounds @simd for i in eachindex(out, x) + # if p.nonnegative && x[i] <= 0 + # out[i] = zero(T) + # else + x1 = sign(x[i]) * max(γ, abs(x[i])) + x2 = sign(x[i]) * min(γ, max(0, abs(x[i]) - λ / step)) + sols = [x1, x2] + ind = argmin(convex_approx.(p, sols, x[i], step)) + out[i] = sols[ind] + end + return out +end + +# Approximate coefficient selection +function StatsAPI.dof(p::CappedL1Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return count(!iszero, x) +end + +isconvex(::CappedL1Penalty) = false +isseparable(::CappedL1Penalty) = true + + +""" + MCPPenalty{T<:AbstractFloat} + +MC+ penalty, folded-concave penalty, a bridge between the L1Penalty and the Hard thresholding penalty. +For γ -> Inf, the MC+ penalty becomes equivalent to the L1Penalty. +For γ -> 1+, the MC+ penalty tends to the L0 penalty or hard thresholding penalty. + +Zhang (2010) - Nearly unbiased variable selection under minimax concave penalty + +P(x) = λ ∫_0^|x| dt (1 - t/γλ)_+ +""" +struct MCPPenalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + γ::T + nonnegative::Bool + + function MCPPenalty( + λ::T, γ::AbstractFloat=T(3); nonnegative::Bool=false + ) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ must be non-negative: $λ")) + γ > 1 || throw(ArgumentError("γ must be greater than 1: $(γ)")) + if nonnegative + @warn "`nonnegative` argument is ignored for MCPPenalty." + end + return new{T}(λ, γ, false) + end +end + +function cost(p::MCPPenalty{T}, xi::T) where {T<:AbstractFloat} + if abs(xi) < p.λ * p.γ + return p.λ * abs(xi) - xi^2 / (2 * p.γ) + else + return p.λ^2 * p.γ / 2 + end +end +function cost(p::MCPPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return sum(xi -> cost(p, xi), x; init=zero(T)) +end + +function convex_approx(p::MCPPenalty{T}, x::T, u::T, s::T) where {T} + return 0.5 * (x - u)^2 + cost(p, x) / s +end +function aux_func2(p::MCPPenalty{T}, x::T, u::T, s::T) where {T} + return (x - abs(u))^2 / 2 + p.λ * x / s - x^2 / (2 * p.γ) +end + +function proximal!( + p::MCPPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + # proximal(p, x) = abs(x) <= p.λ ? 0 : + # abs(x) < p.λ * p.γ ? sign(x) * (abs(x) - p.λ) / (1 - 1/p.γ) : x + # + # Step is non-multiplicative + # Gong et al. 
(2013) - A General Iterative Shrinkage and Thresholding Algorithm for + # Non-convex Regularized Optimization Problems + λ = p.λ + γ = p.γ + + if step <= 0 + return copy!(out, x) + end + + a = p.λ * p.γ + @inbounds @simd for i in eachindex(out, x) + # if p.nonnegative && x[i] <= 0 + # out[i] = zero(T) + # else + z1 = 0 + z2 = a + z3 = min(a, max(0, (γ * abs(x[i]) - a / step) / (γ - 1))) + zsols = [z1, z2, z3] + ind = argmin(aux_func2.(p, zsols, x[i], step)) + z = zsols[ind] + + x1 = sign(x[i]) * z + x2 = sign(x[i]) * min(a, abs(x[i])) + sols = [x1, x2] + ind = argmin(convex_approx.(p, sols, x[i], step)) + out[i] = sols[ind] + end + return out +end + +# Approximate coefficient selection +function StatsAPI.dof(p::MCPPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return count(!iszero, x) +end + +isconvex(::MCPPenalty) = false +isseparable(::MCPPenalty) = true + + +################################# +### RangedPenalties +################################# + +abstract type AbstractRangedPenalties{T} <: PenaltyFunction{T} end + +struct End{T<:Integer} + offset::T + + End{T}(offset::T) where {T<:Integer} = new{T}(offset) + End{T}() where {T<:Integer} = new{T}(zero(T)) +end +End(offset::T) where {T<:Integer} = End{T}(offset) +End() = End{Int}() + +struct EndStepRange{T<:Integer} + start::T + step::T + stop::T + + function EndStepRange{T}( + start::Integer, step::Integer, stop::Integer + ) where {T<:Integer} + return new{T}(Base.convert.(T, (start, step, stop))...) + end +end +function EndStepRange( + start::I1, step::I2, stop::I3 +) where {I1<:Integer,I2<:Integer,I3<:Integer} + return EndStepRange{promote_type(I1, I2, I3)}(start, step, stop) +end + +(::Colon)(start::Integer, stop::End) = EndStepRange(start, one(start), stop.offset) +(::Colon)(start::Integer, step::Integer, stop::End) = EndStepRange(start, step, stop.offset) + +torange(p::EndStepRange, n::Integer) = (p.start):(p.step):(n - p.stop) +function torange(p::Vector, n::Integer) + return if all(1 .<= p .<= n) + sort(p) + else + throw(BoundsError("attempt to access $n-element Vector at index [$(p)]")) + end +end +function torange(p::AbstractRange, n::Integer) + return if (last(p) <= n) + p + else + throw(BoundsError("attempt to access $n-element Vector at index [$(p)]")) + end +end + +function excludeindex(p::Vector, i::Integer) + i in p || return p + + return filter(!=(i), p) +end +function excludeindex(p::AbstractRange, i::Integer) + i in p || return p + + if first(p) == i + ran = (first(p) + step(p)):step(p):last(p) + elseif last(p) == i + ran = first(p):step(p):(last(p) - step(p)) + else + ran = filter(!=(i), p) + end + return ran +end + +RangeLikeType{T} = Union{UnitRange{<:T},StepRange{<:T,<:T},EndStepRange{<:T},Vector{<:T}} + + +mutable struct RangedPenalties{T<:AbstractFloat,N<:Integer} <: AbstractRangedPenalties{T} + ranges::Vector{RangeLikeType{N}} + penalties::Vector{PenaltyFunction} + notinrange::Vector{N} + isconcrete::Bool + + function RangedPenalties{T,N}( + ranges::AbstractVector, penalties::AbstractVector{<:PenaltyFunction{T}} + ) where {T<:AbstractFloat,N<:Integer} + any(p isa RangedPenalties for p in penalties) && throw( + ArgumentError( + "RangedPenalties penalties should not be of type RangedPenalties: $(typeof.(penalties))", + ), + ) + length(ranges) == length(penalties) || throw( + ArgumentError( + "ranges and penalties should have the same number of elements: $(length(ranges)) != $(length(penalties))", + ), + ) + + return new{T,N}(ranges, penalties, zeros(N, 0), false) + end +end + +function 
RangedPenalties( + ranges::AbstractVector, + penalties::AbstractVector{<:PenaltyFunction{T}}, + n::Union{Nothing,Integer}=nothing, +) where {T<:AbstractFloat} + length(penalties) > 0 || + throw(ArgumentError("RangedPenalties should contain at least one PenaltyFunction")) + p = RangedPenalties{T,typeof(length(ranges))}(ranges, penalties) + if !isnothing(n) + concrete!(p, n) + end + return p +end + +function Base.:(==)(x::RangedPenalties, y::RangedPenalties) + if !isconcrete(x) || !isconcrete(y) + return x.ranges == y.ranges && x.penalties == y.penalties + end + + # Concrete penalties + x.penalties == y.penalties || return false + for (xr, yr) in zip(x.ranges, y.ranges) + Set(xr) == Set(yr) || return false + end + Set(x.notinrange) == Set(y.notinrange) || return false + return true +end + +isconcrete(p::RangedPenalties) = p.isconcrete + +function concrete!( + p::RangedPenalties, n::Integer, intercept_index::Union{Nothing,Integer}=nothing +) + ranges = p.ranges + penalties = p.penalties + # Check that ranges are non-overlapping + notinrange = Set(1:n) + @inbounds for j in eachindex(ranges, penalties) + ran = torange(ranges[j], n) + sran = Set(ran) + if intersect(notinrange, sran) != sran + error("Overlapping ranges are not allowed: $(ranges)") + end + if !isnothing(intercept_index) + # Remove intercept_index from range + ran = excludeindex(ran, intercept_index) + sran = Set(ran) + end + ## Transform End-ranges to Base ranges + ranges[j] = ran + ## Remoev used ranges + setdiff!(notinrange, sran) + end + ## Store missing indices in notinrange + p.notinrange = sort!(collect(notinrange)) + p.isconcrete = true + return p +end + +function concrete( + p::RangedPenalties, n::Integer, intercept_index::Union{Nothing,Integer}=nothing +) + new_p = deepcopy(p) + return concrete!(new_p, n, intercept_index) +end + +function cost(p::RangedPenalties{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + isconcrete(p) || error( + "RangedPenalties not concrete, call `concrete!` beforehand to make sure the ranges are well-defined.", + ) + ranges = p.ranges + penalties = p.penalties + + n = length(x) + r = zero(T) + for (ran, pen) in zip(ranges, penalties) + r += cost(pen, view(x, ran)) + end + return r +end + +function proximal!( + p::RangedPenalties{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + isconcrete(p) || error( + "RangedPenalties not concrete, call `concrete!` beforehand to make sure the ranges are well-defined.", + ) + ranges = p.ranges + penalties = p.penalties + + n = length(x) + for (ran, pen) in zip(ranges, penalties) + proximal!(pen, view(out, ran), view(x, ran), step) + end + # Apply NoPenalty to indices not defined by ranges + if !isempty(p.notinrange) + ran = p.notinrange + pen = NoPenalty{T}() + proximal!(pen, view(out, ran), view(x, ran), step) + end + return out +end + +function proximal!( + p::RangedPenalties{T}, + out::AbstractVector, + index::Integer, + x::AbstractVector, + step::AbstractFloat, +) where {T<:AbstractFloat} + isconcrete(p) || error( + "RangedPenalties not concrete, call `concrete!` beforehand to make sure the ranges are well-defined.", + ) + ranges = p.ranges + penalties = p.penalties + + done = false + for (ran, pen) in zip(ranges, penalties) + if in(index, ran) + proximal!(pen, view(out, index), view(x, index), step) + done = true + break + end + end + if !done && !isempty(p.notinrange) + if !in(index, p.notinrange) + n = length(x) + error("index not found in range $(1:n): $index") + end + proximal!(NoPenalty{T}(), view(out, index), view(x, 
index), step) + end + return out +end + +function StatsAPI.dof(p::RangedPenalties{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + isconcrete(p) || error( + "RangedPenalties not concrete, call `concrete!` beforehand to make sure the ranges are well-defined.", + ) + ranges = p.ranges + penalties = p.penalties + + # For ranges without penalty, count 1 dof per index + r = convert(T, length(p.notinrange)) + @inbounds for (ran, pen) in zip(ranges, penalties) + r += dof(pen, view(x, ran)) + end + return r +end + +isconvex(p::RangedPenalties) = all(isconvex.(p.penalties)) +function isseparable(p::RangedPenalties) + ranges = p.ranges + penalties = p.penalties + + if !isconcrete(p) + error("isseparable is only defined on concrete RangedPenalties.") + end + @inbounds for (ran, pen) in zip(ranges, penalties) + if !isseparable(pen) && length(ran) > 1 + return false + end + end + return true +end diff --git a/src/regularizedpred.jl b/src/regularizedpred.jl index f91f893..ccfa886 100644 --- a/src/regularizedpred.jl +++ b/src/regularizedpred.jl @@ -10,6 +10,18 @@ StatsAPI.modelmatrix(p::AbstractRegularizedPred) = p.X StatsAPI.coef(p::AbstractRegularizedPred) = p.beta0 +penalty(p::AbstractRegularizedPred) = p.penalty + +penalized_coef(p::AbstractRegularizedPred) = coef(p) + +dev_criteria(p::AbstractRegularizedPred) = 2 * cost(penalty(p), penalized_coef(p)) + +initpred!(p::AbstractRegularizedPred, args...; kwargs...) = p + +updatepred!(p::AbstractRegularizedPred, args...; kwargs...) = p + +update_beta!(p::AbstractRegularizedPred, args...; kwargs...) = p + function StatsAPI.vcov(p::AbstractRegularizedPred, wt::AbstractVector) wXt = isempty(wt) ? modelmatrix(p)' : (modelmatrix(p) .* wt)' return inv(Hermitian(float(Matrix(wXt * modelmatrix(p))))) @@ -217,6 +229,10 @@ function Base.propertynames(r::RidgePred, private::Bool=false) end end +penalty(p::RidgePred) = SquaredL2Penalty(p.λ) + +penalized_coef(p::RidgePred) = p.G * (p.pred.beta0 - p.βprior) + function cgpred( X::StridedMatrix{T}, λ::Real, @@ -363,3 +379,1091 @@ function StatsAPI.vcov(p::RidgePred, wt::AbstractVector) wXt = isempty(wwt) ? extendedmodelmatrix(p)' : (extendedmodelmatrix(p) .* wwt)' return inv(Hermitian(float(Matrix(wXt * extendedmodelmatrix(p))))) end + + +############################## +### Coordinate Gradient Descent predictor +############################## + +mutable struct CGDParams + "`outerloop`: use extra outer loop" + outerloop::Bool + "`update_all`: update all coefficients, that are zero or not." + update_all::Bool + + function CGDParams(outerloop::Bool=true, update_all::Bool=true) + return new(outerloop, update_all) + end +end + +function initparams!(p::CGDParams) + p.update_all = true + return p +end + +""" + CGDRegPred + +Regularization using Coordinate Gradient Descent. + +# Members + +- `X`: model matrix +- `beta0`: base vector for coefficients +- `delbeta`: coefficients increment +- `Σ`: Gram matrix and temporary matrix +- `penalty`: the penalty function. +- `invvar`: precision vector. +- `scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method. +""" +struct CGDRegPred{T<:BlasReal,M<:AbstractMatrix{T},V<:Vector{T},P<:PenaltyFunction} <: + AbstractRegularizedPred{T} + "`X`: model matrix" + X::M + "`beta0`: base vector for coefficients" + beta0::V + "`delbeta`: coefficients increment" + delbeta::V + "`Σ`: Gram matrix, X'WX." 
+ Σ::M + "`penalty`: penalty function" + penalty::P + "`invvar`: inverse variance/precision vector" + invvar::V + "`params`: parameters for CGD" + params::CGDParams + "`scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method" + scratchbeta::Vector{T} + + function CGDRegPred( + X::M, + Σ::AbstractMatrix{<:Real}, + invvar::AbstractVector{<:Real}, + penalty::P, + outerloop::Bool=true, + ) where {M<:AbstractMatrix{T},P<:PenaltyFunction} where {T<:BlasReal} + m = size(Σ, 1) + + if !isseparable(penalty) + error( + "Coordinate Gradient Descent is only allowed with a separable penalty: " * + "$(penalty)" + ) + end + + params = CGDParams(outerloop) + + return new{T,M,typeof(invvar),P}( + X, + zeros(T, m), + zeros(T, m), + Σ, + penalty, + invvar, + params, + zeros(T, m), + ) + end +end + +function CGDRegPred( + X::M, penalty::P, wts::V, outerloop::Bool=true, +) where {M<:AbstractMatrix{T},V<:AbstractVector,P<:PenaltyFunction} where {T<:BlasReal} + n, m = size(X) + ll = size(wts, 1) + ll in (0, n) || throw(DimensionMismatch("length of wts is $ll, must be 0 or $n.")) + + # invvar = precision_vector(X, wts) + if isempty(wts) + Σ = Hermitian(float(X' * X)) + else + Σ = Hermitian(float((wts .* X)' * X)) + end + invvar = Vector(inv.(diag(Σ))) + + return CGDRegPred(X, Σ, invvar, penalty, outerloop) +end + +function precision_vector( + X::AbstractMatrix{T}, wts::AbstractVector +) where {T<:AbstractFloat} + invvar = Vector{T}(undef, size(X, 2)) + if isempty(wts) + @inbounds @simd for j in eachindex(invvar, axes(X, 2)) + invvar[j] = 1 / sum(abs2, view(X, :, j)) + end + else + @inbounds @simd for j in eachindex(invvar, axes(X, 2)) + invvar[j] = 1 / sum(i -> wts[i] * abs2(X[i, j]), eachindex(wts, axes(X, 1))) + end + end + return invvar +end + +function initpred!(p::CGDRegPred, wts::AbstractVector=[], σ::Real=1; verbose::Bool=false) + initparams!(p.params) + return p +end + +function update_βμ!( + p::CGDRegPred{T}, + y::AbstractVector{T}, + μ::AbstractVector{T}, + res::AbstractVector{T}, + σ2::T=one(T), + wts::AbstractVector{T}=T[]; + tol::Real=1e-4, + verbose::Bool=false, +) where {T<:BlasReal} + X = p.X + β = p.beta0 + newβ = p.delbeta + invvar = p.invvar + pen = penalty(p) + + m = size(X, 2) + outerM = p.params.outerloop ? 
m : 1 + update_all = p.params.update_all + +# copyto!(newβ, β) + # outer loop + @inbounds for _ in 1:outerM + + Δdev = zero(T) + # iterate over indices + @inbounds for j in eachindex(axes(X, 2), β, newβ, invvar) + βj = β[j] + !update_all && iszero(βj) && continue + + Xj = view(X, :, j) + + # remove component due to index j in μ + # µ -= X[:, j] * β[j] +# if !iszero(βj) +# @inbounds @simd for i in eachindex(μ, Xj) +# μ[i] -= Xj[i] * β[j] +# end +## broadcast!(muladd, μ, -β[j], Xj, μ) +# end + + newβ[j] = βj + if isempty(wts) + @inbounds @simd for i in eachindex(axes(X, 1), res, Xj) + newβ[j] += Xj[i] * res[i] +# @inbounds @simd for i in eachindex(axes(X, 1), y, μ, Xj) +# newβ[j] += Xj[i] * (y[i] - μ[i] + Xj[i] * βj) + end + else + @inbounds @simd for i in eachindex(axes(X, 1), res, Xj, wts) + newβ[j] += wts[i] * Xj[i] * res[i] + end + end + newβ[j] *= invvar[j] + + + # verbose && println("∇βj before prox: $(gradβ)") + # Proximal operator on a single coordinate + proximal!(pen, newβ, j, newβ, invvar[j] * σ2) +# proximal!(pen, β, j, newβ, invvar[j] * σ2) + # verbose && println("β after prox: $(p.β)") + + # re-add component due to index j in μ + # μj = p.X[:, j] * p.β[j] + # p.µ .+= μj +## if !iszero(β[j]) +## @inbounds @simd for i in eachindex(μ, Xj) +## μ[i] += Xj[i] * β[j] +## end +### broadcast!(muladd, μ, β[j], view(X, :, j), μ) +## end + Δβ = newβ[j] - βj + if !iszero(Δβ) + β[j] = newβ[j] + @inbounds @simd for i in eachindex(μ, res, Xj) + dμi = Xj[i] * Δβ + μ[i] += dμi + res[i] -= dμi + end + Δdev = max(Δdev, abs2(Δβ) / invvar[j]) + end + end + + if Δdev < tol + if update_all + # Converged + break + else + # Run a last iteration, computing all the coefficients again + update_all = true + end + else + # Check for convergence + update_all = false + end + end + + p.params.update_all = true + return p +end + + +############################## +### FISTA predictor +############################## + +mutable struct FISTAParams{T<:AbstractFloat} + "`restart`: allow restart" + restart::Bool + "`use_backtracking`: use backtracking for adjusting the step size" + use_backtracking::Bool + "`bt_maxiter`: backtracking loop, maximum number of iteration" + bt_maxiter::Int + "`bt_stepfac`: backtracking loop, reducing factor" + bt_delta::T + "`sk`: FISTA step parameter" + sk::T + "`tk`: FISTA acceleration step" + tk::T + "`kk`: ADMM number of iterations since last restart" + kk::Int + + function FISTAParams{T}( + restart::Bool, use_backtracking::Bool, bt_maxiter::Integer=20, bt_delta::Real=0.5 + ) where {T<:AbstractFloat} + return new{T}( + restart, use_backtracking, bt_maxiter, bt_delta, one(T), one(T), 1, + ) + end +end + +function initparams!(p::FISTAParams, sk::Real=1) + p.sk = sk + p.tk = 1 + p.kk = 1 + return p +end + +""" + update_accelstep!(p::FISTAParams, restart::Bool=false; verbose::Bool=false) + +Update the step for acceleration and return the acceleration factor. +If `restart`, reset to 1. 
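+
+The acceleration factor follows the accelerated proximal-gradient sequence of
+Tseng (2008), as implemented below: `tkp1 = (√(tk^4 + 4 * tk^2) - tk^2) / 2` and
+`mk = tkp1 * (1 - tk) / tk`.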
+""" +function update_accelstep!(p::FISTAParams, restart::Bool=false; verbose::Bool=false) + if restart + verbose && println("Restart after $(p.kk) FISTA iterations") + # reset step and iteration number + p.tk = 1 + p.kk = 1 + return zero(p.tk) + end + + tk = p.tk + # tkp1 = (1 + √(1 + 4 * p.tk^2)) / 2 + # mk = (p.tk - 1) / tkp1 + + # mk = (k - 1) / (k + 2) + + # Tseng (2008) - On accelerated proximal gradient methods for convex-concave optimization + tkp1 = (√(tk^4 + 4 * tk^2) - tk^2) / 2 + mk = tkp1 * (1 - tk) / tk + # Update the acceleration step + p.tk = tkp1 + # Update the iteration number + p.kk += 1 + return mk +end + + +""" + FISTARegPred + +Regularization using Fast Iterative Shrinkage-Thresholding Algorithm. + +# Members + +- `X`: model matrix +- `beta0`: base vector for coefficients +- `delbeta`: coefficients increment +- `Σ`: Gram matrix and temporary matrix +- `penalty`: the penalty function. +- `scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method. +""" +struct FISTARegPred{T<:BlasReal,M<:AbstractMatrix{T},V<:Vector{T},P<:PenaltyFunction} <: + AbstractRegularizedPred{T} + "`X`: model matrix" + X::M + "`beta0`: base vector for coefficients" + beta0::V + "`delbeta`: coefficients increment" + delbeta::V + "`Σ`: Corresponds to X'WX." + Σ::M + "`penalty`: penalty function" + penalty::P + "`scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method" + scratchbeta::V + "`wXt`: transpose of the (weighted) model matrix" + wXt::M + "`params`: parameters for FISTA" + params::FISTAParams + "pre-allocated temporary vectors" + gradβ::V + proxβ::V + stepβ::V + + function FISTARegPred( + X::M, + wXt::AbstractMatrix{<:Real}, + Σ::AbstractMatrix{<:Real}, + penalty::P, + restart::Bool=true, + use_backtracking::Bool=false, + ) where {M<:AbstractMatrix{T},P<:PenaltyFunction} where {T<:BlasReal} + m = size(Σ, 1) + + params = FISTAParams{T}(restart, use_backtracking) + + beta0 = zeros(T, m) + return new{T,M,typeof(beta0),P}( + X, + beta0, + zeros(T, m), + Σ, + penalty, + zeros(T, m), + wXt, + params, + zeros(T, m), + zeros(T, m), + zeros(T, m), + ) + end +end + +function FISTARegPred( + X::M, penalty::P, wts::V, restart::Bool=true, use_backtracking::Bool=false +) where {M<:AbstractMatrix{T},V<:AbstractVector,P<:PenaltyFunction} where {T<:BlasReal} + n, m = size(X) + ll = size(wts, 1) + ll in (0, n) || throw(DimensionMismatch("length of wts is $ll, must be 0 or $n.")) + + wXt = isempty(wts) ? X' : (X .* wts)' + Σ = Hermitian(float(wXt * X)) + + return FISTARegPred(X, wXt, Σ, penalty, restart, use_backtracking) +end + +function initpred!(p::FISTARegPred, wts::AbstractVector=[], σ::Real=1; verbose::Bool=false) + copyto!(p.proxβ, p.beta0) + copyto!(p.stepβ, p.beta0) + + # for Normal errors + # sk = 1 / eigmax(Σ) + # TODO: `eigen` methods not defined for sparse arrays, use Arpack.jl? 
+    # sk = 1 / eigmax(Matrix(Σ))
+    sk = 1 / eigmax(Matrix(p.Σ))
+    initparams!(p.params, sk)
+    return p
+end
+
+function update_beta!(
+    p::FISTARegPred{T},
+    y::AbstractVector{T},
+    wts::AbstractVector{T}=T[],
+    σ2::T=one(T);
+    verbose::Bool=false,
+) where {T<:BlasReal}
+    β = p.beta0
+    dβ = p.delbeta
+    Σ = p.Σ
+    wXt = p.wXt
+    gradβ = p.gradβ
+    stepβ = p.stepβ
+    proxβ = p.proxβ
+    scratchbeta = p.scratchbeta
+
+    sk = p.params.sk
+    restart = p.params.restart
+
+
+    # ∇_β̂(f) = (-1/σ^2) * ((W .* X)' * wrky - Σ * β̂)
+    # ∇_β̂(f) = Σ * β̂ - ((W .* X)' * wrky)
+    # For Normal errors
+    mul!(gradβ, wXt, y)
+    mul!(scratchbeta, Σ, stepβ) # dumb variable scratchbeta
+    # rdiv!(broadcast!(-, gradβ, gradβ, scratchbeta), -σ2)
+    broadcast!(-, gradβ, scratchbeta, gradβ)
+
+    # broadcast!(-, yres, broadcast!(-, yres, p.y, mul!(yres, p.X, stepβ)), p.outliers)
+    # rdiv!(mul!(gradβ, wXt, yres), -p.σ)
+    @. dβ = stepβ - sk * gradβ
+    proximal!(penalty(p), proxβ, dβ, sk * σ2)
+    verbose && println("current coefs:\n\t$(β)")
+    verbose && println("gradient:\n\t$(sk * gradβ)")
+    verbose && println("new coefs:\n\t$(proxβ)")
+
+    # find step-size
+    if p.params.use_backtracking
+        Im = I(size(β, 1))
+        bt_delta = p.params.bt_delta
+        bt_maxiter = p.params.bt_maxiter
+        # backtracking
+        # x = βstep
+        # f(β) = 1/2 ||y - Xβ - γ||²
+        # Δ = F(pr) - Q(pr, x, Li)
+        # Δ = f(pr) - f(x) - dot(pr - x, df_x) - Lk/2 * sum(abs2, pr - x)
+        broadcast!(-, scratchbeta, proxβ, stepβ)
+        # for Normal errors
+        # Needs julia > v1.4
+        Δ = dot(scratchbeta, sk * Σ - Im, scratchbeta)
+        verbose && println("backtracking: decrease gradient step if 0 < $(Δ)")
+        jj = 0
+        while Δ > 0 && jj < bt_maxiter
+            # Update the gradient step
+            p.params.sk = sk = bt_delta * sk
+            @. dβ = stepβ - sk * gradβ
+            proximal!(penalty(p), proxβ, dβ, sk * σ2)
+            broadcast!(-, scratchbeta, proxβ, stepβ)
+            # for Normal errors
+            Δ = dot(scratchbeta, sk * Σ - Im, scratchbeta)
+            verbose && println("backtracking step $jj: decrease FISTA step $sk")
+            jj += 1
+        end
+        Δ > 0 && throw(ConvergenceException(bt_maxiter))
+    end
+
+
+    # FISTA step
+    c = dot(gradβ, proxβ) - dot(gradβ, β)
+    if !restart || c <= 0
+        # Accelerated step
+        mk = update_accelstep!(p.params, false; verbose=verbose)
+        @. stepβ = proxβ + mk * (proxβ - β)
+    else
+        # Restart
+        update_accelstep!(p.params, true; verbose=verbose)
+        @. stepβ = β
+    end
+
+    # update variables
+    copyto!(β, proxβ)
+
+    return p
+end
+
+
+##############################
+### AMA predictor
+##############################
+
+mutable struct AMAParams{T<:AbstractFloat}
+    "`restart`: allow restart"
+    restart::Bool
+    "`η`: η for restart criteria"
+    η::T
+    "`ρ`: AMA step parameter"
+    ρ::T
+    "`tk`: AMA acceleration step"
+    tk::T
+    "`ck`: AMA restart criterium"
+    ck::T
+    "`kk`: AMA number of iterations since last restart"
+    kk::Int
+
+    function AMAParams{T}(restart::Bool, η::Real=1.0) where {T<:AbstractFloat}
+        return new{T}(restart, η, one(T), one(T), zero(T), 1)
+    end
+end
+
+function initparams!(p::AMAParams, ρ::Real=1)
+    p.ρ = ρ
+    p.tk = 1
+    p.ck = 0
+    p.kk = 1
+    return p
+end
+
+"""
+    update_accelstep!(p::AMAParams, restart::Bool=false; verbose::Bool=false)
+
+Update the step for acceleration and return the acceleration factor.
+If `restart`, reset to 1.
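+
+The acceleration factor follows the Nesterov momentum sequence used by Fast AMA
+(Goldstein 2014): `tkp1 = (1 + √(1 + 4 * tk^2)) / 2` and `mk = (tk - 1) / tkp1`.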
+""" +function update_accelstep!(p::AMAParams, restart::Bool=false; verbose::Bool=false) + if restart + verbose && println("Restart after $(p.kk) AMA iterations") + # reset step and iteration number + p.tk = 1 + p.kk = 1 + return zero(p.tk) + end + + tk = p.tk + tkp1 = (1 + √(1 + 4 * p.tk^2)) / 2 + mk = (p.tk - 1) / tkp1 + p.tk = tkp1 + # update iteration number + p.kk += 1 + return mk +end + + +struct AMARegPred{ + T<:BlasReal, + M<:AbstractMatrix{T}, + V<:Vector{T}, +# C, + P<:PenaltyFunction, + M2<:AbstractMatrix{T}, + V2<:AbstractVector{T}, +} <: AbstractRegularizedPred{T} + "`X`: model matrix" + X::M + "`beta0`: base vector for coefficients" + beta0::V + "`delbeta`: coefficients increment" + delbeta::V + "`Σ`: Corresponds to X'WX." + Σ::M + "`penalty`: penalty function" + penalty::P + "`scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method" + scratchbeta::V + "`penbeta0`: vector of length `p`, used in [`penalized_coef`](@ref) method" + penbeta0::V +# "`chol`: cholesky factorization" +# chol::C + "`wXt`: transpose of the (weighted) model matrix" + wXt::M + "`A`: matrix of the constraint equation `A . u - b - v = 0`" + A::M2 + "`b`: vector of the constraint equation `A . u - b - v = 0`" + b::V2 + "`params`: parameters for AMA" + params::AMAParams + "pre-allocated temporary vectors" + vk::V + vkp1::V + wk::V + wkp1::V + whatk::V + whatkp1::V + + function AMARegPred( + X::M, + wXt::AbstractMatrix{<:Real}, + Σ::AbstractMatrix{<:Real}, + penalty::P, + A::AbstractMatrix{<:Real}=float(I(size(Σ, 1))), + b::AbstractVector{<:Real}=zeros(T, size(Σ, 1)), + restart::Bool=true, + ) where {M<:AbstractMatrix{T},P<:PenaltyFunction} where {T<:BlasReal} + m = size(Σ, 1) + +# chol = cholesky(Σ) + bhat = A * b + params = AMAParams{T}(restart) + + beta0 = zeros(T, m) +# return new{T,M,typeof(beta0),typeof(chol),P,typeof(A),typeof(b)}( + return new{T,M,typeof(beta0),P,typeof(A),typeof(b)}( + X, + beta0, + zeros(T, m), + Σ, + penalty, + zeros(T, m), + zeros(T, m), +# chol, + wXt, + A, + bhat, + params, + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + ) + end +end + +function AMARegPred( + X::M, + penalty::P, + wts::V, + A::AbstractMatrix{<:Real}=zeros(T, 0, 0), + b::AbstractVector{<:Real}=zeros(T, 0), + restart::Bool=true, +) where { + M<:AbstractMatrix{T},V<:AbstractVector{<:Real},P<:PenaltyFunction +} where {T<:BlasReal} + n, m = size(X) + ll = size(wts, 1) + ll in (0, n) || throw(DimensionMismatch("length of wts is $ll, must be 0 or $n.")) + + wXt = isempty(wts) ? 
X' : (X .* wts)' + Σ = Hermitian(float(wXt * X)) + + if isempty(A) + A = float(I(m)) + end + size(A) == (m, m) || + throw(DimensionMismatch("size of A is $(size(A)), must be (0, 0) or $((m, m)).")) + if isempty(b) + b = zeros(T, m) + end + size(b, 1) == m || + throw(DimensionMismatch("size of b is $(size(b, 1)), must be 0 or $(m).")) + + return AMARegPred(X, wXt, Σ, penalty, A, b, restart) +end + +penalized_coef(p::AMARegPred) = mul!(copyto!(p.penbeta0, p.b), p.A, p.beta0, 1, -1) + +function initpred!(p::AMARegPred, wts::AbstractVector=[], σ::Real=1; verbose::Bool=false) + copyto!(p.delbeta, p.beta0) + copyto!(p.scratchbeta, p.beta0) + copyto!(p.vk, p.beta0) + copyto!(p.vkp1, p.beta0) + copyto!(p.wk, p.beta0) + copyto!(p.wkp1, p.beta0) + copyto!(p.whatk, p.beta0) + copyto!(p.whatkp1, p.beta0) + + # Goldstein (2014) - Fast Alternating Direction Optimization Methods + # Theorem 5: ρ <= σH / ρ(A'A) + # ρ = eigmin(Σ) / eigmax(A'A) + # TODO: `eigen` methods not defined for sparse arrays, use Arpack.jl? + # ρ = eigmin(Matrix(Σ)) / eigmax(Matrix(A'A)) + ρ = eigmin(Matrix(p.Σ)) / eigmax(Matrix(p.A' * p.A)) + initparams!(p.params, abs(ρ)) + + return p +end + +function update_beta!( + p::AMARegPred{T}, + y::AbstractVector{T}, + wts::AbstractVector{T}=T[], + σ2::T=one(T); + verbose::Bool=false, +) where {T<:BlasReal} + β = p.beta0 + wXt = p.wXt + βkp1 = p.delbeta + scratch = p.scratchbeta +# chol = p.chol + A = p.A + b = p.b + + vk = p.vk + vkp1 = p.vkp1 + wk = p.wk + wkp1 = p.wkp1 + whatk = p.whatk + whatkp1 = p.whatkp1 + + ρ = p.params.ρ + ck = p.params.ck + restart = p.params.restart + η = p.params.η + + verbose && println("AMA current coefs:\n\t$(β)") + + # Goldstein (2014) - Fast Alternating Minimization Algorithm + # Lρ(u, v, w) = 1/2σ² |y - X u|² + P(v) + ρ w' * (A u - b - v) + ρ/2 |Au - b - v|² + # uk+1 = (X'WX) \ (X'Wy - ρ σ² A' wk) # argmin_u Lρ(u, vk, wk) without last term + # vk+1 = prox_λ/ρ ( wk + A uk+1 - b ) # argmin_v Lρ(uk+1, v, wk) + # wk+1 = wk + A uk+1 - b - vk+1 + # + # primal residual: rk = A uk - b - vk + # dual residual: dk = ρ * (vk+1 - vk) + # + # βkp1 = chol \ (wXt * wrky - ρ * σ2 * A' * whatk) + scratch = mul!(scratch, A', whatk) + scratch = mul!(scratch, wXt, y, 1, -ρ * σ2) +# βkp1 = chol \ scratch + cg!(βkp1, p.Σ, scratch) + # vkp1 = proximal(penalty(p), A * βkp1 - b + whatk, 1/ρ) + scratch = mul!(copyto!(scratch, b), A, βkp1, 1, -1) + broadcast!(+, scratch, scratch, whatk) + proximal!(penalty(p), vkp1, scratch, 1 / ρ) + # wkp1 = whatk + A * βkp1 - b - vkp1 + broadcast!(-, wkp1, scratch, vkp1) + + ckp1 = sum(abs2, wkp1 - whatk) + if !restart || ck == 0 || ckp1 < η * ck + # Accelerated step + mk = update_accelstep!(p.params, false; verbose=verbose) + @. whatkp1 = wkp1 + mk * (wkp1 - wk) + else + # Restart + update_accelstep!(p.params, true; verbose=verbose) + @. 
whatkp1 = wk + ckp1 = ck / η + end + + # update variables + p.params.ck = ckp1 + copyto!(β, βkp1) + copyto!(vk, vkp1) + copyto!(wk, wkp1) + copyto!(whatk, whatkp1) + + return p +end + + + +############################## +### ADMM predictor +############################## + +mutable struct ADMMParams{T<:AbstractFloat} + "`restart`: allow restart" + restart::Bool + "`η`: η for restart criteria" + η::T + "`adapt`: allow adaptation of ρ" + adapt::Bool + "`adapt_τ`: parameter τ for ρ-adaptation" + adapt_τ::T + "`adapt_logμ`: (log of the) parameter μ for ρ-adaptation" + adapt_logμ::T + "`ρ`: ADMM step parameter" + ρ::T + "`tk`: ADMM acceleration step" + tk::T + "`ck`: ADMM restart criterium" + ck::T + "`kk`: ADMM number of iterations since last restart" + kk::Int + "`updatedρ`: flag for updated ρ" + updatedρ::Bool + + function ADMMParams{T}( + restart::Bool, adapt::Bool, η::Real=0.999, adapt_τ::Real=2.0, adapt_μ::Real=10.0 + ) where {T<:AbstractFloat} + return new{T}( + restart, η, adapt, adapt_τ, log(adapt_μ), one(T), one(T), zero(T), 1, false + ) + end +end + +function initparams!(p::ADMMParams, ρ::Real=1) + p.ρ = ρ + p.tk = 1 + p.ck = 0 + p.kk = 1 + p.updatedρ = false + return p +end + +""" + update_accelstep!(p::ADMMParams, restart::Bool=false; verbose::Bool=false) + +Update the step for acceleration and return the acceleration factor. +If `restart`, reset to 1. +""" +function update_accelstep!(p::ADMMParams, restart::Bool=false; verbose::Bool=false) + if restart + verbose && println("Restart after $(p.kk) ADMM iterations") + # reset step and iteration number + p.tk = 1 + p.kk = 1 + return zero(p.tk) + end + + tk = p.tk + tkp1 = (1 + √(1 + 4 * p.tk^2)) / 2 + mk = (p.tk - 1) / tkp1 + p.tk = tkp1 + # update iteration number + p.kk += 1 + return mk +end + +""" + update_adapt!(p::ADMMParams, logr::Real; verbose::Bool=false) + +If `adapt`, update the step ρ. `logr`= log(rrk / ssk), so the criteria become: + - `rrk > ssk * adapt_μ` ==> `logr > adapt_logμ` + - `rrk < ssk / adapt_μ` ==> `logr < -adapt_logμ` +Returns the step ρ. +""" +function update_adapt!(p::ADMMParams, logr::Real; verbose::Bool=false) + if !p.adapt + return p.ρ + end + ρ = p.ρ + if logr > p.adapt_logμ + ρ *= p.adapt_τ + elseif logr < -p.adapt_logμ + ρ /= p.adapt_τ + end + if p.ρ != ρ + verbose && println("ADMM adapt ρ: $ρ") + p.updatedρ = true + end + p.ρ = ρ + return ρ +end + + +struct ADMMRegPred{ + T<:BlasReal, + M<:AbstractMatrix{T}, + V<:Vector{T}, +# C, + P<:PenaltyFunction, + M2<:AbstractMatrix{T}, + V2<:AbstractVector{T}, +} <: AbstractRegularizedPred{T} + "`X`: model matrix" + X::M + "`beta0`: base vector for coefficients" + beta0::V + "`delbeta`: coefficients increment" + delbeta::V + "`Σ`: Corresponds to X'WX." + Σ::M + "`penalty`: penalty function" + penalty::P + "`scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method" + scratchbeta::V + "`penbeta0`: vector of length `p`, used in [`penalized_coef`](@ref) method" + penbeta0::V +# "`chol`: cholesky factorization" +# chol::C + "`wXt`: transpose of the (weighted) model matrix" + wXt::M + "`A`: matrix of the constraint equation `A . u - b - v = 0`" + A::M2 + "`b`: vector of the constraint equation `A . 
u - b - v = 0`" + b::V2 + "`params`: parameters for ADMM" + params::ADMMParams + "pre-allocated temporary vectors" + vk::V + vkp1::V + vhatk::V + vhatkp1::V + wk::V + wkp1::V + whatk::V + whatkp1::V + wrkΣ::M + + function ADMMRegPred( + X::M, + wXt::AbstractMatrix{<:Real}, + Σ::AbstractMatrix{<:Real}, + penalty::P, + A::AbstractMatrix{<:Real}=float(I(size(Σ, 1))), + b::AbstractVector{<:Real}=zeros(T, size(Σ, 1)), + restart::Bool=true, + adapt::Bool=false, + ) where {M<:AbstractMatrix{T},P<:PenaltyFunction} where {T<:BlasReal} + m = size(Σ, 1) + +# chol = cholesky(sparse(Σ)) + bhat = A * b + params = ADMMParams{T}(restart, adapt) + + beta0 = zeros(T, m) +# return new{T,M,typeof(beta0),typeof(chol),P,typeof(A),typeof(b)}( + return new{T,M,typeof(beta0),P,typeof(A),typeof(b)}( + X, + beta0, + zeros(T, m), + Σ, + penalty, + zeros(T, m), + zeros(T, m), +# chol, + wXt, + A, + bhat, + params, + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + similar(Σ), + ) + end +end + +function ADMMRegPred( + X::M, + penalty::P, + wts::V, + A::AbstractMatrix{<:Real}=zeros(T, 0, 0), + b::AbstractVector{<:Real}=zeros(T, 0), + restart::Bool=true, + adapt::Bool=false, +) where { + M<:AbstractMatrix{T},V<:AbstractVector{<:Real},P<:PenaltyFunction +} where {T<:BlasReal} + n, m = size(X) + ll = size(wts, 1) + ll in (0, n) || throw(DimensionMismatch("length of wts is $ll, must be 0 or $n.")) + + wXt = isempty(wts) ? X' : (X .* wts)' + Σ = Hermitian(float(wXt * X)) + + if isempty(A) + A = float(I(m)) + end + size(A) == (m, m) || + throw(DimensionMismatch("size of A is $(size(A)), must be (0, 0) or $((m, m)).")) + if isempty(b) + b = zeros(T, m) + end + size(b, 1) == m || + throw(DimensionMismatch("size of b is $(size(b, 1)), must be 0 or $(m).")) + + return ADMMRegPred(X, wXt, Σ, penalty, A, b, restart, adapt) +end + +penalized_coef(p::ADMMRegPred) = mul!(copyto!(p.penbeta0, p.b), p.A, p.beta0, 1, -1) + +function initpred!(p::ADMMRegPred, wts::AbstractVector=[], σ::Real=1; verbose::Bool=false) + ## Init temporary arrays + copyto!(p.scratchbeta, p.beta0) + copyto!(p.vk, p.beta0) + copyto!(p.vkp1, p.beta0) + copyto!(p.vhatk, p.beta0) + copyto!(p.vhatkp1, p.beta0) + copyto!(p.wk, p.beta0) + copyto!(p.wkp1, p.beta0) + copyto!(p.whatk, p.beta0) + copyto!(p.whatkp1, p.beta0) + + ## Init parameters + # Goldstein (2014) - Fast Alternating Direction Optimization Methods + # Theorem 2: ρ^3 <= (σH σG^2) / (ρ(A'A) * ρ(B'B)^2) + # σK = eigmin(Σ) / eigmax(A'A) + # TODO: `eigen` methods not defined for sparse arrays, use Arpack.jl? 
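+    # With the default constraint (A = I, b = 0), `eigmax(A'A) == 1`, so σK reduces to
+    # the smallest eigenvalue of Σ and the initial step is ρ = σK^(1/3)
+    # (Goldstein 2014, Theorem 2).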
+ # σK = eigmin(Matrix(Σ)) / eigmax(Matrix(A'A)) + # ρ = σK^(1/3) + σK = eigmin(Matrix(p.Σ)) / eigmax(Matrix(p.A' * p.A)) + initparams!(p.params, abs(σK)^(1 / 3)) + + # Generate the extended Hessian matrix + updatepred!(p, σ; verbose=verbose, force=true) + + return p +end + + +function update_beta!( + p::ADMMRegPred{T}, + y::AbstractVector{T}, + wts::AbstractVector{T}=T[], + σ2::T=one(T); + verbose::Bool=false, +) where {T<:BlasReal} + β = p.beta0 + wXt = p.wXt + βkp1 = p.delbeta + scratch = p.scratchbeta + wrkΣ = p.wrkΣ + A = p.A + b = p.b + + vk = p.vk + vkp1 = p.vkp1 + vhatk = p.vhatk + vhatkp1 = p.vhatkp1 + wk = p.wk + wkp1 = p.wkp1 + whatk = p.whatk + whatkp1 = p.whatkp1 + + ρ = p.params.ρ + ck = p.params.ck + restart = p.params.restart + η = p.params.η + + verbose && println("ADMM current coefs:\n\t$(β)") + + # Goldstein (2014) - Fast Alternating Direction Optimization Methods + # Lρ(u, v, w) = 1/2σ² |y - X u|² + P(v) + ρ/2 |A u - b - v + w|² - ρ/2 |w|² + # uk+1 = (X'WX + ρ σ² A'A) \ (X'Wy + ρ σ² A' (b + vk - wk)) + # vk+1 = prox_λ/ρ ( A uk+1 - b + wk ) + # wk+1 = wk + A uk+1 - b - vk+1 + # + # primal residual: rk = A uk - b - vk + # dual residual: dk = ρ * (vk+1 - vk) + # + # βkp1 = facΣρ \ (wXt * wrky + ρ * σ2 * A' * (b + vhatk - whatk)) + broadcast!(+, βkp1, broadcast!(-, βkp1, vhatk, whatk), b) # dumb βkp1 + mul!(scratch, A', βkp1) # dumb βkp1 + mul!(scratch, wXt, y, 1, ρ * σ2) +# βkp1 = facΣρ \ scratch + cg!(βkp1, wrkΣ, scratch) + # proximal!(penalty(p), vkp1, A * βkp1 - b + whatk, 1/ρ) + scratch = mul!(copyto!(scratch, b), A, βkp1, 1, -1) + broadcast!(+, scratch, scratch, whatk) + proximal!(penalty(p), vkp1, scratch, 1 / ρ) + # wkp1 = whatk + A * βkp1 - b - vkp1 + broadcast!(-, wkp1, scratch, vkp1) + + rrk = sum(abs2, wkp1 - whatk) + ssk = ρ * sum(abs2, vkp1 - vhatk) + ckp1 = rrk + ssk + # verbose && println((; rrk, ssk)) + # verbose && println("criteria: $(ckp1) < $(η) * $(ck)") + if !restart || ck == 0 || ckp1 < η * ck + # Accelerated step + mk = update_accelstep!(p.params, false; verbose=verbose) + @. vhatkp1 = vkp1 + mk * (vkp1 - vk) + @. whatkp1 = wkp1 + mk * (wkp1 - wk) + else + # Restart + update_accelstep!(p.params, true; verbose=verbose) + @. vhatkp1 = vk + @. 
whatkp1 = wk + ckp1 = ck / η + end + + # update variables + p.params.ck = ckp1 + copyto!(β, βkp1) + copyto!(vk, vkp1) + copyto!(vhatk, vhatkp1) + copyto!(wk, wkp1) + copyto!(whatk, whatkp1) + + # adapt ρ + update_adapt!(p.params, log(rrk / ssk); verbose=verbose) + + return p +end + +function updatepred!(p::ADMMRegPred, σ::Real; verbose::Bool=false, force::Bool=false) + if (p.params.adapt && p.params.updatedρ) || force + Σ = p.Σ + A = p.A + ρ = p.params.ρ + + # Refactorize + # TODO: should be improved +# cholesky!(p.chol, sparse(Σ + ρ * σ^2 * Hermitian(A'A))) +# p.wrkΣ = Hermitian(p.Σ + σ2 * ρ * A'A) + copyto!(p.wrkΣ, p.Σ) + mul!(p.wrkΣ, A', A, ρ * σ^2, 1) + copyto!(p.wrkΣ, Hermitian(p.wrkΣ)) + p.params.updatedρ = false + verbose && println("Recompute cholesky factorization") + end + return p +end diff --git a/src/robustlinearmodel.jl b/src/robustlinearmodel.jl index 77385f3..daf70ee 100644 --- a/src/robustlinearmodel.jl +++ b/src/robustlinearmodel.jl @@ -66,6 +66,10 @@ function StatsAPI.confint(m::AbstractRobustModel; level::Real=0.95) return hcat(cc, cc) + alpha * se * hcat(-1.0, 1.0) end +haspenalty(r::AbstractRobustModel) = false + +penalty(r::AbstractRobustModel) = nothing + ## TODO: specialize to make it faster StatsAPI.leverage(m::AbstractRobustModel) = diag(projectionmatrix(m)) @@ -1499,3 +1503,296 @@ function resampling_best_estimate( verbose && println("Best subsample: β=$(βls[:, N])\tσ=$(σls[N])") return σls[N], βls[:, N] end + + +######################################## +### Penalized Least Squares +######################################## + +haspenalty(r::RobustLinearModel{T,R,P}) where {T,R,P<:AbstractRegularizedPred} = true + +penalty(r::RobustLinearModel{T,R,P}) where {T,R,P<:AbstractRegularizedPred} = r.pred.penalty + +function StatsAPI.dof(r::RobustLinearModel{T,R,P}) where {T,R,P<:AbstractRegularizedPred} + return min(wobs(r), dof(penalty(r), coef(r))) +end + + + +""" + fit(::Type{M}, + X::AbstractMatrix{T}, + y::AbstractVector{T}, + penalty::PenaltyFunction; + method::Symbol = :auto, # :cgd, :fista, :ama, :admm + dofit::Bool = true, + wts::FPVector = similar(y, 0), + offset::FPVector = similar(y, 0), + fitdispersion::Bool = false, + initial_scale::Union{Symbol, Real}=:mad, + σ0::Union{Symbol, Real}=initial_scale, + initial_coef::AbstractVector=[], + β0::AbstractVector=initial_coef, + correct_leverage::Bool=false + fitargs...) where {M<:RobustLinearModel, T<:AbstractFloat} + +Create a robust model with the model matrix (or formula) X and response vector (or dataframe) y, +using L2Loss with a Penalty on the coefficients. + + +# Arguments + +- `X`: the model matrix (it can be dense or sparse) or a formula +- `y`: the response vector or a table (dataframe, namedtuple, ...). +- `penalty`: a penalty function for the coefficients. + +# Keywords + +- `method::Symbol = :auto`: the method used to solve the linear system with penalty, + - Coordinate Gradient Descent: `:cgd` (default) + - Fast Iterative Shrinkage-Thresholding Algorithm: `:fista` + - Alternating Minimization Algorithm: `:ama` + - Alternating Direction Method of Multipliers: `:admm` + Use :auto to select the default method. +- `dofit::Bool = true`: if false, return the model object without fitting; +- `dropmissing::Bool = false`: if true, drop the rows with missing values (and convert to + Non-Missing type). With `dropmissing=true` the number of observations may be smaller than + the size of the input arrays; +- `wts::Vector = similar(y, 0)`: Prior probability weights of observations. 
+ Can be empty (length 0) if no weights are used (default); +- `offset::Vector = similar(y, 0)`: A vector of offsets to apply to the mean response. + Can be empty (length 0) if no offset is set (default); +- `fitdispersion::Bool = false`: reevaluate the dispersion; +- `contrasts::AbstractDict{Symbol,Any} = Dict{Symbol,Any}()`: a `Dict` mapping term names + (as `Symbol`s) to term types (e.g. `ContinuousTerm`) or contrasts (e.g., `HelmertCoding()`, + `SeqDiffCoding(; levels=["a", "b", "c"])`, etc.). If contrasts are not provided for a variable, + the appropriate term type will be guessed based on the data type from the data column: + any numeric data is assumed to be continuous, and any non-numeric data is assumed to be + categorical (with `DummyCoding()` as the default contrast type); +- `initial_scale::Union{Symbol, Real}=:mad`: the initial scale estimate, for non-convex estimator + it helps to find the global minimum. + Automatic computation using `:mad`, `:L1` or `:extrema` (non-robust). +- `σ0::Union{Nothing, Symbol, Real}=initial_scale`: alias of `initial_scale`; +- `initial_coef::AbstractVector=[]`: the initial coefficients estimate, for non-convex estimator + it helps to find the global minimum. +- `β0::AbstractVector=initial_coef`: alias of `initial_coef`; +- `penalty_omit_intercept::Bool=true`: if true, do not penalize the intercept, + otherwise use the penalty on all the coefficients; +- `fitargs...`: other keyword arguments used to control the convergence of the IRLS algorithm + (see [`pirls!`](@ref)). + +# Output + +the RobustLinearModel object. + +""" +function StatsAPI.fit( + ::Type{M}, + X::AbstractMatrix{T}, + y::AbstractVector{T}, + penalty::PenaltyFunction; + method::Symbol=:auto, # :cgd, :fista, :ama, :admm + dofit::Bool=true, + dropmissing::Bool=false, # placeholder + initial_scale::Union{Nothing,Symbol,Real}=:mad, + σ0::Union{Nothing,Symbol,Real}=initial_scale, + wts::FPVector=similar(y, 0), + offset::FPVector=similar(y, 0), + fitdispersion::Bool=false, + dropcollinear::Bool=false, + contrasts::AbstractDict{Symbol,Any}=Dict{Symbol,Any}(), # placeholder + __formula::Union{Nothing,FormulaTerm}=nothing, + use_backtracking::Bool=false, + A::AbstractMatrix{<:Real}=zeros(T, 0, 0), + b::AbstractVector{<:Real}=zeros(T, 0), + restart::Bool=true, + adapt::Bool=false, + outerloop::Bool=false, + penalty_omit_intercept::Bool=true, + fitargs..., +) where {M<:RobustLinearModel,T<:AbstractFloat} + + # Check that X and y have the same number of observations + n, p = size(X) + if n != size(y, 1) + throw(DimensionMismatch("number of rows in X and y must match")) + end + + # Response object + est = MEstimator{L2Loss}() + resp = RobustLinResp(est, y, offset, T[]) + + # Penalty + new_penalty = try + intercept_col = penalty_omit_intercept ? 
get_intercept_col(X, __formula) : nothing + concrete(penalty, p, intercept_col) + catch + error("penalty is not compatible with coefficients size $p: $(penalty)") + end + + # Predictor object + pen_methods = (:auto, :cgd, :fista, :ama, :admm) + if method ∉ pen_methods + @warn( + "Incorrect method `:$(method)` with a penalty, should be one of $(pen_methods)" + ) + method = :auto + end + + pred = if method === :fista + FISTARegPred(X, new_penalty, wts, restart, use_backtracking) + elseif method === :ama + AMARegPred(X, new_penalty, wts, A, b, restart) + elseif method === :admm + ADMMRegPred(X, new_penalty, wts, A, b, restart, adapt) + elseif method in (:cgd, :auto) + CGDRegPred(X, new_penalty, wts, outerloop) + else + error("with penalty, method :$method is not allowed, should be in: $pen_methods") + end + + # RobustLinearModel + m = RobustLinearModel(resp, pred, __formula, fitdispersion, false) + + # Update fields + fitargs = update_fields!(m; σ0=σ0, fitargs...) + + return dofit ? fit!(m; fitargs...) : m +end + + +function StatsAPI.fit!( + m::RobustLinearModel{T,R,P}; + correct_leverage::Bool=false, + updatescale::Bool=false, + maxiter::Integer=(P<:CGDRegPred) ? 10_000 : (P<:FISTARegPred) ? 1_000 : 100, + minstepfac::Real=1e-3, + atol::Real=1e-8, + rtol::Real=1e-7, + verbose::Bool=false, +) where {T,R,P<:AbstractRegularizedPred} + # Return early if model has the fit flag set + m.fitted && return m + + verbose && println("\nFit robust model with penalty: $(penalty(m))") + + # Initialize the response and predictors + init!(m; verbose=verbose) + + # convergence criteria + devold = dev_criteria(m) + dev = devold + Δdev = zero(T) + + ### Loop until convergence + cvg = false + for i in 1:maxiter + ## Update coefficients + setη!(m; verbose=verbose) + dev = dev_criteria(m) + Δdev = abs(dev - devold) + + ## Postupdate (only for ADMM) + updatepred!(m.pred, scale(m); verbose=verbose) + + # Test for convergence + verbose && println("Iteration: $i, Δdev: $(Δdev)") + tol = max(rtol * abs(devold), atol) + if Δdev < tol || dev < atol + cvg = true + break + end + @assert isfinite(dev) + devold = dev + end + cvg || throw(ConvergenceException(maxiter)) + m.fitted = true + return m +end + +function StatsAPI.fit!( + m::RobustLinearModel{T,R,P}; correct_leverage::Bool=false, kwargs... +) where {T,R,P<:RidgePred} + + # Return early if model has the fit flag set + m.fitted && return m + + # Compute the initial values + σ0, β0 = scale(m), coef(m) + + if correct_leverage + wts = m.resp.wts + copy!(wts, leverage_weights(m, :original)) + end + + # Get type + V = typeof(m.resp.est) + + _fit!(m, V; σ0=σ0, β0=β0, kwargs...) 
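+    # Only flag the model as fitted once the internal `_fit!` routine has returned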
+ + m.fitted = true + return m +end + +function dev_criteria(m::RobustLinearModel) + if !haspenalty(m) + # this criteria is not used with no penalty + return 0 + end + l = deviance(m.resp) + l += dev_criteria(m.pred) + return l +end + +function init!( + m::RobustLinearModel{T,R,P}; verbose::Bool=false +) where {T,R,P<:AbstractRegularizedPred} + + resp = m.resp + pred = m.pred + + # reset ∇β + copyto!(pred.delbeta, pred.beta0) + + # res = y - Xβ + mul!(resp.μ, pred.X, pred.beta0) + initresp!(resp) + + # Initialize the predictor + initpred!(pred, weights(m), scale(m); verbose=verbose) + + return m +end + +function setη!( + m::RobustLinearModel{T,R,P}, linesearch_f::Real=1; verbose::Bool=false +) where {T,R,P<:AbstractRegularizedPred} + # TODO: add line search + wts = weights(m) + + μ = m.resp.μ + σ2 = m.resp.σ^2 + y = m.resp.y + + if m.pred isa CGDRegPred + res = m.resp.wrkres + devresid = m.resp.devresid + + # Update coefs, μ and residuals + update_βμ!(m.pred, y, μ, res, σ2, wts; verbose=verbose) + + # Update deviance with the new residuals + map!(abs2, devresid, res) + else + # Update coefs + update_beta!(m.pred, y, wts, σ2; verbose=verbose) + + # Update linear predictor + mul!(μ, m.pred.X, m.pred.beta0) + + # Update residuals with the new μ + updateres!(m.resp; updatescale=false) + end + + return m +end diff --git a/test/data/Animals2.jl b/test/data/Animals2.jl index 0b09765..f27c59b 100644 --- a/test/data/Animals2.jl +++ b/test/data/Animals2.jl @@ -215,3 +215,4 @@ y = data.logBrain nt = (; logBrain=data.logBrain, logBody=data.logBody) data_tuples = ((form, data), (form, nt), (X, y), (sX, y)) +; diff --git a/test/data/starsCYG.jl b/test/data/starsCYG.jl new file mode 100644 index 0000000..1a31177 --- /dev/null +++ b/test/data/starsCYG.jl @@ -0,0 +1,115 @@ + +# Import data +#using RDatasets: dataset +#data = dataset("robustbase", "starsCYG") + +logTemp = [ + 4.37, + 4.56, + 4.26, + 4.56, + 4.30, + 4.46, + 3.84, + 4.57, + 4.26, + 4.37, + 3.49, + 4.43, + 4.48, + 4.01, + 4.29, + 4.42, + 4.23, + 4.42, + 4.23, + 3.49, + 4.29, + 4.29, + 4.42, + 4.49, + 4.38, + 4.42, + 4.29, + 4.38, + 4.22, + 3.48, + 4.38, + 4.56, + 4.45, + 3.49, + 4.23, + 4.62, + 4.53, + 4.45, + 4.53, + 4.43, + 4.38, + 4.45, + 4.50, + 4.45, + 4.55, + 4.45, + 4.42, +] + +logLight = [ + 5.23, + 5.74, + 4.93, + 5.74, + 5.19, + 5.46, + 4.65, + 5.27, + 5.57, + 5.12, + 5.73, + 5.45, + 5.42, + 4.05, + 4.26, + 4.58, + 3.94, + 4.18, + 4.18, + 5.89, + 4.38, + 4.22, + 4.42, + 4.85, + 5.02, + 4.66, + 4.66, + 4.90, + 4.39, + 6.05, + 4.42, + 5.10, + 5.22, + 6.29, + 4.34, + 5.62, + 5.10, + 5.22, + 5.18, + 5.57, + 4.62, + 5.06, + 5.34, + 5.34, + 5.54, + 4.98, + 4.50, +] + +data2 = DataFrame(; logTemp=logTemp, logLight=logLight) + +form2 = @formula(logLight ~ 1 + logTemp) +X2 = hcat(ones(size(data2, 1)), data2.logTemp) +sX2 = SparseMatrixCSC(X2) +y2 = data2.logLight +nt2 = (; logTemp=logTemp, logLight=logLight) + +data2_tuples = ((form2, data2), (form2, nt2), (X2, y2), (sX2, y2)) +; diff --git a/test/ipod.jl b/test/ipod.jl new file mode 100644 index 0000000..6470353 --- /dev/null +++ b/test/ipod.jl @@ -0,0 +1,275 @@ + +loss1 = RobustModels.L2Loss() +est1 = MEstimator(loss1) +est2 = MMEstimator(TukeyLoss) +pen1 = RobustModels.NoPenalty() +λ = 10_000.0 +pen2 = RobustModels.RangedPenalties( + [2:RobustModels.End()], [RobustModels.SquaredL2Penalty(λ)] +) + +m00 = fit(LinearModel, form, data) +m01 = fit(RobustLinearModel, form, data, est1; method=:auto, initial_scale=1) +m02 = fit(RobustLinearModel, form, data, est1; method=:auto, initial_scale=1, 
ridgeλ=λ) +m03 = fit(RobustLinearModel, form, data, est2) +σ = scale(m03) + +@testset "Θ-IPOD: L2Loss method, no penalty ($(pen))" for (pen, methods) in zip( + (nothing, pen1), (nopen_methods, pen_methods) +) + def_rtol = isnothing(pen) ? 1e-6 : 1e-4 + + @testset "solver method $(method)" for method in methods + rtol = def_rtol + if method === :fista + rtol = 1e-1 + end + + # Formula, dense and sparse entry + @testset "data type: $(typeof(A))" for (A, b) in data_tuples + name = "Θ-IPOD(L2Loss, $(pen); method=$(method)),\t" + name *= if A == form + "formula" + elseif A == X + "dense " + else + "sparse " + end + + m1 = fit(IPODRegression, A, b, loss1, pen; method=method, initial_scale=1) + @test_nowarn ipod(A, b, loss1, pen; method=method, initial_scale=1) + + @test all(iszero, outliers(m1)) + @test all(isfinite.(coef(m1))) + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println(" rlm : ", coef(m01)) + VERBOSE && println("ipod($(method)) : ", coef(m1)) + @test isapprox(coef(m1), coef(m01); rtol=rtol) + + # Test printing the model + @test_nowarn show(devnull, m1) + + # make sure that it is not a TableRegressionModel + @test !isa(m1, TableRegressionModel) + + # refit + @test_nowarn refit!(m1) + + # interface + @testset "method: $(f)" for f in interface_methods + if f == workingweights + # Not defined for IPODRegression + continue + end + + # make sure the methods for IPODRegression give the same results as RobustLinearModel + var0 = f(m01) + var1 = f(m1) + if f == hasformula + # m01 is defined from a formula + @test var1 == (A isa FormulaTerm) + else + @test isapprox(var0, var1; rtol=rtol) + end + end + end + end +end + + +@testset "Θ-IPOD: robust loss, no penalty ($(pen))" for (pen, methods) in zip( + (nothing, pen1), (nopen_methods, pen_methods) +) + @testset "solver method $(method)" for method in methods + @testset "loss $(loss_name)" for loss_name in ("Huber", "Hampel", "Tukey") + typeloss = getproperty(RobustModels, Symbol(loss_name * "Loss")) + l = typeloss() + est = MEstimator(l) + + # Formula, dense and sparse entry + @testset "data type: $(typeof(A))" for (A, b) in ((X, y), (sX, y)) + name = "Θ-IPOD($(l), $(pen); method=$(method)),\t" + name *= ( + (A isa FormulaTerm) ? "formula" : + (A isa SparseMatrixCSC) ? "sparse " : "dense " + ) + + m0 = fit(RobustLinearModel, A, b, est; method=:chol, initial_scale=σ) + m1 = fit(IPODRegression, A, b, l, pen; method=method, initial_scale=σ) + β1 = copy(coef(m1)) + γ1 = copy(outliers(m1)) + + @test any(!iszero, outliers(m1)) + @test all(isfinite.(coef(m1))) + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println(" rlm : ", coef(m0)) + VERBOSE && println("ipod($(method)) : ", coef(m1)) + @test coef(m0) ≈ coef(m1) rtol = 0.1 + @test stderror(m0) ≈ stderror(m1) rtol = 0.5 + + # Test printing the model + @test_nowarn show(devnull, m1) + + # make sure that it is not a TableRegressionModel + @test !isa(m1, TableRegressionModel) + + # refit + @test_nowarn refit!(m1) + @test coef(m1) ≈ β1 + @test outliers(m1) ≈ γ1 + end + end + end +end + + + +@testset "Θ-IPOD: L2Loss method, Ridge penalty" begin + def_rtol = 1e-5 + + kwargs = (; initial_scale=1, maxiter=10_000) + + @testset "solver method $(method)" for method in pen_methods + rtol = def_rtol + if method === :cgd + rtol = 1e-5 + elseif method === :fista + rtol = 1e-1 + else + rtol = 1e-3 + end + + # Formula, dense and sparse entry + @testset "data type: $(typeof(A))" for (A, b) in data_tuples + name = "Θ-IPOD(L2Loss, $(pen2); method=$(method)),\t" + name *= ( + (A isa FormulaTerm) ? 
"formula" : + (A isa SparseMatrixCSC) ? "sparse " : "dense " + ) + + m2 = ipod(A, b, loss1, pen2; method=method, kwargs...) + + @test all(iszero, outliers(m2)) + @test all(isfinite.(coef(m2))) + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println(" rlm : ", coef(m02)) + VERBOSE && println("ipod($(method)) : ", coef(m2)) + @test isapprox(coef(m2), coef(m02); rtol=rtol) + + # interface + @testset "method: $(f)" for f in interface_methods + if f == workingweights + # Not defined for IPODRegression + continue + end + + # make sure the methods for IPODRegression give the same results as RobustLinearModel + var0 = f(m02) + var1 = f(m2) + if f == hasformula + # m0 is defined from a formula + @test var1 == (A isa FormulaTerm) + elseif f in (dof, dof_residual, dispersion) + # Ridge dof is smaller than the unpenalized regression + # IPOD dof is the same as the unpenalized IPOD + @test var0 ≈ var1 rtol = 1e-1 + # @test var1 == f(m01) + elseif f in (stderror, vcov, leverage) + @test all(abs.(var1) .>= abs.(var0)) + elseif f in (leverage_weights,) + @test all(abs.(var1) .<= abs.(var0)) + elseif f in (confint, residuals, fitted, predict) + # Larger error + @test isapprox(var0, var1; rtol=1e-1) + elseif f in (projectionmatrix,) + continue + else + @test isapprox(var0, var1; rtol=rtol) + end + end + end + end +end + + +@testset "Θ-IPOD: L2Loss method, $(pen_name) penalty" for pen_name in penalties + typepenalty = getproperty(RobustModels, Symbol(pen_name * "Penalty")) + pen = typepenalty(λ) + sep_pen1 = RobustModels.RangedPenalties([1:RobustModels.End()], [pen]) + sep_pen2 = RobustModels.RangedPenalties([2:RobustModels.End()], [pen]) + kwargs = (; initial_scale=1, maxiter=10_000) + + m0 = fit(RobustLinearModel, X, y, pen; method=:auto, kwargs...) + + # Formula, dense and sparse entry + @testset "Omit intercept, data type: $(typeof(A))" for (A, b) in data_tuples + name = "Θ-IPOD(L2Loss, $(pen); method=:auto),\t" + name *= ( + (A isa FormulaTerm) ? "formula" : + (A isa SparseMatrixCSC) ? "sparse " : "dense " + ) + + m1 = fit(IPODRegression, A, b, loss1, pen; kwargs...) + m2 = fit(IPODRegression, A, b, loss1, sep_pen2; kwargs...) + m3 = fit(IPODRegression, A, b, loss1, sep_pen1; kwargs...) + m4 = fit(IPODRegression, A, b, loss1, pen; penalty_omit_intercept=false, kwargs...) + m5 = fit( + IPODRegression, A, b, loss1, sep_pen1; penalty_omit_intercept=false, kwargs... + ) + + @test all(isfinite.(coef(m1))) + @test all(isfinite.(coef(m2))) + @test all(isfinite.(coef(m3))) + @test all(isfinite.(coef(m4))) + @test all(isfinite.(coef(m5))) + @test all(iszero, outliers(m1)) + @test all(iszero, outliers(m2)) + @test all(iszero, outliers(m3)) + @test all(iszero, outliers(m4)) + @test all(iszero, outliers(m5)) + + @test penalty(m0) == penalty(m1) + @test penalty(m0) == penalty(m2) + @test penalty(m0) == penalty(m3) + @test penalty(m4) != penalty(m5) + @test penalty(m0) != penalty(m4) + + @test coef(m0) ≈ coef(m1) + @test coef(m0) ≈ coef(m2) + @test coef(m0) ≈ coef(m3) + @test coef(m4) ≈ coef(m5) + @test coef(m4)[2] ≈ 0 atol = 0.1 + @test coef(m5)[2] ≈ 0 atol = 0.1 + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println("ipod($(pen), $(method)) : ", coef(m1)) + end + + @testset "solver method $(method)" for method in pen_methods + m0 = fit(RobustLinearModel, X, y, pen; method=method, kwargs...) + + # Formula, dense and sparse entry + @testset "data type: $(typeof(A))" for (A, b) in data_tuples + name = "Θ-IPOD(L2Loss, $(pen); method=$(method)),\t" + name *= ( + (A isa FormulaTerm) ? 
"formula" : + (A isa SparseMatrixCSC) ? "sparse " : "dense " + ) + + m1 = fit(IPODRegression, A, b, loss1, pen; method=method, kwargs...) + + @test all(isfinite.(coef(m1))) + @test all(iszero, outliers(m1)) + + @test penalty(m0) == penalty(m1) + @test coef(m0) ≈ coef(m1) + @test coef(m1)[2] ≈ 0 atol = 0.1 + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println("ipod($(pen), $(method)) : ", coef(m1)) + end + end +end diff --git a/test/linearfit.jl b/test/linearfit.jl index c0c06d6..9f2d390 100644 --- a/test/linearfit.jl +++ b/test/linearfit.jl @@ -1,5 +1,4 @@ -using Random: MersenneTwister m1 = fit(LinearModel, form, data) diff --git a/test/penalties.jl b/test/penalties.jl new file mode 100644 index 0000000..0199cc1 --- /dev/null +++ b/test/penalties.jl @@ -0,0 +1,212 @@ + +using RobustModels: + cost, + proximal, + proximal!, + isconcrete, + concrete, + isconvex, + isseparable, + RangedPenalties, + End, + nothing # stopper + +λ = 10_000.0 + +seed = 1234 +rng = MersenneTwister(seed) + +t = 10 * randn(rng, 30) + +@testset "Penalties" begin + + @testset "Methods penalty functions: $(name)" for name in ("No", penalties...) + tt = copy(t) + + typepenalty = getproperty(RobustModels, Symbol(name * "Penalty")) + + pen = typepenalty(λ) + penpos = typepenalty(λ; nonnegative=true) + + @test_nowarn println(pen) + + # Cost + P0 = cost(pen, zeros(Float64, 4)) + P1 = cost(pen, ones(Float64, 4)) + @test P1 >= P0 + + # Proximal operator + copy!(tt, t) + @test_nowarn proximal!(pen, tt, t, 1.0) + @test tt ≈ proximal(pen, t, 1.0) + + # Shrinkage + @test all(@.(abs(tt) <= abs(t))) + + # Single index update + copy!(tt, t) + proximal!(pen, tt, 1, t, 1.0) + @test abs(tt[1]) <= abs(t[1]) + @test all(tt[2:end] .== t[2:end]) + + # Nonnegative + if name != "No" + tt = proximal(penpos, t, 1.0) + # Get only non-negative values + @test all(>=(0), tt) + + # Negative values are set to 0 + nonpos_indices = findall(<=(0), t) + @test all(iszero, tt[nonpos_indices]) + end + + # Concrete + new_pen = concrete(pen, length(t)) + @test new_pen == pen + + @testset "RangedPenalties" begin + n = length(t) + seppen = RangedPenalties([2:End()], [pen]) + concpen = RangedPenalties([2:n], [pen], n) + seppen2 = RangedPenalties([[1], 2:End()], [EuclideanPenalty(λ), pen]) + + # Not concrete + @test !isconcrete(seppen) + @test isconcrete(concpen) + + copy!(tt, t) + @test_throws Exception cost(seppen, t) + @test_throws Exception proximal(seppen, t, 1.0) + @test_throws Exception proximal!(seppen, tt, t, 1.0) + @test_throws Exception proximal!(seppen, tt, 2, t, 1.0) + + # Concrete + new_pen = concrete(seppen, n) + @test new_pen == concpen + @test isconcrete(new_pen) + + # Proximal operator + copy!(tt, t) + @test_nowarn proximal!(new_pen, tt, t, 1.0) + @test tt[1] == t[1] + @test all(abs.(tt[2:end]) .<= abs.(t[2:end])) + + @test_nowarn proximal!(new_pen, tt, 2, t, 1.0) + @test_throws Exception proximal!(new_pen, tt, 0, t, 1.0) + @test_throws Exception proximal!(new_pen, tt, length(t) + 1, t, 1.0) + + # Convex + @test isconvex(seppen) == isconvex(pen) + @test isconvex(concpen) == isconvex(pen) + + @test_throws Exception isseparable(seppen) + @test isseparable(concpen) == isseparable(pen) + @test_throws Exception isseparable(seppen2) + + # Split ranges + @test pen == concrete(pen, n, nothing) + @test concpen == concrete(pen, n, 1) + for j in (n, 2) + new_pen = concrete(pen, n, j) + @test length(new_pen.ranges) == 1 + @test length(new_pen.penalties) == 1 + @test Set(only(new_pen.ranges)) == Set(filter(!=(j), 1:n)) + @test 
only(new_pen.penalties) == pen + end + end + end + + + @testset "M-estimator with penalty: Ridge" begin + def_rtol = 1e-4 + pen = SquaredL2Penalty(λ) + + kwargs = (; initial_scale=1) + m0 = rlm(form2, data2, MEstimator{L2Loss}(); kwargs...) + m1 = rlm(form2, data2, MEstimator{L2Loss}(); ridgeλ=λ, kwargs...) + + @testset "solver method $(method)" for method in pen_methods + rtol = def_rtol + if method === :fista + rtol = 1e-2 + end + + # Formula, dense and sparse entry + @testset "data type: $(typeof(A))" for (A, b) in data2_tuples + name = "rlm($(pen); method=$(method)),\t" + name *= ( + (A isa FormulaTerm) ? "formula" : + (A isa SparseMatrixCSC) ? "sparse " : "dense " + ) + + m2 = rlm(A, b, pen; method=method, kwargs...) + + @test all(isfinite.(coef(m2))) + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println("rlm ridge : ", coef(m1)) + VERBOSE && println("rlm($method) $(pen) : ", coef(m2)) + @test isapprox(coef(m2), coef(m1); rtol=rtol) + + # interface + @testset "method: $(f)" for f in interface_methods + # make sure the methods for IPODRegression give the same results + # as RobustLinearModel + var1 = f(m1) + var2 = f(m2) + if f == hasformula + # m1 is defined from a formula + @test var2 == (A isa FormulaTerm) + elseif f in (dof, dof_residual, dispersion) + # Ridge dof is smaller than the unpenalized regression + # rlm with penalty dof is the same as the unpenalized rlm + @test isapprox(var1, var2; rtol=1e-2) + # @test var2 == f(m0) + elseif f in (stderror, vcov, leverage) + @test all(abs.(var2) .>= abs.(var1)) + elseif f in (leverage_weights,) + @test all(abs.(var2) .<= abs.(var1)) + # elseif f in (confint,) + # @test isapprox(var1, var2; rtol=1e-1) + elseif f in (confint, projectionmatrix) + continue + else + @test isapprox(var1, var2; rtol=rtol) + end + end + end + end + end + + + @testset "M-estimator with penalty $(name)" for name in penalties + typepenalty = getproperty(RobustModels, Symbol(name * "Penalty")) + + pen = typepenalty(λ) + penpos = typepenalty(λ; nonnegative=true) + kwargs = (; initial_scale=1, maxiter=1000) + def_rtol = 1e-3 + + m1 = fit(RobustLinearModel, form2, data2, pen; verbose=false, kwargs...) + @test all(isfinite.(coef(m1))) + + # make sure that it is not a TableRegressionModel + @test !isa(m1, TableRegressionModel) + + @testset "method: $(method)" for method in pen_methods + m2 = fit(RobustLinearModel, form2, data2, pen; method=method, kwargs...) + rtol = method == :fista ? 1e-1 : def_rtol + @test isapprox(coef(m1), coef(m2); rtol=rtol) + end + + m3 = fit(RobustLinearModel, form2, data2, penpos; verbose=false, kwargs...) + j = hasintercept(m1) ? 2 : 1 + @test all(>=(0), coef(m3)[j:end]) + + # refit + β1 = copy(coef(m1)) + refit!(m1) + @test all(coef(m1) .== β1) + end + +end diff --git a/test/robustridge.jl b/test/robustridge.jl index aa6496d..a510ddd 100644 --- a/test/robustridge.jl +++ b/test/robustridge.jl @@ -1,4 +1,3 @@ -using Random: MersenneTwister using LinearAlgebra: inv, Hermitian, I, tr, diag seed = 123987 diff --git a/test/runtests.jl b/test/runtests.jl index 1c2d883..3a437df 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,7 +8,7 @@ using GLM using SparseArrays using DataFrames using Test -using Random: MersenneTwister +using Random: AbstractRNG, MersenneTwister using RobustModels @@ -52,8 +52,14 @@ other_losses = ("Cauchy",) bounded_losses = ("Geman", "Welsch", "Tukey", "YohaiZamar", "HardThreshold", "Hampel") losses = (convex_losses..., other_losses..., bounded_losses...) 
+## Penalties +penalties = ( + "SquaredL2", "Euclidean", "L1", "ElasticNet", "Berhu", "CappedL1", "SCAD", "MCP" +) + ## Solving methods nopen_methods = (:auto, :chol, :cholesky, :qr, :cg) +pen_methods = (:auto, :cgd, :fista, :ama, :admm) ## Interface methods interface_methods = ( @@ -88,6 +94,7 @@ interface_methods = ( # Import data include("data/Animals2.jl") +include("data/starsCYG.jl") # Include tests include("estimators.jl") @@ -98,3 +105,6 @@ include("robustridge.jl") include("qreg.jl") include("weights.jl") include("univariate.jl") +include("penalties.jl") +include("ipod.jl") +include("underdetermined.jl") diff --git a/test/underdetermined.jl b/test/underdetermined.jl new file mode 100644 index 0000000..b2e4f47 --- /dev/null +++ b/test/underdetermined.jl @@ -0,0 +1,105 @@ + + +seed = 1234 +rng = MersenneTwister(seed) + +function gendata( + rng::AbstractRNG, + ::Type{T}, + N::Integer, + p::Integer, + nonzero::Integer, + rho::Real, + snr::Real = 3, + alternate::Bool=true, +) where {T<:AbstractFloat} + if nonzero > 0 + beta = nonzero:-1:1 + if alternate + beta = [(nonzero - i + 1) * (i % 2 == 0 ? 1 : -1) for i in 1:nonzero] + else + beta = nonzero:-1:1 + end + beta = vcat(beta, zeros(T, p-nonzero)) + else + beta = zeros(T, p) + end + + X = randn(rng, N, p) + if rho > 0 + z = randn(N) + X .+= sqrt(rho/(1-rho)) * z + X *= sqrt(1-rho) + end + + ssd = sqrt((1-rho)*sum(abs2, beta) + rho*sum(abs, beta)) + nsd = ssd / snr + + y = X * beta + nsd * randn(N) + return X, y, beta +end + + +X3, y3, β03 = gendata(rng, Float64, 100, 1000, 30, 0.3) + +loss1 = L2Loss() +est1 = MEstimator{L2Loss}() +λ = 1.0 +pen1 = L1Penalty(λ) +pen2 = SquaredL2Penalty(λ) + +@testset "Rank deficient X: penalty ($(pen))" for (pen, methods) in zip( + (nothing, pen1, pen2), (nopen_methods, pen_methods, pen_methods) +) + def_rtol = isnothing(pen) ? 1e-6 : 1e-4 + + @testset "solver method $(method)" for method in methods + rtol = def_rtol + if method === :fista + rtol = 1e-1 + end + + ### RobustLinearModel + name = "rlm(X3, y3, L2Loss, $(pen); method=$(method))" + VERBOSE && println("\n\t\u25CF $(name)") + + if isnothing(pen) + if method in (:chol, :cholesky, :auto) + @test_throws Exception rlm(X3, y3, est1; method=method, initial_scale=1, dropcollinear=false) + end + m1 = rlm(X3, y3, est1; method=method, initial_scale=1, dropcollinear=true) + else + m1 = rlm(X3, y3, pen; method=method, initial_scale=1, maxiter=1000, dropcollinear=true) + end + + # Test printing the model + @test_nowarn show(devnull, m1) + VERBOSE && println(m1) + + # interface + @testset "method: $(f)" for f in interface_methods + @test_nowarn f(m1) + end + + ### IPODRegression + name = "ipod(X3, y3, L2Loss, $(pen); method=$(method))" + VERBOSE && println("\n\t\u25CF $(name)") + + if isnothing(pen) && method in (:chol, :cholesky, :auto) + @test_throws Exception ipod(X3, y3, loss1, pen; method=method, initial_scale=1, dropcollinear=false) + end + m2 = ipod(X3, y3, loss1, pen; method=method, initial_scale=1, maxiter=1000, dropcollinear=true) + + # Test printing the model + @test_nowarn show(devnull, m2) + + # interface + @testset "method: $(f)" for f in interface_methods + if f == workingweights + # Not defined for IPODRegression + continue + end + @test_nowarn f(m2) + end + end +end