From 4cf698726a81e2951e4b0c7096e8155a4aeebe6c Mon Sep 17 00:00:00 2001 From: getzze Date: Fri, 12 May 2023 13:22:13 +0100 Subject: [PATCH] add Penalties and Regularized Least Squares format use :cholesky and dropcollinear like in GLM correctly use QR test method :cholesky change types clean after rebase --- README.md | 97 +++- _typos.toml | 1 + docs/src/api.md | 28 +- src/RobustModels.jl | 23 + src/ipod.jl | 1002 ++++++++++++++++++++++++++++++++++ src/linpred.jl | 2 + src/penalties.jl | 826 ++++++++++++++++++++++++++++ src/regularizedpred.jl | 1104 ++++++++++++++++++++++++++++++++++++++ src/robustlinearmodel.jl | 297 ++++++++++ test/data/Animals2.jl | 1 + test/data/starsCYG.jl | 115 ++++ test/ipod.jl | 275 ++++++++++ test/linearfit.jl | 1 - test/penalties.jl | 212 ++++++++ test/robustridge.jl | 1 - test/runtests.jl | 12 +- test/underdetermined.jl | 105 ++++ 17 files changed, 4082 insertions(+), 20 deletions(-) create mode 100644 src/ipod.jl create mode 100644 src/penalties.jl create mode 100644 test/data/starsCYG.jl create mode 100644 test/ipod.jl create mode 100644 test/penalties.jl create mode 100644 test/underdetermined.jl diff --git a/README.md b/README.md index 6d23e8b..a170e79 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ This package implements: * MQuantile regression (e.g. Expectile regression) * Robust Ridge regression (using any of the previous estimator) * Quantile regression using interior point method +* Regularized Least Square regression +* Θ-IPOD regression, possibly with a penalty term ## Installation @@ -55,7 +57,7 @@ julia>] add RobustModels#main ## Usage -The prefered way of performing robust regression is by calling the `rlm` function: +The preferred way of performing robust regression is by calling the `rlm` function: `m = rlm(X, y, MEstimator{TukeyLoss}(); initial_scale=:mad)` @@ -63,6 +65,15 @@ For quantile regression, use `quantreg`: `m = quantreg(X, y; quantile=0.5)` +For Regularized Least Squares and a penalty term, use `rlm`: + +`m = rlm(X, y, L1Penalty(); method=:cgd)` + +For Θ-IPOD regression with outlier detection and a penalty term, use `ipod`: + +`m = ipod(X, y, L2Loss(), SquaredL2Penalty(); method=:auto)` + + For robust version of `mean`, `std`, `var` and `sem` statistics, specify the estimator as first argument. Use the `dims` keyword for computing the statistics along specific dimensions. The following functions are also implemented: `mean_and_std`, `mean_and_var` and `mean_and_sem`. @@ -119,6 +130,16 @@ m9 = rlm(form, data, MEstimator{TukeyLoss}(); initial_scale=:L1, ridgeλ=1.0) ## Quantile regression solved by linear programming interior point method m10 = quantreg(form, data; quantile=0.2) refit!(m10; quantile=0.8) + +## Penalized regression +m11 = rlm(form, data, SquaredL2Penalty(); method=:auto) + +## Θ-IPOD regression with outlier detection +m12 = ipod(form, data, TukeyLoss(); method=:auto) + +## Θ-IPOD regression with outlier detection and a penalty term +m13 = ipod(form, data, L2Loss(), L1Penalty(); method=:ama) + ; # output @@ -136,7 +157,7 @@ With ordinary least square (OLS), the objective function is, from maximum likeli where `yᵢ` are the values of the response variable, `𝒙ᵢ` are the covectors of individual covariates (rows of the model matrix `X`), `𝜷` is the vector of fitted coefficients and `rᵢ` are the individual residuals. 
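
As a quick numerical illustration (a sketch with made-up residuals, assuming the unexported `RobustModels.rho(loss, r)` evaluates the ρ function of a loss at a residual), a single large residual dominates the quadratic OLS objective, while the bounded losses described below cap its contribution:

```julia
using RobustModels: rho, TukeyLoss

r = [-0.2, 0.1, 0.3, 8.0]                     # residuals; the last one is an outlier
ols_objective = sum(abs2, r) / 2              # ≈ 32.1, dominated by the outlier
tukey_objective = sum(rho.(TukeyLoss(), r))   # bounded loss: the outlier's contribution saturates
```
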
-A `RobustLinearModel` solves instead for the following loss function: `L' = Σᵢ ρ(rᵢ)`
+A `RobustLinearModel` solves instead for the following loss function [1]: `L' = Σᵢ ρ(rᵢ)`
(more precisely `L' = Σᵢ ρ(rᵢ/σ)` where `σ` is a (robust) estimate of the standard deviation of the residual).

Several loss functions are implemented:
@@ -150,7 +171,7 @@ Several loss functions are implemented:
- `CauchyLoss`: `ρ(r) = log(1+(r/c)²)`, non-convex estimator, that also corresponds to a Student's-t distribution (with fixed degree of freedom). It suppresses outliers more strongly but it is not sure to converge.
- `GemanLoss`: `ρ(r) = ½ (r/c)²/(1 + (r/c)²)`, non-convex and bounded estimator, it suppresses outliers more strongly.
- `WelschLoss`: `ρ(r) = ½ (1 - exp(-(r/c)²))`, non-convex and bounded estimator, it suppresses outliers more strongly.
-- `TukeyLoss`: `ρ(r) = if r0, τ, 1-τ)` and `τ` is the quantile of interest. `τ=0.5` corresponds to _Least Absolute Deviations_.
+
+This problem is solved exactly using linear programming techniques,
+specifically, interior point methods using the internal API of [Tulip](https://github.com/ds4dm/Tulip.jl).
+The internal API is considered unstable, but it results in a much lighter dependency than
+including the [JuMP](https://github.com/JuliaOpt/JuMP.jl) package with Tulip backend.

### Robust Ridge regression

@@ -192,16 +224,44 @@ By default, all the coefficients (except the intercept) have the same penalty, w
all the feature variables have the same scale. If it is not the case, use a robust estimate of scale
to normalize every column of the model matrix `X` before fitting the regression.

-### Quantile regression

-_Quantile regression_ results from minimizing the following objective function:
-`L = Σᵢ wᵢ|yᵢ - 𝒙ᵢ 𝜷| = Σᵢ wᵢ(rᵢ) |rᵢ|`,
-where `wᵢ = ifelse(rᵢ>0, τ, 1-τ)` and `τ` is the quantile of interest. `τ=0.5` corresponds to _Least Absolute Deviations_.
+### Regularized Least Squares
+
+_Regularized Least Squares_ regression is the solution of the minimization of the following objective function:
+`L = ½ Σᵢ |yᵢ - 𝒙ᵢ 𝜷|² + P(𝜷)`,
+where `P(𝜷)` is a (sparse) penalty on the coefficients.
+
+The following penalty functions are defined:
+ - `NoPenalty`: `cost(𝐱) = 0`, no penalty.
+ - `SquaredL2Penalty`: `cost(𝐱) = λ ½||𝐱||₂²`, also called Ridge.
+ - `L1Penalty`: `cost(𝐱) = λ||𝐱||₁`, also called LASSO.
+ - `ElasticNetPenalty`: `cost(𝐱) = λ l1_ratio ||𝐱||₁ + λ (1 - l1_ratio) ½||𝐱||₂²`.
+ - `EuclideanPenalty`: `cost(𝐱) = λ||𝐱||₂`, non-separable penalty used in Group LASSO.
+
+Different penalties can be applied to different indices of the coefficients, using
+`RangedPenalties(ranges, penalties)`. E.g., `RangedPenalties([2:5], [L1Penalty(1.0)])` defines
+an L1 penalty on every coefficient except the first index, which can correspond to the intercept.
+
+With a penalty, the following solvers are available (instead of the unpenalized solvers listed above):
+ - `:cgd`, Coordinate Gradient Descent [2].
+ - `:fista`, Fast Iterative Shrinkage-Thresholding Algorithm [3].
+ - `:ama`, Alternating Minimization Algorithm [4].
+ - `:admm`, Alternating Direction Method of Multipliers [5].
+
+To use a robust loss function with a penalty, see Θ-IPOD regression.
+
+### Θ-IPOD regression
+
+_Θ-IPOD regression_ (Θ-thresholding based Iterative Procedure for Outlier Detection) results from
+minimizing the following objective function [6]:
+`L = ½ Σᵢ |yᵢ - 𝒙ᵢ 𝜷 - γᵢ|² + P(𝜷) + Q(γ)`,
+where `Q(γ)` is a sparsity-inducing penalty function on the outliers `γ`, so that the problem is not underdetermined. 
+We don't need to know the expression of this penalty function, just that it leads to thresholding using +one of the loss function used by M-Estimators. Then Θ-IPOD is equivalent to solving an M-Estimator. +This problem is solved using an alternating minimization technique, for the outlier detection. +Without penalty, the coefficients are updated at every step using a solver for _Ordinary Least Square_. + +`P(𝜷)` is an optional (sparse) penalty on the coefficients. -This problem is solved exactly using linear programming techniques, -specifically, interior point methods using the internal API of [Tulip](https://github.com/ds4dm/Tulip.jl). -The internal API is considered unstable, but it results in a much lighter dependency than -including the [JuMP](https://github.com/JuliaOpt/JuMP.jl) package with Tulip backend. ## Credits @@ -209,10 +269,15 @@ This package derives from the [RobustLeastSquares](https://github.com/FugroRoame package for the initial implementation, especially for the Conjugate Gradient solver and the definition of the M-Estimator functions. -Credits to the developpers of the [GLM](https://github.com/JuliaStats/GLM.jl) +Credits to the developers of the [GLM](https://github.com/JuliaStats/GLM.jl) and [MixedModels](https://github.com/JuliaStats/MixedModels.jl) packages for implementing the Iteratively Reweighted Least Square algorithm. ## References -- "Robust Statistics: Theory and Methods (with R)", 2nd Edition, 2019, R. Maronna, R. Martin, V. Yohai, M. Salibián-Barrera +[1] "Robust Statistics: Theory and Methods (with R)", 2nd Edition, 2019, R. Maronna, R. Martin, V. Yohai, M. Salibián-Barrera +[2] "Regularization Paths for Generalized Linear Models via Coordinate Descent", 2009, J. Friedman, T. Hastie, R. Tibshirani +[3] "A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems", 2009, A. Beck, M. Teboulle +[4] "Applications of a splitting algorithm to decomposition in convex programming and variational inequalities", 1991, P. Tseng +[5] "Fast Alternating Direction Optimization Methods", 2014, T. Goldstein, B. O'Donoghue, S. Setzer, R. Baraniuk +[6] "Outlier Detection Using Nonconvex Penalized Regression", 2011, Y. She, A.B. Owen diff --git a/_typos.toml b/_typos.toml index 8d35b03..611fc45 100644 --- a/_typos.toml +++ b/_typos.toml @@ -8,3 +8,4 @@ Artic = "Artic" [default.extend-words] qest = "qest" +wrk = "wrk" diff --git a/docs/src/api.md b/docs/src/api.md index 2096196..e43fb77 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -9,6 +9,7 @@ AbstractQuantileEstimator LossFunction RobustLinearModel RobustModels.RobustLinResp +RobustModels.IPODResp GLM.LinPred RobustModels.DensePredCG RobustModels.SparsePredCG @@ -17,7 +18,12 @@ GLM.SparsePredChol GLM.DensePredQR RobustModels.RidgePred RobustModels.AbstractRegularizedPred +RobustModels.CGDPred +RobustModels.FISTAPred +RobustModels.AMAPred +RobustModels.ADMMPred QuantileRegression +IPODRegression ``` ## Constructors for models @@ -30,6 +36,7 @@ fit(::Type{M}, ::Union{AbstractMatrix{T}}, ::AbstractVector{T}) where {T<:Abstra ```@docs rlm quantreg +ipod fit! refit! ``` @@ -67,10 +74,13 @@ StatsAPI.residuals StatsModels.hasintercept hasformula formula +haspenalty +penalty scale tauscale RobustModels.location_variance Estimator +outliers GLM.linpred! RobustModels.pirls! RobustModels.pirls_Sestimate! 
@@ -111,7 +121,17 @@ HardThresholdLoss HampelLoss ``` -## Estimator and Loss functions methods +## Penalty functions +```@docs +NoPenalty +SquaredL2Penalty +EuclideanPenalty +L1Penalty +ElasticNetPenalty +RangedPenalties +``` + +## Estimator, Loss and Penalty functions methods ```@docs RobustModels.rho RobustModels.psi @@ -139,4 +159,10 @@ RobustModels.set_MEstimator RobustModels.update_weight! RobustModels.tau_scale_estimate RobustModels.quantile_weight +RobustModels.cost +RobustModels.proximal! +RobustModels.proximal +RobustModels.isconcrete +RobustModels.concrete! +RobustModels.concrete ``` diff --git a/src/RobustModels.jl b/src/RobustModels.jl index 0df1a61..ea1491d 100644 --- a/src/RobustModels.jl +++ b/src/RobustModels.jl @@ -173,6 +173,18 @@ export LossFunction, GeneralizedQuantileEstimator, ExpectileEstimator, L2Estimator, + PenaltyFunction, + NoPenalty, + SquaredL2Penalty, + L1Penalty, + ElasticNetPenalty, + EuclideanPenalty, + BerhuPenalty, + CappedL1Penalty, + SCADPenalty, + MCPPenalty, + RangedPenalties, + End, DensePredCG, SparsePredCG, RidgePred, @@ -183,6 +195,11 @@ export LossFunction, Estimator, rlm, quantreg, + IPODRegression, + ipod, + outliers, + penalty, + haspenalty, loss, tuning_constant, refit!, @@ -224,6 +241,9 @@ abstract type AbstractMEstimator <: AbstractEstimator end "Generalized M-Quantile estimator" abstract type AbstractQuantileEstimator <: AbstractMEstimator end +"Penalty function" +abstract type PenaltyFunction{T} end + """ AbstractRobustModel @@ -243,17 +263,20 @@ abstract type AbstractRegularizedPred{T} end Base.broadcastable(m::AbstractEstimator) = Ref(m) Base.broadcastable(m::LossFunction) = Ref(m) +Base.broadcastable(m::PenaltyFunction) = Ref(m) include("tools.jl") include("losses.jl") include("estimators.jl") +include("penalties.jl") include("linpred.jl") include("regularizedpred.jl") include("linresp.jl") include("robustlinearmodel.jl") include("univariate.jl") include("quantileregression.jl") +include("ipod.jl") include("deprecated.jl") end # module diff --git a/src/ipod.jl b/src/ipod.jl new file mode 100644 index 0000000..a5265f9 --- /dev/null +++ b/src/ipod.jl @@ -0,0 +1,1002 @@ + +using LinearAlgebra: qr, norm, eigmin, eigmax, rdiv! +using SparseArrays: sparse + + +#################################### +### LinPred methods +#################################### + +function initpred!( + p::LinPred, wt::AbstractVector{T}=T[], σ::Real=one(T); verbose::Bool=false +) where {T<:BlasReal} + return p +end + +solvepred!(p::LinPred, r::AbstractVector{T}) where {T<:BlasReal} = delbeta!(p, r) + +function solvepred!( + p::LinPred, r::AbstractVector{T}, wts::AbstractVector{T} +) where {T<:BlasReal} + return delbeta!(p, r, wts) +end + +updatepred!(p::LinPred, args...; kwargs...) 
= p + +function update_beta!( + p::LinPred, + r::AbstractVector{T}, + wts::AbstractVector{T}=T[], + σ2::T=one(T); # placeholder + verbose::Bool=false, +) where {T<:BlasReal} + if isempty(wts) + return solvepred!(p, r) + else + return solvepred!(p, r, wts) + end +end + +function initpred!( + p::Union{DensePredCG{T},SparsePredCG{T}}, + wt::AbstractVector{T}=T[], + σ::Real=one(T); # placeholder + verbose::Bool=false, +) where {T<:BlasReal} + # Initialize the scratchm1 matrix + if isempty(wt) + scr = transpose(copy!(p.scratchm1, p.X)) + else + scr = transpose(broadcast!(*, p.scratchm1, wt, p.X)) + end + # Initialize the Gram matrix + mul!(p.Σ, scr, p.X) + return p +end + +function solvepred!( + p::Union{DensePredCG{T},SparsePredCG{T}}, r::AbstractVector{T} +) where {T<:BlasReal} + ## Assume that the relevant matrices are pre-computed + # Compute the left-hand side. + mul!(p.scratchbeta, transpose(p.scratchm1), r) + + # Solve the linear system + cg!(p.delbeta, Hermitian(p.Σ, :U), p.scratchbeta) + return p +end + + +#################################### +### IPODResp +#################################### + +""" + IPODResp + +Robust Θ-IPOD linear response structure. + +Solve the following minimization problem: + +```math +\\min \\left\\lVert(\\dfrac{\\mathbf{y} - \\mathbf{X}\\mathbf{\\beta} - \\mathbf{gamma}} +{\\hat{\\sigma}}\right\\rVert^2_2 + P_c\\left(\\dfrac{\\mathbf{\\gamma}}{\\hat{\\sigma}}\\right) +``` + +# Fields + +- `loss`: loss used for the model +- `y`: response vector +- `μ`: mean response vector +- `wts`: prior case weights. Can be of length 0. +- `outliers`: outlier vector subtracted from `y` to form the working response. +- `σ`: current estimate of the scale or dispersion +- `wrky`: working response +- `wrkres`: working residuals + +""" +mutable struct IPODResp{T<:AbstractFloat,V<:AbstractVector{T},L<:LossFunction} <: + RobustResp{T} + "`loss`: loss used for the model" + loss::L + "`y`: response vector" + y::V + "`μ`: mean response vector" + μ::V + "`precision`: prior precision weights. Can be of length 0." + precision::V + "`outliers`: outlier vector subtracted from `y` to form the working response." + outliers::V + "`σ`: current estimate of the scale or dispersion" + σ::T + "`wrky`: working response." + wrky::V + "`wrkres`: working residuals" + wrkres::V + + function IPODResp{T,V,L}( + l::L, y::V, precision::V, outliers::V, σ::Real + ) where {L<:LossFunction,V<:AbstractVector{T}} where {T<:AbstractFloat} + n = length(y) + ll = length(precision) + ll == 0 || + ll == n || + throw(DimensionMismatch("length of precision is $ll, must be $n or 0")) + σ > 0 || throw(ArgumentError("σ must be positive: $σ")) + wrky = y - outliers + return new{T,V,L}( + l, y, zeros(T, n), precision, outliers, convert(T, σ), wrky, copy(wrky) + ) + end +end + +""" + IPODResp(l::L, y::V, precision::V, outliers::V=zeros(eltype(y), length(y)), σ::Real=1) + where {L<:LossFunction, V<:FPVector} + +Initialize the Robust Θ-IPOD linear response structure. + +""" +function IPODResp( + l::L, y::V, precision::V=T[], outliers::V=zeros(T, length(y)), σ::Real=1 +) where {L<:LossFunction,V<:AbstractVector{T}} where {T<:AbstractFloat} + r = IPODResp{T,V,L}(l, y, precision, outliers, T(σ)) + initresp!(r) + return r +end + +""" + IPODResp(l::L, y, wts, outliers=zeros(eltype(y), length(y)), σ::Real=1) where {L<:LossFunction} + +Convert the arguments to float arrays. 
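+
+For example (a sketch, not a doctest): `IPODResp(L2Loss(), 1:4)` stores the response `1:4`
+as a `Float64` vector, with no precision weights and a zero outlier vector.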
+""" +function IPODResp( + l::L, y, precision=eltype(y)[], outliers=zeros(eltype(y), length(y)), σ::Real=1 +) where {L<:LossFunction} + return IPODResp( + l, float(collect(y)), float(collect(precision)), float(collect(outliers)), σ + ) +end + +function Base.getproperty(r::IPODResp, s::Symbol) + if s ∈ (:mu, :η, :eta) + r.μ + elseif s ∈ (:sigma, :scale) + r.σ + elseif s ∈ (:γ, :gamma) + r.outliers + elseif s ∈ (:λ, :lambda) + r.precision + else + getfield(r, s) + end +end + +""" + initresp!(r::IPODResp) + +Initialize the response structure. +""" +function initresp!(r::IPODResp) + # Set working y + broadcast!(-, r.wrky, r.y, r.outliers) + + # Set residual (without offset) + return broadcast!(-, r.wrkres, r.wrky, r.μ) +end + +function update_outliers!(r::IPODResp) + if isempty(r.precision) + λi = one(eltype(r.y)) + @inbounds @simd for i in eachindex(r.y, r.μ, r.wrkres, r.outliers) + # Use threshold to compute outliers + r.outliers[i] = r.σ * threshold(r.loss, (r.y[i] - r.μ[i]) / r.σ, λi) + r.wrky[i] = r.y[i] - r.outliers[i] + r.wrkres[i] = r.wrky[i] - r.μ[i] + end + else + @inbounds @simd for i in eachindex(r.y, r.μ, r.wrkres, r.outliers, r.precision) + λi = r.precision[i] + # Use threshold to compute outliers + r.outliers[i] = r.σ * threshold(r.loss, (r.y[i] - r.μ[i]) / r.σ, λi) + r.wrky[i] = r.y[i] - r.outliers[i] + r.wrkres[i] = r.wrky[i] - r.μ[i] + end + end + return r +end + +function update_residuals!(r::IPODResp) + # update residuals + @. r.wrkres = r.wrky - r.μ + return r +end + +function update_scale!(r::IPODResp) + r.σ = sqrt((sum(abs2, r.wrkres) + dot(r.outliers, r.wrkres)) / length(r.y)) + return r +end + +function outliers_criteria(r::IPODResp{T}, γold::AbstractVector{T}) where {T<:AbstractFloat} + return maximum(abs(γold[i] - r.outliers[i]) / r.σ for i in eachindex(γold, r.outliers)) +end + +""" + dev_criteria(r::IPODResp) + +Deviance part coming from the loss and the response struct. +""" +dev_criteria(r::IPODResp) = sum(abs2, r.wrkres) / r.σ^2 + +""" + outliers(r::IPODResp) + +Returns the vector of the outlier part γ of the response y, so that `(y - γ) | X ~ Normal`. +""" +outliers(r::IPODResp) = r.outliers + + +#################################### +### IPODRegression +#################################### + + +""" + IPODRegression + +Robust regression using the Φ-IPOD algorithm + +## Fields + +* `resp`: the [`IPODResp`](@ref) structure. +* `pred`: the [`IPODPred`](@ref) structure. +* `formula`: either a `FormulaTerm` object or `nothing` +* `wts`: the prior observation weights (can be empty). 
+* `fitdispersion`: if true, the dispersion is estimated otherwise it is kept fixed +* `fitted`: if true, the model was already fitted +""" +mutable struct IPODRegression{ + T<:AbstractFloat, + R<:IPODResp{T}, + P<:Union{LinPred,AbstractRegularizedPred}, + V<:AbstractVector{T}, +} <: AbstractRobustModel{T} + resp::R + pred::P + formula::Union{FormulaTerm,Nothing} + wts::V + fitdispersion::Bool + fitted::Bool +end + +function IPODRegression( + X::AbstractMatrix{T}, + y::AbstractVector{T}, + loss::LossFunction, + penalty::Union{Nothing,PenaltyFunction}=nothing; + method::Symbol=:auto, # :chol, :qr, :cg, :cgd, :fista, :ama, :admm + wts::FPVector=similar(y, 0), + precision::FPVector=similar(y, 0), + fitdispersion::Bool=false, + dropcollinear::Bool=false, + formula::Union{Nothing,FormulaTerm}=nothing, + use_backtracking::Bool=false, + AMA::Bool=false, + A::AbstractMatrix{<:Real}=zeros(T, 0, 0), + b::AbstractVector{<:Real}=zeros(T, 0), + restart::Bool=true, + adapt::Bool=false, + penalty_omit_intercept::Bool=true, +) where {T<:AbstractFloat} + + # Check that X and y have the same number of observations + n, p = size(X) + n == size(y, 1) || throw(DimensionMismatch("number of rows in X and y must match")) + ll = size(wts, 1) + ll in (0, n) || throw(DimensionMismatch("length of wts is $ll, must be 0 or $n.")) + + # Response object + rr = IPODResp(loss, y, precision) + + # Method + nopen_methods = (:auto, :chol, :cholesky, :cg, :qr) + pen_methods = (:auto, :cgd, :fista, :ama, :admm) + if isnothing(penalty) && method ∉ nopen_methods + @warn( + "Incorrect method `:$(method)` without a penalty, should be one of $(nopen_methods)" + ) + method = :auto + elseif !isnothing(penalty) && method ∉ pen_methods + @warn( + "Incorrect method `:$(method)` with a penalty, should be one of $(pen_methods)" + ) + method = :auto + end + + # Predictor without penalty + if isnothing(penalty) + pred = if method === :cg + cgpred(X, dropcollinear) + elseif method === :qr + qrpred(X, dropcollinear) + elseif method in (:chol, :cholesky, :auto) + cholpred(X, dropcollinear) + else + error( + "without penalty, method :$method is not allowed, should be in: $nopen_methods", + ) + end + + # Predictor with penalty + else + # Penalty + new_penalty = try + intercept_col = penalty_omit_intercept ? 
get_intercept_col(X, formula) : nothing + concrete(penalty, p, intercept_col) + catch + error("penalty is not compatible with coefficients size $p: $(penalty)") + end + + # Predictor + pred = if method === :fista + FISTARegPred(X, new_penalty, wts, restart, use_backtracking) + elseif method === :ama + AMARegPred(X, new_penalty, wts, A, b, restart) + elseif method === :admm + ADMMRegPred(X, new_penalty, wts, A, b, restart, adapt) + elseif method in (:cgd, :auto) + CGDRegPred(X, new_penalty, wts) + else + error("with penalty, method :$method is not allowed, should be in: $pen_methods") + end + end + + return IPODRegression(rr, pred, formula, wts, fitdispersion, false) +end + +function Base.getproperty(r::IPODRegression, s::Symbol) + if s ∈ (:beta0, :β) + r.pred.beta0 + elseif s ∈ (:delbeta, :dβ) + r.pred.delbeta + else + getfield(r, s) + end +end + +function Base.show(io::IO, obj::IPODRegression) + msg = "Robust Θ-IPOD regression with $(obj.resp.loss)\n\n" + if hasformula(obj) + msg *= "$(formula(obj))\n\n" + end + msg *= "Coefficients:\n" + return println(io, msg, coeftable(obj)) +end + +#################################### +### Interface +#################################### + +## TODO: use (weighted) matrix rank, with dropcollinear +function StatsAPI.dof(r::IPODRegression) + df = if haspenalty(r) + dof(penalty(r), coef(r)) + else + length(coef(r)) + end + return min(wobs(r), df) +end + +hasformula(m::IPODRegression) = isnothing(m.formula) ? false : true + +function StatsModels.formula(m::IPODRegression) + if !hasformula(m) + throw(ArgumentError("model was fitted without a formula")) + end + return m.formula +end + +StatsAPI.modelmatrix(r::IPODRegression) = r.pred.X + +function StatsAPI.vcov(r::IPODRegression, wt::AbstractVector) + wXt = isempty(wt) ? modelmatrix(r)' : (modelmatrix(r) .* wt)' + return inv(Hermitian(float(Matrix(wXt * modelmatrix(r))))) +end +StatsAPI.vcov(r::IPODRegression) = vcov(r, weights(r)) + +function projectionmatrix(r::IPODRegression, wt::AbstractVector) + X = modelmatrix(r) + wXt = isempty(wt) ? X' : (wt .* X)' + return Hermitian(X * vcov(r, wt) * wXt) +end +projectionmatrix(r::IPODRegression) = projectionmatrix(r, weights(r)) + +# Define the methods, but all `variant` give the same result +for fun in (:vcov, :projectionmatrix, :leverage, :leverage_weights) + @eval begin + $(fun)(m::IPODRegression, variant::Symbol) = $(fun)(m) + end +end + +function GLM.dispersion(r::IPODRegression, sqr::Bool=false) + wts = weights(r) + res = residuals(r) + dofres = dof_residual(r) + + if dofres <= 0 + return convert(eltype(res), Inf) + end + + s = if isempty(wts) + sum(abs2, res) / dofres + else + sum(i -> wts[i] * abs2(res[i]), eachindex(wts, res)) / dofres + end + + return sqr ? s : sqrt(s) +end + +""" + location_variance(r::RobustLinResp, sqr::Bool=false) + +Compute the part of the variance of the coefficients `β` that is due to the encertainty +from the location. If `sqr` is `false`, return the standard deviation instead. 
+ +From Maronna et al., Robust Statistics: Theory and Methods, Equation 4.49 +""" +location_variance(r::IPODRegression, sqr::Bool=false) = dispersion(r, sqr) + +StatsAPI.stderror(r::IPODRegression) = location_variance(r, false) .* sqrt.(abs.(diag(vcov(r)))) + +## Loglikelihood of the full model +## l = Σi log fi = Σi log ( 1/(σ * Z) exp( - ρ(ri/σ) ) = -n (log σ + log Z) - Σi ρ(ri/σ) +fullloglikelihood(r::IPODRegression) = -wobs(r) * (log(r.resp.σ) + log(2 * π) / 2) + +StatsAPI.deviance(r::IPODRegression) = sum(abs2, residuals(r)) / scale(r)^2 + +function StatsAPI.nulldeviance(r::IPODRegression) + y = response(r) + wts = weights(r) + + # Compute location of the null model + μ = if !hasintercept(r) + zero(eltype(y)) + elseif isempty(wts) + mean(y) + else + mean(y, weights(wts)) + end + + # Sum deviance for each observation + dev = 0 + if isempty(wts) + @inbounds for i in eachindex(y) + dev += abs2((y[i] - μ) / scale(r)) + end + else + @inbounds for i in eachindex(y, wts) + dev += wts[i] * abs2((y[i] - μ) / scale(r)) + end + end + return dev +end + +StatsAPI.loglikelihood(r::IPODRegression) = fullloglikelihood(r) - deviance(r) / 2 + +StatsAPI.nullloglikelihood(r::IPODRegression) = fullloglikelihood(r) - nulldeviance(r) / 2 + +StatsAPI.response(r::IPODRegression) = r.resp.y + +StatsAPI.isfitted(r::IPODRegression) = r.fitted + +StatsAPI.fitted(r::IPODRegression) = r.resp.μ + +StatsAPI.residuals(r::IPODRegression) = r.resp.wrkres + +StatsAPI.predict(r::IPODRegression) = fitted(r) + +""" + scale(m::RobustLinearModel, sqr::Bool=false) + +The robust scale estimate used for the robust estimation. + +If `sqr` is `true`, the square of the scale is returned. +""" +scale(r::IPODRegression, sqr::Bool=false) = sqr ? r.resp.σ^2 : r.resp.σ + +StatsAPI.weights(r::IPODRegression) = r.wts + +""" + coef(m::IPODRegression) +The coefficients of the model. +""" +StatsAPI.coef(r::IPODRegression) = coef(r.pred) + +""" + nobs(obj::IPODRegression)::Integer +For linear and generalized linear models, returns the number of elements of the response. +For models with prior weights, return the number of non-zero weights. +""" +function StatsAPI.nobs(r::IPODRegression{T}) where {T} + wts = weights(r) + if !isempty(wts) + ## Suppose that the weights are probability weights + count(!iszero, wts) + else + length(response(r)) + end +end + +""" + wobs(obj::IPODRegression) +For unweighted linear models, equals to ``nobs``, it returns the number of elements of the response. +For models with prior weights, return the sum of the weights. +""" +function wobs(r::IPODRegression{T}) where {T} + wts = weights(r) + if !isempty(wts) + ## Suppose that the weights are probability weights + sum(wts) + else + oftype(sum(one(eltype(wts))), nobs(r)) + end +end + +haspenalty(r::IPODRegression{T,R,P,V}) where {T,R,P<:AbstractRegularizedPred,V} = true + +function penalty(r::IPODRegression{T,R,P,V}) where {T,R,P<:AbstractRegularizedPred,V} + return r.pred.penalty +end + +""" + outliers(r::IPODRegression) + +Returns the vector of the outlier part γ of the response y, so that `(y - γ) | X ~ Normal`. 
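+
+A sketch of typical usage (hypothetical data): after `m = ipod(X, y, TukeyLoss())`,
+`outliers(m)` has one entry per observation, and entries away from zero flag the
+observations detected as outliers.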
+""" +outliers(r::IPODRegression) = outliers(r.resp) + +function update_fields!( + m::IPODRegression{T}; + wts::Union{Nothing,AbstractVector{<:Real}}=nothing, + initial_scale::Union{Nothing,Symbol,Real}=nothing, + σ0::Union{Nothing,Symbol,Real}=initial_scale, + initial_coef::Union{Nothing,AbstractVector{<:Real}}=nothing, + β0::Union{Nothing,AbstractVector{<:Real}}=initial_coef, + initial_outliers::Union{Nothing,AbstractVector{<:Real}}=nothing, + γ0::Union{Nothing,AbstractVector{<:Real}}=initial_outliers, + precision::Union{Nothing,Real,AbstractVector{<:Real}}=nothing, + λ0::Union{Nothing,Real,AbstractVector{<:Real}}=precision, + kwargs..., +) where {T<:AbstractFloat} + + resp = m.resp + pred = m.pred + n = length(response(resp)) + p = length(coef(pred)) + + if !isnothing(wts) + if length(wts) in (0, n) + copy!(m.wts, float(wts)) + else + throw(ArgumentError("wts should be a vector of length 0 or $n: $(length(wts))")) + end + end + if !isnothing(σ0) + # convert to float + if σ0 isa Symbol + σ0 = initialscale(modelmatrix(m), response(m), weights(m), σ0) + else + σ0 = float(σ0) + end + + if σ0 > 0 + resp.σ = σ0 + else + throw(ArgumentError("σ0 should be a positive real: $(σ0))")) + end + end + if !isnothing(γ0) + if length(γ0) == n + copy!(resp.outliers, float(γ0)) + else + throw(ArgumentError("γ0 should be a vector of length $n: $(length(γ0))")) + end + end + if !isnothing(λ0) + if λ0 isa Real + copy!(resp.precision, fill(float(λ0), n)) + elseif length(λ0) in (0, n) + copy!(resp.precision, λ0) + else + throw( + ArgumentError("λ0 should be a real or a vector of length 0 or $n: $(λ0))") + ) + end + end + if !isnothing(β0) + if length(β0) == p + copy!(pred.beta0, float(β0)) + else + throw(ArgumentError("β0 should be a vector of length $p: $(length(β0))")) + end + else + fill!(pred.beta0, zero(eltype(coef(m)))) + end + + ## The predictor must be updated if wts, σ0 or β0 was changed + + # return the rest of the keyword arguments + return kwargs +end + + +#################################### +### Fit IPOD model +#################################### + +""" + ipod(X, y, args...; kwargs...) + +An alias for `fit(IPODRegression, X, y, loss, penalty; kwargs...)`. + +The arguments `X` and `y` can be a `Matrix` and a `Vector` or a `Formula` and a `DataFrame`. +""" +ipod(X, y, args...; kwargs...) = fit(IPODRegression, X, y, args...; kwargs...) + + +""" + fit(::Type{M}, + X::AbstractMatrix{T}, + y::AbstractVector{T}, + loss::LossFunction, + penalty::Union{Nothing,PenaltyFunction}=nothing; + method::Symbol = :auto, # :chol, :cg, :cgd, :fista + dofit::Bool = true, + wts::FPVector = similar(y, 0), + offset::FPVector = similar(y, 0), + fitdispersion::Bool = false, + initial_scale::Union{Symbol, Real}=:mad, + σ0::Union{Symbol, Real}=initial_scale, + initial_coef::AbstractVector=[], + β0::AbstractVector=initial_coef, + correct_leverage::Bool=false + penalty_omit_intercept::Bool=true, + fitargs..., + ) where {M<:IPODRegression, T<:AbstractFloat} + +Create a robust model with the model matrix (or formula) X and response vector (or dataframe) y, +using a robust estimator. + + +# Arguments + +- `X`: the model matrix (it can be dense or sparse) or a formula +- `y`: the response vector or a table (dataframe, namedtuple, ...). +- `loss`: a robust loss function +- `penalty`: a penalty function, or `nothing` if the coefficients are not penalized. 
+ +# Keywords + +- `method::Symbol = :auto`: the method used to solve the linear system, + Without penalty: + - Direct method, Cholesky decomposition: `:chol` (default) + - Direct method, QR decomposition: `:qr` + - Iterative method, Conjugate Gradient: `:cg` + With penalty: + - Coordinate Gradient Descent: `:cgd` (default) + - Fast Iterative Shrinkage-Thresholding Algorithm: `:fista` + - Alternating Minimization Algorithm: `:ama` + - Alternating Direction Method of Multipliers: `:admm` + Use :auto to select the default method based on whether penalty is or is not `nothing`. +- `dofit::Bool = true`: if false, return the model object without fitting; +- `dropmissing::Bool = false`: if true, drop the rows with missing values (and convert to + Non-Missing type). With `dropmissing=true` the number of observations may be smaller + than the size of the input arrays; +- `wts::Vector = similar(y, 0)`: Prior probability weights of observations. + Can be empty (length 0) if no weights are used (default); +- `precision::Vector = similar(y, 0)`: Prior precision weights of observations for outlier + detection. Can be empty (length 0) if all observations have the same precision (default); +- `fitdispersion::Bool = false`: reevaluate the dispersion; +- `contrasts::AbstractDict{Symbol,Any} = Dict{Symbol,Any}()`: a `Dict` mapping term names + (as `Symbol`s) to term types (e.g. `ContinuousTerm`) or contrasts (e.g., `HelmertCoding()`, + `SeqDiffCoding(; levels=["a", "b", "c"])`, etc.). If contrasts are not provided for a variable, + the appropriate term type will be guessed based on the data type from the data column: + any numeric data is assumed to be continuous, and any non-numeric data is assumed to be + categorical (with `DummyCoding()` as the default contrast type); +- `initial_scale::Union{Symbol, Real}=:mad`: the initial scale estimate, for non-convex estimator + it helps to find the global minimum. + Automatic computation using `:mad`, `:L1` or `:extrema` (non-robust). +- `σ0::Union{Nothing, Symbol, Real}=initial_scale`: alias of `initial_scale`; +- `initial_coef::AbstractVector=[]`: the initial coefficients estimate, for non-convex estimator + it helps to find the global minimum. +- `β0::AbstractVector=initial_coef`: alias of `initial_coef`; +- `penalty_omit_intercept::Bool=true`: if true, do not penalize the intercept, + otherwise use the penalty on all the coefficients; +- `fitargs...`: other keyword arguments used to control the convergence of the IRLS algorithm + (see [`pirls!`](@ref)). + +# Output + +the IPODRegression object. 
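+
+# Example
+
+A minimal sketch with made-up data (convergence and exact results depend on the data):
+
+```julia
+X = randn(100, 3)
+y = X * [2.0, 0.0, -1.0] .+ 0.1 .* randn(100)
+y[1:5] .+= 10                       # contaminate a few observations
+m = fit(IPODRegression, X, y, TukeyLoss(), L1Penalty(0.1); method=:fista)
+coef(m)                             # penalized coefficient estimates
+outliers(m)                         # estimated outlier component γ, mostly zeros
+```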
+ +""" +function StatsAPI.fit( + ::Type{M}, + X::AbstractMatrix{T}, + y::AbstractVector{T}, + loss::LossFunction, + penalty::Union{Nothing,PenaltyFunction}=nothing; + method::Symbol=:auto, # :chol, :qr, :cg, :cgd, :fista, :ama, :admm + dofit::Bool=true, + dropmissing::Bool=false, # placeholder + initial_scale::Union{Nothing,Symbol,Real}=:mad, + σ0::Union{Nothing,Symbol,Real}=initial_scale, + wts::FPVector=similar(y, 0), + precision::FPVector=similar(y, 0), + fitdispersion::Bool=false, + dropcollinear::Bool=false, + contrasts::AbstractDict{Symbol,Any}=Dict{Symbol,Any}(), # placeholder + __formula::Union{Nothing,FormulaTerm}=nothing, + use_backtracking::Bool=false, + A::AbstractMatrix{<:Real}=zeros(T, 0, 0), + b::AbstractVector{<:Real}=zeros(T, 0), + restart::Bool=true, + adapt::Bool=false, + penalty_omit_intercept::Bool=true, + fitargs..., +) where {M<:IPODRegression,T<:AbstractFloat} + + m = IPODRegression( + X, + y, + loss, + penalty; + method=method, + wts=wts, + precision=precision, + fitdispersion=fitdispersion, + dropcollinear=dropcollinear, + formula=__formula, + use_backtracking=use_backtracking, + A=A, + b=b, + restart=restart, + adapt=adapt, + penalty_omit_intercept=penalty_omit_intercept, + ) + + # Update fields + fitargs = update_fields!(m; σ0=σ0, fitargs...) + + return dofit ? fit!(m; fitargs...) : m +end + +## Convert from formula-data to modelmatrix-response calling form +## the `fit` method must allow the `wts`, `precision`, `contrasts` and `__formula` keyword arguments +function StatsAPI.fit( + ::Type{M}, + f::FormulaTerm, + data, + args...; + dropmissing::Bool=false, + wts::Union{Nothing,Symbol,FPVector}=nothing, + precision::Union{Nothing,Symbol,FPVector}=nothing, + contrasts::AbstractDict{Symbol,Any}=Dict{Symbol,Any}(), + kwargs..., +) where {M<:IPODRegression} + # Extract arrays from data using formula + f, y, X, extra = modelframe( + f, data, contrasts, dropmissing, M; wts=wts, precision=precision + ) + # Call the `fit` method with arrays + return fit( + M, + X, + y, + args...; + wts=extra.wts, + precision=extra.precision, + contrasts=contrasts, + __formula=f, + kwargs..., + ) +end + + +""" + ipod(X, y, l::LossFunction; λ=1) + +She & Owen (2011) - Outlier Detection Using Nonconvex Penalized Regression +""" +function StatsAPI.fit!( + m::IPODRegression{T,R,P,V}; + correct_leverage::Bool=false, + updatescale::Bool=false, + maxiter::Integer=(P <: FISTARegPred) ? 
1000 : 100, + minstepfac::Real=1e-3, + atol::Real=1e-8, + rtol::Real=1e-7, + verbose::Bool=false, +) where {T,R,P,V} + # Return early if model has the fit flag set + m.fitted && return m + + verbose && println("\nFit with IPOD model: $(m.resp.loss), $(penalty(m))") + + # TODO: check if it works and if it can work + updatescale = false + + # Initialize the response and predictors + init!(m; verbose=verbose) + + # convergence criteria + γold = copy(m.resp.outliers) + Δγ = 0 + + devold = dev_criteria(m) + dev = devold + Δdev = 0 + + ### Loop until convergence + cvg = false + for i in 1:maxiter + ## Update outliers vector + update_outliers!(m.resp) + Δγ = outliers_criteria(m.resp, γold) + + ## Update coefficients + setη!(m; verbose=verbose) + dev = dev_criteria(m) + Δdev = abs(dev - devold) + + # ## Update scale + if updatescale + update_scale!(m.resp) + end + + ## Postupdate (only for ADMM) + updatepred!(m.pred, scale(m); verbose=verbose, force=updatescale) + + # Test for convergence + verbose && println("Iteration: $i, Δoutliers: $(Δγ), Δdev: $(Δdev)") + tol = max(rtol * abs(devold), atol) + if Δγ < atol && Δdev < tol + cvg = true + break + end + copyto!(γold, m.resp.outliers) + @assert isfinite(dev) + devold = dev + end + cvg || throw(ConvergenceException(maxiter)) + m.fitted = true + return m +end + +""" + refit!(m::RobustLinearModel, [y::FPVector]; + wts::Union{Nothing, FPVector} = nothing, + offset::Union{Nothing, FPVector} = nothing, + quantile::Union{Nothing, AbstractFloat} = nothing, + ridgeλ::Union{Nothing, Real} = nothing, + kwargs...) + +Refit the [`RobustLinearModel`](@ref). + +This function assumes that `m` was correctly initialized and the model is refitted with +the new values for the response, weights, offset, quantile and ridge shrinkage. + +Defining a new `quantile` is only possible for [`GeneralizedQuantileEstimator`](@ref). + +Defining a new `ridgeλ` is only possible for [`RidgePred`](@ref) objects. +""" +function refit!(m::IPODRegression, y::FPVector; kwargs...) + r = m.resp + # Check that old and new y have the same number of observations + if size(r.y, 1) != size(y, 1) + mess = + "the new response vector should have the same dimension: " * + "$(size(r.y, 1)) != $(size(y, 1))" + throw(DimensionMismatch(mess)) + end + # Update y + copyto!(r.y, y) + + return refit!(m; kwargs...) +end + +function refit!(m::IPODRegression; method=nothing, kwargs...) + + if !isnothing(method) + @warn( + "the method cannot be changed when refitting," * + " ignore the keyword argument `method=:$(method)`." + ) + end + + # Update fields + kwargs = update_fields!(m; kwargs...) + + m.fitted = false + return fit!(m; kwargs...) 
+end + + +function dev_criteria(m::IPODRegression) + if !haspenalty(m) + # this criteria is not used with no penalty + return 0 + end + l = dev_criteria(m.resp) + l += dev_criteria(m.pred) + return l +end + + +function init!(m::IPODRegression{T}; verbose::Bool=false) where {T<:AbstractFloat} + + resp = m.resp + pred = m.pred + + # reset ∇β + copyto!(pred.delbeta, pred.beta0) + + # res = y - Xβ + mul!(resp.μ, pred.X, pred.beta0) + initresp!(resp) + + # Initialize the predictor + initpred!(pred, m.wts, resp.σ; verbose=verbose) + + return m +end + +function setη!( + m::IPODRegression{T,R,P,V}, linesearch_f::Real=1; verbose::Bool=false +) where {T,R,P<:LinPred,V} + # TODO: add line search + μ = m.resp.μ + + # # dβ = Σ \ (X' * r) + # mul!(dβ, wXt, p.wrkres) + # ldiv!(facΣ, dβ) + update_beta!(m.pred, m.resp.wrkres, m.wts) + + # Install beta + broadcast!(+, m.pred.beta0, m.pred.beta0, m.pred.delbeta) + + # Update linear predictor + mul!(μ, m.pred.X, m.pred.beta0) + + # Update residuals with the new μ + return update_residuals!(m.resp) +end + +function setη!( + m::IPODRegression{T,R,P,V}, linesearch_f::Real=1; verbose::Bool=false +) where {T,R,P<:AbstractRegularizedPred,V} + wts = weights(m) + + μ = m.resp.μ + σ2 = m.resp.σ^2 + wrky = m.resp.wrky + + if m.pred isa CGDRegPred + res = m.resp.wrkres + + # Update coefs and μ + update_βμ!(m.pred, wrky, μ, res, σ2, wts; verbose=verbose) + + else + # Update coefs + update_beta!(m.pred, wrky, wts, σ2; verbose=verbose) + + # Update linear predictor + mul!(μ, m.pred.X, m.pred.beta0) + + # Update residuals with the new μ + update_residuals!(m.resp) + end + + return m +end diff --git a/src/linpred.jl b/src/linpred.jl index fafc9bb..4c3e687 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -2,6 +2,8 @@ import GLM: cholpred ################# +loss_criteria(p::LinPred) = 0 + StatsAPI.modelmatrix(p::LinPred) = p.X function StatsAPI.vcov(p::LinPred, wt::AbstractVector) diff --git a/src/penalties.jl b/src/penalties.jl new file mode 100644 index 0000000..1299338 --- /dev/null +++ b/src/penalties.jl @@ -0,0 +1,826 @@ + + +soft_threshold(x::Real, λ::Real) = (abs(x) <= λ) ? zero(x) : x - sign(x) * λ + + + +isconvex(::PenaltyFunction) = false +isseparable(::PenaltyFunction) = false + +""" + concrete!(p::PenaltyFunction, m::Integer, intercept_index::Union{Nothing,Integer}=nothing) + +Modifies the penalty to make it compatible with the coefficients of size `m`. +For penalties that do not handle indices range, return the unmodified penalty, +otherwise (like for RangedPenalties), modify the ranges so they cover all +indices in the `1:m` range. +Throws an error if the penalty ranges are not compatible with `m` or if `intercept_index` +is defined and `p` is not a RangedPenalties. +""" +function concrete!( + p::PenaltyFunction, n::Integer, intercept_index::Union{Nothing,Integer}=nothing +) + if !isnothing(intercept_index) && (1 <= intercept_index <= n) + error( + "intercept_index $(intercept_index) in 1:$n, cannot change the type " * + "from $(typeof(p)) to RangedPenalties.", + ) + end + return p +end + +""" + concrete(p::PenaltyFunction, m::Integer) + +Returns a penalty that is compatible with the coefficients of size `m`. +For penalties that do not handle indices range, if `intercept_index` is not defined +return a deep copy of the penalty, otherwise return a RangedPenalties excluding this index. +For RangedPenalties, returns a penalty with ranges that cover all the indices in the `1:m` range. +Throws an error if the penalty ranges are not compatible with `m`. 
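+
+For example (a sketch): `concrete(L1Penalty(1.0), 3, 1)` returns a `RangedPenalties` that
+applies the L1 penalty to indices `2:3` and leaves index 1 (the intercept) unpenalized.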
+""" +function concrete( + p::PenaltyFunction, n::Integer, intercept_index::Union{Nothing,Integer}=nothing +) + if !isnothing(intercept_index) && (1 <= intercept_index <= n) + if n == 1 + ranges = [] + penalties = [] + else + ranges = [excludeindex(1:n, intercept_index)] + penalties = [p] + end + return RangedPenalties(ranges, penalties, n) + else + return deepcopy(p) + end +end + + + +function proximal!( + p::PenaltyFunction, + out::AbstractVector, + index::Integer, + x::AbstractVector, + step::AbstractFloat, +) + proximal!(p, view(out, index), view(x, index), step) + return out +end + +function proximal(p::PenaltyFunction, x::AbstractArray, step::AbstractFloat) + return proximal!(p, similar(x), x, step) +end +function proximal(p::PenaltyFunction, x::T, step::AbstractFloat) where {T<:AbstractFloat} + return proximal!(p, ones(T, 1), [x], step)[1] +end + +StatsAPI.dof(p::PenaltyFunction, x::AbstractVector{T}) where {T<:AbstractFloat} = length(x) + +######################################################################## +##### Penalty functions +######################################################################## + +""" + NoPenalty{T<:AbstractFloat} + +No penalty (constant penalty), the proximal operator returns the same vector as input (identity map). +P(x) = 0 +""" +struct NoPenalty{T<:AbstractFloat} <: PenaltyFunction{T} end +NoPenalty(args...; kwargs...) = NoPenalty{Float64}() +cost(::NoPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} = zero(T) +function proximal!( + p::NoPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + return copyto!(out, x) +end + +isconvex(::NoPenalty) = true +isseparable(::NoPenalty) = true + + +""" + SquaredL2Penalty{T<:AbstractFloat} + +Squared L2 penalty, the proximal operator returns a scaled version. +P(x) = λ/2 Σi |xi|² +""" +struct SquaredL2Penalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + nonnegative::Bool + + function SquaredL2Penalty(λ::T; nonnegative::Bool=false) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ should be non-negative: $λ")) + return new{T}(λ, nonnegative) + end +end +function cost(p::SquaredL2Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return p.λ / 2 * sum(abs2, x) +end +function proximal!( + p::SquaredL2Penalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + # return broadcast!(/, out, x, 1 + p.λ * step) + nonnegative = p.nonnegative + a = 1 / (1 + p.λ * step) + @inbounds @simd for i in eachindex(out, x) + out[i] = (nonnegative && x[i] <= 0) ? zero(T) : x[i] * a + end + return out +end + +# Approximate shrinkage of coefficients +function StatsAPI.dof(p::SquaredL2Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return length(x) / (1 + p.λ) +end + +isconvex(::SquaredL2Penalty) = true +isseparable(::SquaredL2Penalty) = true + + +""" + EuclideanPenalty{T<:AbstractFloat} + +Euclidean norm penalty, the proximal operator returns a scaled version. 
+P(x) = λ √(Σi |xi|²) +""" +struct EuclideanPenalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + nonnegative::Bool + + function EuclideanPenalty(λ::T; nonnegative::Bool=false) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ should be non-negative: $λ")) + return new{T}(λ, nonnegative) + end +end +function cost(p::EuclideanPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return p.λ * norm(x, 2) +end +function proximal!( + p::EuclideanPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + # nn = p.nonnegative ? norm(broadcast!(max, out, x, 0), 2) : norm(x, 2) + if p.nonnegative + nn = zero(T) + @inbounds @simd for xi in x + nn += xi > 0 ? xi^2 : zero(T) + end + nn = sqrt(nn) + else + nn = norm(x, 2) + end + return rmul!(copyto!(out, x), (1 - p.λ * step / max(p.λ * step, nn))) +end + +isconvex(::EuclideanPenalty) = true +isseparable(::EuclideanPenalty) = false + + +""" + L1Penalty{T<:AbstractFloat} + +L1 penalty, the proximal operator returns a soft-thresholded value. +P(x) = λ Σi |xi| +""" +struct L1Penalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + nonnegative::Bool + + function L1Penalty(λ::T; nonnegative::Bool=false) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ should be non-negative: $λ")) + return new{T}(λ, nonnegative) + end +end +cost(p::L1Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} = p.λ * sum(abs, x) +function proximal!( + p::L1Penalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + a = p.λ * step + nonnegative = p.nonnegative + + @inbounds @simd for i in eachindex(out, x) + out[i] = (nonnegative && x[i] <= 0) ? zero(T) : soft_threshold(x[i], a) + end + return out +end + +function StatsAPI.dof(p::L1Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return count(!iszero, x) +end + +isconvex(::L1Penalty) = true +isseparable(::L1Penalty) = true + + +""" + ElasticNetPenalty{T<:AbstractFloat} + +ElasticNet penalty, a sum of SquaredL2Penalty and sparse L1Penalty. +P(x) = l1_ratio . λ Σi |xi| + (1 - l1_ratio) . 
λ/2 Σi |xi|² +""" +struct ElasticNetPenalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + l1_ratio::T + nonnegative::Bool + + function ElasticNetPenalty( + λ::T, l1_ratio::AbstractFloat=T(0.5); nonnegative::Bool=false + ) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ should be non-negative: $λ")) + 0 <= l1_ratio <= 1 || + throw(ArgumentError("l1_ratio must be between 0 and 1: $(l1_ratio)")) + return new{T}(λ, l1_ratio, nonnegative) + end +end + +function cost(p::ElasticNetPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return p.λ * (p.l1_ratio * sum(abs, x) + (1 - p.l1_ratio) * sum(abs2, x) / 2) +end + +function proximal!( + p::ElasticNetPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + a = p.λ * step + nonnegative = p.nonnegative + l1_ratio = p.l1_ratio + @inbounds @simd for i in eachindex(out, x) + out[i] = if (nonnegative && x[i] <= 0) + zero(T) + else + (soft_threshold(x[i], l1_ratio * a) / (1 + (1 - l1_ratio) * a)) + end + end + return out +end + +# Approximate shrinkage of coefficients +function StatsAPI.dof( + p::ElasticNetPenalty{T}, x::AbstractVector{T} +) where {T<:AbstractFloat} + return count(!iszero, x) / (1 + p.λ) +end + +isconvex(::ElasticNetPenalty) = true +isseparable(::ElasticNetPenalty) = true + + +""" + BerhuPenalty{T<:AbstractFloat} + +Berhu convex penalty, equivalent to L1Penalty at low values and SquaredL2Penalty at high values +(reverse of Huber loss). +Owen (2006) - A robust hybrid of lasso and ridge regression + +P(x) = λl1_ratio . λ Σi |xi| + (1 - l1_ratio) . λ/2 Σi |xi|² +""" +struct BerhuPenalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + γ::T + nonnegative::Bool + + function BerhuPenalty( + λ::T, γ::AbstractFloat=T(1); nonnegative::Bool=false + ) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ should be non-negative: $λ")) + γ >= 0 || throw(ArgumentError("γ must be non-negative: $(γ)")) + return new{T}(λ, γ, nonnegative) + end +end + +function cost(p::BerhuPenalty{T}, xi::T) where {T<:AbstractFloat} + return abs(xi) <= p.γ ? p.λ * abs(xi) : p.λ * p.γ / 2 * (1 + (xi / p.γ)^2) +end +function cost(p::BerhuPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return sum(xi -> cost(p, xi), x; init=zero(T)) +end + +function proximal!( + p::BerhuPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + a = p.λ * step + γ = p.γ + nonnegative = p.nonnegative + @inbounds @simd for i in eachindex(out, x) + if p.nonnegative && x[i] <= 0 + out[i] = zero(T) + elseif abs(x[i]) <= a + γ + out[i] = soft_threshold(x[i], a) + else + out[i] = γ * x[i] / (a + γ) + end + end + return out +end + +# Approximate coefficient selection +function StatsAPI.dof(p::BerhuPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return count(!iszero, x) +end + +isconvex(::BerhuPenalty) = true +isseparable(::BerhuPenalty) = true + + +""" + SCADPenalty{T<:AbstractFloat} + +SCAD penalty, folded-concave penalty, generalization of the LASSO. 
+Fan & Li (2001) - Variable Selection via Nonconcave Penalized Likelihood and its Oracle Properties + +P(x) = λ ∫_0^|x| dt I(t<=λ) + (γλ - t)_+ / (λ*(γ-1)) I (t>λ) +""" +struct SCADPenalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + γ::T + nonnegative::Bool + + function SCADPenalty( + λ::T, γ::AbstractFloat=T(3.7); nonnegative::Bool=false + ) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ must be non-negative: $λ")) + γ > 2 || throw(ArgumentError("γ must be greater than 2: $(γ)")) + if nonnegative + @warn "`nonnegative` argument is ignored for SCADPenalty." + end + return new{T}(λ, γ, false) + end +end + +function cost(p::SCADPenalty{T}, xi::T) where {T<:AbstractFloat} + if abs(xi) <= p.λ + return p.λ * abs(xi) + elseif abs(x[i]) >= p.λ * p.γ + return p.λ^2 * (p.γ + 1) / 2 + else + return -(xi^2 - 2 * p.λ * p.γ * abs(xi) + p.λ^2) / (2 * (p.γ - 1)) + end +end +function cost(p::SCADPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return sum(xi -> cost(p, xi), x; init=zero(T)) +end + +function convex_approx(p::SCADPenalty{T}, x::T, u::T, s::T) where {T} + return 0.5 * (x - u)^2 + cost(p, x) / s +end + +function proximal!( + p::SCADPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + # Step is non-multiplicative + # Gong et al. (2013) - A General Iterative Shrinkage and Thresholding Algorithm for + # Non-convex Regularized Optimization Problems + λ = p.λ + γ = p.γ + + if step <= 0 + return copy!(out, x) + end + + @inbounds @simd for i in eachindex(out, x) + # if p.nonnegative && x[i] <= 0 + # out[i] = zero(T) + # else + x1 = sign(x[i]) * min(λ, max(0, abs(x[i]) - λ / step)) + x2 = + sign(x[i]) * + min(λ * γ, max(λ, (abs(x[i]) * (γ - 1) - λ * γ / step) / (γ - 2))) + x3 = sign(x[i]) * min(λ * γ, abs(x[i])) + sols = [x1, x2, x3] + ind = argmin(convex_approx.(p, sols, x[i], step)) + out[i] = sols[ind] + end + return out +end + +# Approximate coefficient selection +function StatsAPI.dof(p::SCADPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return count(!iszero, x) +end + +isconvex(::SCADPenalty) = false +isseparable(::SCADPenalty) = true + + +""" + CappedL1Penalty{T<:AbstractFloat} + +Capped L1 penalty, folded-concave penalty, a bridge between the L1Penalty and the Hard thresholding penalty. +For γ -> Inf, the MC+ penalty becomes equivalent to the L1Penalty. +For γ -> 0+, the MC+ penalty tends to the L0 penalty or hard thresholding penalty. + +P(x) = λ min(abs(x), γ) +""" +struct CappedL1Penalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + γ::T + nonnegative::Bool + + function CappedL1Penalty( + λ::T, γ::AbstractFloat=T(3); nonnegative::Bool=false + ) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ must be non-negative: $λ")) + γ > 0 || throw(ArgumentError("γ must be greater than 1: $(γ)")) + if nonnegative + @warn "`nonnegative` argument is ignored for CappedL1Penalty." + end + return new{T}(λ, γ, false) + end +end + +cost(p::CappedL1Penalty{T}, xi::T) where {T<:AbstractFloat} = p.λ * min(abs(xi), p.γ) +function cost(p::CappedL1Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return sum(xi -> cost(p, xi), x; init=zero(T)) +end + +function convex_approx(p::CappedL1Penalty{T}, x::T, u::T, s::T) where {T} + return 0.5 * (x - u)^2 + cost(p, x) / s +end + +function proximal!( + p::CappedL1Penalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + # Step is non-multiplicative + # Gong et al. 
(2013) - A General Iterative Shrinkage and Thresholding Algorithm for + # Non-convex Regularized Optimization Problems + λ = p.λ + γ = p.γ + + if step <= 0 + return copy!(out, x) + end + + @inbounds @simd for i in eachindex(out, x) + # if p.nonnegative && x[i] <= 0 + # out[i] = zero(T) + # else + x1 = sign(x[i]) * max(γ, abs(x[i])) + x2 = sign(x[i]) * min(γ, max(0, abs(x[i]) - λ / step)) + sols = [x1, x2] + ind = argmin(convex_approx.(p, sols, x[i], step)) + out[i] = sols[ind] + end + return out +end + +# Approximate coefficient selection +function StatsAPI.dof(p::CappedL1Penalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return count(!iszero, x) +end + +isconvex(::CappedL1Penalty) = false +isseparable(::CappedL1Penalty) = true + + +""" + MCPPenalty{T<:AbstractFloat} + +MC+ penalty, folded-concave penalty, a bridge between the L1Penalty and the Hard thresholding penalty. +For γ -> Inf, the MC+ penalty becomes equivalent to the L1Penalty. +For γ -> 1+, the MC+ penalty tends to the L0 penalty or hard thresholding penalty. + +Zhang (2010) - Nearly unbiased variable selection under minimax concave penalty + +P(x) = λ ∫_0^|x| dt (1 - t/γλ)_+ +""" +struct MCPPenalty{T<:AbstractFloat} <: PenaltyFunction{T} + λ::T + γ::T + nonnegative::Bool + + function MCPPenalty( + λ::T, γ::AbstractFloat=T(3); nonnegative::Bool=false + ) where {T<:AbstractFloat} + λ >= 0 || throw(ArgumentError("penalty constant λ must be non-negative: $λ")) + γ > 1 || throw(ArgumentError("γ must be greater than 1: $(γ)")) + if nonnegative + @warn "`nonnegative` argument is ignored for MCPPenalty." + end + return new{T}(λ, γ, false) + end +end + +function cost(p::MCPPenalty{T}, xi::T) where {T<:AbstractFloat} + if abs(xi) < p.λ * p.γ + return p.λ * abs(xi) - xi^2 / (2 * p.γ) + else + return p.λ^2 * p.γ / 2 + end +end +function cost(p::MCPPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return sum(xi -> cost(p, xi), x; init=zero(T)) +end + +function convex_approx(p::MCPPenalty{T}, x::T, u::T, s::T) where {T} + return 0.5 * (x - u)^2 + cost(p, x) / s +end +function aux_func2(p::MCPPenalty{T}, x::T, u::T, s::T) where {T} + return (x - abs(u))^2 / 2 + p.λ * x / s - x^2 / (2 * p.γ) +end + +function proximal!( + p::MCPPenalty{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + # proximal(p, x) = abs(x) <= p.λ ? 0 : + # abs(x) < p.λ * p.γ ? sign(x) * (abs(x) - p.λ) / (1 - 1/p.γ) : x + # + # Step is non-multiplicative + # Gong et al. 
(2013) - A General Iterative Shrinkage and Thresholding Algorithm for + # Non-convex Regularized Optimization Problems + λ = p.λ + γ = p.γ + + if step <= 0 + return copy!(out, x) + end + + a = p.λ * p.γ + @inbounds @simd for i in eachindex(out, x) + # if p.nonnegative && x[i] <= 0 + # out[i] = zero(T) + # else + z1 = 0 + z2 = a + z3 = min(a, max(0, (γ * abs(x[i]) - a / step) / (γ - 1))) + zsols = [z1, z2, z3] + ind = argmin(aux_func2.(p, zsols, x[i], step)) + z = zsols[ind] + + x1 = sign(x[i]) * z + x2 = sign(x[i]) * min(a, abs(x[i])) + sols = [x1, x2] + ind = argmin(convex_approx.(p, sols, x[i], step)) + out[i] = sols[ind] + end + return out +end + +# Approximate coefficient selection +function StatsAPI.dof(p::MCPPenalty{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + return count(!iszero, x) +end + +isconvex(::MCPPenalty) = false +isseparable(::MCPPenalty) = true + + +################################# +### RangedPenalties +################################# + +abstract type AbstractRangedPenalties{T} <: PenaltyFunction{T} end + +struct End{T<:Integer} + offset::T + + End{T}(offset::T) where {T<:Integer} = new{T}(offset) + End{T}() where {T<:Integer} = new{T}(zero(T)) +end +End(offset::T) where {T<:Integer} = End{T}(offset) +End() = End{Int}() + +struct EndStepRange{T<:Integer} + start::T + step::T + stop::T + + function EndStepRange{T}( + start::Integer, step::Integer, stop::Integer + ) where {T<:Integer} + return new{T}(Base.convert.(T, (start, step, stop))...) + end +end +function EndStepRange( + start::I1, step::I2, stop::I3 +) where {I1<:Integer,I2<:Integer,I3<:Integer} + return EndStepRange{promote_type(I1, I2, I3)}(start, step, stop) +end + +(::Colon)(start::Integer, stop::End) = EndStepRange(start, one(start), stop.offset) +(::Colon)(start::Integer, step::Integer, stop::End) = EndStepRange(start, step, stop.offset) + +torange(p::EndStepRange, n::Integer) = (p.start):(p.step):(n - p.stop) +function torange(p::Vector, n::Integer) + return if all(1 .<= p .<= n) + sort(p) + else + throw(BoundsError("attempt to access $n-element Vector at index [$(p)]")) + end +end +function torange(p::AbstractRange, n::Integer) + return if (last(p) <= n) + p + else + throw(BoundsError("attempt to access $n-element Vector at index [$(p)]")) + end +end + +function excludeindex(p::Vector, i::Integer) + i in p || return p + + return filter(!=(i), p) +end +function excludeindex(p::AbstractRange, i::Integer) + i in p || return p + + if first(p) == i + ran = (first(p) + step(p)):step(p):last(p) + elseif last(p) == i + ran = first(p):step(p):(last(p) - step(p)) + else + ran = filter(!=(i), p) + end + return ran +end + +RangeLikeType{T} = Union{UnitRange{<:T},StepRange{<:T,<:T},EndStepRange{<:T},Vector{<:T}} + + +mutable struct RangedPenalties{T<:AbstractFloat,N<:Integer} <: AbstractRangedPenalties{T} + ranges::Vector{RangeLikeType{N}} + penalties::Vector{PenaltyFunction} + notinrange::Vector{N} + isconcrete::Bool + + function RangedPenalties{T,N}( + ranges::AbstractVector, penalties::AbstractVector{<:PenaltyFunction{T}} + ) where {T<:AbstractFloat,N<:Integer} + any(p isa RangedPenalties for p in penalties) && throw( + ArgumentError( + "RangedPenalties penalties should not be of type RangedPenalties: $(typeof.(penalties))", + ), + ) + length(ranges) == length(penalties) || throw( + ArgumentError( + "ranges and penalties should have the same number of elements: $(length(ranges)) != $(length(penalties))", + ), + ) + + return new{T,N}(ranges, penalties, zeros(N, 0), false) + end +end + +function 
RangedPenalties( + ranges::AbstractVector, + penalties::AbstractVector{<:PenaltyFunction{T}}, + n::Union{Nothing,Integer}=nothing, +) where {T<:AbstractFloat} + length(penalties) > 0 || + throw(ArgumentError("RangedPenalties should contain at least one PenaltyFunction")) + p = RangedPenalties{T,typeof(length(ranges))}(ranges, penalties) + if !isnothing(n) + concrete!(p, n) + end + return p +end + +function Base.:(==)(x::RangedPenalties, y::RangedPenalties) + if !isconcrete(x) || !isconcrete(y) + return x.ranges == y.ranges && x.penalties == y.penalties + end + + # Concrete penalties + x.penalties == y.penalties || return false + for (xr, yr) in zip(x.ranges, y.ranges) + Set(xr) == Set(yr) || return false + end + Set(x.notinrange) == Set(y.notinrange) || return false + return true +end + +isconcrete(p::RangedPenalties) = p.isconcrete + +function concrete!( + p::RangedPenalties, n::Integer, intercept_index::Union{Nothing,Integer}=nothing +) + ranges = p.ranges + penalties = p.penalties + # Check that ranges are non-overlapping + notinrange = Set(1:n) + @inbounds for j in eachindex(ranges, penalties) + ran = torange(ranges[j], n) + sran = Set(ran) + if intersect(notinrange, sran) != sran + error("Overlapping ranges are not allowed: $(ranges)") + end + if !isnothing(intercept_index) + # Remove intercept_index from range + ran = excludeindex(ran, intercept_index) + sran = Set(ran) + end + ## Transform End-ranges to Base ranges + ranges[j] = ran + ## Remoev used ranges + setdiff!(notinrange, sran) + end + ## Store missing indices in notinrange + p.notinrange = sort!(collect(notinrange)) + p.isconcrete = true + return p +end + +function concrete( + p::RangedPenalties, n::Integer, intercept_index::Union{Nothing,Integer}=nothing +) + new_p = deepcopy(p) + return concrete!(new_p, n, intercept_index) +end + +function cost(p::RangedPenalties{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + isconcrete(p) || error( + "RangedPenalties not concrete, call `concrete!` beforehand to make sure the ranges are well-defined.", + ) + ranges = p.ranges + penalties = p.penalties + + n = length(x) + r = zero(T) + for (ran, pen) in zip(ranges, penalties) + r += cost(pen, view(x, ran)) + end + return r +end + +function proximal!( + p::RangedPenalties{T}, out, x::AbstractArray{T}, step::T=one(T) +) where {T<:AbstractFloat} + isconcrete(p) || error( + "RangedPenalties not concrete, call `concrete!` beforehand to make sure the ranges are well-defined.", + ) + ranges = p.ranges + penalties = p.penalties + + n = length(x) + for (ran, pen) in zip(ranges, penalties) + proximal!(pen, view(out, ran), view(x, ran), step) + end + # Apply NoPenalty to indices not defined by ranges + if !isempty(p.notinrange) + ran = p.notinrange + pen = NoPenalty{T}() + proximal!(pen, view(out, ran), view(x, ran), step) + end + return out +end + +function proximal!( + p::RangedPenalties{T}, + out::AbstractVector, + index::Integer, + x::AbstractVector, + step::AbstractFloat, +) where {T<:AbstractFloat} + isconcrete(p) || error( + "RangedPenalties not concrete, call `concrete!` beforehand to make sure the ranges are well-defined.", + ) + ranges = p.ranges + penalties = p.penalties + + done = false + for (ran, pen) in zip(ranges, penalties) + if in(index, ran) + proximal!(pen, view(out, index), view(x, index), step) + done = true + break + end + end + if !done && !isempty(p.notinrange) + if !in(index, p.notinrange) + n = length(x) + error("index not found in range $(1:n): $index") + end + proximal!(NoPenalty{T}(), view(out, index), view(x, 
index), step) + end + return out +end + +function StatsAPI.dof(p::RangedPenalties{T}, x::AbstractVector{T}) where {T<:AbstractFloat} + isconcrete(p) || error( + "RangedPenalties not concrete, call `concrete!` beforehand to make sure the ranges are well-defined.", + ) + ranges = p.ranges + penalties = p.penalties + + # For ranges without penalty, count 1 dof per index + r = convert(T, length(p.notinrange)) + @inbounds for (ran, pen) in zip(ranges, penalties) + r += dof(pen, view(x, ran)) + end + return r +end + +isconvex(p::RangedPenalties) = all(isconvex.(p.penalties)) +function isseparable(p::RangedPenalties) + ranges = p.ranges + penalties = p.penalties + + if !isconcrete(p) + error("isseparable is only defined on concrete RangedPenalties.") + end + @inbounds for (ran, pen) in zip(ranges, penalties) + if !isseparable(pen) && length(ran) > 1 + return false + end + end + return true +end diff --git a/src/regularizedpred.jl b/src/regularizedpred.jl index f91f893..ccfa886 100644 --- a/src/regularizedpred.jl +++ b/src/regularizedpred.jl @@ -10,6 +10,18 @@ StatsAPI.modelmatrix(p::AbstractRegularizedPred) = p.X StatsAPI.coef(p::AbstractRegularizedPred) = p.beta0 +penalty(p::AbstractRegularizedPred) = p.penalty + +penalized_coef(p::AbstractRegularizedPred) = coef(p) + +dev_criteria(p::AbstractRegularizedPred) = 2 * cost(penalty(p), penalized_coef(p)) + +initpred!(p::AbstractRegularizedPred, args...; kwargs...) = p + +updatepred!(p::AbstractRegularizedPred, args...; kwargs...) = p + +update_beta!(p::AbstractRegularizedPred, args...; kwargs...) = p + function StatsAPI.vcov(p::AbstractRegularizedPred, wt::AbstractVector) wXt = isempty(wt) ? modelmatrix(p)' : (modelmatrix(p) .* wt)' return inv(Hermitian(float(Matrix(wXt * modelmatrix(p))))) @@ -217,6 +229,10 @@ function Base.propertynames(r::RidgePred, private::Bool=false) end end +penalty(p::RidgePred) = SquaredL2Penalty(p.λ) + +penalized_coef(p::RidgePred) = p.G * (p.pred.beta0 - p.βprior) + function cgpred( X::StridedMatrix{T}, λ::Real, @@ -363,3 +379,1091 @@ function StatsAPI.vcov(p::RidgePred, wt::AbstractVector) wXt = isempty(wwt) ? extendedmodelmatrix(p)' : (extendedmodelmatrix(p) .* wwt)' return inv(Hermitian(float(Matrix(wXt * extendedmodelmatrix(p))))) end + + +############################## +### Coordinate Gradient Descent predictor +############################## + +mutable struct CGDParams + "`outerloop`: use extra outer loop" + outerloop::Bool + "`update_all`: update all coefficients, that are zero or not." + update_all::Bool + + function CGDParams(outerloop::Bool=true, update_all::Bool=true) + return new(outerloop, update_all) + end +end + +function initparams!(p::CGDParams) + p.update_all = true + return p +end + +""" + CGDRegPred + +Regularization using Coordinate Gradient Descent. + +# Members + +- `X`: model matrix +- `beta0`: base vector for coefficients +- `delbeta`: coefficients increment +- `Σ`: Gram matrix and temporary matrix +- `penalty`: the penalty function. +- `invvar`: precision vector. +- `scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method. +""" +struct CGDRegPred{T<:BlasReal,M<:AbstractMatrix{T},V<:Vector{T},P<:PenaltyFunction} <: + AbstractRegularizedPred{T} + "`X`: model matrix" + X::M + "`beta0`: base vector for coefficients" + beta0::V + "`delbeta`: coefficients increment" + delbeta::V + "`Σ`: Gram matrix, X'WX." 
+ Σ::M + "`penalty`: penalty function" + penalty::P + "`invvar`: inverse variance/precision vector" + invvar::V + "`params`: parameters for CGD" + params::CGDParams + "`scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method" + scratchbeta::Vector{T} + + function CGDRegPred( + X::M, + Σ::AbstractMatrix{<:Real}, + invvar::AbstractVector{<:Real}, + penalty::P, + outerloop::Bool=true, + ) where {M<:AbstractMatrix{T},P<:PenaltyFunction} where {T<:BlasReal} + m = size(Σ, 1) + + if !isseparable(penalty) + error( + "Coordinate Gradient Descent is only allowed with a separable penalty: " * + "$(penalty)" + ) + end + + params = CGDParams(outerloop) + + return new{T,M,typeof(invvar),P}( + X, + zeros(T, m), + zeros(T, m), + Σ, + penalty, + invvar, + params, + zeros(T, m), + ) + end +end + +function CGDRegPred( + X::M, penalty::P, wts::V, outerloop::Bool=true, +) where {M<:AbstractMatrix{T},V<:AbstractVector,P<:PenaltyFunction} where {T<:BlasReal} + n, m = size(X) + ll = size(wts, 1) + ll in (0, n) || throw(DimensionMismatch("length of wts is $ll, must be 0 or $n.")) + + # invvar = precision_vector(X, wts) + if isempty(wts) + Σ = Hermitian(float(X' * X)) + else + Σ = Hermitian(float((wts .* X)' * X)) + end + invvar = Vector(inv.(diag(Σ))) + + return CGDRegPred(X, Σ, invvar, penalty, outerloop) +end + +function precision_vector( + X::AbstractMatrix{T}, wts::AbstractVector +) where {T<:AbstractFloat} + invvar = Vector{T}(undef, size(X, 2)) + if isempty(wts) + @inbounds @simd for j in eachindex(invvar, axes(X, 2)) + invvar[j] = 1 / sum(abs2, view(X, :, j)) + end + else + @inbounds @simd for j in eachindex(invvar, axes(X, 2)) + invvar[j] = 1 / sum(i -> wts[i] * abs2(X[i, j]), eachindex(wts, axes(X, 1))) + end + end + return invvar +end + +function initpred!(p::CGDRegPred, wts::AbstractVector=[], σ::Real=1; verbose::Bool=false) + initparams!(p.params) + return p +end + +function update_βμ!( + p::CGDRegPred{T}, + y::AbstractVector{T}, + μ::AbstractVector{T}, + res::AbstractVector{T}, + σ2::T=one(T), + wts::AbstractVector{T}=T[]; + tol::Real=1e-4, + verbose::Bool=false, +) where {T<:BlasReal} + X = p.X + β = p.beta0 + newβ = p.delbeta + invvar = p.invvar + pen = penalty(p) + + m = size(X, 2) + outerM = p.params.outerloop ? 
m : 1 + update_all = p.params.update_all + +# copyto!(newβ, β) + # outer loop + @inbounds for _ in 1:outerM + + Δdev = zero(T) + # iterate over indices + @inbounds for j in eachindex(axes(X, 2), β, newβ, invvar) + βj = β[j] + !update_all && iszero(βj) && continue + + Xj = view(X, :, j) + + # remove component due to index j in μ + # µ -= X[:, j] * β[j] +# if !iszero(βj) +# @inbounds @simd for i in eachindex(μ, Xj) +# μ[i] -= Xj[i] * β[j] +# end +## broadcast!(muladd, μ, -β[j], Xj, μ) +# end + + newβ[j] = βj + if isempty(wts) + @inbounds @simd for i in eachindex(axes(X, 1), res, Xj) + newβ[j] += Xj[i] * res[i] +# @inbounds @simd for i in eachindex(axes(X, 1), y, μ, Xj) +# newβ[j] += Xj[i] * (y[i] - μ[i] + Xj[i] * βj) + end + else + @inbounds @simd for i in eachindex(axes(X, 1), res, Xj, wts) + newβ[j] += wts[i] * Xj[i] * res[i] + end + end + newβ[j] *= invvar[j] + + + # verbose && println("∇βj before prox: $(gradβ)") + # Proximal operator on a single coordinate + proximal!(pen, newβ, j, newβ, invvar[j] * σ2) +# proximal!(pen, β, j, newβ, invvar[j] * σ2) + # verbose && println("β after prox: $(p.β)") + + # re-add component due to index j in μ + # μj = p.X[:, j] * p.β[j] + # p.µ .+= μj +## if !iszero(β[j]) +## @inbounds @simd for i in eachindex(μ, Xj) +## μ[i] += Xj[i] * β[j] +## end +### broadcast!(muladd, μ, β[j], view(X, :, j), μ) +## end + Δβ = newβ[j] - βj + if !iszero(Δβ) + β[j] = newβ[j] + @inbounds @simd for i in eachindex(μ, res, Xj) + dμi = Xj[i] * Δβ + μ[i] += dμi + res[i] -= dμi + end + Δdev = max(Δdev, abs2(Δβ) / invvar[j]) + end + end + + if Δdev < tol + if update_all + # Converged + break + else + # Run a last iteration, computing all the coefficients again + update_all = true + end + else + # Check for convergence + update_all = false + end + end + + p.params.update_all = true + return p +end + + +############################## +### FISTA predictor +############################## + +mutable struct FISTAParams{T<:AbstractFloat} + "`restart`: allow restart" + restart::Bool + "`use_backtracking`: use backtracking for adjusting the step size" + use_backtracking::Bool + "`bt_maxiter`: backtracking loop, maximum number of iteration" + bt_maxiter::Int + "`bt_stepfac`: backtracking loop, reducing factor" + bt_delta::T + "`sk`: FISTA step parameter" + sk::T + "`tk`: FISTA acceleration step" + tk::T + "`kk`: ADMM number of iterations since last restart" + kk::Int + + function FISTAParams{T}( + restart::Bool, use_backtracking::Bool, bt_maxiter::Integer=20, bt_delta::Real=0.5 + ) where {T<:AbstractFloat} + return new{T}( + restart, use_backtracking, bt_maxiter, bt_delta, one(T), one(T), 1, + ) + end +end + +function initparams!(p::FISTAParams, sk::Real=1) + p.sk = sk + p.tk = 1 + p.kk = 1 + return p +end + +""" + update_accelstep!(p::FISTAParams, restart::Bool=false; verbose::Bool=false) + +Update the step for acceleration and return the acceleration factor. +If `restart`, reset to 1. 
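+
+The acceleration factor follows the accelerated proximal-gradient sequence of
+Tseng (2008), as implemented below: `tkp1 = (√(tk^4 + 4 * tk^2) - tk^2) / 2` and
+`mk = tkp1 * (1 - tk) / tk`.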
+""" +function update_accelstep!(p::FISTAParams, restart::Bool=false; verbose::Bool=false) + if restart + verbose && println("Restart after $(p.kk) FISTA iterations") + # reset step and iteration number + p.tk = 1 + p.kk = 1 + return zero(p.tk) + end + + tk = p.tk + # tkp1 = (1 + √(1 + 4 * p.tk^2)) / 2 + # mk = (p.tk - 1) / tkp1 + + # mk = (k - 1) / (k + 2) + + # Tseng (2008) - On accelerated proximal gradient methods for convex-concave optimization + tkp1 = (√(tk^4 + 4 * tk^2) - tk^2) / 2 + mk = tkp1 * (1 - tk) / tk + # Update the acceleration step + p.tk = tkp1 + # Update the iteration number + p.kk += 1 + return mk +end + + +""" + FISTARegPred + +Regularization using Fast Iterative Shrinkage-Thresholding Algorithm. + +# Members + +- `X`: model matrix +- `beta0`: base vector for coefficients +- `delbeta`: coefficients increment +- `Σ`: Gram matrix and temporary matrix +- `penalty`: the penalty function. +- `scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method. +""" +struct FISTARegPred{T<:BlasReal,M<:AbstractMatrix{T},V<:Vector{T},P<:PenaltyFunction} <: + AbstractRegularizedPred{T} + "`X`: model matrix" + X::M + "`beta0`: base vector for coefficients" + beta0::V + "`delbeta`: coefficients increment" + delbeta::V + "`Σ`: Corresponds to X'WX." + Σ::M + "`penalty`: penalty function" + penalty::P + "`scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method" + scratchbeta::V + "`wXt`: transpose of the (weighted) model matrix" + wXt::M + "`params`: parameters for FISTA" + params::FISTAParams + "pre-allocated temporary vectors" + gradβ::V + proxβ::V + stepβ::V + + function FISTARegPred( + X::M, + wXt::AbstractMatrix{<:Real}, + Σ::AbstractMatrix{<:Real}, + penalty::P, + restart::Bool=true, + use_backtracking::Bool=false, + ) where {M<:AbstractMatrix{T},P<:PenaltyFunction} where {T<:BlasReal} + m = size(Σ, 1) + + params = FISTAParams{T}(restart, use_backtracking) + + beta0 = zeros(T, m) + return new{T,M,typeof(beta0),P}( + X, + beta0, + zeros(T, m), + Σ, + penalty, + zeros(T, m), + wXt, + params, + zeros(T, m), + zeros(T, m), + zeros(T, m), + ) + end +end + +function FISTARegPred( + X::M, penalty::P, wts::V, restart::Bool=true, use_backtracking::Bool=false +) where {M<:AbstractMatrix{T},V<:AbstractVector,P<:PenaltyFunction} where {T<:BlasReal} + n, m = size(X) + ll = size(wts, 1) + ll in (0, n) || throw(DimensionMismatch("length of wts is $ll, must be 0 or $n.")) + + wXt = isempty(wts) ? X' : (X .* wts)' + Σ = Hermitian(float(wXt * X)) + + return FISTARegPred(X, wXt, Σ, penalty, restart, use_backtracking) +end + +function initpred!(p::FISTARegPred, wts::AbstractVector=[], σ::Real=1; verbose::Bool=false) + copyto!(p.proxβ, p.beta0) + copyto!(p.stepβ, p.beta0) + + # for Normal errors + # sk = 1 / eigmax(Σ) + # TODO: `eigen` methods not defined for sparse arrays, use Arpack.jl? 
+    # sk = 1 / eigmax(Matrix(Σ))
+    sk = 1 / eigmax(Matrix(p.Σ))
+    initparams!(p.params, sk)
+    return p
+end
+
+function update_beta!(
+    p::FISTARegPred{T},
+    y::AbstractVector{T},
+    wts::AbstractVector{T}=T[],
+    σ2::T=one(T);
+    verbose::Bool=false,
+) where {T<:BlasReal}
+    β = p.beta0
+    dβ = p.delbeta
+    Σ = p.Σ
+    wXt = p.wXt
+    gradβ = p.gradβ
+    stepβ = p.stepβ
+    proxβ = p.proxβ
+    scratchbeta = p.scratchbeta
+
+    sk = p.params.sk
+    restart = p.params.restart
+
+
+    # ∇_β̂(f) = (-1/σ^2) * ((W .* X)' * wrky - Σ * β̂)
+    # ∇_β̂(f) = Σ * β̂ - ((W .* X)' * wrky)
+    # For Normal errors
+    mul!(gradβ, wXt, y)
+    mul!(scratchbeta, Σ, stepβ) # dumb variable scratchbeta
+    # rdiv!(broadcast!(-, gradβ, gradβ, scratchbeta), -σ2)
+    broadcast!(-, gradβ, scratchbeta, gradβ)
+
+    # broadcast!(-, yres, broadcast!(-, yres, p.y, mul!(yres, p.X, stepβ)), p.outliers)
+    # rdiv!(mul!(gradβ, wXt, yres), -p.σ)
+    @. dβ = stepβ - sk * gradβ
+    proximal!(penalty(p), proxβ, dβ, sk * σ2)
+    verbose && println("current coefs:\n\t$(β)")
+    verbose && println("gradient:\n\t$(sk * gradβ)")
+    verbose && println("new coefs:\n\t$(proxβ)")
+
+    # find step-size
+    if p.params.use_backtracking
+        Im = I(size(β, 1))
+        bt_delta = p.params.bt_delta
+        bt_maxiter = p.params.bt_maxiter
+        # backtracking
+        # x = βstep
+        # f(β) = 1/2 ||y - Xβ - γ||²
+        # Δ = F(pr) - Q(pr, x, Li)
+        # Δ = f(pr) - f(x) - dot(pr - x, df_x) - Lk/2 * sum(abs2, pr - x)
+        broadcast!(-, scratchbeta, proxβ, stepβ)
+        # for Normal errors
+        # Needs julia > v1.4
+        Δ = dot(scratchbeta, sk * Σ - Im, scratchbeta)
+        verbose && println("backtracking: decrease gradient step if 0 < $(Δ)")
+        jj = 0
+        while Δ > 0 && jj < bt_maxiter
+            # Update the gradient step
+            p.params.sk = sk = bt_delta * sk
+            @. dβ = stepβ - sk * gradβ
+            proximal!(penalty(p), proxβ, dβ, sk * σ2)
+            broadcast!(-, scratchbeta, proxβ, stepβ)
+            # for Normal errors
+            Δ = dot(scratchbeta, sk * Σ - Im, scratchbeta)
+            verbose && println("backtracking step $jj: decrease FISTA step $sk")
+            jj += 1
+        end
+        Δ > 0 && throw(ConvergenceException(bt_maxiter))
+    end
+
+
+    # FISTA step
+    c = dot(gradβ, proxβ) - dot(gradβ, β)
+    if !restart || c <= 0
+        # Accelerated step
+        mk = update_accelstep!(p.params, false; verbose=verbose)
+        @. stepβ = proxβ + mk * (proxβ - β)
+    else
+        # Restart
+        update_accelstep!(p.params, true; verbose=verbose)
+        @. stepβ = β
+    end
+
+    # update variables
+    copyto!(β, proxβ)
+
+    return p
+end
+
+
+##############################
+### AMA predictor
+##############################
+
+mutable struct AMAParams{T<:AbstractFloat}
+    "`restart`: allow restart"
+    restart::Bool
+    "`η`: η for restart criteria"
+    η::T
+    "`ρ`: AMA step parameter"
+    ρ::T
+    "`tk`: AMA acceleration step"
+    tk::T
+    "`ck`: AMA restart criterium"
+    ck::T
+    "`kk`: AMA number of iterations since last restart"
+    kk::Int
+
+    function AMAParams{T}(restart::Bool, η::Real=1.0) where {T<:AbstractFloat}
+        return new{T}(restart, η, one(T), one(T), zero(T), 1)
+    end
+end
+
+function initparams!(p::AMAParams, ρ::Real=1)
+    p.ρ = ρ
+    p.tk = 1
+    p.ck = 0
+    p.kk = 1
+    return p
+end
+
+"""
+    update_accelstep!(p::AMAParams, restart::Bool=false; verbose::Bool=false)
+
+Update the step for acceleration and return the acceleration factor.
+If `restart`, reset to 1.
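+
+The acceleration factor follows the Nesterov momentum sequence used by Fast AMA
+(Goldstein 2014): `tkp1 = (1 + √(1 + 4 * tk^2)) / 2` and `mk = (tk - 1) / tkp1`.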
+""" +function update_accelstep!(p::AMAParams, restart::Bool=false; verbose::Bool=false) + if restart + verbose && println("Restart after $(p.kk) AMA iterations") + # reset step and iteration number + p.tk = 1 + p.kk = 1 + return zero(p.tk) + end + + tk = p.tk + tkp1 = (1 + √(1 + 4 * p.tk^2)) / 2 + mk = (p.tk - 1) / tkp1 + p.tk = tkp1 + # update iteration number + p.kk += 1 + return mk +end + + +struct AMARegPred{ + T<:BlasReal, + M<:AbstractMatrix{T}, + V<:Vector{T}, +# C, + P<:PenaltyFunction, + M2<:AbstractMatrix{T}, + V2<:AbstractVector{T}, +} <: AbstractRegularizedPred{T} + "`X`: model matrix" + X::M + "`beta0`: base vector for coefficients" + beta0::V + "`delbeta`: coefficients increment" + delbeta::V + "`Σ`: Corresponds to X'WX." + Σ::M + "`penalty`: penalty function" + penalty::P + "`scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method" + scratchbeta::V + "`penbeta0`: vector of length `p`, used in [`penalized_coef`](@ref) method" + penbeta0::V +# "`chol`: cholesky factorization" +# chol::C + "`wXt`: transpose of the (weighted) model matrix" + wXt::M + "`A`: matrix of the constraint equation `A . u - b - v = 0`" + A::M2 + "`b`: vector of the constraint equation `A . u - b - v = 0`" + b::V2 + "`params`: parameters for AMA" + params::AMAParams + "pre-allocated temporary vectors" + vk::V + vkp1::V + wk::V + wkp1::V + whatk::V + whatkp1::V + + function AMARegPred( + X::M, + wXt::AbstractMatrix{<:Real}, + Σ::AbstractMatrix{<:Real}, + penalty::P, + A::AbstractMatrix{<:Real}=float(I(size(Σ, 1))), + b::AbstractVector{<:Real}=zeros(T, size(Σ, 1)), + restart::Bool=true, + ) where {M<:AbstractMatrix{T},P<:PenaltyFunction} where {T<:BlasReal} + m = size(Σ, 1) + +# chol = cholesky(Σ) + bhat = A * b + params = AMAParams{T}(restart) + + beta0 = zeros(T, m) +# return new{T,M,typeof(beta0),typeof(chol),P,typeof(A),typeof(b)}( + return new{T,M,typeof(beta0),P,typeof(A),typeof(b)}( + X, + beta0, + zeros(T, m), + Σ, + penalty, + zeros(T, m), + zeros(T, m), +# chol, + wXt, + A, + bhat, + params, + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + ) + end +end + +function AMARegPred( + X::M, + penalty::P, + wts::V, + A::AbstractMatrix{<:Real}=zeros(T, 0, 0), + b::AbstractVector{<:Real}=zeros(T, 0), + restart::Bool=true, +) where { + M<:AbstractMatrix{T},V<:AbstractVector{<:Real},P<:PenaltyFunction +} where {T<:BlasReal} + n, m = size(X) + ll = size(wts, 1) + ll in (0, n) || throw(DimensionMismatch("length of wts is $ll, must be 0 or $n.")) + + wXt = isempty(wts) ? 
X' : (X .* wts)' + Σ = Hermitian(float(wXt * X)) + + if isempty(A) + A = float(I(m)) + end + size(A) == (m, m) || + throw(DimensionMismatch("size of A is $(size(A)), must be (0, 0) or $((m, m)).")) + if isempty(b) + b = zeros(T, m) + end + size(b, 1) == m || + throw(DimensionMismatch("size of b is $(size(b, 1)), must be 0 or $(m).")) + + return AMARegPred(X, wXt, Σ, penalty, A, b, restart) +end + +penalized_coef(p::AMARegPred) = mul!(copyto!(p.penbeta0, p.b), p.A, p.beta0, 1, -1) + +function initpred!(p::AMARegPred, wts::AbstractVector=[], σ::Real=1; verbose::Bool=false) + copyto!(p.delbeta, p.beta0) + copyto!(p.scratchbeta, p.beta0) + copyto!(p.vk, p.beta0) + copyto!(p.vkp1, p.beta0) + copyto!(p.wk, p.beta0) + copyto!(p.wkp1, p.beta0) + copyto!(p.whatk, p.beta0) + copyto!(p.whatkp1, p.beta0) + + # Goldstein (2014) - Fast Alternating Direction Optimization Methods + # Theorem 5: ρ <= σH / ρ(A'A) + # ρ = eigmin(Σ) / eigmax(A'A) + # TODO: `eigen` methods not defined for sparse arrays, use Arpack.jl? + # ρ = eigmin(Matrix(Σ)) / eigmax(Matrix(A'A)) + ρ = eigmin(Matrix(p.Σ)) / eigmax(Matrix(p.A' * p.A)) + initparams!(p.params, abs(ρ)) + + return p +end + +function update_beta!( + p::AMARegPred{T}, + y::AbstractVector{T}, + wts::AbstractVector{T}=T[], + σ2::T=one(T); + verbose::Bool=false, +) where {T<:BlasReal} + β = p.beta0 + wXt = p.wXt + βkp1 = p.delbeta + scratch = p.scratchbeta +# chol = p.chol + A = p.A + b = p.b + + vk = p.vk + vkp1 = p.vkp1 + wk = p.wk + wkp1 = p.wkp1 + whatk = p.whatk + whatkp1 = p.whatkp1 + + ρ = p.params.ρ + ck = p.params.ck + restart = p.params.restart + η = p.params.η + + verbose && println("AMA current coefs:\n\t$(β)") + + # Goldstein (2014) - Fast Alternating Minimization Algorithm + # Lρ(u, v, w) = 1/2σ² |y - X u|² + P(v) + ρ w' * (A u - b - v) + ρ/2 |Au - b - v|² + # uk+1 = (X'WX) \ (X'Wy - ρ σ² A' wk) # argmin_u Lρ(u, vk, wk) without last term + # vk+1 = prox_λ/ρ ( wk + A uk+1 - b ) # argmin_v Lρ(uk+1, v, wk) + # wk+1 = wk + A uk+1 - b - vk+1 + # + # primal residual: rk = A uk - b - vk + # dual residual: dk = ρ * (vk+1 - vk) + # + # βkp1 = chol \ (wXt * wrky - ρ * σ2 * A' * whatk) + scratch = mul!(scratch, A', whatk) + scratch = mul!(scratch, wXt, y, 1, -ρ * σ2) +# βkp1 = chol \ scratch + cg!(βkp1, p.Σ, scratch) + # vkp1 = proximal(penalty(p), A * βkp1 - b + whatk, 1/ρ) + scratch = mul!(copyto!(scratch, b), A, βkp1, 1, -1) + broadcast!(+, scratch, scratch, whatk) + proximal!(penalty(p), vkp1, scratch, 1 / ρ) + # wkp1 = whatk + A * βkp1 - b - vkp1 + broadcast!(-, wkp1, scratch, vkp1) + + ckp1 = sum(abs2, wkp1 - whatk) + if !restart || ck == 0 || ckp1 < η * ck + # Accelerated step + mk = update_accelstep!(p.params, false; verbose=verbose) + @. whatkp1 = wkp1 + mk * (wkp1 - wk) + else + # Restart + update_accelstep!(p.params, true; verbose=verbose) + @. 
whatkp1 = wk + ckp1 = ck / η + end + + # update variables + p.params.ck = ckp1 + copyto!(β, βkp1) + copyto!(vk, vkp1) + copyto!(wk, wkp1) + copyto!(whatk, whatkp1) + + return p +end + + + +############################## +### ADMM predictor +############################## + +mutable struct ADMMParams{T<:AbstractFloat} + "`restart`: allow restart" + restart::Bool + "`η`: η for restart criteria" + η::T + "`adapt`: allow adaptation of ρ" + adapt::Bool + "`adapt_τ`: parameter τ for ρ-adaptation" + adapt_τ::T + "`adapt_logμ`: (log of the) parameter μ for ρ-adaptation" + adapt_logμ::T + "`ρ`: ADMM step parameter" + ρ::T + "`tk`: ADMM acceleration step" + tk::T + "`ck`: ADMM restart criterium" + ck::T + "`kk`: ADMM number of iterations since last restart" + kk::Int + "`updatedρ`: flag for updated ρ" + updatedρ::Bool + + function ADMMParams{T}( + restart::Bool, adapt::Bool, η::Real=0.999, adapt_τ::Real=2.0, adapt_μ::Real=10.0 + ) where {T<:AbstractFloat} + return new{T}( + restart, η, adapt, adapt_τ, log(adapt_μ), one(T), one(T), zero(T), 1, false + ) + end +end + +function initparams!(p::ADMMParams, ρ::Real=1) + p.ρ = ρ + p.tk = 1 + p.ck = 0 + p.kk = 1 + p.updatedρ = false + return p +end + +""" + update_accelstep!(p::ADMMParams, restart::Bool=false; verbose::Bool=false) + +Update the step for acceleration and return the acceleration factor. +If `restart`, reset to 1. +""" +function update_accelstep!(p::ADMMParams, restart::Bool=false; verbose::Bool=false) + if restart + verbose && println("Restart after $(p.kk) ADMM iterations") + # reset step and iteration number + p.tk = 1 + p.kk = 1 + return zero(p.tk) + end + + tk = p.tk + tkp1 = (1 + √(1 + 4 * p.tk^2)) / 2 + mk = (p.tk - 1) / tkp1 + p.tk = tkp1 + # update iteration number + p.kk += 1 + return mk +end + +""" + update_adapt!(p::ADMMParams, logr::Real; verbose::Bool=false) + +If `adapt`, update the step ρ. `logr`= log(rrk / ssk), so the criteria become: + - `rrk > ssk * adapt_μ` ==> `logr > adapt_logμ` + - `rrk < ssk / adapt_μ` ==> `logr < -adapt_logμ` +Returns the step ρ. +""" +function update_adapt!(p::ADMMParams, logr::Real; verbose::Bool=false) + if !p.adapt + return p.ρ + end + ρ = p.ρ + if logr > p.adapt_logμ + ρ *= p.adapt_τ + elseif logr < -p.adapt_logμ + ρ /= p.adapt_τ + end + if p.ρ != ρ + verbose && println("ADMM adapt ρ: $ρ") + p.updatedρ = true + end + p.ρ = ρ + return ρ +end + + +struct ADMMRegPred{ + T<:BlasReal, + M<:AbstractMatrix{T}, + V<:Vector{T}, +# C, + P<:PenaltyFunction, + M2<:AbstractMatrix{T}, + V2<:AbstractVector{T}, +} <: AbstractRegularizedPred{T} + "`X`: model matrix" + X::M + "`beta0`: base vector for coefficients" + beta0::V + "`delbeta`: coefficients increment" + delbeta::V + "`Σ`: Corresponds to X'WX." + Σ::M + "`penalty`: penalty function" + penalty::P + "`scratchbeta`: scratch vector of length `p`, used in [`linpred!`](@ref) method" + scratchbeta::V + "`penbeta0`: vector of length `p`, used in [`penalized_coef`](@ref) method" + penbeta0::V +# "`chol`: cholesky factorization" +# chol::C + "`wXt`: transpose of the (weighted) model matrix" + wXt::M + "`A`: matrix of the constraint equation `A . u - b - v = 0`" + A::M2 + "`b`: vector of the constraint equation `A . 
u - b - v = 0`" + b::V2 + "`params`: parameters for ADMM" + params::ADMMParams + "pre-allocated temporary vectors" + vk::V + vkp1::V + vhatk::V + vhatkp1::V + wk::V + wkp1::V + whatk::V + whatkp1::V + wrkΣ::M + + function ADMMRegPred( + X::M, + wXt::AbstractMatrix{<:Real}, + Σ::AbstractMatrix{<:Real}, + penalty::P, + A::AbstractMatrix{<:Real}=float(I(size(Σ, 1))), + b::AbstractVector{<:Real}=zeros(T, size(Σ, 1)), + restart::Bool=true, + adapt::Bool=false, + ) where {M<:AbstractMatrix{T},P<:PenaltyFunction} where {T<:BlasReal} + m = size(Σ, 1) + +# chol = cholesky(sparse(Σ)) + bhat = A * b + params = ADMMParams{T}(restart, adapt) + + beta0 = zeros(T, m) +# return new{T,M,typeof(beta0),typeof(chol),P,typeof(A),typeof(b)}( + return new{T,M,typeof(beta0),P,typeof(A),typeof(b)}( + X, + beta0, + zeros(T, m), + Σ, + penalty, + zeros(T, m), + zeros(T, m), +# chol, + wXt, + A, + bhat, + params, + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + zeros(T, m), + similar(Σ), + ) + end +end + +function ADMMRegPred( + X::M, + penalty::P, + wts::V, + A::AbstractMatrix{<:Real}=zeros(T, 0, 0), + b::AbstractVector{<:Real}=zeros(T, 0), + restart::Bool=true, + adapt::Bool=false, +) where { + M<:AbstractMatrix{T},V<:AbstractVector{<:Real},P<:PenaltyFunction +} where {T<:BlasReal} + n, m = size(X) + ll = size(wts, 1) + ll in (0, n) || throw(DimensionMismatch("length of wts is $ll, must be 0 or $n.")) + + wXt = isempty(wts) ? X' : (X .* wts)' + Σ = Hermitian(float(wXt * X)) + + if isempty(A) + A = float(I(m)) + end + size(A) == (m, m) || + throw(DimensionMismatch("size of A is $(size(A)), must be (0, 0) or $((m, m)).")) + if isempty(b) + b = zeros(T, m) + end + size(b, 1) == m || + throw(DimensionMismatch("size of b is $(size(b, 1)), must be 0 or $(m).")) + + return ADMMRegPred(X, wXt, Σ, penalty, A, b, restart, adapt) +end + +penalized_coef(p::ADMMRegPred) = mul!(copyto!(p.penbeta0, p.b), p.A, p.beta0, 1, -1) + +function initpred!(p::ADMMRegPred, wts::AbstractVector=[], σ::Real=1; verbose::Bool=false) + ## Init temporary arrays + copyto!(p.scratchbeta, p.beta0) + copyto!(p.vk, p.beta0) + copyto!(p.vkp1, p.beta0) + copyto!(p.vhatk, p.beta0) + copyto!(p.vhatkp1, p.beta0) + copyto!(p.wk, p.beta0) + copyto!(p.wkp1, p.beta0) + copyto!(p.whatk, p.beta0) + copyto!(p.whatkp1, p.beta0) + + ## Init parameters + # Goldstein (2014) - Fast Alternating Direction Optimization Methods + # Theorem 2: ρ^3 <= (σH σG^2) / (ρ(A'A) * ρ(B'B)^2) + # σK = eigmin(Σ) / eigmax(A'A) + # TODO: `eigen` methods not defined for sparse arrays, use Arpack.jl? 
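+    # With the default constraint (A = I, b = 0), `eigmax(A'A) == 1`, so σK reduces to
+    # the smallest eigenvalue of Σ and the initial step is ρ = σK^(1/3)
+    # (Goldstein 2014, Theorem 2).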
+ # σK = eigmin(Matrix(Σ)) / eigmax(Matrix(A'A)) + # ρ = σK^(1/3) + σK = eigmin(Matrix(p.Σ)) / eigmax(Matrix(p.A' * p.A)) + initparams!(p.params, abs(σK)^(1 / 3)) + + # Generate the extended Hessian matrix + updatepred!(p, σ; verbose=verbose, force=true) + + return p +end + + +function update_beta!( + p::ADMMRegPred{T}, + y::AbstractVector{T}, + wts::AbstractVector{T}=T[], + σ2::T=one(T); + verbose::Bool=false, +) where {T<:BlasReal} + β = p.beta0 + wXt = p.wXt + βkp1 = p.delbeta + scratch = p.scratchbeta + wrkΣ = p.wrkΣ + A = p.A + b = p.b + + vk = p.vk + vkp1 = p.vkp1 + vhatk = p.vhatk + vhatkp1 = p.vhatkp1 + wk = p.wk + wkp1 = p.wkp1 + whatk = p.whatk + whatkp1 = p.whatkp1 + + ρ = p.params.ρ + ck = p.params.ck + restart = p.params.restart + η = p.params.η + + verbose && println("ADMM current coefs:\n\t$(β)") + + # Goldstein (2014) - Fast Alternating Direction Optimization Methods + # Lρ(u, v, w) = 1/2σ² |y - X u|² + P(v) + ρ/2 |A u - b - v + w|² - ρ/2 |w|² + # uk+1 = (X'WX + ρ σ² A'A) \ (X'Wy + ρ σ² A' (b + vk - wk)) + # vk+1 = prox_λ/ρ ( A uk+1 - b + wk ) + # wk+1 = wk + A uk+1 - b - vk+1 + # + # primal residual: rk = A uk - b - vk + # dual residual: dk = ρ * (vk+1 - vk) + # + # βkp1 = facΣρ \ (wXt * wrky + ρ * σ2 * A' * (b + vhatk - whatk)) + broadcast!(+, βkp1, broadcast!(-, βkp1, vhatk, whatk), b) # dumb βkp1 + mul!(scratch, A', βkp1) # dumb βkp1 + mul!(scratch, wXt, y, 1, ρ * σ2) +# βkp1 = facΣρ \ scratch + cg!(βkp1, wrkΣ, scratch) + # proximal!(penalty(p), vkp1, A * βkp1 - b + whatk, 1/ρ) + scratch = mul!(copyto!(scratch, b), A, βkp1, 1, -1) + broadcast!(+, scratch, scratch, whatk) + proximal!(penalty(p), vkp1, scratch, 1 / ρ) + # wkp1 = whatk + A * βkp1 - b - vkp1 + broadcast!(-, wkp1, scratch, vkp1) + + rrk = sum(abs2, wkp1 - whatk) + ssk = ρ * sum(abs2, vkp1 - vhatk) + ckp1 = rrk + ssk + # verbose && println((; rrk, ssk)) + # verbose && println("criteria: $(ckp1) < $(η) * $(ck)") + if !restart || ck == 0 || ckp1 < η * ck + # Accelerated step + mk = update_accelstep!(p.params, false; verbose=verbose) + @. vhatkp1 = vkp1 + mk * (vkp1 - vk) + @. whatkp1 = wkp1 + mk * (wkp1 - wk) + else + # Restart + update_accelstep!(p.params, true; verbose=verbose) + @. vhatkp1 = vk + @. 
whatkp1 = wk + ckp1 = ck / η + end + + # update variables + p.params.ck = ckp1 + copyto!(β, βkp1) + copyto!(vk, vkp1) + copyto!(vhatk, vhatkp1) + copyto!(wk, wkp1) + copyto!(whatk, whatkp1) + + # adapt ρ + update_adapt!(p.params, log(rrk / ssk); verbose=verbose) + + return p +end + +function updatepred!(p::ADMMRegPred, σ::Real; verbose::Bool=false, force::Bool=false) + if (p.params.adapt && p.params.updatedρ) || force + Σ = p.Σ + A = p.A + ρ = p.params.ρ + + # Refactorize + # TODO: should be improved +# cholesky!(p.chol, sparse(Σ + ρ * σ^2 * Hermitian(A'A))) +# p.wrkΣ = Hermitian(p.Σ + σ2 * ρ * A'A) + copyto!(p.wrkΣ, p.Σ) + mul!(p.wrkΣ, A', A, ρ * σ^2, 1) + copyto!(p.wrkΣ, Hermitian(p.wrkΣ)) + p.params.updatedρ = false + verbose && println("Recompute cholesky factorization") + end + return p +end diff --git a/src/robustlinearmodel.jl b/src/robustlinearmodel.jl index 77385f3..daf70ee 100644 --- a/src/robustlinearmodel.jl +++ b/src/robustlinearmodel.jl @@ -66,6 +66,10 @@ function StatsAPI.confint(m::AbstractRobustModel; level::Real=0.95) return hcat(cc, cc) + alpha * se * hcat(-1.0, 1.0) end +haspenalty(r::AbstractRobustModel) = false + +penalty(r::AbstractRobustModel) = nothing + ## TODO: specialize to make it faster StatsAPI.leverage(m::AbstractRobustModel) = diag(projectionmatrix(m)) @@ -1499,3 +1503,296 @@ function resampling_best_estimate( verbose && println("Best subsample: β=$(βls[:, N])\tσ=$(σls[N])") return σls[N], βls[:, N] end + + +######################################## +### Penalized Least Squares +######################################## + +haspenalty(r::RobustLinearModel{T,R,P}) where {T,R,P<:AbstractRegularizedPred} = true + +penalty(r::RobustLinearModel{T,R,P}) where {T,R,P<:AbstractRegularizedPred} = r.pred.penalty + +function StatsAPI.dof(r::RobustLinearModel{T,R,P}) where {T,R,P<:AbstractRegularizedPred} + return min(wobs(r), dof(penalty(r), coef(r))) +end + + + +""" + fit(::Type{M}, + X::AbstractMatrix{T}, + y::AbstractVector{T}, + penalty::PenaltyFunction; + method::Symbol = :auto, # :cgd, :fista, :ama, :admm + dofit::Bool = true, + wts::FPVector = similar(y, 0), + offset::FPVector = similar(y, 0), + fitdispersion::Bool = false, + initial_scale::Union{Symbol, Real}=:mad, + σ0::Union{Symbol, Real}=initial_scale, + initial_coef::AbstractVector=[], + β0::AbstractVector=initial_coef, + correct_leverage::Bool=false + fitargs...) where {M<:RobustLinearModel, T<:AbstractFloat} + +Create a robust model with the model matrix (or formula) X and response vector (or dataframe) y, +using L2Loss with a Penalty on the coefficients. + + +# Arguments + +- `X`: the model matrix (it can be dense or sparse) or a formula +- `y`: the response vector or a table (dataframe, namedtuple, ...). +- `penalty`: a penalty function for the coefficients. + +# Keywords + +- `method::Symbol = :auto`: the method used to solve the linear system with penalty, + - Coordinate Gradient Descent: `:cgd` (default) + - Fast Iterative Shrinkage-Thresholding Algorithm: `:fista` + - Alternating Minimization Algorithm: `:ama` + - Alternating Direction Method of Multipliers: `:admm` + Use :auto to select the default method. +- `dofit::Bool = true`: if false, return the model object without fitting; +- `dropmissing::Bool = false`: if true, drop the rows with missing values (and convert to + Non-Missing type). With `dropmissing=true` the number of observations may be smaller than + the size of the input arrays; +- `wts::Vector = similar(y, 0)`: Prior probability weights of observations. 
+ Can be empty (length 0) if no weights are used (default); +- `offset::Vector = similar(y, 0)`: A vector of offsets to apply to the mean response. + Can be empty (length 0) if no offset is set (default); +- `fitdispersion::Bool = false`: reevaluate the dispersion; +- `contrasts::AbstractDict{Symbol,Any} = Dict{Symbol,Any}()`: a `Dict` mapping term names + (as `Symbol`s) to term types (e.g. `ContinuousTerm`) or contrasts (e.g., `HelmertCoding()`, + `SeqDiffCoding(; levels=["a", "b", "c"])`, etc.). If contrasts are not provided for a variable, + the appropriate term type will be guessed based on the data type from the data column: + any numeric data is assumed to be continuous, and any non-numeric data is assumed to be + categorical (with `DummyCoding()` as the default contrast type); +- `initial_scale::Union{Symbol, Real}=:mad`: the initial scale estimate, for non-convex estimator + it helps to find the global minimum. + Automatic computation using `:mad`, `:L1` or `:extrema` (non-robust). +- `σ0::Union{Nothing, Symbol, Real}=initial_scale`: alias of `initial_scale`; +- `initial_coef::AbstractVector=[]`: the initial coefficients estimate, for non-convex estimator + it helps to find the global minimum. +- `β0::AbstractVector=initial_coef`: alias of `initial_coef`; +- `penalty_omit_intercept::Bool=true`: if true, do not penalize the intercept, + otherwise use the penalty on all the coefficients; +- `fitargs...`: other keyword arguments used to control the convergence of the IRLS algorithm + (see [`pirls!`](@ref)). + +# Output + +the RobustLinearModel object. + +""" +function StatsAPI.fit( + ::Type{M}, + X::AbstractMatrix{T}, + y::AbstractVector{T}, + penalty::PenaltyFunction; + method::Symbol=:auto, # :cgd, :fista, :ama, :admm + dofit::Bool=true, + dropmissing::Bool=false, # placeholder + initial_scale::Union{Nothing,Symbol,Real}=:mad, + σ0::Union{Nothing,Symbol,Real}=initial_scale, + wts::FPVector=similar(y, 0), + offset::FPVector=similar(y, 0), + fitdispersion::Bool=false, + dropcollinear::Bool=false, + contrasts::AbstractDict{Symbol,Any}=Dict{Symbol,Any}(), # placeholder + __formula::Union{Nothing,FormulaTerm}=nothing, + use_backtracking::Bool=false, + A::AbstractMatrix{<:Real}=zeros(T, 0, 0), + b::AbstractVector{<:Real}=zeros(T, 0), + restart::Bool=true, + adapt::Bool=false, + outerloop::Bool=false, + penalty_omit_intercept::Bool=true, + fitargs..., +) where {M<:RobustLinearModel,T<:AbstractFloat} + + # Check that X and y have the same number of observations + n, p = size(X) + if n != size(y, 1) + throw(DimensionMismatch("number of rows in X and y must match")) + end + + # Response object + est = MEstimator{L2Loss}() + resp = RobustLinResp(est, y, offset, T[]) + + # Penalty + new_penalty = try + intercept_col = penalty_omit_intercept ? 
get_intercept_col(X, __formula) : nothing + concrete(penalty, p, intercept_col) + catch + error("penalty is not compatible with coefficients size $p: $(penalty)") + end + + # Predictor object + pen_methods = (:auto, :cgd, :fista, :ama, :admm) + if method ∉ pen_methods + @warn( + "Incorrect method `:$(method)` with a penalty, should be one of $(pen_methods)" + ) + method = :auto + end + + pred = if method === :fista + FISTARegPred(X, new_penalty, wts, restart, use_backtracking) + elseif method === :ama + AMARegPred(X, new_penalty, wts, A, b, restart) + elseif method === :admm + ADMMRegPred(X, new_penalty, wts, A, b, restart, adapt) + elseif method in (:cgd, :auto) + CGDRegPred(X, new_penalty, wts, outerloop) + else + error("with penalty, method :$method is not allowed, should be in: $pen_methods") + end + + # RobustLinearModel + m = RobustLinearModel(resp, pred, __formula, fitdispersion, false) + + # Update fields + fitargs = update_fields!(m; σ0=σ0, fitargs...) + + return dofit ? fit!(m; fitargs...) : m +end + + +function StatsAPI.fit!( + m::RobustLinearModel{T,R,P}; + correct_leverage::Bool=false, + updatescale::Bool=false, + maxiter::Integer=(P<:CGDRegPred) ? 10_000 : (P<:FISTARegPred) ? 1_000 : 100, + minstepfac::Real=1e-3, + atol::Real=1e-8, + rtol::Real=1e-7, + verbose::Bool=false, +) where {T,R,P<:AbstractRegularizedPred} + # Return early if model has the fit flag set + m.fitted && return m + + verbose && println("\nFit robust model with penalty: $(penalty(m))") + + # Initialize the response and predictors + init!(m; verbose=verbose) + + # convergence criteria + devold = dev_criteria(m) + dev = devold + Δdev = zero(T) + + ### Loop until convergence + cvg = false + for i in 1:maxiter + ## Update coefficients + setη!(m; verbose=verbose) + dev = dev_criteria(m) + Δdev = abs(dev - devold) + + ## Postupdate (only for ADMM) + updatepred!(m.pred, scale(m); verbose=verbose) + + # Test for convergence + verbose && println("Iteration: $i, Δdev: $(Δdev)") + tol = max(rtol * abs(devold), atol) + if Δdev < tol || dev < atol + cvg = true + break + end + @assert isfinite(dev) + devold = dev + end + cvg || throw(ConvergenceException(maxiter)) + m.fitted = true + return m +end + +function StatsAPI.fit!( + m::RobustLinearModel{T,R,P}; correct_leverage::Bool=false, kwargs... +) where {T,R,P<:RidgePred} + + # Return early if model has the fit flag set + m.fitted && return m + + # Compute the initial values + σ0, β0 = scale(m), coef(m) + + if correct_leverage + wts = m.resp.wts + copy!(wts, leverage_weights(m, :original)) + end + + # Get type + V = typeof(m.resp.est) + + _fit!(m, V; σ0=σ0, β0=β0, kwargs...) 
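+    # Only flag the model as fitted once the internal `_fit!` routine has returned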
+ + m.fitted = true + return m +end + +function dev_criteria(m::RobustLinearModel) + if !haspenalty(m) + # this criteria is not used with no penalty + return 0 + end + l = deviance(m.resp) + l += dev_criteria(m.pred) + return l +end + +function init!( + m::RobustLinearModel{T,R,P}; verbose::Bool=false +) where {T,R,P<:AbstractRegularizedPred} + + resp = m.resp + pred = m.pred + + # reset ∇β + copyto!(pred.delbeta, pred.beta0) + + # res = y - Xβ + mul!(resp.μ, pred.X, pred.beta0) + initresp!(resp) + + # Initialize the predictor + initpred!(pred, weights(m), scale(m); verbose=verbose) + + return m +end + +function setη!( + m::RobustLinearModel{T,R,P}, linesearch_f::Real=1; verbose::Bool=false +) where {T,R,P<:AbstractRegularizedPred} + # TODO: add line search + wts = weights(m) + + μ = m.resp.μ + σ2 = m.resp.σ^2 + y = m.resp.y + + if m.pred isa CGDRegPred + res = m.resp.wrkres + devresid = m.resp.devresid + + # Update coefs, μ and residuals + update_βμ!(m.pred, y, μ, res, σ2, wts; verbose=verbose) + + # Update deviance with the new residuals + map!(abs2, devresid, res) + else + # Update coefs + update_beta!(m.pred, y, wts, σ2; verbose=verbose) + + # Update linear predictor + mul!(μ, m.pred.X, m.pred.beta0) + + # Update residuals with the new μ + updateres!(m.resp; updatescale=false) + end + + return m +end diff --git a/test/data/Animals2.jl b/test/data/Animals2.jl index 0b09765..f27c59b 100644 --- a/test/data/Animals2.jl +++ b/test/data/Animals2.jl @@ -215,3 +215,4 @@ y = data.logBrain nt = (; logBrain=data.logBrain, logBody=data.logBody) data_tuples = ((form, data), (form, nt), (X, y), (sX, y)) +; diff --git a/test/data/starsCYG.jl b/test/data/starsCYG.jl new file mode 100644 index 0000000..1a31177 --- /dev/null +++ b/test/data/starsCYG.jl @@ -0,0 +1,115 @@ + +# Import data +#using RDatasets: dataset +#data = dataset("robustbase", "starsCYG") + +logTemp = [ + 4.37, + 4.56, + 4.26, + 4.56, + 4.30, + 4.46, + 3.84, + 4.57, + 4.26, + 4.37, + 3.49, + 4.43, + 4.48, + 4.01, + 4.29, + 4.42, + 4.23, + 4.42, + 4.23, + 3.49, + 4.29, + 4.29, + 4.42, + 4.49, + 4.38, + 4.42, + 4.29, + 4.38, + 4.22, + 3.48, + 4.38, + 4.56, + 4.45, + 3.49, + 4.23, + 4.62, + 4.53, + 4.45, + 4.53, + 4.43, + 4.38, + 4.45, + 4.50, + 4.45, + 4.55, + 4.45, + 4.42, +] + +logLight = [ + 5.23, + 5.74, + 4.93, + 5.74, + 5.19, + 5.46, + 4.65, + 5.27, + 5.57, + 5.12, + 5.73, + 5.45, + 5.42, + 4.05, + 4.26, + 4.58, + 3.94, + 4.18, + 4.18, + 5.89, + 4.38, + 4.22, + 4.42, + 4.85, + 5.02, + 4.66, + 4.66, + 4.90, + 4.39, + 6.05, + 4.42, + 5.10, + 5.22, + 6.29, + 4.34, + 5.62, + 5.10, + 5.22, + 5.18, + 5.57, + 4.62, + 5.06, + 5.34, + 5.34, + 5.54, + 4.98, + 4.50, +] + +data2 = DataFrame(; logTemp=logTemp, logLight=logLight) + +form2 = @formula(logLight ~ 1 + logTemp) +X2 = hcat(ones(size(data2, 1)), data2.logTemp) +sX2 = SparseMatrixCSC(X2) +y2 = data2.logLight +nt2 = (; logTemp=logTemp, logLight=logLight) + +data2_tuples = ((form2, data2), (form2, nt2), (X2, y2), (sX2, y2)) +; diff --git a/test/ipod.jl b/test/ipod.jl new file mode 100644 index 0000000..6470353 --- /dev/null +++ b/test/ipod.jl @@ -0,0 +1,275 @@ + +loss1 = RobustModels.L2Loss() +est1 = MEstimator(loss1) +est2 = MMEstimator(TukeyLoss) +pen1 = RobustModels.NoPenalty() +λ = 10_000.0 +pen2 = RobustModels.RangedPenalties( + [2:RobustModels.End()], [RobustModels.SquaredL2Penalty(λ)] +) + +m00 = fit(LinearModel, form, data) +m01 = fit(RobustLinearModel, form, data, est1; method=:auto, initial_scale=1) +m02 = fit(RobustLinearModel, form, data, est1; method=:auto, initial_scale=1, 
ridgeλ=λ) +m03 = fit(RobustLinearModel, form, data, est2) +σ = scale(m03) + +@testset "Θ-IPOD: L2Loss method, no penalty ($(pen))" for (pen, methods) in zip( + (nothing, pen1), (nopen_methods, pen_methods) +) + def_rtol = isnothing(pen) ? 1e-6 : 1e-4 + + @testset "solver method $(method)" for method in methods + rtol = def_rtol + if method === :fista + rtol = 1e-1 + end + + # Formula, dense and sparse entry + @testset "data type: $(typeof(A))" for (A, b) in data_tuples + name = "Θ-IPOD(L2Loss, $(pen); method=$(method)),\t" + name *= if A == form + "formula" + elseif A == X + "dense " + else + "sparse " + end + + m1 = fit(IPODRegression, A, b, loss1, pen; method=method, initial_scale=1) + @test_nowarn ipod(A, b, loss1, pen; method=method, initial_scale=1) + + @test all(iszero, outliers(m1)) + @test all(isfinite.(coef(m1))) + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println(" rlm : ", coef(m01)) + VERBOSE && println("ipod($(method)) : ", coef(m1)) + @test isapprox(coef(m1), coef(m01); rtol=rtol) + + # Test printing the model + @test_nowarn show(devnull, m1) + + # make sure that it is not a TableRegressionModel + @test !isa(m1, TableRegressionModel) + + # refit + @test_nowarn refit!(m1) + + # interface + @testset "method: $(f)" for f in interface_methods + if f == workingweights + # Not defined for IPODRegression + continue + end + + # make sure the methods for IPODRegression give the same results as RobustLinearModel + var0 = f(m01) + var1 = f(m1) + if f == hasformula + # m01 is defined from a formula + @test var1 == (A isa FormulaTerm) + else + @test isapprox(var0, var1; rtol=rtol) + end + end + end + end +end + + +@testset "Θ-IPOD: robust loss, no penalty ($(pen))" for (pen, methods) in zip( + (nothing, pen1), (nopen_methods, pen_methods) +) + @testset "solver method $(method)" for method in methods + @testset "loss $(loss_name)" for loss_name in ("Huber", "Hampel", "Tukey") + typeloss = getproperty(RobustModels, Symbol(loss_name * "Loss")) + l = typeloss() + est = MEstimator(l) + + # Formula, dense and sparse entry + @testset "data type: $(typeof(A))" for (A, b) in ((X, y), (sX, y)) + name = "Θ-IPOD($(l), $(pen); method=$(method)),\t" + name *= ( + (A isa FormulaTerm) ? "formula" : + (A isa SparseMatrixCSC) ? "sparse " : "dense " + ) + + m0 = fit(RobustLinearModel, A, b, est; method=:chol, initial_scale=σ) + m1 = fit(IPODRegression, A, b, l, pen; method=method, initial_scale=σ) + β1 = copy(coef(m1)) + γ1 = copy(outliers(m1)) + + @test any(!iszero, outliers(m1)) + @test all(isfinite.(coef(m1))) + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println(" rlm : ", coef(m0)) + VERBOSE && println("ipod($(method)) : ", coef(m1)) + @test coef(m0) ≈ coef(m1) rtol = 0.1 + @test stderror(m0) ≈ stderror(m1) rtol = 0.5 + + # Test printing the model + @test_nowarn show(devnull, m1) + + # make sure that it is not a TableRegressionModel + @test !isa(m1, TableRegressionModel) + + # refit + @test_nowarn refit!(m1) + @test coef(m1) ≈ β1 + @test outliers(m1) ≈ γ1 + end + end + end +end + + + +@testset "Θ-IPOD: L2Loss method, Ridge penalty" begin + def_rtol = 1e-5 + + kwargs = (; initial_scale=1, maxiter=10_000) + + @testset "solver method $(method)" for method in pen_methods + rtol = def_rtol + if method === :cgd + rtol = 1e-5 + elseif method === :fista + rtol = 1e-1 + else + rtol = 1e-3 + end + + # Formula, dense and sparse entry + @testset "data type: $(typeof(A))" for (A, b) in data_tuples + name = "Θ-IPOD(L2Loss, $(pen2); method=$(method)),\t" + name *= ( + (A isa FormulaTerm) ? 
"formula" : + (A isa SparseMatrixCSC) ? "sparse " : "dense " + ) + + m2 = ipod(A, b, loss1, pen2; method=method, kwargs...) + + @test all(iszero, outliers(m2)) + @test all(isfinite.(coef(m2))) + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println(" rlm : ", coef(m02)) + VERBOSE && println("ipod($(method)) : ", coef(m2)) + @test isapprox(coef(m2), coef(m02); rtol=rtol) + + # interface + @testset "method: $(f)" for f in interface_methods + if f == workingweights + # Not defined for IPODRegression + continue + end + + # make sure the methods for IPODRegression give the same results as RobustLinearModel + var0 = f(m02) + var1 = f(m2) + if f == hasformula + # m0 is defined from a formula + @test var1 == (A isa FormulaTerm) + elseif f in (dof, dof_residual, dispersion) + # Ridge dof is smaller than the unpenalized regression + # IPOD dof is the same as the unpenalized IPOD + @test var0 ≈ var1 rtol = 1e-1 + # @test var1 == f(m01) + elseif f in (stderror, vcov, leverage) + @test all(abs.(var1) .>= abs.(var0)) + elseif f in (leverage_weights,) + @test all(abs.(var1) .<= abs.(var0)) + elseif f in (confint, residuals, fitted, predict) + # Larger error + @test isapprox(var0, var1; rtol=1e-1) + elseif f in (projectionmatrix,) + continue + else + @test isapprox(var0, var1; rtol=rtol) + end + end + end + end +end + + +@testset "Θ-IPOD: L2Loss method, $(pen_name) penalty" for pen_name in penalties + typepenalty = getproperty(RobustModels, Symbol(pen_name * "Penalty")) + pen = typepenalty(λ) + sep_pen1 = RobustModels.RangedPenalties([1:RobustModels.End()], [pen]) + sep_pen2 = RobustModels.RangedPenalties([2:RobustModels.End()], [pen]) + kwargs = (; initial_scale=1, maxiter=10_000) + + m0 = fit(RobustLinearModel, X, y, pen; method=:auto, kwargs...) + + # Formula, dense and sparse entry + @testset "Omit intercept, data type: $(typeof(A))" for (A, b) in data_tuples + name = "Θ-IPOD(L2Loss, $(pen); method=:auto),\t" + name *= ( + (A isa FormulaTerm) ? "formula" : + (A isa SparseMatrixCSC) ? "sparse " : "dense " + ) + + m1 = fit(IPODRegression, A, b, loss1, pen; kwargs...) + m2 = fit(IPODRegression, A, b, loss1, sep_pen2; kwargs...) + m3 = fit(IPODRegression, A, b, loss1, sep_pen1; kwargs...) + m4 = fit(IPODRegression, A, b, loss1, pen; penalty_omit_intercept=false, kwargs...) + m5 = fit( + IPODRegression, A, b, loss1, sep_pen1; penalty_omit_intercept=false, kwargs... + ) + + @test all(isfinite.(coef(m1))) + @test all(isfinite.(coef(m2))) + @test all(isfinite.(coef(m3))) + @test all(isfinite.(coef(m4))) + @test all(isfinite.(coef(m5))) + @test all(iszero, outliers(m1)) + @test all(iszero, outliers(m2)) + @test all(iszero, outliers(m3)) + @test all(iszero, outliers(m4)) + @test all(iszero, outliers(m5)) + + @test penalty(m0) == penalty(m1) + @test penalty(m0) == penalty(m2) + @test penalty(m0) == penalty(m3) + @test penalty(m4) != penalty(m5) + @test penalty(m0) != penalty(m4) + + @test coef(m0) ≈ coef(m1) + @test coef(m0) ≈ coef(m2) + @test coef(m0) ≈ coef(m3) + @test coef(m4) ≈ coef(m5) + @test coef(m4)[2] ≈ 0 atol = 0.1 + @test coef(m5)[2] ≈ 0 atol = 0.1 + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println("ipod($(pen), $(method)) : ", coef(m1)) + end + + @testset "solver method $(method)" for method in pen_methods + m0 = fit(RobustLinearModel, X, y, pen; method=method, kwargs...) + + # Formula, dense and sparse entry + @testset "data type: $(typeof(A))" for (A, b) in data_tuples + name = "Θ-IPOD(L2Loss, $(pen); method=$(method)),\t" + name *= ( + (A isa FormulaTerm) ? 
"formula" : + (A isa SparseMatrixCSC) ? "sparse " : "dense " + ) + + m1 = fit(IPODRegression, A, b, loss1, pen; method=method, kwargs...) + + @test all(isfinite.(coef(m1))) + @test all(iszero, outliers(m1)) + + @test penalty(m0) == penalty(m1) + @test coef(m0) ≈ coef(m1) + @test coef(m1)[2] ≈ 0 atol = 0.1 + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println("ipod($(pen), $(method)) : ", coef(m1)) + end + end +end diff --git a/test/linearfit.jl b/test/linearfit.jl index c0c06d6..9f2d390 100644 --- a/test/linearfit.jl +++ b/test/linearfit.jl @@ -1,5 +1,4 @@ -using Random: MersenneTwister m1 = fit(LinearModel, form, data) diff --git a/test/penalties.jl b/test/penalties.jl new file mode 100644 index 0000000..0199cc1 --- /dev/null +++ b/test/penalties.jl @@ -0,0 +1,212 @@ + +using RobustModels: + cost, + proximal, + proximal!, + isconcrete, + concrete, + isconvex, + isseparable, + RangedPenalties, + End, + nothing # stopper + +λ = 10_000.0 + +seed = 1234 +rng = MersenneTwister(seed) + +t = 10 * randn(rng, 30) + +@testset "Penalties" begin + + @testset "Methods penalty functions: $(name)" for name in ("No", penalties...) + tt = copy(t) + + typepenalty = getproperty(RobustModels, Symbol(name * "Penalty")) + + pen = typepenalty(λ) + penpos = typepenalty(λ; nonnegative=true) + + @test_nowarn println(pen) + + # Cost + P0 = cost(pen, zeros(Float64, 4)) + P1 = cost(pen, ones(Float64, 4)) + @test P1 >= P0 + + # Proximal operator + copy!(tt, t) + @test_nowarn proximal!(pen, tt, t, 1.0) + @test tt ≈ proximal(pen, t, 1.0) + + # Shrinkage + @test all(@.(abs(tt) <= abs(t))) + + # Single index update + copy!(tt, t) + proximal!(pen, tt, 1, t, 1.0) + @test abs(tt[1]) <= abs(t[1]) + @test all(tt[2:end] .== t[2:end]) + + # Nonnegative + if name != "No" + tt = proximal(penpos, t, 1.0) + # Get only non-negative values + @test all(>=(0), tt) + + # Negative values are set to 0 + nonpos_indices = findall(<=(0), t) + @test all(iszero, tt[nonpos_indices]) + end + + # Concrete + new_pen = concrete(pen, length(t)) + @test new_pen == pen + + @testset "RangedPenalties" begin + n = length(t) + seppen = RangedPenalties([2:End()], [pen]) + concpen = RangedPenalties([2:n], [pen], n) + seppen2 = RangedPenalties([[1], 2:End()], [EuclideanPenalty(λ), pen]) + + # Not concrete + @test !isconcrete(seppen) + @test isconcrete(concpen) + + copy!(tt, t) + @test_throws Exception cost(seppen, t) + @test_throws Exception proximal(seppen, t, 1.0) + @test_throws Exception proximal!(seppen, tt, t, 1.0) + @test_throws Exception proximal!(seppen, tt, 2, t, 1.0) + + # Concrete + new_pen = concrete(seppen, n) + @test new_pen == concpen + @test isconcrete(new_pen) + + # Proximal operator + copy!(tt, t) + @test_nowarn proximal!(new_pen, tt, t, 1.0) + @test tt[1] == t[1] + @test all(abs.(tt[2:end]) .<= abs.(t[2:end])) + + @test_nowarn proximal!(new_pen, tt, 2, t, 1.0) + @test_throws Exception proximal!(new_pen, tt, 0, t, 1.0) + @test_throws Exception proximal!(new_pen, tt, length(t) + 1, t, 1.0) + + # Convex + @test isconvex(seppen) == isconvex(pen) + @test isconvex(concpen) == isconvex(pen) + + @test_throws Exception isseparable(seppen) + @test isseparable(concpen) == isseparable(pen) + @test_throws Exception isseparable(seppen2) + + # Split ranges + @test pen == concrete(pen, n, nothing) + @test concpen == concrete(pen, n, 1) + for j in (n, 2) + new_pen = concrete(pen, n, j) + @test length(new_pen.ranges) == 1 + @test length(new_pen.penalties) == 1 + @test Set(only(new_pen.ranges)) == Set(filter(!=(j), 1:n)) + @test 
only(new_pen.penalties) == pen + end + end + end + + + @testset "M-estimator with penalty: Ridge" begin + def_rtol = 1e-4 + pen = SquaredL2Penalty(λ) + + kwargs = (; initial_scale=1) + m0 = rlm(form2, data2, MEstimator{L2Loss}(); kwargs...) + m1 = rlm(form2, data2, MEstimator{L2Loss}(); ridgeλ=λ, kwargs...) + + @testset "solver method $(method)" for method in pen_methods + rtol = def_rtol + if method === :fista + rtol = 1e-2 + end + + # Formula, dense and sparse entry + @testset "data type: $(typeof(A))" for (A, b) in data2_tuples + name = "rlm($(pen); method=$(method)),\t" + name *= ( + (A isa FormulaTerm) ? "formula" : + (A isa SparseMatrixCSC) ? "sparse " : "dense " + ) + + m2 = rlm(A, b, pen; method=method, kwargs...) + + @test all(isfinite.(coef(m2))) + + VERBOSE && println("\n\t\u25CF $(name)") + VERBOSE && println("rlm ridge : ", coef(m1)) + VERBOSE && println("rlm($method) $(pen) : ", coef(m2)) + @test isapprox(coef(m2), coef(m1); rtol=rtol) + + # interface + @testset "method: $(f)" for f in interface_methods + # make sure the methods for IPODRegression give the same results + # as RobustLinearModel + var1 = f(m1) + var2 = f(m2) + if f == hasformula + # m1 is defined from a formula + @test var2 == (A isa FormulaTerm) + elseif f in (dof, dof_residual, dispersion) + # Ridge dof is smaller than the unpenalized regression + # rlm with penalty dof is the same as the unpenalized rlm + @test isapprox(var1, var2; rtol=1e-2) + # @test var2 == f(m0) + elseif f in (stderror, vcov, leverage) + @test all(abs.(var2) .>= abs.(var1)) + elseif f in (leverage_weights,) + @test all(abs.(var2) .<= abs.(var1)) + # elseif f in (confint,) + # @test isapprox(var1, var2; rtol=1e-1) + elseif f in (confint, projectionmatrix) + continue + else + @test isapprox(var1, var2; rtol=rtol) + end + end + end + end + end + + + @testset "M-estimator with penalty $(name)" for name in penalties + typepenalty = getproperty(RobustModels, Symbol(name * "Penalty")) + + pen = typepenalty(λ) + penpos = typepenalty(λ; nonnegative=true) + kwargs = (; initial_scale=1, maxiter=1000) + def_rtol = 1e-3 + + m1 = fit(RobustLinearModel, form2, data2, pen; verbose=false, kwargs...) + @test all(isfinite.(coef(m1))) + + # make sure that it is not a TableRegressionModel + @test !isa(m1, TableRegressionModel) + + @testset "method: $(method)" for method in pen_methods + m2 = fit(RobustLinearModel, form2, data2, pen; method=method, kwargs...) + rtol = method == :fista ? 1e-1 : def_rtol + @test isapprox(coef(m1), coef(m2); rtol=rtol) + end + + m3 = fit(RobustLinearModel, form2, data2, penpos; verbose=false, kwargs...) + j = hasintercept(m1) ? 2 : 1 + @test all(>=(0), coef(m3)[j:end]) + + # refit + β1 = copy(coef(m1)) + refit!(m1) + @test all(coef(m1) .== β1) + end + +end diff --git a/test/robustridge.jl b/test/robustridge.jl index aa6496d..a510ddd 100644 --- a/test/robustridge.jl +++ b/test/robustridge.jl @@ -1,4 +1,3 @@ -using Random: MersenneTwister using LinearAlgebra: inv, Hermitian, I, tr, diag seed = 123987 diff --git a/test/runtests.jl b/test/runtests.jl index 1c2d883..3a437df 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,7 +8,7 @@ using GLM using SparseArrays using DataFrames using Test -using Random: MersenneTwister +using Random: AbstractRNG, MersenneTwister using RobustModels @@ -52,8 +52,14 @@ other_losses = ("Cauchy",) bounded_losses = ("Geman", "Welsch", "Tukey", "YohaiZamar", "HardThreshold", "Hampel") losses = (convex_losses..., other_losses..., bounded_losses...) 
+## Penalties +penalties = ( + "SquaredL2", "Euclidean", "L1", "ElasticNet", "Berhu", "CappedL1", "SCAD", "MCP" +) + ## Solving methods nopen_methods = (:auto, :chol, :cholesky, :qr, :cg) +pen_methods = (:auto, :cgd, :fista, :ama, :admm) ## Interface methods interface_methods = ( @@ -88,6 +94,7 @@ interface_methods = ( # Import data include("data/Animals2.jl") +include("data/starsCYG.jl") # Include tests include("estimators.jl") @@ -98,3 +105,6 @@ include("robustridge.jl") include("qreg.jl") include("weights.jl") include("univariate.jl") +include("penalties.jl") +include("ipod.jl") +include("underdetermined.jl") diff --git a/test/underdetermined.jl b/test/underdetermined.jl new file mode 100644 index 0000000..b2e4f47 --- /dev/null +++ b/test/underdetermined.jl @@ -0,0 +1,105 @@ + + +seed = 1234 +rng = MersenneTwister(seed) + +function gendata( + rng::AbstractRNG, + ::Type{T}, + N::Integer, + p::Integer, + nonzero::Integer, + rho::Real, + snr::Real = 3, + alternate::Bool=true, +) where {T<:AbstractFloat} + if nonzero > 0 + beta = nonzero:-1:1 + if alternate + beta = [(nonzero - i + 1) * (i % 2 == 0 ? 1 : -1) for i in 1:nonzero] + else + beta = nonzero:-1:1 + end + beta = vcat(beta, zeros(T, p-nonzero)) + else + beta = zeros(T, p) + end + + X = randn(rng, N, p) + if rho > 0 + z = randn(N) + X .+= sqrt(rho/(1-rho)) * z + X *= sqrt(1-rho) + end + + ssd = sqrt((1-rho)*sum(abs2, beta) + rho*sum(abs, beta)) + nsd = ssd / snr + + y = X * beta + nsd * randn(N) + return X, y, beta +end + + +X3, y3, β03 = gendata(rng, Float64, 100, 1000, 30, 0.3) + +loss1 = L2Loss() +est1 = MEstimator{L2Loss}() +λ = 1.0 +pen1 = L1Penalty(λ) +pen2 = SquaredL2Penalty(λ) + +@testset "Rank deficient X: penalty ($(pen))" for (pen, methods) in zip( + (nothing, pen1, pen2), (nopen_methods, pen_methods, pen_methods) +) + def_rtol = isnothing(pen) ? 1e-6 : 1e-4 + + @testset "solver method $(method)" for method in methods + rtol = def_rtol + if method === :fista + rtol = 1e-1 + end + + ### RobustLinearModel + name = "rlm(X3, y3, L2Loss, $(pen); method=$(method))" + VERBOSE && println("\n\t\u25CF $(name)") + + if isnothing(pen) + if method in (:chol, :cholesky, :auto) + @test_throws Exception rlm(X3, y3, est1; method=method, initial_scale=1, dropcollinear=false) + end + m1 = rlm(X3, y3, est1; method=method, initial_scale=1, dropcollinear=true) + else + m1 = rlm(X3, y3, pen; method=method, initial_scale=1, maxiter=1000, dropcollinear=true) + end + + # Test printing the model + @test_nowarn show(devnull, m1) + VERBOSE && println(m1) + + # interface + @testset "method: $(f)" for f in interface_methods + @test_nowarn f(m1) + end + + ### IPODRegression + name = "ipod(X3, y3, L2Loss, $(pen); method=$(method))" + VERBOSE && println("\n\t\u25CF $(name)") + + if isnothing(pen) && method in (:chol, :cholesky, :auto) + @test_throws Exception ipod(X3, y3, loss1, pen; method=method, initial_scale=1, dropcollinear=false) + end + m2 = ipod(X3, y3, loss1, pen; method=method, initial_scale=1, maxiter=1000, dropcollinear=true) + + # Test printing the model + @test_nowarn show(devnull, m2) + + # interface + @testset "method: $(f)" for f in interface_methods + if f == workingweights + # Not defined for IPODRegression + continue + end + @test_nowarn f(m2) + end + end +end