diff --git a/slides/regularization/slides-regu-ridge-deepdive.tex b/slides/regularization/slides-regu-ridge-deepdive.tex
index b4dba618..bca0c2e4 100644
--- a/slides/regularization/slides-regu-ridge-deepdive.tex
+++ b/slides/regularization/slides-regu-ridge-deepdive.tex
@@ -20,28 +20,37 @@
 \begin{vbframe}{Perspectives on $L2$ regularization}
-We already saw that $L2$ regularization is equivalent to a constrained optimization problem:
-\begin{eqnarray*}
+We already saw two interpretations of $L2$ regularization.
+\begin{itemize}
+  \item We know that it is equivalent to a constrained optimization problem:
+  \begin{eqnarray*}
 \thetah_{\text{Ridge}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2 = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv\\
 %&=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \thetab^\top \thetab \\
 &=& \argmin_{\thetab} \sumin \left(\yi - \fxit\right)^2 \, \text{s.t. } \|\thetab\|_2^2 \leq t
 \end{eqnarray*}
-We can also recover the Ridge estimator by performing least-squares on a \textbf{row-augmented} data set: Let \scriptsize{$\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix}
+  \item Bayesian interpretation of Ridge regression: For normal likelihood contributions $\mathcal{N}(\thetab^{\top}\xi,\sigma^2)$ and i.i.d. normal priors $\theta_j \sim \mathcal{N}(0,\tau^{2})$, the resulting MAP estimate is $\thetah_{\text{Ridge}}$ with $\lambda=\frac{\sigma^2}{\tau^2}$:
+  $$\thetah_{\text{MAP}}=\argmax_{\thetab} \log[p(\yv|\Xmat,\thetab)p(\thetab)] = \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \frac{\sigma^2}{\tau^2} \|\thetab\|_2^2$$
+\end{itemize}
+
+\end{vbframe}
+
+\begin{vbframe}{Another perspective on $L2$}
+We can also recover the Ridge estimator by performing least squares on a \textbf{row-augmented} data set: Let $\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix}
 \yv \\
 \bm{0}_{p}
-\end{pmatrix}$.} \normalsize{Using the augmented data, the least-squares objective becomes}
-\small{
-$$%\argmin_{\thetab}
-\sum_{i=1}^{n+p} \left(\tilde{\yi} - \thetab^T \tilde{\xi} \right)^2 = %\argmin_{\thetab}
-\sum_{i=1}^{n} \left(\yi - \thetab^T \xi \right)^2 + \sum_{j=1}^{p} \left(0 - \sqrt{\lambda} \theta_j \right)^2 %= \thetah_{\text{Ridge}}
-=\sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2
-$$
-}
-\normalsize{$\Longrightarrow$ $\thetah_{\text{Ridge}}$ is the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$!}
+\end{pmatrix}$. Using the augmented data, the unregularized least-squares solution $\tilde{\thetab}$ can be written as
+\begin{eqnarray*}
+\tilde{\thetab} &=& \argmin_{\thetab}
+\sum_{i=1}^{n+p} \left(\tilde{\yi} - \thetab^T \tilde{\xi} \right)^2 \\ &=& \argmin_{\thetab}
+\sum_{i=1}^{n} \left(\yi - \thetab^T \xi \right)^2 + \sum_{j=1}^{p} \left(0 - \sqrt{\lambda} \theta_j \right)^2 \\ %= \thetah_{\text{Ridge}}
+&=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2
+\end{eqnarray*}
+
+$\Longrightarrow$ $\thetah_{\text{Ridge}}$ is simply the least-squares solution $\tilde{\thetab}$ computed on $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$!
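The row-augmentation claim is easy to sanity-check numerically. Below is a minimal NumPy sketch (illustrative only, not part of the slide sources); the design matrix, response, and the value of lambda are made-up examples.

import numpy as np

rng = np.random.default_rng(0)
n, p, lam = 50, 3, 2.0
X = rng.normal(size=(n, p))                                   # made-up design matrix
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.3, size=n)

# Ridge closed form: (X^T X + lam * I)^{-1} X^T y
theta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# Ordinary least squares on the row-augmented data (X_tilde, y_tilde)
X_tilde = np.vstack([X, np.sqrt(lam) * np.eye(p)])            # append sqrt(lam) * I rows
y_tilde = np.concatenate([y, np.zeros(p)])                    # append p zeros to the response
theta_tilde, *_ = np.linalg.lstsq(X_tilde, y_tilde, rcond=None)

print(np.allclose(theta_ridge, theta_tilde))                  # True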
 %$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$
 \end{vbframe}
 
-\begin{vbframe}{Another perspective on $L2$}
+\begin{vbframe}{Yet another perspective on $L2$}
 Now consider perturbed features $ \tilde{\xi}:= \xi + \bm{\delta}_i$ where $\bm{\delta}_i \overset{iid}{\sim} (\bm{0},\frac{\lambda}{n} \id_p)$.
 Note that no parametric family is assumed.
 We want to minimize the expected squared error taken w.r.t. the random perturbations:
 $$\riskt:= \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\tilde{\bm{x}})^2] = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}(\bm{x}+\bm{\delta}))^2]$$
 Expanding, we obtain
@@ -50,7 +59,7 @@
 By linearity of expectation, $\mathbb{E}_{\bm{\delta}}[\bm{\delta}]=\bm{0}$ and $\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]=\frac{\lambda}{n} \id_p$, we get
 $$\riskt=(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\mathbb{E}[\bm{\delta}](y-\thetab^{\top}\bm{x}) + \thetab^{\top}\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]\thetab = (y-\thetab^{\top}\bm{x})^2+\frac{\lambda}{n} \Vert \thetab \Vert_2^2$$
-Summed over $n$ samples this is exactly to Ridge regression with regularization strength $\lambda$.\\
+Summing over $n$ samples exactly yields Ridge regression with hyperparameter $\lambda$.\\
 $\Longrightarrow$ Minimizing squared loss over noise distribution is Ridge regression on unperturbed features $\xi$!
 \end{vbframe}
 
diff --git a/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex b/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex
index fdd75fad..f2189d2a 100644
--- a/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex
+++ b/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex
@@ -21,29 +21,33 @@
 \begin{vbframe}{Soft-thresholding and L1 regularization}
 In the lecture, we wanted to solve
 \[
- \min_{\thetab} \mathcal{\tilde R}_{\text{reg}}(\thetab) = \min_{\thetab}\mathcal{R}_{\text{emp}}(\thetah) + \sum_k \left[ \frac{1}{2} H_{k,k} (\theta_k - \hat{\theta}_k)^2 \right] + \sum_k \lambda |\theta_k|.
- \]
-This is a convex problem (since it is the sum of convex functions) for which, in general, no analytical solution exists. \\
+ \min_{\thetab} \mathcal{\tilde R}_{\text{reg}}(\thetab) = \min_{\thetab}\mathcal{R}_{\text{emp}}(\thetah) + \sum_j \left[ \frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 \right] + \sum_j \lambda |\theta_j|.
+ \]
+Note that we can separate the dimensions, i.e.,
+
+\[\mathcal{\tilde R}_{\text{reg}}(\thetab) = \sum_j g_j(\theta_j) \text{ with } g_j(\theta_j) = \frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 + \lambda |\theta_j|.\]
+
+Hence, we can minimize each $g_j$ separately to find the global minimum. \\
 \lz
-For convex functions, every stationary point is a minimum. \\
+Each $g_j$ is convex since it is a sum of convex functions. For convex functions, every stationary point is a minimum. \\
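The separability argument above can also be checked numerically: minimizing each $g_j$ on its own gives the same point as minimizing the summed objective jointly. A small sketch (illustrative only, not part of the slide sources; H_diag, theta_hat, and lam are made-up values with $p = 2$):

import numpy as np

H_diag = np.array([2.0, 0.5])        # made-up diagonal entries H_{j,j} > 0
theta_hat = np.array([1.0, -0.3])    # made-up unregularized minimizers theta_hat_j
lam = 0.4

def g(j, t):
    # g_j(theta_j) = 1/2 * H_{j,j} * (theta_j - theta_hat_j)^2 + lam * |theta_j|
    return 0.5 * H_diag[j] * (t - theta_hat[j]) ** 2 + lam * np.abs(t)

grid = np.linspace(-2.0, 2.0, 1001)

# minimize each g_j separately
theta_sep = np.array([grid[np.argmin(g(j, grid))] for j in range(2)])

# minimize g_0 + g_1 jointly over the product grid
T0, T1 = np.meshgrid(grid, grid, indexing="ij")
joint = g(0, T0) + g(1, T1)
i0, i1 = np.unravel_index(np.argmin(joint), joint.shape)
theta_joint = np.array([grid[i0], grid[i1]])

print(theta_sep, theta_joint)        # both [0.8, 0.0] (up to grid resolution)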
 \lz
- Hence, we will analyze the coordinate-wise derivative $\frac{\partial}{\partial \thetab_j} \mathcal{\tilde R}_{\text{reg}}.$ \\
- (Note: This derivative is not defined for $\thetab_j = 0)$\\
+
 \framebreak
 
-First, we will focus on the everywhere differentiable part:
-\begin{align*}
-\frac{\partial}{\partial \thetab_j}\sum_k \left[\frac{1}{2} H_{k,k} (\theta_k - \hat{\theta}_k)^2 \right]
- &= H_{j,j} (\theta_j - \hat{\theta}_j) \\
- &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j \\
-\end{align*}
-Now, we analyze the stationary points $\hat{\theta}_{\text{Lasso},j}$ of $\riskrt.$ \\
-First, we consider the cases $\hat{\theta}_{\text{Lasso},j} > 0, \hat{\theta}_{\text{Lasso},j} < 0.$ \\
-(Here $\frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}}$ exists) \\
+Thus, we analyze the stationary points $\hat{\theta}_{\text{Lasso},j}$ of $g_j.$ \\
+\lz
+For this, we assume we already know the sign of the minimizer and then derive conditions under which our assumption holds. \\
+\lz
+So, first we consider the cases $\hat{\theta}_{\text{Lasso},j} > 0, \hat{\theta}_{\text{Lasso},j} < 0.$\\
 \lz
+NB:
+\begin{itemize}
+  \item For $\theta_j > 0: \frac{d}{d\theta_j}\vert \theta_j\vert = \frac{d}{d\theta_j}\theta_j = 1$ and
+  \item For $\theta_j < 0: \frac{d}{d\theta_j}\vert \theta_j\vert = \frac{d}{d\theta_j}\left(-\theta_j\right) = -1$.
+\end{itemize}
 
 \framebreak
 
@@ -55,7 +59,7 @@
 \hfill
 \begin{minipage}{0.49\textwidth}
 \begin{align*}
- \frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j + \lambda \overset{!}{=} 0 \\
+ \frac{d}{d \theta_j}g_j(\theta_j) &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j + \lambda \overset{!}{=} 0 \\
 &\Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j -\frac{\lambda}{H_{j,j}} > 0 \\
 &\iff \hat{\theta}_j > \frac{\lambda}{H_{j,j}}
@@ -71,7 +75,7 @@
 \hfill
 \begin{minipage}{0.49\textwidth}
 \begin{align*}
- \frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j - \lambda \overset{!}{=} 0 \\
+ \frac{d}{d \theta_j}g_j(\theta_j) &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j - \lambda \overset{!}{=} 0 \\
 &\Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j +\frac{\lambda}{H_{j,j}} < 0 \\
 &\iff \hat{\theta}_j < -\frac{\lambda}{H_{j,j}}
@@ -85,8 +89,9 @@
 \end{minipage}
 \hfill
 \begin{minipage}{0.49\textwidth}
-$\Rightarrow$ If $\hat{\theta}_j \in [-\frac{\lambda}{H_{j,j}}, \frac{\lambda}{H_{j,j}}]$ then $\mathcal{\tilde R}_{\text{reg}}$ has no stationary point with $$\hat{\theta}_{\text{Lasso},j} < 0 \text{ or } \hat{\theta}_{\text{Lasso},j} > 0.$$ \\
-However, there must be at least one stationary point since $\mathcal{\tilde R}_{\text{reg}}$ is a regularized convex risk.
+$\Rightarrow$ If $\hat{\theta}_j \in [-\frac{\lambda}{H_{j,j}}, \frac{\lambda}{H_{j,j}}]$ then $g_j$ has no stationary point with $$\hat{\theta}_{\text{Lasso},j} < 0 \text{ or } \hat{\theta}_{\text{Lasso},j} > 0.$$ \\
+However, at least one stationary point must exist since $g_j$ is a regularized convex function with $\lambda > 0.$\\
+$\Rightarrow$ An equivalent constraint $\vert\theta_j\vert \leq t$ with $t \in\R_+$ must exist.
 \end{minipage}
 \\
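The case analysis can be summarized and sanity-checked numerically: a grid search over $g_j$ recovers $\hat{\theta}_j - \lambda/H_{j,j}$ or $\hat{\theta}_j + \lambda/H_{j,j}$ in the two sign cases, and $0$ in the remaining interval (the standard soft-thresholding result the slides build towards). A small sketch (illustrative only, not part of the slide sources; H_jj, lam, and the three example values of theta_hat_j are made up):

import numpy as np

H_jj, lam = 1.5, 0.6                             # made-up values; lam / H_jj = 0.4
grid = np.linspace(-3.0, 3.0, 600001)

def argmin_g(theta_hat_j):
    # grid minimizer of g_j(t) = 1/2 * H_jj * (t - theta_hat_j)^2 + lam * |t|
    vals = 0.5 * H_jj * (grid - theta_hat_j) ** 2 + lam * np.abs(grid)
    return grid[np.argmin(vals)]

for theta_hat_j in (1.2, -0.9, 0.25):            # one example per case
    if theta_hat_j > lam / H_jj:                 # case theta_Lasso_j > 0
        candidate = theta_hat_j - lam / H_jj
    elif theta_hat_j < -lam / H_jj:              # case theta_Lasso_j < 0
        candidate = theta_hat_j + lam / H_jj
    else:                                        # no sign-definite stationary point: minimizer is 0
        candidate = 0.0
    print(theta_hat_j, round(argmin_g(theta_hat_j), 4), candidate)   # grid minimizer matches candidate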