
Commit

Updates from Overleaf
ludwigbothmann committed Dec 5, 2023
1 parent dfb9a21 commit fba6e07
Showing 2 changed files with 47 additions and 33 deletions.
37 changes: 23 additions & 14 deletions slides/regularization/slides-regu-ridge-deepdive.tex
@@ -20,28 +20,37 @@


\begin{vbframe}{Perspectives on $L2$ regularization}
We already saw that $L2$ regularization is equivalent to a constrained optimization problem:
\begin{eqnarray*}
We already saw two interpretations of $L2$ regularization.
\begin{itemize}
\item We know that it is equivalent to a constrained optimization problem:
\begin{eqnarray*}
\thetah_{\text{Ridge}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2 = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv\\
%&=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \thetab^\top \thetab \\
&=& \argmin_{\thetab} \sumin \left(\yi - \fxit\right)^2 \,
\text{s.t. } \|\thetab\|_2^2 \leq t
\end{eqnarray*}
We can also recover the Ridge estimator by performing least-squares on a \textbf{row-augmented} data set: Let \scriptsize{$\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix}
\item Bayesian interpretation of Ridge regression: For normal likelihood contributions $\mathcal{N}(\thetab^{\top}\xi,\sigma^2)$ and i.i.d. normal priors $\theta_j \sim \mathcal{N}(0,\tau^{2})$, the resulting MAP estimate is $\thetah_{\text{Ridge}}$ with $\lambda=\frac{\sigma^2}{\tau^2}$:
$$\thetah_{\text{MAP}}=\argmax_{\thetab} \log[p(\yv|\Xmat,\thetab)p(\thetab)] = \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \frac{\sigma^2}{\tau^2} \|\thetab\|_2^2$$
\end{itemize}

\end{vbframe}
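A minimal numpy/scipy sketch of the equivalence above, on toy data (the data, seed, and variable names are illustrative assumptions, not part of the slides): the closed form $({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$ should agree with a direct numerical minimization of the penalized least-squares objective.

import numpy as np
from scipy.optimize import minimize

rng = np.random.default_rng(0)
n, p, lam = 50, 3, 2.0
X = rng.normal(size=(n, p))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=n)

# Closed form: (X^T X + lambda*I)^{-1} X^T y
theta_closed = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# Direct minimization of sum_i (y_i - theta^T x_i)^2 + lambda * ||theta||_2^2
objective = lambda th: np.sum((y - X @ th) ** 2) + lam * np.sum(th ** 2)
theta_numeric = minimize(objective, np.zeros(p)).x

print(np.allclose(theta_closed, theta_numeric, atol=1e-4))  # True, up to optimizer tolerance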

\begin{vbframe}{Another perspective on $L2$}
We can also recover the Ridge estimator by performing least-squares on a \textbf{row-augmented} data set: Let $\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix}
\yv \\ \bm{0}_{p}
\end{pmatrix}$.} \normalsize{Using the augmented data, the least-squares objective becomes}
\small{
$$%\argmin_{\thetab}
\sum_{i=1}^{n+p} \left(\tilde{\yi} - \thetab^T \tilde{\xi} \right)^2 = %\argmin_{\thetab}
\sum_{i=1}^{n} \left(\yi - \thetab^T \xi \right)^2 + \sum_{j=1}^{p} \left(0 - \sqrt{\lambda} \theta_j \right)^2 %= \thetah_{\text{Ridge}}
=\sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2
$$
}
\normalsize{$\Longrightarrow$ $\thetah_{\text{Ridge}}$ is the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$!}
\end{pmatrix}$. Using the augmented data, the unregularized least-squares solution $\tilde{\thetab}$ can be written as
\begin{eqnarray*}
\tilde{\thetab} &=& \argmin_{\thetab}
\sum_{i=1}^{n+p} \left(\tilde{\yi} - \thetab^T \tilde{\xi} \right)^2 \\ &=& \argmin_{\thetab}
\sum_{i=1}^{n} \left(\yi - \thetab^T \xi \right)^2 + \sum_{j=1}^{p} \left(0 - \sqrt{\lambda} \theta_j \right)^2 \\ %= \thetah_{\text{Ridge}}
&=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2
\end{eqnarray*}

$\Longrightarrow$ $\thetah_{\text{Ridge}}$ is the least-squares solution $\tilde{\thetab}$ computed on $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$!
%$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$
\end{vbframe}
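The row-augmentation argument can be checked directly; a minimal numpy sketch on toy data (data and variable names are illustrative assumptions): ordinary least squares on $(\tilde{\Xmat}, \tilde{\yv})$ should reproduce the Ridge closed form.

import numpy as np

rng = np.random.default_rng(1)
n, p, lam = 50, 3, 2.0
X = rng.normal(size=(n, p))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=n)

# Ridge closed form
theta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# Ordinary least squares on the row-augmented data (X~, y~)
X_aug = np.vstack([X, np.sqrt(lam) * np.eye(p)])
y_aug = np.concatenate([y, np.zeros(p)])
theta_aug, *_ = np.linalg.lstsq(X_aug, y_aug, rcond=None)

print(np.allclose(theta_ridge, theta_aug))  # True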

\begin{vbframe}{Another perspective on $L2$}
\begin{vbframe}{Yet another perspective on $L2$}
Now consider perturbed features $ \tilde{\xi}:= \xi + \bm{\delta}_i$ where $\bm{\delta}_i \overset{iid}{\sim} (\bm{0},\frac{\lambda}{n} \id_p)$. Note that no parametric family is assumed. We want to minimize the expected squared error taken w.r.t. the random perturbations:
$$\riskt:= \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\tilde{\bm{x}})^2] = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}(\bm{x}+\bm{\delta}))^2]$$
Expanding, we obtain
@@ -50,7 +59,7 @@
Using linearity of expectation together with $\mathbb{E}_{\bm{\delta}}[\bm{\delta}]=\bm{0}$ and $\mathbb{E}_{\bm{\delta}}[\bm{\delta}\bm{\delta}^{\top}]=\frac{\lambda}{n} \id_p$, we get
$$\riskt=(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\mathbb{E}[\bm{\delta}](y-\thetab^{\top}\bm{x}) + \thetab^{\top}\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]\thetab = (y-\thetab^{\top}\bm{x})^2+\frac{\lambda}{n} \Vert \thetab \Vert_2^2$$

Summed over $n$ samples this is exactly to Ridge regression with regularization strength $\lambda$.\\
Summing over $n$ samples exactly yields Ridge regression with hyperparameter $\lambda$.\\
$\Longrightarrow$ Minimizing the expected squared loss over the noise distribution is Ridge regression on the unperturbed features $\xi$!

\end{vbframe}
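The expectation identity can be checked by Monte Carlo; a minimal numpy sketch (Gaussian perturbations are used here as one distribution with the stated mean and covariance; the toy values and names are assumptions).

import numpy as np

rng = np.random.default_rng(2)
n, p, lam = 20, 3, 2.0
x = rng.normal(size=p)
y = 1.5
theta = np.array([0.3, -1.0, 0.8])

# Average perturbed squared loss over many draws delta ~ (0, (lam/n) * I_p)
deltas = rng.normal(scale=np.sqrt(lam / n), size=(200_000, p))
mc = np.mean((y - (x + deltas) @ theta) ** 2)

# Analytic value: (y - theta^T x)^2 + (lam/n) * ||theta||_2^2
analytic = (y - x @ theta) ** 2 + (lam / n) * np.sum(theta ** 2)

print(mc, analytic)  # agree up to Monte Carlo error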
@@ -21,29 +21,33 @@
\begin{vbframe}{Soft-thresholding and L1 regularization}
In the lecture, we wanted to solve
\[
\min_{\thetab} \mathcal{\tilde R}_{\text{reg}}(\thetab) = \min_{\thetab}\mathcal{R}_{\text{emp}}(\thetah) + \sum_k \left[ \frac{1}{2} H_{k,k} (\theta_k - \hat{\theta}_k)^2 \right] + \sum_k \lambda |\theta_k|.
\]
This is a convex problem (since it is the sum of convex functions) for which, in general, no analytical solution exists. \\
\min_{\thetab} \mathcal{\tilde R}_{\text{reg}}(\thetab) = \min_{\thetab}\mathcal{R}_{\text{emp}}(\thetah) + \sum_j \left[ \frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 \right] + \sum_j \lambda |\theta_j|.
\]
Note that we can separate the dimensions, i.e.,

\[\mathcal{\tilde R}_{\text{reg}}(\thetab) = \mathcal{R}_{\text{emp}}(\thetah) + \sum_j g_j(\theta_j) \text{ with } g_j(\theta_j) = \frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 + \lambda |\theta_j|.\]

Since $\mathcal{R}_{\text{emp}}(\thetah)$ does not depend on $\thetab$, we can minimize each $g_j$ separately to find the global minimum. \\
\lz

For convex functions, every stationary point is a minimum. \\
Each $g_j$ is convex since it is a sum of convex functions. For convex functions, every stationary point is a minimum. \\
\lz

Hence, we will analyze the coordinate-wise derivative $\frac{\partial}{\partial \thetab_j} \mathcal{\tilde R}_{\text{reg}}.$ \\
(Note: This derivative is not defined for $\theta_j = 0$)\\
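The separability claim can be illustrated numerically; a minimal numpy/scipy sketch with an assumed diagonal Hessian and toy values (all names and numbers are illustrative): minimizing each $g_j$ on its own also minimizes the joint objective.

import numpy as np
from scipy.optimize import minimize_scalar

rng = np.random.default_rng(3)
p, lam = 4, 0.7
h = rng.uniform(0.5, 2.0, size=p)     # diagonal Hessian entries H_{j,j}
theta_hat = rng.normal(size=p)        # unregularized minimizer \hat{theta}

def g(j, t):  # g_j(theta_j)
    return 0.5 * h[j] * (t - theta_hat[j]) ** 2 + lam * abs(t)

def r_reg(theta):  # joint objective, up to the constant R_emp(\hat{theta})
    return np.sum(0.5 * h * (theta - theta_hat) ** 2 + lam * np.abs(theta))

# Minimize each g_j separately ...
theta_cw = np.array([minimize_scalar(lambda t, j=j: g(j, t)).x for j in range(p)])

# ... and check that no randomly perturbed candidate beats it on the joint objective
candidates = theta_cw + rng.normal(scale=0.5, size=(10_000, p))
vals = np.sum(0.5 * h * (candidates - theta_hat) ** 2 + lam * np.abs(candidates), axis=1)
print(r_reg(theta_cw) <= vals.min() + 1e-6)  # True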


\framebreak

First, we will focus on the everywhere differentiable part:
\begin{align*}
\frac{\partial}{\partial \theta_j}\sum_k \left[\frac{1}{2} H_{k,k} (\theta_k - \hat{\theta}_k)^2 \right]
&= H_{j,j} (\theta_j - \hat{\theta}_j) \\
&= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j \\
\end{align*}
Now, we analyze the stationary points $\hat{\theta}_{\text{Lasso},j}$ of $\riskrt.$ \\
First, we consider the cases $\hat{\theta}_{\text{Lasso},j} > 0, \hat{\theta}_{\text{Lasso},j} < 0.$ \\
(Here $\frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}}$ exists) \\
Thus, we analyze the stationary points $\hat{\theta}_{\text{Lasso},j}$ of $g_j.$ \\
\lz
For this, we assume we already know the sign of the minimizer and then derive conditions for which our assumption holds. \\
\lz
So, first we consider the cases $\hat{\theta}_{\text{Lasso},j} > 0, \hat{\theta}_{\text{Lasso},j} < 0.$\\
\lz
NB:
\begin{itemize}
\item For $\theta_j > 0: \frac{d}{d\theta_j}\vert \theta_j\vert = \frac{d}{d\theta_j}\theta_j = 1$ and
\item For $\theta_j < 0: \frac{d}{d\theta_j}\vert \theta_j\vert = \frac{d}{d\theta_j}\left(-\theta_j\right) = -1$.
\end{itemize}

\framebreak

@@ -55,7 +59,7 @@
\hfill
\begin{minipage}{0.49\textwidth}
\begin{align*}
\frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j + \lambda \overset{!}{=} 0 \\
\frac{d}{d \theta_j}g_j(\theta_j) &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j + \lambda \overset{!}{=} 0 \\
&\Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j
-\frac{\lambda}{H_{j,j}} > 0 \\
&\iff \hat{\theta}_j > \frac{\lambda}{H_{j,j}}
@@ -71,7 +75,7 @@
\hfill
\begin{minipage}{0.49\textwidth}
\begin{align*}
\frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j - \lambda \overset{!}{=} 0 \\
\frac{d}{d \theta_j}g_j(\theta_j) &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j - \lambda \overset{!}{=} 0 \\
&\Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j
+\frac{\lambda}{H_{j,j}} < 0 \\
&\iff \hat{\theta}_j < -\frac{\lambda}{H_{j,j}}
@@ -85,8 +89,9 @@
\end{minipage}
\hfill
\begin{minipage}{0.49\textwidth}
$\Rightarrow$ If $\hat{\theta}_j \in [-\frac{\lambda}{H_{j,j}}, \frac{\lambda}{H_{j,j}}]$ then $\mathcal{\tilde R}_{\text{reg}}$ has no stationary point with $$\hat{\theta}_{\text{Lasso},j} < 0 \text{ or } \hat{\theta}_{\text{Lasso},j} > 0.$$ \\
However, there must be at least one stationary point since $\mathcal{\tilde R}_{\text{reg}}$ is a regularized convex risk.
$\Rightarrow$ If $\hat{\theta}_j \in [-\frac{\lambda}{H_{j,j}}, \frac{\lambda}{H_{j,j}}]$ then $g_j$ has no stationary point with $$\hat{\theta}_{\text{Lasso},j} < 0 \text{ or } \hat{\theta}_{\text{Lasso},j} > 0.$$ \\
However, $g_j$ must still attain a minimum since it is convex and coercive for $\lambda > 0.$\\
$\Rightarrow$ An equivalent constraint $\vert\theta_j\vert \leq t$ with $t \in\R_+$ must exist.
\end{minipage}
\\
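Combining the three cases gives the familiar soft-thresholding update $\hat{\theta}_{\text{Lasso},j} = \operatorname{sign}(\hat{\theta}_j)\max(|\hat{\theta}_j| - \lambda / H_{j,j}, 0)$; a minimal numpy/scipy sketch of that operator (function name and toy values are assumptions), checked against a direct one-dimensional minimization of $g_j$.

import numpy as np
from scipy.optimize import minimize_scalar

def soft_threshold(theta_hat_j, lam, h_jj):
    """Closed-form minimizer of g_j: sign(theta_hat_j) * max(|theta_hat_j| - lam/h_jj, 0)."""
    return np.sign(theta_hat_j) * max(abs(theta_hat_j) - lam / h_jj, 0.0)

lam, h_jj = 0.7, 1.3
for theta_hat_j in [-2.0, -0.3, 0.0, 0.4, 1.5]:
    g_j = lambda t: 0.5 * h_jj * (t - theta_hat_j) ** 2 + lam * abs(t)
    t_num = minimize_scalar(g_j).x
    print(theta_hat_j, soft_threshold(theta_hat_j, lam, h_jj), round(t_num, 6))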

