From 048c173f8458c5847398e3fc5377101b75404f99 Mon Sep 17 00:00:00 2001
From: ludwigbothmann <46222472+ludwigbothmann@users.noreply.github.com>
Date: Wed, 6 Dec 2023 00:49:30 +0100
Subject: [PATCH] Updates from Overleaf

---
 .../slides-regu-ridge-deepdive.tex | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/slides/regularization/slides-regu-ridge-deepdive.tex b/slides/regularization/slides-regu-ridge-deepdive.tex
index bca0c2e4..9f2317e3 100644
--- a/slides/regularization/slides-regu-ridge-deepdive.tex
+++ b/slides/regularization/slides-regu-ridge-deepdive.tex
@@ -5,7 +5,7 @@
 \newcommand{\titlefigure}{figure_man/bias-variance-ridge.png}
 \newcommand{\learninggoals}{
-  \item Know alternative interpretations of Ridge regression
+  \item Know four perspectives on Ridge regression
   \item Derivation of the bias-variance tradeoff for Ridge regression
 }

@@ -14,7 +14,7 @@

 \begin{document}

-\lecturechapter{Ridge Regression Deep-Dive}
+\lecturechapter{Perspectives on Ridge Regression (Deep-Dive)}
 \lecture{Introduction to Machine Learning}

@@ -35,7 +35,7 @@

 \end{vbframe}

-\begin{vbframe}{Another perspective on $L2$}
+\begin{vbframe}{$L2$ and row-augmentation}
 We can also recover the Ridge estimator by performing least-squares on a \textbf{row-augmented} data set: Let $\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix} \yv \\ \bm{0}_{p} \end{pmatrix}$. Using the augmented data, the unregularized least-squares solution $\tilde{\thetab}$ can be written as
@@ -50,18 +50,20 @@
 %$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$
 \end{vbframe}

-\begin{vbframe}{Yet Another perspective on $L2$}
-Now consider perturbed features $ \tilde{\xi}:= \xi + \bm{\delta}_i$ where $\bm{\delta}_i \overset{iid}{\sim} (\bm{0},\frac{\lambda}{n} \id_p)$. Note that no parametric family is assumed. We want to minimize the expected squared error taken w.r.t. the random perturbations:
-$$\riskt:= \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\tilde{\bm{x}})^2] = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}(\bm{x}+\bm{\delta}))^2]$$
-Expanding, we obtain
-$$\riskt = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\bm{\delta}(y-\thetab^{\top}\bm{x}) + \thetab^{\top}\bm{\delta}\bm{\delta}^{\top}\thetab]$$
-
-By linearity of expectation, $\mathbb{E}_{\bm{\delta}}[\bm{\delta}]=\bm{0}$ and $\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]=\frac{\lambda}{n} \id_p$, we get
-$$\riskt=(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\mathbb{E}[\bm{\delta}](y-\thetab^{\top}\bm{x}) + \thetab^{\top}\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]\thetab = (y-\thetab^{\top}\bm{x})^2+\frac{\lambda}{n} \Vert \thetab \Vert_2^2$$
-
-Summing over $n$ samples exactly yields Ridge regression with hyperparameter $\lambda$.\\
-$\Longrightarrow$ Minimizing squared loss over noise distribution is Ridge regression on unperturbed features $\xi$!
-
+\begin{vbframe}{$L2$ and noisy features}
+Now consider perturbed features $\tilde{\xi} := \xi + \bm{\delta}^{(i)}$ where $\bm{\delta}^{(i)} \overset{iid}{\sim} (\bm{0}, \lambda \id_p)$. Note that no parametric family is assumed. We want to minimize the expected squared error taken w.r.t. the random perturbations $\bm{\delta}^{(i)}$:
+$$\riskt := \mathbb{E}_{\bm{\delta}}\Big[\frac{1}{n}{\textstyle \sumin}\big(\yi-\thetab^{\top}\tilde{\xi}\big)^2\Big] = \mathbb{E}_{\bm{\delta}}\Big[\frac{1}{n}{\textstyle \sumin}\big(\yi-\thetab^{\top}(\xi+\bm{\delta}^{(i)})\big)^2\Big]\,\,\Big|\, \text{expand}$$
+\vspace{-0.2cm}
+%Expanding, we obtain
+$$\riskt = \mathbb{E}_{\bm{\delta}}\Big[\frac{1}{n}{\textstyle \sumin}\big((\yi-\thetab^{\top}\xi)^2 - 2 \thetab^{\top}\bm{\delta}^{(i)}(\yi-\thetab^{\top}\xi) + \thetab^{\top}\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}\thetab\big)\Big]$$
+
+By linearity of expectation, $\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}]=\bm{0}_p$ and $\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}]=\lambda \id_p$, we get
+\vspace{-0.2cm}
+%
+\begin{align*}\riskt &= \frac{1}{n}{\textstyle \sumin}\big((\yi-\thetab^{\top}\xi)^2 - 2 \thetab^{\top}\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}](\yi-\thetab^{\top}\xi) + \thetab^{\top}\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}]\thetab\big) \\
+&= \frac{1}{n}{\textstyle \sumin}(\yi-\thetab^{\top}\xi)^2 + \lambda \Vert \thetab \Vert_2^2
+\end{align*}
+$\Longrightarrow$ Minimizing squared loss averaged over the feature noise distribution is exactly Ridge regression on the unperturbed features {\small $\xi$}!
 \end{vbframe}
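
Both equivalences introduced by this patch can be sanity-checked numerically. Below is a minimal sketch (assuming Python with numpy; variable names are illustrative and not part of the repository): least squares on the row-augmented data reproduces the closed-form Ridge estimator exactly, and the squared loss averaged over feature noise with covariance lam * I approaches the Ridge objective. Gaussian noise is used only for concreteness; the slide's derivation requires just mean zero and covariance lam * I.

import numpy as np

rng = np.random.default_rng(0)
n, p, lam = 50, 5, 2.0
X = rng.normal(size=(n, p))
y = rng.normal(size=n)

# Closed-form Ridge estimator: (X^T X + lam * I)^{-1} X^T y
theta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# Perspective 1 (row-augmentation): plain least squares on the
# augmented data (X stacked on sqrt(lam) * I, y stacked on zeros)
X_aug = np.vstack([X, np.sqrt(lam) * np.eye(p)])
y_aug = np.concatenate([y, np.zeros(p)])
theta_aug = np.linalg.lstsq(X_aug, y_aug, rcond=None)[0]
print(np.allclose(theta_ridge, theta_aug))  # True

# Perspective 2 (noisy features): for a fixed theta, the squared loss
# averaged over delta ~ (0, lam * I) approaches the Ridge objective
theta = rng.normal(size=p)
ridge_objective = np.mean((y - X @ theta) ** 2) + lam * theta @ theta
noisy_losses = [
    np.mean((y - (X + rng.normal(scale=np.sqrt(lam), size=(n, p))) @ theta) ** 2)
    for _ in range(5000)
]
print(ridge_objective, np.mean(noisy_losses))  # agree up to Monte Carlo error

The augmentation check is exact up to floating point, since the augmented least-squares objective ||y - X theta||^2 + lam ||theta||^2 has the Ridge solution as its unique minimizer; the noise check only agrees up to sampling error.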