From 048c173f8458c5847398e3fc5377101b75404f99 Mon Sep 17 00:00:00 2001
From: ludwigbothmann <46222472+ludwigbothmann@users.noreply.github.com>
Date: Wed, 6 Dec 2023 00:49:30 +0100
Subject: [PATCH] Updates from Overleaf

---
 .../slides-regu-ridge-deepdive.tex | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/slides/regularization/slides-regu-ridge-deepdive.tex b/slides/regularization/slides-regu-ridge-deepdive.tex
index bca0c2e4..9f2317e3 100644
--- a/slides/regularization/slides-regu-ridge-deepdive.tex
+++ b/slides/regularization/slides-regu-ridge-deepdive.tex
@@ -5,7 +5,7 @@
 \newcommand{\titlefigure}{figure_man/bias-variance-ridge.png}
 \newcommand{\learninggoals}{
-  \item Know alternative interpretations of Ridge regression
+  \item Know four perspectives on Ridge regression
   \item Derivation of the bias-variance tradeoff for Ridge regression
 }

@@ -14,7 +14,7 @@

 \begin{document}

-\lecturechapter{Ridge Regression Deep-Dive}
+\lecturechapter{Perspectives on Ridge Regression (Deep-Dive)}
 \lecture{Introduction to Machine Learning}

@@ -35,7 +35,7 @@

 \end{vbframe}

-\begin{vbframe}{Another perspective on $L2$}
+\begin{vbframe}{$L2$ and row-augmentation}
 We can also recover the Ridge estimator by performing least-squares on a \textbf{row-augmented} data set: Let $\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix} \yv \\ \bm{0}_{p} \end{pmatrix}$. Using the augmented data, the unregularized least-squares solution $\tilde{\thetab}$ can be written as
@@ -50,18 +50,20 @@
 %$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$
 \end{vbframe}

-\begin{vbframe}{Yet Another perspective on $L2$}
-Now consider perturbed features $ \tilde{\xi}:= \xi + \bm{\delta}_i$ where $\bm{\delta}_i \overset{iid}{\sim} (\bm{0},\frac{\lambda}{n} \id_p)$. Note that no parametric family is assumed. We want to minimize the expected squared error taken w.r.t. the random perturbations:
-$$\riskt:= \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\tilde{\bm{x}})^2] = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}(\bm{x}+\bm{\delta}))^2]$$
-Expanding, we obtain
-$$\riskt = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\bm{\delta}(y-\thetab^{\top}\bm{x}) + \thetab^{\top}\bm{\delta}\bm{\delta}^{\top}\thetab]$$
-
-By linearity of expectation, $\mathbb{E}_{\bm{\delta}}[\bm{\delta}]=\bm{0}$ and $\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]=\frac{\lambda}{n} \id_p$, we get
-$$\riskt=(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\mathbb{E}[\bm{\delta}](y-\thetab^{\top}\bm{x}) + \thetab^{\top}\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]\thetab = (y-\thetab^{\top}\bm{x})^2+\frac{\lambda}{n} \Vert \thetab \Vert_2^2$$
-
-Summing over $n$ samples exactly yields Ridge regression with hyperparameter $\lambda$.\\
-$\Longrightarrow$ Minimizing squared loss over noise distribution is Ridge regression on unperturbed features $\xi$!
-
+\begin{vbframe}{$L2$ and noisy features}
+Now consider perturbed features $\tilde{\xi} := \xi + \bm{\delta}^{(i)}$ where $\bm{\delta}^{(i)} \overset{iid}{\sim} (\bm{0}, \lambda \id_p)$. Note that no parametric family is assumed. We want to minimize the expected squared error taken w.r.t. the random perturbations $\bm{\delta}^{(i)}$:
+$$\riskt := \mathbb{E}_{\bm{\delta}}\Big[\frac{1}{n}{\textstyle \sumin}\big(\yi-\thetab^{\top}\tilde{\xi}\big)^2\Big] = \mathbb{E}_{\bm{\delta}}\Big[\frac{1}{n}{\textstyle \sumin}\big(\yi-\thetab^{\top}(\xi+\bm{\delta}^{(i)})\big)^2\Big]\,\,\Big|\, \text{expand}$$
+\vspace{-0.2cm}
+%Expanding, we obtain
+$$\riskt = \mathbb{E}_{\bm{\delta}}\Big[\frac{1}{n}{\textstyle \sumin}\big((\yi-\thetab^{\top}\xi)^2 - 2 \thetab^{\top}\bm{\delta}^{(i)}(\yi-\thetab^{\top}\xi) + \thetab^{\top}\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}\thetab\big)\Big]$$
+
+By linearity of expectation, $\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}]=\bm{0}_p$ and $\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}]=\lambda \id_p$, we get
+\vspace{-0.2cm}
+%
+\begin{align*}\riskt &= \frac{1}{n}{\textstyle \sumin}\big((\yi-\thetab^{\top}\xi)^2 - 2 \thetab^{\top}\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}](\yi-\thetab^{\top}\xi) + \thetab^{\top}\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}]\thetab\big) \\
+&= \frac{1}{n}{\textstyle \sumin}(\yi-\thetab^{\top}\xi)^2 + \lambda \Vert \thetab \Vert_2^2
+\end{align*}
+$\Longrightarrow$ Minimizing squared loss averaged over the feature noise distribution is exactly Ridge regression on the unperturbed features {\small $\xi$}!
 \end{vbframe}
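
Both equivalences introduced by this patch can be sanity-checked numerically. Below is a minimal sketch (assuming Python with numpy; variable names are illustrative and not part of the repository): least squares on the row-augmented data reproduces the closed-form Ridge estimator exactly, and the squared loss averaged over feature noise with covariance lam * I approaches the Ridge objective. Gaussian noise is used only for concreteness; the slide's derivation requires just mean zero and covariance lam * I.

import numpy as np

rng = np.random.default_rng(0)
n, p, lam = 50, 5, 2.0
X = rng.normal(size=(n, p))
y = rng.normal(size=n)

# Closed-form Ridge estimator: (X^T X + lam * I)^{-1} X^T y
theta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# Perspective 1 (row-augmentation): plain least squares on the
# augmented data (X stacked on sqrt(lam) * I, y stacked on zeros)
X_aug = np.vstack([X, np.sqrt(lam) * np.eye(p)])
y_aug = np.concatenate([y, np.zeros(p)])
theta_aug = np.linalg.lstsq(X_aug, y_aug, rcond=None)[0]
print(np.allclose(theta_ridge, theta_aug))  # True

# Perspective 2 (noisy features): for a fixed theta, the squared loss
# averaged over delta ~ (0, lam * I) approaches the Ridge objective
theta = rng.normal(size=p)
ridge_objective = np.mean((y - X @ theta) ** 2) + lam * theta @ theta
noisy_losses = [
    np.mean((y - (X + rng.normal(scale=np.sqrt(lam), size=(n, p))) @ theta) ** 2)
    for _ in range(5000)
]
print(ridge_objective, np.mean(noisy_losses))  # agree up to Monte Carlo error

The augmentation check is exact up to floating point, since the augmented least-squares objective ||y - X theta||^2 + lam ||theta||^2 has the Ridge solution as its unique minimizer; the noise check only agrees up to sampling error.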