Updates from Overleaf

ludwigbothmann committed Dec 5, 2023
1 parent fba6e07 commit 048c173
Showing 1 changed file with 17 additions and 15 deletions.
32 changes: 17 additions & 15 deletions slides/regularization/slides-regu-ridge-deepdive.tex
@@ -5,7 +5,7 @@

\newcommand{\titlefigure}{figure_man/bias-variance-ridge.png}
\newcommand{\learninggoals}{
-\item Know alternative interpretations of Ridge regression
+\item Know four perspectives on Ridge regression
\item Derivation of the bias-variance tradeoff for Ridge regression
}

@@ -14,7 +14,7 @@

\begin{document}

-\lecturechapter{Ridge Regression Deep-Dive}
+\lecturechapter{Perspectives on Ridge Regression (Deep-Dive)}
\lecture{Introduction to Machine Learning}


@@ -35,7 +35,7 @@

\end{vbframe}

-\begin{vbframe}{Another perspective on $L2$}
+\begin{vbframe}{$L2$ and row-augmentation}
We can also recover the Ridge estimator by performing least-squares on a \textbf{row-augmented} data set: Let $\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix}
\yv \\ \bm{0}_{p}
\end{pmatrix}$. Using the augmented data, the unregularized least-squares solution $\tilde{\thetab}$ can be written as
@@ -50,18 +50,20 @@
%$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$
\end{vbframe}
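The identity behind this frame is that $\tilde{\Xmat}^{\top}\tilde{\Xmat} = \Xmat^{\top}\Xmat + \lambda \id_p$ and $\tilde{\Xmat}^{\top}\tilde{\yv} = \Xmat^{\top}\yv$, so the unregularized least-squares solution on the augmented data collapses to the Ridge estimator. A minimal numerical sketch of this equivalence, assuming only numpy (data, seed, and the value of lambda are illustrative, not from the slides):

import numpy as np

rng = np.random.default_rng(0)
n, p, lam = 50, 5, 2.0
X = rng.normal(size=(n, p))
y = rng.normal(size=n)

# Ridge estimator: solve (X'X + lambda*I) theta = X'y (no explicit inverse)
theta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# Plain least squares on the row-augmented data
X_aug = np.vstack([X, np.sqrt(lam) * np.eye(p)])
y_aug = np.concatenate([y, np.zeros(p)])
theta_aug, *_ = np.linalg.lstsq(X_aug, y_aug, rcond=None)

assert np.allclose(theta_ridge, theta_aug)  # the two solutions coincide

Solving the normal equations via np.linalg.solve rather than forming an explicit inverse is just the numerically safer way to write the same estimator.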

-\begin{vbframe}{Yet Another perspective on $L2$}
-Now consider perturbed features $ \tilde{\xi}:= \xi + \bm{\delta}_i$ where $\bm{\delta}_i \overset{iid}{\sim} (\bm{0},\frac{\lambda}{n} \id_p)$. Note that no parametric family is assumed. We want to minimize the expected squared error taken w.r.t. the random perturbations:
-$$\riskt:= \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\tilde{\bm{x}})^2] = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}(\bm{x}+\bm{\delta}))^2]$$
-Expanding, we obtain
-$$\riskt = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\bm{\delta}(y-\thetab^{\top}\bm{x}) + \thetab^{\top}\bm{\delta}\bm{\delta}^{\top}\thetab]$$
-
-By linearity of expectation, $\mathbb{E}_{\bm{\delta}}[\bm{\delta}]=\bm{0}$ and $\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]=\frac{\lambda}{n} \id_p$, we get
-$$\riskt=(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\mathbb{E}[\bm{\delta}](y-\thetab^{\top}\bm{x}) + \thetab^{\top}\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]\thetab = (y-\thetab^{\top}\bm{x})^2+\frac{\lambda}{n} \Vert \thetab \Vert_2^2$$
-
-Summing over $n$ samples exactly yields Ridge regression with hyperparameter $\lambda$.\\
-$\Longrightarrow$ Minimizing squared loss over noise distribution is Ridge regression on unperturbed features $\xi$!

+\begin{vbframe}{$L2$ and noisy features}
+Now consider perturbed features $ \tilde{\xi}:= \xi + \bm{\delta}^{(i)}$ where $\bm{\delta}^{(i)} \overset{iid}{\sim} (\bm{0},\lambda \id_p)$. Note that no parametric family is assumed. We want to minimize the expected squared error taken w.r.t. the perturbations $\bm{\delta}$:
+$$\riskt:= \mathbb{E}_{\bm{\delta}}\Big[\frac{1}{n}{\textstyle \sumin}\big((\yi-\thetab^{\top}\tilde{\xi})^2\big)\Big] = \mathbb{E}_{\bm{\delta}}\Big[\frac{1}{n}{\textstyle \sumin}\big((\yi-\thetab^{\top}(\xi+\bm{\delta}^{(i)}))^2\big)\Big]\,\,\Big|\, \text{expand}$$
+\vspace{-0.2cm}
+%Expanding, we obtain
+$$\riskt = \mathbb{E}_{\bm{\delta}}\Big[\frac{1}{n}{\textstyle \sumin}\big((\yi-\thetab^{\top}\xi)^2 - 2 \thetab^{\top}\bm{\delta}^{(i)}(\yi-\thetab^{\top}\xi) + \thetab^{\top}\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}\thetab\big)\Big]$$
+
+By linearity of expectation, with $\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}]=\bm{0}_p$ and $\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}]=\lambda \id_p$, this becomes
+\vspace{-0.2cm}
+%
+\begin{align*}\riskt&=\frac{1}{n}{\textstyle \sumin}\big((\yi-\thetab^{\top}\xi)^2 - 2 \thetab^{\top}\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}](\yi-\thetab^{\top}\xi) + \thetab^{\top}\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}]\thetab \big) \\
+&= \frac{1}{n}{\textstyle \sumin}(\yi-\thetab^{\top}\xi)^2+\lambda \Vert \thetab \Vert_2^2
+\end{align*}
+$\Longrightarrow$ Ridge regression on unperturbed features {\small $\xi$} turns out to be minimizing the squared loss averaged over the feature-noise distribution!
\end{vbframe}
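This identity can also be checked by simulation: averaging the squared loss over random perturbations with mean $\bm{0}_p$ and covariance $\lambda \id_p$ should approach the Ridge objective on the clean features. A Monte Carlo sketch, again assuming only numpy (sizes, theta, and lambda are illustrative; Gaussian noise is merely a convenient choice, since only the first two moments matter):

import numpy as np

rng = np.random.default_rng(0)
n, p, lam = 50, 5, 2.0
X = rng.normal(size=(n, p))
y = rng.normal(size=n)
theta = rng.normal(size=p)  # any fixed parameter vector

# Perturbations delta^(i) with mean 0 and covariance lambda * I_p
n_draws = 20_000
deltas = rng.normal(scale=np.sqrt(lam), size=(n_draws, n, p))

# Squared loss on perturbed features, averaged over the n samples and all draws
preds = (X + deltas) @ theta            # shape (n_draws, n)
mc_risk = np.mean((y - preds) ** 2)

# Ridge objective on the unperturbed features
ridge_obj = np.mean((y - X @ theta) ** 2) + lam * theta @ theta

print(mc_risk, ridge_obj)  # agree up to Monte Carlo error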


