diff --git a/slides/information-theory/slides-info-kl.tex b/slides/information-theory/slides-info-kl.tex
index 1304232b..7120ed85 100644
--- a/slides/information-theory/slides-info-kl.tex
+++ b/slides/information-theory/slides-info-kl.tex
@@ -71,11 +71,10 @@
 \begin{vbframe} {KL-Divergence Example}
-KL divergence of $p(x)=N(0,1)$ and $q(x)=LP(0, \sigma)$ for varying $\sigma$
-
+KL divergence between $p(x)=N(0,1)$ and $q(x)=LP(0, \sigma)$ for varying $\sigma$ \\
+\lz
 \begin{figure}
 \includegraphics[width = 12cm ]{figure/kl_norm_lp.png}
-\includegraphics[width = 6cm ]{figure/kl_norm_lp_sigma.png}
 \end{figure}
@@ -100,8 +99,7 @@
 \end{vbframe}
 
 \begin{vbframe} {KL as Log-Difference}
-Suppose that data is being generated from an unknown distribution $p(x)$.
-Suppose we modeled $p(x)$ using an approximating distribution $q(x)$.
+Suppose that data is being generated from an unknown distribution $p(x)$ and we model $p(x)$ using an approximating distribution $q(x)$.
 
 \lz
@@ -112,6 +110,24 @@
 This is why we integrate out with respect to the data distribution $p$.
 A \enquote{good} approximation $q(x)$ should minimize the difference to $p(x)$.
+\usetikzlibrary{shapes, arrows.meta} % preferably loaded once in the preamble
+\begin{center}
+\begin{tikzpicture}[>=Stealth, rounded corners]
+
+  % Left box: a sample x drawn from the data distribution p(x)
+  \node (leftbox) [draw, rounded corners, minimum width=3cm, minimum height=1.5cm] {$x \sim p(x)$};
+
+  % Arrow: the sample is passed on
+  \draw[->] (leftbox.east) -- node[above] {$x$} ++(2,0);
+
+  % Right box: the pointwise log-difference evaluated at x
+  \node (rightbox) [draw, rounded corners, minimum width=3cm, minimum height=1.5cm, right of=leftbox, node distance=5cm] {$\log p(x) - \log q(x)$};
+
+\end{tikzpicture}
+\end{center}
 
 \framebreak
 Let $p(x)=N(0,1)$ and $q(x)=LP(0, 3)$. Observe
@@ -128,60 +144,69 @@
 
 \framebreak
 
-In machine learning, KL divergence is commonly used to quantify how different one distribution is from another.\\
-\lz
-\textbf{Example}:
-Let $q(x)$ be a binomial distribution with $N = 2$ and $p = 0.3$ and let $p(x)$ be a discrete uniform distribution. Both distributions have the same support $\Xspace = \{0, 1, 2\}$.
+%In machine learning, KL divergence is commonly used to quantify how different one distribution is from another.\\
+% \lz
+% \textbf{Example}:
+% Let $q(x)$ be a binomial distribution with $N = 2$ and $p = 0.3$ and let $p(x)$ be a discrete uniform distribution. Both distributions have the same support $\Xspace = \{0, 1, 2\}$.
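+% Editorial sketch (added): numeric illustration of the pointwise log-difference for
+% $p(x)=N(0,1)$ and $q(x)=LP(0,3)$; the evaluation points $x=0$ and $x=3$ are chosen for illustration only.
+With $\log p(x) = -\frac{1}{2}\log(2\pi) - \frac{x^2}{2}$ and $\log q(x) = -\log 6 - \frac{|x|}{3}$, we get, e.g.,
+$$\log p(0) - \log q(0) \approx -0.92 + 1.79 = 0.87, \qquad \log p(3) - \log q(3) \approx -5.42 + 2.79 = -2.63.$$
+Near the mode, $p$ assigns more log-density; in the tails, the heavier-tailed $q$ does. $D_{KL}(p \| q)$ averages exactly these log-differences over draws $x \sim p(x)$.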
-\begin{figure}
-\includegraphics[width = 5cm ]{figure/kl_log_diff_plot.png}
-\end{figure}
+% \begin{figure}
+% \includegraphics[width = 5cm ]{figure/kl_log_diff_plot.png}
+% \end{figure}
 
-\framebreak
+% \framebreak
 
-\begin{equation*}
-  \begin{split}
-    D_{KL}(p \| q) &= \sum_{x \in \Xspace} p(x) \log \left( \frac{p(x)}{q(x)} \right)
-    \\ &= 0.333 \log \left( \frac{0.333}{0.49} \right) + 0.333 \log \left( \frac{0.333}{0.42} \right) + 0.333 \log \left( \frac{0.333}{0.09} \right) \\ &= 0.23099 \text{ (nats)}
-  \end{split}
-\end{equation*}
+% \begin{equation*}
+%   \begin{split}
+%     D_{KL}(p \| q) &= \sum_{x \in \Xspace} p(x) \log \left( \frac{p(x)}{q(x)} \right)
+%     \\ &= 0.333 \log \left( \frac{0.333}{0.49} \right) + 0.333 \log \left( \frac{0.333}{0.42} \right) + 0.333 \log \left( \frac{0.333}{0.09} \right) \\ &= 0.23099 \text{ (nats)}
+%   \end{split}
+% \end{equation*}
 
-\begin{equation*}
-  \begin{split}
-    D_{KL}(q \| p) &= \sum_{x \in \Xspace} q(x) \log \left( \frac{q(x)}{p(x)} \right)
-    \\ &= 0.49 \log \left( \frac{0.49}{0.333} \right) + 0.42 \log \left( \frac{0.42}{0.333} \right) + 0.09 \log \left( \frac{0.09}{0.333} \right) \\ &= 0.16801 \text{ (nats)}
-  \end{split}
-\end{equation*}
+% \begin{equation*}
+%   \begin{split}
+%     D_{KL}(q \| p) &= \sum_{x \in \Xspace} q(x) \log \left( \frac{q(x)}{p(x)} \right)
+%     \\ &= 0.49 \log \left( \frac{0.49}{0.333} \right) + 0.42 \log \left( \frac{0.42}{0.333} \right) + 0.09 \log \left( \frac{0.09}{0.333} \right) \\ &= 0.16801 \text{ (nats)}
+%   \end{split}
+% \end{equation*}
 
-Again, note that $D_{KL}(p \| q) \neq D_{KL}(q \| p)$.
-\end{vbframe}
+% Again, note that $D_{KL}(p \| q) \neq D_{KL}(q \| p)$.
+ \end{vbframe}
 
 \begin{vbframe} {KL in Fitting}
-Because KL quantifies the difference between distributions, it can be used as a loss function to find a good fit for the observed data.
+In machine learning, KL divergence is commonly used to quantify how different one distribution is from another.\\
+
+Because KL quantifies the difference between distributions, it can be used as a loss function for fitting one distribution to another. %to find a good fit for the observed data. \\
+\lz
+In our example, we can identify the optimal $\sigma$ that minimizes the KL divergence.
 \begin{figure}
-  \centering
-  \scalebox{0.9}{\includegraphics{figure_man/binom1.png}}
-  \tiny{\\ Credit: Will Kurt}
-  \caption{ \footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable $X$ which takes values between 0 and 10. \textit{Right}: The KL divergence between the observed data and Binom(10,p) is minimized when $p \approx 0.57$.}}
+\includegraphics[width = 6cm ]{figure/kl_norm_lp_sigma.png}
 \end{figure}
-{\tiny Will Kurt (2017): Kullback-Leibler Divergence Explained.
-\emph{\url{https://www.countbayesie.com/blog/2017/5/9/kullback-leibler-divergence-explained}}\par}
+
+% \begin{figure}
+%   \centering
+%   \scalebox{0.9}{\includegraphics{figure_man/binom1.png}}
+%   \tiny{\\ Credit: Will Kurt}
+%   \caption{ \footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable $X$ which takes values between 0 and 10. \textit{Right}: The KL divergence between the observed data and Binom(10,p) is minimized when $p \approx 0.57$.}}
+% \end{figure}
+
+% {\tiny Will Kurt (2017): Kullback-Leibler Divergence Explained.
+% \emph{\url{https://www.countbayesie.com/blog/2017/5/9/kullback-leibler-divergence-explained}}\par}
 
 \framebreak
 
-Because KL quantifies the difference between distributions, it can be used as a loss function to find a good fit for the observed data.
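+% Editorial sketch (added): closed-form check of the optimal sigma, assuming the figure
+% kl_norm_lp_sigma.png plots $D_{KL}(p \| q)$ for $p(x)=N(0,1)$ and $q(x)=LP(0, \sigma)$ as above.
+For $p(x)=N(0,1)$ and $q(x)=LP(0,\sigma)$, the KL divergence has a closed form:
+$$ D_{KL}(p \| q) = \E_p\left[\log \frac{p(X)}{q(X)}\right]
+ = -\frac{1}{2}\log(2\pi) - \frac{1}{2} + \log(2\sigma) + \frac{\E_p|X|}{\sigma}
+ = \log(2\sigma) + \frac{\sqrt{2/\pi}}{\sigma} + \text{const.} $$
+Setting the derivative w.r.t. $\sigma$ to zero yields the minimizer $\sigma^* = \sqrt{2/\pi} \approx 0.8$.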
+% Because KL quantifies the difference between distributions, it can be used as a loss function to find a good fit for the observed data.
 
-\begin{figure}
-  \centering
-  \scalebox{0.9}{\includegraphics{figure_man/binom2.png}}
-  \tiny{\\ Credit: Will Kurt}
-  \caption{\footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable which takes values between 0 and 10. \textit{Right}: Fitted Binomial distribution ($p \approx 0.57$).}}
-\end{figure}
+% \begin{figure}
+%   \centering
+%   \scalebox{0.9}{\includegraphics{figure_man/binom2.png}}
+%   \tiny{\\ Credit: Will Kurt}
+%   \caption{\footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable which takes values between 0 and 10. \textit{Right}: Fitted Binomial distribution ($p \approx 0.57$).}}
+% \end{figure}
 
-On the right is the Binomial distribution that minimizes the KL divergence.
+% On the right is the Binomial distribution that minimizes the KL divergence.
 \end{vbframe}
 
 \begin{vbframe}{KL as likelihood ratio}
@@ -191,33 +216,52 @@
 \item How do we usually do that in stats? Likelihood ratio!
 \end{itemize}
 
-$$ LR = \frac{p(x)}{q(x)} $$
+$$ LR = \prod_i \frac{p(\xi)}{q(\xi)} \qquad LLR = \sum_i \log \frac{p(\xi)}{q(\xi)} $$
 
-In the above, if for $x$ we have $LR>1$, then $p$ seems better, for $LR < 1$ $q$ seems better.
+If $p(\xi)/q(\xi) > 1$, observation $\xi$ favors $p$; if $p(\xi)/q(\xi) < 1$, it favors $q$. \\
+\begin{itemize}
+ \item Let us assume that our data already come from $p$. Then it no longer makes sense to ask whether $p$ or $q$ fits the data better.
+ \item So we ask instead: \enquote{If we sample many observations from $p$, how easily can we see, on average, that $p$ is better than $q$ through the LLR?}
+\end{itemize}
+By the law of large numbers, the average LLR per observation converges to
+$$ \E_p \left[\log \frac{p(X)}{q(X)}\right] $$
+That expected LLR is exactly the KL divergence!
 
-\framebreak
+\end{vbframe}
 
-Or we can compute LR for a complete set of data (as always, logs make our life easier):
-$$ LR = \prod_i \frac{p(\xi)}{q(\xi)} \qquad LLR = \sum_i \log \frac{p(\xi)}{q(\xi)} $$
-Now let us assume that our data already come from $p$. It does not really make sense anymore to ask
-whether $p$ or $q$ fit the data better.
+% \begin{vbframe}{KL as likelihood ratio}
 
-But maybe we want to pose the question "How different is $q$ from $p$?" by formulating it as:
-"If we sample many data from $p$, how easily can we see that $p$ is better than $q$ through LR, on average?"
-$$ \E_p \left[\log \frac{p(X)}{q(X)}\right] $$
-That expected LR is really KL!
+% \begin{itemize}
+% \item Let us assume we have some data and want to figure out whether $p(x)$ or $q(x)$ matches it better.
+% \item How do we usually do that in stats? Likelihood ratio!
+% \end{itemize}
 
-\framebreak
+% $$ LR = \frac{p(x)}{q(x)} $$
+
+% In the above, if for $x$ we have $LR>1$, then $p$ seems better, for $LR < 1$ $q$ seems better.
 
-In summary we could say for KL:
-\begin{itemize}
-\item It measures how much "evidence" each sample provides on average to distinguish $p$ from $q$,
-  if you sample from $p$.
-\item If $p$ and $q$ are very similar, most samples will not help much, and vice versa for very different distributions.
-\item In practice, we often want to make the approximation $q$ as indistinguishable from the real $p$ (our data) as possible. We already did that when we fitted (in our log-difference perspective).
-\end{itemize}
+
-\end{vbframe}
+% Or we can compute LR for a complete set of data (as always, logs make our life easier):
+% $$ LR = \prod_i \frac{p(\xi)}{q(\xi)} \qquad LLR = \sum_i \log \frac{p(\xi)}{q(\xi)} $$
+% Now let us assume that our data already come from $p$. It does not really make sense anymore to ask
+% whether $p$ or $q$ fit the data better.
+
+% But maybe we want to pose the question "How different is $q$ from $p$?" by formulating it as:
+% "If we sample many data from $p$, how easily can we see that $p$ is better than $q$ through LR, on average?"
+% $$ \E_p \left[\log \frac{p(X)}{q(X)}\right] $$
+% That expected LR is really KL!
+
+% \framebreak
+
+% In summary we could say for KL:
+% \begin{itemize}
+% \item It measures how much "evidence" each sample provides on average to distinguish $p$ from $q$,
+% if you sample from $p$.
+% \item If $p$ and $q$ are very similar, most samples will not help much, and vice versa for very different distributions.
+% \item In practice, we often want to make the approximation $q$ as indistinguishable from the real $p$ (our data) as possible. We already did that when we fitted (in our log-difference perspective).
+% \end{itemize}
+
+% \end{vbframe}
 
 %\begin{vbframe} {Kullback-Leibler Divergence - Summary}
diff --git a/slides/regularization/slides-regu-ridge-deepdive.tex b/slides/regularization/slides-regu-ridge-deepdive.tex
index b123ade8..b4dba618 100644
--- a/slides/regularization/slides-regu-ridge-deepdive.tex
+++ b/slides/regularization/slides-regu-ridge-deepdive.tex
@@ -37,9 +37,28 @@
 =\sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2
 $$
 }
-\normalsize{Thus the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$ is $\thetah_{\text{Ridge}}$.}
+\normalsize{$\Longrightarrow$ $\thetah_{\text{Ridge}}$ is the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$!}
 %$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$
 \end{vbframe}
+\begin{vbframe}{Another perspective on $L2$}
+Now consider perturbed features $\tilde{\xi} := \xi + \bm{\delta}_i$, where $\bm{\delta}_i \overset{iid}{\sim} (\bm{0},\frac{\lambda}{n} \id_p)$, i.e., only the mean and covariance of the perturbations are specified; no parametric family is assumed. We want to minimize the expected squared error w.r.t. the random perturbations:
+$$\riskt := \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\tilde{\bm{x}})^2] = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}(\bm{x}+\bm{\delta}))^2]$$
+Expanding, we obtain
+$$\riskt = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\bm{\delta}(y-\thetab^{\top}\bm{x}) + \thetab^{\top}\bm{\delta}\bm{\delta}^{\top}\thetab]$$
+
+Using linearity of expectation, $\mathbb{E}_{\bm{\delta}}[\bm{\delta}]=\bm{0}$, and $\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]=\frac{\lambda}{n} \id_p$, we get
+$$\riskt=(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\mathbb{E}[\bm{\delta}](y-\thetab^{\top}\bm{x}) + \thetab^{\top}\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]\thetab = (y-\thetab^{\top}\bm{x})^2+\frac{\lambda}{n} \Vert \thetab \Vert_2^2$$
+
+Summed over the $n$ samples, this is exactly the Ridge regression objective with regularization strength $\lambda$:
+$$\sumin \mathbb{E}_{\bm{\delta}_i}\left[(\yi - \thetab^{\top}\tilde{\xi})^2\right] = \sumin \left(\yi - \thetab^{\top}\xi\right)^2 + \lambda \Vert \thetab \Vert_2^2$$
+$\Longrightarrow$ Minimizing the expected squared loss under random feature noise is equivalent to Ridge regression on the unperturbed features $\xi$!
+
+\end{vbframe}
+
+
+
+
+
+
 \endlecture
 \end{document}