From a756ce319f035fa000eb6121dcc071f12bfa095d Mon Sep 17 00:00:00 2001
From: ludwigbothmann <46222472+ludwigbothmann@users.noreply.github.com>
Date: Wed, 10 Jan 2024 10:32:58 +0100
Subject: [PATCH] Updates from Overleaf

---
 .../information-theory/slides-info-kl-ml.tex  | 42 +++++++++++--------
 .../slides-regu-ridge-deepdive.tex            |  2 +-
 2 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/slides/information-theory/slides-info-kl-ml.tex b/slides/information-theory/slides-info-kl-ml.tex
index e9c994ce..ff9a06b4 100644
--- a/slides/information-theory/slides-info-kl-ml.tex
+++ b/slides/information-theory/slides-info-kl-ml.tex
@@ -3,7 +3,7 @@
 \input{../../latex-math/basic-math}
 \input{../../latex-math/basic-ml}
 
-\newcommand{\titlefigure}{figure/kl_log_diff_plot.png}
+\newcommand{\titlefigure}{figure/normal_distributions.png}
 \newcommand{\learninggoals}{
   \item Understand why measuring distribution similarity is important in ML
   \item Understand the advantages of forward and reverse KL
@@ -43,7 +43,7 @@
 
 \lz
 
-Many losses can be derived this way. (e.g. cross-entropy loss)
+Many losses can be derived this way (e.g., the cross-entropy loss).
 
 \end{itemize}
 
@@ -58,6 +58,7 @@
 \end{center}
 
 We can measure dependency by measuring the similarity between $p(\mathbf{x}, y)$ and $p(\mathbf{x})\cdot p(y).$ \\
+\lz
 We will later see that measuring this similarity with KL leads to the concept of mutual information.
 
 \end{itemize}
@@ -66,7 +67,8 @@
 \begin{itemize}
 \item \textbf{Variational inference (VI)}
 
-Our data can also induce probability distributions: By Bayes' theorem it holds that the posterior density $$p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}) = \frac{p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})}{\int p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})d\bm{\theta}}.$$ However, computing this density analytically is usually intractable.
+%Our data can also induce probability distributions:
+By Bayes' theorem, the posterior density is $$p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}) = \frac{p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})}{\int p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})d\bm{\theta}}.$$ However, computing the normalization constant $c = \int p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})d\bm{\theta}$ analytically is usually intractable.
 
 \begin{center}
 \includegraphics[width=0.99\linewidth]{figure/gaussian_mixture_scatter.png}
@@ -74,7 +76,7 @@
 
 In VI, we want to fit a density $q_{\bm{\phi}}$ with parameters $\bm{\phi}$ to $p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}).$
 
-This scenario fundamentally differs from the previous ones because we can now generate samples.
+%This scenario fundamentally differs from the previous ones because we can now generate samples.
 
 \end{itemize}
 
@@ -105,17 +107,22 @@
 \begin{itemize}
 \item \textbf{Forward KL for probabilistic model fitting}
- \\ We have samples from the DGP $p(y|x)$ when we fit our ML model.
+ \\ We have samples from the DGP $p(y, \xv)$ when we fit our ML model.
 \\
 \lz
- If we have a probabilistic ML model $q_{\bm{\phi}}$ and can specify $p(y|x)$ then the forward KL can be directly applied such that
+ If we have a probabilistic ML model $q_{\bm{\phi}}$, the expected forward KL is
 $$\E_{\xv \sim p_{\xv}}D_{KL}(p(\cdot|\xv) \| q_{\bm{\phi}}(\cdot|\xv)) = \E_{\xv \sim p_{\xv}}\E_{y \sim p_{y|\xv}}\log\left(\frac{p(y|\xv)}{q_{\bm{\phi}}(y|\xv)}\right).$$
-For example, if $p$ and $q_{\bm{\phi}}$ are Gaussians with the same $\sigma$, minimizing this expression is equivalent to L2 minimization. \\
-\lz
-Assuming we have i.i.d. observations, an unbiased estimator of this expected forward KL is
-$$\sumin \log\left(\frac{p(\yi|\xi)}{q_{\bm{\phi}}(\yi|\xi)}\right) \Rightarrow \text{can be used for mini-batching.} $$
-
+We can directly minimize this objective since
+\begin{align*}
+    \nabla_{\bm{\phi}} \E_{\xv \sim p_{\xv}}D_{KL}(p(\cdot|\xv) \| q_{\bm{\phi}}(\cdot|\xv)) &= \E_{\xv \sim p_{\xv}}\E_{y \sim p_{y|\xv}}\nabla_{\bm{\phi}}\log\left(
+    p(y|\xv)\right) \\
+    &- \E_{\xv \sim p_{\xv}}\E_{y \sim p_{y|\xv}}\nabla_{\bm{\phi}}\log\left(q_{\bm{\phi}}(y|\xv)\right) \\
+    &= -\nabla_{\bm{\phi}} \E_{\xv \sim p_{\xv}}\E_{y \sim p_{y|\xv}}\log\left(q_{\bm{\phi}}(y|\xv)\right) \qquad \text{(since $\nabla_{\bm{\phi}} \log p(y|\xv) = 0$)}
+    \end{align*}
+% Assuming we have i.i.d. observations, an unbiased estimator of this expected forward KL is
+% $$\sumin \log\left(\frac{p(\yi|\xi)}{q_{\bm{\phi}}(\yi|\xi)}\right) \Rightarrow \text{can be used for mini-batching.} $$
+    $\Rightarrow$ We can estimate the gradient of the expected forward KL without bias, although we cannot evaluate $p(y\vert \xv)$ in general.
 \end{itemize}
 
 \framebreak
@@ -132,17 +139,16 @@
 $\Rightarrow$ We can estimate the gradient of the reverse KL without bias (even if we only have an unnormalized target distribution)
 \end{itemize}
 \framebreak
-
-\begin{center}
-\includegraphics[width=0.7\linewidth]{figure/kl_fitting_plot.png}
-\end{center}
 
 The asymmetry of the KL has the following implications
 \begin{itemize}
-    \item The forward KL $D_{KL}(p\|q_{\bm{\phi}}) = \E_{\xv \sim p} \log\left(\frac{p(\xv)}{q_{\bm{\phi}}(\xv)}\right)$ is mass-covering since $p(\xv)\log\left(\frac{p(\xv)}{q_{\bm{\phi}}(\xv)}\right) \approx 0$ if $p(\xv) \approx 0$ (as long as both distribution do not extremely differ)
-    \item The reverse KL $D_{KL}(q_{\bm{\phi}}\|p) = \E_{\xv \sim q_{\bm{\phi}}} \log\left(\frac{q_{\bm{\phi}}(\xv)}{p(\xv)}\right)$ is mode-seeking / zero-avoiding since $q_{\bm{\phi}}(\xv)\log\left(\frac{q_{\bm{\phi}}(\xv)}{p(\xv)}\right) \gg 0$ if $p(\xv) \approx 0$ and $q_{\bm{\phi}}(\xv) > 0$
+    \item Forward KL $D_{KL}(p\|q_{\bm{\phi}}) = \E_{\xv \sim p} \log\left(\frac{p(\xv)}{q_{\bm{\phi}}(\xv)}\right)$ is mass-covering since $p(\xv)\log\left(\frac{p(\xv)}{q_{\bm{\phi}}(\xv)}\right) \approx 0$ if $p(\xv) \approx 0$ and $q_{\bm{\phi}}(\xv) \not\gg p(\xv).$
+    \item Reverse KL $D_{KL}(q_{\bm{\phi}}\|p) = \E_{\xv \sim q_{\bm{\phi}}} \log\left(\frac{q_{\bm{\phi}}(\xv)}{p(\xv)}\right)$ is mode-seeking (zero-avoiding) since $q_{\bm{\phi}}(\xv)\log\left(\frac{q_{\bm{\phi}}(\xv)}{p(\xv)}\right) \gg 0$ if $p(\xv) \approx 0$ and $q_{\bm{\phi}}(\xv) > 0$
 \end{itemize}
-    
+    \begin{center}
+\includegraphics[width=0.7\linewidth]{figure/kl_fitting_plot.png}
+\end{center}
+\small Figure: Optimal $q_{\bm{\phi}}$ when $q_{\bm{\phi}}$ is restricted to be Gaussian.
 \end{vbframe}
 \endlecture
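Aside (illustration only, not part of the patch): the reworked KL slide above contrasts the mass-covering forward KL with the mode-seeking reverse KL and points to figure/kl_fitting_plot.png. Below is a minimal numerical sketch of that asymmetry: it fits a single Gaussian q to a bimodal Gaussian mixture p by brute-force grid search, once under each KL direction. It assumes NumPy and SciPy are available; the grid ranges and the helper names kl and best_gaussian are made up for this sketch and do not correspond to any code in the repository.

# Fit a single Gaussian q to a bimodal mixture p under forward vs. reverse KL.
# Both divergences are approximated by a Riemann sum on a fixed grid.
import numpy as np
from scipy.stats import norm

xs = np.linspace(-10.0, 10.0, 2001)
dx = xs[1] - xs[0]
# Target p: two well-separated modes (a setup like the kl_fitting_plot figure).
p = 0.5 * norm.pdf(xs, loc=-3.0, scale=1.0) + 0.5 * norm.pdf(xs, loc=3.0, scale=1.0)

def kl(a, b):
    """KL(a || b) on the grid; densities are clipped to avoid log(0)."""
    a, b = np.maximum(a, 1e-300), np.maximum(b, 1e-300)
    return float(np.sum(a * np.log(a / b)) * dx)

def best_gaussian(direction):
    """Brute-force search over (mu, sigma) for the Gaussian q minimizing the chosen KL."""
    best, best_val = None, np.inf
    for mu in np.linspace(-5.0, 5.0, 101):
        for sigma in np.linspace(0.5, 5.0, 46):
            q = norm.pdf(xs, loc=mu, scale=sigma)
            val = kl(p, q) if direction == "forward" else kl(q, p)
            if val < best_val:
                best, best_val = (mu, sigma), val
    return best

print("forward KL:", best_gaussian("forward"))  # mass-covering: mu ~ 0, sigma ~ 3.2
print("reverse KL:", best_gaussian("reverse"))  # mode-seeking: mu ~ +-3, sigma ~ 1

The forward fit spreads q over both modes (roughly moment matching), while the reverse fit collapses onto a single mode, consistent with the figure caption added in the patch.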
diff --git a/slides/regularization/slides-regu-ridge-deepdive.tex b/slides/regularization/slides-regu-ridge-deepdive.tex
index 56a32248..15548ef8 100644
--- a/slides/regularization/slides-regu-ridge-deepdive.tex
+++ b/slides/regularization/slides-regu-ridge-deepdive.tex
@@ -109,7 +109,7 @@
 Comparing MSEs of $\thetah_{\text{Ridge}}$ and $\thetah_{\text{OLS}}$ and using $\text{Bias}(\thetah_{\text{OLS}})=0$ we find
 $$\text{MSE}(\thetah_{\text{OLS}})-\text{MSE}(\thetah_{\text{Ridge}}) = \underbrace{\text{trace}\big({\text{Var}(\thetah_{\text{OLS}})}-{\text{Var}(\thetah_{\text{Ridge}})}\big)}_{>0} - \underbrace{\Vert \text{Bias}(\thetah_{\text{Ridge}})\Vert_2^2}_{>0}$$
-Since both terms are positive, their difference is \textit{a priori} undetermined. \citebutton{Theobald, 1973}{https://www.jstor.org/stable/2984775} and \citebutton{Farebrother, 1976}{https://www.jstor.org/stable/2984971} prove there always exists some $\lambda^{\ast}>0$ so that
+Since both terms are positive, the sign of their difference is \textit{a priori} undetermined. \citebutton{Theobald, 1973}{https://www.jstor.org/stable/2984775} and \citebutton{Farebrother, 1976}{https://www.jstor.org/stable/2984971} prove there always exists some $\lambda^{\ast}>0$ so that
 $$\text{MSE}(\thetah_{\text{OLS}})-\text{MSE}(\thetah_{\text{Ridge}})>0$$
 Important theoretical result: While Gauss-Markov guarantees $\thetah_{\text{OLS}}$ is the best linear unbiased estimator (BLUE), there are biased estimators with lower MSE.
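Aside (illustration only, not part of the patch): the Theobald (1973) / Farebrother (1976) existence result cited in the ridge hunk can be checked numerically with the MSE decomposition shown on the slide. The sketch below evaluates trace(Var) + ||Bias||^2 of the ridge estimator in closed form over a grid of lambda values for a simulated design; n, p, sigma2, theta, the lambda grid, and the helper mse_ridge are arbitrary demo choices, not repository code.

# Closed-form MSE(lambda) = trace(Var) + ||Bias||^2 of the ridge estimator for a
# fixed design; lambda = 0 recovers the OLS estimator.
import numpy as np

rng = np.random.default_rng(0)
n, p, sigma2 = 50, 5, 4.0
X = rng.normal(size=(n, p))
theta = rng.normal(size=p)
XtX = X.T @ X
I = np.eye(p)

def mse_ridge(lam):
    A = np.linalg.inv(XtX + lam * I)   # (X'X + lambda I)^{-1}
    S = A @ XtX                        # shrinkage matrix; S = I for lambda = 0
    var = sigma2 * S @ A               # Var(theta_hat) = sigma^2 A X'X A
    bias = (S - I) @ theta             # Bias(theta_hat) = (S - I) theta
    return float(np.trace(var) + bias @ bias)

lams = np.linspace(0.0, 20.0, 401)
mses = np.array([mse_ridge(l) for l in lams])
print("MSE(OLS)      :", mse_ridge(0.0))
print("best ridge MSE:", mses.min(), "at lambda =", lams[mses.argmin()])  # smaller, as the slide claims

Using the exact bias/variance expressions rather than a Monte Carlo estimate keeps the check deterministic and mirrors the decomposition on the slide.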