From 3b0c180905136f64cb748de2ec7a3798e7fd438c Mon Sep 17 00:00:00 2001
From: ludwigbothmann <46222472+ludwigbothmann@users.noreply.github.com>
Date: Mon, 22 Jan 2024 23:30:56 +0100
Subject: [PATCH] inf theory part2

---
 slides/information-theory/slides-info-ml.tex  | 20 +++---
 .../slides-info-mutual-info.tex               | 19 +++---
 .../slides-info-mutual-info2.tex              | 65 ++++++++++---------
 3 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/slides/information-theory/slides-info-ml.tex b/slides/information-theory/slides-info-ml.tex
index 9e0173dc..168c5e23 100644
--- a/slides/information-theory/slides-info-ml.tex
+++ b/slides/information-theory/slides-info-ml.tex
@@ -5,9 +5,9 @@
 \newcommand{\titlefigure}{figure_man/multinoulli.png}
 \newcommand{\learninggoals}{
-  \item Minimizing KL is equivalent to maximizing the log-likelihod
-  \item Minimizing KL is equivalent to minimizinig cross-entropy
-  \item Minimizing cross-entropy between modeled and observed probabilities is equivalent to log-loss minimization
+  \item Minimizing KL =\\ maximizing log-likelihood
+  \item Minimizing KL =\\ minimizing cross-entropy
+  \item Minimizing CE between modeled and observed probabilities =\\ log-loss minimization
 }

 \title{Introduction to Machine Learning}
@@ -26,7 +26,9 @@
 	D_{KL}(p \| q_{\thetab}) &= \E_{X \sim p} \left[ \log \frac{p(x)}{q(x|\thetab)}\right] \\
 	&= \E_{X \sim p} \log p(x) - \E_{X \sim p} \log q(x|\thetab)
 \end{align*}
-The first term above does not depend on $\thetab$ and the second term we could also as a def for CE! Therefore,
+The first term above does not depend on $\thetab$.
+%and the second term we could also use as a def for CE!
+Therefore,
 \begin{align*}
 \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) &= \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab)\\
 &= \argmax_{\thetab} \E_{X \sim p} \log q(x|\thetab)
@@ -34,16 +36,18 @@
 For a finite dataset of $n$ samples from $p$, this is approximated as
 $$\argmax_{\thetab} \E_{X \sim p} \log q(x|\thetab) \approx \argmax_{\thetab} \frac{1}{n} \sumin \log q(\xi|\thetab)\,.$$

+This also directly implies an equivalence to risk minimization!
+
 %  This demonstrates that density estimation and optimal coding are closely related. If the estimated distribution is different from the true one, any code based on the estimated distribution will necessarily be suboptimal (in terms of the expected length of \enquote{messages} from the true distribution).
 \end{vbframe}

 \begin{vbframe}{KL vs Cross-Entropy}
-From this here we can actually see much more:
+From this we can see much more:
 $$ \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) = \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab) = \argmin_{\thetab} H(p \| q_{\thetab}) $$
 \begin{itemize}
-  \item So minimizing w.r.t. KL is the same as minimizing w.r.t. cross-entropy, which implies minimizing w.r.t. cross-entropy is the same as maximum likelihood!
-  \item We could now motivate cross-entropy as the "relevant" term that you have to minimize, when you minimize KL - after you drop $\E_p \log p(x)$, which is simply the neg. entropy H(p)!
-  \item Or we could say: Cross-entropy between $p$ and $q$ is simply the expected negative log-likelihood of $q$, when our data comes from $p$!
+  \item So minimizing KL is the same as minimizing CE, which is the same as maximum likelihood!
+  \item We could now motivate CE as the \enquote{relevant} term that you have to minimize when you minimize KL - after you drop $\E_p \log p(x)$, which is simply the negative entropy $-H(p)$!
+  \item Or we could say: CE between $p$ and $q$ is simply the expected negative log-likelihood of $q$, when our data comes from $p$!
 \end{itemize}
 \end{vbframe}

diff --git a/slides/information-theory/slides-info-mutual-info.tex b/slides/information-theory/slides-info-mutual-info.tex
index bc78df2b..13f046b6 100644
--- a/slides/information-theory/slides-info-mutual-info.tex
+++ b/slides/information-theory/slides-info-mutual-info.tex
@@ -21,7 +21,7 @@

 \begin{vbframe}{Joint entropy}
 \begin{itemize}
-  \item The \textbf{joint entropy} of two discrete random variables $X$ and $Y$ with a joint distribution $p(x, y)$ is:
+  \item Recap: The \textbf{joint entropy} of two discrete RVs $X$ and $Y$ with joint pmf $p(x, y)$ is:
 $$ H(X,Y) = -\sum_{x \in \Xspace} \sum_{y \in \Yspace} p(x,y) \log(p(x,y)),$$
 which can also be expressed as
 $$ H(X,Y) = -\E \left[ \log(p(X,Y)) \right].$$
 % where $I(x,y)$ is the self-information of $(x,y)$.
@@ -33,7 +33,7 @@
 %   \begin{footnotesize}
 %     $$ H(X_1, X_2, \ldots, X_n) = - \sum_{x_1 \in \Xspace_1} \ldots \sum_{x_n \in \Xspace_n} p(x_1,x_2, \ldots, x_n) \log_2(p(x_1,x_2, \ldots, x_n)) $$
 %   \end{footnotesize}
-  \item For continuous random variables $X$ and $Y$ with joint density $p(x,y)$, the differential joint entropy is:\\
+  \item For continuous RVs $X$ and $Y$ with joint density $p(x,y)$, the differential joint entropy is:\\
 $$ h(X,Y) = - \int_{\Xspace \times \Yspace} p(x,y) \log p(x,y) dx dy$$
 \end{itemize}

@@ -73,7 +73,7 @@

 \begin{vbframe} {Chain rule for entropy}

-The \textbf{chain rule for entropy} is analogous to the chain rule for probability and, in fact, derives directly from it.
+The \textbf{chain rule for entropy} is analogous to the chain rule for probability and derives directly from it.
 $$H(X, Y)=H(X)+H(Y | X)$$
 \footnotesize
 \textbf{Proof:}
@@ -91,7 +91,7 @@

 \lz

-n-Variable version:
+$n$-variable version:

 $$H\left(X_{1}, X_{2}, \ldots, X_{n}\right)=\sumin H\left(X_{i} | X_{i-1}, \ldots, X_{1}\right).$$

@@ -157,7 +157,7 @@
 \begin{aligned}
 H(X, X) &= H(X) \\
 H(X | X) &= 0 \\
-H(X, Y | Z) &=H(X | Z)+H(Y | X, Z)\\
+H( (X, Y) | Z) &=H(X | Z)+H(Y | (X, Z))\\
 \end{aligned}
 \end{equation*}

@@ -181,8 +181,8 @@
 % \normalsize

 \begin{itemize}
-\item The MI describes the amount of information about one random variable obtained through the other one or how different the joint distribution is from pure independence.
-\item Consider two random variables $X$ and $Y$ with a joint probability mass function $p(x, y)$ and marginal probability mass functions $p(x)$ and $p(y)$. The MI $I (X;Y)$ is the Kullback-Leibler Divergence between the joint distribution and the product distribution $p(x)p(y)$:
+\item The MI describes the amount of info about one RV obtained through another RV or how different their joint distribution is from pure independence.
+\item Consider two RVs $X$ and $Y$ with a joint pmf $p(x, y)$ and marginal pmfs $p(x)$ and $p(y)$. The MI $I (X;Y)$ is the Kullback-Leibler Divergence between the joint distribution and the product distribution $p(x)p(y)$:
 \footnotesize
 \begin{equation*}\begin{aligned}
 I(X ; Y) &=\sum_{x \in \Xspace} \sum_{y \in \Yspace} p(x, y) \log \frac{p(x, y)}{p(x) p(y)} \\
@@ -216,8 +216,7 @@
 &=H(X)-H(X | Y).
 \end{aligned}\end{equation*}

-Thus, mutual information $I(X;Y)$ is the reduction in the uncertainty
-of $X$ due to the knowledge of $Y$.
+So, $I(X;Y)$ is the reduction in uncertainty of $X$ due to knowledge of $Y$.
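
A quick numerical sanity check of the identities above (the chain rule $H(X,Y)=H(X)+H(Y|X)$ and $I(X;Y)=H(X)-H(X|Y)=D_{KL}(p(x,y)\,\|\,p(x)p(y))$), written as a minimal Python sketch. The toy joint pmf p_xy, the helper H, and the NumPy dependency are illustrative assumptions, not material from the slides or this patch.

import numpy as np

# Toy joint pmf of (X, Y); rows index values of X, columns index values of Y.
# Any strictly positive joint pmf works for this check (illustrative choice).
p_xy = np.array([[0.30, 0.10],
                 [0.05, 0.55]])

def H(p):
    """Shannon entropy in bits of a pmf given as an array of probabilities."""
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

p_x = p_xy.sum(axis=1)   # marginal pmf of X
p_y = p_xy.sum(axis=0)   # marginal pmf of Y

# Conditional entropies computed directly from the conditional pmfs
H_y_given_x = sum(p_x[i] * H(p_xy[i, :] / p_x[i]) for i in range(len(p_x)))
H_x_given_y = sum(p_y[j] * H(p_xy[:, j] / p_y[j]) for j in range(len(p_y)))

# Chain rule: H(X,Y) = H(X) + H(Y|X)
print(H(p_xy.ravel()), H(p_x) + H_y_given_x)

# MI: KL(p(x,y) || p(x)p(y)) vs. H(X) - H(X|Y)
mi_kl = np.sum(p_xy * np.log2(p_xy / np.outer(p_x, p_y)))
print(mi_kl, H(p_x) - H_x_given_y)

Both printed pairs agree, matching the chain rule and the two equivalent forms of MI derived above.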
 \end{vbframe}


@@ -264,7 +263,7 @@

 \lz

-The marginal distribution of $X$ is $(\frac{1}{2}, \frac{1}{4}, \frac{1}{8}, \frac{1}{8})$ and the marginal distribution of $Y$ is $(\frac{1}{4}, \frac{1}{4}, \frac{1}{4}, \frac{1}{4})$, and hence $H(X) = \frac{7}{4}$ bits and $H(Y) = 2$ bits.
+Marginal distribution of $X$ is $(\frac{1}{2}, \frac{1}{4}, \frac{1}{8}, \frac{1}{8})$ and marginal distribution of $Y$ is $(\frac{1}{4}, \frac{1}{4}, \frac{1}{4}, \frac{1}{4})$, and hence $H(X) = \frac{7}{4}$ bits and $H(Y) = 2$ bits.

 \framebreak

diff --git a/slides/information-theory/slides-info-mutual-info2.tex b/slides/information-theory/slides-info-mutual-info2.tex
index 11d9473d..67e7c631 100644
--- a/slides/information-theory/slides-info-mutual-info2.tex
+++ b/slides/information-theory/slides-info-mutual-info2.tex
@@ -38,10 +38,10 @@

 \textbf{Proof:}$\quad 0 \leq I(X ; Y)=H(X)-H(X | Y)$

-Intuitively, the theorem says that knowing another random variable $Y$ can only reduce the uncertainty in $X$. Note that this is true only on the average.
+Intuitively, the theorem says that knowing another random variable $Y$ can only reduce the uncertainty in $X$. Note that this is true only on average.

 \vspace{0.5cm}
-\textbf{Remark}: Because $H(X)\geq H(X|Y)$ and $H(X)$ is only bounded from below, $I(X ; Y)$ is unbounded from above (lives in all of $\mathbb{R}_{0}^{+}$)
+%\textbf{Remark}: Because $H(X)\geq H(X|Y)$ and $H(X)$ is only bounded from below, $I(X ; Y)$ is unbounded from above (lives in all of $\mathbb{R}_{0}^{+}$)

 \framebreak

@@ -104,28 +104,33 @@

 \framebreak

-\textbf{Independence bound on entropy:} Let $X_{1}, X_{2}, \ldots, X_{n}$ be drawn according to $p\left(x_{1}, x_{2}, \ldots, x_{n}\right) .$ Then
+\textbf{Independence bound on entropy:}
+
+%Let $X_{1}, X_{2}, \ldots, X_{n}$ be drawn according to $p\left(x_{1}, x_{2}, \ldots, x_{n}\right) .$ Then

 \footnotesize
 $$H\left(X_{1}, X_{2}, \ldots, X_{n}\right) \leq \sum_{i=1}^{n} H\left(X_{i}\right),$$
 \normalsize
-with equality if and only if the $X_{i}$ are independent.\\
+This holds with equality if and only if the $X_{i}$ are jointly independent.

 \lz

-\textbf{Proof:} With the chain rule for entropies,
+\textbf{Proof:} With the chain rule and \enquote{conditioning reduces entropy},

 \footnotesize
 \begin{equation*}
 \begin{aligned}
 H\left(X_{1}, X_{2}, \ldots, X_{n}\right) &=\sum_{i=1}^{n} H\left(X_{i} | X_{i-1}, \ldots, X_{1}\right)
-&\leq \sum_{i=1}^{n} H\left(X_{i}\right),
+&\leq \sum_{i=1}^{n} H\left(X_{i}\right)
 \end{aligned}
 \end{equation*}
 \normalsize
-where the inequality follows directly from above. We have equality if and only if $X_{i}$ is independent of $X_{i-1}, \ldots, X_{1}$ for all $i$ (i.e., if and only if the $X_{i}$ 's are independent).
+
+Equality holds iff $X_i$ is independent of
+$X_{i-1},\ldots, X_1$ for all $i$,
+i.e., iff all $X_i$ are jointly independent.

 \end{vbframe}

@@ -175,19 +180,20 @@
 \begin{itemize}
 % \item Intuitively, mutual information quantifies the amount of shared information between variables.
   \item MI is a measure of the amount of "dependence" between variables. It is zero if and only if the variables are independent.
-  \item On the other hand, if one of the variables is a deterministic function of the other, the mutual information is maximal, i.e. entropy of the first.
-  \item Unlike (Pearson) correlation, mutual information is not limited to real-valued random variables.
-  \item Mutual information can be used to perform \textbf{feature selection}. Quite simply, each variable $X_i$ is rated according to $I(X_i;Y)$, this is sometimes called information gain.
-  \item The same principle can also be used in decision trees to select a feature to split on. Splitting on MI/IG is then equivalent to risk reduction with log-loss.
-  \item MI is invariant w.r.t. injective and continuously differentiable reparametrizations
+  \item OTOH, if one RV is a deterministic function of the other, MI is maximal, i.e., equal to the entropy of that RV.
+  \item Unlike (Pearson) correlation, MI is not limited to real-valued RVs.
+  \item Can use MI as a \textbf{feature filter},
+  sometimes called information gain.
+  \item Can also be used in CART to select the feature to split on.\\ Splitting on MI/IG = risk reduction with log-loss.
+  \item MI invariant under injective and continuously differentiable reparametrizations.
 \end{itemize}
 \end{vbframe}

 \begin{vbframe} {Mutual information vs. correlation}
 \begin{itemize}
-  \item If two variables are independent, their correlation is 0.
-  \item However, the reverse is not necessarily true. It is possible for two dependent variables to have 0 correlation because correlation only measures linear dependence.
+  \item If two RVs are independent, their correlation is 0.
+  \item But: two dependent RVs can have correlation 0 because correlation only measures linear dependence.
 \end{itemize}

 \begin{center}
@@ -195,8 +201,9 @@
 \end{center}

 \begin{itemize}
-  \item The figure above shows various scatterplots where, in each case, the correlation is 0 even though the two variables are strongly dependent, and MI is large.
-  \item Mutual information can therefore be seen as a more general measure of dependence between variables than correlation.
+  \item Above: Many examples with strong dependence, nearly 0 correlation, and large MI.
+
+  \item MI can be seen as a more general measure of dependence than correlation.
 \end{itemize}

 \end{vbframe}

@@ -225,21 +232,21 @@
 For $\rho = \pm 1$, $X$ and $Y$ are perfectly correlated and $I(X;Y) \rightarrow \infty$.
 \end{vbframe}

-\begin{vbframe}{Estimation of MI}
+% \begin{vbframe}{Estimation of MI}

-\begin{itemize}
-  \item In practice, estimation of the mutual information $$I(X;Y) = H(X) + H(Y) - H(X,Y)$$ is usually based on the \textit{empirical information}, i.e., $$\hat{I}(X;Y) = \hat{H}(X) + \hat{H}(Y) - \hat{H}(X,Y)$$
-\end{itemize}
+% \begin{itemize}
+%   \item In practice, estimation of the mutual information $$I(X;Y) = H(X) + H(Y) - H(X,Y)$$ is usually based on the \textit{empirical information}, i.e., $$\hat{I}(X;Y) = \hat{H}(X) + \hat{H}(Y) - \hat{H}(X,Y)$$
+% \end{itemize}

-Here, we simply plug in the estimates of the empirical distribution $\hat{p}(x),\hat{p}(y),\hat{p}(x,y)$:
-{\small
-\begin{align*}
-  \hat{H}(X)&=-\sum_{x \in \Xspace} \hat{p}(x) \log_2 \hat{p}(x)\\
-  \hat{H}(Y)&=-\sum_{y \in \mathcal{Y}} \hat{p}(y) \log_2 \hat{p}(y)\\
-  \hat{H}(X,Y) &= - \sum_{x \in \Xspace} \sum_{y \in \Yspace} \hat{p}(x,y) \log_2(\hat{p}(x,y))\\
-\end{align*}
-}
-\end{vbframe}
+% Here, we simply plug in the estimates of the empirical distribution $\hat{p}(x),\hat{p}(y),\hat{p}(x,y)$:
+% {\small
+% \begin{align*}
+%   \hat{H}(X)&=-\sum_{x \in \Xspace} \hat{p}(x) \log_2 \hat{p}(x)\\
+%   \hat{H}(Y)&=-\sum_{y \in \mathcal{Y}} \hat{p}(y) \log_2 \hat{p}(y)\\
+%   \hat{H}(X,Y) &= - \sum_{x \in \Xspace} \sum_{y \in \Yspace} \hat{p}(x,y) \log_2(\hat{p}(x,y))\\
+% \end{align*}
+% }
+% \end{vbframe}


 % \begin{vbframe} {Chain rule for information}
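
To make the zero-correlation-but-dependent case from the scatterplot slide concrete, here is a minimal Python sketch under an assumed toy setup: X uniform on {-1, 0, 1} and Y = X^2 (the setup, variable names, and NumPy dependency are illustrative choices, not taken from the slides). The Pearson correlation comes out exactly 0, while I(X;Y) = H(Y), about 0.918 bits, because Y is a deterministic function of X.

import numpy as np

x_vals = np.array([-1.0, 0.0, 1.0])
y_vals = np.array([0.0, 1.0])

# Joint pmf p(x, y): X uniform on {-1, 0, 1}, all mass on y = x^2
p_xy = np.zeros((3, 2))
for i, x in enumerate(x_vals):
    p_xy[i, int(x**2)] = 1.0 / 3.0

p_x = p_xy.sum(axis=1)
p_y = p_xy.sum(axis=0)

def E(f):
    """Expectation of f(X, Y) under the joint pmf."""
    return sum(p_xy[i, j] * f(x_vals[i], y_vals[j])
               for i in range(3) for j in range(2))

# Pearson correlation computed from the joint pmf
cov = E(lambda x, y: x * y) - E(lambda x, y: x) * E(lambda x, y: y)
var_x = E(lambda x, y: x**2) - E(lambda x, y: x)**2
var_y = E(lambda x, y: y**2) - E(lambda x, y: y)**2
corr = cov / np.sqrt(var_x * var_y)

# Mutual information in bits, restricted to cells with positive probability
mask = p_xy > 0
mi = np.sum(p_xy[mask] * np.log2(p_xy[mask] / np.outer(p_x, p_y)[mask]))

print(corr)   # 0.0    -> no linear dependence
print(mi)     # ~0.918 bits = H(Y), since Y is a function of X

Replacing the exact pmf with empirical frequencies from a sample turns the same computation into the plug-in estimate $\hat{I}(X;Y) = \hat{H}(X) + \hat{H}(Y) - \hat{H}(X,Y)$ from the (now commented-out) estimation frame.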