Updates from Overleaf
ludwigbothmann committed Nov 29, 2023
1 parent 94f7988 commit 5841b10
Showing 5 changed files with 593 additions and 568 deletions.
5 changes: 4 additions & 1 deletion slides/information-theory/chapter-order.tex
@@ -25,7 +25,10 @@ \subsection{Cross-Entropy, KL and Source Coding}
\subsection{Information Theory for Machine Learning}
\includepdf[pages=-]{../slides-pdf/slides-info-ml.pdf}

\subsection{Joint Entropy and Mutual Information}
\subsection{Joint Entropy and Mutual Information I}
\includepdf[pages=-]{../slides-pdf/slides-info-mutual-info.pdf}

\subsection{Joint Entropy and Mutual Information II}
\includepdf[pages=-]{../slides-pdf/slides-info-mutual-info.pdf}

\subsection{Entropy and Optimal Code Length}
21 changes: 11 additions & 10 deletions slides/information-theory/slides-info-cross-entropy-kld.tex
@@ -21,28 +21,29 @@
\begin{vbframe} {Cross-Entropy - Discrete Case}

\textbf{Cross-entropy} measures the average amount of information required to represent an event from one distribution $p$ using a predictive scheme based on another distribution $q$ (assume they have the same domain $\Xspace$ as in KL).
$$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) = - \mathbb{E}_{X\sim p}[\log(q(X))]$$

$$ H(p \| q) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) = - \mathbb{E}_{X\sim p}[\log(q(X))]$$

For now, we accept the formula as-is. More on the underlying intuition follows in the content on information theory for ML and source coding.
\begin{itemize}
\setlength{\itemsep}{1.2em}
\setlength{\itemsep}{0.9em}
\item Entropy = Avg. amount of information if we optimally encode $p$
\item Cross-Entropy = Avg. amount of information if we suboptimally encode $p$ with $q$
\item $D_{KL}(p \| q)$: Difference between the two
\item $H(p \| q)$ sometimes also denoted as $H_{q}(p)$ to set it apart from KL
\end{itemize}

\lz
\framebreak

We can also summarize this through the following identity:
\lz
$$
H_q(p) = H(p) + D_{KL}(p \| q)
H(p \| q) = H(p) + D_{KL}(p \| q)
$$
This is because:
\begin{eqnarray*}
H(p) + D_{KL}(p \| q) &=& - \sum_{x \in \Xspace} p(x) \log p(x) + \sum_{x \in \Xspace} p(x) \log \frac{p(x)}{q(x)} \\
&=& \sum_{x \in \Xspace} p(x) (-\log p(x) + \log p(x) - \log q(x)) \\
&=& - \sum_{x \in \Xspace} p(x) \log q(x) = H_q(p) \\
&=& - \sum_{x \in \Xspace} p(x) \log q(x) = H(p \| q) \\
\end{eqnarray*}
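As a quick numerical sanity check of this identity, here is a minimal Python sketch (made-up distributions p and q, not from the slides):

import numpy as np

p = np.array([0.5, 0.3, 0.2])   # hypothetical "true" distribution
q = np.array([0.2, 0.5, 0.3])   # hypothetical coding/model distribution

entropy_p     = -np.sum(p * np.log2(p))        # H(p)
cross_entropy = -np.sum(p * np.log2(q))        # H(p || q)
kl            =  np.sum(p * np.log2(p / q))    # D_KL(p || q)

print(cross_entropy)                 # ~1.81 bits
print(entropy_p + kl)                # same value
assert np.isclose(cross_entropy, entropy_p + kl)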

\framebreak
@@ -52,11 +53,11 @@

For continuous density functions $p(x)$ and $q(x)$:

$$ H_q(p) = \int p(x) \log\left(\frac{1}{q(x)}\right) dx = - \int p(x) \log\left(q(x)\right) dx = - \mathbb{E}_{X \sim p}[\log(q(X))]$$
$$ H(p \| q) = \int p(x) \log\left(\frac{1}{q(x)}\right) dx = - \int p(x) \log\left(q(x)\right) dx = - \mathbb{E}_{X \sim p}[\log(q(X))]$$

\begin{itemize}
\item It is not symmetric.
\item As for the discrete case, $H_q(p) = h(p) + D_{KL}(p \| q)$ holds.
\item As for the discrete case, $H(p \| q) = h(p) + D_{KL}(p \| q)$ holds.
\item Can now become negative, as $h(p)$ can be negative!
\end{itemize}
\end{vbframe}
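A short numerical sketch (assuming narrow Gaussian densities; the values are illustrative, not from the slides) of how the cross-entropy can indeed become negative in the continuous case:

import numpy as np

mu_p, sd_p = 0.0, 0.1            # hypothetical narrow Gaussian p = N(0, 0.1^2)
mu_q, sd_q = 0.05, 0.1           # hypothetical Gaussian q = N(0.05, 0.1^2)

x = np.linspace(-2.0, 2.0, 200001)
dx = x[1] - x[0]
p = np.exp(-0.5 * ((x - mu_p) / sd_p) ** 2) / (sd_p * np.sqrt(2 * np.pi))
log_q = -0.5 * ((x - mu_q) / sd_q) ** 2 - np.log(sd_q * np.sqrt(2 * np.pi))

# H(p || q) = -int p(x) log q(x) dx, here via a simple Riemann sum (in nats)
cross_entropy = -np.sum(p * log_q) * dx
print(cross_entropy)             # ~ -0.76, i.e. negative, since h(p) is already very negative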
@@ -103,10 +104,10 @@
% $$ H_p(q) = \sum_{x \in \Xspace} q(x) \log_2\left(\frac{1}{p(x)}\right) = - \sum_{x \in \Xspace} q(x) log_2(p(x))$$
% \item For probability densities $p(x)$ and $q(x)$, it is:
% $$ H_p(q) = \int_{\Xspace} q(x) \ln\left(\frac{1}{p(x)}\right) dx = - \int_{\Xspace} q(x) \ln\left(p(x)\right) dx $$
% \item It is not symmetric: $ H_p(q) \neq H_q(p)$.
% \item It is not symmetric: $ H_p(q) \neq H(p \| q)$.
% \item Relationship to KL divergence:
% \begin{align*}
% H_q(p) &= H(p) + D_{KL}(p \| q) \\
% H(p \| q) &= H(p) + D_{KL}(p \| q) \\
% H_p(q) &= H(q) + D_{KL}(q \| p)
% \end{align*}
% \item It is non-negative. If the two distributions are the same, cross-entropy equals entropy and KL divergence is zero.
33 changes: 14 additions & 19 deletions slides/information-theory/slides-info-ml.tex
@@ -23,27 +23,26 @@
\begin{vbframe}{KL vs Maximum Likelihood}
Minimizing the KL divergence between the true distribution $p(x)$ and an approximating model $q(x|\thetab)$ is equivalent to maximizing the log-likelihood.
\begin{align*}
D_{KL}(p \| q_{\thetab})) &= \E_{x \sim p} \left[ \log \frac{p(x)}{q(x|\thetab)}\right] \\
&= \E_{x \sim p} \log p(x) - \E_{x \sim p} \log q(x|\thetab)
D_{KL}(p \| q_{\thetab}) &= \E_{X \sim p} \left[ \log \frac{p(x)}{q(x|\thetab)}\right] \\
&= \E_{X \sim p} \log p(x) - \E_{X \sim p} \log q(x|\thetab)
\end{align*}
The first term above does not depend on $\thetab$. Therefore,
The first term above does not depend on $\thetab$; the second term, $-\E_{X \sim p} \log q(x|\thetab)$, is exactly the cross-entropy $H(p \| q_{\thetab})$. Therefore,
\begin{align*}
\argmin_{\thetab} D_{KL}(p \| q_{\thetab}) &= \argmin_{\thetab} -\E_{x \sim p} \log q(x|\thetab)\\
&= \argmax_{\thetab} \E_{x \sim p} \log q(x|\thetab)
\argmin_{\thetab} D_{KL}(p \| q_{\thetab}) &= \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab)\\
&= \argmax_{\thetab} \E_{X \sim p} \log q(x|\thetab)
\end{align*}
For a finite dataset of $n$ samples from $p$, this is approximated as
$$\argmax_{\thetab} \E_{x \sim p} \log q(x|\thetab) \approx \argmax_{\thetab} \frac{1}{n} \sumin \log q(\xi|\thetab)\,.$$
$$\argmax_{\thetab} \E_{X \sim p} \log q(x|\thetab) \approx \argmax_{\thetab} \frac{1}{n} \sumin \log q(\xi|\thetab)\,.$$

% This demonstrates that density estimation and optimal coding are closely related. If the estimated distribution is different from the true one, any code based on the estimated distribution will necessarily be suboptimal (in terms of the expected length of \enquote{messages} from the true distribution).
\end{vbframe}
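A small simulation sketch (assuming the true distribution p = N(0, 1) and a Gaussian location family q(x | theta) = N(theta, 1); all names and values are illustrative, not from the slides) showing that the KL-minimizing and the likelihood-maximizing parameter coincide:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(loc=0.0, scale=1.0, size=100_000)   # sample from the true p = N(0, 1)

thetas = np.linspace(-1.0, 1.0, 201)               # candidate means for q(x | theta) = N(theta, 1)

# empirical average log-likelihood: Monte Carlo estimate of E_{X~p}[log q(X | theta)]
avg_loglik = np.array([np.mean(-0.5 * (x - t) ** 2 - 0.5 * np.log(2 * np.pi))
                       for t in thetas])

# closed-form KL for this family: D_KL(N(0,1) || N(theta,1)) = theta^2 / 2
kl = thetas ** 2 / 2

print(thetas[np.argmax(avg_loglik)])   # ~0: the (approximate) maximum-likelihood mean
print(thetas[np.argmin(kl)])           # 0: the KL-minimizing mean, the same parameter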

\begin{vbframe}{KL vs Cross-Entropy}
From this we can actually see much more:
$$ \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) = \argmin_{\thetab} -\E_{x \sim p} \log q(x|\thetab) = \argmin_{\thetab} H_{q_{\thetab}}(p) $$
$$ \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) = \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab) = \argmin_{\thetab} H(p \| q_{\thetab}) $$
\begin{itemize}
\item So minimizing with respect to KL is the same as minimizing with respect to cross-entropy!
\item That implies minimizing with respect to cross-entropy is the same as maximum likelihood!
\item Remember, how we only characterized cross-entropy through source coding / bits? We could now motivate cross-entropy as the "relevant" term that you have to minimize, when you minimize KL - after you drop $\E_p \log p(x)$, which is simply the neg. entropy H(p)!
\item So minimizing w.r.t. KL is the same as minimizing w.r.t. cross-entropy, which implies that minimizing w.r.t. cross-entropy is the same as maximum likelihood!
\item Remember how we characterized cross-entropy only through bits? We can now motivate cross-entropy as the "relevant" term that you have to minimize when you minimize KL, after you drop $\E_p \log p(x)$, which is simply the negative entropy $-H(p)$!
\item Or we could say: Cross-entropy between $p$ and $q$ is simply the expected negative log-likelihood of $q$, when our data comes from $p$!
\end{itemize}
\end{vbframe}
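Continuing the illustrative Gaussian sketch from above (still an assumption-laden toy family, not from the slides), the constant offset between KL and cross-entropy can be made explicit; both curves are minimized at the same theta:

import numpy as np

thetas = np.linspace(-1.0, 1.0, 201)     # q(x | theta) = N(theta, 1), true p = N(0, 1)

h_p = 0.5 * np.log(2 * np.pi * np.e)     # differential entropy of p = N(0, 1), in nats
kl  = thetas ** 2 / 2                    # D_KL(p || q_theta) for this family
ce  = h_p + kl                           # H(p || q_theta) = h(p) + D_KL(p || q_theta)

# the two curves differ only by the theta-independent constant h(p),
# so their minimizers coincide
print(thetas[np.argmin(kl)], thetas[np.argmin(ce)])   # both 0.0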
@@ -67,14 +66,14 @@


To train the model, we minimize KL between $d^{(i)}$ and $\pi(\xv^{(i)}|\thetab)$:
$$ \argmin_{\thetab} \sum_{i=1}^n D_{KL} (d^{(i)} \| \pi(\xv^{(i)}|\thetab)) = \argmin_{\thetab} \sum_{i=1}^n H_{\pi(\xv^{(i)}|\thetab)}(d^{(i)}) $$
$$ \argmin_{\thetab} \sum_{i=1}^n D_{KL} (d^{(i)} \| \pi(\xv^{(i)}|\thetab)) = \argmin_{\thetab} \sum_{i=1}^n H(d^{(i)} \| \pi(\xv^{(i)}|\thetab)) $$
% where the entropy $H(d^{(i)})$ was dropped because it is not a function of $\thetab$.

We see that this is equivalent to log-loss risk minimization!
\begin{footnotesize}
\begin{equation*}
\begin{split}
R &= \sumin H_{\pi_k(\xv^{(i)}|\thetab)}(d^{(i)}) \\
R &= \sumin H(d^{(i)} \| \pi_k(\xv^{(i)}|\thetab)) \\
&= \sumin \left( - \sum_k d^{(i)}_k \log\pi_k(\xv^{(i)}|\thetab) \right) \\
& = \sumin \underbrace{ \left( -\sum_{k = 1}^g [\yi = k]\log \pi_{k}(\xv^{(i)}|\thetab) \right) }_{\text{log loss}} \\
& = \sumin (-\log\pi_{y^{(i)}}(\xv^{(i)}|\thetab))
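A tiny numerical check (hypothetical predicted probabilities and labels, not from the slides) that summing the cross-entropies between the one-hot d^(i) and the predicted class probabilities gives exactly the summed log loss:

import numpy as np

# hypothetical predicted class probabilities pi(x^(i) | theta) for n = 3 samples, g = 3 classes
pi = np.array([[0.7, 0.2, 0.1],
               [0.1, 0.6, 0.3],
               [0.2, 0.3, 0.5]])
y = np.array([0, 1, 2])                  # observed classes (0-indexed)
d = np.eye(3)[y]                         # one-hot label distributions d^(i)

risk_cross_entropy = np.sum(-np.sum(d * np.log(pi), axis=1))    # sum_i H(d^(i) || pi(x^(i)))
risk_log_loss      = np.sum(-np.log(pi[np.arange(3), y]))       # sum_i -log pi_{y^(i)}(x^(i))

assert np.isclose(risk_cross_entropy, risk_log_loss)
print(risk_cross_entropy)    # both equal -(log 0.7 + log 0.6 + log 0.5) ~ 1.56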
@@ -106,7 +105,7 @@

\lz

If $p$ represents a $\text{Ber}(y)$ distribution (so deterministic, where the true label receives probability mass 1) and we also interpret $\pix$ as a Bernoulli distribution $\text{Ber}(\pix)$, the Bernoulli loss $L(y,\pix)$ is the cross-entropy $H_{\pix}(p)$.
If $p$ represents a $\text{Ber}(y)$ distribution (so deterministic, where the true label receives probability mass 1) and we also interpret $\pix$ as a Bernoulli distribution $\text{Ber}(\pix)$, the Bernoulli loss $L(y,\pix)$ is the cross-entropy $H(p \| \pix)$.
% \item If $\hat{y}$ is a Bernoulli random variable with distribution defined by $\pi(x)$, $L(y,\pix)$ is the cross-entropy $H_{\hat{y}}(y)$.
% \item For a given training set with $n$ samples, the cost function is computed by taking the average of all the cross-entropies in the sample
% $$-\frac{1}{n} \sum_{i=1}^{n}\left[\yi \log \pi(\xi)+\left(1-\yi\right) \log \left(1-\pi(\xi)\right)\right].$$
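A minimal sketch of the statement above (hypothetical label y and prediction pi(x), not from the slides), confirming that the Bernoulli loss and the cross-entropy agree:

import numpy as np

y, pi_x = 1, 0.8                   # hypothetical true label and predicted probability

# Bernoulli (binary) log loss
bernoulli_loss = -(y * np.log(pi_x) + (1 - y) * np.log(1 - pi_x))

# cross-entropy H(p || Ber(pi_x)) with p = Ber(y) putting all mass on the true label
p = np.array([1 - y, y])           # degenerate distribution over {0, 1}
q = np.array([1 - pi_x, pi_x])     # predicted Bernoulli distribution
cross_entropy = -np.sum(p * np.log(q))

assert np.isclose(bernoulli_loss, cross_entropy)   # both ~0.223 for these values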
@@ -118,13 +117,9 @@

What is the (average) risk of that minimal constant model?

\framebreak

\begin{align*}
\risk &= \frac{1}{n} \sumin \left( -\sumkg [\yi = k]\log \pik \right) \\
&= - \frac{1}{n} \sumkg \sumin [\yi = k]\log \pik \\
&= -\sumkg \frac{n_k}{n}\log \pik \\
&= -\sumkg \pi_k \log \pik = H(\pi)
\risk &= \frac{1}{n} \sumin \left( -\sumkg [\yi = k]\log \pik \right) = - \frac{1}{n} \sumkg \sumin [\yi = k]\log \pik \\
&= -\sumkg \frac{n_k}{n}\log \pik = -\sumkg \pi_k \log \pik = H(\pi)
\end{align*}

So entropy is the (average) risk of the optimal "observed class frequency" model under log-loss!
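A brief numerical confirmation (hypothetical labels, not from the slides) that the average log-loss risk of the observed-class-frequency model equals the entropy H(pi):

import numpy as np

y = np.array([0, 0, 0, 0, 1, 1, 1, 2, 2, 2])     # hypothetical labels, n = 10, g = 3
n, g = len(y), 3

pi = np.bincount(y, minlength=g) / n             # constant model: observed class frequencies

risk = np.mean([-np.log(pi[k]) for k in y])      # average log-loss risk of the constant model
entropy = -np.sum(pi * np.log(pi))               # H(pi)

assert np.isclose(risk, entropy)
print(risk, entropy)    # identical: the constant model's average risk is H(pi)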
