
Commit

changes 6.12 post-meeting
ludwigbothmann committed Dec 6, 2023
1 parent 048c173 commit bfd2311
Showing 6 changed files with 13 additions and 12 deletions.
3 changes: 2 additions & 1 deletion slides/information-theory/chapter-order.tex
@@ -19,9 +19,10 @@ \subsection{Differential Entropy}
\subsection{Kullback-Leibler Divergence}
\includepdf[pages=-]{../slides-pdf/slides-info-kl.pdf}

\subsection{Cross-Entropy, KL and Source Coding}
\subsection{Cross-Entropy and KL}
\includepdf[pages=-]{../slides-pdf/slides-info-cross-entropy-kld.pdf}

% done up to here, Dec 23
\subsection{Information Theory for Machine Learning}
\includepdf[pages=-]{../slides-pdf/slides-info-ml.pdf}

@@ -23,7 +23,7 @@
\textbf{Cross-entropy} measures the average amount of information required to represent an event from one distribution $p$ using a predictive scheme based on another distribution $q$ (assume they have the same domain $\Xspace$ as in KL).
$$ H(p \| q) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) = - \mathbb{E}_{X\sim p}[\log(q(X))]$$

For now, we accept the formula as-is. More on the underlying intuition follows in the concent on inf. theory for ML and sourcecoding.
For now, we accept the formula as-is. More on the underlying intuition follows in the content on information theory for ML and source coding.
\begin{itemize}
\setlength{\itemsep}{0.9em}
\item Entropy = Avg. amount of information if we optimally encode $p$
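As a quick numeric illustration of the cross-entropy definition above, here is a minimal Python sketch; the two discrete distributions are toy values chosen for illustration, and log base 2 is used so all quantities are in bits:

    import numpy as np

    p = np.array([0.5, 0.25, 0.25])    # toy "true" distribution (made up)
    q = np.array([0.25, 0.5, 0.25])    # toy "model" distribution (made up)

    cross_entropy = -np.sum(p * np.log2(q))      # H(p || q)
    entropy       = -np.sum(p * np.log2(p))      # H(p)
    kl            =  np.sum(p * np.log2(p / q))  # D_KL(p || q)

    # H(p || q) = H(p) + D_KL(p || q): here 1.75 = 1.5 + 0.25
    print(cross_entropy, entropy, kl)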
4 changes: 2 additions & 2 deletions slides/information-theory/slides-info-kl.tex
@@ -220,8 +220,8 @@

If for $\xi$ we have $p(\xi)/q(\xi)>1$, then $p$ seems better; for $p(\xi)/q(\xi) < 1$, $q$ seems better. \\
\begin{itemize}
\item Let us assume that our data already come from p. It no longer makes sense to ask whether p or q fits the data better.
\item So we ask instead: "If we sample many data from $p$, how easily can we see that $p$ is better than $q$ through LR, on average?"
\item Now assume that the data is generated by $p$. We can also ask:
\item "How can we quantify how much better $p$ fits than $q$, on average?"
\end{itemize}
$$ \E_p \left[\log \frac{p(X)}{q(X)}\right] $$
That expected LLR is really KL!
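To see the "expected LLR" reading concretely, here is a minimal Monte-Carlo sketch in Python; the two Gaussians are toy choices, and the analytic KL between N(0,1) and N(1,1) is 0.5:

    import numpy as np

    rng = np.random.default_rng(0)

    def log_p(x):  # log density of N(0, 1)
        return -0.5 * x**2 - 0.5 * np.log(2 * np.pi)

    def log_q(x):  # log density of N(1, 1)
        return -0.5 * (x - 1)**2 - 0.5 * np.log(2 * np.pi)

    x = rng.normal(0.0, 1.0, size=100_000)  # sample from p
    llr = log_p(x) - log_q(x)               # log-likelihood ratio log p(x)/q(x)
    print(llr.mean())                       # approx. 0.5 = D_KL(N(0,1) || N(1,1))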
6 changes: 3 additions & 3 deletions slides/information-theory/slides-info-ml.tex
@@ -26,7 +26,7 @@
D_{KL}(p \| q_{\thetab}) &= \E_{X \sim p} \left[ \log \frac{p(x)}{q(x|\thetab)}\right] \\
&= \E_{X \sim p} \log p(x) - \E_{X \sim p} \log q(x|\thetab)
\end{align*}
The first term above does not depend on $\thetab$ and the second term can be defined as the cross-entropy. Therefore,
The first term above does not depend on $\thetab$, and the second term we could also use as a definition of CE! Therefore,
\begin{align*}
\argmin_{\thetab} D_{KL}(p \| q_{\thetab}) &= \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab)\\
&= \argmax_{\thetab} \E_{X \sim p} \log q(x|\thetab)
@@ -41,8 +41,8 @@
From this we can actually see much more:
$$ \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) = \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab) = \argmin_{\thetab} H(p \| q_{\thetab}) $$
\begin{itemize}
\item So minimizing w.r.t. to KL is the same as minimizing with respect to cross-entropy, which implies minimizing w.r.t. cross-entropy is the same as maximum likelihood!
\item Remember, how we only characterized cross-entropy through bits? We could now motivate cross-entropy as the "relevant" term that you have to minimize, when you minimize KL - after you drop $\E_p \log p(x)$, which is simply the neg. entropy H(p)!
\item So minimizing KL w.r.t. $\thetab$ is the same as minimizing cross-entropy w.r.t. $\thetab$, which implies that minimizing cross-entropy is the same as maximum likelihood!
\item We could now motivate cross-entropy as the "relevant" term that you have to minimize when you minimize KL, after dropping $\E_p \log p(x)$, which is simply the negative entropy $-H(p)$!
\item Or we could say: Cross-entropy between $p$ and $q$ is simply the expected negative log-likelihood of $q$, when our data comes from $p$!
\end{itemize}
\end{vbframe}
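The equivalence "minimizing cross-entropy = maximum likelihood" can be checked directly on a toy Bernoulli model; the true parameter 0.7 and the grid below are made-up choices for illustration:

    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.binomial(1, 0.7, size=10_000)   # data "from p": Bernoulli(0.7)

    thetas = np.linspace(0.01, 0.99, 99)
    # empirical cross-entropy H(p || q_theta), estimated by the average NLL of q_theta
    nll = [-np.mean(x * np.log(t) + (1 - x) * np.log(1 - t)) for t in thetas]

    theta_ce  = thetas[np.argmin(nll)]  # minimizer of the empirical cross-entropy
    theta_mle = x.mean()                # closed-form MLE for a Bernoulli parameter
    print(theta_ce, theta_mle)          # both are close to 0.7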
8 changes: 4 additions & 4 deletions slides/information-theory/slides-info-sourcecoding.tex
@@ -163,15 +163,15 @@

\framebreak
\textbf{Cross-entropy} is the average length of communicating an event from one distribution with the optimal code for another distribution (assume they have the same domain $\Xspace$ as in KL).
$$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) $$
$$ H(p \| q) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) $$

\begin{figure}
\centering
\scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
\caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
\end{figure}

We directly see: cross-entropy of $p$ with itself is entropy: $H_p(p) = H(p)$.
We directly see: cross-entropy of $p$ with itself is entropy: $H(p \| p) = H(p)$.

\framebreak
\begin{figure}
@@ -181,9 +181,9 @@
\end{figure}

\begin{itemize}
\item \small{In top, $H_q(p)$ is greater than $H(p)$ primarily because the blue event that is very likely under $p$ has a very long codeword in $q$.
\item \small{In the top figure, $H(p \| q)$ is greater than $H(p)$ primarily because the blue event that is very likely under $p$ has a very long codeword in $q$.
\item The same happens in the bottom figure for the pink event when we go from $q$ to $p$.
\item Note that $H_q(p) \neq H_p(q)$}.
\item Note that $H(p \| q) \neq H(q \| p)$}.
\end{itemize}

\framebreak
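The code-length reading and the asymmetry noted above can be reproduced in a few lines of Python; the source distribution p and the mismatched q are toy values, and all lengths are in bits:

    import numpy as np

    p = np.array([0.5, 0.25, 0.125, 0.125])  # toy source distribution
    q = np.array([0.25, 0.25, 0.25, 0.25])   # code optimized for the wrong distribution

    L_p = -np.log2(p)   # optimal code lengths for p: 1, 2, 3, 3 bits
    L_q = -np.log2(q)   # optimal code lengths for q: 2, 2, 2, 2 bits

    H_p  = np.sum(p * L_p)  # entropy H(p)                = 1.75 bits
    H_pq = np.sum(p * L_q)  # cross-entropy H(p || q)     = 2.00 bits >= H(p)
    H_qp = np.sum(q * L_p)  # reverse direction H(q || p) = 2.25 bits != H(p || q)
    print(H_p, H_pq, H_qp)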
@@ -91,7 +91,7 @@
\begin{minipage}{0.49\textwidth}
$\Rightarrow$ If $\hat{\theta}_j \in [-\frac{\lambda}{H_{j,j}}, \frac{\lambda}{H_{j,j}}]$ then $g_j$ has no stationary point with $$\hat{\theta}_{\text{Lasso},j} < 0 \text{ or } \hat{\theta}_{\text{Lasso},j} > 0.$$ \\
However, at least one stationary point must exist since $g_j$ is a regularized convex function with $\lambda > 0.$\\
$\Rightarrow$ An equivalent constraint with $\vert\theta_j\vert \leq t \in\R_+$ must exist.
($\Rightarrow$ An equivalent constraint with $\vert\theta_j\vert \leq t \in\R_+$ must exist.)
\end{minipage}
\\
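The zero-interval argument above corresponds to the usual coordinate-wise thresholding behaviour. A minimal Python sketch follows; the non-zero branch uses the standard soft-thresholding form under a quadratic approximation, which goes beyond what is derived in this excerpt and is stated here as an assumption:

    import numpy as np

    def lasso_coordinate(theta_hat_j, H_jj, lam):
        # Coordinate-wise Lasso update under a quadratic approximation.
        threshold = lam / H_jj
        # Inside [-lam/H_jj, lam/H_jj] the only stationary point is exactly 0.
        if abs(theta_hat_j) <= threshold:
            return 0.0
        # Outside the interval: shrink towards zero (soft-thresholding; assumed form).
        return np.sign(theta_hat_j) * (abs(theta_hat_j) - threshold)

    print(lasso_coordinate(0.3, H_jj=1.0, lam=0.5))  # 0.0 (inside the interval)
    print(lasso_coordinate(0.9, H_jj=1.0, lam=0.5))  # 0.4 (shrunk by lam/H_jj)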

