diff --git a/slides/information-theory/figure/binary-ce.jpg b/slides/information-theory/figure/binary-ce.jpg
new file mode 100644
index 00000000..0a83d624
Binary files /dev/null and b/slides/information-theory/figure/binary-ce.jpg differ
diff --git a/slides/information-theory/rsrc/make_binary-ce.py b/slides/information-theory/rsrc/make_binary-ce.py
new file mode 100644
index 00000000..3859c84b
--- /dev/null
+++ b/slides/information-theory/rsrc/make_binary-ce.py
@@ -0,0 +1,24 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Binary Cross-Entropy Loss function for true value y and predicted probability p
+def binary_cross_entropy(y, p):
+    return -(y * np.log(p) + (1 - y) * np.log(1 - p))
+
+# Predicted probabilities
+p = np.linspace(0.01, 0.99, 100)  # Avoiding the extreme values 0 and 1 for numerical stability
+
+# Calculate the loss for true values 0 and 1
+loss_for_1 = binary_cross_entropy(1, p)
+loss_for_0 = binary_cross_entropy(0, p)
+
+# Plotting
+plt.figure(figsize=(10, 6))
+plt.plot(p, loss_for_1, label='True value: 1')
+plt.plot(p, loss_for_0, label='True value: 0', color='orange')
+plt.title('Binary Cross-Entropy Loss')
+plt.xlabel('p')
+plt.ylabel('Binary Cross-Entropy Loss')
+plt.legend()
+plt.grid(True)
+plt.show()
\ No newline at end of file
diff --git a/slides/information-theory/slides-info-cross-entropy-kld.tex b/slides/information-theory/slides-info-cross-entropy-kld.tex
index 637edde8..350e1225 100644
--- a/slides/information-theory/slides-info-cross-entropy-kld.tex
+++ b/slides/information-theory/slides-info-cross-entropy-kld.tex
@@ -3,7 +3,7 @@
 \input{../../latex-math/basic-math}
 \input{../../latex-math/basic-ml}
 
-\newcommand{\titlefigure}{figure_man/shift.png}
+\newcommand{\titlefigure}{figure/binary-ce.jpg}
 \newcommand{\learninggoals}{
   \item Know the cross-entropy
   \item Understand the connection between entropy, cross-entropy, and KL divergence
@@ -14,86 +14,34 @@
 
 \begin{document}
 
-\lecturechapter{Cross-Entropy, KL and Source Coding}
+\lecturechapter{Cross-Entropy and KL}
 \lecture{Introduction to Machine Learning}
 
 \begin{vbframe} {Cross-Entropy - Discrete Case}
 
-\begin{itemize}
-  \item For a random source / distribution $p$, the minimal number of bits to optimally encode messages from is the entropy $H(p)$.
-  \item If the optimal code for a different distribution $q(x)$ is instead used to encode messages from $p(x)$, expected code length will grow.
-% (Note: Both distributions are assumed to have the same support.)
-\end{itemize}
-  \vspace{-0.3cm}
-  \begin{figure}
-    \centering
-    \scalebox{0.5}{\includegraphics{figure_man/shift.png}}
-    \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
-    \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
-  \end{figure}
+\textbf{Cross-entropy} measures the average amount of information required to represent an event from one distribution $p$ using a predictive scheme based on another distribution $q$ (assume they have the same domain $\Xspace$ as in KL).
+  $$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) = - \mathbb{E}_{X\sim p}[\log(q(X))]$$
 
-\framebreak
-\textbf{Cross-entropy} is the average length of communicating an event from one distribution with the optimal code for another distribution (assume they have the same domain $\Xspace$ as in KL).
- $$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) $$
-
-\begin{figure}
-  \centering
-  \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
-  \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
-\end{figure}
-
-We directly see: cross-entropy of $p$ with itself is entropy: $H_p(p) = H(p)$.
-
-\framebreak
-  \begin{figure}
-    \centering
-    \scalebox{0.8}{\includegraphics{figure_man/crossent.png}}
-    \tiny{\\ Credit: Chris Olah}
-  \end{figure}
-
-  \begin{itemize}
-    \item \small{In top, $H_q(p)$ is greater than $H(p)$ primarily because the blue event that is very likely under $p$ has a very long codeword in $q$.
-    \item Same, in bottom, for pink when we go from $q$ to $p$.
-    \item Note that $H_q(p) \neq H_p(q)$}.
-  \end{itemize}
-
-  \framebreak
-
-  \begin{figure}
-    \centering
-    \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
-    \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
-  \end{figure}
-
-  \begin{itemize}
-    \item Let $x^\prime$ denote the symbol "dog". The difference in code lengths is:
-    $$ \log \left ( \frac{1}{q(x^\prime)} \right ) - \log \left( \frac{1}{p(x^\prime)} \right) = \log \frac{p(x^\prime)}{q(x^\prime)} $$
-
-\item If $p(x^\prime) > q(x^\prime)$, this is positive, if $p(x^\prime) < q(x^\prime)$, it is negative.
-  \item The expected difference is KL, if we encode symbols from $p$:
-  $$ D_{KL}(p \| q) = \sum_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)} $$
-  \end{itemize}
-
-\framebreak
 \begin{itemize}
-\item Entropy = Avg. nr. of bits if we optimally encode $p$
-\item Cross-Entropy = Avg. nr. of bits if we suboptimally encode $p$ with $q$
-\item $DL_ {KL}(p \| q)$: Difference in bits between the two
+\setlength{\itemsep}{1.2em}
+\item Entropy = Avg. amount of information if we optimally encode $p$
+\item Cross-Entropy = Avg. amount of information if we suboptimally encode $p$ with $q$
+\item $D_{KL}(p \| q)$: Difference between the two
 \end{itemize}
 
 \lz
 
 We can summarize this also through this identity:
-
+\lz
 $$ H_q(p) = H(p) + D_{KL}(p \| q) $$
 
 This is because:
 
 \begin{eqnarray*}
 H(p) + D_{KL}(p \| q) &=& - \sum_{x \in \Xspace} p(x) \log p(x) + \sum_{x \in \Xspace} p(x) \log \frac{p(x)}{q(x)} \\
-&=& \sum_{x \in \Xspace} p(x) (-\log p(x) + \log p(x) - \log q(x) \\
+&=& \sum_{x \in \Xspace} p(x) (-\log p(x) + \log p(x) - \log q(x)) \\
 &=& - \sum_{x \in \Xspace} p(x) \log q(x) = H_q(p) \\
 \end{eqnarray*}
@@ -104,17 +52,17 @@
 
 For continuous density functions $p(x)$ and $q(x)$:
 
-$$ H_p(q) = \int q(x) \log\left(\frac{1}{p(x)}\right) dx = - \int q(x) \log\left(p(x)\right) dx $$
+$$ H_q(p) = \int p(x) \log\left(\frac{1}{q(x)}\right) dx = - \int p(x) \log\left(q(x)\right) dx = - \mathbb{E}_{X \sim p}[\log(q(X))]$$
 
 \begin{itemize}
 \item It is not symmetric.
-\item As for the discrete case, $H_p(q) = h(q) + D_{KL}(q \| p)$ holds.
-\item Can now become negative, as the $h(q)$ can be negative!
+\item As for the discrete case, $H_q(p) = h(p) + D_{KL}(p \| q)$ holds.
+\item Can now become negative, as $h(p)$ can be negative!
 \end{itemize}
 
 \end{vbframe}
 
 \begin{vbframe}{Proof: Maximum of Differential Entropy}
-  \textbf{Claim}: For a given variance, the distribution that maximizes differential entropy is the Gaussian.
+  \textbf{Claim}: For a given variance, the continuous distribution that maximizes differential entropy is the Gaussian.
 \lz
diff --git a/slides/information-theory/slides-info-kl.tex b/slides/information-theory/slides-info-kl.tex
index 3564929a..1304232b 100644
--- a/slides/information-theory/slides-info-kl.tex
+++ b/slides/information-theory/slides-info-kl.tex
@@ -31,7 +31,7 @@
 $$ D_{KL}(p \| q) = \E_{X \sim p} \left[\log \frac{p(X)}{q(X)}\right] = \int_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)} \mathrm{d}x. $$
 In the above definition, we use the conventions that $0 \log (0/0) = 0$, $0 \log (0/q) = 0$ and $p \log(p/0) = \infty$ (based on continuity arguments where $p \to 0$).
-Thus, if there is any symbol $x \in \Xspace$ such that $p(x) > 0$ and $q(x) = 0$,
+Thus, if there is any realization $x \in \Xspace$ such that $p(x) > 0$ and $q(x) = 0$,
 then $D_{KL}(p \| q) = \infty.$
 
 \framebreak
diff --git a/slides/information-theory/slides-info-sourcecoding.tex b/slides/information-theory/slides-info-sourcecoding.tex
index 9ec68cbf..52c7d8d0 100644
--- a/slides/information-theory/slides-info-sourcecoding.tex
+++ b/slides/information-theory/slides-info-sourcecoding.tex
@@ -146,6 +146,65 @@
 
 \end{vbframe}
 
+\begin{vbframe} {Source Coding and (Cross-)Entropy}
+
+\begin{itemize}
+  \item For a random source / distribution $p$, the minimal number of bits needed to optimally encode messages from it is the entropy $H(p)$.
+  \item If the optimal code for a different distribution $q(x)$ is instead used to encode messages from $p(x)$, the expected code length will grow.
+% (Note: Both distributions are assumed to have the same support.)
+\end{itemize}
+  \vspace{-0.3cm}
+  \begin{figure}
+    \centering
+    \scalebox{0.5}{\includegraphics{figure_man/shift.png}}
+    \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
+    \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
+  \end{figure}
+
+\framebreak
+\textbf{Cross-entropy} is the average code length for communicating an event from one distribution with the optimal code for another distribution (assume they have the same domain $\Xspace$ as in KL).
+  $$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) $$
+
+\begin{figure}
+  \centering
+  \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
+  \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
+\end{figure}
+
+We directly see: the cross-entropy of $p$ with itself is the entropy: $H_p(p) = H(p)$.
+
+\framebreak
+  \begin{figure}
+    \centering
+    \scalebox{0.8}{\includegraphics{figure_man/crossent.png}}
+    \tiny{\\ Credit: Chris Olah}
+  \end{figure}
+
+  \begin{itemize}
+    \item \small{In the top, $H_q(p)$ is greater than $H(p)$ primarily because the blue event that is very likely under $p$ has a very long codeword in $q$.
+    \item The same happens in the bottom for pink when we go from $q$ to $p$.
+    \item Note that $H_q(p) \neq H_p(q)$.}
+  \end{itemize}
+
+  \framebreak
+
+  \begin{figure}
+    \centering
+    \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
+    \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
+  \end{figure}
+
+  \begin{itemize}
+    \item Let $x^\prime$ denote the symbol ``dog''. The difference in code lengths is:
+    $$ \log \left ( \frac{1}{q(x^\prime)} \right ) - \log \left( \frac{1}{p(x^\prime)} \right) = \log \frac{p(x^\prime)}{q(x^\prime)} $$
+
+\item If $p(x^\prime) > q(x^\prime)$, this is positive; if $p(x^\prime) < q(x^\prime)$, it is negative.
+  \item The expected difference, if we encode symbols from $p$, is the KL divergence:
+  $$ D_{KL}(p \| q) = \sum_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)} $$
+  \end{itemize}
+
+\end{vbframe}
+
 \endlecture
 \end{document}
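
The identity $H_q(p) = H(p) + D_{KL}(p \| q)$ and the asymmetry $H_q(p) \neq H_p(q)$ used in these slides can be checked numerically. The following sketch, written in the style of the rsrc scripts, uses two arbitrary discrete distributions chosen purely for illustration:

import numpy as np

# Two arbitrary discrete distributions on the same support (illustrative choice)
p = np.array([0.5, 0.3, 0.2])
q = np.array([0.2, 0.2, 0.6])

def entropy(p):
    # H(p) = -sum_x p(x) log p(x)
    return -np.sum(p * np.log(p))

def cross_entropy(p, q):
    # H_q(p) = -sum_x p(x) log q(x)
    return -np.sum(p * np.log(q))

def kl_divergence(p, q):
    # D_KL(p || q) = sum_x p(x) log(p(x) / q(x))
    return np.sum(p * np.log(p / q))

print(cross_entropy(p, q))               # H_q(p)
print(entropy(p) + kl_divergence(p, q))  # H(p) + D_KL(p || q), same value
print(cross_entropy(q, p))               # H_p(q), differs: cross-entropy is not symmetric
print(cross_entropy(p, p), entropy(p))   # H_p(p) = H(p)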
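
Similarly, the remark that the continuous cross-entropy $H_q(p) = h(p) + D_{KL}(p \| q)$ can become negative (because the differential entropy $h(p)$ can be negative) can be illustrated with two narrow Gaussians; the means, standard deviations, and integration grid below are arbitrary illustrative choices, and the integrals are approximated by the trapezoidal rule:

import numpy as np

# Two narrow Gaussian densities; parameters are illustrative choices
mu_p, sigma_p = 0.0, 0.05
mu_q, sigma_q = 0.0, 0.08

def gauss_pdf(x, mu, sigma):
    return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

x = np.linspace(-1.0, 1.0, 200001)  # fine grid for numerical integration
p = gauss_pdf(x, mu_p, sigma_p)
q = gauss_pdf(x, mu_q, sigma_q)

h_p  = -np.trapz(p * np.log(p), x)     # differential entropy h(p), negative for small sigma_p
xent = -np.trapz(p * np.log(q), x)     # cross-entropy H_q(p)
kl   =  np.trapz(p * np.log(p / q), x) # D_KL(p || q)

print(h_p)              # approx 0.5 * log(2 * pi * e * sigma_p**2) < 0
print(xent, h_p + kl)   # H_q(p) = h(p) + D_KL(p || q), both negative here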