
Commit

separating cross-entropy and source coding
ludwigbothmann committed Nov 29, 2023
1 parent 4c98a45 commit 94f7988
Showing 5 changed files with 98 additions and 67 deletions.
Binary file added slides/information-theory/figure/binary-ce.jpg
24 changes: 24 additions & 0 deletions slides/information-theory/rsrc/make_binary-ce.py
@@ -0,0 +1,24 @@
import matplotlib.pyplot as plt
import numpy as np

# Binary Cross-Entropy Loss function for true value y and predicted probability p
def binary_cross_entropy(y, p):
    return -(y * np.log(p) + (1 - y) * np.log(1 - p))

# Predicted probabilities
p = np.linspace(0.01, 0.99, 100) # Avoiding the extreme values 0 and 1 for numerical stability

# Calculate the loss for true values 0 and 1
loss_for_1 = binary_cross_entropy(1, p)
loss_for_0 = binary_cross_entropy(0, p)

# Plotting
plt.figure(figsize=(10, 6))
plt.plot(p, loss_for_1, label='True value: 1')
plt.plot(p, loss_for_0, label='True value: 0', color='orange')
plt.title('Binary Cross-Entropy Loss')
plt.xlabel('p')
plt.ylabel('Binary Cross-Entropy Loss')
plt.legend()
plt.grid(True)
plt.show()
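The script only displays the plot via plt.show(); to produce the committed figure/binary-ce.jpg, one would presumably add a plt.savefig call with that path before plt.show(). As a small, self-contained sanity check (illustrative only, not part of the commit), the loss collapses to the surprisal of the predicted probability for each true label:

import numpy as np

# Sanity check: for true label y = 1, -(y*log(p) + (1-y)*log(1-p)) reduces to -log(p);
# for y = 0 it reduces to -log(1-p).
p = np.linspace(0.01, 0.99, 100)
assert np.allclose(-(1 * np.log(p) + 0 * np.log(1 - p)), -np.log(p))
assert np.allclose(-(0 * np.log(p) + 1 * np.log(1 - p)), -np.log(1 - p))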
80 changes: 14 additions & 66 deletions slides/information-theory/slides-info-cross-entropy-kld.tex
@@ -3,7 +3,7 @@
\input{../../latex-math/basic-math}
\input{../../latex-math/basic-ml}

\newcommand{\titlefigure}{figure_man/shift.png}
\newcommand{\titlefigure}{figure/binary-ce.jpg}
\newcommand{\learninggoals}{
\item Know the cross-entropy
\item Understand the connection between entropy, cross-entropy, and KL divergence
@@ -14,86 +14,34 @@

\begin{document}

\lecturechapter{Cross-Entropy, KL and Source Coding}
\lecturechapter{Cross-Entropy and KL}
\lecture{Introduction to Machine Learning}


\begin{vbframe} {Cross-Entropy - Discrete Case}

\begin{itemize}
\item For a random source / distribution $p$, the minimal expected number of bits to optimally encode messages from it is the entropy $H(p)$.
\item If the optimal code for a different distribution $q(x)$ is instead used to encode messages from $p(x)$, the expected code length will grow.
% (Note: Both distributions are assumed to have the same support.)
\end{itemize}
\vspace{-0.3cm}
\begin{figure}
\centering
\scalebox{0.5}{\includegraphics{figure_man/shift.png}}
\scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
\caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
\end{figure}
\textbf{Cross-entropy} measures the average amount of information required to represent an event from one distribution $p$ using a predictive scheme based on another distribution $q$ (assume they have the same domain $\Xspace$ as in KL).
$$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) = - \mathbb{E}_{X\sim p}[\log(q(X))]$$
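A minimal numerical sketch of this definition (illustrative only; the discrete distributions p and q below are made up):

import numpy as np

# Cross-entropy H_q(p) = -sum_x p(x) log2 q(x), in bits, for two toy distributions
# over the same support.
p = np.array([0.5, 0.25, 0.125, 0.125])
q = np.array([0.25, 0.25, 0.25, 0.25])

def cross_entropy(p, q):
    return -np.sum(p * np.log2(q))

print(cross_entropy(p, q))  # 2.0 bits: representing draws from p using q
print(cross_entropy(p, p))  # 1.75 bits: H_p(p) = H(p), the entropy of p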

\framebreak
\textbf{Cross-entropy} is the average length of communicating an event from one distribution with the optimal code for another distribution (assume they have the same domain $\Xspace$ as in KL).
$$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) $$

\begin{figure}
\centering
\scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
\caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
\end{figure}

We directly see that the cross-entropy of $p$ with itself is the entropy: $H_p(p) = H(p)$.

\framebreak
\begin{figure}
\centering
\scalebox{0.8}{\includegraphics{figure_man/crossent.png}}
\tiny{\\ Credit: Chris Olah}
\end{figure}

\begin{itemize}
\item \small{In the top diagram, $H_q(p)$ is greater than $H(p)$ primarily because the blue event that is very likely under $p$ has a very long codeword under $q$.
\item The same happens in the bottom diagram for the pink event when we go from $q$ to $p$.
\item Note that $H_q(p) \neq H_p(q)$.}
\end{itemize}

\framebreak

\begin{figure}
\centering
\scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
\caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
\end{figure}

\begin{itemize}
\item Let $x^\prime$ denote the symbol ``dog''. The difference in code lengths is:
$$ \log \left ( \frac{1}{q(x^\prime)} \right ) - \log \left( \frac{1}{p(x^\prime)} \right) = \log \frac{p(x^\prime)}{q(x^\prime)} $$

\item If $p(x^\prime) > q(x^\prime)$, this is positive; if $p(x^\prime) < q(x^\prime)$, it is negative.
\item If we encode symbols from $p$, the expected difference is the KL divergence:
$$ D_{KL}(p \| q) = \sum_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)} $$
\end{itemize}

\framebreak

\begin{itemize}
\item Entropy = Avg. nr. of bits if we optimally encode $p$
\item Cross-Entropy = Avg. nr. of bits if we suboptimally encode $p$ with $q$
\item $D_{KL}(p \| q)$: Difference in bits between the two
\setlength{\itemsep}{1.2em}
\item Entropy = Avg. amount of information if we optimally encode $p$
\item Cross-Entropy = Avg. amount of information if we suboptimally encode $p$ with $q$
\item $D_{KL}(p \| q)$: Difference between the two
\end{itemize}

\lz

We can also summarize this through the following identity:

\lz
$$
H_q(p) = H(p) + D_{KL}(p \| q)
$$
This is because:
\begin{eqnarray*}
H(p) + D_{KL}(p \| q) &=& - \sum_{x \in \Xspace} p(x) \log p(x) + \sum_{x \in \Xspace} p(x) \log \frac{p(x)}{q(x)} \\
&=& \sum_{x \in \Xspace} p(x) (-\log p(x) + \log p(x) - \log q(x) \\
&=& \sum_{x \in \Xspace} p(x) (-\log p(x) + \log p(x) - \log q(x)) \\
&=& - \sum_{x \in \Xspace} p(x) \log q(x) = H_q(p) \\
\end{eqnarray*}
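A quick numerical check of this identity (illustrative sketch with made-up discrete distributions, in bits):

import numpy as np

# Verify H_q(p) = H(p) + D_KL(p || q).
p = np.array([0.5, 0.25, 0.125, 0.125])
q = np.array([0.25, 0.25, 0.25, 0.25])

H_p   = -np.sum(p * np.log2(p))      # entropy H(p)         = 1.75
H_q_p = -np.sum(p * np.log2(q))      # cross-entropy H_q(p) = 2.00
kl    =  np.sum(p * np.log2(p / q))  # D_KL(p || q)         = 0.25

assert np.isclose(H_q_p, H_p + kl)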

@@ -104,17 +52,17 @@

For continuous density functions $p(x)$ and $q(x)$:

$$ H_p(q) = \int q(x) \log\left(\frac{1}{p(x)}\right) dx = - \int q(x) \log\left(p(x)\right) dx $$
$$ H_q(p) = \int p(x) \log\left(\frac{1}{q(x)}\right) dx = - \int p(x) \log\left(q(x)\right) dx = - \mathbb{E}_{X \sim p}[\log(q(X))]$$

\begin{itemize}
\item It is not symmetric.
\item As in the discrete case, $H_p(q) = h(q) + D_{KL}(q \| p)$ holds.
\item Can now become negative, as $h(q)$ can be negative!
\item As in the discrete case, $H_q(p) = h(p) + D_{KL}(p \| q)$ holds.
\item Can now become negative, as $h(p)$ can be negative (see the sketch below)!
\end{itemize}
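A small sketch illustrating both points for two Gaussians via numerical integration (the particular means and variances are arbitrary choices; values in nats):

import numpy as np
from scipy.integrate import quad
from scipy.stats import norm

# Differential entropy, cross-entropy and KL for two Gaussians, by numerical integration.
p = norm(loc=0, scale=0.1)   # narrow Gaussian
q = norm(loc=0, scale=0.2)

h_p   = -quad(lambda x: p.pdf(x) * np.log(p.pdf(x)), -np.inf, np.inf)[0]  # h(p)
H_q_p = -quad(lambda x: p.pdf(x) * np.log(q.pdf(x)), -np.inf, np.inf)[0]  # H_q(p)
kl    =  quad(lambda x: p.pdf(x) * np.log(p.pdf(x) / q.pdf(x)), -np.inf, np.inf)[0]

print(H_q_p)      # approx. -0.57 nats: the cross-entropy is negative here
print(h_p + kl)   # matches H_q_p, i.e. H_q(p) = h(p) + D_KL(p || q)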
\end{vbframe}

\begin{vbframe}{Proof: Maximum of Differential Entropy}
\textbf{Claim}: For a given variance, the distribution that maximizes differential entropy is the Gaussian.
\textbf{Claim}: For a given variance, the continuous distribution that maximizes differential entropy is the Gaussian.
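As an illustrative plausibility check (not a proof), one can compare the closed-form differential entropies of a few standard distributions matched to the same variance:

import numpy as np

# Differential entropies (in nats) at equal variance sigma^2:
# Gaussian:          0.5 * ln(2*pi*e*sigma^2)
# Uniform on [a,b]:  ln(b - a),   with (b - a)^2 / 12 = sigma^2
# Laplace(scale b):  1 + ln(2*b), with 2*b^2 = sigma^2
sigma = 1.0
h_gauss   = 0.5 * np.log(2 * np.pi * np.e * sigma**2)  # approx. 1.42
h_uniform = np.log(np.sqrt(12) * sigma)                # approx. 1.24
h_laplace = 1 + np.log(np.sqrt(2) * sigma)             # approx. 1.35

print(h_gauss, h_uniform, h_laplace)  # the Gaussian attains the largest value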

\lz

2 changes: 1 addition & 1 deletion slides/information-theory/slides-info-kl.tex
@@ -31,7 +31,7 @@
$$ D_{KL}(p \| q) = \E_{X \sim p} \left[\log \frac{p(X)}{q(X)}\right] = \int_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)} \mathrm{d}x. $$

In the above definition, we use the conventions that $0 \log (0/0) = 0$, $0 \log (0/q) = 0$ and $p \log(p/0) = \infty$ (based on continuity arguments where $p \to 0$).
Thus, if there is any symbol $x \in \Xspace$ such that $p(x) > 0$ and $q(x) = 0$,
Thus, if there is any realization $x \in \Xspace$ such that $p(x) > 0$ and $q(x) = 0$,
then $D_{KL}(p \| q) = \infty.$
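A small numerical illustration of this convention, using the discrete analogue of the definition (the distributions are made up):

import numpy as np

# D_KL(p || q) is infinite as soon as some x has p(x) > 0 but q(x) = 0.
p = np.array([0.5, 0.5, 0.0])
q = np.array([0.5, 0.0, 0.5])

with np.errstate(divide="ignore", invalid="ignore"):
    # convention: terms with p(x) = 0 contribute 0, i.e. 0 log(0/q) = 0
    terms = np.where(p > 0, p * np.log(p / q), 0.0)

print(np.sum(terms))  # inf, caused by the realization with p = 0.5 and q = 0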

\framebreak
59 changes: 59 additions & 0 deletions slides/information-theory/slides-info-sourcecoding.tex
@@ -146,6 +146,65 @@

\end{vbframe}

\begin{vbframe} {Source coding and (cross-)entropy}

\begin{itemize}
\item For a random source / distribution $p$, the minimal expected number of bits to optimally encode messages from it is the entropy $H(p)$.
\item If the optimal code for a different distribution $q(x)$ is instead used to encode messages from $p(x)$, the expected code length will grow (see the sketch below).
% (Note: Both distributions are assumed to have the same support.)
\end{itemize}
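A minimal sketch of this effect (illustrative; it assumes a dyadic p so that the optimal integer code lengths match log2(1/p(x)) exactly):

import numpy as np

# Expected code length (in bits) when encoding symbols drawn from p.
p = np.array([0.5, 0.25, 0.125, 0.125])  # dyadic source distribution
q = np.array([0.25, 0.25, 0.25, 0.25])   # a different (uniform) distribution

len_p = np.log2(1 / p)  # code lengths of a code optimal for p: 1, 2, 3, 3
len_q = np.log2(1 / q)  # code lengths of a code optimal for q: 2, 2, 2, 2

print(np.sum(p * len_p))  # 1.75 bits = H(p), the optimum
print(np.sum(p * len_q))  # 2.00 bits > H(p): using q's code for p costs extra bits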
\vspace{-0.3cm}
\begin{figure}
\centering
\scalebox{0.5}{\includegraphics{figure_man/shift.png}}
\scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
\caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
\end{figure}

\framebreak
\textbf{Cross-entropy} is the average length of communicating an event from one distribution with the optimal code for another distribution (assume they have the same domain $\Xspace$ as in KL).
$$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) $$

\begin{figure}
\centering
\scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
\caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
\end{figure}

We directly see that the cross-entropy of $p$ with itself is the entropy: $H_p(p) = H(p)$.

\framebreak
\begin{figure}
\centering
\scalebox{0.8}{\includegraphics{figure_man/crossent.png}}
\tiny{\\ Credit: Chris Olah}
\end{figure}

\begin{itemize}
\item \small{In the top diagram, $H_q(p)$ is greater than $H(p)$ primarily because the blue event that is very likely under $p$ has a very long codeword under $q$.
\item The same happens in the bottom diagram for the pink event when we go from $q$ to $p$.
\item Note that $H_q(p) \neq H_p(q)$.}
\end{itemize}

\framebreak

\begin{figure}
\centering
\scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
\caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
\end{figure}

\begin{itemize}
\item Let $x^\prime$ denote the symbol ``dog''. The difference in code lengths is:
$$ \log \left ( \frac{1}{q(x^\prime)} \right ) - \log \left( \frac{1}{p(x^\prime)} \right) = \log \frac{p(x^\prime)}{q(x^\prime)} $$

\item If $p(x^\prime) > q(x^\prime)$, this is positive; if $p(x^\prime) < q(x^\prime)$, it is negative.
\item If we encode symbols from $p$, the expected difference is the KL divergence (see the sketch after this list):
$$ D_{KL}(p \| q) = \sum_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)} $$
\end{itemize}
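A short numerical sketch of this expected difference (same made-up distributions as in the sketch above; code lengths in bits):

import numpy as np

# The expected difference in code lengths, log2(1/q) - log2(1/p), under x ~ p
# equals D_KL(p || q).
p = np.array([0.5, 0.25, 0.125, 0.125])
q = np.array([0.25, 0.25, 0.25, 0.25])

diff = np.log2(1 / q) - np.log2(1 / p)  # per-symbol difference in code length
kl   = np.sum(p * np.log2(p / q))       # D_KL(p || q) = 0.25

assert np.isclose(np.sum(p * diff), kl)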

\end{vbframe}


\endlecture
\end{document}