diff --git a/slides/information-theory/figure/binary-ce.jpg b/slides/information-theory/figure/binary-ce.jpg
new file mode 100644
index 00000000..0a83d624
Binary files /dev/null and b/slides/information-theory/figure/binary-ce.jpg differ
diff --git a/slides/information-theory/rsrc/make_binary-ce.py b/slides/information-theory/rsrc/make_binary-ce.py
new file mode 100644
index 00000000..3859c84b
--- /dev/null
+++ b/slides/information-theory/rsrc/make_binary-ce.py
@@ -0,0 +1,24 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Binary Cross-Entropy Loss function for true value y and predicted probability p
+def binary_cross_entropy(y, p):
+    return -(y * np.log(p) + (1 - y) * np.log(1 - p))
+
+# Predicted probabilities
+p = np.linspace(0.01, 0.99, 100)  # Avoiding the extreme values 0 and 1 for numerical stability
+
+# Calculate the loss for true values 0 and 1
+loss_for_1 = binary_cross_entropy(1, p)
+loss_for_0 = binary_cross_entropy(0, p)
+
+# Plotting
+plt.figure(figsize=(10, 6))
+plt.plot(p, loss_for_1, label='True value: 1')
+plt.plot(p, loss_for_0, label='True value: 0', color='orange')
+plt.title('Binary Cross-Entropy Loss')
+plt.xlabel('p')
+plt.ylabel('Binary Cross-Entropy Loss')
+plt.legend()
+plt.grid(True)
+plt.show()
\ No newline at end of file
diff --git a/slides/information-theory/slides-info-cross-entropy-kld.tex b/slides/information-theory/slides-info-cross-entropy-kld.tex
index 637edde8..350e1225 100644
--- a/slides/information-theory/slides-info-cross-entropy-kld.tex
+++ b/slides/information-theory/slides-info-cross-entropy-kld.tex
@@ -3,7 +3,7 @@
 \input{../../latex-math/basic-math}
 \input{../../latex-math/basic-ml}
 
-\newcommand{\titlefigure}{figure_man/shift.png}
+\newcommand{\titlefigure}{figure/binary-ce.jpg}
 \newcommand{\learninggoals}{
   \item Know the cross-entropy
   \item Understand the connection between entropy, cross-entropy, and KL divergence
@@ -14,86 +14,34 @@
 
 \begin{document}
 
-\lecturechapter{Cross-Entropy, KL and Source Coding}
+\lecturechapter{Cross-Entropy and KL}
 \lecture{Introduction to Machine Learning}
 
 \begin{vbframe} {Cross-Entropy - Discrete Case}
 
-\begin{itemize}
-  \item For a random source / distribution $p$, the minimal number of bits to optimally encode messages from is the entropy $H(p)$.
-  \item If the optimal code for a different distribution $q(x)$ is instead used to encode messages from $p(x)$, expected code length will grow.
-% (Note: Both distributions are assumed to have the same support.)
-\end{itemize}
-  \vspace{-0.3cm}
-  \begin{figure}
-    \centering
-    \scalebox{0.5}{\includegraphics{figure_man/shift.png}}
-    \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
-    \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
-  \end{figure}
+\textbf{Cross-entropy} measures the average amount of information required to represent an event from one distribution $p$ using a predictive scheme based on another distribution $q$ (assume they have the same domain $\Xspace$ as in KL).
+  $$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) = - \mathbb{E}_{X\sim p}[\log(q(X))]$$
 
-\framebreak
-\textbf{Cross-entropy} is the average length of communicating an event from one distribution with the optimal code for another distribution (assume they have the same domain $\Xspace$ as in KL).
- $$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) $$
-
-\begin{figure}
-  \centering
-  \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
-  \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
-\end{figure}
-
-We directly see: cross-entropy of $p$ with itself is entropy: $H_p(p) = H(p)$.
-
-\framebreak
-  \begin{figure}
-    \centering
-    \scalebox{0.8}{\includegraphics{figure_man/crossent.png}}
-    \tiny{\\ Credit: Chris Olah}
-  \end{figure}
-
-  \begin{itemize}
-    \item \small{In top, $H_q(p)$ is greater than $H(p)$ primarily because the blue event that is very likely under $p$ has a very long codeword in $q$.
-    \item Same, in bottom, for pink when we go from $q$ to $p$.
-    \item Note that $H_q(p) \neq H_p(q)$}.
-  \end{itemize}
-
-  \framebreak
-
-  \begin{figure}
-    \centering
-    \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
-    \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
-  \end{figure}
-
-  \begin{itemize}
-    \item Let $x^\prime$ denote the symbol "dog". The difference in code lengths is:
-    $$ \log \left ( \frac{1}{q(x^\prime)} \right ) - \log \left( \frac{1}{p(x^\prime)} \right) = \log \frac{p(x^\prime)}{q(x^\prime)} $$
-
-\item If $p(x^\prime) > q(x^\prime)$, this is positive, if $p(x^\prime) < q(x^\prime)$, it is negative.
-  \item The expected difference is KL, if we encode symbols from $p$:
-  $$ D_{KL}(p \| q) = \sum_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)} $$
-  \end{itemize}
-
-\framebreak
 \begin{itemize}
-\item Entropy = Avg. nr. of bits if we optimally encode $p$
-\item Cross-Entropy = Avg. nr. of bits if we suboptimally encode $p$ with $q$
-\item $DL_ {KL}(p \| q)$: Difference in bits between the two
+\setlength{\itemsep}{1.2em}
+\item Entropy = Avg. amount of information if we optimally encode $p$
+\item Cross-Entropy = Avg. amount of information if we suboptimally encode $p$ with $q$
+\item $D_{KL}(p \| q)$: Difference between the two
 \end{itemize}
 
 \lz
 
 We can summarize this also through this identity:
-
+\lz
 $$ H_q(p) = H(p) + D_{KL}(p \| q) $$
 
 This is because:
 
 \begin{eqnarray*}
 H(p) + D_{KL}(p \| q) &=& - \sum_{x \in \Xspace} p(x) \log p(x) + \sum_{x \in \Xspace} p(x) \log \frac{p(x)}{q(x)} \\
-&=& \sum_{x \in \Xspace} p(x) (-\log p(x) + \log p(x) - \log q(x) \\
+&=& \sum_{x \in \Xspace} p(x) (-\log p(x) + \log p(x) - \log q(x)) \\
 &=& - \sum_{x \in \Xspace} p(x) \log q(x) = H_q(p) \\
 \end{eqnarray*}
@@ -104,17 +52,17 @@
 
 For continuous density functions $p(x)$ and $q(x)$:
 
-$$ H_p(q) = \int q(x) \log\left(\frac{1}{p(x)}\right) dx = - \int q(x) \log\left(p(x)\right) dx $$
+$$ H_q(p) = \int p(x) \log\left(\frac{1}{q(x)}\right) dx = - \int p(x) \log\left(q(x)\right) dx = - \mathbb{E}_{X \sim p}[\log(q(X))]$$
 
 \begin{itemize}
 \item It is not symmetric.
-\item As for the discrete case, $H_p(q) = h(q) + D_{KL}(q \| p)$ holds.
-\item Can now become negative, as the $h(q)$ can be negative!
+\item As for the discrete case, $H_q(p) = h(p) + D_{KL}(p \| q)$ holds.
+\item Can now become negative, as $h(p)$ can be negative!
 \end{itemize}
 
 \end{vbframe}
 
 \begin{vbframe}{Proof: Maximum of Differential Entropy}
-  \textbf{Claim}: For a given variance, the distribution that maximizes differential entropy is the Gaussian.
+  \textbf{Claim}: For a given variance, the continuous distribution that maximizes differential entropy is the Gaussian.
 \lz
diff --git a/slides/information-theory/slides-info-kl.tex b/slides/information-theory/slides-info-kl.tex
index 3564929a..1304232b 100644
--- a/slides/information-theory/slides-info-kl.tex
+++ b/slides/information-theory/slides-info-kl.tex
@@ -31,7 +31,7 @@
 $$ D_{KL}(p \| q) = \E_{X \sim p} \left[\log \frac{p(X)}{q(X)}\right] = \int_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)} \mathrm{d}x. $$
 In the above definition, we use the conventions that $0 \log (0/0) = 0$, $0 \log (0/q) = 0$ and $p \log(p/0) = \infty$ (based on continuity arguments where $p \to 0$).
-Thus, if there is any symbol $x \in \Xspace$ such that $p(x) > 0$ and $q(x) = 0$,
+Thus, if there is any realization $x \in \Xspace$ such that $p(x) > 0$ and $q(x) = 0$,
 then $D_{KL}(p \| q) = \infty.$
 
 \framebreak
diff --git a/slides/information-theory/slides-info-sourcecoding.tex b/slides/information-theory/slides-info-sourcecoding.tex
index 9ec68cbf..52c7d8d0 100644
--- a/slides/information-theory/slides-info-sourcecoding.tex
+++ b/slides/information-theory/slides-info-sourcecoding.tex
@@ -146,6 +146,65 @@
 
 \end{vbframe}
 
+\begin{vbframe} {Source Coding and (Cross-)Entropy}
+
+\begin{itemize}
+  \item For a random source / distribution $p$, the minimal number of bits needed to optimally encode messages from it is the entropy $H(p)$.
+  \item If the optimal code for a different distribution $q(x)$ is instead used to encode messages from $p(x)$, the expected code length will grow.
+% (Note: Both distributions are assumed to have the same support.)
+\end{itemize}
+  \vspace{-0.3cm}
+  \begin{figure}
+    \centering
+    \scalebox{0.5}{\includegraphics{figure_man/shift.png}}
+    \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
+    \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
+  \end{figure}
+
+\framebreak
+\textbf{Cross-entropy} is the average code length for communicating an event from one distribution with the optimal code for another distribution (assume they have the same domain $\Xspace$ as in KL).
+  $$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) $$
+
+\begin{figure}
+  \centering
+  \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
+  \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
+\end{figure}
+
+We directly see: the cross-entropy of $p$ with itself is the entropy: $H_p(p) = H(p)$.
+
+\framebreak
+  \begin{figure}
+    \centering
+    \scalebox{0.8}{\includegraphics{figure_man/crossent.png}}
+    \tiny{\\ Credit: Chris Olah}
+  \end{figure}
+
+  \begin{itemize}
+    \item \small{In the top, $H_q(p)$ is greater than $H(p)$ primarily because the blue event that is very likely under $p$ has a very long codeword in $q$.
+    \item The same happens in the bottom for pink when we go from $q$ to $p$.
+    \item Note that $H_q(p) \neq H_p(q)$.}
+  \end{itemize}
+
+  \framebreak
+
+  \begin{figure}
+    \centering
+    \scalebox{1}{\includegraphics{figure_man/xent_pq.png}}
+    \caption{\footnotesize{$L_p(x)$, $L_q(x)$ are the optimal code lengths for $p(x)$ and $q(x)$}}
+  \end{figure}
+
+  \begin{itemize}
+    \item Let $x^\prime$ denote the symbol ``dog''. The difference in code lengths is:
+    $$ \log \left ( \frac{1}{q(x^\prime)} \right ) - \log \left( \frac{1}{p(x^\prime)} \right) = \log \frac{p(x^\prime)}{q(x^\prime)} $$
+
+\item If $p(x^\prime) > q(x^\prime)$, this is positive; if $p(x^\prime) < q(x^\prime)$, it is negative.
+  \item The expected difference, if we encode symbols from $p$, is the KL divergence:
+  $$ D_{KL}(p \| q) = \sum_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)} $$
+  \end{itemize}
+
+\end{vbframe}
+
 \endlecture
 \end{document}
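
The identity $H_q(p) = H(p) + D_{KL}(p \| q)$ and the asymmetry $H_q(p) \neq H_p(q)$ used in these slides can be checked numerically. The following sketch, written in the style of the rsrc scripts, uses two arbitrary discrete distributions chosen purely for illustration:

import numpy as np

# Two arbitrary discrete distributions on the same support (illustrative choice)
p = np.array([0.5, 0.3, 0.2])
q = np.array([0.2, 0.2, 0.6])

def entropy(p):
    # H(p) = -sum_x p(x) log p(x)
    return -np.sum(p * np.log(p))

def cross_entropy(p, q):
    # H_q(p) = -sum_x p(x) log q(x)
    return -np.sum(p * np.log(q))

def kl_divergence(p, q):
    # D_KL(p || q) = sum_x p(x) log(p(x) / q(x))
    return np.sum(p * np.log(p / q))

print(cross_entropy(p, q))               # H_q(p)
print(entropy(p) + kl_divergence(p, q))  # H(p) + D_KL(p || q), same value
print(cross_entropy(q, p))               # H_p(q), differs: cross-entropy is not symmetric
print(cross_entropy(p, p), entropy(p))   # H_p(p) = H(p)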
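
Similarly, the remark that the continuous cross-entropy $H_q(p) = h(p) + D_{KL}(p \| q)$ can become negative (because the differential entropy $h(p)$ can be negative) can be illustrated with two narrow Gaussians; the means, standard deviations, and integration grid below are arbitrary illustrative choices, and the integrals are approximated by the trapezoidal rule:

import numpy as np

# Two narrow Gaussian densities; parameters are illustrative choices
mu_p, sigma_p = 0.0, 0.05
mu_q, sigma_q = 0.0, 0.08

def gauss_pdf(x, mu, sigma):
    return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

x = np.linspace(-1.0, 1.0, 200001)  # fine grid for numerical integration
p = gauss_pdf(x, mu_p, sigma_p)
q = gauss_pdf(x, mu_q, sigma_q)

h_p  = -np.trapz(p * np.log(p), x)     # differential entropy h(p), negative for small sigma_p
xent = -np.trapz(p * np.log(q), x)     # cross-entropy H_q(p)
kl   =  np.trapz(p * np.log(p / q), x) # D_KL(p || q)

print(h_p)              # approx 0.5 * log(2 * pi * e * sigma_p**2) < 0
print(xent, h_p + kl)   # H_q(p) = h(p) + D_KL(p || q), both negative here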