Merge overleaf-2024-01-13-2220 into main
ludwigbothmann authored Jan 13, 2024
2 parents 2ca9901 + bc226b3 commit 7f0c378
Showing 2 changed files with 54 additions and 39 deletions.
78 changes: 44 additions & 34 deletions slides/information-theory/slides-info-entropy.tex
@@ -22,10 +22,12 @@

\begin{itemize}
\item \textbf{Information Theory} is a field of study based on probability theory.
\item The foundation of the field was laid by Claude Shannon in 1948 and it has since found applications in areas as diverse as communication theory, computer science, optimization, cryptography, machine learning and statistical inference.
\item In addition to quantifying information, it also deals with efficiently storing and transmitting the information.
\item Information theory tries to quantify the "amount" of information gained or
\item Foundation laid by Claude Shannon in 1948; since then applied in communication theory, computer science, optimization, cryptography, machine learning and statistical inference.

\item Quantify the "amount" of information gained or
uncertainty reduced when a random variable is observed.

\item Also about storing and transmitting information.
\end{itemize}

\begin{center}
@@ -45,7 +47,7 @@
\end{itemize}
\end{vbframe}

\begin{vbframe}{Entropy}
\begin{vbframe}{Entropy as Surprisal and Uncertainty}
%\begin{itemize}
%\item Entropy is often introduced in IT as a measure of
% expected information or in terms of bits needed for efficient coding,
@@ -61,58 +63,62 @@
\end{aligned}
\end{equation*}

\begin{center}
\includegraphics[width = 11cm ]{figure/entropy_calc.png}
\end{center}
\vspace{-0.5cm}
Some technicalities first:

\begin{itemize}
\item The final entropy is $H(X)=1.5$.
\setlength\itemsep{1.2em}
\item $H$ is actually Greek capital letter \textbf{E}ta ($\eta$) for \textbf{e}ntropy
\item Base of the log simply specifies the unit we measure information in, usually bits (base 2) or 'nats' (base $e$)
\item If $p(x) = 0$ for an $x$, then $p(x) \log_2 p(x)$ is taken to be zero,\\
because $\lim _{p \rightarrow 0} p \log_2 p=0$. %for $x=0$.
\end{itemize}


\end{vbframe}
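As a sanity check on the stated value $H(X)=1.5$ (the figure entropy_calc.png is not reproduced here), here is one worked calculation under an assumed three-outcome distribution $p = (0.5, 0.25, 0.25)$, which is consistent with that result:
\begin{align*}
H(X) &= -\big(0.5 \log_2 0.5 + 0.25 \log_2 0.25 + 0.25 \log_2 0.25\big) \\
     &= 0.5 \cdot 1 + 0.25 \cdot 2 + 0.25 \cdot 2 = 1.5 \text{ bits.}
\end{align*}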

\begin{vbframe}{Entropy Calculation}
\begin{vbframe}{Entropy as Surprisal and Uncertainty}


\begin{equation*}
\begin{aligned}
H(X) = - \E[\log_2(p(X))] &= -\sum_{x \in \Xspace} p(x) \log_2 p(x)
\end{aligned}
\end{equation*}

Now: What's the point?
\begin{itemize}
\setlength\itemsep{1.2em}
\item \textbf{Definition:}
Base $2$ means the information is measured in bits, but you can use any number $>1$ as base of the logarithm.
\item \textbf{Note:} If $p(x) = 0$, then $p(x) \log_2 p(x)$ is taken to be zero, because $\lim _{p \rightarrow 0} p \log_2 p=0$. %for $x=0$.
\item NB: $H$ is actually Greek capital letter \textbf{E}ta ($\eta$) for \textbf{e}ntropy
\item The negative log probabilities $-\log_2 p(x)$ are called "Surprisal".
\item More surprising means less likely. Distributions are more surprising, i.e., have higher entropy, when events are equally likely.
\item The negative log probabilities $-\log_2 p(x)$ are called "surprisal"
\item More surprising means less likely
\item PMFs are more surprising, i.e., have higher $H$, when events are more equally likely
\item Entropy is simply expected surprisal
\end{itemize}


\begin{center}
\includegraphics[width = 10cm ]{figure/entropy_calc.png}
\end{center}
\vspace{-0.5cm}
\begin{itemize}
\item The final entropy is $H(X)=1.5$.
\end{itemize}


\end{vbframe}
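A minimal numeric sketch of the "entropy = expected surprisal" reading (plain Python with NumPy, not part of the slides; distributions chosen just for illustration), comparing a peaked and a uniform PMF:

import numpy as np

def surprisal(p):
    # surprisal of each outcome: -log2 p(x); rarer outcomes are more surprising
    return -np.log2(p)

def entropy(p):
    # entropy = expected surprisal; p(x) = 0 terms contribute 0 by convention
    p = np.asarray(p, dtype=float)
    nz = p[p > 0]
    return float(np.sum(nz * surprisal(nz)))

peaked  = [0.97, 0.01, 0.01, 0.01]
uniform = [0.25, 0.25, 0.25, 0.25]
print(entropy(peaked))   # small: outcomes rarely surprise us
print(entropy(uniform))  # maximal for 4 outcomes: log2(4) = 2 bits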

\begin{vbframe}{Entropy Properties}
\begin{vbframe}{Entropy Basic Properties}

$$H(X) := H(p) = - \E[\log_2(p(X))] = -\sum_{x \in \Xspace} p(x) \log_2 p(x)$$

We can directly note some basic properties:
\vspace{0.2cm}
\begin{enumerate}
\setlength\itemsep{1.2em}
\item Entropy is non-negative, so $H(X) \geq 0$.
\item If one event has probability $p(x) = 1$, then $H(X)=0$.
\item Adding or removing an event with $p(x)=0$ does not change entropy.
\item $H(X)$ is continuous in probabilities $p(x)$.
\item Entropy is non-negative, so $H(X) \geq 0$
\item If one event has probability $p(x) = 1$, then $H(X)=0$
\item Adding or removing an event with $p(x)=0$ doesn't change it
\item $H(X)$ is continuous in probabilities $p(x)$
\end{enumerate}
\vspace{0.2cm}
All these properties follow directly from the definition.
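For instance, properties 1 and 2 can be checked in one line: since $0 \le p(x) \le 1$, every summand satisfies
$$-p(x)\log_2 p(x) \ge 0,$$
so $H(X) \ge 0$; and if some $p(x)=1$, all other probabilities are $0$, so every term vanishes (using the $p\log_2 p \to 0$ convention) and $H(X)=0$.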

\vspace{0.2cm}

In the following, we will look at various simple examples and derive some more properties of the entropy.

\end{vbframe}

@@ -152,36 +158,40 @@
\end{center}

\begin{itemize}
\item Naive observation: Entropy minimal for peaked distribution and maximal for uniform distribution.
\item Naive observation:\\
Entropy min for 1-point and max for uniform distribution
\end{itemize}

\end{vbframe}

\begin{vbframe}{Entropy is Maximal for Uniform}
\begin{enumerate}
\setcounter{enumi}{5}
\item Entropy is maximal for a uniform distribution, so for a domain with $g$ elements:
\item Entropy is maximal for a uniform distribution,\\
for domain of size $g$:
$H(X) \leq -g\frac{1}{g} \log_2(\frac{1}{g}) = \log_2(g)$.
\end{enumerate}
\vspace{0.2cm}
\textbf{Claim}: The entropy of a discrete random variable $X$ which takes on values in $\{x_1,x_2, \ldots, x_g\}$ with associated probabilities $\{p_1,p_2, \ldots, p_g\}$ is maximal when the distribution over $X$ is uniform.
%\textbf{Claim}: The entropy of a discrete random variable $X$ which takes on values in $\{x_1,x_2, \ldots, x_g\}$ with associated probabilities $\{p_1,p_2, \ldots, p_g\}$ is maximal when the distribution over $X$ is uniform.

\lz
\textbf{Proof}: The entropy $H(X)$ is $- \sum_{i=1}^g p_i \log_2 p_i$ and our goal is to find:
$$\underset{p_{1}, p_{2}, \ldots, p_{g}}{\operatorname{argmax}}-\sum_{i=1}^{g} p_{i} \log _{2} p_{i}$$
\textbf{Proof}:
So we want to maximize w.r.t. all $p_i$:

$$\underset{p_{1}, p_{2}, \ldots, p_{g}}{\operatorname{argmax}} -\sum_{i=1}^{g} p_{i} \log _{2} p_{i}$$
subject to
$$\sum_{i=1}^g p_i = 1.$$
$$\sum_{i=1}^g p_i = 1$$

\framebreak
The Lagrangian $L(p_1, \ldots, p_g, \lambda)$ is:
$$L(p_1, \ldots, p_g, \lambda) = - \sum_{i=1}^g p_i \log_2(p_i) - \lambda \left( \sum_{i=1}^g p_i - 1 \right)$$

Solving for $\nabla L = 0$,
Requiring $\nabla L = 0$ and solving,
\begin{gather*}
\frac{\partial L(p_1, \ldots, p_g, \lambda)}{\partial p_i} = - \log_2(p_i) - \log_2(e) - \lambda \overset{!}{=} 0 \\
\implies p_i = 2^{(-\lambda - \log_2 e)} \implies p_i = \frac{1}{g},
\end{gather*}
where the last step follows from the fact that all $p_i$ are equal and the constraint.
last step follows since all $p_i$ are equal and must satisfy the constraint
\vspace{0.2cm}\\
\textbf{NB}: We could also have solved the constraint for $p_1$ and substituted $p_1=1-\sum_{i=2}^{g} p_i$ into the objective to avoid constrained optimization.
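A small numeric illustration of this NB (a Python sketch, not part of the slides; the function names and $g = 4$ are just for illustration): substitute $p_1 = 1 - \sum_{i \ge 2} p_i$, then maximize the entropy without constraints; the optimizer should land near the uniform distribution with $H = \log_2(4) = 2$ bits.

import numpy as np
from scipy.optimize import minimize

g = 4

def neg_entropy(q):
    # q = (p_2, ..., p_g); p_1 is recovered from the sum-to-one constraint
    p = np.concatenate(([1.0 - q.sum()], q))
    p = np.clip(p, 1e-12, 1.0)            # guard against log(0) during the search
    return float(np.sum(p * np.log2(p)))  # equals -H(X); minimizing it maximizes H

res = minimize(neg_entropy, x0=np.full(g - 1, 0.1))
print(res.x, -res.fun)   # expect roughly [0.25 0.25 0.25] and H close to log2(4) = 2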

15 changes: 10 additions & 5 deletions slides/information-theory/slides-info-entropy2.tex
@@ -98,14 +98,15 @@

\textbf{Maximum entropy principle}: Among all feasible distributions satisfying the constraints, choose the one with maximum entropy!
\begin{itemize}
\item Motivation: ensure no unwarranted assumptions on $p(x)$ are made beyond what we know. MEP follows similar logic to Occam's razor and principle of insufficient reason
\item We already saw an application of this: the (trivial) constraint $\sum_{x \in \Xspace} p(x) = 1$ for which we obtained the uniform distribution as having maximum entropy translates to $g_0(x)=1$ and $\alpha_0=1$
\item Motivation: ensure no unwarranted assumptions on $p(x)$ are made beyond what we know.
\item MEP follows similar logic to Occam's razor and the principle of insufficient reason
\end{itemize}
The maxent distribution given $M$ constraints can be derived using a Lagrangian with multipliers $\lambda_1,\ldots,\lambda_M$. Finding the optimal $\lambda_m$ means finding the constrained maxent distribution.


\end{vbframe}

\begin{vbframe}{The Maximum Entropy Principle}
The Lagrangian for this problem using base $e$ in the entropy is given by:
Can be solved via Lagrange multipliers (here with base $e$ in the entropy):
\small{$$L(p(x),(\lambda_m)_{m=0}^{M}) = - \sum_{x \in \Xspace} p(x) \log(p(x)) + \lambda_0 \big( \sum_{x \in \Xspace} p(x) - 1 \big) + \sum_{m=1}^{M} \lambda_m \big( \sum_{x \in \Xspace} g_m(x)p(x)-\alpha_m \big)$$}
Finding critical points $p^{\ast}(x)$:
$$\frac{\partial L}{\partial p(x)} = -\log(p(x)) -1 + \lambda_0 + \sum_{m=1}^{M} \lambda_m g_m(x) \overset{!}{=} 0 \iff p^{\ast}(x)=\textcolor{blue}{\exp(\lambda_0-1)}\textcolor{red}{
@@ -120,6 +121,10 @@

\begin{vbframe}{The Maximum Entropy Principle}

We now have the functional form of our distribution, up to $M$ unknowns, the $\lambda_m$. But we also have $M$ equations, the moment conditions, so we can solve for them.

\vspace{0.5cm}

\textbf{Example}: Consider a discrete RV representing a six-sided die roll and the moment condition $\mathbb{E}(X)=4.8$. What is the maxent distribution?\\

\begin{itemize}
@@ -137,7 +142,7 @@
\hline
\end{tabular}
}
\item For a fair die ($\mathbb{E}(X)=\alpha_1=3.5$) it is not hard to see that $\lambda=0$ satisfies the equation, resulting in uniform probabilities
%\item For a fair die ($\mathbb{E}(X)=\alpha_1=3.5$) it is not hard to see that $\lambda=0$ satisfies the equation, resulting in uniform probabilities
\end{itemize}
\end{vbframe}
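For completeness, a numeric sketch of how the single multiplier $\lambda$ can be fitted for this example (Python with NumPy/SciPy, not part of the slides): the maxent PMF has the form $p(x) \propto \exp(\lambda x)$ on $\{1,\ldots,6\}$, and $\lambda$ is chosen so that the mean matches 4.8.

import numpy as np
from scipy.optimize import brentq

xs = np.arange(1, 7)           # faces of the die
target_mean = 4.8

def mean_under(lam):
    w = np.exp(lam * xs)       # unnormalized maxent weights exp(lambda * x)
    p = w / w.sum()
    return float((xs * p).sum())

lam = brentq(lambda l: mean_under(l) - target_mean, -10.0, 10.0)
p = np.exp(lam * xs); p /= p.sum()
print(lam, p)                  # probabilities increase in x, mean equals 4.8
# a fair die (target mean 3.5) would give lam = 0 and the uniform PMF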

