From b13a310355d7f6bc8b9eea62e04e53b7823f93a3 Mon Sep 17 00:00:00 2001
From: ludwigbothmann <46222472+ludwigbothmann@users.noreply.github.com>
Date: Thu, 9 Nov 2023 14:56:55 +0100
Subject: [PATCH] Updates from Overleaf

---
 latex-math/basic-ml.tex | 2 +-
 .../slides-advriskmin-logreg-deepdive.tex | 73 +++++++++----------
 .../slides-info-diffent.tex | 4 +-
 .../slides-info-entropy.tex | 6 +-
 4 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/latex-math/basic-ml.tex b/latex-math/basic-ml.tex
index 29b0d831..056796f0 100644
--- a/latex-math/basic-ml.tex
+++ b/latex-math/basic-ml.tex
@@ -95,7 +95,7 @@
 \newcommand{\pdf}{p} % p
 \newcommand{\pdfx}{p(\xv)} % p(x)
 \newcommand{\pixt}{\pi(\xv~|~ \thetab)} % pi(x|theta), pdf of x given theta
-\newcommand{\pixit}{\pi\left(\xi ~|~ \thetab\right)} % pi(x^i|theta), pdf of x given theta
+\newcommand{\pixit}[1][i]{\pi\left(\xi[#1] ~|~ \thetab\right)} % pi(x^i|theta), pdf of x given theta
 \newcommand{\pixii}{\pi\left(\xi\right)} % pi(x^i), pdf of i-th x
 % pdf of (x, y)
diff --git a/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex b/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex
index ed41d88b..74c41e91 100644
--- a/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex
+++ b/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex
@@ -34,31 +34,31 @@
 \begin{vbframe}{Logistic regression: Problem}
-Given $n \in \mathbb{N}$ observations $\left(\xi, \yi\right) \in \Xspace \times \Yspace$ with $\Xspace = \mathbb{R}^d, \Yspace = \{0, 1\}$ we want to minimize the following risk
+Given $n \in \mathbb{N}$ observations $\left(\xi, \yi\right) \in \Xspace \times \Yspace$ with $\Xspace = \R^d, \Yspace = \{0, 1\}$ we want to minimize the following risk
 \vspace*{-0.5cm}
 \begin{eqnarray*}
-\pixit[1]
 \riske & = &
--\sum^n_{i=1} y^{(i)}\log(\pi_{\bm{\theta}}(\xi)) + (1-y^{(i)})\log(1-\pi_{\bm{\theta}}(\xi))
+-\sum^n_{i=1} \yi\log\left(\pixit\right) + \left(1-\yi\right)\log\left(1-\pixit\right)
 \end{eqnarray*}
-with respect to $\bm{\theta}$ where the probabilistic classifier
+with respect to $\thetab$ where the probabilistic classifier
 \begin{eqnarray*}
- \pi_{\bm{\theta}}(\xi) & = &
- \sigma(f(\xi, \bm{\theta})),
+ \pixit & = &
+ s\left(\fxit\right),
 \end{eqnarray*}
-the sigmoid function $\sigma(f) = \frac{1}{1 + \exp(-f)}$ and the score $f(\xi, \bm{\theta}) = \bm{\theta}^\top \xi.$
+the sigmoid function $s(f) = \frac{1}{1 + \exp(-f)}$ and the score $\fxit = \thx.$
 \vspace*{0.5cm}
-NB: Note that $\frac{\partial}{\partial f} \sigma(f) = \sigma(f)(1-\sigma(f))$ and $\frac{\partial f(\xi, \bm{\theta})}{\partial \bm{\theta}} = \left(\xi\right)^\top.$
+NB: Note that $\frac{\partial}{\partial f} s(f) = s(f)(1-s(f))$ and $\frac{\partial \fxit}{\partial \thetab} = \left(\xi\right)^\top.$
 \end{vbframe}
+
 \begin{vbframe}{Logistic regression: Gradient}
 We find the gradient of logistic regression with the chain rule, s.t.,
@@ -66,43 +66,40 @@
 \vspace*{-0.5cm}
 \begin{align*}
- \frac{\partial}{\partial\bm{\theta}}\riske & = &
- -\sum^n_{i=1} \frac{\partial}{\partial \pi_{\bm{\theta}}(\xi)}y^{(i)}\log(\pi_{\bm{\theta}}(\xi))\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}} + \\
- && \frac{\partial}{\partial \pi_{\bm{\theta}}(\xi)}(1-y^{(i)})\log(1-\pi_{\bm{\theta}}(\xi))\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}}\\
+ \frac{\partial}{\partial\thetab}\riske & = &
+ -\sumin \frac{\partial}{\partial\pixit} \yi\log(\pixit)\frac{\partial\pixit}{\partial \thetab} + \\
+ && \frac{\partial}{\partial\pixit} (1-\yi)\log(1-\pixit)\frac{\partial\pixit}{\partial \thetab}\\
 & = &
- -\sum^n_{i=1} \frac{y^{(i)}}{\pi_{\bm{\theta}}(\xi)}\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}} - \frac{1-y^{(i)}}{1-\pi_{\bm{\theta}}(\xi)}\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}}\\
+ -\sumin \frac{\yi}{\pixit}\frac{\partial\pixit}{\partial \thetab} - \frac{1-\yi}{1-\pixit}\frac{\partial\pixit}{\partial \thetab}\\
 &=&
- -\sum^n_{i=1} \left(\frac{y^{(i)}}{\pi_{\bm{\theta}}(\xi)} - \frac{1-y^{(i)}}{1-\pi_{\bm{\theta}}(\xi)}\right)\frac{\partial \sigma(f(\xi, \bm{\theta}))}{\partial f(\xi, \bm{\theta})}\frac{\partial f(\xi, \bm{\theta})}{\partial\bm{\theta}}\\
+ -\sumin \left(\frac{\yi}{\pixit} - \frac{1-\yi}{1-\pixit}\right)\frac{\partial s(\fxit)}{\partial \fxit}\frac{\partial \fxit}{\partial\thetab}\\
 &=&
- -\sum^n_{i=1} \left(y^{(i)}(1-\pi_{\bm{\theta}}(\xi)) - (1-y^{(i)})\pi_{\bm{\theta}}(\xi) \right)\left(\xi\right)^\top.\\
+ -\sum^n_{i=1} \left(\yi(1-\pixit) - (1-\yi)\pixit \right)\left(\xi\right)^\top.\\
 \end{align*}
-\framebreak
+
+\framebreak
 \begin{align*}
 \quad &=&
- \sum^n_{i=1} \left(\pi_{\bm{\theta}}(\xi) - y^{(i)}\right)\left(\xi\right)^\top.\\
+ \sumin \left(\pixit - \yi\right)\left(\xi\right)^\top.\\
 \quad &=&
- \left(\pi_{\bm{\theta}}(\mathbf{X}) - \mathbf{y}\right)^\top\mathbf{X}\\
+ \left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)^\top\mathbf{X}\\
 \end{align*}
-where \\ $\mathbf{X} = \begin{pmatrix}
- \xi[1]^\top \\
- \vdots \\
- \xi[n]^\top
-\end{pmatrix} \in \mathbb{R}^{n\times d}, \mathbf{y} = \begin{pmatrix}
- \yi[1] \\
- \vdots \\
+where $\mathbf{X} = \left(
+ \xi[1], \dots,
+ \xi[n]\right)^\top \in \R^{n\times d}, \mathbf{y} = \left(
+ \yi[1], \dots,
 \yi[n]
-\end{pmatrix}, \pi_{\bm{\theta}}(\mathbf{X}) = \begin{pmatrix}
- \pi_{\bm{\theta}}(\xi[1]) \\
- \vdots \\
- \pi_{\bm{\theta}}(\xi[n])
-\end{pmatrix} \in \mathbb{R}^{n}$.
+\right)^\top,$ \\ $\pi(\mathbf{X}\vert\;\thetab) = \left(
+ \pixit[1], \dots,
+ \pixit[n]
+\right)^\top \in \R^{n}$.
 \vspace*{1cm}
-$\Rightarrow$ The gradient $\nabla_{\bm{\theta}}\riske = \left(\frac{\partial}{\partial\bm{\theta}}\riske\right)^\top = \mathbf{X}^\top\left(\pi_{\bm{\theta}}(\mathbf{X}) - \mathbf{y}\right)$
+$\Rightarrow$ The gradient $\nabla_{\thetab}\riske = \left(\frac{\partial}{\partial\thetab}\riske\right)^\top = \mathbf{X}^\top\left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)$
 \end{vbframe}
@@ -112,16 +109,16 @@
 We find the Hessian via differentiation, s.t.,
 \begin{align*}
- \nabla^2_{\bm{\theta}}\riske = \frac{\partial^2}{\partial{\bm{\theta}^\top}\partial\bm{\theta}}\riske & = &
- \frac{\partial}{\partial{\bm{\theta}^\top}} \sum^n_{i=1} \left(\pi_{\bm{\theta}}(\xi) - y^{(i)}\right)\left(\xi\right)^\top\\
+ \nabla^2_{\thetab}\riske = \frac{\partial^2}{\partial{\thetab^\top}\partial\thetab}\riske & = &
+ \frac{\partial}{\partial{\thetab^\top}} \sumin \left(\pixit - \yi\right)\left(\xi\right)^\top\\
 & = &
- \sum^n_{i=1}\xi \left(\pi_{\bm{\theta}}(\xi)(1-\pi_{\bm{\theta}}(\xi))\right)\left(\xi\right)^\top\\
+ \sum^n_{i=1}\xi \left(\pixit\left(1-\pixit\right)\right)\left(\xi\right)^\top\\
 & = &
 \mathbf{X}^\top \mathbf{D} \mathbf{X}\\
 \end{align*}
 where $\mathbf{D} \in \mathbb{R}^{n\times n}$ is a diagonal matrix with diagonal
-$$(\pi_{\bm{\theta}}(\xi[1])(1-\pi_{\bm{\theta}}(\xi[1]), \dots, \pi_{\bm{\theta}}(\xi[n])(1-\pi_{\bm{\theta}}(\xi[n])).$$
+$$\left(\pixit[1](1-\pixit[1]), \dots, \pixit[n](1-\pixit[n])\right).$$
 \end{vbframe}
@@ -130,16 +127,16 @@
 \vspace*{0.3cm}
 We define the diagonal matrix $\bar{\mathbf{D}} \in \mathbb{R}^{n \times n}$ with diagonal
-$$\left(\sqrt{\pi_{\bm{\theta}}(\xi[1])(1-\pi_{\bm{\theta}}(\xi[1])}, \dots, \sqrt{\pi_{\bm{\theta}}(\xi[n])(1-\pi_{\bm{\theta}}(\xi[n])}\right) $$
+$$\left(\sqrt{\pixit[1](1-\pixit[1])}, \dots, \sqrt{\pixit[n](1-\pixit[n])}\right) $$
-which is possible since $\pi_{\bm{\theta}}$ maps into (0, 1). \\
+which is possible since $\pi$ maps into (0, 1). \\
 \vspace*{0.3cm}
 With this, we get for any $\mathbf{w} \in \mathbb{R}^d$ that
-$$\mathbf{w}^\top \nabla^2_{\bm{\theta}}\riske \mathbf{w} = \mathbf{w}^\top \mathbf{X}^\top \bar{\mathbf{D}}^\top \bar{\mathbf{D}}\mathbf{X} \mathbf{w} = (\bar{\mathbf{D}}\mathbf{X} \mathbf{w})^\top\bar{\mathbf{D}}\mathbf{X} \mathbf{w} = \Vert \bar{\mathbf{D}}\mathbf{X} \mathbf{w} \Vert^2_2 \geq 0$$
+$$\mathbf{w}^\top \nabla^2_{\thetab}\riske \mathbf{w} = \mathbf{w}^\top \mathbf{X}^\top \bar{\mathbf{D}}^\top \bar{\mathbf{D}}\mathbf{X} \mathbf{w} = (\bar{\mathbf{D}}\mathbf{X} \mathbf{w})^\top\bar{\mathbf{D}}\mathbf{X} \mathbf{w} = \Vert \bar{\mathbf{D}}\mathbf{X} \mathbf{w} \Vert^2_2 \geq 0$$
 since obviously $\mathbf{D} = \bar{\mathbf{D}}^\top \bar{\mathbf{D}}.$ \\
 \vspace*{0.3cm}
-$\Rightarrow \nabla^2_{\bm{\theta}}\riske$ is positive semi-definite $\Rightarrow \riske$ is convex.
+$\Rightarrow \nabla^2_{\thetab}\riske$ is positive semi-definite $\Rightarrow \riske$ is convex.
 \end{vbframe}
diff --git a/slides/information-theory/slides-info-diffent.tex b/slides/information-theory/slides-info-diffent.tex
index 92c18c43..fd63c505 100644
--- a/slides/information-theory/slides-info-diffent.tex
+++ b/slides/information-theory/slides-info-diffent.tex
@@ -23,8 +23,7 @@
 $$ h(X) := h(f) := - \int_{\Xspace} f(x) \log(f(x)) dx $$
 \item The base of the log is again somewhat arbitrary, and we could either use 2 (and measure in bits) or e (to measure in nats).
 \item The integral above does not necessarily exist for all densities.
- \item Differential entropy lacks some properties of discrete entropy.
- \item $h(X) < 0$ is possible because $f(x) > 1$ is possible.
+ \item Differential entropy lacks the non-negativity property of discrete entropy: $h(X) < 0$ is possible because $f(x) > 1$ is possible.
 \end{itemize}
 \end{vbframe}
@@ -56,6 +55,7 @@
 \end{equation*}
 \framebreak
+$$ h(X) := - \int_{\Xspace} f(x) \log(f(x)) dx = \log(\sigma \sqrt{2\pi e})$$
 \begin{itemize}
 \item $h(X)$ is not a function of $\mu$ (see translation invariance).
 \item As $\sigma^2$ increases, the differential entropy also increases.
diff --git a/slides/information-theory/slides-info-entropy.tex b/slides/information-theory/slides-info-entropy.tex
index e09bfe9d..a3f239df 100644
--- a/slides/information-theory/slides-info-entropy.tex
+++ b/slides/information-theory/slides-info-entropy.tex
@@ -71,12 +71,12 @@
 \begin{vbframe}{Entropy Calculation}
 \begin{itemize}
- \item The negative log probabilities $\log_2 p(x)$ are called "Surprise".
+ \item The negative log probabilities $-\log_2 p(x)$ are called "Surprisal".
 \end{itemize}
 \begin{equation*}
 \begin{aligned}
- H(X) = - \E[\log_2(p(X))] &= -\sum_{x \in \Xspace} p(x) \log_2 p(x)}
+ H(X) = - \E[\log_2(p(X))] &= -\sum_{x \in \Xspace} p(x) \log_2 p(x)
 \end{aligned}
 \end{equation*}
@@ -92,6 +92,8 @@
 \begin{vbframe}{Entropy Properties}
+$$H(X) := H(p) = - \E[\log_2(p(X))] = -\sum_{x \in \Xspace} p(x) \log_2 p(x)$$
+
 We can directly note some basic properties:
 \vspace{0.2cm}
 \begin{enumerate}
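
The gradient and Hessian that the logreg deep-dive hunks above introduce can be sanity-checked numerically. Below is a minimal Python sketch, not part of the patch or of the slide material; the synthetic data, helper names, and tolerances are illustrative assumptions. It implements the empirical risk, the slide's gradient X^T(pi(X | theta) - y) and Hessian X^T D X, compares the gradient against central finite differences, and confirms the Hessian is positive semi-definite.

import numpy as np

rng = np.random.default_rng(0)

# Synthetic data: n observations with d features (illustrative only).
n, d = 200, 3
X = rng.normal(size=(n, d))                      # rows are the x^(i)
theta_true = np.array([1.5, -2.0, 0.5])
y = rng.binomial(1, 1.0 / (1.0 + np.exp(-X @ theta_true)))

def sigmoid(f):
    return 1.0 / (1.0 + np.exp(-f))

def risk(theta):
    # R_emp(theta) = -sum_i [ y_i log pi_i + (1 - y_i) log(1 - pi_i) ]
    pi = sigmoid(X @ theta)
    return -np.sum(y * np.log(pi) + (1 - y) * np.log(1 - pi))

def gradient(theta):
    # Slide result: nabla_theta R_emp = X^T (pi(X | theta) - y)
    pi = sigmoid(X @ theta)
    return X.T @ (pi - y)

def hessian(theta):
    # Slide result: nabla^2_theta R_emp = X^T D X with D = diag(pi_i (1 - pi_i))
    pi = sigmoid(X @ theta)
    return X.T @ np.diag(pi * (1 - pi)) @ X

theta0 = np.zeros(d)

# Central finite differences reproduce the closed-form gradient up to numerical noise.
eps = 1e-6
g_fd = np.array([(risk(theta0 + eps * e) - risk(theta0 - eps * e)) / (2 * eps)
                 for e in np.eye(d)])
print(np.max(np.abs(gradient(theta0) - g_fd)))   # tiny: the closed form matches

# The Hessian is positive semi-definite, so the risk is convex.
print(np.min(np.linalg.eigvalsh(hessian(theta0))) >= -1e-10)   # True

A few Newton steps, theta <- theta - H^(-1) g, on this sketch would then give the usual iteratively reweighted least squares fit for logistic regression.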
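The two entropy formulas touched by the information-theory hunks, H(X) = -sum_x p(x) log2 p(x) and, for X ~ N(mu, sigma^2), h(X) = log(sigma sqrt(2 pi e)), can be illustrated the same way. The short Python sketch below is again an illustrative assumption rather than part of the patch: a Monte Carlo estimate of -E[log f(X)] matches the closed form, is independent of mu, and a small enough sigma makes the differential entropy negative.

import numpy as np

rng = np.random.default_rng(1)

# Discrete entropy in bits: a fair coin carries exactly 1 bit.
p = np.array([0.5, 0.5])
print(-np.sum(p * np.log2(p)))                   # 1.0

# Differential entropy of N(mu, sigma^2) in nats: h(X) = log(sigma * sqrt(2*pi*e)).
mu, sigma = 3.0, 2.0
closed_form = np.log(sigma * np.sqrt(2 * np.pi * np.e))

# Monte Carlo estimate of -E[log f(X)] from samples of X.
x = rng.normal(mu, sigma, size=1_000_000)
log_f = -np.log(sigma * np.sqrt(2 * np.pi)) - 0.5 * ((x - mu) / sigma) ** 2
print(closed_form, -np.mean(log_f))              # both around 2.112, independent of mu

# With small variance the closed form is negative, so h(X) < 0 is indeed possible.
print(np.log(0.2 * np.sqrt(2 * np.pi * np.e)))   # about -0.19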