From b13a310355d7f6bc8b9eea62e04e53b7823f93a3 Mon Sep 17 00:00:00 2001
From: ludwigbothmann <46222472+ludwigbothmann@users.noreply.github.com>
Date: Thu, 9 Nov 2023 14:56:55 +0100
Subject: [PATCH] Updates from Overleaf

---
 latex-math/basic-ml.tex | 2 +-
 .../slides-advriskmin-logreg-deepdive.tex | 73 +++++++++----------
 .../slides-info-diffent.tex | 4 +-
 .../slides-info-entropy.tex | 6 +-
 4 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/latex-math/basic-ml.tex b/latex-math/basic-ml.tex
index 29b0d831..056796f0 100644
--- a/latex-math/basic-ml.tex
+++ b/latex-math/basic-ml.tex
@@ -95,7 +95,7 @@
 \newcommand{\pdf}{p} % p
 \newcommand{\pdfx}{p(\xv)} % p(x)
 \newcommand{\pixt}{\pi(\xv~|~ \thetab)} % pi(x|theta), pdf of x given theta
-\newcommand{\pixit}{\pi\left(\xi ~|~ \thetab\right)} % pi(x^i|theta), pdf of x given theta
+\newcommand{\pixit}[1][i]{\pi\left(\xi[#1] ~|~ \thetab\right)} % pi(x^i|theta), pdf of x given theta
 \newcommand{\pixii}{\pi\left(\xi\right)} % pi(x^i), pdf of i-th x
 % pdf of (x, y)
diff --git a/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex b/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex
index ed41d88b..74c41e91 100644
--- a/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex
+++ b/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex
@@ -34,31 +34,31 @@
 \begin{vbframe}{Logistic regression: Problem}
-Given $n \in \mathbb{N}$ observations $\left(\xi, \yi\right) \in \Xspace \times \Yspace$ with $\Xspace = \mathbb{R}^d, \Yspace = \{0, 1\}$ we want to minimize the following risk
+Given $n \in \mathbb{N}$ observations $\left(\xi, \yi\right) \in \Xspace \times \Yspace$ with $\Xspace = \R^d, \Yspace = \{0, 1\}$ we want to minimize the following risk
 \vspace*{-0.5cm}
 \begin{eqnarray*}
-\pixit[1]
 \riske & = &
--\sum^n_{i=1} y^{(i)}\log(\pi_{\bm{\theta}}(\xi)) + (1-y^{(i)})\log(1-\pi_{\bm{\theta}}(\xi))
+-\sum^n_{i=1} \yi\log\left(\pixit\right) + \left(1-\yi\right)\log\left(1-\pixit\right)
 \end{eqnarray*}
-with respect to $\bm{\theta}$ where the probabilistic classifier
+with respect to $\thetab$ where the probabilistic classifier
 \begin{eqnarray*}
- \pi_{\bm{\theta}}(\xi) & = &
- \sigma(f(\xi, \bm{\theta})),
+ \pixit & = &
+ s\left(\fxit\right),
 \end{eqnarray*}
-the sigmoid function $\sigma(f) = \frac{1}{1 + \exp(-f)}$ and the score $f(\xi, \bm{\theta}) = \bm{\theta}^\top \xi.$
+the sigmoid function $s(f) = \frac{1}{1 + \exp(-f)}$ and the score $\fxit = \thx.$
 \vspace*{0.5cm}
-NB: Note that $\frac{\partial}{\partial f} \sigma(f) = \sigma(f)(1-\sigma(f))$ and $\frac{\partial f(\xi, \bm{\theta})}{\partial \bm{\theta}} = \left(\xi\right)^\top.$
+NB: Note that $\frac{\partial}{\partial f} s(f) = s(f)(1-s(f))$ and $\frac{\partial \fxit}{\partial \thetab} = \left(\xi\right)^\top.$
 \end{vbframe}
+
 \begin{vbframe}{Logistic regression: Gradient}
 We find the gradient of logistic regression with the chain rule, s.t.,
@@ -66,43 +66,40 @@
 \vspace*{-0.5cm}
 \begin{align*}
- \frac{\partial}{\partial\bm{\theta}}\riske & = &
- -\sum^n_{i=1} \frac{\partial}{\partial \pi_{\bm{\theta}}(\xi)}y^{(i)}\log(\pi_{\bm{\theta}}(\xi))\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}} + \\
- && \frac{\partial}{\partial \pi_{\bm{\theta}}(\xi)}(1-y^{(i)})\log(1-\pi_{\bm{\theta}}(\xi))\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}}\\
+ \frac{\partial}{\partial\thetab}\riske & = &
+ -\sumin \frac{\partial}{\partial\pixit} \yi\log(\pixit)\frac{\partial\pixit}{\partial \thetab} + \\
+ && \frac{\partial}{\partial\pixit} (1-\yi)\log(1-\pixit)\frac{\partial\pixit}{\partial \thetab}\\
 & = &
- -\sum^n_{i=1} \frac{y^{(i)}}{\pi_{\bm{\theta}}(\xi)}\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}} - \frac{1-y^{(i)}}{1-\pi_{\bm{\theta}}(\xi)}\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}}\\
+ -\sumin \frac{\yi}{\pixit}\frac{\partial\pixit}{\partial \thetab} - \frac{1-\yi}{1-\pixit}\frac{\partial\pixit}{\partial \thetab}\\
 &=&
- -\sum^n_{i=1} \left(\frac{y^{(i)}}{\pi_{\bm{\theta}}(\xi)} - \frac{1-y^{(i)}}{1-\pi_{\bm{\theta}}(\xi)}\right)\frac{\partial \sigma(f(\xi, \bm{\theta}))}{\partial f(\xi, \bm{\theta})}\frac{\partial f(\xi, \bm{\theta})}{\partial\bm{\theta}}\\
+ -\sumin \left(\frac{\yi}{\pixit} - \frac{1-\yi}{1-\pixit}\right)\frac{\partial s(\fxit)}{\partial \fxit}\frac{\partial \fxit}{\partial\thetab}\\
 &=&
- -\sum^n_{i=1} \left(y^{(i)}(1-\pi_{\bm{\theta}}(\xi)) - (1-y^{(i)})\pi_{\bm{\theta}}(\xi) \right)\left(\xi\right)^\top.\\
+ -\sum^n_{i=1} \left(\yi(1-\pixit) - (1-\yi)\pixit \right)\left(\xi\right)^\top.\\
 \end{align*}
-\framebreak
+
+\framebreak
 \begin{align*}
 \quad &=&
- \sum^n_{i=1} \left(\pi_{\bm{\theta}}(\xi) - y^{(i)}\right)\left(\xi\right)^\top.\\
+ \sumin \left(\pixit - \yi\right)\left(\xi\right)^\top.\\
 \quad &=&
- \left(\pi_{\bm{\theta}}(\mathbf{X}) - \mathbf{y}\right)^\top\mathbf{X}\\
+ \left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)^\top\mathbf{X}\\
 \end{align*}
-where \\ $\mathbf{X} = \begin{pmatrix}
- \xi[1]^\top \\
- \vdots \\
- \xi[n]^\top
-\end{pmatrix} \in \mathbb{R}^{n\times d}, \mathbf{y} = \begin{pmatrix}
- \yi[1] \\
- \vdots \\
+where $\mathbf{X} = \left(
+ \xi[1], \dots,
+ \xi[n]\right)^\top \in \R^{n\times d}, \mathbf{y} = \left(
+ \yi[1], \dots,
 \yi[n]
-\end{pmatrix}, \pi_{\bm{\theta}}(\mathbf{X}) = \begin{pmatrix}
- \pi_{\bm{\theta}}(\xi[1]) \\
- \vdots \\
- \pi_{\bm{\theta}}(\xi[n])
-\end{pmatrix} \in \mathbb{R}^{n}$.
+\right)^\top,$ \\ $\pi(\mathbf{X}\vert\;\thetab) = \left(
+ \pixit[1], \dots,
+ \pixit[n]
+\right)^\top \in \R^{n}$.
 \vspace*{1cm}
-$\Rightarrow$ The gradient $\nabla_{\bm{\theta}}\riske = \left(\frac{\partial}{\partial\bm{\theta}}\riske\right)^\top = \mathbf{X}^\top\left(\pi_{\bm{\theta}}(\mathbf{X}) - \mathbf{y}\right)$
+$\Rightarrow$ The gradient $\nabla_{\thetab}\riske = \left(\frac{\partial}{\partial\thetab}\riske\right)^\top = \mathbf{X}^\top\left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)$
 \end{vbframe}
@@ -112,16 +109,16 @@
 We find the Hessian via differentiation, s.t.,
 \begin{align*}
- \nabla^2_{\bm{\theta}}\riske = \frac{\partial^2}{\partial{\bm{\theta}^\top}\partial\bm{\theta}}\riske & = &
- \frac{\partial}{\partial{\bm{\theta}^\top}} \sum^n_{i=1} \left(\pi_{\bm{\theta}}(\xi) - y^{(i)}\right)\left(\xi\right)^\top\\
+ \nabla^2_{\thetab}\riske = \frac{\partial^2}{\partial{\thetab^\top}\partial\thetab}\riske & = &
+ \frac{\partial}{\partial{\thetab^\top}} \sumin \left(\pixit - \yi\right)\left(\xi\right)^\top\\
 & = &
- \sum^n_{i=1}\xi \left(\pi_{\bm{\theta}}(\xi)(1-\pi_{\bm{\theta}}(\xi))\right)\left(\xi\right)^\top\\
+ \sum^n_{i=1}\xi \left(\pixit\left(1-\pixit\right)\right)\left(\xi\right)^\top\\
 & = &
 \mathbf{X}^\top \mathbf{D} \mathbf{X}\\
 \end{align*}
 where $\mathbf{D} \in \mathbb{R}^{n\times n}$ is a diagonal matrix with diagonal
-$$(\pi_{\bm{\theta}}(\xi[1])(1-\pi_{\bm{\theta}}(\xi[1]), \dots, \pi_{\bm{\theta}}(\xi[n])(1-\pi_{\bm{\theta}}(\xi[n])).$$
+$$\left(\pixit[1](1-\pixit[1]), \dots, \pixit[n](1-\pixit[n])\right).$$
 \end{vbframe}
@@ -130,16 +127,16 @@
 \vspace*{0.3cm}
 We define the diagonal matrix $\bar{\mathbf{D}} \in \mathbb{R}^{n \times n}$ with diagonal
-$$\left(\sqrt{\pi_{\bm{\theta}}(\xi[1])(1-\pi_{\bm{\theta}}(\xi[1])}, \dots, \sqrt{\pi_{\bm{\theta}}(\xi[n])(1-\pi_{\bm{\theta}}(\xi[n])}\right) $$
+$$\left(\sqrt{\pixit[1](1-\pixit[1])}, \dots, \sqrt{\pixit[n](1-\pixit[n])}\right) $$
-which is possible since $\pi_{\bm{\theta}}$ maps into (0, 1). \\
+which is possible since $\pi$ maps into (0, 1). \\
 \vspace*{0.3cm}
 With this, we get for any $\mathbf{w} \in \mathbb{R}^d$ that
-$$\mathbf{w}^\top \nabla^2_{\bm{\theta}}\riske \mathbf{w} = \mathbf{w}^\top \mathbf{X}^\top \bar{\mathbf{D}}^\top \bar{\mathbf{D}}\mathbf{X} \mathbf{w} = (\bar{\mathbf{D}}\mathbf{X} \mathbf{w})^\top\bar{\mathbf{D}}\mathbf{X} \mathbf{w} = \Vert \bar{\mathbf{D}}\mathbf{X} \mathbf{w} \Vert^2_2 \geq 0$$
+$$\mathbf{w}^\top \nabla^2_{\thetab}\riske \mathbf{w} = \mathbf{w}^\top \mathbf{X}^\top \bar{\mathbf{D}}^\top \bar{\mathbf{D}}\mathbf{X} \mathbf{w} = (\bar{\mathbf{D}}\mathbf{X} \mathbf{w})^\top\bar{\mathbf{D}}\mathbf{X} \mathbf{w} = \Vert \bar{\mathbf{D}}\mathbf{X} \mathbf{w} \Vert^2_2 \geq 0$$
 since obviously $\mathbf{D} = \bar{\mathbf{D}}^\top \bar{\mathbf{D}}.$ \\
 \vspace*{0.3cm}
-$\Rightarrow \nabla^2_{\bm{\theta}}\riske$ is positive semi-definite $\Rightarrow \riske$ is convex.
+$\Rightarrow \nabla^2_{\thetab}\riske$ is positive semi-definite $\Rightarrow \riske$ is convex.
 \end{vbframe}
diff --git a/slides/information-theory/slides-info-diffent.tex b/slides/information-theory/slides-info-diffent.tex
index 92c18c43..fd63c505 100644
--- a/slides/information-theory/slides-info-diffent.tex
+++ b/slides/information-theory/slides-info-diffent.tex
@@ -23,8 +23,7 @@
 $$ h(X) := h(f) := - \int_{\Xspace} f(x) \log(f(x)) dx $$
 \item The base of the log is again somewhat arbitrary, and we could either use 2 (and measure in bits) or e (to measure in nats).
 \item The integral above does not necessarily exist for all densities.
- \item Differential entropy lacks some properties of discrete entropy.
- \item $h(X) < 0$ is possible because $f(x) > 1$ is possible.
+ \item Differential entropy lacks the non-negativity property of discrete entropy: $h(X) < 0$ is possible because $f(x) > 1$ is possible.
 \end{itemize}
 \end{vbframe}
@@ -56,6 +55,7 @@
 \end{equation*}
 \framebreak
+$$ h(X) := - \int_{\Xspace} f(x) \log(f(x)) dx = \log(\sigma \sqrt{2\pi e})$$
 \begin{itemize}
 \item $h(X)$ is not a function of $\mu$ (see translation invariance).
 \item As $\sigma^2$ increases, the differential entropy also increases.
diff --git a/slides/information-theory/slides-info-entropy.tex b/slides/information-theory/slides-info-entropy.tex
index e09bfe9d..a3f239df 100644
--- a/slides/information-theory/slides-info-entropy.tex
+++ b/slides/information-theory/slides-info-entropy.tex
@@ -71,12 +71,12 @@
 \begin{vbframe}{Entropy Calculation}
 \begin{itemize}
- \item The negative log probabilities $\log_2 p(x)$ are called "Surprise".
+ \item The negative log probabilities $-\log_2 p(x)$ are called "Surprisal".
 \end{itemize}
 \begin{equation*}
 \begin{aligned}
- H(X) = - \E[\log_2(p(X))] &= -\sum_{x \in \Xspace} p(x) \log_2 p(x)}
+ H(X) = - \E[\log_2(p(X))] &= -\sum_{x \in \Xspace} p(x) \log_2 p(x)
 \end{aligned}
 \end{equation*}
@@ -92,6 +92,8 @@
 \begin{vbframe}{Entropy Properties}
+$$H(X) := H(p) = - \E[\log_2(p(X))] = -\sum_{x \in \Xspace} p(x) \log_2 p(x)$$
+
 We can directly note some basic properties:
 \vspace{0.2cm}
 \begin{enumerate}
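
The gradient and Hessian that the logreg deep-dive hunks above introduce can be sanity-checked numerically. Below is a minimal Python sketch, not part of the patch or of the slide material; the synthetic data, helper names, and tolerances are illustrative assumptions. It implements the empirical risk, the slide's gradient X^T(pi(X | theta) - y) and Hessian X^T D X, compares the gradient against central finite differences, and confirms the Hessian is positive semi-definite.

import numpy as np

rng = np.random.default_rng(0)

# Synthetic data: n observations with d features (illustrative only).
n, d = 200, 3
X = rng.normal(size=(n, d))                      # rows are the x^(i)
theta_true = np.array([1.5, -2.0, 0.5])
y = rng.binomial(1, 1.0 / (1.0 + np.exp(-X @ theta_true)))

def sigmoid(f):
    return 1.0 / (1.0 + np.exp(-f))

def risk(theta):
    # R_emp(theta) = -sum_i [ y_i log pi_i + (1 - y_i) log(1 - pi_i) ]
    pi = sigmoid(X @ theta)
    return -np.sum(y * np.log(pi) + (1 - y) * np.log(1 - pi))

def gradient(theta):
    # Slide result: nabla_theta R_emp = X^T (pi(X | theta) - y)
    pi = sigmoid(X @ theta)
    return X.T @ (pi - y)

def hessian(theta):
    # Slide result: nabla^2_theta R_emp = X^T D X with D = diag(pi_i (1 - pi_i))
    pi = sigmoid(X @ theta)
    return X.T @ np.diag(pi * (1 - pi)) @ X

theta0 = np.zeros(d)

# Central finite differences reproduce the closed-form gradient up to numerical noise.
eps = 1e-6
g_fd = np.array([(risk(theta0 + eps * e) - risk(theta0 - eps * e)) / (2 * eps)
                 for e in np.eye(d)])
print(np.max(np.abs(gradient(theta0) - g_fd)))   # tiny: the closed form matches

# The Hessian is positive semi-definite, so the risk is convex.
print(np.min(np.linalg.eigvalsh(hessian(theta0))) >= -1e-10)   # True

A few Newton steps, theta <- theta - H^(-1) g, on this sketch would then give the usual iteratively reweighted least squares fit for logistic regression.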
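The two entropy formulas touched by the information-theory hunks, H(X) = -sum_x p(x) log2 p(x) and, for X ~ N(mu, sigma^2), h(X) = log(sigma sqrt(2 pi e)), can be illustrated the same way. The short Python sketch below is again an illustrative assumption rather than part of the patch: a Monte Carlo estimate of -E[log f(X)] matches the closed form, is independent of mu, and a small enough sigma makes the differential entropy negative.

import numpy as np

rng = np.random.default_rng(1)

# Discrete entropy in bits: a fair coin carries exactly 1 bit.
p = np.array([0.5, 0.5])
print(-np.sum(p * np.log2(p)))                   # 1.0

# Differential entropy of N(mu, sigma^2) in nats: h(X) = log(sigma * sqrt(2*pi*e)).
mu, sigma = 3.0, 2.0
closed_form = np.log(sigma * np.sqrt(2 * np.pi * np.e))

# Monte Carlo estimate of -E[log f(X)] from samples of X.
x = rng.normal(mu, sigma, size=1_000_000)
log_f = -np.log(sigma * np.sqrt(2 * np.pi)) - 0.5 * ((x - mu) / sigma) ** 2
print(closed_form, -np.mean(log_f))              # both around 2.112, independent of mu

# With small variance the closed form is negative, so h(X) < 0 is indeed possible.
print(np.log(0.2 * np.sqrt(2 * np.pi * np.e)))   # about -0.19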