Updates from Overleaf
ludwigbothmann committed Nov 9, 2023
1 parent 285ae46 commit b13a310
Showing 4 changed files with 42 additions and 43 deletions.
2 changes: 1 addition & 1 deletion latex-math/basic-ml.tex
@@ -95,7 +95,7 @@
\newcommand{\pdf}{p} % p
\newcommand{\pdfx}{p(\xv)} % p(x)
\newcommand{\pixt}{\pi(\xv~|~ \thetab)} % pi(x|theta), pdf of x given theta
\newcommand{\pixit}{\pi\left(\xi ~|~ \thetab\right)} % pi(x^i|theta), pdf of x given theta
\newcommand{\pixit}[1][i]{\pi\left(\xi[#1] ~|~ \thetab\right)} % pi(x^i|theta), pdf of x given theta
\newcommand{\pixii}{\pi\left(\xi\right)} % pi(x^i), pdf of i-th x

% pdf of (x, y)
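
A short usage sketch of the updated macro (illustrative only; it simply spells out what the new optional argument does, assuming \xi[k] typesets the k-th observation as it does elsewhere in these slides):

% \pixit      expands to  \pi\left(\xi[i] ~|~ \thetab\right)   -- generic i-th observation (default index)
% \pixit[1]   expands to  \pi\left(\xi[1] ~|~ \thetab\right)   -- first observation
% \pixit[n]   expands to  \pi\left(\xi[n] ~|~ \thetab\right)   -- n-th observation
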
73 changes: 35 additions & 38 deletions slides/advriskmin/slides-advriskmin-logreg-deepdive.tex
@@ -34,75 +34,72 @@

\begin{vbframe}{Logistic regression: Problem}

Given $n \in \mathbb{N}$ observations $\left(\xi, \yi\right) \in \Xspace \times \Yspace$ with $\Xspace = \mathbb{R}^d, \Yspace = \{0, 1\}$ we want to minimize the following risk
Given $n \in \mathbb{N}$ observations $\left(\xi, \yi\right) \in \Xspace \times \Yspace$ with $\Xspace = \R^d, \Yspace = \{0, 1\}$ we want to minimize the following risk


\vspace*{-0.5cm}

\begin{eqnarray*}
\riske & = &
-\sum^n_{i=1} y^{(i)}\log(\pi_{\bm{\theta}}(\xi)) + (1-y^{(i)})\log(1-\pi_{\bm{\theta}}(\xi))
-\sum^n_{i=1} \yi\log\left(\pixit\right) + \left(1-\yi\right)\log\left(1-\pixit\right)
\end{eqnarray*}

with respect to $\bm{\theta}$ where the probabilistic classifier
with respect to $\thetab$ where the probabilistic classifier

\begin{eqnarray*}
\pi_{\bm{\theta}}(\xi) & = &
\sigma(f(\xi, \bm{\theta})),
\pixit & = &
s\left(\fxit\right),
\end{eqnarray*}

the sigmoid function $\sigma(f) = \frac{1}{1 + \exp(-f)}$ and the score $f(\xi, \bm{\theta}) = \bm{\theta}^\top \xi.$
the sigmoid function $s(f) = \frac{1}{1 + \exp(-f)}$ and the score $\fxit = \thx.$

\vspace*{0.5cm}

NB: Note that $\frac{\partial}{\partial f} \sigma(f) = \sigma(f)(1-\sigma(f))$ and $\frac{\partial f(\xi, \bm{\theta})}{\partial \bm{\theta}} = \left(\xi\right)^\top.$
NB: $\frac{\partial}{\partial f} s(f) = s(f)(1-s(f))$ and $\frac{\partial \fxit}{\partial \thetab} = \left(\xi\right)^\top.$

\end{vbframe}
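
A minimal numerical sketch of this objective (not part of the slides; the helper names sigmoid and emp_risk are made up here) that evaluates exactly this risk on a toy data set:

import numpy as np

def sigmoid(f):
    # s(f) = 1 / (1 + exp(-f))
    return 1.0 / (1.0 + np.exp(-f))

def emp_risk(theta, X, y):
    # R_emp(theta) = -sum_i [ y_i log(pi_i) + (1 - y_i) log(1 - pi_i) ]
    # with pi_i = s(theta^T x_i)
    pi = sigmoid(X @ theta)
    return -np.sum(y * np.log(pi) + (1 - y) * np.log(1 - pi))

# tiny example with n = 4 observations and d = 2 features
X = np.array([[1.0, 2.0], [0.5, -1.0], [-1.5, 0.3], [2.0, 1.0]])
y = np.array([1.0, 0.0, 0.0, 1.0])
theta = np.array([0.1, -0.2])
print(emp_risk(theta, X, y))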

\begin{vbframe}{Logistic regression: Gradient}

We find the gradient of logistic regression with the chain rule, i.e.,

\vspace*{-0.5cm}

\begin{align*}
\frac{\partial}{\partial\bm{\theta}}\riske & = &
-\sum^n_{i=1} \frac{\partial}{\partial \pi_{\bm{\theta}}(\xi)}y^{(i)}\log(\pi_{\bm{\theta}}(\xi))\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}} + \\
&& \frac{\partial}{\partial \pi_{\bm{\theta}}(\xi)}(1-y^{(i)})\log(1-\pi_{\bm{\theta}}(\xi))\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}}\\
\frac{\partial}{\partial\thetab}\riske & = &
-\sumin \frac{\partial}{\partial\pixit} \yi\log(\pixit)\frac{\partial\pixit}{\partial \thetab} + \\
&& \frac{\partial}{\partial\pixit} (1-\yi)\log(1-\pixit)\frac{\partial\pixit}{\partial \thetab}\\
& = &
-\sum^n_{i=1} \frac{y^{(i)}}{\pi_{\bm{\theta}}(\xi)}\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}} - \frac{1-y^{(i)}}{1-\pi_{\bm{\theta}}(\xi)}\frac{\partial \pi_{\bm{\theta}}(\xi)}{\partial \bm{\theta}}\\
-\sumin \frac{\yi}{\pixit}\frac{\partial\pixit}{\partial \thetab} - \frac{1-\yi}{1-\pixit}\frac{\partial\pixit}{\partial \thetab}\\
&=&
-\sum^n_{i=1} \left(\frac{y^{(i)}}{\pi_{\bm{\theta}}(\xi)} - \frac{1-y^{(i)}}{1-\pi_{\bm{\theta}}(\xi)}\right)\frac{\partial \sigma(f(\xi, \bm{\theta}))}{\partial f(\xi, \bm{\theta})}\frac{\partial f(\xi, \bm{\theta})}{\partial\bm{\theta}}\\
-\sumin \left(\frac{\yi}{\pixit} - \frac{1-\yi}{1-\pixit}\right)\frac{\partial s(\fxit)}{\partial \fxit}\frac{\partial \fxit}{\partial\thetab}\\
&=&
-\sum^n_{i=1} \left(y^{(i)}(1-\pi_{\bm{\theta}}(\xi)) - (1-y^{(i)})\pi_{\bm{\theta}}(\xi) \right)\left(\xi\right)^\top.\\
-\sum^n_{i=1} \left(\yi(1-\pixit) - (1-\yi)\pixit \right)\left(\xi\right)^\top.\\
\end{align*}

\framebreak


\framebreak
\begin{align*}
\quad &=&
\sum^n_{i=1} \left(\pi_{\bm{\theta}}(\xi) - y^{(i)}\right)\left(\xi\right)^\top.\\
\sumin \left(\pixit - \yi\right)\left(\xi\right)^\top.\\
\quad &=&
\left(\pi_{\bm{\theta}}(\mathbf{X}) - \mathbf{y}\right)^\top\mathbf{X}\\
\left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)^\top\mathbf{X}\\
\end{align*}

where \\ $\mathbf{X} = \begin{pmatrix}
\xi[1]^\top \\
\vdots \\
\xi[n]^\top
\end{pmatrix} \in \mathbb{R}^{n\times d}, \mathbf{y} = \begin{pmatrix}
\yi[1] \\
\vdots \\
\yi[n]
\end{pmatrix}, \pi_{\bm{\theta}}(\mathbf{X}) = \begin{pmatrix}
\pi_{\bm{\theta}}(\xi[1]) \\
\vdots \\
\pi_{\bm{\theta}}(\xi[n])
\end{pmatrix} \in \mathbb{R}^{n}$.
where $\mathbf{X} = \left(
\xi[1], \dots,
\xi[n]\right)^\top \in \R^{n\times d}, \mathbf{y} = \left(
\yi[1], \dots,
\yi[n]
\right)^\top,$ \\ $\pi(\mathbf{X}\vert\;\thetab) = \left(
\pixit[1], \dots,
\pixit[n]
\right)^\top \in \R^{n}$.

\vspace*{1cm}

$\Rightarrow$ The gradient $\nabla_{\bm{\theta}}\riske = \left(\frac{\partial}{\partial\bm{\theta}}\riske\right)^\top = \mathbf{X}^\top\left(\pi_{\bm{\theta}}(\mathbf{X}) - \mathbf{y}\right)$
$\Rightarrow$ The gradient $\nabla_{\thetab}\riske = \left(\frac{\partial}{\partial\thetab}\riske\right)^\top = \mathbf{X}^\top\left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)$

\end{vbframe}
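
To make the closed-form gradient $\mathbf{X}^\top\left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)$ plausible, here is a small finite-difference check (an illustrative sketch only, reusing the same made-up helper names as above):

import numpy as np

def sigmoid(f):
    return 1.0 / (1.0 + np.exp(-f))

def emp_risk(theta, X, y):
    pi = sigmoid(X @ theta)
    return -np.sum(y * np.log(pi) + (1 - y) * np.log(1 - pi))

def grad(theta, X, y):
    # nabla_theta R_emp = X^T (pi_theta(X) - y)
    return X.T @ (sigmoid(X @ theta) - y)

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))
y = (rng.random(20) < 0.5).astype(float)
theta = rng.normal(size=3)

# central finite differences as a sanity check
eps = 1e-6
num = np.array([
    (emp_risk(theta + eps * e, X, y) - emp_risk(theta - eps * e, X, y)) / (2 * eps)
    for e in np.eye(3)
])
print(np.allclose(num, grad(theta, X, y), atol=1e-5))  # expected: True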

@@ -112,16 +109,16 @@
We find the Hessian by differentiating the gradient once more, i.e.,

\begin{align*}
\nabla^2_{\bm{\theta}}\riske = \frac{\partial^2}{\partial{\bm{\theta}^\top}\partial\bm{\theta}}\riske & = &
\frac{\partial}{\partial{\bm{\theta}^\top}} \sum^n_{i=1} \left(\pi_{\bm{\theta}}(\xi) - y^{(i)}\right)\left(\xi\right)^\top\\
\nabla^2_{\thetab}\riske = \frac{\partial^2}{\partial{\thetab^\top}\partial\thetab}\riske & = &
\frac{\partial}{\partial{\thetab^\top}} \sumin \left(\pixit - \yi\right)\left(\xi\right)^\top\\
& = &
\sum^n_{i=1}\xi \left(\pi_{\bm{\theta}}(\xi)(1-\pi_{\bm{\theta}}(\xi))\right)\left(\xi\right)^\top\\
\sum^n_{i=1}\xi \left(\pixit\left(1-\pixit\right)\right)\left(\xi\right)^\top\\
& = &
\mathbf{X}^\top \mathbf{D} \mathbf{X}\\
\end{align*}

where $\mathbf{D} \in \mathbb{R}^{n\times n}$ is a diagonal matrix with diagonal
$$(\pi_{\bm{\theta}}(\xi[1])(1-\pi_{\bm{\theta}}(\xi[1]), \dots, \pi_{\bm{\theta}}(\xi[n])(1-\pi_{\bm{\theta}}(\xi[n])).$$
$$\left(\pixit[1]\left(1-\pixit[1]\right), \dots, \pixit[n]\left(1-\pixit[n]\right)\right).$$

\end{vbframe}
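
An analogous sketch for the Hessian (again illustrative, not from the repository): build $\mathbf{D}$ from $\pi_i(1-\pi_i)$ and compare $\mathbf{X}^\top \mathbf{D} \mathbf{X}$ against finite differences of the gradient.

import numpy as np

def sigmoid(f):
    return 1.0 / (1.0 + np.exp(-f))

def grad(theta, X, y):
    return X.T @ (sigmoid(X @ theta) - y)

def hessian(theta, X):
    # H = X^T D X with D = diag(pi_i (1 - pi_i)); y drops out of the Hessian
    pi = sigmoid(X @ theta)
    D = np.diag(pi * (1 - pi))
    return X.T @ D @ X

rng = np.random.default_rng(1)
X = rng.normal(size=(30, 4))
y = (rng.random(30) < 0.5).astype(float)
theta = rng.normal(size=4)

eps = 1e-6
num = np.column_stack([
    (grad(theta + eps * e, X, y) - grad(theta - eps * e, X, y)) / (2 * eps)
    for e in np.eye(4)
])
print(np.allclose(num, hessian(theta, X), atol=1e-4))  # expected: True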

Expand All @@ -130,16 +127,16 @@
\vspace*{0.3cm}

We define the diagonal matrix $\bar{\mathbf{D}} \in \mathbb{R}^{n \times n}$ with diagonal
$$\left(\sqrt{\pi_{\bm{\theta}}(\xi[1])(1-\pi_{\bm{\theta}}(\xi[1])}, \dots, \sqrt{\pi_{\bm{\theta}}(\xi[n])(1-\pi_{\bm{\theta}}(\xi[n])}\right) $$
which is possible since $\pi_{\bm{\theta}}$ maps into (0, 1). \\
$$\left(\sqrt{\pixit[1]\left(1-\pixit[1]\right)}, \dots, \sqrt{\pixit[n]\left(1-\pixit[n]\right)}\right) $$
which is possible since $\pi$ maps into (0, 1). \\
\vspace*{0.3cm}
With this, we get for any $\mathbf{w} \in \mathbb{R}^d$ that

$$\mathbf{w}^\top \nabla^2_{\bm{\theta}}\riske \mathbf{w} = \mathbf{w}^\top \mathbf{X}^\top \bar{\mathbf{D}}^\top \bar{\mathbf{D}}\mathbf{X} \mathbf{w} = (\bar{\mathbf{D}}\mathbf{X} \mathbf{w})^\top\bar{\mathbf{D}}\mathbf{X} \mathbf{w} = \Vert \bar{\mathbf{D}}\mathbf{X} \mathbf{w} \Vert^2_2 \geq 0$$
$$\mathbf{w}^\top \nabla^2_{\thetab}\riske \mathbf{w} = \mathbf{w}^\top \mathbf{X}^\top \bar{\mathbf{D}}^\top \bar{\mathbf{D}}\mathbf{X} \mathbf{w} = (\bar{\mathbf{D}}\mathbf{X} \mathbf{w})^\top\bar{\mathbf{D}}\mathbf{X} \mathbf{w} = \Vert \bar{\mathbf{D}}\mathbf{X} \mathbf{w} \Vert^2_2 \geq 0$$

since obviously $\mathbf{D} = \bar{\mathbf{D}}^\top \bar{\mathbf{D}}.$ \\
\vspace*{0.3cm}
$\Rightarrow \nabla^2_{\bm{\theta}}\riske$ is positive semi-definite $\Rightarrow \riske$ is convex.
$\Rightarrow \nabla^2_{\thetab}\riske$ is positive semi-definite $\Rightarrow \riske$ is convex.

\end{vbframe}
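
The positive semi-definiteness argument can be mirrored numerically in the same illustrative setting: with $\bar{\mathbf{D}}$ as defined above, the smallest eigenvalue of $\mathbf{X}^\top \mathbf{D} \mathbf{X}$ should be non-negative up to floating-point error.

import numpy as np

def sigmoid(f):
    return 1.0 / (1.0 + np.exp(-f))

rng = np.random.default_rng(2)
X = rng.normal(size=(30, 4))
theta = rng.normal(size=4)

pi = sigmoid(X @ theta)
Dbar = np.diag(np.sqrt(pi * (1 - pi)))   # possible since pi maps into (0, 1)
H = (Dbar @ X).T @ (Dbar @ X)            # equals X^T D X with D = Dbar^T Dbar

print(np.min(np.linalg.eigvalsh(H)) >= -1e-10)  # expected: True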

4 changes: 2 additions & 2 deletions slides/information-theory/slides-info-diffent.tex
@@ -23,8 +23,7 @@
$$ h(X) := h(f) := - \int_{\Xspace} f(x) \log(f(x)) dx $$
\item The base of the log is again somewhat arbitrary, and we could either use 2 (and measure in bits) or e (to measure in nats).
\item The integral above does not necessarily exist for all densities.
\item Differential entropy lacks some properties of discrete entropy.
\item $h(X) < 0$ is possible because $f(x) > 1$ is possible.
\item Differential entropy lacks the non-negativity property of discrete entropy: $h(X) < 0$ is possible because $f(x) > 1$ is possible.
\end{itemize}
\end{vbframe}
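
A tiny numerical illustration of that last bullet (my own sketch, not in the slides): for a uniform density on $(0, 1/2)$ we have $f(x) = 2 > 1$ on the support, and indeed $h(X) = \log(1/2) < 0$ (in nats).

import numpy as np

# Uniform(0, b): density f(x) = 1/b on (0, b), so h(X) = -∫ f log f dx = log(b)
b = 0.5
xs, dx = np.linspace(0.0, b, 100_000, endpoint=False, retstep=True)
f = np.full_like(xs, 1.0 / b)          # f(x) = 2 > 1 everywhere on the support
h_numeric = -np.sum(f * np.log(f)) * dx
print(h_numeric, np.log(b))            # both ≈ -0.693 nats, i.e. h(X) < 0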

@@ -56,6 +55,7 @@
\end{equation*}
\framebreak

$$ h(X) := - \int_{\Xspace} f(x) \log(f(x)) dx = \log(\sigma \sqrt{2\pi e})$$
\begin{itemize}
\item $h(X)$ is not a function of $\mu$ (see translation invariance).
\item As $\sigma^2$ increases, the differential entropy also increases.
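A quick sanity check of the closed form above (illustrative only; it compares $\log(\sigma \sqrt{2\pi e})$ with a Monte Carlo estimate of the negative expected log-density):

import numpy as np

mu, sigma = 3.0, 2.0
rng = np.random.default_rng(0)
x = rng.normal(mu, sigma, size=1_000_000)

log_f = -0.5 * np.log(2 * np.pi * sigma**2) - (x - mu)**2 / (2 * sigma**2)
h_mc = -log_f.mean()                              # Monte Carlo estimate of h(X)
h_closed = np.log(sigma * np.sqrt(2 * np.pi * np.e))
print(h_mc, h_closed)                             # both ≈ 2.112 nats; independent of mu
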
6 changes: 4 additions & 2 deletions slides/information-theory/slides-info-entropy.tex
@@ -71,12 +71,12 @@
\begin{vbframe}{Entropy Calculation}

\begin{itemize}
\item The negative log probabilities $\log_2 p(x)$ are called "Surprise".
\item The negative log probabilities $\log_2 p(x)$ are called "Surprisal".
\end{itemize}

\begin{equation*}
\begin{aligned}
H(X) = - \E[\log_2(p(X))] &= -\sum_{x \in \Xspace} p(x) \log_2 p(x)}
H(X) = - \E[\log_2(p(X))] &= -\sum_{x \in \Xspace} p(x) \log_2 p(x)
\end{aligned}
\end{equation*}
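
For concreteness (an illustrative sketch, not part of the slides), the sum can be evaluated for small discrete distributions; a fair coin yields exactly 1 bit:

import numpy as np

def entropy_bits(p):
    # H(X) = -sum_x p(x) log2 p(x); terms with p(x) = 0 contribute 0 by convention
    p = np.asarray(p, dtype=float)
    nz = p[p > 0]
    return -np.sum(nz * np.log2(nz))

print(entropy_bits([0.5, 0.5]))        # 1.0 bit (fair coin)
print(entropy_bits([0.9, 0.1]))        # ≈ 0.469 bits (less surprisal on average)
print(entropy_bits([1.0, 0.0]))        # 0.0 bits (no uncertainty)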

@@ -92,6 +92,8 @@

\begin{vbframe}{Entropy Properties}

$$H(X) := H(p) = - \E[\log_2(p(X))] = -\sum_{x \in \Xspace} p(x) \log_2 p(x)$$

We can directly note some basic properties:
\vspace{0.2cm}
\begin{enumerate}
