diff --git a/slides/information-theory/slides-info-kl.tex b/slides/information-theory/slides-info-kl.tex
index 1304232b..7120ed85 100644
--- a/slides/information-theory/slides-info-kl.tex
+++ b/slides/information-theory/slides-info-kl.tex
@@ -71,11 +71,10 @@
 \begin{vbframe} {KL-Divergence Example}
-KL divergence of $p(x)=N(0,1)$ and $q(x)=LP(0, \sigma)$ for varying $\sigma$
-
+KL divergence between $p(x)=N(0,1)$ and $q(x)=LP(0, \sigma)$ for varying $\sigma$ \\
+\lz
 \begin{figure}
 \includegraphics[width = 12cm ]{figure/kl_norm_lp.png}
-\includegraphics[width = 6cm ]{figure/kl_norm_lp_sigma.png}
 \end{figure}
@@ -100,8 +99,7 @@
 \end{vbframe}
 
 \begin{vbframe} {KL as Log-Difference}
-Suppose that data is being generated from an unknown distribution $p(x)$.
-Suppose we modeled $p(x)$ using an approximating distribution $q(x)$.
+Suppose that data is being generated from an unknown distribution $p(x)$ and we model $p(x)$ using an approximating distribution $q(x)$.
 
 \lz
@@ -112,6 +110,24 @@
 This is why we integrate out with respect to the data distribution $p$.
 A \enquote{good} approximation $q(x)$ should minimize the difference to $p(x)$.
+\usetikzlibrary{shapes, arrows.meta} % preferably loaded once in the preamble
+\begin{center}
+\begin{tikzpicture}[>=Stealth, rounded corners]
+
+  % Left box: a sample x drawn from the data distribution p(x)
+  \node (leftbox) [draw, rounded corners, minimum width=3cm, minimum height=1.5cm] {$x \sim p(x)$};
+
+  % Arrow: the sample is passed on
+  \draw[->] (leftbox.east) -- node[above] {$x$} ++(2,0);
+
+  % Right box: the pointwise log-difference evaluated at x
+  \node (rightbox) [draw, rounded corners, minimum width=3cm, minimum height=1.5cm, right of=leftbox, node distance=5cm] {$\log p(x) - \log q(x)$};
+
+\end{tikzpicture}
+\end{center}
 
 \framebreak
 Let $p(x)=N(0,1)$ and $q(x)=LP(0, 3)$. Observe
@@ -128,60 +144,69 @@
 
 \framebreak
 
-In machine learning, KL divergence is commonly used to quantify how different one distribution is from another.\\
-\lz
-\textbf{Example}:
-Let $q(x)$ be a binomial distribution with $N = 2$ and $p = 0.3$ and let $p(x)$ be a discrete uniform distribution. Both distributions have the same support $\Xspace = \{0, 1, 2\}$.
+%In machine learning, KL divergence is commonly used to quantify how different one distribution is from another.\\
+% \lz
+% \textbf{Example}:
+% Let $q(x)$ be a binomial distribution with $N = 2$ and $p = 0.3$ and let $p(x)$ be a discrete uniform distribution. Both distributions have the same support $\Xspace = \{0, 1, 2\}$.
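+% Editorial sketch (added): numeric illustration of the pointwise log-difference for
+% $p(x)=N(0,1)$ and $q(x)=LP(0,3)$; the evaluation points $x=0$ and $x=3$ are chosen for illustration only.
+With $\log p(x) = -\frac{1}{2}\log(2\pi) - \frac{x^2}{2}$ and $\log q(x) = -\log 6 - \frac{|x|}{3}$, we get, e.g.,
+$$\log p(0) - \log q(0) \approx -0.92 + 1.79 = 0.87, \qquad \log p(3) - \log q(3) \approx -5.42 + 2.79 = -2.63.$$
+Near the mode, $p$ assigns more log-density; in the tails, the heavier-tailed $q$ does. $D_{KL}(p \| q)$ averages exactly these log-differences over draws $x \sim p(x)$.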
-\begin{figure}
-\includegraphics[width = 5cm ]{figure/kl_log_diff_plot.png}
-\end{figure}
+% \begin{figure}
+% \includegraphics[width = 5cm ]{figure/kl_log_diff_plot.png}
+% \end{figure}
 
-\framebreak
+% \framebreak
 
-\begin{equation*}
-  \begin{split}
-    D_{KL}(p \| q) &= \sum_{x \in \Xspace} p(x) \log \left( \frac{p(x)}{q(x)} \right)
-    \\ &= 0.333 \log \left( \frac{0.333}{0.49} \right) + 0.333 \log \left( \frac{0.333}{0.42} \right) + 0.333 \log \left( \frac{0.333}{0.09} \right) \\ &= 0.23099 \text{ (nats)}
-  \end{split}
-\end{equation*}
+% \begin{equation*}
+%   \begin{split}
+%     D_{KL}(p \| q) &= \sum_{x \in \Xspace} p(x) \log \left( \frac{p(x)}{q(x)} \right)
+%     \\ &= 0.333 \log \left( \frac{0.333}{0.49} \right) + 0.333 \log \left( \frac{0.333}{0.42} \right) + 0.333 \log \left( \frac{0.333}{0.09} \right) \\ &= 0.23099 \text{ (nats)}
+%   \end{split}
+% \end{equation*}
 
-\begin{equation*}
-  \begin{split}
-    D_{KL}(q \| p) &= \sum_{x \in \Xspace} q(x) \log \left( \frac{q(x)}{p(x)} \right)
-    \\ &= 0.49 \log \left( \frac{0.49}{0.333} \right) + 0.42 \log \left( \frac{0.42}{0.333} \right) + 0.09 \log \left( \frac{0.09}{0.333} \right) \\ &= 0.16801 \text{ (nats)}
-  \end{split}
-\end{equation*}
+% \begin{equation*}
+%   \begin{split}
+%     D_{KL}(q \| p) &= \sum_{x \in \Xspace} q(x) \log \left( \frac{q(x)}{p(x)} \right)
+%     \\ &= 0.49 \log \left( \frac{0.49}{0.333} \right) + 0.42 \log \left( \frac{0.42}{0.333} \right) + 0.09 \log \left( \frac{0.09}{0.333} \right) \\ &= 0.16801 \text{ (nats)}
+%   \end{split}
+% \end{equation*}
 
-Again, note that $D_{KL}(p \| q) \neq D_{KL}(q \| p)$.
-\end{vbframe}
+% Again, note that $D_{KL}(p \| q) \neq D_{KL}(q \| p)$.
+ \end{vbframe}
 
 \begin{vbframe} {KL in Fitting}
-Because KL quantifies the difference between distributions, it can be used as a loss function to find a good fit for the observed data.
+In machine learning, KL divergence is commonly used to quantify how different one distribution is from another.\\
+
+Because KL quantifies the difference between distributions, it can be used as a loss function for fitting one distribution to another. %to find a good fit for the observed data. \\
+\lz
+In our example, we can identify the optimal $\sigma$ that minimizes the KL divergence.
 \begin{figure}
-  \centering
-  \scalebox{0.9}{\includegraphics{figure_man/binom1.png}}
-  \tiny{\\ Credit: Will Kurt}
-  \caption{ \footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable $X$ which takes values between 0 and 10. \textit{Right}: The KL divergence between the observed data and Binom(10,p) is minimized when $p \approx 0.57$.}}
+\includegraphics[width = 6cm ]{figure/kl_norm_lp_sigma.png}
 \end{figure}
-{\tiny Will Kurt (2017): Kullback-Leibler Divergence Explained.
-\emph{\url{https://www.countbayesie.com/blog/2017/5/9/kullback-leibler-divergence-explained}}\par}
+
+% \begin{figure}
+%   \centering
+%   \scalebox{0.9}{\includegraphics{figure_man/binom1.png}}
+%   \tiny{\\ Credit: Will Kurt}
+%   \caption{ \footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable $X$ which takes values between 0 and 10. \textit{Right}: The KL divergence between the observed data and Binom(10,p) is minimized when $p \approx 0.57$.}}
+% \end{figure}
+
+% {\tiny Will Kurt (2017): Kullback-Leibler Divergence Explained.
+% \emph{\url{https://www.countbayesie.com/blog/2017/5/9/kullback-leibler-divergence-explained}}\par}
 
 \framebreak
 
-Because KL quantifies the difference between distributions, it can be used as a loss function to find a good fit for the observed data.
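+% Editorial sketch (added): closed-form check of the optimal sigma, assuming the figure
+% kl_norm_lp_sigma.png plots $D_{KL}(p \| q)$ for $p(x)=N(0,1)$ and $q(x)=LP(0, \sigma)$ as above.
+For $p(x)=N(0,1)$ and $q(x)=LP(0,\sigma)$, the KL divergence has a closed form:
+$$ D_{KL}(p \| q) = \E_p\left[\log \frac{p(X)}{q(X)}\right]
+ = -\frac{1}{2}\log(2\pi) - \frac{1}{2} + \log(2\sigma) + \frac{\E_p|X|}{\sigma}
+ = \log(2\sigma) + \frac{\sqrt{2/\pi}}{\sigma} + \text{const.} $$
+Setting the derivative w.r.t. $\sigma$ to zero yields the minimizer $\sigma^* = \sqrt{2/\pi} \approx 0.8$.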
+% Because KL quantifies the difference between distributions, it can be used as a loss function to find a good fit for the observed data.
 
-\begin{figure}
-  \centering
-  \scalebox{0.9}{\includegraphics{figure_man/binom2.png}}
-  \tiny{\\ Credit: Will Kurt}
-  \caption{\footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable which takes values between 0 and 10. \textit{Right}: Fitted Binomial distribution ($p \approx 0.57$).}}
-\end{figure}
+% \begin{figure}
+%   \centering
+%   \scalebox{0.9}{\includegraphics{figure_man/binom2.png}}
+%   \tiny{\\ Credit: Will Kurt}
+%   \caption{\footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable which takes values between 0 and 10. \textit{Right}: Fitted Binomial distribution ($p \approx 0.57$).}}
+% \end{figure}
 
-On the right is the Binomial distribution that minimizes the KL divergence.
+% On the right is the Binomial distribution that minimizes the KL divergence.
 \end{vbframe}
 
 \begin{vbframe}{KL as likelihood ratio}
@@ -191,33 +216,52 @@
 \item How do we usually do that in stats? Likelihood ratio!
 \end{itemize}
 
-$$ LR = \frac{p(x)}{q(x)} $$
+$$ LR = \prod_i \frac{p(\xi)}{q(\xi)} \qquad LLR = \sum_i \log \frac{p(\xi)}{q(\xi)} $$
 
-In the above, if for $x$ we have $LR>1$, then $p$ seems better, for $LR < 1$ $q$ seems better.
+If $p(\xi)/q(\xi) > 1$, observation $\xi$ favors $p$; if $p(\xi)/q(\xi) < 1$, it favors $q$. \\
+\begin{itemize}
+ \item Let us assume that our data already come from $p$. Then it no longer makes sense to ask whether $p$ or $q$ fits the data better.
+ \item So we ask instead: \enquote{If we sample many observations from $p$, how easily can we see, on average, that $p$ is better than $q$ through the LLR?}
+\end{itemize}
+By the law of large numbers, the average LLR per observation converges to
+$$ \E_p \left[\log \frac{p(X)}{q(X)}\right] $$
+That expected LLR is exactly the KL divergence!
 
-\framebreak
+\end{vbframe}
 
-Or we can compute LR for a complete set of data (as always, logs make our life easier):
-$$ LR = \prod_i \frac{p(\xi)}{q(\xi)} \qquad LLR = \sum_i \log \frac{p(\xi)}{q(\xi)} $$
-Now let us assume that our data already come from $p$. It does not really make sense anymore to ask
-whether $p$ or $q$ fit the data better.
+% \begin{vbframe}{KL as likelihood ratio}
 
-But maybe we want to pose the question "How different is $q$ from $p$?" by formulating it as:
-"If we sample many data from $p$, how easily can we see that $p$ is better than $q$ through LR, on average?"
-$$ \E_p \left[\log \frac{p(X)}{q(X)}\right] $$
-That expected LR is really KL!
+% \begin{itemize}
+% \item Let us assume we have some data and want to figure out whether $p(x)$ or $q(x)$ matches it better.
+% \item How do we usually do that in stats? Likelihood ratio!
+% \end{itemize}
 
-\framebreak
+% $$ LR = \frac{p(x)}{q(x)} $$
+
+% In the above, if for $x$ we have $LR>1$, then $p$ seems better, for $LR < 1$ $q$ seems better.
 
-In summary we could say for KL:
-\begin{itemize}
-\item It measures how much "evidence" each sample provides on average to distinguish $p$ from $q$,
-  if you sample from $p$.
-\item If $p$ and $q$ are very similar, most samples will not help much, and vice versa for very different distributions.
-\item In practice, we often want to make the approximation $q$ as indistinguishable from the real $p$ (our data) as possible. We already did that when we fitted (in our log-difference perspective).
-\end{itemize}
+
-\end{vbframe}
+% Or we can compute LR for a complete set of data (as always, logs make our life easier):
+% $$ LR = \prod_i \frac{p(\xi)}{q(\xi)} \qquad LLR = \sum_i \log \frac{p(\xi)}{q(\xi)} $$
+% Now let us assume that our data already come from $p$. It does not really make sense anymore to ask
+% whether $p$ or $q$ fit the data better.
+
+% But maybe we want to pose the question "How different is $q$ from $p$?" by formulating it as:
+% "If we sample many data from $p$, how easily can we see that $p$ is better than $q$ through LR, on average?"
+% $$ \E_p \left[\log \frac{p(X)}{q(X)}\right] $$
+% That expected LR is really KL!
+
+% \framebreak
+
+% In summary we could say for KL:
+% \begin{itemize}
+% \item It measures how much "evidence" each sample provides on average to distinguish $p$ from $q$,
+% if you sample from $p$.
+% \item If $p$ and $q$ are very similar, most samples will not help much, and vice versa for very different distributions.
+% \item In practice, we often want to make the approximation $q$ as indistinguishable from the real $p$ (our data) as possible. We already did that when we fitted (in our log-difference perspective).
+% \end{itemize}
+
+% \end{vbframe}
 
 %\begin{vbframe} {Kullback-Leibler Divergence - Summary}
diff --git a/slides/regularization/slides-regu-ridge-deepdive.tex b/slides/regularization/slides-regu-ridge-deepdive.tex
index b123ade8..b4dba618 100644
--- a/slides/regularization/slides-regu-ridge-deepdive.tex
+++ b/slides/regularization/slides-regu-ridge-deepdive.tex
@@ -37,9 +37,28 @@
 =\sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2
 $$
 }
-\normalsize{Thus the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$ is $\thetah_{\text{Ridge}}$.}
+\normalsize{$\Longrightarrow$ $\thetah_{\text{Ridge}}$ is the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$!}
 %$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$
 \end{vbframe}
+\begin{vbframe}{Another perspective on $L2$}
+Now consider perturbed features $\tilde{\xi} := \xi + \bm{\delta}_i$, where $\bm{\delta}_i \overset{iid}{\sim} (\bm{0},\frac{\lambda}{n} \id_p)$, i.e., only the mean and covariance of the perturbations are specified; no parametric family is assumed. We want to minimize the expected squared error w.r.t. the random perturbations:
+$$\riskt := \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\tilde{\bm{x}})^2] = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}(\bm{x}+\bm{\delta}))^2]$$
+Expanding, we obtain
+$$\riskt = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\bm{\delta}(y-\thetab^{\top}\bm{x}) + \thetab^{\top}\bm{\delta}\bm{\delta}^{\top}\thetab]$$
+
+Using linearity of expectation, $\mathbb{E}_{\bm{\delta}}[\bm{\delta}]=\bm{0}$, and $\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]=\frac{\lambda}{n} \id_p$, we get
+$$\riskt=(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\mathbb{E}[\bm{\delta}](y-\thetab^{\top}\bm{x}) + \thetab^{\top}\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]\thetab = (y-\thetab^{\top}\bm{x})^2+\frac{\lambda}{n} \Vert \thetab \Vert_2^2$$
+
+Summed over the $n$ samples, this is exactly the Ridge regression objective with regularization strength $\lambda$:
+$$\sumin \mathbb{E}_{\bm{\delta}_i}\left[(\yi - \thetab^{\top}\tilde{\xi})^2\right] = \sumin \left(\yi - \thetab^{\top}\xi\right)^2 + \lambda \Vert \thetab \Vert_2^2$$
+$\Longrightarrow$ Minimizing the expected squared loss under random feature noise is equivalent to Ridge regression on the unperturbed features $\xi$!
+
+\end{vbframe}
+
+
+
+
+
+
 \endlecture
 \end{document}