ridge updates
ludwigbothmann committed Dec 5, 2023
1 parent d8109de commit dfb9a21
Showing 2 changed files with 126 additions and 63 deletions.
168 changes: 106 additions & 62 deletions slides/information-theory/slides-info-kl.tex
@@ -71,11 +71,10 @@

\begin{vbframe} {KL-Divergence Example}

KL divergence of $p(x)=N(0,1)$ and $q(x)=LP(0, \sigma)$ for varying $\sigma$

KL divergence of $p(x)=N(0,1)$ and $q(x)=LP(0, \sigma)$ for varying $\sigma$ \\
\lz
\begin{figure}
\includegraphics[width = 12cm ]{figure/kl_norm_lp.png}
\includegraphics[width = 6cm ]{figure/kl_norm_lp_sigma.png}
\end{figure}
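For this example, the divergence also has a simple closed form (a short derivation sketch, assuming $LP(0, \sigma)$ denotes the Laplace distribution with density $q(x) = \frac{1}{2\sigma} \exp(-|x| / \sigma)$):
$$
D_{KL}(p \| q) = \E_p\left[\log p(X) - \log q(X)\right] = \log(2\sigma) - \frac{1}{2}\log(2\pi) - \frac{1}{2} + \frac{1}{\sigma}\sqrt{\frac{2}{\pi}},
$$
using $\E_p[X^2] = 1$ and $\E_p[|X|] = \sqrt{2/\pi}$ for $X \sim N(0, 1)$.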


@@ -100,8 +99,7 @@
\end{vbframe}

\begin{vbframe} {KL as Log-Difference}
Suppose that data is being generated from an unknown distribution $p(x)$.
Suppose we modeled $p(x)$ using an approximating distribution $q(x)$.
Suppose that data is being generated from an unknown distribution $p(x)$ and we model $p(x)$ using an approximating distribution $q(x)$.

\lz

@@ -112,6 +110,24 @@
This is why we integrate out with respect to the data distribution $p$.
A \enquote{good} approximation $q(x)$ should minimize the difference to $p(x)$.

\usetikzlibrary{shapes, arrows.meta}
\begin{center}
\begin{tikzpicture}[>=Stealth, node distance=3cm, rounded corners]

% Left box: the data-generating distribution
\node (leftbox) [draw, rounded corners, minimum width=3cm, minimum height=1.5cm] {$x \sim p(x)$};

% Arrow
\draw[->] (leftbox.east) -- node[above] {$x$} ++(2,0);

% Right box
\node (rightbox) [draw, rounded corners, minimum width=3cm, minimum height=1.5cm, right of=leftbox, node distance=5cm] {$\log p(x) - \log q(x)$};

\end{tikzpicture}
\end{center}
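The quantity on the right is evaluated for draws from $p$; averaging this log-difference over $p$ is exactly the KL divergence:
$$
D_{KL}(p \| q) = \E_{x \sim p}\left[\log p(x) - \log q(x)\right] = \int p(x) \log \frac{p(x)}{q(x)} \, dx.
$$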
\framebreak

Let $p(x)=N(0,1)$ and $q(x)=LP(0, 3)$. Observe
@@ -128,60 +144,69 @@

\framebreak

In machine learning, KL divergence is commonly used to quantify how different one distribution is from another.\\
\lz
\textbf{Example}:
Let $q(x)$ be a binomial distribution with $N = 2$ and $p = 0.3$ and let $p(x)$ be a discrete uniform distribution. Both distributions have the same support $\Xspace = \{0, 1, 2\}$.
%In machine learning, KL divergence is commonly used to quantify how different one distribution is from another.\\
% \lz
% \textbf{Example}:
% Let $q(x)$ be a binomial distribution with $N = 2$ and $p = 0.3$ and let $p(x)$ be a discrete uniform distribution. Both distributions have the same support $\Xspace = \{0, 1, 2\}$.

\begin{figure}
\includegraphics[width = 5cm ]{figure/kl_log_diff_plot.png}
\end{figure}
% \begin{figure}
% \includegraphics[width = 5cm ]{figure/kl_log_diff_plot.png}
% \end{figure}

\framebreak
% \framebreak

\begin{equation*}
\begin{split}
D_{KL}(p \| q) &= \sum_{x \in \Xspace} p(x) \log \left( \frac{p(x)}{q(x)} \right)
\\ &= 0.333 \log \left( \frac{0.333}{0.49} \right) + 0.333 \log \left( \frac{0.333}{0.42} \right) + 0.333 \log \left( \frac{0.333}{0.09} \right) \\ &= 0.23099 \text{ (nats)}
\end{split}
\end{equation*}
% \begin{equation*}
% \begin{split}
% D_{KL}(p \| q) &= \sum_{x \in \Xspace} p(x) \log \left( \frac{p(x)}{q(x)} \right)
% \\ &= 0.333 \log \left( \frac{0.333}{0.49} \right) + 0.333 \log \left( \frac{0.333}{0.42} \right) + 0.333 \log \left( \frac{0.333}{0.09} \right) \\ &= 0.23099 \text{ (nats)}
% \end{split}
% \end{equation*}

\begin{equation*}
\begin{split}
D_{KL}(q \| p) &= \sum_{x \in \Xspace} q(x) \log \left( \frac{q(x)}{p(x)} \right)
\\ &= 0.49 \log \left( \frac{0.49}{0.333} \right) + 0.42 \log \left( \frac{0.42}{0.333} \right) + 0.09 \log \left( \frac{0.09}{0.333} \right) \\ &= 0.16801 \text{ (nats)}
\end{split}
\end{equation*}
% \begin{equation*}
% \begin{split}
% D_{KL}(q \| p) &= \sum_{x \in \Xspace} q(x) \log \left( \frac{q(x)}{p(x)} \right)
% \\ &= 0.49 \log \left( \frac{0.49}{0.333} \right) + 0.42 \log \left( \frac{0.42}{0.333} \right) + 0.09 \log \left( \frac{0.09}{0.333} \right) \\ &= 0.16801 \text{ (nats)}
% \end{split}
% \end{equation*}

Again, note that $D_{KL}(p \| q) \neq D_{KL}(q \| p)$.
\end{vbframe}
% Again, note that $D_{KL}(p \| q) \neq D_{KL}(q \| p)$.
\end{vbframe}

\begin{vbframe} {KL in Fitting}

Because KL quantifies the difference between distributions, it can be used as a loss function to find a good fit for the observed data.
In machine learning, KL divergence is commonly used to quantify how different one distribution is from another.\\

Because KL quantifies the difference between distributions, it can be used as a loss function between distributions. %to find a good fit for the observed data. \\
\lz

In our example, we can identify an optimal $\sigma$ which minimizes the KL.
\begin{figure}
\centering
\scalebox{0.9}{\includegraphics{figure_man/binom1.png}}
\tiny{\\ Credit: Will Kurt}
\caption{ \footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable $X$ which takes values between 0 and 10. \textit{Right}: The KL divergence between the observed data and Binom(10,p) is minimized when $p \approx 0.57$.}}
\includegraphics[width = 6cm ]{figure/kl_norm_lp_sigma.png}
\end{figure}

{\tiny Will Kurt (2017): Kullback-Leibler Divergence Explained.
\emph{\url{https://www.countbayesie.com/blog/2017/5/9/kullback-leibler-divergence-explained}}\par}
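Using the closed form $D_{KL}(p \| q) = \log(2\sigma) - \frac{1}{2}\log(2\pi) - \frac{1}{2} + \frac{1}{\sigma}\sqrt{2/\pi}$ derived earlier (a short sketch, assuming $LP(0, \sigma)$ is the Laplace distribution), the optimal $\sigma$ follows by setting the derivative w.r.t. $\sigma$ to zero:
$$
\frac{1}{\sigma} - \frac{1}{\sigma^2}\sqrt{\frac{2}{\pi}} = 0 \quad \Longrightarrow \quad \sigma^{\ast} = \sqrt{\frac{2}{\pi}} \approx 0.80,
$$
which is where the KL, viewed as a function of $\sigma$, attains its minimum.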

% \begin{figure}
% \centering
% \scalebox{0.9}{\includegraphics{figure_man/binom1.png}}
% \tiny{\\ Credit: Will Kurt}
% \caption{ \footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable $X$ which takes values between 0 and 10. \textit{Right}: The KL divergence between the observed data and Binom(10,p) is minimized when $p \approx 0.57$.}}
% \end{figure}

% {\tiny Will Kurt (2017): Kullback-Leibler Divergence Explained.
% \emph{\url{https://www.countbayesie.com/blog/2017/5/9/kullback-leibler-divergence-explained}}\par}

\framebreak

Because KL quantifies the difference between distributions, it can be used as a loss function to find a good fit for the observed data.
% Because KL quantifies the difference between distributions, it can be used as a loss function to find a good fit for the observed data.

\begin{figure}
\centering
\scalebox{0.9}{\includegraphics{figure_man/binom2.png}}
\tiny{\\ Credit: Will Kurt}
\caption{\footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable which takes values between 0 and 10. \textit{Right}: Fitted Binomial distribution ($p \approx 0.57$).}}
\end{figure}
% \begin{figure}
% \centering
% \scalebox{0.9}{\includegraphics{figure_man/binom2.png}}
% \tiny{\\ Credit: Will Kurt}
% \caption{\footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable which takes values between 0 and 10. \textit{Right}: Fitted Binomial distribution ($p \approx 0.57$).}}
% \end{figure}

On the right is the Binomial distribution that minimizes the KL divergence.
% On the right is the Binomial distribution that minimizes the KL divergence.
\end{vbframe}

\begin{vbframe}{KL as Likelihood Ratio}
@@ -191,33 +216,52 @@
\item How do we usually do that in stats? Likelihood ratio!
\end{itemize}

$$ LR = \frac{p(x)}{q(x)} $$
$$ LR = \prod_i \frac{p(\xi)}{q(\xi)} \qquad LLR = \sum_i \log \frac{p(\xi)}{q(\xi)} $$

In the above, if for $x$ we have $LR>1$, then $p$ seems better, for $LR < 1$ $q$ seems better.
If $p(\xi)/q(\xi) > 1$ for a sample $\xi$, then $p$ seems better; if $p(\xi)/q(\xi) < 1$, then $q$ seems better. \\
\begin{itemize}
\item Let us assume that our data already come from $p$. It no longer makes sense to ask whether $p$ or $q$ fits the data better.
\item So we ask instead: \enquote{If we sample many data points from $p$, how easily can we see, on average, that $p$ is better than $q$ through the LR?}
\end{itemize}
$$ \E_p \left[\log \frac{p(X)}{q(X)}\right] $$
That expected LLR is really KL!
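A one-line justification of this (a sketch, via the law of large numbers): for $\xi \overset{iid}{\sim} p$, the per-sample average of the LLR satisfies
$$
\frac{1}{n} LLR = \frac{1}{n} \sum_{i=1}^n \log \frac{p(\xi)}{q(\xi)} \;\overset{n \to \infty}{\longrightarrow}\; \E_p \left[\log \frac{p(X)}{q(X)}\right] = D_{KL}(p \| q).
$$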

\framebreak
\end{vbframe}

Or we can compute LR for a complete set of data (as always, logs make our life easier):
$$ LR = \prod_i \frac{p(\xi)}{q(\xi)} \qquad LLR = \sum_i \log \frac{p(\xi)}{q(\xi)} $$
Now let us assume that our data already come from $p$. It does not really make sense anymore to ask
whether $p$ or $q$ fit the data better.
% \begin{vbframe}{KL as likelihood ratio}

But maybe we want to pose the question "How different is $q$ from $p$?" by formulating it as:
"If we sample many data from $p$, how easily can we see that $p$ is better than $q$ through LR, on average?"
$$ \E_p \left[\log \frac{p(X)}{q(X)}\right] $$
That expected LR is really KL!
% \begin{itemize}
% \item Let us assume we have some data and want to figure out whether $p(x)$ or $q(x)$ matches it better.
% \item How do we usually do that in stats? Likelihood ratio!
% \end{itemize}

\framebreak
% $$ LR = \frac{p(x)}{q(x)} $$

% In the above, if for $x$ we have $LR>1$, then $p$ seems better, for $LR < 1$ $q$ seems better.

In summary we could say for KL:
\begin{itemize}
\item It measures how much \enquote{evidence} each sample provides, on average, to distinguish $p$ from $q$ when sampling from $p$.
\item If $p$ and $q$ are very similar, most samples will not help much, and vice versa for very different distributions.
\item In practice, we often want to make the approximation $q$ as indistinguishable from the true $p$ (our data) as possible. That is exactly what we did when fitting $\sigma$ above (the log-difference perspective).
\end{itemize}
% \framebreak

\end{vbframe}
% Or we can compute LR for a complete set of data (as always, logs make our life easier):
% $$ LR = \prod_i \frac{p(\xi)}{q(\xi)} \qquad LLR = \sum_i \log \frac{p(\xi)}{q(\xi)} $$
% Now let us assume that our data already come from $p$. It does not really make sense anymore to ask
% whether $p$ or $q$ fit the data better.

% But maybe we want to pose the question "How different is $q$ from $p$?" by formulating it as:
% "If we sample many data from $p$, how easily can we see that $p$ is better than $q$ through LR, on average?"
% $$ \E_p \left[\log \frac{p(X)}{q(X)}\right] $$
% That expected LR is really KL!

% \framebreak

% In summary we could say for KL:
% \begin{itemize}
% \item It measures how much "evidence" each sample provides on average to distinguish $p$ from $q$,
% if you sample from $p$.
% \item If $p$ and $q$ are very similar, most samples will not help much, and vice versa for very different distributions.
% \item In practice, we often want to make the approximation $q$ as indistinguishable from the real $p$ (our data) as possible. We already did that when we fitted (in our log-difference perspective).
% \end{itemize}

% \end{vbframe}


%\begin{vbframe} {Kullback-Leibler Divergence - Summary}
21 changes: 20 additions & 1 deletion slides/regularization/slides-regu-ridge-deepdive.tex
@@ -37,9 +37,28 @@
=\sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2
$$
}
\normalsize{Thus the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$ is $\thetah_{\text{Ridge}}$.}
\normalsize{$\Longrightarrow$ $\thetah_{\text{Ridge}}$ is the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$!}
%$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$
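For completeness, a standard augmentation that produces exactly this objective (a sketch; assumed to match the definition of $\tilde{\Xmat}, \tilde{\yv}$ given earlier on this slide):
$$
\tilde{\Xmat} = \begin{pmatrix} \Xmat \\ \sqrt{\lambda}\, \id_p \end{pmatrix}, \qquad \tilde{\yv} = \begin{pmatrix} \yv \\ \bm{0} \end{pmatrix}
\quad \Longrightarrow \quad
\Vert \tilde{\yv} - \tilde{\Xmat} \thetab \Vert_2^2 = \Vert \yv - \Xmat \thetab \Vert_2^2 + \lambda \Vert \thetab \Vert_2^2.
$$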
\end{vbframe}

\begin{vbframe}{Another perspective on $L2$ regularization}
Now consider perturbed features $\tilde{\xi} := \xi + \bm{\delta}_i$, where $\bm{\delta}_i \overset{iid}{\sim} (\bm{0}, \frac{\lambda}{n} \id_p)$, i.e., i.i.d. with mean $\bm{0}$ and covariance $\frac{\lambda}{n} \id_p$; no parametric family is assumed. For a single observation $(\bm{x}, y)$, we want to minimize the expected squared error, taken w.r.t. the random perturbations:
$$\riskt:= \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\tilde{\bm{x}})^2] = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}(\bm{x}+\bm{\delta}))^2]$$
Expanding, we obtain
$$\riskt = \mathbb{E}_{\bm{\delta}}[(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\bm{\delta}(y-\thetab^{\top}\bm{x}) + \thetab^{\top}\bm{\delta}\bm{\delta}^{\top}\thetab]$$

Using linearity of expectation together with $\mathbb{E}_{\bm{\delta}}[\bm{\delta}]=\bm{0}$ and $\mathbb{E}_{\bm{\delta}}[\bm{\delta}\bm{\delta}^{\top}]=\frac{\lambda}{n} \id_p$, we get
$$\riskt=(y-\thetab^{\top}\bm{x})^2 - 2 \thetab^{\top}\mathbb{E}[\bm{\delta}](y-\thetab^{\top}\bm{x}) + \thetab^{\top}\mathbb{E}[\bm{\delta}\bm{\delta}^{\top}]\thetab = (y-\thetab^{\top}\bm{x})^2+\frac{\lambda}{n} \Vert \thetab \Vert_2^2$$

Summed over the $n$ samples, this is exactly the Ridge regression objective with regularization strength $\lambda$.\\
$\Longrightarrow$ Minimizing the expected squared loss under the noise distribution is Ridge regression on the unperturbed features $\xi$!
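Making the last step explicit (using the notation from above):
$$
\sum_{i=1}^n \mathbb{E}_{\bm{\delta}}\left[\left(\yi - \thetab^{\top}(\xi + \bm{\delta}_i)\right)^2\right]
= \sum_{i=1}^n \left(\yi - \thetab^{\top} \xi\right)^2 + \lambda \Vert \thetab \Vert_2^2,
$$
since each of the $n$ observations contributes a penalty of $\frac{\lambda}{n} \Vert \thetab \Vert_2^2$.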

\end{vbframe}






\endlecture
\end{document}
