From 5841b102ad74624ed5d2c72c9ff5abd484dc2113 Mon Sep 17 00:00:00 2001 From: ludwigbothmann <46222472+ludwigbothmann@users.noreply.github.com> Date: Wed, 29 Nov 2023 15:03:57 +0100 Subject: [PATCH] Updates from Overleaf --- slides/information-theory/chapter-order.tex | 5 +- .../slides-info-cross-entropy-kld.tex | 21 +- slides/information-theory/slides-info-ml.tex | 33 +- .../slides-info-mutual-info.tex | 540 +---------------- .../slides-info-mutual-info2.tex | 562 ++++++++++++++++++ 5 files changed, 593 insertions(+), 568 deletions(-) create mode 100644 slides/information-theory/slides-info-mutual-info2.tex diff --git a/slides/information-theory/chapter-order.tex b/slides/information-theory/chapter-order.tex index 7cd6b11c..ced3d924 100644 --- a/slides/information-theory/chapter-order.tex +++ b/slides/information-theory/chapter-order.tex @@ -25,7 +25,10 @@ \subsection{Cross-Entropy, KL and Source Coding} \subsection{Information Theory for Machine Learning} \includepdf[pages=-]{../slides-pdf/slides-info-ml.pdf} -\subsection{Joint Entropy and Mutual Information} +\subsection{Joint Entropy and Mutual Information I} \includepdf[pages=-]{../slides-pdf/slides-info-mutual-info.pdf} + +\subsection{Joint Entropy and Mutual Information II} +\includepdf[pages=-]{../slides-pdf/slides-info-mutual-info2.pdf} \subsection{Entropy and Optimal Code Length} diff --git a/slides/information-theory/slides-info-cross-entropy-kld.tex b/slides/information-theory/slides-info-cross-entropy-kld.tex index 350e1225..42bcf13b 100644 --- a/slides/information-theory/slides-info-cross-entropy-kld.tex +++ b/slides/information-theory/slides-info-cross-entropy-kld.tex @@ -21,28 +21,29 @@ \begin{vbframe} {Cross-Entropy - Discrete Case} \textbf{Cross-entropy} measures the average amount of information required to represent an event from one distribution $p$ using a predictive scheme based on another distribution $q$ (assume they have the same domain $\Xspace$ as in KL). 
- $$ H_q(p) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) = - \mathbb{E}_{X\sim p}[\log(q(X))]$$ - + $$ H(p \| q) = \sum_{x \in \Xspace} p(x) \log\left(\frac{1}{q(x)}\right) = - \sum_{x \in \Xspace} p(x) \log\left(q(x)\right) = - \mathbb{E}_{X\sim p}[\log(q(X))]$$ +For now, we accept the formula as-is. More on the underlying intuition follows in the content on information theory for ML and source coding. \begin{itemize} -\setlength{\itemsep}{1.2em} +\setlength{\itemsep}{0.9em} \item Entropy = Avg. amount of information if we optimally encode $p$ \item Cross-Entropy = Avg. amount of information if we suboptimally encode $p$ with $q$ \item $DL_ {KL}(p \| q)$: Difference between the two +\item $H(p \| q)$ sometimes also denoted as $H_{q}(p)$ to set it apart from KL \end{itemize} -\lz +\framebreak We can summarize this also through this identity: \lz $$ -H_q(p) = H(p) + D_{KL}(p \| q) +H(p \| q) = H(p) + D_{KL}(p \| q) $$ This is because: \begin{eqnarray*} H(p) + D_{KL}(p \| q) &=& - \sum_{x \in \Xspace} p(x) \log p(x) + \sum_{x \in \Xspace} p(x) \log \frac{p(x)}{q(x)} \\ &=& \sum_{x \in \Xspace} p(x) (-\log p(x) + \log p(x) - \log q(x)) \\ -&=& - \sum_{x \in \Xspace} p(x) \log q(x) = H_q(p) \\ +&=& - \sum_{x \in \Xspace} p(x) \log q(x) = H(p \| q) \\ \end{eqnarray*} \framebreak @@ -52,11 +53,11 @@ For continuous density functions $p(x)$ and $q(x)$: -$$ H_q(p) = \int p(x) \log\left(\frac{1}{q(x)}\right) dx = - \int p(x) \log\left(q(x)\right) dx = - \mathbb{E}_{X \sim p}[\log(q(X))]$$ +$$ H(p \| q) = \int p(x) \log\left(\frac{1}{q(x)}\right) dx = - \int p(x) \log\left(q(x)\right) dx = - \mathbb{E}_{X \sim p}[\log(q(X))]$$ \begin{itemize} \item It is not symmetric. -\item As for the discrete case, $H_q(p) = h(p) + D_{KL}(p \| q)$ holds. +\item As for the discrete case, $H(p \| q) = h(p) + D_{KL}(p \| q)$ holds. \item Can now become negative, as the $h(p)$ can be negative! 
\end{itemize} \end{vbframe} @@ -103,10 +104,10 @@ % $$ H_p(q) = \sum_{x \in \Xspace} q(x) \log_2\left(\frac{1}{p(x)}\right) = - \sum_{x \in \Xspace} q(x) log_2(p(x))$$ % \item For probability densities $p(x)$ and $q(x)$, it is: % $$ H_p(q) = \int_{\Xspace} q(x) \ln\left(\frac{1}{p(x)}\right) dx = - \int_{\Xspace} q(x) \ln\left(p(x)\right) dx $$ -% \item It is not symmetric: $ H_p(q) \neq H_q(p)$. +% \item It is not symmetric: $ H_p(q) \neq H(p \| q)$. % \item Relationship to KL divergence: % \begin{align*} -% H_q(p) &= H(p) + D_{KL}(p \| q) \\ +% H(p \| q) &= H(p) + D_{KL}(p \| q) \\ % H_p(q) &= H(q) + D_{KL}(q \| p) % \end{align*} % \item It is non-negative. If the two distributions are the same, cross-entropy equals entropy and KL divergence is zero. diff --git a/slides/information-theory/slides-info-ml.tex b/slides/information-theory/slides-info-ml.tex index d3851a7c..7ea1cd92 100644 --- a/slides/information-theory/slides-info-ml.tex +++ b/slides/information-theory/slides-info-ml.tex @@ -23,27 +23,26 @@ \begin{vbframe}{KL vs Maximum Likelihood} Minimizing KL between the true distribution $p(x)$ and approximating model $q(x|\thetab)$ is equivalent to maximizing the log-likelihood. \begin{align*} - D_{KL}(p \| q_{\thetab})) &= \E_{x \sim p} \left[ \log \frac{p(x)}{q(x|\thetab)}\right] \\ - &= \E_{x \sim p} \log p(x) - \E_{x \sim p} \log q(x|\thetab) + D_{KL}(p \| q_{\thetab}) &= \E_{X \sim p} \left[ \log \frac{p(x)}{q(x|\thetab)}\right] \\ + &= \E_{X \sim p} \log p(x) - \E_{X \sim p} \log q(x|\thetab) \end{align*} - The first term above does not depend on $\thetab$. Therefore, + The first term above does not depend on $\thetab$ and the second term can be defined as the cross-entropy. 
Therefore, \begin{align*} - \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) &= \argmin_{\thetab} -\E_{x \sim p} \log q(x|\thetab)\\ - &= \argmax_{\thetab} \E_{x \sim p} \log q(x|\thetab) + \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) &= \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab)\\ + &= \argmax_{\thetab} \E_{X \sim p} \log q(x|\thetab) \end{align*} For a finite dataset of $n$ samples from $p$, this is approximated as - $$\argmax_{\thetab} \E_{x \sim p} \log q(x|\thetab) \approx \argmax_{\thetab} \frac{1}{n} \sumin \log q(\xi|\thetab)\,.$$ + $$\argmax_{\thetab} \E_{X \sim p} \log q(x|\thetab) \approx \argmax_{\thetab} \frac{1}{n} \sumin \log q(\xi|\thetab)\,.$$ % This demonstrates that density estimation and optimal coding are closely related. If the estimated distribution is different from the true one, any code based on the estimated distribution will necessarily be suboptimal (in terms of the expected length of \enquote{messages} from the true distribution). \end{vbframe} \begin{vbframe}{KL vs Cross-Entropy} From this here we can actually see much more: -$$ \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) = \argmin_{\thetab} -\E_{x \sim p} \log q(x|\thetab) = \argmin_{\thetab} H_{q_{\thetab}}(p) $$ +$$ \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) = \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab) = \argmin_{\thetab} H(p \| q_{\thetab}) $$ \begin{itemize} - \item So minimizing with respect to KL is the same as minimizing with respect to cross-entropy! - \item That implies minimizing with respect to cross-entropy is the same as maximum likelihood! - \item Remember, how we only characterized cross-entropy through source coding / bits? We could now motivate cross-entropy as the "relevant" term that you have to minimize, when you minimize KL - after you drop $\E_p \log p(x)$, which is simply the neg. entropy H(p)! + \item So minimizing w.r.t. KL is the same as minimizing with respect to cross-entropy, which implies minimizing w.r.t. 
cross-entropy is the same as maximum likelihood! + \item Remember how we only characterized cross-entropy through bits? We could now motivate cross-entropy as the ``relevant'' term that you have to minimize, when you minimize KL - after you drop $\E_p \log p(x)$, which is simply the neg. entropy $-H(p)$! \item Or we could say: Cross-entropy between $p$ and $q$ is simply the expected negative log-likelihood of $q$, when our data comes from $p$! \end{itemize} \end{vbframe} @@ -67,14 +66,14 @@ To train the model, we minimize KL between $d^{(i)}$ and $\pi(\xv^{(i)}|\thetab)$ : -$$ \argmin_{\thetab} \sum_{i=1}^n D_{KL} (d^{(i)} \| \pi(\xv^{(i)}|\thetab)) = \argmin_{\thetab} \sum_{i=1}^n H_{\pi(\xv^{(i)}|\thetab)}(d^{(i)}) $$ +$$ \argmin_{\thetab} \sum_{i=1}^n D_{KL} (d^{(i)} \| \pi(\xv^{(i)}|\thetab)) = \argmin_{\thetab} \sum_{i=1}^n H(d^{(i)} \| \pi(\xv^{(i)}|\thetab)) $$ % where the entropy $H(d^{(i)})$ was dropped because it is not a function of $\thetab$. We see that this is equivalent to log-loss risk minimization! \begin{footnotesize} \begin{equation*} \begin{split} - R &= \sumin H_{\pi_k(\xv^{(i)}|\thetab)}(d^{(i)}) \\ + R &= \sumin H(d^{(i)} \| \pi(\xv^{(i)}|\thetab)) \\ &= \sumin \left( - \sum_k d^{(i)}_k \log\pi_k(\xv^{(i)}|\thetab) \right) \\ & = \sumin \underbrace{ \left( -\sum_{k = 1}^g [\yi = k]\log \pi_{k}(\xv^{(i)}|\thetab) \right) }_{\text{log loss}} \\ & = \sumin (-\log\pi_{y^{(i)}}(\xv^{(i)}|\thetab)) @@ -106,7 +105,7 @@ \lz - If $p$ represents a $\text{Ber}(y)$ distribution (so deterministic, where the true label receives probability mass 1) and we also interpret $\pix$ as a Bernoulli distribution $\text{Ber}(\pix)$, the Bernoulli loss $L(y,\pix)$ is the cross-entropy $H_{\pix}(p)$. + If $p$ represents a $\text{Ber}(y)$ distribution (so deterministic, where the true label receives probability mass 1) and we also interpret $\pix$ as a Bernoulli distribution $\text{Ber}(\pix)$, the Bernoulli loss $L(y,\pix)$ is the cross-entropy $H(p \| \pix)$. 
% \item If $\hat{y}$ is a Bernoulli random variable with distribution defined by $\pi(x)$, $L(y,\pix)$ is the cross-entropy $H_{\hat{y}}(y)$. % \item For a given training set with $n$ samples, the cost function is computed by taking the average of all the cross-entropies in the sample % $$-\frac{1}{n} \sum_{i=1}^{n}\left[\yi \log \pi(\xi)+\left(1-\yi\right) \log \left(1-\pi(\xi)\right)\right].$$ @@ -118,13 +117,9 @@ What is the (average) risk of that minimal constant model? -\framebreak - \begin{align*} - \risk &= \frac{1}{n} \sumin \left( -\sumkg [\yi = k]\log \pik \right) \\ - &= - \frac{1}{n} \sumkg \sumin [\yi = k]\log \pik \\ - &= -\sumkg \frac{n_k}{n}\log \pik \\ - &= -\sumkg \pi_k \log \pik = H(\pi) + \risk &= \frac{1}{n} \sumin \left( -\sumkg [\yi = k]\log \pik \right) = - \frac{1}{n} \sumkg \sumin [\yi = k]\log \pik \\ + &= -\sumkg \frac{n_k}{n}\log \pik = -\sumkg \pi_k \log \pik = H(\pi) \end{align*} So entropy is the (average) risk of the optimal "observed class frequency" model under log-loss! 
diff --git a/slides/information-theory/slides-info-mutual-info.tex b/slides/information-theory/slides-info-mutual-info.tex index c68e03f3..3e22f9db 100644 --- a/slides/information-theory/slides-info-mutual-info.tex +++ b/slides/information-theory/slides-info-mutual-info.tex @@ -15,7 +15,7 @@ \begin{document} -\lecturechapter{Joint Entropy and Mutual Information} +\lecturechapter{Joint Entropy and Mutual Information I} \lecture{Introduction to Machine Learning} @@ -34,7 +34,7 @@ % $$ H(X_1, X_2, \ldots, X_n) = - \sum_{x_1 \in \Xspace_1} \ldots \sum_{x_n \in \Xspace_n} p(x_1,x_2, \ldots, x_n) \log_2(p(x_1,x_2, \ldots, x_n)) $$ % \end{footnotesize} \item For continuous random variables $X$ and $Y$ with joint density $p(x,y)$, the differential joint entropy is:\\ - $$ h(X,Y) = - \int_{\Xspace,\Yspace} p(x,y) \log p(x,y) dx dy$$ + $$ h(X,Y) = - \int_{\Xspace \times \Yspace} p(x,y) \log p(x,y) dx dy$$ \end{itemize} \begin{footnotesize} @@ -283,542 +283,6 @@ \end{vbframe} -\begin{vbframe}{Mutual Information - Corollaries} - -\small - -\textbf{Non-negativity of mutual information:} For any two random variables, $X$, $Y$, $ I(X;Y) \geq 0$, with equality if and only if $X$ and $Y$ are independent. - -\lz - -\textbf{Proof:}$\quad I(X ; Y)=D_{KL}(p(x, y) \| p(x) p(y)) \geq 0,$ with equality if and only if $p(x, y)=p(x) p(y)$ (i.e., $X$ and $Y$ are independent). - -\lz - -\textbf{Conditioning reduces entropy (information can't hurt):} - -$$H(X | Y) \leq H(X),$$ -with equality if and only if $X$ and $Y$ are independent. - -\lz - -\textbf{Proof:}$\quad 0 \leq I(X ; Y)=H(X)-H(X | Y)$ - -Intuitively, the theorem says that knowing another random variable $Y$ can only reduce the uncertainty in $X$. Note that this is true only on the average. 
- -\framebreak - -% \textbf{Corollary:} - -% \footnotesize -% \begin{equation*} -% \begin{aligned} -% D_{KL}(p(y | x) \| q(y | x)) &= \sum_x p(x) \sum_y p(y|x) \log\frac{p(y|x)}{q(y|x)} \\ -% &= \E_{p(x,y)} \left[ \log\frac{p(Y|X)}{q(Y|X)}\right] \\ -% &\geq 0 -% \end{aligned} -% \end{equation*} -% \normalsize - -% with equality if and only if $p(y | x)=q(y | x)$ for all $y$ and $x$ such that $p(x)>0$. - -% In the continuous case with density functions $f$, $g$ and support set $S$ we have: - -% \begin{equation*} -% \begin{aligned} -% D_{KL}(f \| g) \geq 0, -% \end{aligned} -% \end{equation*} - -% with equality if and only if $f$ and $g$ are equal almost everywhere. - -% \framebreak - -% \textbf{Proof:} - -% \footnotesize -% \begin{equation*} -% \begin{aligned} -% -D_{KL}(f \| g) &= \int_{S} f \log \frac{g}{f} \\ -% &\leq \log \int_{S} f \frac{g}{f} \\ -% &= \log \int_{S} g \\ -% &\leq \log 1 = 0 -% \end{aligned} -% \end{equation*} -% \normalsize - -% \lz - -% \textbf{Corollary:}$\quad I(X ; Y | Z) \geq 0$, with equality if and only if $X$ and $Y$ are conditionally independent given $Z$, where \textbf{conditional mutual information} is defined as - -% \footnotesize -% \begin{equation*} -% \begin{aligned} -% I(X; Y | Z) &= H(X | Z) - H(X | Y, Z) \\ -% &= \E_{p(x,y,z)} \left[ \log\frac{p(X,Y|Z)}{p(X|Z)p(Y|Z)}\right]. 
-% \end{aligned} -% \end{equation*} -% \normalsize - -% \lz - -%left out Theorem 2.6.4 - - -\framebreak - -\textbf{Independence bound on entropy:} Let $X_{1}, X_{2}, \ldots, X_{n}$ be drawn according to $p\left(x_{1}, x_{2}, \ldots, x_{n}\right) .$ Then - -\footnotesize -$$H\left(X_{1}, X_{2}, \ldots, X_{n}\right) \leq \sum_{i=1}^{n} H\left(X_{i}\right),$$ -\normalsize - -with equality if and only if the $X_{i}$ are independent.\\ - -\lz - -\textbf{Proof:} With the chain rule for entropies, - -\footnotesize -\begin{equation*} -\begin{aligned} -H\left(X_{1}, X_{2}, \ldots, X_{n}\right) &=\sum_{i=1}^{n} H\left(X_{i} | X_{i-1}, \ldots, X_{1}\right) -&\leq \sum_{i=1}^{n} H\left(X_{i}\right), -\end{aligned} -\end{equation*} -\normalsize - -where the inequality follows directly from above. We have equality if and only if $X_{i}$ is independent of $X_{i-1}, \ldots, X_{1}$ for all $i$ (i.e., if and only if the $X_{i}$ 's are independent). - - -\end{vbframe} - -\begin{vbframe} {Mutual information Properties} - -%%The reduction of uncertainty in $Y$ after \textit{learning} $X$ is called \textbf{mutual information} - -%By symmetry and since $H(X,Y) = H(X) + H(Y|X)$, it also follows that - -%$$ -%I(Y;X) := H(Y) - H(Y|X) = H(Y) + H(X) - H(X, Y). -%$$ - -%\textbf{Remarks:} -%\begin{itemize} -%\item The mutual information is symmetric, i. e. $I(Y;X) = I(X;Y)$. -%\item It describes the amount of information about one random variable obtained through the other one (\textbf{information gain}). -%\end{itemize} - -%\begin{figure} -% \includegraphics{figure_man/mutualinformation.pdf} -%\end{figure} - -% \framebreak - -% Mutual information can be used to perform \textbf{feature selection}. Quite simply, each variable $X_i$ is rated according to $I(X_i;Y)$: The more information we gain on $Y$ by observing $X_i$, the more "useful" $X_i$. - -% \lz - -% Let $\D = \Dset$ and $\D \{\cdot\}$ a subset of $\D$ for which condition $\cdot$ is fulfilled. 
Then, \textbf{information gain} is defined as: - -% \footnotesize -% \begin{equation*} -% \begin{aligned} -% IG(\D, s) &= I(X_s;Y) \\ -% &= H(Y) - H(Y|X_s) \\ -% &= - \sum_{y \in Y} p(y) \log_2 p(y) + \sum_{x \in X_s} \sum_{y \in Y} p(x,y) \log_2 p(y|x) \\ -% &= - \sum_{y \in Y} \frac{|\D\{Y = y\}|}{|\D|} \log_2 \sum_{y \in Y} \frac{|\D\{Y = y\}|}{|\D|} \\ &+ -% \sum_{x \in X_s} \sum_{y \in Y} \frac{|\D\{Y = y, X_s = x\}|}{|\D|} \log_2 \frac{|\D\{Y = y, X_s = x\}|}{|\D\{X_s = x\}|}. -% \end{aligned} -% \end{equation*} -% \normalsize - -% \framebreak - -\begin{itemize} - % \item Intuitively, mutual information quantifies the amount of shared information between variables. - \item MI is a measure of the amount of "dependence" between variables. It is zero if and only if the variables are independent. - \item On the other hand, if one of the variables is a deterministic function of the other, the mutual information is maximal, i.e. entropy of the first. - \item Unlike (Pearson) correlation, mutual information is not limited to real-valued random variables. - \item Mutual information can be used to perform \textbf{feature selection}. Quite simply, each variable $X_i$ is rated according to $I(X_i;Y)$, this is sometimes called information gain. - \item The same principle can also be used in decision trees to select a feature to split on. Splitting on MI/IG is then equivalent to risk reduction with log-loss. -\end{itemize} -\end{vbframe} - -\begin{vbframe} {Mutual information properties} -\begin{itemize} - \item MI is invariant w.r.t. 
injective reparametrizations that are in $\mathcal{C}^1:$\\ - \medskip - Let $f, g: \R^d\rightarrow\R^d \in \mathcal{C}^1$ be injective transformations and $X, Y$ be continuous random variables in $\mathbb{R}^d$ then by the change of variables the joint and marginal densities of $\tilde{X} = f(X), \tilde{Y} = g(Y)$ - \begin{align*} - \tilde{p}(\tilde{x}, \tilde{y}) &= p(f^{-1}(\tilde{x}), g^{-1}(\tilde{y}))\cdot\vert J_{f^{-1}}(\tilde{x})\vert\cdot\vert J_{g^{-1}}(\tilde{y})\vert, \\ - \tilde{p}(\tilde{x}) &= p(f^{-1}(\tilde{x}))\cdot\vert J_{f^{-1}}(\tilde{x})\vert,\quad \tilde{p}(\tilde{y}) = p(g^{-1}(\tilde{y}))\cdot\vert J_{g^{-1}}(\tilde{y})\vert, - \end{align*} - where $p(x, y)$ is the joint density of $X$ and $Y$ and $p(x), p(y)$ are the respective marginal densities. ($J$ denotes the Jacobian) \\ - \medskip - With this, it follows that - \begin{align*} - I(\tilde{X}; \tilde{Y}) &= \int \tilde{p}(\tilde{x}, \tilde{y}) \log\left(\frac{\tilde{p}(\tilde{x}, \tilde{y})}{\tilde{p}(\tilde{x})\tilde{p}(\tilde{y})}\right)d\tilde{x}d\tilde{y} = * - \end{align*} -\end{itemize} -\end{vbframe} -\begin{vbframe}{Mutual information properties} -\begin{align*} -* & = \int p(f^{-1}(\tilde{x}), g^{-1}(\tilde{y}))\cdot\vert J_{f^{-1}}(\tilde{x})\vert\cdot\vert J_{g^{-1}}(\tilde{y})\vert \\ &\quad\cdot \log\left(\frac{ p(f^{-1}(\tilde{x}), g^{-1}(\tilde{y}))\cdot\vert J_{f^{-1}}(\tilde{x})\vert\cdot\vert J_{g^{-1}}(\tilde{y})\vert }{p(f^{-1}(\tilde{x}))\vert J_{f^{-1}}(\tilde{x})\vert \cdot p(g^{-1}(\tilde{y}))\vert J_{g^{-1}}(\tilde{y})\vert}\right)d\tilde{x}d\tilde{y} -\\&= \int p(f^{-1}(f(x)), g^{-1}(g(y)))\cdot\vert J_{f^{-1}}(f(x))\vert\cdot\vert J_{g^{-1}}(g(y))\vert \\ &\quad\cdot \log\left(\frac{p(f^{-1}(f(x)), g^{-1}(g(y)))}{p(f^{-1}(f(x)))p(g^{-1}(g(y)))}\right)\vert J_f(x)\vert\cdot\vert J_g(y)\vert dxdy \\ - &= \int p(x, y)\cdot\vert J_{f^{-1}}(f(x))J_f(x)\vert\cdot\vert J_{g^{-1}}(g(y))J_g(y)\vert \log\left(\frac{p(x, y)}{p(x)p(y)}\right)dxdy \\ - &= \int p(x, 
y)\cdot \log\left(\frac{p(x, y)}{p(x)p(y)}\right)dxdy = I(X; Y). -\end{align*} -(The fourth equality holds by the inverse function theorem) -\end{vbframe} - -\begin{vbframe} {Mutual information vs. correlation} - - \begin{itemize} - \item If two variables are independent, their correlation is 0. - \item However, the reverse is not necessarily true. It is possible for two dependent variables to have 0 correlation because correlation only measures linear dependence. - -\begin{center} -\includegraphics[width = 10cm ]{figure/correlation_plot.png} \\ -\end{center} - - \item The figure above shows various scatterplots where, in each case, the correlation is 0 even though the two variables are strongly dependent, and MI is large. - \item Mutual information can therefore be seen as a more general measure of dependence between variables than correlation. - \end{itemize} - -\end{vbframe} - -\begin{vbframe} {Mutual information - example} - -Let $X, Y$ be two correlated Gaussian random variables. $(X, Y) \sim \mathcal{N}(0, K)$ with correlation $\rho$ and covariance matrix $K$: - -$$ -K = -\begin{pmatrix} - \sigma^2 & \rho \sigma^2 \\ - \rho \sigma^2 & \sigma^2 -\end{pmatrix} -$$ - -Then $h(X) = h(Y) = \frac{1}{2} \log\left((2 \pi e) \sigma^2\right)$, and $h(X,Y) = \frac{1}{2}\log\left((2 \pi e)^2 \vert K\vert\right) = \frac{1}{2}\log\left((2 \pi e)^2 \sigma^4 (1 - \rho^2)\right)$, and thus - -\begin{equation*} -\begin{aligned} -I(X;Y) = h(X) + h(Y) - h(X,Y) = - \frac{1}{2} \log(1 - \rho^2). -\end{aligned} -\end{equation*} - -For $\rho = 0$, $X$ and $Y$ are independent and $I(X;Y) = 0$. \\ -For $\rho = \pm 1$, $X$ and $Y$ are perfectly correlated and $I(X;Y) \rightarrow \infty$. 
-\end{vbframe} - -% \begin{vbframe} {Chain rule for information} - - -% $$I\left(X_{1}, X_{2}, \ldots, X_{n} ; Y\right)=\sumin I\left(X_{i} ; Y | X_{i-1}, X_{i-2}, \ldots, X_{1}\right)$$ - -% \textbf{Proof:$\quad$} -% \footnotesize -% \begin{equation*} -% \begin{aligned} -% I\left(X_{1}, X_{2}, \ldots, X_{n} ; Y\right) &= H\left(X_{1}, X_{2}, \ldots, X_{n}\right)-H\left(X_{1}, X_{2}, \ldots, X_{n} | Y\right) \\ -% &=\sumin H\left(X_{i} | X_{i-1}, \ldots, X_{1}\right)-\sum_{i=1}^{n} H\left(X_{i} | X_{i-1}, \ldots, X_{1}, Y\right) \\ -% &=\sumin I\left(X_{i} ; Y | X_{1}, X_{2}, \ldots, X_{i-1}\right). -% \end{aligned} -% \end{equation*} - -% \normalsize - -% \end{vbframe} - - -% \begin{vbframe} {Chain rule for KL distance} - -% \begin{equation*} -% \begin{aligned} -% D_{KL}(p(x, y) \| q(x,y)) &= D_{KL}(p(x) \| q(x)) + D_{KL}(p(y|x) \| q(y|x)) -% \end{aligned} -% \end{equation*} - -% \textbf{Proof:} - -% \footnotesize - -% \begin{equation*} -% \begin{aligned} -% D_{KL}(p(x, y) \| q(x,y)) &= \sum_x \sum_y p(x,y) \log \frac{p(x,y)}{q(x,y)} \\ -% &= \sum_x \sum_y p(x,y) \log \frac{p(x)p(y|x)}{q(x)q(y|x)} \\ -% &= \sum_x \sum_y p(x,y) \log \frac{p(x)}{q(x)} + \sum_x \sum_y p(x,y) \log \frac{p(y|x)}{q(y|x)} \\ -% &= D_{KL}(p(x) \| q(x)) + D_{KL}(p(y|x) \| q(y|x)) -% \end{aligned} -% \end{equation*} - -% \normalsize - - -% \end{vbframe} - - -%old slide about KLD -% %\normalsize -% The mutual information between two variables $X$ and $Y$ is also the KL divergence of the product of the marginal distributions $p_x(x) p_y(y)$ from the joint distribution $p(x,y)$ : -% % $I(x;y)$ is the \emph{information gain} achieved if the the joint distribution $p_{xy}(x,y)$ is used instead of the product of marginal distributions : -% -% \begin{eqnarray*} -% I(X;Y) &\overset{(*)}{=}& D_{KL}(p_{xy}||p_x p_y) = \sum_{x \in \Xspace} \sum_{y \in \Yspace} p_{xy}(x, y) \cdot \log \biggl(\frac{p_{xy}(x, y)}{p_x(x)p_y(y)}\biggr) -% \end{eqnarray*} -% -% For continuous random variables $X$ and $Y$ 
with joint density $p(x,y)$ and marginal densities $p_x(x) p_y(y)$, the mutual information is: -% -% \begin{eqnarray*} -% I(X;Y) &=& D_{KL}(p_{xy}||p_x p_y) = \int_{x \in \Xspace} \int_{y \in \Yspace} p_{xy}(x, y) \cdot \log \biggl(\frac{p_{xy}(x, y)}{p_x(x)p_y(y)}\biggr) -% \end{eqnarray*} -% -% -% (Note: If $X$ and $Y$ are independent, $p(x,y)=p_x(x) p_y(y)$ and $I(X;Y)$ is zero.) -% \framebreak -% -% (*) Derivation: -% -% \footnotesize -% -% -% \begin{eqnarray*} -% I(X;Y) &=& H(Y) + H(X) - H(X, Y)\\ -% &=& -\sum_{y \in \Yspace} p_y(y) \log_2(p_y(y)) -\sum_{x \in \Xspace} p_x(x) \log_2(p_x(x)) \\ -% && -\sum_{x \in \Xspace, y \in \Yspace} p_{xy}(x, y) \log_2(p_{xy}(x, y))\\ -% &=& -\sum_{x \in \Xspace, y \in \Yspace}p_{xy}(x, y) \log_2(p_y(y)) -\sum_{x \in \Xspace, y \in \Yspace} p_{xy}(x, y) \log_2(p_x(x)) \\ -% && \quad+ \sum_{x \in \Xspace, y \in \Yspace} p_{xy}(x, y) \log_2(p_{xy}(x, y)) \\ -% &=& \sum_{x \in \Xspace} \sum_{y \in \Yspace} p_{xy}(x, y) \cdot \log_2 \biggl(\frac{p_{xy}(x, y)}{p_x(x)p_y(y)}\biggr) = D_{KL}(p_{xy}||p_x p_y) -% \end{eqnarray*} -% -% - - - - - - - - - - - - - - - - - - - - -% \begin{vbframe} {Summary} - -% \begin{figure} -% \centering -% \scalebox{0.75}{\includegraphics{figure_man/quants.png}} -% \end{figure} - - -% \begin{align*} -% H(X,Y) &= H(X) + H(Y|X) \\ -% &= H(Y) + H(X|Y) -% \end{align*} - -% \begin{align*} -% I(X;Y) &= H(X) - H(X|Y) \\ -% &= H(Y) - H(Y|X) -% \end{align*} - -% \end{vbframe} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%% REFERENCES %%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% \begin{vbframe} -% \frametitle{References} -% \footnotesize{ -% \begin{thebibliography}{99} -% -% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% \bibitem[Chris Olah, 2015]{1} Chris Olah (2015) -% \newblock Visual Information Theory -% \newblock 
\emph{\url{http://colah.github.io/posts/2015-09-Visual-Information/}} -% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% \bibitem[Massimiliano Tomassoli, 2016]{2} Massimiliano Tomassoli (2016) -% \newblock Information Theory for Machine Learning -% \newblock \emph{\url{https://github.com/mtomassoli/papers/blob/master/inftheory.pdf}} -% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% \bibitem[Will Kurt, 2017]{3} Will Kurt, (2017) -% \newblock Kullback-Leibler Divergence Explained -% \newblock \emph{\url{https://www.countbayesie.com/blog/2017/5/9/kullback-leibler-divergence-explained}} -% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% \bibitem[Eric Jang, 2016]{4} Eric Jang, (2016) -% \newblock A Beginner's Guide to Variational Methods: Mean-Field Approximation -% \newblock \emph{\url{https://blog.evjang.com/2016/08/variational-bayes.html}} -% -% \end{thebibliography} -% } -% \end{vbframe} - -% \section{Information Theory and Machine Learning} - -% \begin{vbframe} {KL to CE to LL} -% -% \begin{equation*} -% \begin{split} -% \theta^* & = \argmin_{\theta} \sum_1^n KL (d_i \parallel f_{\theta}(x_i)) \\ -% & = \argmin_{\theta} \sum_1^n [H(d_i, f_{\theta}(x_i)) - H(d_i)] \\ -% & = \sum_1^n H(d_i, f_{\theta}(x_i)) -% \end{split} -% \end{equation*} -% -% \framebreak -% -% \begin{equation*} -% \begin{split} -% \theta^* &= \argmin_{\theta} \sum_1^n \left( - \sum_y d_i(y) \log_2f_{\theta}(y|x_i) \right) \\ -% &= \argmin_{\theta} \sum_1^n (-\log_2f_{\theta}(y|x_i)) \\ -% &= \argmax_{\theta} \sum_1^n \log_2f_{\theta}(y|x_i) \\ -% &= \argmax_{\theta} \log \prod_1^n P(y_i|x_i;\theta) \\ -% &= \argmax_{\theta} \log P(y_1, \ldots , y_n | x_1, \ldots x_n ; \theta) \\ -% &= \argmax_{\theta} \log L(\theta) -% \end{split} -% \end{equation*} -% -% \end{vbframe} - -% \begin{vbframe} {Density Estimation} -% -% Minimizing the -% \begin{equation*} -% \begin{split} -% \theta^* &= \argmin_{\theta} KL(\hat{p} \parallel p_{\theta}) = \argmin_{\theta} [H(\hat{p},p_{\theta}) - H(\hat{p})] \\ &= \argmin_{\theta} 
H(\hat{p},p_{\theta}) = \argmin_{\theta} \E_{X \sim \hat{p}} [I_{p_{\theta}}(X)] \\ -% &= \argmin_{\theta} \sum_x \hat{p}(x) (- \log p_{\theta} (x)) \\ -% &= \argmax_{\theta} \sum_1^n \frac{1}{n} \log p_{\theta} (x_i) = \argmax_{\theta} \sum_1^n \log p_{\theta} (x_i) \\ -% &= \argmax_{\theta} \log \prod_1^n p_{\theta} (x_i) = \argmax_{\theta} \log P(x_1, \ldots x_n | \theta) \\ -% &= \argmax_{\theta} \log L(\theta) -% \end{split} -% \end{equation*} -% -% \end{vbframe} -% -% \begin{vbframe} {Information Gain} -% \begin{itemize} -% \item Feature selection using information gain -% \item Joint Mutual Information -% \end{itemize} -% \end{vbframe} -% -% \begin{vbframe} {Variational Inference} -% -% \begin{equation*} -% \begin{split} -% KL(q(z) \parallel p(z|x)) &= \E_q \left[ \log \frac {q(Z)} {p(Z|x)} \right] \\ -% &= \E_q [\log q(Z)] - \E_q [\log p(Z|x)] \\ -% &= \E_q [\log q(Z)] - \E_q [\log p(Z,x)] + \log p(x) \\ -% &= -(\E_q [\log p(Z,x) - \E_q [\log q(Z)]) + \log p(x) \\ -% &= -L + \log p(x) -% \end{split} -% \end{equation*} -% where L is the ELBO (Evidence Lower Bound) -% \end{vbframe} - -%\begin{vbframe} {Chain rule for entropy} -%\begin{columns}[T,onlytextwidth] -%\column{0.3\textwidth} -%\textbf{Example: Consider a node in a decision tree with 7 samples that belong to either the $+$ or the $-$ class.} \\ -%\lz -%\begin{center} -%<>= -%library(knitr) -%ex1 <- cbind(class=c("+","+","-","+","-","-", "-"), attr_1 = c(T,T,T,F,F,F,F), attr_2 = c(T,T,F,F,T,T,T)) -%kable(ex1) -%@ -%\end{center} -%\column{0.65\textwidth} -%\begin{itemize} -%\item How big is the uncertainty/entropy in \textit{class} (in bits)? -%%\small -%\begin{eqnarray*} -%H(\text{class}) &=& - \sum_{k=+,\, -} p(k) \log_2(p(k)) \\ -%&=& - \frac{3}{7} \log_2\left(\frac{3}{7}\right) - \frac{4}{7} \log_2\left(\frac{4}{7}\right) \\ -%&=& 0.985 -%\end{eqnarray*} -%%\normalsize -%\item How much can it be reduced by knowing the other attributes? 
-%\end{itemize} -%\end{columns} - - -%\framebreak -%\begin{columns}[T,onlytextwidth] -%\column{0.3\textwidth} -%\textbf{Example:} \\ -%\lz -%\begin{center} -%<>= -%library(knitr) -%kable(ex1) -%@ -%\end{center} -%\column{0.65\textwidth} -%\scriptsize - -%\vspace*{1.5cm} - -%$H(\text{class}|\text{attr}_1 = T) = - \frac{2}{3} \log_2(\frac{2}{3}) - \frac{1}{3} \log_2(\frac{1}{3}) = 0.92$ \\ -%$H(\text{class}|\text{attr}_1 = F) = - \frac{1}{4} \log_2(\frac{1}{4}) - \frac{3}{4} \log_2(\frac{3}{4}) = 0.81$ \\ -%$H(\text{class}|\text{attr}_2 = T) = - \frac{2}{5} \log_2(\frac{2}{5}) - \frac{3}{5} \log_2(\frac{3}{5}) = 0.97$ \\ -%$H(\text{class}|\text{attr}_2 = F) = - \frac{1}{2} \log_2(\frac{1}{2}) - \frac{1}{2} \log_2(\frac{1}{2}) = 1$ \\ -%\lz -%$H(\text{class}|\text{attr}_1) = \frac{3}{7} 0.92 + \frac{4}{7} 0.81 = 0.86$ \\ -%$H(\text{class}|\text{attr}_2) = \frac{5}{7} 0.97 + \frac{2}{7} 1 = 0.98$ - -%\normalsize - -%\end{columns} - -%\lz - -%By further splitting the node using either of the attributes, the uncertainty in class is reduced. - -%\framebreak - -%\begin{columns}[T,onlytextwidth] -%\column{0.3\textwidth} -%\textbf{Example:} \\ -%\lz -%\begin{center} -%<>= -%library(knitr) -%kable(ex1) -%@ -%\end{center} -%\column{0.65\textwidth} -%\begin{itemize} -%\item The reduction in uncertainty, or equivalently, gain in information is: -%\footnotesize -%\begin{eqnarray*} -%H(\text{class}) - H(\text{class}|\text{attr}_1) &=& 0.985 - 0.86 \\ -% &=& 0.125 -%\end{eqnarray*} - -%\begin{eqnarray*} -%H(\text{class}) - H(\text{class}|\text{attr}_2) &=& 0.985 - 0.98 \\ -%&=& 0.005 -%\end{eqnarray*} -%% \normalsize -%% \lz -%\item $\text{attr}_1$ tells us more about $\text{class}$. Therefore, to improve the predictive performance of the decision tree in the CART algorithm, it is better to further split the node using $\text{attr}_1$, rather than $\text{attr}_2$. 
-%\end{itemize} - -%\end{columns} - -%\end{vbframe} \endlecture \end{document} diff --git a/slides/information-theory/slides-info-mutual-info2.tex b/slides/information-theory/slides-info-mutual-info2.tex new file mode 100644 index 00000000..2af08c10 --- /dev/null +++ b/slides/information-theory/slides-info-mutual-info2.tex @@ -0,0 +1,562 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} + +\newcommand{\titlefigure}{figure/correlation_plot.png} +\newcommand{\learninggoals}{ + \item Know the joint entropy + \item Know conditional entropy as remaining uncertainty + \item Know mutual information as the amount of information of an RV obtained by another +} + +\title{Introduction to Machine Learning} +\date{} + +\begin{document} + +\lecturechapter{Joint Entropy and Mutual Information II} +\lecture{Introduction to Machine Learning} + +\begin{vbframe}{Mutual Information - Corollaries} + +\small + +\textbf{Non-negativity of mutual information:} For any two random variables, $X$, $Y$, $ I(X;Y) \geq 0$, with equality if and only if $X$ and $Y$ are independent. + +\lz + +\textbf{Proof:}$\quad I(X ; Y)=D_{KL}(p(x, y) \| p(x) p(y)) \geq 0,$ with equality if and only if $p(x, y)=p(x) p(y)$ (i.e., $X$ and $Y$ are independent). + +\lz + +\textbf{Conditioning reduces entropy (information can't hurt):} + +$$H(X | Y) \leq H(X),$$ +with equality if and only if $X$ and $Y$ are independent. + +\lz + +\textbf{Proof:}$\quad 0 \leq I(X ; Y)=H(X)-H(X | Y)$ + +Intuitively, the theorem says that knowing another random variable $Y$ can only reduce the uncertainty in $X$. Note that this is true only on the average. 
+ +\framebreak + +% \textbf{Corollary:} + +% \footnotesize +% \begin{equation*} +% \begin{aligned} +% D_{KL}(p(y | x) \| q(y | x)) &= \sum_x p(x) \sum_y p(y|x) \log\frac{p(y|x)}{q(y|x)} \\ +% &= \E_{p(x,y)} \left[ \log\frac{p(Y|X)}{q(Y|X)}\right] \\ +% &\geq 0 +% \end{aligned} +% \end{equation*} +% \normalsize + +% with equality if and only if $p(y | x)=q(y | x)$ for all $y$ and $x$ such that $p(x)>0$. + +% In the continuous case with density functions $f$, $g$ and support set $S$ we have: + +% \begin{equation*} +% \begin{aligned} +% D_{KL}(f \| g) \geq 0, +% \end{aligned} +% \end{equation*} + +% with equality if and only if $f$ and $g$ are equal almost everywhere. + +% \framebreak + +% \textbf{Proof:} + +% \footnotesize +% \begin{equation*} +% \begin{aligned} +% -D_{KL}(f \| g) &= \int_{S} f \log \frac{g}{f} \\ +% &\leq \log \int_{S} f \frac{g}{f} \\ +% &= \log \int_{S} g \\ +% &\leq \log 1 = 0 +% \end{aligned} +% \end{equation*} +% \normalsize + +% \lz + +% \textbf{Corollary:}$\quad I(X ; Y | Z) \geq 0$, with equality if and only if $X$ and $Y$ are conditionally independent given $Z$, where \textbf{conditional mutual information} is defined as + +% \footnotesize +% \begin{equation*} +% \begin{aligned} +% I(X; Y | Z) &= H(X | Z) - H(X | Y, Z) \\ +% &= \E_{p(x,y,z)} \left[ \log\frac{p(X,Y|Z)}{p(X|Z)p(Y|Z)}\right]. 
+% \end{aligned} +% \end{equation*} +% \normalsize + +% \lz + +%left out Theorem 2.6.4 + + +\framebreak + +\textbf{Independence bound on entropy:} Let $X_{1}, X_{2}, \ldots, X_{n}$ be drawn according to $p\left(x_{1}, x_{2}, \ldots, x_{n}\right) .$ Then + +\footnotesize +$$H\left(X_{1}, X_{2}, \ldots, X_{n}\right) \leq \sum_{i=1}^{n} H\left(X_{i}\right),$$ +\normalsize + +with equality if and only if the $X_{i}$ are independent.\\ + +\lz + +\textbf{Proof:} With the chain rule for entropies, + +\footnotesize +\begin{equation*} +\begin{aligned} +H\left(X_{1}, X_{2}, \ldots, X_{n}\right) &=\sum_{i=1}^{n} H\left(X_{i} | X_{i-1}, \ldots, X_{1}\right) \\ +&\leq \sum_{i=1}^{n} H\left(X_{i}\right), +\end{aligned} +\end{equation*} +\normalsize + +where the inequality holds because conditioning reduces entropy. We have equality if and only if $X_{i}$ is independent of $X_{i-1}, \ldots, X_{1}$ for all $i$ (i.e., if and only if the $X_{i}$ 's are independent). + + +\end{vbframe} + +\begin{vbframe} {Mutual information properties} + +%%The reduction of uncertainty in $Y$ after \textit{learning} $X$ is called \textbf{mutual information} + +%By symmetry and since $H(X,Y) = H(X) + H(Y|X)$, it also follows that + +%$$ +%I(Y;X) := H(Y) - H(Y|X) = H(Y) + H(X) - H(X, Y). +%$$ + +%\textbf{Remarks:} +%\begin{itemize} +%\item The mutual information is symmetric, i. e. $I(Y;X) = I(X;Y)$. +%\item It describes the amount of information about one random variable obtained through the other one (\textbf{information gain}). +%\end{itemize} + +%\begin{figure} +% \includegraphics{figure_man/mutualinformation.pdf} +%\end{figure} + +% \framebreak + +% Mutual information can be used to perform \textbf{feature selection}. Quite simply, each variable $X_i$ is rated according to $I(X_i;Y)$: The more information we gain on $Y$ by observing $X_i$, the more "useful" $X_i$. + +% \lz + +% Let $\D = \Dset$ and $\D \{\cdot\}$ a subset of $\D$ for which condition $\cdot$ is fulfilled.
Then, \textbf{information gain} is defined as: + +% \footnotesize +% \begin{equation*} +% \begin{aligned} +% IG(\D, s) &= I(X_s;Y) \\ +% &= H(Y) - H(Y|X_s) \\ +% &= - \sum_{y \in Y} p(y) \log_2 p(y) + \sum_{x \in X_s} \sum_{y \in Y} p(x,y) \log_2 p(y|x) \\ +% &= - \sum_{y \in Y} \frac{|\D\{Y = y\}|}{|\D|} \log_2 \sum_{y \in Y} \frac{|\D\{Y = y\}|}{|\D|} \\ &+ +% \sum_{x \in X_s} \sum_{y \in Y} \frac{|\D\{Y = y, X_s = x\}|}{|\D|} \log_2 \frac{|\D\{Y = y, X_s = x\}|}{|\D\{X_s = x\}|}. +% \end{aligned} +% \end{equation*} +% \normalsize + +% \framebreak + +\begin{itemize} + % \item Intuitively, mutual information quantifies the amount of shared information between variables. + \item MI is a measure of the amount of ``dependence'' between variables. It is zero if and only if the variables are independent. + \item On the other hand, if one of the variables is a deterministic function of the other, the mutual information is maximal, i.e., equal to the entropy of the first. + \item Unlike (Pearson) correlation, mutual information is not limited to real-valued random variables. + \item Mutual information can be used to perform \textbf{feature selection}. Quite simply, each variable $X_i$ is rated according to $I(X_i;Y)$; this is sometimes called information gain. + \item The same principle can also be used in decision trees to select a feature to split on. Splitting on MI/IG is then equivalent to risk reduction with log-loss. +\end{itemize} +\end{vbframe} + +\begin{vbframe} {Mutual information properties} +\begin{itemize} + \item MI is invariant w.r.t.
injective reparametrizations that are in $\mathcal{C}^1:$\\ + \medskip + Let $f, g: \R^d\rightarrow\R^d \in \mathcal{C}^1$ be injective transformations and let $X, Y$ be continuous random variables in $\mathbb{R}^d$. Then, by the change of variables formula, the joint and marginal densities of $\tilde{X} = f(X), \tilde{Y} = g(Y)$ are + \begin{align*} + \tilde{p}(\tilde{x}, \tilde{y}) &= p(f^{-1}(\tilde{x}), g^{-1}(\tilde{y}))\cdot\vert J_{f^{-1}}(\tilde{x})\vert\cdot\vert J_{g^{-1}}(\tilde{y})\vert, \\ + \tilde{p}(\tilde{x}) &= p(f^{-1}(\tilde{x}))\cdot\vert J_{f^{-1}}(\tilde{x})\vert,\quad \tilde{p}(\tilde{y}) = p(g^{-1}(\tilde{y}))\cdot\vert J_{g^{-1}}(\tilde{y})\vert, + \end{align*} + where $p(x, y)$ is the joint density of $X$ and $Y$ and $p(x), p(y)$ are the respective marginal densities. ($\vert J \vert$ denotes the absolute value of the Jacobian determinant.) \\ + \medskip + With this, it follows that + \begin{align*} + I(\tilde{X}; \tilde{Y}) &= \int \tilde{p}(\tilde{x}, \tilde{y}) \log\left(\frac{\tilde{p}(\tilde{x}, \tilde{y})}{\tilde{p}(\tilde{x})\tilde{p}(\tilde{y})}\right)d\tilde{x}d\tilde{y} = * + \end{align*} +\end{itemize} +\end{vbframe} +\begin{vbframe}{Mutual information properties} +\begin{align*} +* & = \int p(f^{-1}(\tilde{x}), g^{-1}(\tilde{y}))\cdot\vert J_{f^{-1}}(\tilde{x})\vert\cdot\vert J_{g^{-1}}(\tilde{y})\vert \\ &\quad\cdot \log\left(\frac{ p(f^{-1}(\tilde{x}), g^{-1}(\tilde{y}))\cdot\vert J_{f^{-1}}(\tilde{x})\vert\cdot\vert J_{g^{-1}}(\tilde{y})\vert }{p(f^{-1}(\tilde{x}))\vert J_{f^{-1}}(\tilde{x})\vert \cdot p(g^{-1}(\tilde{y}))\vert J_{g^{-1}}(\tilde{y})\vert}\right)d\tilde{x}d\tilde{y} +\\&= \int p(f^{-1}(f(x)), g^{-1}(g(y)))\cdot\vert J_{f^{-1}}(f(x))\vert\cdot\vert J_{g^{-1}}(g(y))\vert \\ &\quad\cdot \log\left(\frac{p(f^{-1}(f(x)), g^{-1}(g(y)))}{p(f^{-1}(f(x)))p(g^{-1}(g(y)))}\right)\vert J_f(x)\vert\cdot\vert J_g(y)\vert dxdy \\ + &= \int p(x, y)\cdot\vert J_{f^{-1}}(f(x))J_f(x)\vert\cdot\vert J_{g^{-1}}(g(y))J_g(y)\vert \log\left(\frac{p(x, y)}{p(x)p(y)}\right)dxdy \\ + &= \int p(x,
y)\cdot \log\left(\frac{p(x, y)}{p(x)p(y)}\right)dxdy = I(X; Y). +\end{align*} +(The fourth equality holds by the inverse function theorem) +\end{vbframe} + +\begin{vbframe} {Mutual information vs. correlation} + + \begin{itemize} + \item If two variables are independent, their correlation is 0. + \item However, the reverse is not necessarily true. It is possible for two dependent variables to have 0 correlation because correlation only measures linear dependence. + +\begin{center} +\includegraphics[width = 10cm ]{figure/correlation_plot.png} \\ +\end{center} + + \item The figure above shows various scatterplots where, in each case, the correlation is 0 even though the two variables are strongly dependent, and MI is large. + \item Mutual information can therefore be seen as a more general measure of dependence between variables than correlation. + \end{itemize} + +\end{vbframe} + +\begin{vbframe} {Mutual information - example} + +Let $X, Y$ be two correlated Gaussian random variables. $(X, Y) \sim \mathcal{N}(0, K)$ with correlation $\rho$ and covariance matrix $K$: + +$$ +K = +\begin{pmatrix} + \sigma^2 & \rho \sigma^2 \\ + \rho \sigma^2 & \sigma^2 +\end{pmatrix} +$$ + +Then $h(X) = h(Y) = \frac{1}{2} \log\left((2 \pi e) \sigma^2\right)$, and $h(X,Y) = \frac{1}{2}\log\left((2 \pi e)^2 \vert K\vert\right) = \frac{1}{2}\log\left((2 \pi e)^2 \sigma^4 (1 - \rho^2)\right)$, and thus + +\begin{equation*} +\begin{aligned} +I(X;Y) = h(X) + h(Y) - h(X,Y) = - \frac{1}{2} \log(1 - \rho^2). +\end{aligned} +\end{equation*} + +For $\rho = 0$, $X$ and $Y$ are independent and $I(X;Y) = 0$. \\ +For $\rho = \pm 1$, $X$ and $Y$ are perfectly correlated and $I(X;Y) \rightarrow \infty$. 
+\end{vbframe} + +% \begin{vbframe} {Chain rule for information} + + +% $$I\left(X_{1}, X_{2}, \ldots, X_{n} ; Y\right)=\sumin I\left(X_{i} ; Y | X_{i-1}, X_{i-2}, \ldots, X_{1}\right)$$ + +% \textbf{Proof:$\quad$} +% \footnotesize +% \begin{equation*} +% \begin{aligned} +% I\left(X_{1}, X_{2}, \ldots, X_{n} ; Y\right) &= H\left(X_{1}, X_{2}, \ldots, X_{n}\right)-H\left(X_{1}, X_{2}, \ldots, X_{n} | Y\right) \\ +% &=\sumin H\left(X_{i} | X_{i-1}, \ldots, X_{1}\right)-\sum_{i=1}^{n} H\left(X_{i} | X_{i-1}, \ldots, X_{1}, Y\right) \\ +% &=\sumin I\left(X_{i} ; Y | X_{1}, X_{2}, \ldots, X_{i-1}\right). +% \end{aligned} +% \end{equation*} + +% \normalsize + +% \end{vbframe} + + +% \begin{vbframe} {Chain rule for KL distance} + +% \begin{equation*} +% \begin{aligned} +% D_{KL}(p(x, y) \| q(x,y)) &= D_{KL}(p(x) \| q(x)) + D_{KL}(p(y|x) \| q(y|x)) +% \end{aligned} +% \end{equation*} + +% \textbf{Proof:} + +% \footnotesize + +% \begin{equation*} +% \begin{aligned} +% D_{KL}(p(x, y) \| q(x,y)) &= \sum_x \sum_y p(x,y) \log \frac{p(x,y)}{q(x,y)} \\ +% &= \sum_x \sum_y p(x,y) \log \frac{p(x)p(y|x)}{q(x)q(y|x)} \\ +% &= \sum_x \sum_y p(x,y) \log \frac{p(x)}{q(x)} + \sum_x \sum_y p(x,y) \log \frac{p(y|x)}{q(y|x)} \\ +% &= D_{KL}(p(x) \| q(x)) + D_{KL}(p(y|x) \| q(y|x)) +% \end{aligned} +% \end{equation*} + +% \normalsize + + +% \end{vbframe} + + +%old slide about KLD +% %\normalsize +% The mutual information between two variables $X$ and $Y$ is also the KL divergence of the product of the marginal distributions $p_x(x) p_y(y)$ from the joint distribution $p(x,y)$ : +% % $I(x;y)$ is the \emph{information gain} achieved if the the joint distribution $p_{xy}(x,y)$ is used instead of the product of marginal distributions : +% +% \begin{eqnarray*} +% I(X;Y) &\overset{(*)}{=}& D_{KL}(p_{xy}||p_x p_y) = \sum_{x \in \Xspace} \sum_{y \in \Yspace} p_{xy}(x, y) \cdot \log \biggl(\frac{p_{xy}(x, y)}{p_x(x)p_y(y)}\biggr) +% \end{eqnarray*} +% +% For continuous random variables $X$ and $Y$ 
with joint density $p(x,y)$ and marginal densities $p_x(x) p_y(y)$, the mutual information is: +% +% \begin{eqnarray*} +% I(X;Y) &=& D_{KL}(p_{xy}||p_x p_y) = \int_{x \in \Xspace} \int_{y \in \Yspace} p_{xy}(x, y) \cdot \log \biggl(\frac{p_{xy}(x, y)}{p_x(x)p_y(y)}\biggr) +% \end{eqnarray*} +% +% +% (Note: If $X$ and $Y$ are independent, $p(x,y)=p_x(x) p_y(y)$ and $I(X;Y)$ is zero.) +% \framebreak +% +% (*) Derivation: +% +% \footnotesize +% +% +% \begin{eqnarray*} +% I(X;Y) &=& H(Y) + H(X) - H(X, Y)\\ +% &=& -\sum_{y \in \Yspace} p_y(y) \log_2(p_y(y)) -\sum_{x \in \Xspace} p_x(x) \log_2(p_x(x)) \\ +% && -\sum_{x \in \Xspace, y \in \Yspace} p_{xy}(x, y) \log_2(p_{xy}(x, y))\\ +% &=& -\sum_{x \in \Xspace, y \in \Yspace}p_{xy}(x, y) \log_2(p_y(y)) -\sum_{x \in \Xspace, y \in \Yspace} p_{xy}(x, y) \log_2(p_x(x)) \\ +% && \quad+ \sum_{x \in \Xspace, y \in \Yspace} p_{xy}(x, y) \log_2(p_{xy}(x, y)) \\ +% &=& \sum_{x \in \Xspace} \sum_{y \in \Yspace} p_{xy}(x, y) \cdot \log_2 \biggl(\frac{p_{xy}(x, y)}{p_x(x)p_y(y)}\biggr) = D_{KL}(p_{xy}||p_x p_y) +% \end{eqnarray*} +% +% + + + + + + + + + + + + + + + + + + + + +% \begin{vbframe} {Summary} + +% \begin{figure} +% \centering +% \scalebox{0.75}{\includegraphics{figure_man/quants.png}} +% \end{figure} + + +% \begin{align*} +% H(X,Y) &= H(X) + H(Y|X) \\ +% &= H(Y) + H(X|Y) +% \end{align*} + +% \begin{align*} +% I(X;Y) &= H(X) - H(X|Y) \\ +% &= H(Y) - H(Y|X) +% \end{align*} + +% \end{vbframe} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%% REFERENCES %%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% \begin{vbframe} +% \frametitle{References} +% \footnotesize{ +% \begin{thebibliography}{99} +% +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% \bibitem[Chris Olah, 2015]{1} Chris Olah (2015) +% \newblock Visual Information Theory +% \newblock 
\emph{\url{http://colah.github.io/posts/2015-09-Visual-Information/}} +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% \bibitem[Massimiliano Tomassoli, 2016]{2} Massimiliano Tomassoli (2016) +% \newblock Information Theory for Machine Learning +% \newblock \emph{\url{https://github.com/mtomassoli/papers/blob/master/inftheory.pdf}} +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% \bibitem[Will Kurt, 2017]{3} Will Kurt, (2017) +% \newblock Kullback-Leibler Divergence Explained +% \newblock \emph{\url{https://www.countbayesie.com/blog/2017/5/9/kullback-leibler-divergence-explained}} +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% \bibitem[Eric Jang, 2016]{4} Eric Jang, (2016) +% \newblock A Beginner's Guide to Variational Methods: Mean-Field Approximation +% \newblock \emph{\url{https://blog.evjang.com/2016/08/variational-bayes.html}} +% +% \end{thebibliography} +% } +% \end{vbframe} + +% \section{Information Theory and Machine Learning} + +% \begin{vbframe} {KL to CE to LL} +% +% \begin{equation*} +% \begin{split} +% \theta^* & = \argmin_{\theta} \sum_1^n KL (d_i \parallel f_{\theta}(x_i)) \\ +% & = \argmin_{\theta} \sum_1^n [H(d_i, f_{\theta}(x_i)) - H(d_i)] \\ +% & = \sum_1^n H(d_i, f_{\theta}(x_i)) +% \end{split} +% \end{equation*} +% +% \framebreak +% +% \begin{equation*} +% \begin{split} +% \theta^* &= \argmin_{\theta} \sum_1^n \left( - \sum_y d_i(y) \log_2f_{\theta}(y|x_i) \right) \\ +% &= \argmin_{\theta} \sum_1^n (-\log_2f_{\theta}(y|x_i)) \\ +% &= \argmax_{\theta} \sum_1^n \log_2f_{\theta}(y|x_i) \\ +% &= \argmax_{\theta} \log \prod_1^n P(y_i|x_i;\theta) \\ +% &= \argmax_{\theta} \log P(y_1, \ldots , y_n | x_1, \ldots x_n ; \theta) \\ +% &= \argmax_{\theta} \log L(\theta) +% \end{split} +% \end{equation*} +% +% \end{vbframe} + +% \begin{vbframe} {Density Estimation} +% +% Minimizing the +% \begin{equation*} +% \begin{split} +% \theta^* &= \argmin_{\theta} KL(\hat{p} \parallel p_{\theta}) = \argmin_{\theta} [H(\hat{p},p_{\theta}) - H(\hat{p})] \\ &= \argmin_{\theta} 
H(\hat{p},p_{\theta}) = \argmin_{\theta} \E_{X \sim \hat{p}} [I_{p_{\theta}}(X)] \\ +% &= \argmin_{\theta} \sum_x \hat{p}(x) (- \log p_{\theta} (x)) \\ +% &= \argmax_{\theta} \sum_1^n \frac{1}{n} \log p_{\theta} (x_i) = \argmax_{\theta} \sum_1^n \log p_{\theta} (x_i) \\ +% &= \argmax_{\theta} \log \prod_1^n p_{\theta} (x_i) = \argmax_{\theta} \log P(x_1, \ldots x_n | \theta) \\ +% &= \argmax_{\theta} \log L(\theta) +% \end{split} +% \end{equation*} +% +% \end{vbframe} +% +% \begin{vbframe} {Information Gain} +% \begin{itemize} +% \item Feature selection using information gain +% \item Joint Mutual Information +% \end{itemize} +% \end{vbframe} +% +% \begin{vbframe} {Variational Inference} +% +% \begin{equation*} +% \begin{split} +% KL(q(z) \parallel p(z|x)) &= \E_q \left[ \log \frac {q(Z)} {p(Z|x)} \right] \\ +% &= \E_q [\log q(Z)] - \E_q [\log p(Z|x)] \\ +% &= \E_q [\log q(Z)] - \E_q [\log p(Z,x)] + \log p(x) \\ +% &= -(\E_q [\log p(Z,x) - \E_q [\log q(Z)]) + \log p(x) \\ +% &= -L + \log p(x) +% \end{split} +% \end{equation*} +% where L is the ELBO (Evidence Lower Bound) +% \end{vbframe} + +%\begin{vbframe} {Chain rule for entropy} +%\begin{columns}[T,onlytextwidth] +%\column{0.3\textwidth} +%\textbf{Example: Consider a node in a decision tree with 7 samples that belong to either the $+$ or the $-$ class.} \\ +%\lz +%\begin{center} +%<>= +%library(knitr) +%ex1 <- cbind(class=c("+","+","-","+","-","-", "-"), attr_1 = c(T,T,T,F,F,F,F), attr_2 = c(T,T,F,F,T,T,T)) +%kable(ex1) +%@ +%\end{center} +%\column{0.65\textwidth} +%\begin{itemize} +%\item How big is the uncertainty/entropy in \textit{class} (in bits)? +%%\small +%\begin{eqnarray*} +%H(\text{class}) &=& - \sum_{k=+,\, -} p(k) \log_2(p(k)) \\ +%&=& - \frac{3}{7} \log_2\left(\frac{3}{7}\right) - \frac{4}{7} \log_2\left(\frac{4}{7}\right) \\ +%&=& 0.985 +%\end{eqnarray*} +%%\normalsize +%\item How much can it be reduced by knowing the other attributes? 
+%\end{itemize} +%\end{columns} + + +%\framebreak +%\begin{columns}[T,onlytextwidth] +%\column{0.3\textwidth} +%\textbf{Example:} \\ +%\lz +%\begin{center} +%<>= +%library(knitr) +%kable(ex1) +%@ +%\end{center} +%\column{0.65\textwidth} +%\scriptsize + +%\vspace*{1.5cm} + +%$H(\text{class}|\text{attr}_1 = T) = - \frac{2}{3} \log_2(\frac{2}{3}) - \frac{1}{3} \log_2(\frac{1}{3}) = 0.92$ \\ +%$H(\text{class}|\text{attr}_1 = F) = - \frac{1}{4} \log_2(\frac{1}{4}) - \frac{3}{4} \log_2(\frac{3}{4}) = 0.81$ \\ +%$H(\text{class}|\text{attr}_2 = T) = - \frac{2}{5} \log_2(\frac{2}{5}) - \frac{3}{5} \log_2(\frac{3}{5}) = 0.97$ \\ +%$H(\text{class}|\text{attr}_2 = F) = - \frac{1}{2} \log_2(\frac{1}{2}) - \frac{1}{2} \log_2(\frac{1}{2}) = 1$ \\ +%\lz +%$H(\text{class}|\text{attr}_1) = \frac{3}{7} 0.92 + \frac{4}{7} 0.81 = 0.86$ \\ +%$H(\text{class}|\text{attr}_2) = \frac{5}{7} 0.97 + \frac{2}{7} 1 = 0.98$ + +%\normalsize + +%\end{columns} + +%\lz + +%By further splitting the node using either of the attributes, the uncertainty in class is reduced. + +%\framebreak + +%\begin{columns}[T,onlytextwidth] +%\column{0.3\textwidth} +%\textbf{Example:} \\ +%\lz +%\begin{center} +%<>= +%library(knitr) +%kable(ex1) +%@ +%\end{center} +%\column{0.65\textwidth} +%\begin{itemize} +%\item The reduction in uncertainty, or equivalently, gain in information is: +%\footnotesize +%\begin{eqnarray*} +%H(\text{class}) - H(\text{class}|\text{attr}_1) &=& 0.985 - 0.86 \\ +% &=& 0.125 +%\end{eqnarray*} + +%\begin{eqnarray*} +%H(\text{class}) - H(\text{class}|\text{attr}_2) &=& 0.985 - 0.98 \\ +%&=& 0.005 +%\end{eqnarray*} +%% \normalsize +%% \lz +%\item $\text{attr}_1$ tells us more about $\text{class}$. Therefore, to improve the predictive performance of the decision tree in the CART algorithm, it is better to further split the node using $\text{attr}_1$, rather than $\text{attr}_2$. +%\end{itemize} + +%\end{columns} + +%\end{vbframe} + +\endlecture +\end{document} + + +