diff --git a/slides-pdf/lecture_sl.pdf b/slides-pdf/lecture_sl.pdf index 93744b4e..dadfdbb5 100644 Binary files a/slides-pdf/lecture_sl.pdf and b/slides-pdf/lecture_sl.pdf differ diff --git a/slides-pdf/slides-gp-basic.pdf b/slides-pdf/slides-gp-basic.pdf new file mode 100644 index 00000000..2e012dba Binary files /dev/null and b/slides-pdf/slides-gp-basic.pdf differ diff --git a/slides-pdf/slides-gp-bayes-lm.pdf b/slides-pdf/slides-gp-bayes-lm.pdf new file mode 100644 index 00000000..7cd75c89 Binary files /dev/null and b/slides-pdf/slides-gp-bayes-lm.pdf differ diff --git a/slides-pdf/slides-gp-covariance.pdf b/slides-pdf/slides-gp-covariance.pdf new file mode 100644 index 00000000..db4cf6d5 Binary files /dev/null and b/slides-pdf/slides-gp-covariance.pdf differ diff --git a/slides-pdf/slides-gp-mean.pdf b/slides-pdf/slides-gp-mean.pdf new file mode 100644 index 00000000..57706e05 Binary files /dev/null and b/slides-pdf/slides-gp-mean.pdf differ diff --git a/slides-pdf/slides-gp-prediction.pdf b/slides-pdf/slides-gp-prediction.pdf new file mode 100644 index 00000000..a7346b5a Binary files /dev/null and b/slides-pdf/slides-gp-prediction.pdf differ diff --git a/slides-pdf/slides-gp-training.pdf b/slides-pdf/slides-gp-training.pdf new file mode 100644 index 00000000..0502c87a Binary files /dev/null and b/slides-pdf/slides-gp-training.pdf differ diff --git a/slides-pdf/slides-info-entropy2.pdf b/slides-pdf/slides-info-entropy2.pdf index ece57c2c..4b15ec87 100644 Binary files a/slides-pdf/slides-info-entropy2.pdf and b/slides-pdf/slides-info-entropy2.pdf differ diff --git a/slides/all/slides_sl.tex b/slides/all/slides_sl.tex index e3549741..c081135c 100644 --- a/slides/all/slides_sl.tex +++ b/slides/all/slides_sl.tex @@ -98,13 +98,14 @@ \section{Linear Support Vector Machine} \section{Nonlinear Support Vector Machine} \input{../nonlinear-svm/chapter-order.tex} -%\section{Gaussian Processes} -%\input{..//gaussian-processes/chapter-order.tex} \section{Boosting} \input{../boosting/chapter-order.tex} -\section{Feature Selection} -\input{../feature-selection/chapter-order.tex} +%\section{Feature Selection} +%\input{../feature-selection/chapter-order.tex} + +\section{Gaussian Processes} +\input{../gaussian-processes/chapter-order.tex} \end{document} diff --git a/slides/gaussian-processes/Makefile b/slides/gaussian-processes/Makefile new file mode 100644 index 00000000..6e753282 --- /dev/null +++ b/slides/gaussian-processes/Makefile @@ -0,0 +1 @@ +include ../tex.mk \ No newline at end of file diff --git a/slides/gaussian-processes/attic/slides-x-covariance-adv.tex b/slides/gaussian-processes/attic/slides-x-covariance-adv.tex new file mode 100644 index 00000000..2c63fc17 --- /dev/null +++ b/slides/gaussian-processes/attic/slides-x-covariance-adv.tex @@ -0,0 +1,100 @@ +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} +\input{../../latex-math/ml-gp} + +\newcommand{\titlefigure}{figure_man/up-crossings.png} +\newcommand{\learninggoals}{ + \item \textcolor{blue}{XXX} + \item \textcolor{blue}{XXX} +} + +\title{Introduction to Machine Learning} +\date{} + +\begin{document} + +\lecturechapter{Covariance Functions for GPs - Advanced} +\lecture{Introduction to Machine Learning} + +\begin{vbframe}{MS-Continuity and Differentiability} + +We wish to describe a Gaussian process in terms of its smoothness. 
There are several notions of continuity for random variables - one is continuity / differentiability in mean square (MS): + +\begin{block}{Definition} +A Gaussian process $f(\xv)$ is said to be +\begin{itemize} +\item continuous in MS in $\xv_*$, if $\E[|f(\xv^{(k)}) - f(\xv_*)|^2] \overset{k \to \infty}{\longrightarrow} 0$ for any sequence $\xv^{(k)} \overset{k \to \infty}{\to} \xv_*$ +\item MS differentiable in direction $i$ if $\lim_{h\to 0}\E[|\frac{f(\xv + h\bm{e}_i) - f(\xv)}{h}|]$ exists, where $\bm{e}_i = (0,\dots,0,1,0,\dots,0)^T$ is the unit vector in the $i$-th axis. +\end{itemize} +\end{block} + +\textbf{Remark:} MS continuity / differentiability does not necessarily imply continuity / differentiability of the sampled function! + +\framebreak + +MS continuity / differentiability of a Gaussian process can be derived from the smoothness properties of the kernel: + +\begin{itemize} +\item The GP is continuous in MS if and only if the covariance function $k(\xv, \xv +^\prime)$ is continuous +\item The MS derivative of a Gaussian process exists iff the second derivative $\frac{\partial^{2} k(\xv, \xv^\prime)}{\partial \xv\partial \xv^\prime}$ exists +\end{itemize} + +\end{vbframe} + + + +\begin{vbframe}{Squared exponential covariance function} + +One commonly used covariance function is the squared exponential covariance function: + +$$ +k(\xv, \xv^\prime) = \exp\biggl(- \frac{\|\xv - \xv^\prime\|^2}{2\ls^2}\biggr) +$$ + +\textbf{Properties}: +\begin{itemize} +\item as it depends on the distance $r = \|\xv - \xv^\prime\|$ only, it is an isotropic (and thus also stationary) covariance function +\item infinitely differentiable $\to$ the corresponding GP is thus very smooth +\item due to its strong smoothness assumptions it is often unrealistic for modeling many physical processes + +\end{itemize} + +\end{vbframe} + +\begin{vbframe}{Upcrossing Rate and Characteristic Length-Scale} + +Another way to describe a Gaussian process is the expected number of up-crossings at level $0$ on the unit interval, which we denote by $N_0$. + +\begin{figure} + \includegraphics[width=0.7\textwidth]{figure_man/up-crossings.png} +\end{figure} + +For an isotropic covariance function $k(r)$, it can be shown that the expected number of up-crossings can be calculated explicitly + +$$ +\E[N_0] = \frac{1}{2\pi} \sqrt{\frac{- k^{\prime \prime}(0)}{k(0)}}.
+$$ + +\framebreak + +\textbf{Example:} Squared exponential + +\begin{eqnarray*} +k(r) &=& \exp\biggl(-\frac{r^2}{2\ls^2}\biggr)\\ +k^\prime(r) &=& - k(r) \cdot \frac{r}{\ls^2} \\ +k^{\prime\prime}(r) &=& k(r) \cdot \frac{r^2}{\ls^4} - k(r) \cdot \frac{1}{\ls^2} +\end{eqnarray*} + +The expected number of level-0 upcrossing is thus + +$$ +\E[N_0] = \frac{1}{2\pi} \sqrt{\frac{- k^{\prime\prime}(0)}{k(0)}} = \frac{1}{2\pi} \sqrt{\frac{1}{\ls^2}} = (2\pi \ls)^{-1} +$$ + + +\end{vbframe} + +\endlecture +\end{document} diff --git a/slides/gaussian-processes/attic/slides-x-gp-additional.tex b/slides/gaussian-processes/attic/slides-x-gp-additional.tex new file mode 100644 index 00000000..788514fb --- /dev/null +++ b/slides/gaussian-processes/attic/slides-x-gp-additional.tex @@ -0,0 +1,219 @@ +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} +\input{../../latex-math/ml-gp} + +\newcommand{\titlefigure}{figure_man/post-mean.png} % does not fit +\newcommand{\learninggoals}{ + \item \textcolor{blue}{XXX} + \item \textcolor{blue}{XXX} +} + +\title{Introduction to Machine Learning} +\date{} + +\begin{document} + +\lecturechapter{Gaussian Proccesses: Additional Material} +\lecture{Introduction to Machine Learning} + +%http://www.gaussianprocess.org/gpml + +\begin{vbframe}{Notation} +% We would like to model a function +% +% $$ +% f: \mathcal{X} \to \Yspace +% $$ +% +% where +% +% \begin{itemize} +% \item $\Xspace$ is a p-dimensional input space (here: $\Xspace = \R^n$) +% \item $\Yspace$ is the target space (usually $\Yspace = \R$ for regression and $\Yspace = \{0, 1\}$ for binary classification) +% \item $\bm{x} \in \mathcal{X}$ is called independent / predictor variable +% \item $y \in \mathcal{Y}$ is called dependent variable (target, label, output) +% \end{itemize} +% +% \framebreak +% + +In this chapter + +\begin{itemize} +\item $(\xv_*, y_*)$ denotes one single test observation, excluded from training +\item $\Xmat_* \in \R^{n_* \times p}$ contains a set of $n_*$ test observations and +\item $\yv_* \in \R^{n_* \times p}$ the corresponding outcomes, excluded from training. +\end{itemize} + +% \framebreak + +% In the context of Gaussian processes + +% \begin{itemize} +% \item the function $m: \Xspace \to \R$ is called \textbf{mean function}. We define the \textbf{mean vector} + +% \vspace*{-0.3cm} +% $$ +% m(\Xmat):= \biggl(m\left(\bm{x}^{(1)}\right), m\left(\bm{x}^{(2)}\right), ..., m\left(\bm{x}^{(n)}\right)\biggr)^T +% $$ +% \item the bivariate, positive-definite function $k: \Xspace \times \Xspace \to \R$ is called \textbf{covariance function} or \textbf{kernel}; $k(\Xmat, \Xmat)$ denotes the $n\times n$ matrix that is obtained by plugging in all pairs $\bm{x}^{(i)}, \bm{x}^{(j)}$ and is called \textbf{kernel matrix} or \textbf{covariance matrix} + +% $$ +% k(\Xmat, \Xmat) := k(\bm{x}^{(i)}, \bm{x}^{(j)})_{i, j = 1, ..., n} +% $$ +% \item We sometimes use the abbreviations $\bm{K} := k(\Xmat, \Xmat)$, $\bm{K}_* := k(\Xmat_*, \Xmat)$, $\bm{K}_{**} := k(\Xmat_*, \Xmat_*)$. + + +% \end{itemize} + +\end{vbframe} + + + +\section{Noisy Gaussian Processes} + +\begin{vbframe}{Noisy Gaussian Process} + +In the above equations we implicitly assumed that we had access to the true function value $\fx$. In many cases, we only have access to a noisy version thereof +$$ +y = \fx + \eps.$$ + +Assuming additive i.i.d. 
Gaussian noise, the covariance function becomes + +$$ +\cov(y^{(i)}, y^{(j)}) = k(\bm{x}^{(i)}, \bm{x}^{(j)}) + \sigma_n^2 \delta_{ij} +$$ + +where $\delta_{ij} = 1$ if $i = j$. In matrix notation, this becomes + +$$ +\cov(\yv) = \Kmat + \sigma_n^2\id =: \Kmat_y. +$$ + +The $\sigma_n^2$ is also called \textbf{nugget}. + +\end{vbframe} + +\begin{vbframe}{GP vs. kernelized Ridge regression} + +The predictive function is then + +\begin{eqnarray*} +\bm{f}_* | \Xmat_*, \Xmat, \yv \sim \mathcal{N}(\bm{\bar f}_*, \cov(\bm{\bar f}_*)). +\end{eqnarray*} + +with + +\begin{itemize} +\item $\bm{\bar f}_* = \Kmat_{*}^{T} \Kmat_y^{-1}\yv$ and +\item $\cov(\bm{\bar f}_*) = \Kmat_{**}- \Kmat_{*}^{T}\Kmat_y^{-1}\Kmat_*$. +\end{itemize} + +The predicted mean values at the training points $\bm{\bar f} = \bm{K}\Kmat_y^{-1}\bm{y}$ are a \textbf{linear combination} of the $\bm{y}$ values. + +\lz + +\textbf{Note:} Predicting the posterior mean corresponds exactly to the predictions obtained by kernelized Ridge regression. However, a GP (as a Bayesian model) gives us much more information, namely a posterior distribution, whilst kernelized Ridge regression does not. + + +\end{vbframe} + + + + +\section{Bayesian Linear Regression as a GP} + + +\begin{vbframe}{Bayesian linear regression as a GP} + +One example for a Gaussian process is the Bayesian linear regression model covered earlier. For $\thetab \sim \mathcal{N}(\bm{0}, \tau^2 \id)$, the joint distribution of any set of function values + +$$ +f(\xi) = \thetab^T \xi + \epsi +$$ + +is Gaussian. + +\vspace*{0.3cm} + +The corresponding mean function is $m(\bm{x}) = \bm{0}$ and the covariance function is + +\vspace*{-0.5cm} + +\begin{eqnarray*} +\cov(f(\bm{x}), f(\bm{x}^\prime)) &=& \E[f(\bm{x}) f(\bm{x}^\prime)] - \underbrace{\E[f(\bm{x})] \E[f(\bm{x}^\prime]}_{= 0} \\ &=& \E[(\thetab^T \bm{x} + \epsi)^T(\thetab^T \bm{x}^\prime + \epsi)] \\ &=& \tau^2 \bm{x}^T\bm{x}^\prime + \sigma^2 =: k(\bm{x}, \bm{x}^\prime). +\end{eqnarray*} + +% As we have just described, the predictive distribution assuming a Gaussian process Prior for one single test point $\bm{x}^*$ is normal with mean +% +% $$ +% (\bm{x}^*)^T \bm{X}^T (\Xmat\Xmat^T + \id)^{-1} \yv. +% $$ +% +% Remember that we derived also a normal predictive distribution for a Bayesian linear regression case - the predictive mean was +% +% $$ +% \mu_{\text{post}} = (\bm{x}^*)^T(\Xmat^T\Xmat + \sigma^2 \id)^{-1}\Xmat^T\yv. +% $$ +% +% Using the matrix identity $(\bm{AB} + \id) +% ^{-1}\Amat = \Amat(\bm{BA} + \id)^{-1}$^*$, it can be seen that the predictive distributions are identical. +% +% \vfill +% \begin{footnotesize} +% $^*$ Searl Set of Identities, see \emph{http://matrixcookbook.com], 3.2} +% \end{footnotesize} + +\end{vbframe} + +\begin{vbframe}{Feature Spaces and the Kernel Trick} + +If one relaxes the linearity assumption by first projecting features into a higher dimensional feature space $\mathcal{Z}$ using a basis function $\phi: \Xspace \to \mathcal{Z}$, the corresponding covariance function is + +$$ +k(\bm{x}, \bm{x}^\prime) = \tau^2 \phi(\bm{x})^T\phi(\bm{x}^\prime) + \sigma^2. +$$ + +To get arbitrarily complicated functions, we would have to handle high-dimensional feature vectors $\phi(\bm{x})$. + +\lz + +Fortunately, all we need to know are the inner products $\phi(\bm{x})^T\phi(\bm{x}^\prime)$ - the feature vector itself never occurs in calculations. 
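[Editor's note: the following is a minimal, self-contained R sketch, not part of the committed slides, that makes the prediction formulas above concrete. It computes the noisy-GP posterior mean $\bar{\bm{f}}_* = \Kmat_*^T \Kmat_y^{-1} \yv$ and posterior covariance $\Kmat_{**} - \Kmat_*^T \Kmat_y^{-1} \Kmat_*$ with a squared exponential kernel. The helper name sqexp_kernel, the toy sine data, the length-scale l = 0.4 and the noise level sigma_n = 0.1 are illustrative assumptions, not values taken from the slides.]

# Noisy GP regression: posterior mean and covariance, squared exponential kernel.
sqexp_kernel <- function(X1, X2, l = 0.4) {
  # squared Euclidean distances between rows of X1 and X2
  D2 <- outer(rowSums(X1^2), rowSums(X2^2), "+") - 2 * X1 %*% t(X2)
  exp(-0.5 * D2 / l^2)
}

set.seed(1)
n  <- 20
X  <- matrix(seq(-2, 2, length.out = n), ncol = 1)    # training inputs
y  <- sin(3 * X[, 1]) + rnorm(n, sd = 0.1)            # noisy targets
Xs <- matrix(seq(-2, 2, length.out = 200), ncol = 1)  # test inputs
sigma_n <- 0.1                                        # noise sd (nugget = sigma_n^2)

K   <- sqexp_kernel(X, X)                             # K
Ks  <- sqexp_kernel(X, Xs)                            # K_*
Kss <- sqexp_kernel(Xs, Xs)                           # K_**
Ky  <- K + sigma_n^2 * diag(n)                        # K_y = K + sigma_n^2 * I

f_bar <- as.vector(t(Ks) %*% solve(Ky, y))            # posterior mean
f_cov <- Kss - t(Ks) %*% solve(Ky, Ks)                # posterior covariance
f_sd  <- sqrt(pmax(diag(f_cov), 0))                   # pointwise posterior sd

# posterior mean with a pointwise +/- 2 sd band over the training data
plot(X[, 1], y, xlab = "x", ylab = "f(x)", ylim = c(-2, 2))
lines(Xs[, 1], f_bar)
lines(Xs[, 1], f_bar + 2 * f_sd, lty = 2)
lines(Xs[, 1], f_bar - 2 * f_sd, lty = 2)

Setting sigma_n to 0 recovers the noise-free interpolator, which is one way to check the implementation against the slides' noise-free case.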
+ +\framebreak + + +If we can get the inner product directly \textbf{without} calculating the infinite feature vectors, we can infer an infinitely complicated model with a \textbf{finite amount} of computation. This idea is known as \textbf{kernel trick}. + +\lz + + A Gaussian process can be defined by either + +\begin{itemize} +\item deriving the covariance function explicitly via inner products of evaluations of basis functions or +\item choosing a positive definite kernel function (Mercer Kernel) directly, which corresponds - according to Mercer's theorem - to taking inner products in some (possibly infinite) feature space +\end{itemize} + +\end{vbframe} + +\begin{vbframe}{Summary: Gaussian process regression} + +\begin{itemize} +\item Gaussian process regression is equivalent to \textbf{kernelized} Bayesian linear regression +\item The covariance function describes the shape of the Gaussian process +\item With the right choice of covariance function, remarkably flexible models can be built +\item But: naive implementations of Gaussian process models scale poorly with large datasets as +\begin{itemize} +\item the kernel matrix has to be inverted / factorized, which is $\order(n^3)$, +\item computing the kernel matrix uses $\order(n^2)$ memory - running out of memory places a hard limit on problem sizes +\item generating predictions is $\order(n)$ for the mean, but $\order(n^2)$ for the variance. +\end{itemize} +(...so we need special tricks) +\end{itemize} + +\end{vbframe} + + +\endlecture +\end{document} diff --git a/slides/gaussian-processes/attic/slides-x-gp-classification.tex b/slides/gaussian-processes/attic/slides-x-gp-classification.tex new file mode 100644 index 00000000..56e460c6 --- /dev/null +++ b/slides/gaussian-processes/attic/slides-x-gp-classification.tex @@ -0,0 +1,149 @@ +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} +\input{../../latex-math/ml-gp} + +\newcommand{\titlefigure}{figure_man/gp-classification.png} +\newcommand{\learninggoals}{ + \item \textcolor{blue}{XXX} + \item \textcolor{blue}{XXX} +} + +\title{Introduction to Machine Learning} +\date{} + +\begin{document} + +\lecturechapter{Gaussian Process Classification} +\lecture{Introduction to Machine Learning} + +\begin{vbframe}{Gaussian process classification} + +\begin{itemize} +\item Consider a binary classification problem where we want to learn $h: \Xspace \to \Yspace$, where $\Yspace = \{0, 1\}$. +\item The idea behind Gaussian process classification is very simple: a GP prior is placed over the score function $\fx$ and then transformed to a class probability via a sigmoid function $s(t)$ + +$$ +p(y = 1 ~|~ \fx ) = s(\fx). +$$ + +\item This is a non-Gaussian likelihood, so we need to use approximate inference methods, e.g. Laplace approximation, expectation propagation, MCMC +\item For more details see \emph{Rasmussen, Gaussian Processes for Machine Learning, Chapter 3}. + +\end{itemize} + +\vspace*{1cm} + +\begin{figure} +\includegraphics[width=1\textwidth]{figure_man/gp-classification.png} +\end{figure} + +\framebreak + +% Inference is divided into two steps + +% \begin{enumerate} +% \item Computing the predictive distribution over the latent variable $f_*$ for a new observation $\xv_*$ + +% \vspace*{-0.5cm} + + +% $$ +% p(f_*~|~\bm{X}, \bm{y}, \bm{x}_*) = \int p(f_*~|~\bm{X}, \bm{x}_*, \bm{f})p(\bm{f}~|~\bm{X}, \bm{y})d\bm{f}, +% $$ + +% where $p(\bm{f}~|~\bm{X, y})$ is the posterior over the latent variables. 
(We integrate out the unobserved latent variable $\bm{f}$). +% \item Using this distribution over $f_*$ to compute the posterior probability for class $1$ + +% \vspace*{-0.5cm} + +% $$ +% p(y_* = 1~|~ \bm{X}, \bm{y}, \bm{x}_*) = \int \sigma(f_*)p(f_*~|~\bm{X}, \bm{y}, \bm{x}_*)df_*. +% $$ +% \end{enumerate} + +% \begin{footnotesize} +% Note that both expressions might be analytically intractable. Thus we need to use either analytic approximations of integrals (e. g. Laplace approximation) or solutions based on Monte Carlo sampling, which are not covered here. +% \end{footnotesize} + +According to Bayes' rule, the posterior (of the score function $\bm{f}$) + +\vspace*{-0.5cm} + +\begin{eqnarray*} + p(\bm{f} ~|~ \Xmat, \yv) &=& \frac{p(\yv ~|~ \bm{f}, \Xmat) \cdot p(\bm{f} ~|~ \Xmat)}{p(\yv ~|~\Xmat)} \propto p(\yv ~|~ \bm{f}) \cdot p(\bm{f} ~|~ \Xmat) +\end{eqnarray*} + +(the denominator is independent of $\bm{f}$ and thus dropped). + +\lz + +Since $p(\bm{f} ~|~ \Xmat) \sim \mathcal{N}\left(0, \bm{K}\right)$ by the GP assumption, we have + +\vspace*{-0.2cm} + +$$ + \log p(\bm{f} ~|~ \Xmat, y) \propto \log p(\yv ~|~ \bm{f}) - \frac{1}{2} \bm{f}^\top \bm{K}^{-1} \bm{f} - \frac{1}{2} \log |\bm{K}| - \frac{n}{2} \log 2 \pi. +$$ + +\framebreak + +If the kernel is fixed, the last two terms are fixed. To obtain the maximum a-posteriori estimate (MAP) we minimize + +$$ + \frac{1}{2} \bm{f}^\top \bm{K}^{-1} \bm{f} - \sumin \log p(\yi ~|~ f^{(i)}) + C. +$$ + +Note that $- \sumin \log p(\yi ~|~ f^{(i)})$ +is the logistic loss. We can see that Gaussian process classification corresponds to \textbf{kernel Bayesian logistic regression}! + +\end{vbframe} + +\begin{vbframe}{Comparison: GP vs. SVM } + +The SVM + +\begin{eqnarray*} + && \frac{1}{2} \|\thetab\|^2 + C \sumin \Lxyi, \\ +\end{eqnarray*} + +where $L(y, \fx) = \max\{0, 1-\fx\cdot y\}$ is the Hinge loss. + +\lz + +By the representer theorem we know that $\thetab = \sumin \beta_i \yi k\left(\xi, \cdot \right)$ and thus $\thetab^\top \thetab = \beta^\top \bm{K} \beta = \bm{f}^\top \bm{K}^{-1} \bm{f}$, as $\bm{K} \beta = \bm{f}$. Plugging that in, the optimization objective is + +\begin{eqnarray*} + && \frac{1}{2} \bm{f}^\top \bm{K}^{-1} \bm{f} + C \sumin \Lxyi. +\end{eqnarray*} + +\framebreak + +For log-concave likelihoods $\log p(\yv ~|~ \bm{f})$, there is a close correspondence between the MAP solution of the GP classifier + +$$ + \argmin_f \frac{1}{2} \bm{f}^\top \bm{K}^{-1} \bm{f} - \sumin \log p(\yi ~|~ f^{(i)}) + C \quad \text{(GP classifier)} +$$ + +and the SVM solution + +\begin{eqnarray*} + \argmin_f && \frac{1}{2} \bm{f}^\top \bm{K}^{-1} \bm{f} + C \sumin \Lxyi \quad \text{(SVM classifier)}. +\end{eqnarray*} + +\framebreak + +\begin{itemize} +\item Both the Hinge loss and the Bernoulli loss are monotonically decreasing with increasing margin $y \fx$. +\item The key difference is that the hinge loss takes on the value $0$ for $y \fx \ge 1$, while the Bernoulli loss just decays slowly. +\item It is this flat part of the hinge function that gives rise to the sparsity of the SVM solution. \item We can see the SVM classifier as a \enquote{sparse} GP classifier. 
+\end{itemize} + +\begin{figure} +\includegraphics[width=0.8\textwidth]{figure_man/gp-vs-svm.png} +\end{figure} + +\end{vbframe} + +\endlecture +\end{document} \ No newline at end of file diff --git a/slides/gaussian-processes/chapter-order.tex b/slides/gaussian-processes/chapter-order.tex new file mode 100644 index 00000000..ba9a1104 --- /dev/null +++ b/slides/gaussian-processes/chapter-order.tex @@ -0,0 +1,25 @@ +%Suggested order of slides +% slides-bayes-lm.tex +% slides-gp-basic.tex +% slides-gp-covariance.tex +% slides-gp-prediction.tex +% slides-gp-training.tex +% slides-gp-mean.tex + +% not included: +% slides-x-covariance-adv.tex +% slides-x-gp-additional.tex +% slides-x-gp-classification.tex + +\subsection{Bayes LM} +\includepdf[pages=-]{../../slides-pdf/slides-gp-bayes-lm.pdf} +\subsection{Gaussian Processes - Basics} +\includepdf[pages=-]{../../slides-pdf/slides-gp-basic.pdf} +\subsection{Gaussian Processes - Covariance} +\includepdf[pages=-]{../../slides-pdf/slides-gp-covariance.pdf} +\subsection{Gaussian Processes - Prediction} +\includepdf[pages=-]{../../slides-pdf/slides-gp-prediction.pdf} +\subsection{Gaussian Processes - Training} +\includepdf[pages=-]{../../slides-pdf/slides-gp-training.pdf} +\subsection{Gaussian Processes - Mean} +\includepdf[pages=-]{../../slides-pdf/slides-gp-mean.pdf} diff --git a/slides/gaussian-processes/figure/bayes_lm/example.pdf b/slides/gaussian-processes/figure/bayes_lm/example.pdf new file mode 100644 index 00000000..2b48dcdf Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/example.pdf differ diff --git a/slides/gaussian-processes/figure/bayes_lm/posterior_10_1.pdf b/slides/gaussian-processes/figure/bayes_lm/posterior_10_1.pdf new file mode 100644 index 00000000..d3dd0b90 Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/posterior_10_1.pdf differ diff --git a/slides/gaussian-processes/figure/bayes_lm/posterior_10_2.pdf b/slides/gaussian-processes/figure/bayes_lm/posterior_10_2.pdf new file mode 100644 index 00000000..57f040dc Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/posterior_10_2.pdf differ diff --git a/slides/gaussian-processes/figure/bayes_lm/posterior_10_3.pdf b/slides/gaussian-processes/figure/bayes_lm/posterior_10_3.pdf new file mode 100644 index 00000000..228741d0 Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/posterior_10_3.pdf differ diff --git a/slides/gaussian-processes/figure/bayes_lm/posterior_20_1.pdf b/slides/gaussian-processes/figure/bayes_lm/posterior_20_1.pdf new file mode 100644 index 00000000..2e23010a Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/posterior_20_1.pdf differ diff --git a/slides/gaussian-processes/figure/bayes_lm/posterior_20_2.pdf b/slides/gaussian-processes/figure/bayes_lm/posterior_20_2.pdf new file mode 100644 index 00000000..ad5ad767 Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/posterior_20_2.pdf differ diff --git a/slides/gaussian-processes/figure/bayes_lm/posterior_20_3.pdf b/slides/gaussian-processes/figure/bayes_lm/posterior_20_3.pdf new file mode 100644 index 00000000..428f6e8d Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/posterior_20_3.pdf differ diff --git a/slides/gaussian-processes/figure/bayes_lm/posterior_5_1.pdf b/slides/gaussian-processes/figure/bayes_lm/posterior_5_1.pdf new file mode 100644 index 00000000..0f490ff4 Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/posterior_5_1.pdf differ diff --git 
a/slides/gaussian-processes/figure/bayes_lm/posterior_5_2.pdf b/slides/gaussian-processes/figure/bayes_lm/posterior_5_2.pdf new file mode 100644 index 00000000..3d10b4cd Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/posterior_5_2.pdf differ diff --git a/slides/gaussian-processes/figure/bayes_lm/posterior_5_3.pdf b/slides/gaussian-processes/figure/bayes_lm/posterior_5_3.pdf new file mode 100644 index 00000000..a0d35570 Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/posterior_5_3.pdf differ diff --git a/slides/gaussian-processes/figure/bayes_lm/prior_1.pdf b/slides/gaussian-processes/figure/bayes_lm/prior_1.pdf new file mode 100644 index 00000000..1cccae2a Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/prior_1.pdf differ diff --git a/slides/gaussian-processes/figure/bayes_lm/prior_2.pdf b/slides/gaussian-processes/figure/bayes_lm/prior_2.pdf new file mode 100644 index 00000000..eaffb01b Binary files /dev/null and b/slides/gaussian-processes/figure/bayes_lm/prior_2.pdf differ diff --git a/slides/gaussian-processes/figure/covariance.pdf b/slides/gaussian-processes/figure/covariance.pdf new file mode 100644 index 00000000..fca18f1a Binary files /dev/null and b/slides/gaussian-processes/figure/covariance.pdf differ diff --git a/slides/gaussian-processes/figure/covariance2point/example_covariance_1.pdf b/slides/gaussian-processes/figure/covariance2point/example_covariance_1.pdf new file mode 100644 index 00000000..89d88349 Binary files /dev/null and b/slides/gaussian-processes/figure/covariance2point/example_covariance_1.pdf differ diff --git a/slides/gaussian-processes/figure/covariance2point/example_covariance_2.pdf b/slides/gaussian-processes/figure/covariance2point/example_covariance_2.pdf new file mode 100644 index 00000000..a5681bf4 Binary files /dev/null and b/slides/gaussian-processes/figure/covariance2point/example_covariance_2.pdf differ diff --git a/slides/gaussian-processes/figure/covariance2point/example_function_1_1.pdf b/slides/gaussian-processes/figure/covariance2point/example_function_1_1.pdf new file mode 100644 index 00000000..c12ddead Binary files /dev/null and b/slides/gaussian-processes/figure/covariance2point/example_function_1_1.pdf differ diff --git a/slides/gaussian-processes/figure/covariance2point/example_function_1_2.pdf b/slides/gaussian-processes/figure/covariance2point/example_function_1_2.pdf new file mode 100644 index 00000000..1a4b5edd Binary files /dev/null and b/slides/gaussian-processes/figure/covariance2point/example_function_1_2.pdf differ diff --git a/slides/gaussian-processes/figure/covariance2point/example_function_2_1.pdf b/slides/gaussian-processes/figure/covariance2point/example_function_2_1.pdf new file mode 100644 index 00000000..d137d622 Binary files /dev/null and b/slides/gaussian-processes/figure/covariance2point/example_function_2_1.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_10_1.pdf b/slides/gaussian-processes/figure/discrete/example_10_1.pdf new file mode 100644 index 00000000..d0db38b9 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_10_1.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_10_2.pdf b/slides/gaussian-processes/figure/discrete/example_10_2.pdf new file mode 100644 index 00000000..4556815b Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_10_2.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_10_3.pdf 
b/slides/gaussian-processes/figure/discrete/example_10_3.pdf new file mode 100644 index 00000000..b5545508 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_10_3.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_2_1.pdf b/slides/gaussian-processes/figure/discrete/example_2_1.pdf new file mode 100644 index 00000000..64272391 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_2_1.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_2_2.pdf b/slides/gaussian-processes/figure/discrete/example_2_2.pdf new file mode 100644 index 00000000..fc7e1c63 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_2_2.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_2_3.pdf b/slides/gaussian-processes/figure/discrete/example_2_3.pdf new file mode 100644 index 00000000..dcff7319 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_2_3.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_5_1.pdf b/slides/gaussian-processes/figure/discrete/example_5_1.pdf new file mode 100644 index 00000000..c3b0d650 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_5_1.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_5_2.pdf b/slides/gaussian-processes/figure/discrete/example_5_2.pdf new file mode 100644 index 00000000..66a6d1eb Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_5_2.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_5_3.pdf b/slides/gaussian-processes/figure/discrete/example_5_3.pdf new file mode 100644 index 00000000..aee687f5 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_5_3.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_extreme_50_1.pdf b/slides/gaussian-processes/figure/discrete/example_extreme_50_1.pdf new file mode 100644 index 00000000..4de06095 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_extreme_50_1.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_extreme_50_2.pdf b/slides/gaussian-processes/figure/discrete/example_extreme_50_2.pdf new file mode 100644 index 00000000..95630f6b Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_extreme_50_2.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_extreme_50_3.pdf b/slides/gaussian-processes/figure/discrete/example_extreme_50_3.pdf new file mode 100644 index 00000000..e7e72be9 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_extreme_50_3.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_extreme_50_4.pdf b/slides/gaussian-processes/figure/discrete/example_extreme_50_4.pdf new file mode 100644 index 00000000..41834526 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_extreme_50_4.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_limit.pdf b/slides/gaussian-processes/figure/discrete/example_limit.pdf new file mode 100644 index 00000000..13ca1d80 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_limit.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_marginalization_10.pdf b/slides/gaussian-processes/figure/discrete/example_marginalization_10.pdf new file mode 100644 index 00000000..98d675ca Binary files /dev/null and 
b/slides/gaussian-processes/figure/discrete/example_marginalization_10.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_marginalization_5.pdf b/slides/gaussian-processes/figure/discrete/example_marginalization_5.pdf new file mode 100644 index 00000000..0ecc2bab Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_marginalization_5.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_marginalization_50.pdf b/slides/gaussian-processes/figure/discrete/example_marginalization_50.pdf new file mode 100644 index 00000000..1a85a5b8 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_marginalization_50.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_10_1_a.pdf b/slides/gaussian-processes/figure/discrete/example_norm_10_1_a.pdf new file mode 100644 index 00000000..26d341b0 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_10_1_a.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_10_1_b.pdf b/slides/gaussian-processes/figure/discrete/example_norm_10_1_b.pdf new file mode 100644 index 00000000..2974d841 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_10_1_b.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_10_2_a.pdf b/slides/gaussian-processes/figure/discrete/example_norm_10_2_a.pdf new file mode 100644 index 00000000..90de4857 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_10_2_a.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_10_2_b.pdf b/slides/gaussian-processes/figure/discrete/example_norm_10_2_b.pdf new file mode 100644 index 00000000..2974d841 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_10_2_b.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_10_3_a.pdf b/slides/gaussian-processes/figure/discrete/example_norm_10_3_a.pdf new file mode 100644 index 00000000..019d18a5 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_10_3_a.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_10_3_b.pdf b/slides/gaussian-processes/figure/discrete/example_norm_10_3_b.pdf new file mode 100644 index 00000000..2974d841 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_10_3_b.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_2_1_a.pdf b/slides/gaussian-processes/figure/discrete/example_norm_2_1_a.pdf new file mode 100644 index 00000000..3242205e Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_2_1_a.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_2_1_b.pdf b/slides/gaussian-processes/figure/discrete/example_norm_2_1_b.pdf new file mode 100644 index 00000000..5abb0b68 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_2_1_b.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_2_2_a.pdf b/slides/gaussian-processes/figure/discrete/example_norm_2_2_a.pdf new file mode 100644 index 00000000..a4225e33 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_2_2_a.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_2_2_b.pdf b/slides/gaussian-processes/figure/discrete/example_norm_2_2_b.pdf new file mode 100644 index 00000000..dc80f845 Binary 
files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_2_2_b.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_2_3_a.pdf b/slides/gaussian-processes/figure/discrete/example_norm_2_3_a.pdf new file mode 100644 index 00000000..b533d43b Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_2_3_a.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_2_3_b.pdf b/slides/gaussian-processes/figure/discrete/example_norm_2_3_b.pdf new file mode 100644 index 00000000..f75c2aee Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_2_3_b.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_5_1_a.pdf b/slides/gaussian-processes/figure/discrete/example_norm_5_1_a.pdf new file mode 100644 index 00000000..a13dc92f Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_5_1_a.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_5_1_b.pdf b/slides/gaussian-processes/figure/discrete/example_norm_5_1_b.pdf new file mode 100644 index 00000000..2e42d63f Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_5_1_b.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_5_2_a.pdf b/slides/gaussian-processes/figure/discrete/example_norm_5_2_a.pdf new file mode 100644 index 00000000..63cf9a1c Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_5_2_a.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_5_2_b.pdf b/slides/gaussian-processes/figure/discrete/example_norm_5_2_b.pdf new file mode 100644 index 00000000..2e42d63f Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_5_2_b.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_5_3_a.pdf b/slides/gaussian-processes/figure/discrete/example_norm_5_3_a.pdf new file mode 100644 index 00000000..96a4a393 Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_5_3_a.pdf differ diff --git a/slides/gaussian-processes/figure/discrete/example_norm_5_3_b.pdf b/slides/gaussian-processes/figure/discrete/example_norm_5_3_b.pdf new file mode 100644 index 00000000..2e42d63f Binary files /dev/null and b/slides/gaussian-processes/figure/discrete/example_norm_5_3_b.pdf differ diff --git a/slides/gaussian-processes/figure/gp_pred/1.pdf b/slides/gaussian-processes/figure/gp_pred/1.pdf new file mode 100644 index 00000000..2aa9d5f3 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_pred/1.pdf differ diff --git a/slides/gaussian-processes/figure/gp_pred/2.pdf b/slides/gaussian-processes/figure/gp_pred/2.pdf new file mode 100644 index 00000000..18444b4b Binary files /dev/null and b/slides/gaussian-processes/figure/gp_pred/2.pdf differ diff --git a/slides/gaussian-processes/figure/gp_pred/3.pdf b/slides/gaussian-processes/figure/gp_pred/3.pdf new file mode 100644 index 00000000..d7464088 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_pred/3.pdf differ diff --git a/slides/gaussian-processes/figure/gp_pred/4.pdf b/slides/gaussian-processes/figure/gp_pred/4.pdf new file mode 100644 index 00000000..676f1c34 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_pred/4.pdf differ diff --git a/slides/gaussian-processes/figure/gp_pred/5.pdf b/slides/gaussian-processes/figure/gp_pred/5.pdf new file mode 100644 index 00000000..8a530c9c Binary files 
/dev/null and b/slides/gaussian-processes/figure/gp_pred/5.pdf differ diff --git a/slides/gaussian-processes/figure/gp_pred/6.pdf b/slides/gaussian-processes/figure/gp_pred/6.pdf new file mode 100644 index 00000000..669f2b85 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_pred/6.pdf differ diff --git a/slides/gaussian-processes/figure/gp_pred/gp_interpolator.pdf b/slides/gaussian-processes/figure/gp_pred/gp_interpolator.pdf new file mode 100644 index 00000000..41dc41fa Binary files /dev/null and b/slides/gaussian-processes/figure/gp_pred/gp_interpolator.pdf differ diff --git a/slides/gaussian-processes/figure/gp_pred/gp_regression.pdf b/slides/gaussian-processes/figure/gp_pred/gp_regression.pdf new file mode 100644 index 00000000..60c01036 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_pred/gp_regression.pdf differ diff --git a/slides/gaussian-processes/figure/gp_pred/post_mean.pdf b/slides/gaussian-processes/figure/gp_pred/post_mean.pdf new file mode 100644 index 00000000..d5c9df55 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_pred/post_mean.pdf differ diff --git a/slides/gaussian-processes/figure/gp_pred/post_variance.pdf b/slides/gaussian-processes/figure/gp_pred/post_variance.pdf new file mode 100644 index 00000000..d59dba78 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_pred/post_variance.pdf differ diff --git a/slides/gaussian-processes/figure/gp_sample/1_1.pdf b/slides/gaussian-processes/figure/gp_sample/1_1.pdf new file mode 100644 index 00000000..5d459b5e Binary files /dev/null and b/slides/gaussian-processes/figure/gp_sample/1_1.pdf differ diff --git a/slides/gaussian-processes/figure/gp_sample/2_1.pdf b/slides/gaussian-processes/figure/gp_sample/2_1.pdf new file mode 100644 index 00000000..f9d9f104 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_sample/2_1.pdf differ diff --git a/slides/gaussian-processes/figure/gp_sample/2_2.pdf b/slides/gaussian-processes/figure/gp_sample/2_2.pdf new file mode 100644 index 00000000..fff31c4d Binary files /dev/null and b/slides/gaussian-processes/figure/gp_sample/2_2.pdf differ diff --git a/slides/gaussian-processes/figure/gp_sample/2_3.pdf b/slides/gaussian-processes/figure/gp_sample/2_3.pdf new file mode 100644 index 00000000..1896a8dd Binary files /dev/null and b/slides/gaussian-processes/figure/gp_sample/2_3.pdf differ diff --git a/slides/gaussian-processes/figure/gp_sample/2_4.pdf b/slides/gaussian-processes/figure/gp_sample/2_4.pdf new file mode 100644 index 00000000..baeb6064 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_sample/2_4.pdf differ diff --git a/slides/gaussian-processes/figure/gp_sample/different_samples.pdf b/slides/gaussian-processes/figure/gp_sample/different_samples.pdf new file mode 100644 index 00000000..10b51e14 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_sample/different_samples.pdf differ diff --git a/slides/gaussian-processes/figure/gp_sample/varying_length_scale.pdf b/slides/gaussian-processes/figure/gp_sample/varying_length_scale.pdf new file mode 100644 index 00000000..5d37172c Binary files /dev/null and b/slides/gaussian-processes/figure/gp_sample/varying_length_scale.pdf differ diff --git a/slides/gaussian-processes/figure/gp_training/datapoints.pdf b/slides/gaussian-processes/figure/gp_training/datapoints.pdf new file mode 100644 index 00000000..0334b8d3 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_training/datapoints.pdf differ diff --git 
a/slides/gaussian-processes/figure/gp_training/datapoints_0_2.pdf b/slides/gaussian-processes/figure/gp_training/datapoints_0_2.pdf new file mode 100644 index 00000000..28ce3309 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_training/datapoints_0_2.pdf differ diff --git a/slides/gaussian-processes/figure/gp_training/datapoints_0_5.pdf b/slides/gaussian-processes/figure/gp_training/datapoints_0_5.pdf new file mode 100644 index 00000000..87b65fec Binary files /dev/null and b/slides/gaussian-processes/figure/gp_training/datapoints_0_5.pdf differ diff --git a/slides/gaussian-processes/figure/gp_training/datapoints_2.pdf b/slides/gaussian-processes/figure/gp_training/datapoints_2.pdf new file mode 100644 index 00000000..59bd342c Binary files /dev/null and b/slides/gaussian-processes/figure/gp_training/datapoints_2.pdf differ diff --git a/slides/gaussian-processes/figure/gp_training/fit_vs_penalty.pdf b/slides/gaussian-processes/figure/gp_training/fit_vs_penalty.pdf new file mode 100644 index 00000000..9a66e011 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_training/fit_vs_penalty.pdf differ diff --git a/slides/gaussian-processes/figure/gp_training/fit_vs_penalty_0_2.pdf b/slides/gaussian-processes/figure/gp_training/fit_vs_penalty_0_2.pdf new file mode 100644 index 00000000..ae182b94 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_training/fit_vs_penalty_0_2.pdf differ diff --git a/slides/gaussian-processes/figure/gp_training/fit_vs_penalty_0_5.pdf b/slides/gaussian-processes/figure/gp_training/fit_vs_penalty_0_5.pdf new file mode 100644 index 00000000..18905917 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_training/fit_vs_penalty_0_5.pdf differ diff --git a/slides/gaussian-processes/figure/gp_training/fit_vs_penalty_2.pdf b/slides/gaussian-processes/figure/gp_training/fit_vs_penalty_2.pdf new file mode 100644 index 00000000..e2646fb1 Binary files /dev/null and b/slides/gaussian-processes/figure/gp_training/fit_vs_penalty_2.pdf differ diff --git a/slides/gaussian-processes/figure_man/covariance2D-2.png b/slides/gaussian-processes/figure_man/covariance2D-2.png new file mode 100644 index 00000000..72e12f70 Binary files /dev/null and b/slides/gaussian-processes/figure_man/covariance2D-2.png differ diff --git a/slides/gaussian-processes/figure_man/covariance2D.png b/slides/gaussian-processes/figure_man/covariance2D.png new file mode 100644 index 00000000..435853e5 Binary files /dev/null and b/slides/gaussian-processes/figure_man/covariance2D.png differ diff --git a/slides/gaussian-processes/figure_man/discrete/marginalization-5.png b/slides/gaussian-processes/figure_man/discrete/marginalization-5.png new file mode 100644 index 00000000..34d1ef6f Binary files /dev/null and b/slides/gaussian-processes/figure_man/discrete/marginalization-5.png differ diff --git a/slides/gaussian-processes/figure_man/discrete/marginalization-more.png b/slides/gaussian-processes/figure_man/discrete/marginalization-more.png new file mode 100644 index 00000000..0c40481e Binary files /dev/null and b/slides/gaussian-processes/figure_man/discrete/marginalization-more.png differ diff --git a/slides/gaussian-processes/figure_man/indexed_family/indexed_family_1.png b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_1.png new file mode 100644 index 00000000..b297f20a Binary files /dev/null and b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_1.png differ diff --git 
a/slides/gaussian-processes/figure_man/indexed_family/indexed_family_2.png b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_2.png new file mode 100644 index 00000000..1bbd9060 Binary files /dev/null and b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_2.png differ diff --git a/slides/gaussian-processes/figure_man/indexed_family/indexed_family_3.png b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_3.png new file mode 100644 index 00000000..688bc069 Binary files /dev/null and b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_3.png differ diff --git a/slides/gaussian-processes/figure_man/indexed_family/indexed_family_4.png b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_4.png new file mode 100644 index 00000000..b1d098ae Binary files /dev/null and b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_4.png differ diff --git a/slides/gaussian-processes/figure_man/indexed_family/indexed_family_5.png b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_5.png new file mode 100644 index 00000000..bcb6ac0f Binary files /dev/null and b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_5.png differ diff --git a/slides/gaussian-processes/figure_man/indexed_family/indexed_family_6.png b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_6.png new file mode 100644 index 00000000..1fc613ac Binary files /dev/null and b/slides/gaussian-processes/figure_man/indexed_family/indexed_family_6.png differ diff --git a/slides/gaussian-processes/figure_man/question.png b/slides/gaussian-processes/figure_man/question.png new file mode 100644 index 00000000..16fe498c Binary files /dev/null and b/slides/gaussian-processes/figure_man/question.png differ diff --git a/slides/gaussian-processes/rsrc/ggsave-classification.R b/slides/gaussian-processes/rsrc/ggsave-classification.R new file mode 100644 index 00000000..4c0aa82c --- /dev/null +++ b/slides/gaussian-processes/rsrc/ggsave-classification.R @@ -0,0 +1,65 @@ +library(knitr) +library(mlr) +library(mlrMBO) +library(ggplot2) +library(gridExtra) +library(reshape2) +library(kernlab) +library(mvtnorm) +library(gptk) +library(smoof) + +############################################################################ + + +squared.exp = function(x1, x2, l = 0.1) { + + D = as.matrix(dist(c(x1, x2), method = "euclidean")) + + K = exp(-1 / 2 * D^2 / l^2) + + return(K) +} + +set.seed(123) +n = 100 +x = seq(-2, 2, length.out = n) + +x.obs = c(-1, 1, 1.5) + +K = squared.exp(x, x.obs, l = 0.4) +K.xx = K[(n + 1):nrow(K), (n + 1):nrow(K)] +K.xxs = K[1:n, (n + 1):nrow(K)] +K.xsxs = K[1:n, 1:n] + +df = data.frame(x = x) + +# Drawing from Gaussian process Prior +df[, 2] = as.vector(mvtnorm::rmvnorm(1, sigma = K.xsxs)) +df[, 3] = 1 / (1 + exp(- df[, 2])) + +p1 = ggplot(data = df, aes(x = x, y = V2)) + geom_line() + theme_bw() +p1 = p1 + xlab("x") + ylab("f(x)") + ylim(c(-3, 3)) +p1 = p1 + theme(legend.position = "none") + ggtitle("Function drawn from a GP prior") + +p2 = ggplot(data = df, aes(x = x, y = V3)) + geom_line() + theme_bw() +p2 = p2 + xlab("x") + ylab(expression(s(f(x)))) + ylim(c(0, 1)) +p2 = p2 + theme(legend.position = "none") + ggtitle("Function transformed into probs") + +grid.arrange(p1, p2, nrow = 1) + +ggsave("figure_man/gp-classification.png") + +############################################################################ + +hinge = function(r) { + pmax(0, 1 - r) +} + +x = seq(-2, 2, by = 0.01); y = exp(-x) +df 
= data.frame(x = x, Logistic = y, Hinge = hinge(x)) +df = melt(df, id.vars = "x") + +p = ggplot(data = df) + geom_line(aes(x = x, y = value, color = variable)) + labs(color = "") +p + diff --git a/slides/gaussian-processes/rsrc/make_bayes_lm_plots.R b/slides/gaussian-processes/rsrc/make_bayes_lm_plots.R new file mode 100644 index 00000000..fbc3273a --- /dev/null +++ b/slides/gaussian-processes/rsrc/make_bayes_lm_plots.R @@ -0,0 +1,97 @@ +library(mvtnorm) +library(ggplot2) + + +# Data + +set.seed(1234) + +n <- 50 + +x <- seq(-3.5, 7, length.out = n) +x <- sample(x, length(x)) +y <- 0.5 * x + rnorm(n) + +d <- data.frame(x0 = rep(1, n), x = x, y = y) + +p <- ggplot(data = d, aes(x = x, y = y)) + + geom_point() + + theme_bw() + + ylab("x") + + ylab("y") + +ggsave("../figure/bayes_lm/example.pdf", width = 4, height = 3) + + +# Prior and Posterior for the Bayes Linear Model + +grid <- expand.grid(theta0 = seq(-3, 3, length.out = 200), theta1 = seq(-3, 3, length.out = 200)) + +# Prior +# using a standard normal prior +probs <- cbind(grid, prior = mvtnorm::dmvnorm(grid)) + +p1 <- ggplot(probs, aes(x = theta0, y = theta1, z = prior)) + geom_contour() +p1 <- p1 + coord_fixed(xlim = c(-3, 3), ylim = c(-2, 2)) +p1 <- p1 + xlab(expression(theta[0])) + ylab(expression(theta[1])) +p1 <- p1 + geom_raster(aes(fill = prior)) + geom_contour(colour = "white", bins = 5) +p1 <- p1 + geom_point(x = 0, y = 0, color = "orange", size = 2) +p1 <- p1 + guides(fill = FALSE) + ggtitle(expression(paste("Prior ", theta, "~", N(0, 1)))) + theme(title = element_text(size = 11)) +p1 <- p1 + theme_bw() + +ggsave("../figure/bayes_lm/prior_1.pdf", p1, width = 4, height = 4) + +p2 <- ggplot() + geom_point(data = as.data.frame(d), aes(x = x, y = y), colour = "grey") +p2 <- p2 + geom_abline(intercept = 0, slope = 0, color = "orange") +p2 <- p2 + ggtitle("No data points observed") + theme_bw() + +ggsave("../figure/bayes_lm/prior_2.pdf", p2, width = 4, height = 4) + + +# Posterior distribution +nobs <- c(5, 10, 20) +d <- as.matrix(d) + +titles <- c("Posterior distribution after 5 obs.", "Posterior distribution after 10 obs.", "Posterior distribution after 20 obs.") + +for (j in 1:length(nobs)) { + i <- nobs[j] + A <- t(d[1:i, 1:2]) %*% d[1:i, 1:2] + diag(2) + b <- as.vector(d[1:i, 3] %*% d[1:i, 1:2]) + m.post <- solve(A, b) + sigma.post <- solve(A) + probs$posterior <- dmvnorm(grid, mean = m.post, sigma = sigma.post) + + p1 <- ggplot(probs, aes(x = theta0, y = theta1, z = posterior)) + geom_contour(colour = i) + p1 <- p1 + coord_fixed(xlim = c(-3, 3), ylim = c(-2, 2)) + p1 <- p1 + xlab(expression(theta[0])) + ylab(expression(theta[1])) + p1 <- p1 + geom_raster(aes(fill = posterior)) + geom_contour(colour = "white", bins = 5) + p1 <- p1 + geom_point(x = m.post[1], y = m.post[2], color = "orange", size = 2) + p1 <- p1 + guides(fill = FALSE) + ggtitle(expression(paste("Posterior of ", theta))) + theme(title = element_text(size = 11)) + p1 <- p1 + theme_bw() + + ggsave(paste0("../figure/bayes_lm/posterior_", i, "_1.pdf"), p1, width = 4, height = 4) + + p2 <- ggplot() + geom_point(data = as.data.frame(d), aes(x = x, y = y), colour = "grey") + p2 <- p2 + geom_point(data = as.data.frame(d[1:i, ]), aes(x = x, y = y)) + p2 <- p2 + geom_abline(intercept = m.post[1], slope = m.post[2], color = "orange") + p2 <- p2 + ggtitle(paste0("MAP after observing ", i, " data points")) + theme_bw() + + ggsave(paste0("../figure/bayes_lm/posterior_", i, "_2.pdf"), p2, width = 4, height = 4) + + xpred <- data.frame(x0 = rep(1, 200), x = as.numeric(seq(-3.5, 
7, length.out = 200))) + postmean <- as.matrix(xpred) %*% m.post + post.sd <- as.matrix(xpred) %*% sigma.post %*% t(as.matrix(xpred)) + xpred$postmean <- postmean + xpred$postsd_upper <- as.vector(xpred$postmean + 2 * diag(post.sd)) + xpred$postsd_lower <- as.vector(xpred$postmean - 2 * diag(post.sd)) + + p2 <- ggplot(data = xpred) + geom_ribbon(aes(x = x, ymin = postsd_lower, ymax = postsd_upper), fill = "grey", alpha = 0.5) + p2 <- p2 + geom_point(data = as.data.frame(d), aes(x = x, y = y), colour = "grey") + p2 <- p2 + geom_point(data = as.data.frame(d[1:i, ]), aes(x = x, y = y)) + p2 <- p2 + geom_abline(intercept = m.post[1], slope = m.post[2], color = "orange") + p2 <- p2 + ggtitle(paste0("MAP after observing ", i, " data points")) + theme_bw() + p2 + ggsave(paste0("../figure/bayes_lm/posterior_", i, "_3.pdf"), p2, width = 4, height = 4) + +} diff --git a/slides/gaussian-processes/rsrc/make_covariance2point_plots.R b/slides/gaussian-processes/rsrc/make_covariance2point_plots.R new file mode 100644 index 00000000..6a29df9d --- /dev/null +++ b/slides/gaussian-processes/rsrc/make_covariance2point_plots.R @@ -0,0 +1,87 @@ +# ----------------------------------------------------------------------- # +# Covariance 2D example # +# ----------------------------------------------------------------------- # + +# Initialization -------------------------------------------------------- # +library(mvtnorm) +library(ggplot2) +library(ggpubr) +library(gridExtra) + +# ----------------------------------------------------------------------- # + +# Squaorange exponential Covariance Function ------------------------------- # + +sqexp <- function(d, l = 1) { + exp(- 0.5 * d^2 / l) +} + +sqexp.vec <- function(x, l = 1) { + D <- as.matrix(dist(x)) + res <- sqexp(D) + return(res) + } + +# Example for a discrete function with input space containing two points +x <- c(3, 2.5, 5) +df <- data.frame(x = x) + +m <- c(0, 0, 0) +K <- sqexp.vec(x) + +# sample +set.seed(1234) +y <- t(rmvnorm(1, mean = m, sigma = K)) +df$y <- y + + +p1 <- ggplot(data.frame(x = c(- 4, 4)), aes(x)) + + stat_function(fun = sqexp, geom = "line") + + theme_bw() + + geom_point(aes(x = 0.5, y = sqexp(0.5)), color = "orange", size = 2) + + geom_segment(aes(x = 0.5, xend = 0.5, y = 0, yend = sqexp(0.5)), color = "orange", lty = 2) + + geom_text(aes(x = 1.8, y = sqexp(0.5), label = "high \n correlation \n of y values"), color = "orange", size = 3) + + xlab("d") + + ylab("k(d)") + + ggtitle("Covariance Function") + +p2 <- ggplot() + + ylim(c(-2, 2)) + + xlim(c(0, 6)) + + theme_bw() + + geom_vline(data = df, aes(xintercept = x), color = "grey", lty = 2) + + geom_text(aes(x = x[3] - 0.3, y = -2, label = "x[3]"), size = 3, parse = TRUE, colour = "darkgrey") + + geom_text(aes(x = x[1] + 0.3, y = -2, label = "x[1]"), size = 3, parse = TRUE, colour = "darkgrey") + + geom_segment(aes(x = x[1], xend = x[2], y = y[1], yend = y[1]), color = "orange") + + geom_text(aes(x = x[1] + 0.5 * (x[2] - x[1]), y = y[1] - 0.4, label = "d = 0.5"), color = "orange", size = 3) + + geom_text(aes(x = x[1] + 0.3, y = y[1], label = "y[1]"), size = 3, parse = TRUE) + + geom_point(aes(x = x[1], y = y[1]), size = 2) + + xlab("x") + + ylab("f(x)") + +ggsave(filename = "../figure/covariance2point/example_covariance_1.pdf", plot = p1,, height = 3, width = 3) +ggsave(filename = "../figure/covariance2point/example_function_1_1.pdf", plot = p2, height = 2.5, width = 3) + +p2 <- p2 + geom_segment(aes(x = x[2], xend = x[2], y = y[1], yend = y[2]), color = "orange") +p2 <- p2 + 
geom_point(aes(x = x[2], y = y[2]), size = 2) +p2 <- p2 + geom_text(aes(x = x[2] - 0.3, y = -2, label = "x[2]"), size = 3, parse = TRUE, colour = "darkgrey") +p2 <- p2 + geom_text(aes(x = x[2] - 0.3, y = y[2], label = "y[2]"), size = 3, parse = TRUE) +# p2 = p2 + geom_text(aes(x = x[2] - 1, y = y[2], label = "y values \n close"), size = 3, color = "orange") + +ggsave(filename = "../figure/covariance2point/example_function_1_2.pdf", plot = p2, height = 2.5, width = 3) + + +p1 <- p1 + geom_point(aes(x = - 2.5, y = sqexp(- 2.5)), color = "blue", size = 2) +p1 <- p1 + geom_segment(aes(x = - 2.5, xend = - 2.5, y = 0, yend = sqexp(2.5)), color = "blue", lty = 2) +p1 <- p1 + geom_text(aes(x = - 3, y = sqexp(- 2.5) + 0.2, label = "low \n correlation \n of y values"), color = "blue", size = 3) +p1 <- p1 + xlab("d") + ylab("k(d)") + +ggsave(filename = "../figure/covariance2point/example_covariance_2.pdf", plot = p1,, height = 3, width = 3) + +p2 <- p2 + geom_segment(aes(x = x[1], xend = x[3], y = y[1], yend = y[1]), color = "blue") +p2 <- p2 + geom_segment(aes(x = x[3], xend = x[3], y = y[1], yend = y[3]), color = "blue") +p2 <- p2 + geom_point(aes(x = x[3], y = y[3]), size = 2) +p2 <- p2 + geom_text(aes(x = x[3] - 0.3, y = y[3], label = "y[3]"), size = 3, parse = TRUE) + +ggsave(filename = "../figure/covariance2point/example_function_2_1.pdf", plot = p2, height = 2.5, width = 3) + diff --git a/slides/gaussian-processes/rsrc/make_covariance_plot.R b/slides/gaussian-processes/rsrc/make_covariance_plot.R new file mode 100644 index 00000000..eaa21e89 --- /dev/null +++ b/slides/gaussian-processes/rsrc/make_covariance_plot.R @@ -0,0 +1,94 @@ +# Generation of Samples for different Covariance Functions + +# Initialization +library(MASS) +library(ggplot2) +library(RandomFieldsUtils) + +set.seed(123) + +n <- 1000 # number of points +x <- seq(-2, 2, length.out = n) # n equally spaced points +D <- as.matrix(dist(x, method = "euclidean")) # distance matrix + +# Squared exponential Covariance Function + +## Parameters to try out +l <- c(0.1, 1, 10) + +## Corresponding kernel matrixes +squared.exp <- lapply(l, function(l) exp(-1 / 2 * D^2 / l^2)) + +## Sampling from corresponding Gaussian +df.squared.exp <- sapply(squared.exp, function(x) as.vector(mvrnorm(1, mu = rep(0, length = ncol(x)), Sigma = x))) +df.squared.exp <- as.data.frame(df.squared.exp) +names(df.squared.exp) <- as.character(l) +df.squared.exp$x <- x + +## Plot +df.m <- melt(df.squared.exp, id.vars = "x") +p1 <- ggplot(data = df.m, aes(x = x, y = value, colour = variable)) +p1 <- p1 + geom_line() +p1 <- p1 + xlab("x") + ylab("f(x)") + ylim(c(-5, 5)) +p1 <- p1 + scale_colour_discrete(name = "Length Scale") +p1 <- p1 + theme_bw() + theme(legend.position = "top", legend.direction = "horizontal") +p1 <- p1 + ggtitle("Squared Exponential Covariance Function") + +# ------------------------------------------------------------------------ # + +# Polynomial Covariance Function ----------------------------------------- # + +## parameters to try out +p <- c(1, 2, 3) +## Corresponding kernel matrixes +poly <- lapply(p, function(l) as.matrix((x %*% t(x))^l)) +## Sampling from corresponding Gaussian +df.poly <- sapply(poly, function(x) as.vector(mvrnorm(1, mu = rep(0, ncol(x)), Sigma = x))) +df.poly <- as.data.frame(df.poly) +names(df.poly) <- as.character(p) +df.poly$x <- x + +## Plot +df.m <- melt(df.poly, id.vars = "x") +p2 <- ggplot(data = df.m, aes(x = x, y = value, colour = variable)) +p2 <- p2 + geom_line() +p2 <- p2 + xlab("x") + ylab("f(x)") + ylim(c(-5, 
5)) +p2 <- p2 + scale_colour_discrete(name = "Degree") +p2 <- p2 + theme_bw() + theme(legend.position = "top", legend.direction = "horizontal") +p2 <- p2 + ggtitle("Polynomial Covariance Function") + +# ------------------------------------------------------------------------ # + + +# Matérn Covariance Function --------------------------------------------- # + +## parameters to try out +nu <- c(0.5, 2, 10) +## Corresponding kernel matrixes +matern <- lapply(nu, function(l) matrix(matern(D, nu = l, scaling = "matern"), + nrow = n, ncol = n)) +## Sampling from corresponding Gaussian +df.matern <- sapply(matern, function(x) as.vector(mvrnorm(1, mu = rep(0, ncol(x)), Sigma = x))) +df.matern <- as.data.frame(df.matern) +names(df.matern) <- as.character(nu) +df.matern$x <- x + +## Plot +df.m <- melt(df.matern, id.vars = "x") +p3 <- ggplot(data = df.m, aes(x = x, y = value, colour = variable)) +p3 <- p3 + geom_line() +p3 <- p3 + xlab("x") + ylab("f(x)") + ylim(c(-5, 5)) +p3 <- p3 + theme_bw() + theme(legend.position = "top", legend.direction = "horizontal") +p3 <- p3 + scale_colour_discrete(name = expression(paste(nu))) +p3 <- p3 + ggtitle("Matérn Covariance Functions") + +# ------------------------------------------------------------------------ # + +# Save the Plot ---------------------------------------------------------- # + +g <- grid.arrange(p1, p2, p3, ncol = 3) + +ggsave(filename = "../figure/covariance.pdf", plot = g, width = 10, height = 5) + +# ------------------------------------------------------------------------ # + diff --git a/slides/gaussian-processes/rsrc/make_discrete_plots.R b/slides/gaussian-processes/rsrc/make_discrete_plots.R new file mode 100644 index 00000000..9f6f96df --- /dev/null +++ b/slides/gaussian-processes/rsrc/make_discrete_plots.R @@ -0,0 +1,336 @@ +# Distributions over discrete functions + +library(ggplot2) +library(gridExtra) +library(mvtnorm) +library(matrixcalc) +library(reshape2) + + +plotDiscreteFunction <- function(x, y, xlim, ylim) { + + df <- data.frame(x = x, y = y) + + p1 <- ggplot() + + xlim(xlim) + + theme_bw() + if (nrow(df) >= 20) { + # p1 = p1 + geom_line(data = df, aes(x = x, y = y, xend = x, yend = -Inf), colour = "grey", alpha = 0.5) + p1 <- p1 + + geom_point(data = df, + aes(x = x, y = y, color = x), + size = 1, shape = + 15) + } else { + p1 <- p1 + + geom_segment(data = df, + aes(x = x, y = y, xend = x, yend = -Inf), + color = "grey", + lty = 2) + + geom_point(data = df, aes(x = x, y = y, color = x), size = 3, shape = 15) + } + p1 <- p1 + + scale_x_continuous(breaks = x, labels = round(x, 1)) + + scale_color_gradientn(colours = c(low = "#E0E0E0", high = "#004C99")) + + ylab("h(x)") + ylim(ylim) + + theme( + plot.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + legend.position = "none" + ) + + return(p1) +} + +plotBivariateDensity <- function(xlim, ylim, mu, sigma) { + + grid <- expand.grid(y1 = seq(xlim[1], xlim[2], length.out = 100), + y2 = seq(ylim[1], ylim[2], length.out = 100)) + + probs <- cbind(grid, + density = mvtnorm::dmvnorm(grid, mean = mu, sigma = sigma)) + + p2 <- ggplot() + + xlab(expression(y[1])) + + ylab(expression(y[2])) + + geom_raster(data = probs, + aes(x = y1, y = y2, fill = density)) + + geom_contour(data = probs, + aes(x = y1, y = y2, z = density), + colour = "white", + bins = 5) + + coord_fixed(xlim = xlim, ylim = ylim) + + guides(fill = FALSE) + + return(p2) +} + + +createMat <- function(n, scaleFactor) { + m <- matrix(data = 1, nrow = n, ncol = n) + if 
(scaleFactor< 0 | scaleFactor > 1) + stop("Scaling factor must be between 0 and 1") + + for (row in 1:(n-1)) { + for (col in 1:(n-1)) { + if (row <= col) m[row, col+1] <- round(m[row, col]*0.9, digits = 2) + if (row >= col) m[row+1, col] <- round(m[row, col]*0.9, digits = 2) + } + } + return(m) +} + + +squared.exp <- function(x1, x2, l = 0.1) { + + D <- as.matrix(dist(c(x1, x2), method = "euclidean")) + + K <- exp(-1 / 2 * D^2 / l^2) + + return(K) +} + + + + +# --- Initial Examples + +set.seed(1111) + +for (ninputs in c(2, 5, 10)) { + + x <- seq(0, 1, length.out = ninputs) + + for (i in 1:3) { + + y <- runif(ninputs, -1, 1) + + p <- plotDiscreteFunction(x, y, xlim = c(0, 1), ylim = c(- 2, 2)) + + ggsave(filename = paste0("../figure/discrete/example_", ninputs, "_", i, ".pdf"), plot = p, height = 2.5, width = 4) + } +} + +set.seed(123) + +# --- Examples Created from an "arbitrary" matrix (not a kernel) +ninputs <- c(2, 5, 10) +sigma <- lapply(ninputs, function(n) { + mat <- createMat(n, scaleFactor = 0.8) + diag(mat) <- 1 + mat + } +) + +sigma[[1]] <- matrix(c(1, 0.5, 0.5, 1), nrow = 2) + +names(sigma) <- ninputs + + +# Drawing functions from a normal distribution + + +# print(sigma) +for (input in ninputs) { + + x <- seq(0, 1, length.out = input) + + muc <- rep(0, input) + sigmac <- sigma[[as.character(input)]] + + for (i in 1:3) { + + # Draw a sample + y <- as.vector(rmvnorm(1, mean = muc, sigma = sigmac)) + + p1 <- plotDiscreteFunction(x, y, xlim = c(0, 1), ylim = c(- 2, 3)) + + ggtitle(paste0("Sample Function ", i, ", n = ", input)) + + if (input == 2) { + p2 <- plotBivariateDensity(mu = muc, + sigma = sigmac, + xlim = c(-2, 3), + ylim = c(-2, 3)) + + geom_point(data = data.frame(), + aes(x = y[1], y = y[2]), + color = "orange", + size = 3) + + theme_bw() + + ggtitle("Density of a 2-D Gaussian") + + ylab(expression(h[2])) + + xlab(expression(h[1])) + } + + if (input > 2) { + melted_comat <- reshape2::melt(sigmac) + p2 <- ggplot(data = melted_comat, aes(x=Var1, y=Var2, fill=value)) + + geom_tile() + + theme_bw() + + ggtitle("Covariance Matrix") + + scale_fill_gradientn(colours = c(low = "white", high = "black")) + + scale_x_discrete(labels = 1:input, limits = 1:input) + + scale_x_reverse() + + scale_y_discrete(labels = 1:input, limits = 1:input) + + theme(axis.line=element_blank(), + axis.ticks=element_blank(), + axis.title.x=element_blank(), + axis.title.y=element_blank(), + panel.border=element_blank(), + axis.text.x=element_blank(), + axis.text.y = element_blank(), + panel.grid.major=element_blank()) + } + + ggsave(paste0("../figure/discrete/example_norm_", input, "_", i, "_a.pdf"), p1, width = 3, height = 3) + ggsave(paste0("../figure/discrete/example_norm_", input, "_", i, "_b.pdf"), p2, width = 3, height = 3) + + } +} + + +# --- Two extreme Cases + +set.seed(123) + +input <- 50 + +muc <- rep(0, input) +sigmac <- matrix(0.999, input, input) + 0.01 * diag(input) + +x <- seq(0, 1, length.out = input) +y <- as.vector(rmvnorm(1, mean = muc, sigma = sigmac)) + +p1 <- plotDiscreteFunction(x, y, xlim = c(0, 4), ylim = c(- 2, 2)) + + theme(axis.text.x=element_blank()) + + ggtitle(paste0("Sample Function for a)", ", n = ", input)) + ylim(c(-2, 2)) + +ggsave(paste0("../figure/discrete/example_extreme_", input, "_1.pdf"), p1, width = 3.5, height = 3) + +sigmac <- diag(input) +y <- as.vector(rmvnorm(1, mean = muc, sigma = sigmac)) + +p1 <- plotDiscreteFunction(x, y, xlim = c(0, 4), ylim = c(- 2, 2)) + + theme(axis.text.x=element_blank()) + + ggtitle(paste0("Sample Function for b)", ", n = ", 
input)) + + ylim(c(-2, 2)) + +ggsave(paste0("../figure/discrete/example_extreme_", input, "_2.pdf"), p1, width = 3.5, height = 3) + +p1 <- p1 + + ggtitle(paste0("Sample Function for b) K = I", ", n = ", input)) + + ylim(c(-2, 2)) + +ggsave(paste0("../figure/discrete/example_extreme_", input, "_4.pdf"), p1, width = 3.5, height = 3) + + +sigmac <- squared.exp(x, x, l = 0.1)[1:input, 1:input] +y <- as.vector(rmvnorm(1, mean = muc, sigma = sigmac)) + +p1 <- plotDiscreteFunction(x, y, xlim = c(0, 4), ylim = c(- 2, 2)) + + theme(axis.text.x=element_blank()) + + ggtitle(paste0("Sample Function for c)", ", n = ", input)) + + ylim(c(-2, 2)) + +ggsave(paste0("../figure/discrete/example_extreme_", input, "_3.pdf"), p1, width = 3.5, height = 3) + + + +# --- Drawing Functions From a True Kernel Matrix + +# Drawing functions from a normal distribution +ninputs <- c(10, 50, 200) + +plist <- list() + +# print(sigma) +plist <- lapply(ninputs, function(input) { + + x <- seq(0, 1, length.out = input) + + muc <- rep(0, input) + sigmac <- squared.exp(x, x, l = 0.1)[1:input, 1:input] + + # Draw a sample + y <- as.vector(MASS::mvrnorm(1, mu = muc, Sigma = sigmac)) + + p <- plotDiscreteFunction(x, y, xlim = c(0, 4), ylim = c(- 2, 2)) + + ggtitle(paste0("n = ", input)) + + theme(axis.text.x = element_blank()) + p +}) + +ggsave(paste0("../figure/discrete/example_limit.pdf"), do.call(grid.arrange, c(plist, nrow = 1)), width = 8, height = 3) + + +set.seed(123) +ninputs <- c(5, 10, 50) + +# print(sigma) +for (input in ninputs) { + + x <- runif(input, 0, 1) + + muc <- rep(0, input) + sigmac <- squared.exp(x, x, l = 0.1)[1:input, 1:input] + + # Draw a sample + y <- as.vector(MASS::mvrnorm(1, mu = muc, Sigma = sigmac)) + + p1 <- plotDiscreteFunction(x, y, xlim = c(0, 4), ylim = c(- 2, 2)) + + ggtitle(paste0("Sample Function, n = ", input)) + + theme(axis.text.x = element_blank()) + + ggsave(paste0("../figure/discrete/example_marginalization_", input, ".pdf"), p1, width = 3, height = 3) +} + + + + +# mu = c(2, 1) # mean vector +# sigma = matrix(c(1, 0.5, 0.5, 1), ncol = 2) # covariance matrix + +# grid = expand.grid(y1 = seq(-1, 4, length.out = 100), y2 = seq(-1, 4, length.out = 100)) + +# probs = cbind(grid, density = mvtnorm::dmvnorm(grid, mean = mu, sigma = sigma)) + +# p2 = ggplot() + xlab(expression(y[1])) + ylab(expression(y[2])) +# p2 = p2 + geom_raster(data = probs, aes(x = y1, y = y2, fill = density)) +# p2 = p2 + geom_contour(data = probs, aes(x = y1, y = y2, z = density), colour = "white", bins = 5) +# p2 = p2 + coord_fixed(xlim = c(- 0.5, 3.5), ylim = c(- 0.5, 3.5)) + guides(fill = FALSE) + +# ggsave(filename = "figure_man/bivariate_density.png", plot = p2, height = 3, width = 6) + + + +# df1 = data.frame(x = x, y = t(y), label = "Sample 1") +# p1 = ggplot() + ylim(c(0, 3)) + xlim(c(0, 3)) + theme_bw() + geom_vline(data = df, aes(xintercept = x), color = "grey", lty = 2) +# p1 = p1 + geom_point(data = df1, aes(x = x, y = y, color = label), size = 3) +# p1 = p1 + guides(color = FALSE) + +# g = grid.arrange(p1, p2, ncol = 2) + +# ggsave(filename = "figure_man/discrete_sample1.png", plot = grid.arrange(p1, p2, ncol = 2), height = 3, width = 6) + +# # Drawing another sample +# y = rmvnorm(1, mean = mu, sigma = sigma) +# df2 = rbind(df2, data.frame(x = y[1], y = y[2], label = "Sample 2")) + +# p2 = ggplot() + xlab(expression(y[1])) + ylab(expression(y[2])) +# p2 = p2 + geom_raster(data = probs, aes(x = y1, y = y2, fill = density)) +# p2 = p2 + geom_contour(data = probs, aes(x = y1, y = y2, z = density), colour = 
"white", bins = 5) +# p2 = p2 + coord_fixed(xlim = c(- 0.5, 3.5), ylim = c(- 0.5, 3.5)) + guides(fill = FALSE) +# p2 = p2 + geom_point(data = df2, aes(x = x, y = y, color = label), size = 3) +# p2 = p2 + geom_text(data = df2, aes(x = x, y = y + 0.3, color = label, label = label), size = 4) +# p2 = p2 + geom_segment(data = df2, aes(x = x, xend = x, y = y, yend = -1, color = label), lty = 2) +# p2 = p2 + geom_segment(data = df2, aes(x = -1, xend = x, y = y, yend = y, color = label), lty = 2) +# p2 = p2 + guides(color = FALSE) + +# df1 = rbind(df1, data.frame(x = x, y = t(y), label = "Sample 2")) +# p1 = ggplot() + ylim(c(0, 3)) + xlim(c(0, 3)) + theme_bw() + geom_vline(data = df, aes(xintercept = x), color = "grey", lty = 2) +# p1 = p1 + geom_point(data = df1, aes(x = x, y = y, color = label), size = 3) +# p1 = p1 + guides(color = FALSE) + +# g = grid.arrange(p1, p2, ncol = 2) + +# ggsave(filename = "figure_man/discrete_sample2.png", plot = g, height = 3, width = 6) diff --git a/slides/gaussian-processes/rsrc/make_gp_pred_plots.R b/slides/gaussian-processes/rsrc/make_gp_pred_plots.R new file mode 100644 index 00000000..e6e75c2c --- /dev/null +++ b/slides/gaussian-processes/rsrc/make_gp_pred_plots.R @@ -0,0 +1,236 @@ + +library(knitr) +library(mlr) +library(mlrMBO) +library(ggplot2) +library(gridExtra) +library(reshape2) +library(kernlab) +library(mvtnorm) +library(gptk) +library(smoof) +library(viridis) + + +x1 <- - 0.5 +x2 <- 0.5 +f.x1 <- 1 + +k <- exp(-0.5 * (x2 - x1)^2) + +K <- matrix(c(1, k, k, 1), nrow = 2) + +data.grid <- expand.grid(X1 = seq(-4, 4, length.out = 200), + X2 = seq(-4, 4, length.out = 200)) +data.grid <- cbind(data.grid, + prob = mvtnorm::dmvnorm(data.grid, mean = c(0, 0), sigma = K)) + +x <- seq(-4, 4, by = 0.01) +y <- dnorm(x = x, mean = 0, sd = 1) +p1 <- ggplot(data = data.frame(x = x, y = y)) + + geom_line(aes(x = x, y = y), lty = 2) + + theme_bw() + + coord_fixed(xlim = c(-4, 4), ylim = c(0, 1), ratio = 8) + + ggtitle("Marginal distribution of f(x*)") + + xlab("f(x*)") + + ylab("density") + +p4 <- p1 + + ggtitle("Marginal distribution of f(x)") + + xlab("f(x)") + +p2 <- ggplot(data.grid, aes(x = X1, y = X2, z = prob, fill = prob)) + + geom_raster(aes(fill = prob)) + + geom_contour(colour = "white") + + coord_fixed(xlim = c(-4, 4), ylim = c(-4, 4), ratio = 1) + + theme_bw() + + theme(legend.position = "none") + + ggtitle("Bivariate Normal Density") + + xlab("f(x)") + + ylab("f(x*)") + + scale_fill_viridis(end = 0.9) + +p3 <- ggplot() + + geom_line(data = data.frame(x = x, y = rep(0, length(x))), + aes(x = x, y = y), + lty = 2) + + coord_fixed(xlim = c(-2, 2), ylim = c(-4, 4), ratio = 1 / 2) + + ggtitle("Posterior process") + + theme_bw() + + ylab("f(x)") + +p <- grid.arrange(p1, p2, p3, p4, ncol = 2) +ggsave(filename = "../figure/gp_pred/1.pdf", plot = p, width = 7, height = 5) +########################################################### + +p3 <- p3 + + annotate(x = x1, y = f.x1, colour = "red", size = 2, geom = "point") + + annotate(x = - 0.5, y = 1.6, + label = "training point", + colour = "red", + geom = "text") +p <- grid.arrange(p1, p2, p3, p4, ncol = 2) +ggsave(filename = "../figure/gp_pred/2.pdf", plot = p, width = 7, height = 5) + +############################################################ +p4 <- p4 + + geom_vline(xintercept = f.x1, colour = "red") + + annotate(x = f.x1 - 1, y = 0.5, + label = "observed \n value", colour = "red", geom = "text") + +p2 <- p2 + geom_vline(xintercept = f.x1, colour = "red") + +p <- grid.arrange(p1, p2, p3, p4, ncol = 2) 
+ggsave(filename = "../figure/gp_pred/3.pdf", plot = p, width = 7, height = 5) +########################################################### + +m.cond <- K[1, 2] / K[1, 1] * 1 +s.cond <- K[2, 2] - K[1, 2]^2 / K[1, 1] + +y.post <- dnorm(x = x, mean = m.cond, sd = s.cond) +p1 <- p1 + + geom_line(data = data.frame(x = x, y = y.post), aes(x = x, y = y, colour = y)) + + theme(legend.position = "none") + + scale_color_viridis(end = 0.9) + +p <- grid.arrange(p1, p2, p3, p4, ncol = 2) +ggsave(filename = "../figure/gp_pred/4.pdf", plot = p, width = 7, height = 5) + +######################################################### + +p1 <- p1 + + geom_vline(xintercept = m.cond, colour = "orange") + +p3 <- p3 + + annotate(x = 0.5, y = m.cond, colour = "orange", size = 2, geom ="point") + + annotate(x = 0.5, y = m.cond + 0.6, + label = "prediction", colour = "orange", geom = "text") + +p <- grid.arrange(p1, p2, p3, p4, ncol = 2) +ggsave(filename = "../figure/gp_pred/5.pdf", plot = p, width = 7, height = 5) + +######################################################### + +d.process <- data.frame(x = seq(-2, 2, by = 0.01)) +d.process$k <- sapply(d.process, function(x) exp(-0.5 * (x - x1)^2)) +d.process$m.post <- d.process$k * 1 +d.process$k.post <- 1 - d.process$k^2 + +p3 <- p3 + + geom_line(data = d.process, aes(x = x, y = m.post)) + + geom_line(data = d.process, aes(x = x, y = m.post + 2 * k.post), color = "grey") + + geom_line(data = d.process, aes(x = x, y = m.post - 2 * k.post), color = "grey") + + geom_ribbon(data = d.process, + aes(x = x, ymax = m.post + 2 * k.post, ymin = m.post - 2 * k.post), + fill = "grey", alpha = .5) + +p <- grid.arrange(p1, p2, p3, p4, ncol = 2) +ggsave(filename = "../figure/gp_pred/6.pdf", plot = p, width = 7, height = 5) +########################################################## + +set.seed(124415) +x <- seq(-5, 5, by = 0.01) +y <- cos(x) +df <- data.frame(x = x, y = y) +train <- sample(1:length(x), 5) +istrain <- 1:length(x) %in% train + +configureMlr(show.info = FALSE, show.learner.output = FALSE) +tsk <- makeRegrTask(id = "GP as interpolator", data = df, target = "y") +lrn <- makeLearner("regr.km", predict.type = "se", par.vals = list(nugget.estim = FALSE)) +mod <- train(lrn, tsk, subset = train) +pred <- predict(mod, tsk) +pred <- cbind(pred$data, x = df$x, type = istrain) + +p <- ggplot() + geom_point(data = pred[train, ], aes(x = x, y = truth), size = 1, colour = "red") +p <- p + geom_line(data = pred, aes(x = x, y = response)) +p <- p + geom_ribbon(data = pred, aes(x = x, ymin = response - 2 * se, ymax = response + 2 * se), fill = "grey70", alpha = 0.3) +p <- p + theme_bw() + ylab(expression(hat(f)(x))) + + labs(caption = "After observing the training points (red), the posterior process (black) interpolates the training points.\n (k(x,x') is Matèrn with nu = 2.5, the default for DiceKriging::km)") +ggsave(filename = "../figure/gp_pred/gp_interpolator.pdf", plot = p, width = 6, height = 3) + +######################################################### + +configureMlr(show.info = FALSE, show.learner.output = FALSE) + +set.seed(1234) +obj.fun <- makeHimmelblauFunction() +des <- generateDesign(n = 10, par.set = getParamSet(obj.fun), ) +des$y <- apply(des, 1, obj.fun) +surr.km <- makeLearner("regr.km", predict.type = "se", covtype = "gauss") + +control <- makeMBOControl() +control <- setMBOControlTermination(control, iters = 5) +control <- setMBOControlInfill(control, crit = makeMBOInfillCritEI()) + +run <- mbo(obj.fun, design = des, learner = surr.km, control = control, 
show.info = TRUE) + +points <- data.frame(run$opt.path) + +x <- seq(-4.5, 4.5, length.out = 200) +z <- expand.grid(x1 = x, x2 = x) +pred <- predict(run$models[[1]], newdata = z) +z$se <- pred$data$se +z$pred <- pred$data$response + + +p1 <- ggplot() + + ylim(c(-4.5, 4.5)) + + xlim(c(-4.5, 4.5)) + + geom_raster(data = z, aes(x = x1, y = x2, fill = pred), alpha = 0.8) + + geom_point(data = points, aes(x = x1, y = x2), size = 3, color = "orange") + + scale_fill_gradient(low = "black", high = "white", name = "post. mean") + + xlab(expression(x[1])) + + ylab(expression(x[2])) + + theme(legend.text = element_text(size = 12)) + +p2 <- ggplot() + + ylim(c(-4.5, 4.5)) + + xlim(c(-4.5, 4.5)) + + geom_raster(data = z, aes(x = x1, y = x2, fill = se), alpha = 0.8) + + geom_point(data = points, aes(x = x1, y = x2), size = 3, color = "orange") + + labs(fill = "post. variance") + + xlab(expression(x[1])) + + ylab(expression(x[2])) + + theme(legend.text = element_text(size = 12)) + +ggsave(filename = "../figure/gp_pred/post_mean.pdf", + plot = p1, width = 5.5, height = 4) +ggsave(filename = "../figure/gp_pred/post_variance.pdf", + plot = p2, width = 5.5, height = 4) +######################################################## + +set.seed(124415) +n <- 6 +x.obs <- runif(n, -2, 2) +y.obs <- x.obs^2 + rnorm(n) +df <- data.frame(x = x.obs, y = y.obs) + +preddf <- data.frame(x = seq(-2, 2, by = 0.01)) + +configureMlr(show.info = FALSE, show.learner.output = FALSE) +tsk <- makeRegrTask(id = "GP as interpolator", data = df, target = "y") +lrn <- makeLearner("regr.km", predict.type = "se", par.vals = list(nugget.estim = TRUE, covtype = "gauss")) +mod <- train(lrn, tsk) +pred <- cbind(preddf, predict(mod, newdata = preddf)$data) + +p <- ggplot() + + geom_point(data = df, aes(x = x, y = y), size = 1, colour = "red") + + geom_line(data = pred, aes(x = x, y = response)) + + geom_ribbon(data = pred, + aes(x = x, ymin = response - 2 * se, ymax = response + 2 * se), + fill = "grey70", alpha = 0.3) + + theme_bw() + + ylab(expression(hat(f)(x))) + + labs(caption = "After observing the training points (red), we have a nugget-band around the oberved points. 
\n (k(x,x') is the squared exponential)") + +ggsave(filename = "../figure/gp_pred/gp_regression.pdf", + plot = p, width = 6, height = 3) + + + + + + + + diff --git a/slides/gaussian-processes/rsrc/make_gp_sample_plots.R b/slides/gaussian-processes/rsrc/make_gp_sample_plots.R new file mode 100644 index 00000000..8e498138 --- /dev/null +++ b/slides/gaussian-processes/rsrc/make_gp_sample_plots.R @@ -0,0 +1,159 @@ + + +library(knitr) +library(ggplot2) +library(gridExtra) +library(reshape2) +library(kernlab) +library(mvtnorm) +library(viridis) + +########################## Functions drawn from a Gaussian process prior +squared.exp <- function(x1, x2, l = 1) { + + D <- as.matrix(dist(c(x1, x2), method = "euclidean")) + + K <- exp(-1 / 2 * D^2 / l^2) + + return(K) +} + +set.seed(131317) +n <- 100 +x <- seq(-2, 2, length.out = n) +x.obs <- c(-1.5, 1/3, 4/3, -0.5) +y.obs <- c(0, 1, 2, 1.5) + +K <- squared.exp(x, x.obs) +K.xx <- K[(n + 1):nrow(K), (n + 1):nrow(K)] +K.xxs <- K[1:n, (n + 1):nrow(K)] +K.xsxs <- K[1:n, 1:n] + +df <- data.frame(x = x) + +# Drawing from Gaussian process Prior +for (i in 1:50) { + df[, i + 1] <- as.vector(mvtnorm::rmvnorm(1, sigma = K.xsxs)) +} + +df.m <- melt(df, id.vars = "x") + +p1 <- ggplot(data = df.m, aes(x = x, y = value, colour = variable)) + + geom_line() + + theme_bw() + + xlab("x") + + ylab("f(x)") + + ylim(c(-3, 3)) + + theme(legend.position = "none") + + ggtitle("Functions drawn from a Gaussian process prior") + + scale_color_viridis(end=0.9, discrete = TRUE) + +ggsave("../figure/gp_sample/1_1.pdf", width = 6, height = 4) + +############################################## + +set.seed(131317) +p.list <- list() + +for (j in 1:length(x.obs)) { + # Update of posterior + m.post <- K.xxs[, 1:j] %*% solve(K.xx[1:j, 1:j]) %*% y.obs[1:j] + K.post <- K.xsxs - K.xxs[ , 1:j] %*% solve(K.xx[1:j, 1:j]) %*% t(K.xxs[, 1:j]) + + df <- data.frame(x = x) + + for (i in 1:20) { + df[, i + 1] <- as.vector(mvtnorm::rmvnorm(1, m.post, sigma = K.post)) + } + + df.m <- melt(df, id.vars = "x") + + p.list[[j]] <- ggplot() + + geom_line(data = df.m, aes(x = x, y = value, colour = variable)) + + geom_point(data = data.frame(x = x.obs[1:j], y = y.obs[1:j]), + aes(x = x, y = y), + size = 2) + + xlab("x") + + ylab("f(x)") + + ylim(c(-3, 3)) + + theme_bw() + + theme(legend.position = "none") + + ggtitle(paste0("Posterior process after ", j, " observation", ifelse(j == 1, "", "s"))) + + scale_color_viridis(end = 0.9, discrete = TRUE) + + ggsave(paste0("../figure/gp_sample/2_", j, ".pdf"), width = 6, height = 4) +} + +############################################### +##### 10 different samples + +# n <- 3 +# x <- c(1, 2, 5) +# +# K <- matrix(0, n, n) +# +# for (i in 1:n) { +# for (j in 1:n) { +# K[i, j] <- exp(- 1/2 * (abs(x[i] - x[j]))^2) +# # K[i, j] = t(x[i]) %*% x[j] # linear kernel +# } +# } +# +# df <- data.frame(x = x) +# df$y <- as.vector(rmvnorm(1, sigma = K)) +# +# p <- ggplot(data = df, aes(x = x, y = y)) + geom_line() + theme_bw() +# p <- p + xlab("x") + ylab("f(x)") +# p + +################################################ + +plot_10_samples <- function(l=1) { + set.seed(1221) + n <- 100 + + x <- seq(-2, 2, length.out = n) + K <- matrix(0, n, n) + + for (i in 1:n) { + for (j in 1:n) { + K[i, j] <- exp(- 1/(2*l^2) * (abs(x[i] - x[j]))^2) + # K[i, j] = t(x[i]) %*% x[j] # linear kernel + } + } + + df <- data.frame(x = x) + + for (i in 1:10) { + df[, i + 1] <- as.vector(mvtnorm::rmvnorm(1, sigma = K)) + } + + df.m <- melt(df, id.vars = "x") + + p <- ggplot(data = df.m, aes(x = x, y = 
value, colour = variable)) + + geom_line() + + theme_bw() + + xlab("x") + + ylab("f(x)") + + ylim(c(-5, 5)) + + theme(legend.position = "none") + + scale_color_viridis(end=0.9, discrete = TRUE) + + return(p) +} + +p1 <- plot_10_samples() + +ggsave(filename = "../figure/gp_sample/different_samples.pdf", + plot = p1, width = 6, height = 2) + + +p2 <- plot_10_samples(l=0.1) + +p <- grid.arrange(p1 + ggtitle("l = 1"), + p2 + ggtitle("l = 0.1"), + ncol = 2) + +ggsave(filename = "../figure/gp_sample/varying_length_scale.pdf", + plot = p, width = 6, height = 3) + diff --git a/slides/gaussian-processes/rsrc/make_gp_training_plots.R b/slides/gaussian-processes/rsrc/make_gp_training_plots.R new file mode 100644 index 00000000..3808520a --- /dev/null +++ b/slides/gaussian-processes/rsrc/make_gp_training_plots.R @@ -0,0 +1,101 @@ +# ----------------------------------------------------------------------- # +# Training of a Gaussian Process # +# ----------------------------------------------------------------------- # + +# Initialization -------------------------------------------------------- # +library(mvtnorm) +library(ggplot2) +# library(RandomFieldsUtils) + + +# Sampling from a Gaussian process with l = 1 +set.seed(123) + +n <- 15 # number of points +x <- seq(-2, 2, length.out = n) # n equally spaced points +D <- as.matrix(dist(x, method = "euclidean")) # distance matrix + +ltrue <- 0.5 +K <- exp(-1 / 2 * D^2 / ltrue^2) + +noise <- 0.5 +y <- as.vector(rmvnorm(1, Sigma = K)) + rnorm(n, sd = noise) + +df <- data.frame(x = x, y = y) + +p1 <- ggplot(data = df, aes(x = x, y = y)) + geom_point() + theme_bw() +p1 <- p1 + ggtitle("Data Points") + +ggsave("../figure/gp_training/datapoints.pdf", p1, height = 3, width = 4) + + +computeMarginalLogLik <- function(y, D, l, noise) { + + n <- length(y) + + Ky <- exp(-1 / 2 * D^2 / l^2) + diag(rep(noise, length(x))) + + fit <- - 0.5 * t(y) %*% solve(Ky) %*% y + penalty <- - 0.5 * log(det(Ky)) + + margll <- penalty + fit + + return(list(fit = as.numeric(fit), penalty = as.numeric(penalty), margll = as.numeric(margll))) +} + +computePosterior <- function(x, xnew, y, l, noise) { + + n <- length(y) + + Dnew <- as.matrix(dist(c(x, xnew), method = "euclidian")) + Knew <- exp(- 1 / 2 * Dnew^2 / l^2) + + K <- Knew[1:n, 1:n] + Kx <- Knew[1:n, (n + 1):nrow(Knew)] + Kxx <- Knew[(n + 1):nrow(Knew), (n + 1):nrow(Knew)] + + Ky <- K + diag(rep(noise, length(x))) + + Kyinv <- solve(Ky) + + mpost <- t(Kx) %*% Kyinv %*% y + Kpost <- Kxx - t(Kx) %*% Kyinv %*% Kx + + return(data.frame(x = xnew, mpost = mpost, sdpost = diag(Kpost))) +} + + +lengthscales <- seq(0.1, 2, by = 0.01) + +res <- lapply(lengthscales, function(l) computeMarginalLogLik(y, D, l, noise)) + +dfl <- data.frame(l = lengthscales) +dfl$Fit <- sapply(res, function(x) x$fit) +dfl$Penalty <- sapply(res, function(x) x$penalty) +dfl$LogLikelihood <- sapply(res, function(x) x$margll) + +dfl <- reshape2::melt(dfl, id.vars = c("l")) + +p2 <- ggplot() + geom_line(data = dfl, aes(x = l, y = value, color = variable)) +p2 <- p2 + theme_bw() + labs(colour = "") + theme(legend.position = "top") + +ggsave("../figure/gp_training/fit_vs_penalty.pdf", p2, height = 3, width = 4) + + + +xnew <- seq(-2, 2, length.out = 100) + +for (l in c(0.2, 0.5, 2)) { + + res <- computePosterior(x, xnew, y, l, noise) + res$sdupper <- res$mpost + 2 * res$sdpost + res$sdlower <- res$mpost - 2 * res$sdpost + + p2n <- p2 + geom_vline(data = data.frame(), aes(xintercept = l), lty = 2, colour = "grey") + + 
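+  # Save the fit-vs-penalty curve with the current length-scale l marked by a
+  # vertical line, then overlay the corresponding posterior mean (res$mpost,
+  # computed above) on the data plot and save it as well.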
ggsave(paste0("../figure/gp_training/fit_vs_penalty_", gsub("\\.", "_", l), ".pdf"), p2n, height = 3, width = 4) + + p1n <- p1 + geom_line(data = res, aes(x = x, y = mpost), colour = "grey") + p1n <- p1n + ggtitle(paste0("l = ", l)) + ggsave(paste0("../figure/gp_training/datapoints_", gsub("\\.", "_", l), ".pdf"), p1n, height = 3, width = 4) +} \ No newline at end of file diff --git a/slides/gaussian-processes/slides-gp-basic.tex b/slides/gaussian-processes/slides-gp-basic.tex new file mode 100644 index 00000000..5d0bf339 --- /dev/null +++ b/slides/gaussian-processes/slides-gp-basic.tex @@ -0,0 +1,594 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} +\input{../../latex-math/ml-gp} + +\title{Introduction to Machine Learning} + +\begin{document} + +\titlemeta{% Chunk title (example: CART, Forests, Boosting, ...), can be empty + Gaussian Processes + }{% Lecture title + Basics + }{% Relative path to title page image: Can be empty but must not start with slides/ + figure_man/discrete/marginalization-more.png + }{ + \item GPs model distributions over functions + \item The marginalization property makes this distribution easily tractable + \item GPs are fully specified by mean and covariance function + \item GPs are indexed families +} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +%\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +%\input{../../style/preamble} +%\input{../../latex-math/basic-math} +%\input{../../latex-math/basic-ml} +%\input{../../latex-math/ml-gp} + +%\newcommand{\titlefigure}{figure_man/discrete/marginalization-more.png} %not best picture +%\newcommand{\learninggoals}{ +% \item GPs model distributions over functions +% \item The marginalization property makes this distribution easily tractable +% \item GPs are fully specified by mean and covariance function +% \item GPs are indexed families +%} + +%\title{Introduction to Machine Learning} +%\date{} + +%\begin{document} + +%\lecturechapter{Gaussian Processes} +%\lecture{Introduction to Machine Learning} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +\begin{vbframe}{Weight-Space View} + +\begin{itemize} + \item Until now we considered a hypothesis space $\Hspace$ of parameterized functions $\fxt$ (in particular, the space of linear functions). + \item Using Bayesian inference, we derived distributions for $\thetab$ after having observed data $\D$. + \item Prior believes about the parameter are expressed via a prior distribution $q(\thetab)$, which is updated according to Bayes' rule + + $$ + \underbrace{p(\thetab | \Xmat, \yv)}_{\text{posterior}} = \frac{\overbrace{p(\yv | \Xmat, \thetab)}^{\text{likelihood}}\overbrace{q(\thetab)}^{\text{prior}}}{\underbrace{p(\yv|\Xmat)}_{\text{marginal}}}. + $$ +\end{itemize} + +\end{vbframe} + + +\begin{vbframe}{Function-space View} + +Let us change our point of view: + +\begin{itemize} + \item Instead of \enquote{searching} for a parameter $\thetab$ in the parameter space, we directly search in a space of \enquote{allowed} functions $\Hspace$. + \item We still use Bayesian inference, but instead specifying a prior distribution over a parameter, we specify a prior distribution \textbf{over functions} and update it according to the data points we have observed. +\end{itemize} + +\framebreak + +Intuitively, imagine we could draw a huge number of functions from some prior distribution over functions $^{(*)}$. 
+ +\begin{figure} + \includegraphics[width=0.8\textwidth]{figure/gp_sample/1_1.pdf} +\end{figure} + +\vspace*{-0.5cm} + +\begin{footnotesize} + $^{(*)}$ We will see in a minute how distributions over functions can be specified. +\end{footnotesize} + +\framebreak + +\foreach \x in{1,2,3} { + After observing some data points, we are only allowed to sample those functions, that are consistent with the data. \\ + \begin{figure} + \includegraphics[width=0.8\textwidth]{figure/gp_sample/2_\x.pdf} + \end{figure} +} + +\framebreak + +As we observe more and more data points, the variety of functions consistent with the data shrinks. + \begin{figure} + \includegraphics[width=0.8\textwidth]{figure/gp_sample/2_4.pdf} + \end{figure} + +\framebreak + +Inutitively, there is something like \enquote{mean} and a \enquote{variance} of a distribution over functions. + + \begin{figure} + \includegraphics[width=0.8\textwidth]{figure/gp_sample/2_4.pdf} + \end{figure} + +\end{vbframe} + +\begin{frame}{Weight-space vs. Function-space View} + +\begin{table} + \begin{tabular}{cc} + \textbf{Weight-Space View} & \textbf{Function-Space View} \vspace{4mm}\\ + Parameterize functions & \vspace{1mm}\\ + \footnotesize Example: $\fxt = \thetab^\top \xv$ & \vspace{3mm}\\ + Define distributions on $\thetab$ & Define distributions on $f$ \vspace{4mm}\\ + Inference in parameter space $\Theta$ & Inference in function space $\Hspace$ + \end{tabular} +\end{table} + +\lz + +Next, we will see how we can define distributions over functions mathematically. + + +\end{frame} + +\section{Distributions on Functions} + +\begin{vbframe}{Discrete Functions} + +For simplicity, let us consider functions with finite domains first. + +\lz + + +Let $\mathcal{X} = \left\{\xv^{(1)}, \dots , \xv^{(n)}\right\}$ be a finite set of elements and $\Hspace$ the set of all functions from $\mathcal{X} \to \R$. + +\lz + +Since the domain of any $h(.) \in \Hspace$ has only $n$ elements, we can represent the function $h(.)$ compactly as a $n$-dimensional vector $$\bm{h} = \left[h\left(\xv^{(1)}\right), \dots, h\left(\xv^{(n)}\right)\right].$$ +\end{vbframe} + + +\begin{frame}{Discrete Functions} + +\textbf{Example 1:} Let us consider $h: \Xspace \to \Yspace$ where the input space consists of \textbf{two} points $\Xspace = \{0, 1\}$. + +\lz + +Examples for functions that live in this space: + +\begin{figure}[h] +\foreach \x in{1,2,3} { + \includegraphics<\x>[width=0.7\linewidth]{figure/discrete/example_2_\x.pdf} \par +} +\end{figure} + + +\end{frame} + +\begin{frame}{Discrete Functions} + +\textbf{Example 2:} Let us consider $h: \Xspace \to \Yspace$ where the input space consists of \textbf{five} points $\Xspace = \{0, 0.25, 0.5, 0.75, 1\}$. + +\lz + +Examples for functions that live in this space: + +\begin{figure}[h] +\foreach \x in{1,2,3} { + \includegraphics<\x>[width=0.7\linewidth]{figure/discrete/example_5_\x.pdf}\par +} +\end{figure} + +\end{frame} + + +\begin{frame}{Discrete Functions} + +\textbf{Example 3:} Let us consider $h: \Xspace \to \Yspace$ where the input space consists of \textbf{ten} points. 
+ +\lz + +Examples for functions that live in this space: + +\begin{figure}[h] +\foreach \x in{1,2,3} { + \includegraphics<\x>[width=0.7\linewidth]{figure/discrete/example_10_\x.pdf}\par +} +\end{figure} + +\end{frame} + + +\begin{vbframe}{Distributions on Discrete Functions} + +\vspace*{0.5cm} + +One natural way to specify a probability function on discrete function $h \in \Hspace$ is to use the vector representation +$$ + \bm{h} = \left[h\left(\xi[1]\right), h\left(\xi[2]\right), \dots, h\left(\xi[n]\right)\right] +$$ + + +of the function. + +\lz + +Let us see $\bm{h}$ as a $n$-dimensional random variable. We will further assume the following normal distribution: + +$$ + \bm{h} \sim \mathcal{N}\left(\bm{m}, \bm{K}\right). +$$ + +\textbf{Note: } For now, we set $\bm{m} = \bm{0}$ and take the covariance matrix $\bm{K}$ as given. We will see later how they are chosen / estimated. + +\end{vbframe} + +\begin{frame}{Discrete Functions} + +\textbf{Example 1 (continued):} Let $h: \Xspace \to \Yspace$ be a function that is defined on \textbf{two} points $\Xspace$. We sample functions by sampling from a two-dimensional normal variable + +$$ +\bm{h} = [h(1), h(2)] \sim \mathcal{N}(\bm{m}, \bm{K}) +$$ + + +\begin{figure}[H] +\foreach \x in{1,2,3} { + \includegraphics<\x>[width=0.4\linewidth]{figure/discrete/example_norm_2_\x_a.pdf} ~ \includegraphics<\x>[width=0.4\linewidth]{figure/discrete/example_norm_2_\x_b.pdf} +} \par +\begin{footnotesize} +In this example, $m = (0, 0)$ and $K = \begin{pmatrix} 1 & 0.5 \\ 0.5 & 1 \end{pmatrix}$. +\end{footnotesize} +\end{figure} + +\end{frame} + + +\begin{frame}{Discrete Functions} + +\textbf{Example 2 (continued):} Let us consider $h: \Xspace \to \Yspace$ where the input space consists of \textbf{five} points. We sample functions by sampling from a five-dimensional normal variable + + +$$ +\bm{h} = [h(1), h(2), h(3), h(4), h(5)] \sim \mathcal{N}(\bm{m}, \bm{K}) +$$ + +\begin{figure}[h] +\foreach \x in{1,2,3} { + \includegraphics<\x>[width=0.4\linewidth]{figure/discrete/example_norm_5_\x_a.pdf} ~ \includegraphics<\x>[width=0.4\linewidth]{figure/discrete/example_norm_5_\x_b.pdf} +} +\end{figure} + +\end{frame} + +\begin{frame}{Discrete Functions} + +\textbf{Example 3 (continued):} Let us consider $h: \Xspace \to \Yspace$ where the input space consists of \textbf{ten} points. We sample functions by sampling from ten-dimensional normal variable + +$$ +\bm{h} = [h(1), h(2), \dots, h(10)] \sim \mathcal{N}(\bm{m}, \bm{K}) +$$ + +\begin{figure}[h] +\foreach \x in{1,2,3} { + \includegraphics<\x>[width=0.4\linewidth]{figure/discrete/example_norm_10_\x_a.pdf} ~ \includegraphics<\x>[width=0.4\linewidth]{figure/discrete/example_norm_10_\x_b.pdf} +} +\end{figure} + +\end{frame} + + +\begin{vbframe}{Role of the Covariance Function} + +Note that the covariance controls the \enquote{shape} of the drawn function. 
Consider two extreme cases where function values are + +\begin{enumerate} + \item[a)] strongly correlated: $\bm{K} = \begin{footnotesize}\begin{pmatrix} 1 & 0.99 & \dots & 0.99 \\ + 0.99 & 1 & \dots & 0.99 \\ + 0.99 & 0.99 & \ddots & 0.99 \\ + 0.99 & \dots & 0.99 & 1 \end{pmatrix}\end{footnotesize}$ + \item[b)] uncorrelated: $\bm{K} = \id$ +\end{enumerate} + +\begin{figure} + \includegraphics[width=0.35\linewidth]{figure/discrete/example_extreme_50_1.pdf} ~~ \includegraphics[width=0.35\linewidth]{figure/discrete/example_extreme_50_2.pdf} +\end{figure} + + +\framebreak + +\begin{itemize} + \item \enquote{Meaningful} functions (on a numeric space $\Xspace$) may be characterized by a spatial property: \vspace*{0.2cm} + \begin{itemize} + \item[] If two points $\xi, \xi[j]$ are close in $\Xspace$-space, their function values $f(\xi), f(\xi[j])$ should be close in $\Yspace$-space. + \end{itemize} \vspace*{0.2cm} + In other words: If they are close in $\Xspace$-space, their functions values should be \textbf{correlated}! \vspace*{0.4cm} + \item We can enforce that by choosing a covariance function with + $$ + \bm{K}_{ij} \text{ high, if } \xi[i], \xi[j] \text{ close.} + $$ + + \framebreak + + \item We can compute the entries of the covariance matrix by a function that is based on the distance between $\xi, \xi[j]$, for example: + + \vspace*{0.2cm} + \begin{enumerate} + \item[c)] Spatial correlation: \begin{footnotesize}$K_{ij} = k(\xi[i], \xi[j]) = \exp\left(-\frac{1}{2}\left|\xi - \xi[j]\right|^2\right)$\end{footnotesize} + \end{enumerate} + +\begin{figure} + \includegraphics[width=0.45\linewidth]{figure/discrete/example_extreme_50_4.pdf} ~~ \includegraphics[width=0.45\linewidth]{figure/discrete/example_extreme_50_3.pdf} +\end{figure} + +\end{itemize} + +\begin{footnotesize} +\textbf{Note}: $k(\cdot,\cdot)$ is known as the \textbf{covariance function} or \textbf{kernel}. It will be studied in more detail later on. +\end{footnotesize} + +\end{vbframe} + + + + +% \begin{vbframe} +% \begin{figure} +% \centering +% \includegraphics{figure_man/discrete/sample2.png} \\ +% \begin{footnotesize} If we sample again, we get another function. +% \end{footnotesize} +% \end{figure} + + +% However, we are usually interested in functions with infinite domain size. + +% \lz + +% This idea is extended to infinite domain size via \textbf{Gaussian processes}. + +% \end{vbframe} + + +\section{Gaussian Processes} + +\begin{vbframe}{From Discrete to Continuous Functions} + +\begin{itemize} + \item We defined distributions on functions with discrete domain by defining a Gaussian on the vector of the respective function values + $$ + \mathbf{h} = [h(\xi[1]), h(\xi[2]), \dots, h(\xi[n])] \sim \mathcal{N}(\bm{m}, \bm{K}) + $$ + + \item We can do this for $n \to \infty$ (as \enquote{granular} as we want) + \begin{figure} + \includegraphics[width = 0.9\textwidth]{figure/discrete/example_limit.pdf} + \end{figure} +\end{itemize} + +\end{vbframe} + +\begin{frame}{From Discrete to Continuous Functions} + + +\begin{itemize} + \item No matter how large $n$ is, we are still considering a function over a discrete domain. + \item How can we extend our definition to functions with \textbf{continuous domain} $\Xspace \subset \R$? +\end{itemize} + +\end{frame} + + +\begin{frame}{Gaussian Processes: Intuition} + +\begin{itemize} + \only<1>{ + \item Intuitively, a function $f$ drawn from \textbf{Gaussian process} can be understood as an \enquote{infinite} long Gaussian random vector. 
+ \item It is unclear how to handle an \enquote{infinite} long Gaussian random vector! + \lz + \begin{figure} + \includegraphics[width=0.3\textwidth]{figure_man/question.png} + \end{figure} + } + \only<2-4>{ + \item Thus, it is required that for \textbf{any finite set} of inputs $\{\xi[1], \dots, \xi[n]\} \subset \Xspace$, the vector $\mathbf{f}$ has a Gaussian distribution + $$ + \bm{f} = \left[f\left(\xi[1]\right), \dots, f\left(\xi[n]\right)\right] \sim \mathcal{N}\left(\bm{m}, \bm{K}\right), + $$ + with $\bm{m}$ and $\bm{K}$ being calculated by a mean function $m(.)$ / covariance function $k(.,.)$. + \item This property is called \textbf{Marginalization Property}. + \begin{figure} + \only<2>{\includegraphics[width=0.4\textwidth]{figure/discrete/example_marginalization_5.pdf}\includegraphics[width=0.5\textwidth]{figure_man/discrete/marginalization-5.png}} + \only<3>{\includegraphics[width=0.4\textwidth]{figure/discrete/example_marginalization_10.pdf}\includegraphics[width=0.5\textwidth]{figure_man/discrete/marginalization-more.png}} + \only<4>{\includegraphics[width=0.4\textwidth]{figure/discrete/example_marginalization_50.pdf}\includegraphics[width=0.5\textwidth]{figure_man/discrete/marginalization-more.png}} + \end{figure} + } +\end{itemize} + +\end{frame} + + +\begin{vbframe}{Gaussian Processes} + +This intuitive explanation is formally defined as follows: + +\lz + +A function $\fx$ is generated by a GP $\gp$ if for \textbf{any finite} set of inputs $\left\{\xv^{(1)}, \dots, \xv^{(n)}\right\}$, the associated vector of function values $\bm{f} = \left(f(\xv^{(1)}), \dots, f(\xv^{(n)})\right)$ has a Gaussian distribution + +$$ +\bm{f} = \left[f\left(\xi[1]\right),\dots, f\left(\xi[n]\right)\right] \sim \mathcal{N}\left(\bm{m}, \bm{K}\right), +$$ + +with + + +\begin{eqnarray*} +\textbf{m} &:=& \left(m\left(\xi\right)\right)_{i}, \quad +\textbf{K} := \left(k\left(\xi, \xv^{(j)}\right)\right)_{i,j}, +\end{eqnarray*} + +where $m(\xv)$ is called mean function and $k(\xv, \xv^\prime)$ is called covariance function. + + +\framebreak + +\vspace*{0.5cm} + +A GP is thus \textbf{completely specified} by its mean and covariance function + +\vspace*{-0.2cm} +\begin{eqnarray*} +m(\xv) &=& \E[f(\xv)] \\ +k(\xv, \xv^\prime) &=& \E\biggl[\left( f(\xv) - \E[f(\xv)] \right) \left( f(\xv^\prime) - \E[f(\xv^\prime)] \right)\biggr] +\end{eqnarray*} + +\vfill + +\textbf{Note}: For now, we assume $m(\xv) \equiv 0$. This is not necessarily a drastic limitation - thus it is common to consider GPs with a zero mean function. + +% \framebreak + +% \vspace*{0.5cm} + +% Intuitively, one can think of a function $f$ drawn from a Gaussian process prior as a Gaussian distribution with an \enquote{infinitely} long mean vector and an \enquote{infinite by infinite} covariance matrix. + +% \lz + +% Each dimension of the Gaussian corresponds to an element $\xv$ from the domain $\mathcal{X}$. The corresponding component of the random vector represents the value of $f(\xv)$. + +% \lz + +% The \textbf{marginalization property} makes it possible to handle this \enquote{infinite} representation: evaluations of the process on any finite number of points follow a multivariate normal distribution. + +\end{vbframe} + +\begin{vbframe}{Sampling from a Gaussian process Prior} + +We can draw functions from a Gaussian process prior. 
Let us consider $\fx \sim \mathcal{GP}\left(0, k(\xv, \xv^\prime)\right)$ with the squared exponential covariance function $^{(*)}$ + +$$ +k(\xv, \xv^\prime) = \exp\left(-\frac{1}{2\ls^2}\|\xv - \xv^\prime\|^2\right), ~~ \ls = 1. +$$ +\vspace{-4cm} +This specifies the Gaussian process completely. + +\vspace{8cm} +\footnotesize +$^{(*)}$ We will talk later about different choices of covariance functions. + +\normalsize + +\framebreak + +To visualize a sample function, we + +\begin{itemize} + \item choose a high number $n$ (equidistant) points $\left\{\xv^{(1)}, \dots, \xv^{(n)}\right\}$ + \item compute the corresponding covariance matrix $\Kmat = \left(k\left(\xi, \xv^{(j)}\right)\right)_{i,j}$ by plugging in all pairs $\xv^{(i)}, \xv^{(j)}$ + \item sample from a Gaussian $\bm{f} \sim \mathcal{N}(\bm{0}, \bm{K})$. +\end{itemize} + +We draw $10$ times from the Gaussian, to get $10$ different samples. + +% Using $100$ equidistant points, we repeat the process of generating the Gaussian $10$ times ($10$ different functions) and draw each function by connecting the sampled values. + +% \lz + +\begin{figure} + \includegraphics[width=0.9\textwidth]{figure/gp_sample/different_samples.pdf} +\end{figure} + +\vspace{-0.2cm} +Since we specified the mean function to be zero $m(\xv) \equiv 0$, the drawn functions have zero mean. + +\end{vbframe} + + +\section{Gaussian Processes as Indexed Family} + + + + +\begin{vbframe}{Gaussian processes as an Indexed Family} + +% \begin{block}{Definition} +% A \textbf{Gaussian process} is a (infinite) collection of random variables, any \textbf{finite} number of which have a \textbf{joint Gaussian distribution}. +% \end{block} + +% \lz + +A Gaussian process is a special case of a \textbf{stochastic process} which is defined as a collection of random variables indexed by some index set (also called an \textbf{indexed family}). What does it mean? + +\lz + +An \textbf{indexed family} is a mathematical function (or \enquote{rule}) to map indices $t \in T$ to objects in $\mathcal{S}$. + +\begin{block}{Definition} +A \textbf{family of elements in $\mathcal{S}$ indexed by $T$} (indexed family) is a surjective function +\vspace*{-0.3cm} +\begin{eqnarray*} +s: T &\to& \mathcal{S} \\ + t &\mapsto& s_t = s(t) +\end{eqnarray*} +\end{block} + +\end{vbframe} + +\begin{vbframe}{Indexed Family} + +Some simple examples for indexed families are: + +\vspace*{0.3cm} + +\begin{minipage}{0.43\linewidth} + \begin{itemize} + \item finite sequences (lists): $T = \{1, 2, \dots, n\}$ and $\left(s_t\right)_{t \in T} \in \R$ \vspace{1cm} + \item infinite sequences: $T = \N$ and $\left(s_t\right)_{t \in T} \in \R$ + \end{itemize} +\end{minipage} +\begin{minipage}{0.55\linewidth} +\includegraphics{figure_man/indexed_family/indexed_family_1.png} \\ +\includegraphics{figure_man/indexed_family/indexed_family_2.png} +\end{minipage} + + +\framebreak + +But the indexed set $\mathcal{S}$ can be something more complicated, for example functions or \textbf{random variables} (RV): + +\begin{minipage}{0.43\linewidth} + \vspace*{0.5cm} + \begin{itemize} + \item $T = \{1, \dots, m\}$, $Y_t$'s are RVs: Indexed family is a random vector. \vspace*{0.2cm} + \item $T = \{1, \dots, m\}$, $Y_t$'s are RVs: Indexed family is a stochastic process in discrete time \vspace*{0.2cm} + \item $T = \Z^2$, $Y_t$'s are RVs: Indexed family is a 2D-random walk. 
+ \end{itemize} +\end{minipage}\hfill +\begin{minipage}{0.5\linewidth} +\includegraphics{figure_man/indexed_family/indexed_family_4.png} \\ +\includegraphics{figure_man/indexed_family/indexed_family_3.png} +\end{minipage} + +\end{vbframe} + +\begin{frame}{Indexed Family} + +\begin{itemize} + \item A Gaussian process is also an indexed family, where the random variables $f(\xv)$ are indexed by the input values $\xv \in \Xspace$. + \item Their special feature: Any indexed (finite) random vector has a multivariate Gaussian distribution (which comes with all the nice properties of Gaussianity!). +\end{itemize} + +\begin{figure} + \includegraphics<1>[width=0.7\textwidth]{figure_man/indexed_family/indexed_family_5.png} \par + \only<1>{\begin{footnotesize} Visualization for a one-dimensional $\Xspace$. \end{footnotesize}} + \includegraphics<2>[width=0.6\textwidth]{figure_man/indexed_family/indexed_family_6.png}\par + \only<2>{\begin{footnotesize} Visualization for a two-dimensional $\Xspace$. \end{footnotesize}} +\end{figure} + +\end{frame} + + +\endlecture +\end{document} diff --git a/slides/gaussian-processes/slides-gp-bayes-lm.tex b/slides/gaussian-processes/slides-gp-bayes-lm.tex new file mode 100644 index 00000000..757651af --- /dev/null +++ b/slides/gaussian-processes/slides-gp-bayes-lm.tex @@ -0,0 +1,271 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} +\input{../../latex-math/ml-gp} + +\title{Introduction to Machine Learning} + +\begin{document} + +\titlemeta{% Chunk title (example: CART, Forests, Boosting, ...), can be empty + Gaussian Processes + }{% Lecture title + Bayesian Linear Model + }{% Relative path to title page image: Can be empty but must not start with slides/ + figure/bayes_lm/posterior_5_3.pdf + }{ + \item Know the Bayesian linear model + \item The Bayesian LM returns a (posterior) distribution instead of a point estimate + \item Know how to derive the posterior distribution for a Bayesian LM +} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +%\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +%\input{../../style/preamble} +%\input{../../latex-math/basic-math} +%\input{../../latex-math/basic-ml} +%\input{../../latex-math/ml-gp} + +%\newcommand{\titlefigure}{figure/bayes_lm/posterior_5_3.pdf} % does not fit +%\newcommand{\learninggoals}{ +% \item Know the Bayesian linear model +% \item The Bayesian LM returns a (posterior) distribution instead of a point estimate +% \item Know how to derive the posterior distribution for a Bayesian LM +%} + +%\title{Introduction to Machine Learning} +%\date{} + +%\begin{document} + +%\lecturechapter{The Bayesian Linear Model} +%\lecture{Introduction to Machine Learning} + + +\begin{vbframe}{Review: The Bayesian Linear Model} + +Let $\D = \left\{(\xi[1], \yi[1]), ..., (\xi[n], \yi[n])\right\}$ be a training set of i.i.d. observations from some unknown distribution. + +\begin{figure} + \includegraphics[width=0.6\textwidth]{figure/bayes_lm/example.pdf} +\end{figure} + +Let $\yv = (\yi[1], ..., \yi[n])^\top$ and $\Xmat \in \R^{n \times p}$ be the design matrix where the i-th row contains vector $\xi$. 
+ +\framebreak + +The linear regression model is defined as + +$$ +y = \fx + \epsilon = \thetab^T \xv + \epsilon +$$ + +or on the data: + +\begin{eqnarray*} +\yi &=& \fxi + \epsi = \thetab^T \xi + \epsi, \quad \text{for } i \in \{1, \ldots, n\} +\end{eqnarray*} + + +We now assume (from a Bayesian perspective) that also our parameter vector $\thetab$ is stochastic and follows a distribution. +The observed values $\yi$ differ from the function values $\fxi$ by some additive noise, which is assumed to be i.i.d. Gaussian +$$ +\epsi \sim \mathcal{N}(0, \sigma^2)$$ +and independent of $\xv$ and $\thetab$. + +\framebreak + +Let us assume we have \textbf{prior beliefs} about the parameter $\thetab$ that are represented in a prior distribution $\thetab \sim \mathcal{N}(\zero, \tau^2 \id_p).$ + +\lz + +Whenever data points are observed, we update the parameters' prior distribution according to Bayes' rule + +$$ +\underbrace{p(\thetab | \Xmat, \yv)}_{\text{posterior}} = \frac{\overbrace{p(\yv | \Xmat, \thetab)}^{\text{likelihood}}\overbrace{q(\thetab)}^{\text{prior}}}{\underbrace{p(\yv|\Xmat)}_{\text{marginal}}}. +$$ + +\framebreak + +The posterior distribution of the parameter $\thetab$ is again normal distributed (the Gaussian family is self-conjugate): + +$$ +\thetab ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1}) +$$ + +with $\bm{A}:= \sigma^{-2}\Xmat^\top\Xmat + \frac{1}{\tau^2} \id_p$. + +\lz + +\begin{footnotesize} +\textbf{Note:} If the posterior distribution $p(\thetab~|~\Xmat, \yv)$ are in the same probability distribution family as the prior $q(\thetab)$ w.r.t. a specific likelihood function $p(\yv~|~\Xmat, \thetab)$, they are called \textbf{conjugate distributions}. The prior is then called a \textbf{conjugate prior} for the likelihood. The Gaussian family is self-conjugate: Choosing a Gaussian prior for a Gaussian Likelihood ensures that the posterior is Gaussian. +\end{footnotesize} + +\framebreak + +\begin{figure} + \includegraphics[width=0.5\textwidth]{figure/bayes_lm/prior_1.pdf}~\includegraphics[width=0.5\textwidth]{figure/bayes_lm/prior_2.pdf} +\end{figure} + +\framebreak + +\foreach \x in{5, 10, 20} { +\begin{figure} + \includegraphics[width=0.5\textwidth]{figure/bayes_lm/posterior_\x_1.pdf}~ \includegraphics[width=0.5\textwidth]{figure/bayes_lm/posterior_\x_2.pdf} +\end{figure} +} + +\framebreak + +\begin{footnotesize} +\textbf{Proof:}\\ +We want to show that +\begin{itemize} + \item for a Gaussian prior on $\thetab \sim \mathcal{N}(\zero, \tau^2 \id_p)$ + \item for a Gaussian Likelihood $y ~|~ \Xmat, \thetab \sim \mathcal{N}(\Xmat^\top \thetab, \sigma^2 \id_n)$ +\end{itemize} +the resulting posterior is Gaussian $\mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1})$ with $\bm{A}:= \sigma^{-2}\Xmat^\top\Xmat + \frac{1}{\tau^2} \id_p$. 
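+
+% As a quick numerical sanity check of this formula (a sketch with simulated
+% data; the variable names are only illustrative): the posterior mean
+% sigma^-2 A^-1 X^T y coincides with a ridge estimate using lambda = sigma^2 / tau^2.
+%
+%   set.seed(1)
+%   n <- 50; p <- 3; sigma <- 0.5; tau <- 1
+%   X <- matrix(rnorm(n * p), n, p)
+%   y <- X %*% c(1, -2, 0.5) + rnorm(n, sd = sigma)
+%   A <- crossprod(X) / sigma^2 + diag(p) / tau^2
+%   post_mean <- solve(A, crossprod(X, y)) / sigma^2
+%   ridge <- solve(crossprod(X) + (sigma^2 / tau^2) * diag(p), crossprod(X, y))
+%   all.equal(as.vector(post_mean), as.vector(ridge))   # TRUE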
+ +Plugging in Bayes' rule and multiplying out yields +\begin{eqnarray*} +p(\thetab | \Xmat, \yv) &\propto& p(\yv | \Xmat, \thetab) q(\thetab) \propto \exp\biggl[-\frac{1}{2\sigma^2}(\yv - \Xmat\thetab)^\top(\yv - \Xmat\thetab)-\frac{1}{2\tau^2}\thetab^\top\thetab\biggr] \\ +&=& \exp\biggl[-\frac{1}{2}\biggl(\underbrace{\sigma^{-2}\yv^\top\yv}_{\text{doesn't depend on } \thetab} - 2 \sigma^{-2} \yv^\top \Xmat \thetab + \sigma^{-2}\thetab^\top \Xmat^\top \Xmat \thetab + \tau^{-2} \thetab^\top\thetab \biggr)\biggr] \\ +&\propto& \exp\biggl[-\frac{1}{2}\biggl(\sigma^{-2}\thetab^\top \Xmat^\top \Xmat \thetab + \tau^{-2} \thetab^\top\thetab - 2 \sigma^{-2} \yv^\top \Xmat \thetab \biggr)\biggr] \\ +&=& \exp\biggl[-\frac{1}{2}\thetab^\top\underbrace{\biggl(\sigma^{-2} \Xmat^\top \Xmat + \tau^{-2} \id_p \biggr)}_{:= \Amat} \thetab + \textcolor{red}{\sigma^{-2} \yv^\top \Xmat \thetab}\biggr] +\end{eqnarray*} + +This expression resembles a normal density - except for the term in red! + +\framebreak + +\textbf{Note:} We need not worry about the normalizing constant since its mere role is to convert probability functions to density functions with a total probability of one. + + +We subtract a (not yet defined) constant $c$ while compensating for this change by adding the respective terms (\enquote{adding $0$}), emphasized in green: + +\begin{eqnarray*} + p(\thetab | \Xmat, \yv) &\propto& \exp\biggl[-\frac{1}{2}(\thetab \textcolor{green}{- c})^\top\Amat (\thetab \textcolor{green}{- c}) \textcolor{green}{- c^\top \Amat \thetab} + \underbrace{\textcolor{green}{\frac{1}{2}c^\top\Amat c}}_{\text{doesn't depend on } \thetab} +\sigma^{-2} \yv^\top \Xmat \thetab\biggr] \\ + &\propto& \exp\biggl[-\frac{1}{2}(\thetab \textcolor{green}{- c})^\top\Amat (\thetab \textcolor{green}{- c}) \textcolor{green}{- c^\top \Amat \thetab} +\sigma^{-2} \yv^\top \Xmat \thetab\biggr] +\end{eqnarray*} + +If we choose $c$ such that $- c^\top \Amat \thetab +\sigma^{-2} \yv^\top \Xmat \thetab = 0$, the posterior is normal with mean $c$ and covariance matrix $\Amat^{-1}$. Taking into account that $\Amat$ is symmetric, this is if we choose + +\begin{eqnarray*} +&& \sigma^{-2} \yv^\top \Xmat = c^\top\Amat \\ +&\Leftrightarrow & \sigma^{-2} \yv^\top \Xmat \Amat^{-1} = c^\top \\ +&\Leftrightarrow& c = \sigma^{-2} \Amat^{-1} \Xmat^\top \yv +\end{eqnarray*} + +as claimed. + +\end{footnotesize} + +\framebreak + +Based on the posterior distribution + +$$ +\thetab ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1}) +$$ + +we can derive the predictive distribution for a new observations $\xv_*$. The predictive distribution for the Bayesian linear model, i.e. the distribution of $\thetab^\top \xv_*$, is + +$$ +y_* ~|~ \Xmat, \yv, \xv_* \sim \mathcal{N}(\sigma^{-2}\yv^\top \Xmat \Amat^{-1}\xv_*, \xv_*^\top\Amat^{-1}\xv_*) +$$ + +(applying the rules for linear transformations of Gaussians). + +\framebreak + + +\foreach \x in{5, 10, 20} { +\begin{figure} + \includegraphics[width=0.5\textwidth]{figure/bayes_lm/posterior_\x_3.pdf} \\ + \begin{footnotesize} + For every test input $\xv_*$, we get a distribution over the prediction $y_*$. In particular, we get a posterior mean (orange) and a posterior variance (grey region equals $+/-$ two times standard deviation). 
+ \end{footnotesize} +\end{figure} +} + +\end{vbframe} + + +\begin{vbframe}{Summary: The Bayesian Linear Model} + +\begin{itemize} + \item By switching to a Bayesian perspective, we do not only have point estimates for the parameter $\thetab$, but whole \textbf{distributions} + \item From the posterior distribution of $\thetab$, we can derive a predictive distribution for $y_* = \thetab^\top \xv_*$. + \item We can perform online updates: Whenever datapoints are observed, we can update the \textbf{posterior distribution} of $\thetab$ +\end{itemize} + +Next, we want to develop a theory for general shape functions, and not only for linear function. + +\end{vbframe} + + + + + + + +% \framebreak +% +% Let $\Xspace = \R$. In the simplest case, the features are not modified: $\phi(x) = x$. +% +% \lz +% +% In the above example we might assume that the function has a quadratic shape. We would project our one-dimensional features into a two-dimensional feature space via +% +% $$ +% \phi(x) = (x, x^2). +% $$ +% +% The resulting model is +% +% $$ +% f(x) = \theta^\top \phi(x) = \theta_1 x + \theta_2 x^2. +% $$ +% +% The following plots show the posterior distribution of $\theta_1, \theta_2$ for the observed data. The more observations we have the \enquote{surer} we are about the parameters value. +% +% \framebreak +% +% \vspace*{1cm} +% <>= +% plots = list() +% titles = c("a)", "b)", "c)") +% nobs = c(1, 5, 50) + +% for (j in 1:3) { +% i = nobs[j] +% A = 1 / sigma^2 * t(as.matrix(d[1:i, 1:2])) %*% as.matrix(d[1:i, 1:2]) + diag(c(1, 1)) +% probs$posterior = mvtnorm::dmvnorm(data.grid, mean = 1 / sigma^2 * solve(A) %*% t(d[1:i, c("x1", "x2")]) +% d[1:i, ]$y, sigma = solve(A)) +% p = ggplot(probs, aes(x = theta1, y = theta2, z = posterior)) + geom_contour( colour = i) +% p = p + coord_fixed(xlim = c(-2, 2), ylim = c(-2, 2), ratio = 1) +% p = p + xlab(expression(theta[1])) + ylab(expression(theta[2])) +% p = p + geom_raster(aes(fill = posterior)) + geom_contour(colour = "white", bins = 5) +% p = p + guides(fill = FALSE) + ggtitle(titles[j]) +% p + +% plots[[paste("n = ", i, sep = "")]] = p +% } + +% do.call("grid.arrange", c(plots, ncol = 3)) +% @ +% % \vspace*{-0.6cm} +% % +% % \begin{center} +% % \begin{footnotesize} +% % Contour lines of the posterior distribution of $(\theta_0, \theta_1)$ after a) 1 observation, b) 5 observations and c) 50 observations. 
+% % \end{footnotesize} +% % \end{center} + +% \end{vbframe} + +\endlecture +\end{document} diff --git a/slides/gaussian-processes/slides-gp-covariance.tex b/slides/gaussian-processes/slides-gp-covariance.tex new file mode 100644 index 00000000..51840880 --- /dev/null +++ b/slides/gaussian-processes/slides-gp-covariance.tex @@ -0,0 +1,346 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} +\input{../../latex-math/ml-gp} + +\title{Introduction to Machine Learning} + +\begin{document} + +\titlemeta{% Chunk title (example: CART, Forests, Boosting, ...), can be empty + Gaussian Processes + }{% Lecture title + Covariance functions for GPs + }{% Relative path to title page image: Can be empty but must not start with slides/ + figure_man/covariance2D-2.png + }{ + \item Covariance functions encode key assumptions about the GP + \item Know common covariance functions like squared exponential and Matérn +} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + + +%\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +%\input{../../style/preamble} +%\input{../../latex-math/basic-math} +%\input{../../latex-math/basic-ml} +%\input{../../latex-math/ml-gp} + +%\newcommand{\titlefigure}{figure_man/covariance2D-2.png} +%\newcommand{\learninggoals}{ +% \item Covariance functions encode key assumptions about the GP +% \item Know common covariance functions like squared exponential and Matérn +%} + +%\title{Introduction to Machine Learning} +%\date{} + +%\begin{document} + +%\lecturechapter{Covariance Functions for GPs} +%\lecture{Introduction to Machine Learning} + +\begin{vbframe}{Covariance function of a GP} + + The marginalization property of the Gaussian process implies that for any finite set of input values, the corresponding vector of function values is Gaussian: + + $$ + \bm{f} = \left[f\left(\xi[1]\right), ..., f\left(\xi[n]\right)\right] \sim \mathcal{N}\left(\bm{m}, \bm{K}\right), + $$ + + +\begin{itemize} + \item The covariance matrix $\bm{K}$ is constructed based on the chosen inputs $\left\{\xv^{(1)}, ..., \xv^{(n)}\right\}$. + \item Entry $\bm{K}_{ij}$ is computed by $k\left(\xi, \xi[j]\right)$. + \item Technically, for \textbf{every} choice of inputs $\left\{\xv^{(1)}, ..., \xv^{(n)}\right\}$, $\bm{K}$ needs to be positive semi-definite in order to be a valid covariance matrix. + \item A function $k(.,.)$ satisfying this property is called \textbf{positive definite}. + +\framebreak + + \item Recall, the purpose of the covariance function is to control to which degree the following is fulfilled: \vspace*{0.4cm} + \begin{itemize} + \item[] If two points $\xi, \xi[j]$ are close in $\Xspace$-space, their function values $f(\xi), f(\xi[j])$ should be close (\textbf{correlated}!) in $\Yspace$-space. + \end{itemize} \vspace*{0.4cm} + % \item[$\to$] $\bm{K}_{ij}$ is the covariance of $f\left(\xi\right)$ and $f\left(\xi[j]\right)$ \vspace*{0.4cm} + + \item Closeness of two points $\xi, \xi[j]$ in input space $\Xspace$ is measured in terms of $\bm{d} = \xi - \xi[j]$: + $$ + k(\xi, \xi[j]) = k(\bm{d}) + $$ + % Such covoariance functions are called \textbf{stationary}. 
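+% Commented R sketch (illustrative only): building K for a handful of inputs with the
+% squared exponential kernel and checking positive semi-definiteness numerically:
+% <<eval = FALSE>>=
+% kse <- function(x1, x2, l = 1) exp(-0.5 * (x1 - x2)^2 / l^2)
+% x <- c(-2, -1, 0, 1.5, 3)
+% K <- outer(x, x, kse)
+% min(eigen(K, symmetric = TRUE, only.values = TRUE)$values)  # >= 0 up to rounding
+% @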
+\end{itemize} + +% \framebreak + +% Technically, in order to be a valid covariance function, the matrix $\Kmat$ needs to be positive semi-definite + +% \vspace*{-0.4cm} +% \begin{eqnarray*} +% \bm{a}^T\Kmat \bm{a} &\ge& 0 ~~~ \forall \bm{a} \in \R^n \\ +% \Leftrightarrow \sum_{i = 1}^n a_i a_j k(\xv^{(i)}, \xv^{(j)}) &\ge& 0 ~~~ \forall \bm{a} \in \R^n. +% \end{eqnarray*} + +% A function $k(.,.)$ satisfying this property for all possible pairs of inputs in $(\xv, \tilde \xv) \in \Xspace \times \Xspace$ +% is called \textbf{positive definite}. + +% \framebreak + +% Intuitively, the covariance function $k(\xv, \xv^\prime)$ is a \textbf{similarity} measure between points: +% \begin{itemize} + +% \item if two points are close in $\mathcal{X}$, $k(\xv, \xv^\prime)$ is usually high - the correlation between the function values $\fx, f(\xv^\prime)$ is high +% \item if they are far away from each other, $k(\xv, \xv^\prime)$ is small and the function values are not correlated that strongly +% \end{itemize} + +% The covariance function quantifies the notion of \enquote{closeness}. It therefore encodes \textbf{our assumptions} about the shape of the function we are interested in: how much variation in $\fx$ we expect over which distances in $\Xspace$. + +\end{vbframe} + +\begin{frame}{Covariance Function of a GP: Example} + +\begin{itemize} + \item Let $\fx$ be a GP with $k(\xv, \xv^\prime) = \exp(-\frac{1}{2}\|\bm{d}\|^2)$ with $\bm{d} = \xv - \xv^\prime$. + \item Consider two points $\xi[1] = 3$ and $\xi[2] = 2.5$. + \item If you want to know how correlated their function values are, compute their correlation! + \begin{figure} + \includegraphics[width=0.45\textwidth]{figure/covariance2point/example_covariance_1.pdf} + \includegraphics[width=0.45\textwidth]{figure/covariance2point/example_function_1_1.pdf} + \end{figure} +\end{itemize} + +\end{frame} + +\begin{vbframe}{Covariance Function of a GP: Example} + +\begin{itemize} + \item Assume we observed a value $\yi[1] = - 0.8$, the value of $\yi[2]$ should be close under the assumption of the above Gaussian process. +\end{itemize} + +\begin{figure} + \includegraphics[width=0.45\textwidth]{figure/covariance2point/example_covariance_1.pdf} ~ \includegraphics[width=0.45\textwidth]{figure/covariance2point/example_function_1_2.pdf} +\end{figure} + +\end{vbframe} + + +\begin{vbframe}{Covariance Function of a GP: Example} + +\begin{itemize} + \item Let us compare another point $\xi[3]$ to the point $\xi[1]$ + \item We again compute their correlation + \item Their function values are not very much correlated; $\yi[1]$ and $\yi[3]$ might be far away from each other +\end{itemize} + +\begin{figure} + \includegraphics[width=0.45\textwidth]{figure/covariance2point/example_covariance_2.pdf} ~ \includegraphics[width=0.45\textwidth]{figure/covariance2point/example_function_2_1.pdf} +\end{figure} + +\end{vbframe} + +% Given an initial point $\xv^{(1)} = 3$ with $f(\xv^{(1)}) = -0.8$, we want to draw a function value for $\xv^{(2)} = 2.5$. + +% \begin{center} +% \includegraphics{figure_man/covariance2point_1.png} +% \end{center} +% } + +% \only<2>{Calculating the distance and using the kernel function as a \enquote{look-up-table} for the correlation of the function values, we see that they are highly correlated. We expect the $y$ values being close. 
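+% Commented R sketch checking the correlations used in this example; the exact values
+% are exp(-0.125) ~ 0.88 for the close pair and exp(-2) ~ 0.14 for the distant pair:
+% <<eval = FALSE>>=
+% kse <- function(x1, x2) exp(-0.5 * (x1 - x2)^2)
+% kse(3, 2.5)   # ~ 0.88: f(3) and f(2.5) are strongly correlated
+% kse(3, 5)     # ~ 0.14: f(3) and f(5) are only weakly correlated
+% @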
} + +% \only<2>{ +% \begin{center} +% \includegraphics{figure_man/covariance2point_2.png} +% \end{center}} + +% \only<3->{If in comparison we want to draw a function value for at $x^{(3)} = 5$ given $x^{(1)}$ we see on the right plot that the correlation is low. We do not expect the function value being close to $y^{(1)}$. } + +% \only<3>{ +% \begin{center} +% \includegraphics{figure_man/covariance2point_3.png} +% \end{center}} + +% \only<4>{ +% \begin{center} +% \includegraphics{figure_man/covariance2point_4.png} +% \end{center}} + + + +\begin{vbframe}{Covariance Functions} + +There are three types of commonly used covariance functions: + +\begin{itemize} + +\item $k(.,.)$ is called stationary if it is as a function of $\bm{d} = \bm{x} - \bm{x}^\prime$, we write $k(\bm{d})$.\\ +Stationarity is invariance to translations in the input space: $k(\bm{x},\bm{x} + \bm{d}) = k(\bm{0}, \bm{d})$ +\item $k(.,.)$ is called isotropic if it is a function of $r = \|\bm{x} - \bm{x}^\prime\|$, we write $k(r)$.\\ +Isotropy is invariance to rotations of the input space and implies stationarity. +\item $k(., .)$ is a dot product covariance function if $k$ is a function of $\bm{x}^T \bm{x}^\prime$ +\end{itemize} + +\end{vbframe} + + +\begin{vbframe}{Commonly used covariance functions} + +\begin{table}[] +\centering +\begin{tabular}{|c|c|} +\hline +Name & $k(\bm{x}, \bm{x}^\prime)$\\ +\hline +constant & $\sigma_0^2$ \\ [1em] +linear & $\sigma_0^2 + \bm{x}^T\bm{x}^\prime$ \\ [1em] +polynomial & $(\sigma_0^2 + \bm{x}^T\bm{x}^\prime)^p$ \\ [1em] +squared exponential & $\exp(- \frac{\|\bm{x} - \bm{x}^\prime\|^2}{2\ls^2})$ \\ [1em] +Matérn & \begin{footnotesize} $\frac{1}{2^\nu \Gamma(\nu)}\biggl(\frac{\sqrt{2 \nu}}{\ls}\|\bm{x} - \bm{x}^\prime\|\biggr)^{\nu} K_\nu\biggl(\frac{\sqrt{2 \nu}}{\ls}\|\bm{x} - \bm{x}^\prime\|\biggr)$\end{footnotesize} \\ [1em] +exponential & $\exp\left(- \frac{\|\bm{x} - \bm{x}^\prime\|}{\ls}\right)$ \\ [1em] +\hline +\end{tabular} +\end{table} +\begin{footnotesize} +$K_\nu(\cdot)$ is the modified Bessel function of the second kind. +\end{footnotesize} + + +\begin{center} + +\includegraphics{figure/covariance.pdf} +\end{center} +\vskip -1 em +\begin{footnotesize} +\begin{itemize} +\item Random functions drawn from Gaussian processes with a Squared Exponential Kernel (left), Polynomial Kernel (middle), and a Matérn Kernel (right, $\ls = 1$). +\item The length-scale hyperparameter determines the ``wiggliness'' of the function. +\item For Matérn, the $\nu$ parameter determines how differentiable the process is. +\end{itemize} +\end{footnotesize} +\end{vbframe} + +% \begin{vbframe}{Making New Kernels from Old} + +% Kernels can be + +% \begin{itemize} +% \item Summed together +% \begin{itemize} +% \item on the same space $k(\xv, \xv') = k_1(\xv, \xv') + k_2(\xv, \xv')$ +% \item on tensor space $k(\xv, \xv') = k_1(x_1, x_1') + k_2(x_2, x_2')$ +% \end{itemize} +% \item Multiplied together +% \begin{itemize} +% \item on the same space $k(\xv, \xv') = k_1(\xv, \xv') \cdot k_2(\xv, \xv')$ +% \item on tensor space $k(\xv, \xv') = k_1(x_1, x_1') \cdot k_2(x_2, x_2')$ +% \end{itemize} +% \item Composed with a function $k(\xv, \xv') = k_1(g(\xv), g(\xv'))$ +% \end{itemize} + +% All these operations will preserve the positive definiteness (see exercise). +% More details: lecture about kernel methods. 
+ +% \end{vbframe} +%----------------------------------------------------------------------- + +\begin{vbframe}{Squared Exponential Covariance Function} + +The squared exponential function is one of the most commonly used covariance functions. +$$ +k(\xv, \xv^\prime) = \exp\biggl(- \frac{\|\xv - \xv^\prime\|^2}{2\ls^2}\biggr). +$$ + +\textbf{Properties}: +\begin{itemize} +\item It depends merely on the distance $r = \|\xv - \xv^\prime\|$ $\to$ isotropic and stationary.\lz +\item Infinitely differentiable $\to$ sometimes deemed + unrealistic for modeling most of the physical processes. + +\end{itemize} + +\end{vbframe} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[c,allowframebreaks]{Characteristic Length-Scale} + $$k(\xv, \xv^\prime) = \exp\left(-\frac{1}{2\ls^2}\|\xv - \xv^\prime\|^2\right)$$ +% \begin{figure} + % \includegraphics[width = .8\textwidth]{figure/lengthscale-1.pdf} +% \end{figure} +$\ls$ is called \textbf{characteristic length-scale}. Loosely speaking, the characteristic length-scale describes how far you need to move in input space for the function values to become uncorrelated. Higher $\ls$ induces smoother functions, lower $\ls$ induces more wiggly functions. + +\begin{figure} +\includegraphics[width=0.6\textwidth]{figure/gp_sample/varying_length_scale.pdf} +\end{figure} + +% \begin{itemize} + % \item Left: for higher $\ls$ the correlation between function values (for unchanged distance of input points) is also higher + % \item Right plot: a higher $\ls$ induces a smoother function +% \end{itemize} + +\framebreak + +For $p \geq 2$ dimensions, the squared exponential can be parameterized: + +$$ +k(\xv, \xv^\prime) = \exp\,\biggl(- \frac{1}{2}\left(\xv - \xv^\prime\right)^\top\bm{M}\left(\xv - \xv^\prime \right)\biggr) +$$ + +Possible choices for the matrix $\bm{M}$ include + +$$ +\bm{M}_1 = \ls^{-2}\id \qquad \bm{M}_2 = \text{diag}(\bm{\ls})^{-2} \qquad \bm{M}_3 = \Gamma \Gamma^\top + \text{diag}(\bm{\ls})^{-2} +$$ + +where $\bm{\ls}$ is a $p$-vector of positive values and $\Gamma$ is a $p \times k$ matrix. + +\lz + +The 2nd (and most important) case can also be written as +$$ + k(\bold{d}) = \exp\,\biggl(- \frac{1}{2} \sumjp \frac{d_j^2}{l_j^2} \biggr) +$$ + +% Here again, $\bm{\ls} = (\ls_1,\dots, \ls_p)$ are characteristic length-scales for each dimension. + + +\framebreak + + +What is the benefit of having an individual hyperparameter $\ls_i$ for each dimension? + +\vspace{4mm} + +\begin{itemize} +\item The $\ls_1,\dots, \ls_p$ hyperparameters play the role of \textbf{characteristic length-scales}. +\vspace{2mm} +\item Loosely speaking, $\ls_i$ describes how far you need to move along axis $i$ in input space for the function values to be uncorrelated. +\vspace{2mm} +\item Such a covariance function implements \textbf{automatic relevance determination} (ARD), since the inverse of the length-scale $\ls_i$ determines the relevancy of input feature $i$ to the regression. +\vspace{2mm} +\item If $\ls_i$ is very large, the covariance will become almost independent of that input, effectively removing it from inference. 
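+% Commented R sketch of the ARD squared exponential kernel: a very large length-scale
+% in one dimension makes the kernel (and hence the GP) almost ignore that input:
+% <<eval = FALSE>>=
+% k_ard <- function(x, xp, ls) exp(-0.5 * sum((x - xp)^2 / ls^2))
+% x1 <- c(0, 0); x2 <- c(0, 3)            # points differing only in dimension 2
+% k_ard(x1, x2, ls = c(1, 1))             # ~ 0.011: dimension 2 matters
+% k_ard(x1, x2, ls = c(1, 100))           # ~ 1: dimension 2 is effectively ignored
+% @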
+\vspace{2mm} +\item If the features are on different scales, the data can be automatically \textbf{rescaled} by estimating $\ls_1,\dots, \ls_p$ + +\end{itemize} + + + +\framebreak + + +\begin{figure} + \includegraphics[width = .8\textwidth]{figure_man/covariance2D.png} +\end{figure} + +\vspace{3mm} +%\begin{footnotesize} +For the first plot, we have chosen $\bm{M} = \id$: the function varies the same in all directions. The second plot is for $\bm{M} = \text{diag}(\bm{\ls})^{-2}$ and $\bm{\ls} = \left(1, 3 \right)$: The function varies less rapidly as a function of $x_2$ than $x_1$ as the length-scale for $x_1$ is less. In the third plot $\bm{M} = \Gamma \Gamma^T + \text{diag}(\bm{\ls})^{-2}$ for $\Gamma = (1, -1)^\top$ and $\bm{\ls} = (6, 6)^\top$. Here $\Gamma$ gives the direction of the most rapid variation. (Image from Rasmussen \& Williams, 2006) +%\end{footnotesize} + + +\end{frame} + +\endlecture +\end{document} + diff --git a/slides/gaussian-processes/slides-gp-mean.tex b/slides/gaussian-processes/slides-gp-mean.tex new file mode 100644 index 00000000..4635819a --- /dev/null +++ b/slides/gaussian-processes/slides-gp-mean.tex @@ -0,0 +1,118 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} +\input{../../latex-math/ml-gp} + +\title{Introduction to Machine Learning} + +\begin{document} + +\titlemeta{% Chunk title (example: CART, Forests, Boosting, ...), can be empty + Gaussian Processes + }{% Lecture title + Mean functions for GPs + }{% Relative path to title page image: Can be empty but must not start with slides/ + figure/gp_sample/2_4.pdf + }{ + \item Trends can be modeled via specification of the mean function +} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +%\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +%\input{../../style/preamble} +%\input{../../latex-math/basic-math} +%\input{../../latex-math/basic-ml} +%\input{../../latex-math/ml-gp} + +%\newcommand{\titlefigure}{figure/gp_sample/2_4.pdf} %does not fit +%\newcommand{\learninggoals}{ +% \item Trends can be modeled via specification of the mean function +%} + +%\title{Introduction to Machine Learning} +%\date{} + +%\begin{document} + +%\lecturechapter{Mean Functions for Gaussian Processes} +%\lecture{Introduction to Machine Learning} + + +\begin{vbframe}{The Role of Mean Functions} + +\begin{itemize} + \item It is common but by no means necessary to consider GPs with a zero-mean function + $$ + m(\xv) \equiv 0 + $$ + \item Note that this is not necessarily a drastic limitation, since the mean of the posterior process is not confined to be zero + $$ + \bm{f}_* | \Xmat_*, \Xmat, \bm{f} \sim \mathcal{N}(\Kmat_{*}^{T}\Kmat^{-1}\bm{f}, \Kmat_{**} - \Kmat_*^T \Kmat ^{-1}\Kmat_*). + $$ + \item Yet there are several reasons why one might wish to explicitly model a mean function, including interpretability, convenience of expressing prior informations, ... + \item When assuming a non-zero mean GP prior $\gp$ with mean $m(\xv)$, the predictive mean becomes + $$ + m(\Xmat_*) + \Kmat_*\Kmat_y^{-1}\left(\bm{y} - m(\Xmat)\right) + $$ + while the predictive variance remains unchanged. + + \framebreak + + \item Gaussian processes with non-zero mean Gaussian process priors are also called Gaussian processes with trend. 
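+% Commented R sketch (illustrative toy data) of the predictive mean under a non-zero
+% prior mean function, here a hypothetical linear trend m(x) = 1 + 2x:
+% <<eval = FALSE>>=
+% kse <- function(x1, x2, l = 1) exp(-0.5 * (x1 - x2)^2 / l^2)
+% m <- function(x) 1 + 2 * x
+% x  <- c(-1, 0, 1); y <- c(-0.5, 1.3, 2.9)     # toy training data
+% xs <- seq(-2, 2, by = 0.5)                    # test inputs
+% sigma2 <- 0.1                                 # noise variance (nugget)
+% Ky    <- outer(x, x, kse) + sigma2 * diag(length(x))
+% Kstar <- outer(x, xs, kse)
+% pred_mean <- m(xs) + as.vector(t(Kstar) %*% solve(Ky, y - m(x)))
+% @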
+\vspace{.3cm} + +\begin{figure} +\includegraphics[width=0.8\textwidth]{figure/gp_sample/1_1.pdf} +\end{figure} + +\framebreak + + +\begin{figure} +\includegraphics[width=0.8\textwidth]{figure/gp_sample/2_1.pdf} +\end{figure} + +\framebreak + + + +\begin{figure} +\includegraphics[width=0.8\textwidth]{figure/gp_sample/2_2.pdf} +\end{figure} + + \framebreak + + +\begin{figure} +\includegraphics[width=0.8\textwidth]{figure/gp_sample/2_3.pdf} +\end{figure} + +\framebreak + +\begin{figure} +\includegraphics[width=0.8\textwidth]{figure/gp_sample/2_4.pdf} +\end{figure} + +\framebreak + + +\item In practice it can often be difficult to specify a fixed mean function +\item In many cases it may be more convenient to specify a few fixed basis functions, whose coefficients, $\bm{\beta}$, are to be inferred from the data +\item Consider +$$ + g(\xv) = b(\xv)^\top \bm{\beta} + \fx, \text{ where } \fx \sim \mathcal{GP} \left(0, k(\xv, \tilde \xv)\right) +$$ +\item This formulation expresses that the data is close to a global linear model with the residuals being modelled by a GP. +\item For the estimation of $g(\xv)$ please refer to \emph{Rasmussen, Gaussian Processes for Machine Learning, 2006} + +\end{itemize} + + +\end{vbframe} + + +\endlecture +\end{document} diff --git a/slides/gaussian-processes/slides-gp-prediction.tex b/slides/gaussian-processes/slides-gp-prediction.tex new file mode 100644 index 00000000..6036c89f --- /dev/null +++ b/slides/gaussian-processes/slides-gp-prediction.tex @@ -0,0 +1,491 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} +\input{../../latex-math/ml-gp} + +\title{Introduction to Machine Learning} + +\begin{document} + +\titlemeta{% Chunk title (example: CART, Forests, Boosting, ...), can be empty + Gaussian Processes + }{% Lecture title + Gaussian Posterior Process and Prediction + }{% Relative path to title page image: Can be empty but must not start with slides/ + figure/gp_pred/post_variance.pdf + }{ + \item Know how to derive the posterior process + \item GPs are interpolating and spatial models + \item Model noise via a nugget term +} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +%\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +%\input{../../style/preamble} +%\input{../../latex-math/basic-math} +%\input{../../latex-math/basic-ml} +%\input{../../latex-math/ml-gp} + +%\newcommand{\titlefigure}{figure/gp_pred/post_variance.pdf} +%\newcommand{\learninggoals}{ +% \item Know how to derive the posterior process +% \item GPs are interpolating and spatial models +% \item Model noise via a nugget term +%} + +%\title{Introduction to Machine Learning} +%\date{} + +%\begin{document} + +%\lecturechapter{Gaussian Process Prediction} +%\lecture{Introduction to Machine Learning} + +\begin{vbframe}{Gaussian Posterior Process And Prediction} + +\vspace*{1cm} + +\begin{itemize} + \item So far, we have learned how to \textbf{sample} from a GP prior. +\item However, most of the time, we are not interested in drawing random functions from the prior. Instead, we usually like to use the knowledge provided by the training data to predict values of $f$ at a new test point $\xv_*$. +\item In what follows, we will investigate how to update the Gaussian process prior ($\to$ posterior process) and how to make predictions. 
+\end{itemize} + +\end{vbframe} + +\section{Gaussian Posterior Process and Prediction} + + +\begin{vbframe}{Posterior process} + +% \textbf{Noise-free Case:} +% +% In the noise-free case, $y^{(i)} = f(\xi)$ ($\epsi \equiv 0$, no additive noise). The targets correspond to the true function values $\yv = \bm{f}$ (training observations) and $\yv^* = \bm{f}^*$ (test observations). +% +% \lz + +\begin{itemize} + \item Let us now distinguish between observed training inputs, also denote by a design matrix $\Xmat$, and the corresponding observed values + $$ + \bm{f} = \left[f\left(\xi[1]\right), ..., f\left(\xi[n]\right)\right] + $$ + +and one single \textbf{unobserved test point} $\xv_*$ with $f_* = f\left(\xv_*\right).$ + +\item We now want to infer the distribution of $f_* | \xv_*, \bm{X}, \bm{f}$. + $$ + f_* = f\left(\xv_*\right) + $$ + \item Assuming a zero-mean GP prior $\mathcal{GP}\left(\bm{0}, k(\xv, \xv^\prime)\right)$ we know + +$$ +\begin{bmatrix} +\bm{f} \\ +f_* +\end{bmatrix} \sim +\mathcal{N}\biggl(\bm{0}, \begin{bmatrix} \Kmat & \bm{k}_* \\ \bm{k}_*^T & \bm{k}_{**}\end{bmatrix}\biggr). +$$ + +Here, $\Kmat = \left(k\left(\xi, \xv^{(j)}\right)\right)_{i,j}$, $\bm{k}_* = \left[k\left(\xv_*, \xi[1]\right), ..., k\left(\xv_*, \xi[n]\right)\right]$ and $ \bm{k}_{**}\ = k(\xv_*, \xv_*)$. + +\framebreak + +\item Given that $\bm{f}$ is observed, we can apply the general rule for condition $^{(*)}$ of Gaussian random variables and obtain the following formula: + +\begin{eqnarray*} +f_* ~|~ \xv_*, \Xmat, \bm{f} \sim \mathcal{N}(\bm{k}_{*}^{T}\Kmat^{-1}\bm{f}, \bm{k}_{**} - \bm{k}_*^T \Kmat ^{-1}\bm{k}_*). +\end{eqnarray*} + +% \begin{eqnarray*} +% \bm{f}_* | \Xmat_*, \Xmat, \bm{f} \sim \mathcal{N}(\Kmat_{*}^{T}\Kmat^{-1}\bm{f}, \Kmat_{**} - \Kmat_*^T \Kmat ^{-1}\Kmat_*). +% \end{eqnarray*} +\item As the posterior is a Gaussian, the maximum a-posteriori estimate, i.e. the mode of the posterior distribution, is $\bm{k}_{*}^{T}\Kmat^{-1}\bm{f}. $ +\end{itemize} + +\framebreak + +$^{(*)}$ General rule for condition of Gaussian random variables: + +\lz + + If the $m$-dimensional Gaussian vector $\bm{z} \sim \mathcal{N}(\mu, \Sigma)$ can be partitioned with $\bm{z} = \left(\bm{z}_1, \bm{z}_2\right)$ where $\bm{z}_1$ is $m_1$-dimensional and $\bm{z}_2$ is $m_2$-dimensional, and: +$$\left(\mu_1, \mu_2\right), \quad \Sigma = \begin{pmatrix} \Sigma_{11} & \Sigma_{12} \\ \Sigma_{21} & \Sigma_{22} \end{pmatrix},$$ + +then the conditioned distribution of $\bm{z}_2 ~|~ \bm{z}_1 = \bm{a}$ is a multivariate normal + +$$ + \mathcal{N}\left(\mu_2 + \Sigma_{21} \Sigma_{11}^{-1}\left(\bm{a} - \mu_1\right), \Sigma_{22} - \Sigma_{21}\Sigma_{11}^{-1}\Sigma_{12} \right) +$$ + +\end{vbframe} + +\begin{frame}{GP prediction: Two points} + +Let us visualize this by a simple example: +\begin{itemize} + \item Assume we observed a single training point $\xv = - 0.5$, and want to make a prediction at a test point $\xv_* = 0.5$. + \item Under a zero-mean GP with $k(\xv, \xv^\prime) = \exp(-\frac{1}{2}\|\xv - \xv^\prime\|^2)$, we compute the cov-matrix: + $$ + \begin{bmatrix} f \\ f_* \end{bmatrix} \sim \mathcal{N}\biggl(\bm{0}, \begin{bmatrix} 1 & 0.61 \\ 0.61 & 1\end{bmatrix}\biggr). + $$ + \item Assume that we observe the point $\fx = 1$. 
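+% Commented R sketch reproducing the two-point example numerically; the exact
+% correlation is exp(-1/2) ~ 0.607, which the slides round to 0.61:
+% <<eval = FALSE>>=
+% k <- function(x1, x2) exp(-0.5 * (x1 - x2)^2)
+% K     <- k(-0.5, -0.5)            # = 1
+% kstar <- k(0.5, -0.5)             # ~ 0.607
+% kss   <- k(0.5, 0.5)              # = 1
+% f     <- 1                        # observed value f(x) = 1
+% post_mean <- kstar / K * f        # ~ 0.61
+% post_var  <- kss - kstar^2 / K    # ~ 0.63
+% @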
+ \item We compute the posterior distribution: + \begin{eqnarray*} + f_* ~|~ \xv_*, \xv, f &\sim& \mathcal{N}(\bm{k}_{*}^{T}\Kmat^{-1}f, k_{**} - \bm{k}_*^T \Kmat^{-1}\bm{k}_*) \\ + &\sim& \mathcal{N}(0.61 \cdot 1 \cdot 1, 1 - 0.61 \cdot 1 \cdot 0.61) \\ + &\sim& \mathcal{N}\left(0.61, 0.6279\right) + \end{eqnarray*} + \item The MAP-estimate for $\xv_*$ is $f(\xv_*) = 0.61$, and the uncertainty estimate is $0.6279$. + % what can we say about the function value at a new point $\xv_* = -0.5$? + % \item<+-> We compute the covariance function and have + + % \item<+-> After observing $\fx = 1$ we want to predict a value for $f(\xv_*)$. + % \item<+-> We compute the posterior distribution conditioned on $\fx = 1$ + % \begin{eqnarray*} + + % \end{eqnarray*} +\end{itemize} + +\end{frame} + + +\begin{vbframe}{GP prediction: Two points} + +\begin{footnotesize} + Shown is the bivariate normal density, and the respective marginals. +\end{footnotesize}\vspace*{0.2cm} + +\begin{figure} + \includegraphics[width=0.8\textwidth]{figure/gp_pred/1.pdf} +\end{figure} + + +\end{vbframe} + +\begin{frame}{GP prediction: Two points} + +\begin{footnotesize} + Assume we observed $\fx = 1$ for the training point $\xv = -0.5$. +\end{footnotesize}\vspace*{0.2cm} + +\begin{figure} + \includegraphics[width=0.8\textwidth]{figure/gp_pred/2.pdf} +\end{figure} + +\end{frame} +\begin{frame}{GP prediction: Two points} + +\begin{footnotesize} + We condition the Gaussian on $\fx = 1$. +\end{footnotesize}\vspace*{0.2cm} + +\begin{figure} + \includegraphics[width=0.8\textwidth]{figure/gp_pred/3.pdf} +\end{figure} + +\end{frame} + + +\begin{frame}{GP prediction: Two points} + +\begin{footnotesize} + We compute the posterior distribution of $f(\xv_*)$ given that $\fx = 1$. +\end{footnotesize}\vspace*{0.2cm} + + +\begin{figure} + \includegraphics[width=0.8\textwidth]{figure/gp_pred/4.pdf} +\end{figure} + + +\end{frame} +\begin{frame}{GP prediction: Two points} + +\begin{footnotesize} + A possible predictor for $f$ at $\xv_*$ is the MAP of the posterior distribution. +\end{footnotesize}\vspace*{0.2cm} + +\begin{figure} + \includegraphics[width=0.8\textwidth]{figure/gp_pred/5.pdf} +\end{figure} + + +\end{frame} + +\begin{frame}{GP prediction: Two points} + +\begin{footnotesize} + We can do this for different values $\xv_*$, and show the respective mean (grey line) and standard deviations (grey area is mean $\pm 2 \cdot $ posterior standard deviation). +\end{footnotesize}\vspace*{0.2cm} + + +\begin{figure} + \includegraphics[width=0.8\textwidth]{figure/gp_pred/6.pdf} +\end{figure} + +\end{frame} + +\begin{vbframe}{Posterior Process} + +\begin{itemize} + \item We can generalize the formula for the posterior process for multiple unobserved test points: + +$$ + \bm{f}_* = \left[f\left(\xi[1]_*\right), ..., f\left(\xi[m]_*\right)\right]. +$$ + \item Under a zero-mean Gaussian process, we have + $$ + \begin{bmatrix} + \bm{f} \\ + \bm{f}_* + \end{bmatrix} \sim + \mathcal{N}\biggl(\bm{0}, \begin{bmatrix} \Kmat & \Kmat_* \\ \Kmat_*^T & \Kmat_{**} \end{bmatrix}\biggr), + $$ + with $\Kmat_* = \left(k\left(\xi, \xv_*^{(j)}\right)\right)_{i,j}$, $\Kmat_{**} = \left(k\left(\xi[i]_*, \xi[j]_*\right)\right)_{i,j}$. + + \framebreak + + \item Similar to the single test point situation, to get the posterior distribution, we exploit the general rule of conditioning for Gaussians: + \begin{eqnarray*} + \bm{f}_* ~|~ \Xmat_*, \Xmat, \bm{f} \sim \mathcal{N}(\Kmat_{*}^{T}\Kmat^{-1}\bm{f}, \Kmat_{**} - \Kmat_*^T \Kmat ^{-1}\Kmat_*). 
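+% Commented R sketch (illustrative toy data) of the posterior process over a grid of
+% test inputs in the noise-free case, with sample paths drawn via mvtnorm (already
+% used elsewhere in these slides); the diagonal jitter is for numerical stability only:
+% <<eval = FALSE>>=
+% kse <- function(x1, x2, l = 1) exp(-0.5 * (x1 - x2)^2 / l^2)
+% x  <- c(-1.5, 0, 1); f <- c(0.5, -0.8, 1.2)   # noise-free training data
+% xs <- seq(-3, 3, length.out = 100)
+% K   <- outer(x, x, kse)
+% Ks  <- outer(x, xs, kse)
+% Kss <- outer(xs, xs, kse)
+% post_mean <- as.vector(t(Ks) %*% solve(K, f))
+% post_cov  <- Kss - t(Ks) %*% solve(K, Ks)
+% post_cov  <- (post_cov + t(post_cov)) / 2     # symmetrize against rounding error
+% paths <- mvtnorm::rmvnorm(3, mean = post_mean,
+%                           sigma = post_cov + 1e-8 * diag(length(xs)))
+% @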
+ \end{eqnarray*} + \item This formula enables us to talk about correlations among different test points and sample functions from the posterior process. +\end{itemize} + +\end{vbframe} + + +\section{Properties of a Gaussian Process} + +\begin{vbframe}{GP as interpolator} + +The \enquote{prediction} for a training point $\xi$ is the exact function value $\fxi$ + +\vspace*{-0.8cm} + +\begin{eqnarray*} +\bm{f} ~|~ \Xmat, \bm{f} \sim \mathcal{N}(\Kmat\Kmat^{-1}\bm{f}, \Kmat - \Kmat^T \Kmat^{-1} \Kmat) = \mathcal{N}(\bm{f}, \bm{0}). +\end{eqnarray*} + +Thus, a Gaussian process is a function \textbf{interpolator}. + +\begin{center} +\includegraphics[width=0.8\textwidth]{figure/gp_pred/gp_interpolator.pdf} +\end{center} +% \begin{footnotesize} +% A the posterior process (black) after observing the training points (red) interpolates the training points. +% \end{footnotesize} + +\end{vbframe} + + +\begin{vbframe}{GP as a spatial model} + +\vspace*{-0.3cm} + +\begin{itemize} + \begin{footnotesize} + \item The correlation among two outputs depends on distance of the corresponding input points $\xv$ and $\xv^\prime$ (e.g. Gaussian covariance kernel $k(\xv, \xv^\prime) = \exp \left(\frac{- \|\xv - \xv^\prime\|^2}{2 l^2}\right)$ ) + \item Hence, close data points with high spatial similarity $k(\xv, \xv^\prime)$ enter into more strongly correlated predictions: $\bm{k}_*^\top \bm{K}^{-1} \bm{f}$ ($\bm{k}_* := \left(k(\xv, \xv^{(1)}), ..., k(\xv, \xv^{(n)})\right)$). + \end{footnotesize} + + +\begin{center} +\includegraphics[width=0.5\textwidth]{figure/gp_pred/post_mean.pdf} +\end{center} + + +\begin{footnotesize} +Example: Posterior mean of a GP that was fitted with the Gaussian covariance kernel with $l = 1$. +\end{footnotesize} + + +\framebreak + +\item Posterior uncertainty increases if the new data points are far from the design points. +\item The uncertainty is minimal at the design points, since the posterior variance is zero at these points. +\end{itemize} + + +\begin{center} +\includegraphics[width=0.5\textwidth]{figure/gp_pred/post_variance.pdf} +\end{center} + +\begin{footnotesize} +Example (continued): Posterior variance. +\end{footnotesize} + + +\end{vbframe} + + +\section{Noisy Gaussian Process} + +\begin{vbframe}{Noisy Gaussian Process} + +\begin{itemize} + \item So far, we implicitly assumed that we had access to the true function value $\fx$. + \item For the squared exponential kernel, for example, we have + $$ + \cov\left(f(\xi), f(\xi)\right) = 1. + $$ + \item As a result, the posterior Gaussian process is an interpolator: + \begin{center} + \includegraphics[width=0.8\textwidth]{figure/gp_pred/gp_interpolator.pdf} + \end{center} + +\framebreak + + \item In reality, however, this is often not the case. + \item We often only have access to a noisy version of the true function value + $$ + y = \fx + \eps, \eps \sim\mathcal{N}\left(0, \sigma^2\right). + $$ + \item Let us still assume that $\fx$ is a Gaussian process. + \item Then, + \begin{footnotesize} + \begin{eqnarray*} + &&\cov(y^{(i)}, y^{(j)}) = \cov\left(f\left(\xi\right) + \epsilon^{(i)}, f\left(\xi[j]\right) + \epsilon^{(j)}\right) \\ + &=& \cov\left(f\left(\xi\right), f\left(\xi[j]\right)\right) + 2 \cdot \cov\left(f\left(\xi\right), \epsilon^{(j)}\right) + \cov\left(\epsilon^{(i)}, \epsilon^{(j)}\right) + \\ &=& k\left(\xi, \xi[j]\right) + \sigma^2 \delta_{ij}. + \end{eqnarray*} + \end{footnotesize} + \item $\sigma^2$ is called \textbf{nugget}. 
+\end{itemize} + +\framebreak + +\begin{itemize} + \item Let us now derive the predictive distribution for the case of noisy observations. + \item The prior distribution of $y$, assuming that $f$ is modeled by a Gaussian process is then + $$ + \bm{y} = \begin{pmatrix} \yi[1] \\ \yi[2] \\ \vdots \\ \yi[n] \end{pmatrix} \sim \mathcal{N}\left(\bm{m}, \bm{K} + \sigma^2 \bm{I}_n \right), + $$ + with + \begin{eqnarray*} + \textbf{m} &:=& \left(m\left(\xi\right)\right)_{i}, \quad + \textbf{K} := \left(k\left(\xi, \xv^{(j)}\right)\right)_{i,j}. + \end{eqnarray*} + + \framebreak + + \item We distinguish again between + \begin{itemize} + \item observed training points $\Xmat, \yv$, and + \item unobserved test inputs $\Xmat_*$ with unobserved values $\bm{f}_*$ + \end{itemize} + and get + $$ + \begin{bmatrix} + \bm{y} \\ + \bm{f}_* + \end{bmatrix} \sim + \mathcal{N}\biggl(\bm{0}, \begin{bmatrix} \Kmat + \sigma^2 \bm{I}_n & \Kmat_* \\ \Kmat_*^T & \Kmat_{**} \end{bmatrix}\biggr). + $$ + +\framebreak + + \item Similarly to the noise-free case, we condition according to the rule of conditioning for Gaussians to get the posterior distribution for the test outputs $\bm{f}_*$ at $\Xmat_*$: + + \begin{eqnarray*} + \bm{f}_* ~|~ \Xmat_*, \Xmat, \bm{y} \sim \mathcal{N}(\bm{m}_{\text{post}}, \bm{K}_\text{post}). +\end{eqnarray*} + with + \begin{eqnarray*} + \bm{m}_{\text{post}} &=& \Kmat_{*}^{T} \left(\Kmat+ \sigma^2 \cdot \id\right)^{-1}\bm{y} \\ + \bm{K}_\text{post} &=& \Kmat_{**} - \Kmat_*^T \left(\Kmat + \sigma^2 \cdot \id\right)^{-1}\Kmat_*, + \end{eqnarray*} +\item This converts back to the noise-free formula if $\sigma^2 = 0$. + +\framebreak + +\item The noisy Gaussian process is not an interpolator any more. +\item A larger nugget term leads to a wider ``band'' around the observed training points. +\item The nugget term is estimated during training. + + +\begin{center} + \includegraphics[width=0.8\textwidth]{figure/gp_pred/gp_regression.pdf} +\end{center} +\end{itemize} + +\end{vbframe} + + + +\section{Decision Theory for Gaussian Processes} + +\begin{vbframe}{Risk Minimization for Gaussian Processes} + +In machine learning, we learned about risk minimization. We usually choose a loss function and minimize the empirical risk + +$$ + \riske(f) := \sumin \Lxyi +$$ +as an approximation to the theoretical risk + +$$ + \riskf := \E_{xy} [\Lxy] = \int \Lxy \text{d}\Pxy. +$$ + +\begin{itemize} + \item How does the theory of Gaussian processes fit into this theory? + \item What if we want to make a prediction which is optimal w.r.t. a certain loss function? +\end{itemize} + +\framebreak + +\begin{itemize} + \item The theory of Gaussian process gives us a posterior distribution + $$ + p(y ~|~\D) + $$ + \item If we now want to make a prediction at a test point $\bm{x}_*$, we approximate the theoretical risk in a different way, by using the posterior distribution: + $$ + \mathcal{R}(y_* ~|~ \bm{x}_*) \approx \int L(\tilde y_*, y_*) p(\tilde y_*~|~\bm{x}_*, \D)d\tilde y_*. + $$ + \item The optimal prediciton w.r.t the loss function is then: + $$ + \hat y_* | \bm{x}_* = \argmin_{y_*} \mathcal{R}(y_*~|~ \bm{x}_*). + $$ +\end{itemize} + + +% In practical applications, we are often forced to make predictions. We need a point-like prediction that is \enquote{optimal} in some sense. + +% \lz + +% We define \enquote{optimality} with respect to some loss function + +% $$ +% L(y_\text{true}, y_\text{guess}). 
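+% Commented R sketch (illustrative toy data) of noisy GP regression: identical to the
+% noise-free case except that the nugget sigma^2 is added to the diagonal of K:
+% <<eval = FALSE>>=
+% kse <- function(x1, x2, l = 1) exp(-0.5 * (x1 - x2)^2 / l^2)
+% x  <- c(-1.5, 0, 1); y <- c(0.4, -0.9, 1.1)
+% xs <- seq(-3, 3, length.out = 100)
+% sigma2 <- 0.1
+% Ky  <- outer(x, x, kse) + sigma2 * diag(length(x))
+% Ks  <- outer(x, xs, kse)
+% Kss <- outer(xs, xs, kse)
+% m_post <- as.vector(t(Ks) %*% solve(Ky, y))
+% K_post <- Kss - t(Ks) %*% solve(Ky, Ks)
+% pred_sd <- sqrt(pmax(diag(K_post), 0))        # pointwise posterior standard deviation
+% @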
+% $$ + +% \vfill + +% \begin{footnotesize} +% Notice that we computed the predictive distribution without reference to the loss function. In non-Bayesian paradigms, the model is typically trained by minimizing the empirical risk (or loss). In contrast, in the Bayesian setting there is a clear separation between the likelihood function (used for training in addition to the prior) and the loss function. +% \end{footnotesize} + +% \framebreak + +% As we do not know the true value $y_\text{true}$ for our test input $\bm{x}_*$, we minimize w.r.t. to the expected loss called \textbf{risk} w.r.t. our model's opinion as to what the truth might be + +% $$ +% \mathcal{R}(y_\text{guess} | \bm{x}_*) = \int L(y_*, y_\text{guess}) p(y_*|\bm{x}_*, \D)dy_*. +% $$ + +% Our best guess w.r.t. $L$ is then + +% $$ +% $$ + +% For quadratic loss $L(y, y^\prime) = (y - y^\prime)^2$ this corresponds to the posterior mean. + +\end{vbframe} + + +\endlecture +\end{document} \ No newline at end of file diff --git a/slides/gaussian-processes/slides-gp-training.tex b/slides/gaussian-processes/slides-gp-training.tex new file mode 100644 index 00000000..988afa1c --- /dev/null +++ b/slides/gaussian-processes/slides-gp-training.tex @@ -0,0 +1,215 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} +\input{../../latex-math/ml-gp} + +\title{Introduction to Machine Learning} + +\begin{document} + +\titlemeta{% Chunk title (example: CART, Forests, Boosting, ...), can be empty + Gaussian Processes + }{% Lecture title + Training of a Gaussian Process + }{% Relative path to title page image: Can be empty but must not start with slides/ + figure/gp_training/fit_vs_penalty.pdf + }{ + \item Training of GPs via Maximum Likelihood estimation of its hyperparameters + \item Computational complexity is governed by matrix inversion of the covariance matrix +} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +%\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +%\input{../../style/preamble} +%\input{../../latex-math/basic-math} +%\input{../../latex-math/basic-ml} +%\input{../../latex-math/ml-gp} + +%\newcommand{\titlefigure}{figure/gp_training/fit_vs_penalty.pdf} %does not fit +%\newcommand{\learninggoals}{ +% \item Training of GPs via Maximum Likelihood estimation of its hyperparameters +% \item Computational complexity is governed by matrix inversion of the covariance matrix +%} + +%\title{Introduction to Machine Learning} +%\date{} + +%\begin{document} + +%\lecturechapter{Gaussian Process Training} +%\lecture{Introduction to Machine Learning} + + +\begin{vbframe}{Training of a Gaussian process} + +\begin{itemize} +\item To make predictions for a regression task by a Gaussian process, one simply needs to perform matrix computations. +\vspace{.5cm} +\item But for this to work out, we assume that the covariance functions is fully given, including all of its hyperparameters.\vspace{.5cm} +\item A very nice property of GPs is that we can learn the numerical hyperparameters of a selected covariance function directly during GP training. +\end{itemize} + +\end{vbframe} + +\begin{vbframe}{Training a GP via maximum likelihood} + +Let us assume + +$$ + y = \fx + \eps, ~ \eps \sim \mathcal{N}\left(0, \sigma^2\right), +$$ +where $\fx \sim \mathcal{GP}\left(\bm{0}, k\left(\xv, \xv^\prime | \thetab \right)\right)$. 
+ +\lz + +Observing $\bm{y} \sim \mathcal{N}\left(\bm{0}, \bm{K} + \sigma^2 \id\right)$, the marginal log-likelihood (or evidence) is +\begin{eqnarray*} +\log p(\bm{y} ~|~ \bm{X}, \thetab) &=& \log \left[\left(2 \pi\right)^{-n / 2} |\bm{K}_y|^{-1 / 2} \exp\left(- \frac{1}{2} \bm{y}^\top \bm{K}_y^{-1} \bm{y}\right) \right]\\ +&=& -\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y} - \frac{1}{2} \log \left| \bm{K}_y \right| - \frac{n}{2} \log 2\pi. +\end{eqnarray*} + +with $\bm{K}_y:=\bm{K} + \sigma^2 \id$ and $\thetab$ denoting the hyperparameters (the parameters of the covariance function). + +\framebreak + + +The three terms of the marginal likelihood have interpretable roles, considering that +the model becomes less flexible as the length-scale increases: + +\begin{itemize} +\item the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, which tends to decrease if the length scale increases +\item the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$, which depends on the covariance function only and which increases with the length-scale, because the model gets less complex with growing length-scale +\item a normalization constant $- \frac{n}{2} \log 2\pi$ +\end{itemize} + +\end{vbframe} + +\begin{vbframe}{Training a GP: Example} + +To visualize this, we consider a zero-mean Gaussian process with squared exponential kernel + +$$ +k(\xv, \xv^\prime) = \exp\left(-\frac{1}{2\ls^2}\|\xv - \xv^\prime\|^2\right), +$$ + + +\begin{itemize} + \item Recall, the model is smoother and less complex for higher length-scale $\ls$. + \item We show how the + \begin{itemize} + \item data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, + \item the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$, and + \item the overall value of the marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetab)$ + \end{itemize} + behave for increasing value of $\ls$. +\end{itemize} + +\framebreak + +\begin{figure} + \includegraphics[width = 0.5\textwidth]{figure/gp_training/fit_vs_penalty.pdf}~ \includegraphics[width = 0.5\textwidth]{figure/gp_training/datapoints.pdf} +\end{figure} + +\begin{footnotesize} + The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetab)$ behave for increasing values of $\ls$. +\end{footnotesize} + + +\framebreak + +\begin{figure} + \includegraphics[width = 0.5\textwidth]{figure/gp_training/fit_vs_penalty_0_2.pdf}~ \includegraphics[width = 0.5\textwidth]{figure/gp_training/datapoints_0_2.pdf} +\end{figure} + +\begin{footnotesize} + The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetab)$ behave for increasing values of $\ls$.\\ + A small $\ls$ results in a good fit, but a high complexity penalty (low $- \frac{1}{2} \log \left| \bm{K}_y \right|$). 
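+% Commented R sketch evaluating the marginal log-likelihood above and maximizing it
+% over the length-scale l with a simple 1d optimizer; the noise variance is kept fixed
+% here for brevity, although in practice it is estimated jointly:
+% <<eval = FALSE>>=
+% kse <- function(x1, x2, l) exp(-0.5 * (x1 - x2)^2 / l^2)
+% log_marg_lik <- function(l, x, y, sigma2 = 0.1) {
+%   Ky <- outer(x, x, kse, l = l) + sigma2 * diag(length(x))
+%   -0.5 * sum(y * solve(Ky, y)) -
+%     0.5 * as.numeric(determinant(Ky, logarithm = TRUE)$modulus) -
+%     0.5 * length(y) * log(2 * pi)
+% }
+% set.seed(1)
+% x <- seq(-2, 2, length.out = 20); y <- sin(2 * x) + rnorm(20, sd = 0.3)
+% opt <- optimize(log_marg_lik, interval = c(0.05, 5), x = x, y = y, maximum = TRUE)
+% opt$maximum   # length-scale that maximizes the marginal log-likelihood
+% @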
+\end{footnotesize} + +\framebreak + +\begin{figure} + \includegraphics[width = 0.5\textwidth]{figure/gp_training/fit_vs_penalty_2.pdf}~ \includegraphics[width = 0.5\textwidth]{figure/gp_training/datapoints_2.pdf} +\end{figure} + +\begin{footnotesize} + The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetab)$ behave for increasing values of $\ls$.\\ + A large $\ls$ results in a poor fit. +\end{footnotesize} + +\framebreak + +\begin{figure} + \includegraphics[width = 0.5\textwidth]{figure/gp_training/fit_vs_penalty_0_5.pdf}~ \includegraphics[width = 0.5\textwidth]{figure/gp_training/datapoints_0_5.pdf} +\end{figure} + +\begin{footnotesize} + The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetab)$ behave for increasing values of $\ls$.\\ + The maximizer of the log-likelihood, $\ls = 0.5$, balances complexity and fit. +\end{footnotesize} + + +\end{vbframe} + +\begin{vbframe}{Training a GP via maximum likelihood} + +To set the hyperparameters by maximizing the marginal likelihood, we seek the partial derivatives w.r.t. the hyperparameters + +\begin{footnotesize} +\begin{eqnarray*} +\frac{\partial}{\partial\theta_j} \log p(\bm{y} ~|~ \bm{X}, \thetab) &=& \frac{\partial}{\partial\theta_j} \left(-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y} - \frac{1}{2} \log \left| \bm{K}_y \right| - \frac{n}{2} \log 2\pi\right) \\ +&=&\frac{1}{2} \yv^\top \bm{K}^{-1} \frac{\partial \bm{K}}{\partial \theta_j}\bm{K}^{-1} \yv - \frac{1}{2} \text{tr}\left(\bm{K}^{-1} \frac{\partial \bm{K}}{\partial \thetab} \right) \\ +&=& \frac{1}{2} \text{tr}\left((\bm{K}^{-1}\bm{y}\bm{y}^T\bm{K}^{-1} - \bm{K}^{-1})\frac{\partial\bm{K}}{\partial\theta_j}\right) +\end{eqnarray*} +\end{footnotesize} + +using $\frac{\partial}{\partial \theta_j} \bm{K}^{-1} = - \bm{K}^{-1} \frac{\partial \bm{K}}{\partial \theta_j}\bm{K}^{-1}$ and $\frac{\partial}{\partial \thetab} \log |\bm{K}| = \text{tr}\left(\bm{K}^{-1} \frac{\partial \bm{K}}{\partial \thetab} \right)$. + +\framebreak + +\begin{itemize} + \item The complexity and the runtime of training a Gaussian process is dominated by the computational task of inverting $\bm{K}$ - or let's rather say for decomposing it. + \item Standard methods require $\order(n^3)$ time (!) for this. + \item Once $\bm{K}^{-1}$ - or rather the decomposition -is known, the computation of the partial derivatives requires only $\order(n^2)$ time per hyperparameter. + \item Thus, the computational overhead of computing derivatives is small, so using a gradient based optimizer is advantageous. +\end{itemize} + + +\framebreak + +% https://arxi.org/pdf/1807.01065v1.pdf +Workarounds to make GP estimation feasible for big data include: +\begin{itemize} +\item using kernels that yield sparse $\bm K$: cheaper to invert. +\item subsampling the data to estimate $\theta$: $\order(m^3)$ for subset of size $m$. +\item combining estimates on different subsets of size $m$:\\ \textbf{Bayesian committee}, $\order(n m^2)$. 
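+% Commented R sketch of the gradient formula above for the squared exponential kernel,
+% for which dK/dl = K * d^2 / l^3 elementwise (d = pairwise distances); the nugget term
+% does not depend on l, so dKy/dl = dK/dl:
+% <<eval = FALSE>>=
+% grad_l <- function(l, x, y, sigma2 = 0.1) {
+%   D2 <- outer(x, x, function(a, b) (a - b)^2)
+%   K  <- exp(-0.5 * D2 / l^2)
+%   Ky <- K + sigma2 * diag(length(x))
+%   dK <- K * D2 / l^3
+%   alpha <- solve(Ky, y)
+%   0.5 * sum(diag((tcrossprod(alpha) - solve(Ky)) %*% dK))
+% }
+% grad_l(0.5, x, y)   # assumes the toy x, y from the chunk above
+% @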
+\item using low-rank approximations of $\bm{K}$ by using only a representative subset (\enquote{inducing points}) of $m$ training data $\bm X_m$:\\ \textbf{Nyström approximation} $\bm K \approx \bm K_{nm} \bm K_{mm}^{-} \bm K_{mn}$,\\ $\order(nmk + m^3)$ for a rank-k-approximate inverse of $\bm K_{mm}$. +\item exploiting structure in $\bm{K}$ induced by the kernel: exact solutions but complicated maths, not applicable for all kernels. +\end{itemize} + +... this is still an active area of research. + +\end{vbframe} + +% \begin{vbframe}{Gaussian process as a linear smoother} +% +% Let's consider mean prediction at training points only. For simplicity, we write $\bm{K}:= \bm{K}(\Xmat, \Xmat)$. The predicted mean values at the training points are +% +% $$ +% \bm{\bar f} = \bm{K}(\bm{K} + \sigma_n^2\id)^{-1}. +% $$ +% +% Let $\bm{K}$ have the eigendecomposition $\bm{K} = \sumin \lambda_i\bm{u}_i \bm{u}_i^T. $\lambda_i$ is the $i$-th eigenvalue, and $\bm{u}_i$ is the corresponding eigenvalue. The predicted mean can be written as +% +% $$ +% \bm{\bar f} = \sumin \frac{\gamma_i \lambda_i}{\lambda_i + \sigma_n^2}\bm{u}_i +% $$ +% +% +% \end{vbframe} + +\endlecture +\end{document} \ No newline at end of file diff --git a/slides/information-theory/slides-info-entropy2.tex b/slides/information-theory/slides-info-entropy2.tex index 41ffea98..508d1aed 100644 --- a/slides/information-theory/slides-info-entropy2.tex +++ b/slides/information-theory/slides-info-entropy2.tex @@ -108,12 +108,15 @@ \begin{vbframe}{The Maximum Entropy Principle} Can be solved via Lagrangian multipliers (here with base $e$) -\small{$$L(p(x),(\lambda_m)_{m=0}^{M}) = - \sum_{x \in \Xspace} p(x) \log(p(x)) + \lambda_0 \big( \sum_{x \in \Xspace} p(x) - 1 \big) + \sum_{m=1}^{M} \lambda_m \big( \sum_{x \in \Xspace} g_m(x)p(x)-\alpha_m \big)$$} +\footnotesize{$$L(p(x),(\lambda_m)_{m=0}^{M}) = - \sum_{x \in \Xspace} p(x) \log(p(x)) + \lambda_0 \big( \sum_{x \in \Xspace} p(x) - 1 \big) + \sum_{m=1}^{M} \lambda_m \big( \sum_{x \in \Xspace} g_m(x)p(x)-\alpha_m \big)$$ +} Finding critical points $p^{\ast}(x)$ : $$\frac{\partial L}{\partial p(x)} = -\log(p(x)) -1 + \lambda_0 + \sum_{m=1}^{M} \lambda_m g_m(x) \overset{!}{=} 0 \iff p^{\ast}(x)=\textcolor{blue}{\exp(\lambda_0-1)}\textcolor{red}{ \exp\big(\sum_{m=1}^{M} \lambda_m g_m(x)\big)}$$ This is a maximum as $-1/p(x)<0$. Since probs must sum to 1 we get +{\footnotesize $$1\overset{!}{=}\sum_{x \in \Xspace} p^{\ast}(x)=\textcolor{blue}{\frac{1}{\exp(1-\lambda_0)}} \sum_{x \in \Xspace} \textcolor{red}{\exp\big(\sum_{m=1}^{M} \lambda_m g_m(x)\big)} \Rightarrow \textcolor{blue}{\exp(1-\lambda_0)}=\textcolor{blue}{\sum_{x \in \Xspace} \exp\big(\sum_{m=1}^{M} \lambda_m g_m(x)\big)}$$ +} Plugging $\textcolor{blue}{\exp(1-\lambda_0)}$ into $p^{\ast}(x)$ we obtain the constrained maxent distribution: $$p^{\ast}(x)=\frac{\textcolor{red}{\exp{\sum_{m=1}^{M}\lambda_m g_m(x)}}}{\textcolor{blue}{\sum_{x \in \Xspace} \exp{\sum_{m=1}^{M}\lambda_m g_m(x)}}}$$
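+% Commented R sketch of a classic worked example of the maxent form above: the
+% distribution on {1, ..., 6} with the single constraint E[X] = 4.5 (i.e. g_1(x) = x);
+% the multiplier lambda is found numerically:
+% <<eval = FALSE>>=
+% xs <- 1:6
+% moment <- function(lambda) sum(xs * exp(lambda * xs)) / sum(exp(lambda * xs))
+% lambda <- uniroot(function(l) moment(l) - 4.5, interval = c(-5, 5))$root
+% p <- exp(lambda * xs) / sum(exp(lambda * xs))   # maxent distribution, sums to 1
+% @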