Merge overleaf-2024-10-17-2027 into main
Showing 118 changed files with 3,713 additions and 5 deletions.
@@ -0,0 +1 @@
include ../tex.mk
slides/gaussian-processes/attic/slides-x-covariance-adv.tex
100 changes: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
\input{../../style/preamble}
\input{../../latex-math/basic-math}
\input{../../latex-math/basic-ml}
\input{../../latex-math/ml-gp}

\newcommand{\titlefigure}{figure_man/up-crossings.png}
\newcommand{\learninggoals}{
\item \textcolor{blue}{XXX}
\item \textcolor{blue}{XXX}
}

\title{Introduction to Machine Learning}
\date{}

\begin{document}

\lecturechapter{Covariance Functions for GPs - Advanced}
\lecture{Introduction to Machine Learning}

\begin{vbframe}{MS-Continuity and Differentiability}

We wish to describe a Gaussian process in terms of its smoothness. There are several notions of continuity for random variables; one of them is continuity / differentiability in mean square (MS):

\begin{block}{Definition}
A Gaussian process $f(\xv)$ is said to be
\begin{itemize}
\item continuous in MS at $\xv_*$ if $\E[|f(\xv^{(k)}) - f(\xv_*)|^2] \overset{k \to \infty}{\longrightarrow} 0$ for any sequence $\xv^{(k)} \overset{k \to \infty}{\to} \xv_*$,
\item MS differentiable in direction $i$ if the limit $\lim_{h\to 0} \frac{f(\xv + h\bm{e}_i) - f(\xv)}{h}$ exists in mean square, where $\bm{e}_i = (0,\dots,0,1,0,\dots,0)^T$ is the unit vector along the $i$-th axis.
\end{itemize}
\end{block}

\textbf{Remark:} MS continuity / differentiability does not necessarily imply continuity / differentiability of the sampled functions!

\framebreak

MS continuity / differentiability of a Gaussian process can be derived from the smoothness properties of the kernel:

\begin{itemize}
\item The GP is continuous in MS if and only if the covariance function $k(\xv, \xv^\prime)$ is continuous.
\item The MS derivative of a Gaussian process exists iff the second derivative $\frac{\partial^{2} k(\xv, \xv^\prime)}{\partial \xv \partial \xv^\prime}$ exists at $\xv = \xv^\prime$ (see the example below).
\end{itemize}
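
% Editor's addition: an illustrative example contrasting two standard kernels under the criteria above.
\lz

\textbf{Example:} The squared exponential kernel $k(\xv, \xv^\prime) = \exp\bigl(-\|\xv - \xv^\prime\|^2 / (2\ls^2)\bigr)$ is infinitely differentiable, so the corresponding GP is MS continuous and MS differentiable of any order. The exponential kernel $k(\xv, \xv^\prime) = \exp\bigl(-\|\xv - \xv^\prime\| / \ls\bigr)$ is continuous but not differentiable at $\xv = \xv^\prime$, so the corresponding GP (the Ornstein-Uhlenbeck process in one dimension) is MS continuous but not MS differentiable.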

\end{vbframe}

\begin{vbframe}{Squared Exponential Covariance Function}

One commonly used covariance function is the squared exponential covariance function:

$$
k(\xv, \xv^\prime) = \exp\biggl(- \frac{\|\xv - \xv^\prime\|^2}{2\ls^2}\biggr)
$$

\textbf{Properties}:
\begin{itemize}
\item since it depends only on the distance $r = \|\xv - \xv^\prime\|$, it is an isotropic (and thus also stationary) covariance function (see the example below)
\item infinitely differentiable $\to$ the corresponding GP is thus very smooth
\item due to its strong smoothness assumptions, it is often unrealistic for modeling many physical processes
\end{itemize}
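
% Editor's addition: a numeric illustration of $\ls$ acting as a characteristic length-scale.
\lz

\textbf{Example:} At distance $r = \ls$ the correlation is $\exp(-1/2) \approx 0.61$, while at $r = 3\ls$ it has dropped to $\exp(-9/2) \approx 0.01$. Function values that are more than a few length-scales apart are thus nearly uncorrelated.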

\end{vbframe}

\begin{vbframe}{Upcrossing Rate and Characteristic Length-Scale}

Another way to characterize a Gaussian process is the expected number of up-crossings of level $0$ on the unit interval, which we denote by $N_0$.

\begin{figure}
\includegraphics[width=0.7\textwidth]{figure_man/up-crossings.png}
\end{figure}

For an isotropic covariance function $k(r)$, it can be shown that the expected number of up-crossings can be calculated explicitly:

$$
\E[N_0] = \frac{1}{2\pi} \sqrt{\frac{- k^{\prime \prime}(0)}{k(0)}}.
$$

\framebreak

\textbf{Example:} Squared exponential

\begin{eqnarray*}
k(r) &=& \exp\biggl(-\frac{r^2}{2\ls^2}\biggr)\\
k^\prime(r) &=& - k(r) \cdot \frac{r}{\ls^2} \\
k^{\prime\prime}(r) &=& k(r) \cdot \frac{r^2}{\ls^4} - k(r) \cdot \frac{1}{\ls^2}
\end{eqnarray*}

The expected number of level-0 upcrossings is thus

$$
\E[N_0] = \frac{1}{2\pi} \sqrt{\frac{- k^{\prime\prime}(0)}{k(0)}} = \frac{1}{2\pi} \sqrt{\frac{1}{\ls^2}} = (2\pi \ls)^{-1}.
$$
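
% Editor's addition: a quick numeric check of the formula above.
For instance, $\ls = (2\pi)^{-1} \approx 0.16$ gives $\E[N_0] = 1$, i.e., one expected zero-upcrossing per unit interval, and halving the length-scale doubles the expected number of upcrossings.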

\end{vbframe}

\endlecture
\end{document}
slides/gaussian-processes/attic/slides-x-gp-additional.tex
219 changes: 219 additions & 0 deletions
@@ -0,0 +1,219 @@
\input{../../style/preamble}
\input{../../latex-math/basic-math}
\input{../../latex-math/basic-ml}
\input{../../latex-math/ml-gp}

\newcommand{\titlefigure}{figure_man/post-mean.png} % does not fit
\newcommand{\learninggoals}{
\item \textcolor{blue}{XXX}
\item \textcolor{blue}{XXX}
}

\title{Introduction to Machine Learning}
\date{}

\begin{document}

\lecturechapter{Gaussian Processes: Additional Material}
\lecture{Introduction to Machine Learning}

%http://www.gaussianprocess.org/gpml

\begin{vbframe}{Notation}
% We would like to model a function
%
% $$
% f: \Xspace \to \Yspace
% $$
%
% where
%
% \begin{itemize}
% \item $\Xspace$ is a $p$-dimensional input space (here: $\Xspace = \R^p$)
% \item $\Yspace$ is the target space (usually $\Yspace = \R$ for regression and $\Yspace = \{0, 1\}$ for binary classification)
% \item $\bm{x} \in \Xspace$ is called independent / predictor variable
% \item $y \in \Yspace$ is called dependent variable (target, label, output)
% \end{itemize}
%
% \framebreak
%

In this chapter,

\begin{itemize}
\item $(\xv_*, y_*)$ denotes a single test observation, excluded from training,
\item $\Xmat_* \in \R^{n_* \times p}$ contains a set of $n_*$ test observations, and
\item $\yv_* \in \R^{n_*}$ the corresponding outcomes, also excluded from training.
\end{itemize}

% \framebreak

% In the context of Gaussian processes

% \begin{itemize}
% \item the function $m: \Xspace \to \R$ is called \textbf{mean function}. We define the \textbf{mean vector}

% \vspace*{-0.3cm}
% $$
% m(\Xmat):= \biggl(m\left(\bm{x}^{(1)}\right), m\left(\bm{x}^{(2)}\right), ..., m\left(\bm{x}^{(n)}\right)\biggr)^T
% $$
% \item the bivariate, positive-definite function $k: \Xspace \times \Xspace \to \R$ is called \textbf{covariance function} or \textbf{kernel}; $k(\Xmat, \Xmat)$ denotes the $n\times n$ matrix that is obtained by plugging in all pairs $\bm{x}^{(i)}, \bm{x}^{(j)}$ and is called \textbf{kernel matrix} or \textbf{covariance matrix}

% $$
% k(\Xmat, \Xmat) := k(\bm{x}^{(i)}, \bm{x}^{(j)})_{i, j = 1, ..., n}
% $$
% \item We sometimes use the abbreviations $\bm{K} := k(\Xmat, \Xmat)$, $\bm{K}_* := k(\Xmat_*, \Xmat)$, $\bm{K}_{**} := k(\Xmat_*, \Xmat_*)$.

% \end{itemize}

\end{vbframe}

\section{Noisy Gaussian Processes}

\begin{vbframe}{Noisy Gaussian Process}

In the above equations we implicitly assumed that we had access to the true function value $\fx$. In many cases, we only have access to a noisy version thereof,
$$
y = \fx + \eps.
$$

Assuming additive i.i.d. Gaussian noise $\eps \sim \mathcal{N}(0, \sigma_n^2)$, the covariance function becomes

$$
\cov(y^{(i)}, y^{(j)}) = k(\bm{x}^{(i)}, \bm{x}^{(j)}) + \sigma_n^2 \delta_{ij},
$$

where $\delta_{ij} = 1$ if $i = j$ and $\delta_{ij} = 0$ otherwise (Kronecker delta). In matrix notation, this becomes

$$
\cov(\yv) = \Kmat + \sigma_n^2\id =: \Kmat_y.
$$

The noise variance $\sigma_n^2$ is also called the \textbf{nugget}.
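
% Editor's addition: a small numeric illustration, assuming two training points with the stated kernel values.
\lz

\textbf{Example:} For $n = 2$ training points with $k(\bm{x}^{(1)}, \bm{x}^{(1)}) = k(\bm{x}^{(2)}, \bm{x}^{(2)}) = 1$, $k(\bm{x}^{(1)}, \bm{x}^{(2)}) = 0.5$ and $\sigma_n^2 = 0.1$,
$$
\Kmat_y = \begin{pmatrix} 1 & 0.5 \\ 0.5 & 1 \end{pmatrix} + 0.1 \cdot \id = \begin{pmatrix} 1.1 & 0.5 \\ 0.5 & 1.1 \end{pmatrix},
$$
i.e., the nugget only shifts the diagonal and thus the variance of each noisy observation.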

\end{vbframe}

\begin{vbframe}{GP vs. kernelized Ridge regression}

The predictive distribution is then

\begin{eqnarray*}
\bm{f}_* \mid \Xmat_*, \Xmat, \yv \sim \mathcal{N}(\bm{\bar f}_*, \cov(\bm{\bar f}_*))
\end{eqnarray*}

with

\begin{itemize}
\item $\bm{\bar f}_* = \Kmat_{*}^{T} \Kmat_y^{-1}\yv$ and
\item $\cov(\bm{\bar f}_*) = \Kmat_{**} - \Kmat_{*}^{T}\Kmat_y^{-1}\Kmat_*$.
\end{itemize}

The predicted mean values at the training points $\bm{\bar f} = \bm{K}\Kmat_y^{-1}\bm{y}$ are a \textbf{linear combination} of the $\bm{y}$ values.
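
% Editor's addition: a quick numeric check of the formulas above, assuming a single training point.
\lz

\textbf{Example:} For one training point with $k(\xv, \xv) = 1$, noise $\sigma_n^2 = 0.25$, and a test point with $k(\xv_*, \xv) = 0.5$, $k(\xv_*, \xv_*) = 1$, we get $\Kmat_y = 1.25$ and
$$
\bar f_* = \frac{0.5}{1.25} \, y = 0.4 \, y, \qquad \cov(\bar f_*) = 1 - \frac{0.5^2}{1.25} = 0.8,
$$
so the posterior mean is indeed a linear combination (here: a rescaling) of $y$.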

\lz

\textbf{Note:} Predicting the posterior mean corresponds exactly to the predictions obtained by kernelized Ridge regression. However, a GP (as a Bayesian model) gives us much more information, namely a posterior distribution, whilst kernelized Ridge regression does not.

\end{vbframe}

\section{Bayesian Linear Regression as a GP}

\begin{vbframe}{Bayesian linear regression as a GP}

One example of a Gaussian process is the Bayesian linear regression model covered earlier. For $\thetab \sim \mathcal{N}(\bm{0}, \tau^2 \id)$, the joint distribution of any set of function values

$$
f(\xi) = \thetab^T \xi + \epsi
$$

is Gaussian.

\vspace*{0.3cm}

The corresponding mean function is $m(\bm{x}) = 0$ and the covariance function is

\vspace*{-0.5cm}

\begin{eqnarray*}
\cov(f(\bm{x}), f(\bm{x}^\prime)) &=& \E[f(\bm{x}) f(\bm{x}^\prime)] - \underbrace{\E[f(\bm{x})] \E[f(\bm{x}^\prime)]}_{= 0} \\ &=& \E[(\thetab^T \bm{x} + \epsi)^T(\thetab^T \bm{x}^\prime + \epsi)] \\ &=& \tau^2 \bm{x}^T\bm{x}^\prime + \sigma^2 =: k(\bm{x}, \bm{x}^\prime).
\end{eqnarray*}
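
% Editor's addition: a quick numeric check of this (non-stationary) linear kernel, with made-up input values.
For example, for $\bm{x} = (1, 2)^T$, $\bm{x}^\prime = (2, 0)^T$, $\tau^2 = 1$ and $\sigma^2 = 0.1$, we get $k(\bm{x}, \bm{x}^\prime) = 1 \cdot (1 \cdot 2 + 2 \cdot 0) + 0.1 = 2.1$.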

% As we have just described, the predictive distribution assuming a Gaussian process prior for one single test point $\bm{x}^*$ is normal with mean
%
% $$
% (\bm{x}^*)^T \bm{X}^T (\Xmat\Xmat^T + \id)^{-1} \yv.
% $$
%
% Remember that we also derived a normal predictive distribution for the Bayesian linear regression case - the predictive mean was
%
% $$
% \mu_{\text{post}} = (\bm{x}^*)^T(\Xmat^T\Xmat + \sigma^2 \id)^{-1}\Xmat^T\yv.
% $$
%
% Using the matrix identity $(\bm{AB} + \id)^{-1}\Amat = \Amat(\bm{BA} + \id)^{-1}$ $^*$, it can be seen that the predictive distributions are identical.
%
% \vfill
% \begin{footnotesize}
% $^*$ Searle Set of Identities, see \emph{http://matrixcookbook.com}, 3.2
% \end{footnotesize}

\end{vbframe}

\begin{vbframe}{Feature Spaces and the Kernel Trick}

If one relaxes the linearity assumption by first projecting the features into a higher-dimensional feature space $\mathcal{Z}$ using a basis function $\phi: \Xspace \to \mathcal{Z}$, the corresponding covariance function is

$$
k(\bm{x}, \bm{x}^\prime) = \tau^2 \phi(\bm{x})^T\phi(\bm{x}^\prime) + \sigma^2.
$$

To get arbitrarily complicated functions, we would have to handle high-dimensional feature vectors $\phi(\bm{x})$.

\lz

Fortunately, all we need to know are the inner products $\phi(\bm{x})^T\phi(\bm{x}^\prime)$; the feature vectors themselves never occur in the calculations.

\framebreak

If we can get the inner product directly, \textbf{without} calculating the (possibly infinite-dimensional) feature vectors, we can infer an infinitely complicated model with a \textbf{finite amount} of computation. This idea is known as the \textbf{kernel trick}.
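
% Editor's addition: the classic textbook illustration of the kernel trick, using a small polynomial feature map.
\lz

\textbf{Example:} For $x \in \R$ and $\phi(x) = (1, \sqrt{2}\,x, x^2)^T$,
$$
\phi(x)^T\phi(x^\prime) = 1 + 2 x x^\prime + x^2 (x^\prime)^2 = (1 + x x^\prime)^2,
$$
so the polynomial kernel $(1 + x x^\prime)^2$ evaluates the inner product in this 3-dimensional feature space without ever computing $\phi$ explicitly.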

\lz

A Gaussian process can be defined by either

\begin{itemize}
\item deriving the covariance function explicitly via inner products of evaluations of basis functions, or
\item choosing a positive definite kernel function (Mercer kernel) directly, which, according to Mercer's theorem, corresponds to taking inner products in some (possibly infinite-dimensional) feature space.
\end{itemize}

\end{vbframe}

\begin{vbframe}{Summary: Gaussian process regression}

\begin{itemize}
\item Gaussian process regression is equivalent to \textbf{kernelized} Bayesian linear regression
\item The covariance function describes the shape of the Gaussian process
\item With the right choice of covariance function, remarkably flexible models can be built
\item But: naive implementations of Gaussian process models scale poorly with large datasets, as
\begin{itemize}
\item the kernel matrix has to be inverted / factorized, which is $\order(n^3)$,
\item computing the kernel matrix uses $\order(n^2)$ memory; running out of memory places a hard limit on problem sizes,
\item generating predictions is $\order(n)$ per test point for the mean, but $\order(n^2)$ for the variance.
\end{itemize}
(... so we need special tricks; see the example below)
\end{itemize}
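
% Editor's addition: rough back-of-the-envelope numbers to make the scaling above concrete.
\lz

\textbf{Example:} For $n = 10\,000$ training points, the kernel matrix has $n^2 = 10^8$ entries ($\approx 800\,$MB in double precision), and a Cholesky factorization requires on the order of $n^3/3 \approx 3 \cdot 10^{11}$ floating point operations.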

\end{vbframe}

\endlecture
\end{document}