diff --git a/slides/regularization/chapter-order.tex b/slides/regularization/chapter-order.tex index ca1d4f63..9f28130b 100644 --- a/slides/regularization/chapter-order.tex +++ b/slides/regularization/chapter-order.tex @@ -32,17 +32,24 @@ \subsection{L0 Regularization} \subsection{Nonlinear and Bayes} \includepdf[pages=-]{../slides-pdf/slides-regu-nonlin-bayes.pdf} -\subsection{Geometric Analysis of L2-Regularization and Weight Decay} +\subsection{Geometric Analysis of L2 Regularization and Weight Decay} \includepdf[pages=-]{../slides-pdf/slides-regu-geom-l2-wdecay.pdf} -\subsection{Geometric Analysis of L1-regularization} +\subsection{Geometric Analysis of L1 Regularization} \includepdf[pages=-]{../slides-pdf/slides-regu-geom-l1.pdf} +\subsection{Early Stopping} +\includepdf[pages=-]{../slides-pdf/slides-regu-early-stopping.pdf} + +\subsection{Details on Ridge Regression (Deep-Dive)} +\includepdf[pages=-]{../slides-pdf/slides-regu-ridge-deepdive.pdf} + \subsection{Soft-thresholding and L1 regularization deep-dive} \includepdf[pages=-]{../slides-pdf/slides-regu-lasso-deepdive.pdf} -\subsection{Early Stopping} -\includepdf[pages=-]{../slides-pdf/slides-regu-early-stopping.pdf} +\subsection{Bagging as a Regularization Method (Deep-Dive)} +\includepdf[pages=-]{../slides-pdf/slides-regu-bagging-deepdive.pdf} + diff --git a/slides/regularization/figure/eval_ofit_1a.pdf b/slides/regularization/figure/eval_ofit_1a.pdf new file mode 100644 index 00000000..7dfa288c Binary files /dev/null and b/slides/regularization/figure/eval_ofit_1a.pdf differ diff --git a/slides/regularization/figure/eval_ofit_1o.pdf b/slides/regularization/figure/eval_ofit_1o.pdf new file mode 100644 index 00000000..03080f7a Binary files /dev/null and b/slides/regularization/figure/eval_ofit_1o.pdf differ diff --git a/slides/regularization/figure_man/bagging.pdf b/slides/regularization/figure_man/bagging.pdf new file mode 100644 index 00000000..7540b570 Binary files /dev/null and b/slides/regularization/figure_man/bagging.pdf differ diff --git a/slides/regularization/figure_man/rf_majvot_averaging.png b/slides/regularization/figure_man/rf_majvot_averaging.png new file mode 100644 index 00000000..bc5f6844 Binary files /dev/null and b/slides/regularization/figure_man/rf_majvot_averaging.png differ diff --git a/slides/regularization/rsrc/fig-eval_ofit_1.R b/slides/regularization/rsrc/fig-eval_ofit_1.R new file mode 100644 index 00000000..42392c24 --- /dev/null +++ b/slides/regularization/rsrc/fig-eval_ofit_1.R @@ -0,0 +1,68 @@ + +library(mlr3misc) +library(mvtnorm) +library(mlr3) +library(mlr3learners) +library(mlr3viz) +library(ggplot2) +library(gridExtra) + +set.seed(600000) +n = 100000 + +mu1 = c(0, 3) +mu2 = c(3, 0) +s1 = matrix(c(1, 0.1, 0.1, 2), 2, 2, byrow = TRUE) +s2 = matrix(c(30, 0.3, 0.3, 1), 2, 2, byrow = TRUE) +d1 = as.data.table(rmvnorm(n = n/2, mean = mu1, sigma = s1)) +d1$class = 1 +d2 = as.data.table(rmvnorm(n = n/2, mean = mu2, sigma = s2)) +d2$class = 2 +dd = rbind(d1, d2) +dd$class = as.factor(dd$class) +oo = sample(n) +dd = dd[oo,] +task = TaskClassif$new("2dgauss", dd, target = "class") + +trainsize = 200 +trainset = 1:trainsize +testset = (trainsize+1):n + +l1 = lrn("classif.qda", predict_type = "prob") +l2 = lrn("classif.log_reg", predict_type = "prob") +l3 = lrn("classif.svm", type = "C-classification", predict_type = "prob", + kernel = "radial", gamma = 99, cost = 1) + +l1$train(task) +r1 = range(dd[trainset,]$V1) +r2 = range(dd[trainset,]$V2) +r1seq = seq(r1[1], r1[2], length.out = 200) +r2seq = 
seq(r2[1], r2[2], length.out = 200) +d_grid = expand.grid(V1 = r1seq, V2 = r2seq) +pred_true = as.data.table(l1$predict_newdata(d_grid)) +d_grid$prob = pred_true$prob.1 +true_decb = d_grid[d_grid$prob > 0.47 & d_grid$prob < 0.53,] + + +make_plot = function(ll, file_postfix) { + ll$train(task, row_ids = trainset) + pred_train = ll$predict(task, row_ids = trainset) + trainerr = pred_train$score(msr("classif.ce")) + pred_test = ll$predict(task, row_ids = testset) + testerr = pred_test$score(msr("classif.ce")) + fname = sprintf("../figure/eval_ofit_1%s.pdf", file_postfix) + task_train = task$filter(rows = trainset) + pl = plot_learner_prediction(ll, task) + guides(shape = FALSE, alpha = FALSE) + pl = pl + ggtitle(sprintf("TrainErr=%.2f; TestErr=%.2f", trainerr, testerr)) + pl = pl + geom_point(data = true_decb, alpha=0.5, size=0.2) + ggsave(plot = pl, filename = fname, width = 8, height = 6) + return(pl) +} + +p1 = make_plot(l1, file_postfix = "a") +p2 = make_plot(l2, file_postfix = "u") +p3 = make_plot(l3, file_postfix = "o") + +#grid.arrange(p1, p2, p3) +#print(p2) + diff --git a/slides/regularization/slides-regu-bagging-deepdive.tex b/slides/regularization/slides-regu-bagging-deepdive.tex new file mode 100644 index 00000000..69b8ae07 --- /dev/null +++ b/slides/regularization/slides-regu-bagging-deepdive.tex @@ -0,0 +1,155 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} +\input{../../latex-math/ml-ensembles.tex} + +\newcommand{\titlefigure}{figure_man/bagging.pdf} +\newcommand{\learninggoals}{ + \item Understand that bagging can be seen as a form of regularization + \item Know which factors influence the effectiveness of bagging +} + +\title{Introduction to Machine Learning} +\date{} + +\begin{document} + +\lecturechapter{Bagging as Regularization (Deep-Dive)} +\lecture{Introduction to Machine Learning} + + + +\begin{vbframe}{Recap: What is bagging?} + +\begin{itemize} + \item Bagging is short for \textbf{B}ootstrap \textbf{Agg}regation. + \item It's an \textbf{ensemble method}, i.e., it combines many models into one + big \enquote{meta-model}. Ensembles often work much better than their members alone would. + \item The components of an ensemble are called \textbf{base learners} (BLs) + % that improves instable / high variance learners by variance smoothing + \item In a \textbf{bagging} ensemble, all base learners are of the same type. 
The only difference between the models is the data they are trained on.\\ +\end{itemize} + +\begin{center} +% FIGURE SOURCE: https://docs.google.com/presentation/d/1xodP6ayu1Gay6mMKgzVWYEFmSoeG5kNuqsaTkFFmd78/edit +\includegraphics[width=0.45\textwidth]{figure_man/bagging.pdf} +\end{center} + +\framebreak +Specifically, we train base learners $\bl, m = 1, \dots, M$ on $M$ \textbf{bootstrap} samples of training data $\D$: +\begin{itemize} \setlength{\itemsep}{1.0em} + \item Draw $n$ observations from $\D$ with replacement + \item Fit the base learner on each of the $M$ bootstrap samples to get models $\fh(x) = \blh, m = 1, \dots, M$ +\item Aggregate predictions of the $M$ fitted base learners to get ensemble model $\fMh$ via \textbf{averaging} (regression) or \textbf{majority voting} (classification) + %\item Posterior probs $\pikxh$ can be estimated as predicted class frequencies over the ensemble +\end{itemize} +\vspace{0.1cm} +{\small Bagging helps because the variability of the averaged prediction over many base learners is smaller than the variability of the predictions from one such model. If the error of a BL is mostly due to (random) variability and not to structural reasons, bagging helps reduce this variability.} + + +%\begin{center} +% FIGURE SOURCE: No source +%\includegraphics[width=0.6\textwidth]{figure_man/rf_majvot_averaging.png} +%\end{center} +%\end{vbframe} + +% \begin{algorithm}[H] +% \small +% \setstretch{1.15} +% \caption*{Bagging algorithm} +% \begin{algorithmic}[1] +% \State {\bf Input: } Dataset $\D$, base learner, number of bootstraps $M$ +% \For {$m = 1 \to M$} +% \State Draw a bootstrap sample $\D^{[m]}$ from $\D$. +% \State Train base learner on $\D^{[m]}$ to obtain model $\bl$ +% \EndFor +% \State Aggregate the predictions of the $M$ estimators (via averaging or majority voting), to determine the bagging estimator: +% \begin{align*} +% \fM &= \frac{1}{M} \sum_{m=1}^M \bl \\ +% \text{or}\quad \fM &= \argmax_{k \in \Yspace} \sum_{m=1}^M \I\left(\bl = k\right) +% \end{align*} +% \end{algorithmic} +% \end{algorithm} + +\end{vbframe} + +\begin{vbframe}{Why/when does Bagging help?} + +%In one sentence:\\ +%\lz + +%Because the variability of the average of the predictions of many base learner models is smaller than the variability of the predictions from one such base learner model.\\ + +%If the error of a base learner model is mostly due to (random) variability and not due to structural reasons, combining many such base learners by bagging helps reducing this variability. +%\framebreak +\small Assume we use quadratic loss and measure instability of the ensemble with +\begin{scriptsize} +$\ambifM = \tfrac{1}{M}\sum^M_{m} \left(\bl- \fM \right)^2$: + \vskip -2em + \begin{align*} + \ambifM &= \tfrac{1}{M}\sum^M_{m} \left(\bl- \fM\right)^2 \\ + &= \tfrac{1}{M}\sum^M_{m} \left(\left(\bl - y\right) + \left(y - \fM\right)\right)^2\\ + &= \tfrac{1}{M}\sum^M_{m} L(y, \bl) + L(y, \fM) \underbrace{- 2 \left(y - \tfrac{1}{M}\sum^M_{m=1}\bl\right)\left(y - \fM\right)}_{- 2 L\left(y, \fM\right)} \\[-.5em] \end{align*} +\end{scriptsize} +\vspace{-0.3cm} +So, if we take the expected value over the data's distribution: +$${\scriptsize \Exy\left[L\left(y, \fM\right)\right] = \tfrac{1}{M}\sum^M_{m} \Exy\left[L\left(y, \bl \right)\right] - \Exy\left[\ambifM\right]}$$ +$\Rightarrow$ The expected loss of the ensemble is lower than the average loss of a single BL by the amount of instability in the ensemble's BLs. The more accurate and diverse the BLs, the better. 
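The decomposition above holds exactly (pointwise) for squared loss, so it can be verified numerically. A minimal R sketch for one observation y and M hypothetical base-learner predictions; simulated values, illustration only, not part of the slide sources or the repository:

# Check: L(y, fM) = mean_m L(y, f_m) - ambiguity, for squared loss
set.seed(1)
M = 25
y = 1.7                                    # arbitrary target value
f_m = rnorm(M, mean = 2)                   # predictions of M hypothetical base learners
f_M = mean(f_m)                            # bagging ensemble prediction (averaging)

loss_ens = (y - f_M)^2                     # loss of the ensemble
avg_loss = mean((y - f_m)^2)               # average loss of the base learners
ambiguity = mean((f_m - f_M)^2)            # instability of the ensemble
all.equal(loss_ens, avg_loss - ambiguity)  # TRUE up to floating-point error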
+\normalsize +\framebreak +\end{vbframe} + +\begin{vbframe}{Determinants of Bagging Effectiveness} + +{\small How to make $\Exy\left[\ambifM\right]$ as large as possible?} +\begin{scriptsize} + +%\shortintertext{How to make $\Exy\left[\ambifM\right]$ as large as possible?} +$$\Exy\left[L\left(y, \fM \right)\right] = \tfrac{1}{M}\sum^M_{m} \Exy\left[L\left(y, \bl \right)\right] - \Exy\left[\ambifM\right]$$ \\ +{\small For simplicity, assume $\Exy\left[\bl\right] = 0$, $\var_{xy}\left[\bl\right] = \Exy\left[(\bl)^2\right] = \sigma^2$, $\corr_{xy}\left[\bl, \bl{m'}\right] = \rho$ for all $m, m'$.} +\begin{align*} +\implies +\var_{xy}\left[\fM\right] &= \tfrac{1}{M} \sigma^2 + \tfrac{M-1}{M} \rho \sigma^2 \qquad\left(... = \Exy\left[(\fM)^2\right]\right)\\ + \Exy\left[\ambifM\right] &= \tfrac{1}{M}\sum^M_{m} \Exy\left[\left(\bl- \fM\right)^2\right]\\ + & = \tfrac{1}{M}\left(M \Exy\left[(\bl)^2\right] + M \Exy\left[(\fM)^2\right] - + 2 M \Exy\left[\bl\fM\right]\right) \\ + & = \sigma^2 + \Exy\left[(\fM)^2\right] - 2 \tfrac{1}{M}\sum^M_{m'} \underbrace{\Exy\left[\bl \bl{m'} \right]}_{\mathclap{\qquad\qquad\qquad\qquad= \cov_{xy}\left[\bl, \bl{m'} \right] + \Exy\left[\bl\right]\Exy\left[\bl{m'}\right]}} \\ + &= \sigma^2 + \left(\tfrac{1}{M} \sigma^2 + \tfrac{M-1}{M} \rho \sigma^2\right) - 2\left(\tfrac{M-1}{M} \rho\sigma^2 + \tfrac{1}{M}\sigma^2 + 0 \cdot 0 \right)\\ + &= \tfrac{M-1}{M} \sigma^2 (1-\rho) +\end{align*} +\end{scriptsize} + +\begin{small} +\begin{align*} +\Exy\left[L\left(y, \fM\right)\right] &= \textcolor{blue}{\tfrac{1}{M}\sum^M_{m} \Exy\left[L\left(y, \bl \right)\right]} - \Exy\left[\ambifM\right]\\ +\Exy\left[\ambifM\right] &\cong +\textcolor{purple}{\frac{M-1}{M}} \textcolor{cyan}{\var_{xy}\left[\bl\right]} \textcolor{violet}{\left(1 - \corr_{xy}\left[\bl, \bl{m'}\right]\right)} +\end{align*} +\end{small} +\begin{itemize} +\item[$\Rightarrow$] \textcolor{blue}{\textbf{better base learners}} are better {\small (... duh)} +\item[$\Rightarrow$] \textcolor{purple}{\textbf{more base learners}} are better {\small (theoretically, at least...)}\\ +\item[$\Rightarrow$] \textcolor{cyan}{\textbf{more variable base learners}} are better {\small(as long as their risk stays the same, of course!)} +\item[$\Rightarrow$] \textcolor{violet}{\textbf{less correlation between base learners}} is better:\\ bagging helps more if base learners are wrong in different ways so that their errors \enquote{cancel} each other out.\\ +\end{itemize} + + +\end{vbframe} + +\begin{vbframe}{Bagging Summary} + + \begin{itemize} + \item Basic idea: fit the same model repeatedly on many \textbf{bootstrap} replications of the training data set and \textbf{aggregate} the results + \item Gains in performance by reducing variance of predictions, but (slightly) increases the bias: it reuses training data many times, so small mistakes can get amplified.\\ Bagging is thus a \textbf{form of regularization} + \item Works best for unstable/high-variance BLs, where small changes in training set can cause large changes in predictions:\\ + e.g., CART, neural networks, step-wise/forward/backward variable selection for regression\\ + \item Works best if BL predictions are only weakly correlated: they don't all make the same mistakes. 
+ \item Can degrade performance for stable methods like $k$-NN, LDA, Naive Bayes, linear regression + \end{itemize} +\end{vbframe} + + + +\endlecture +\end{document} diff --git a/slides/regularization/slides-regu-bias-variance.tex b/slides/regularization/slides-regu-bias-variance.tex new file mode 100644 index 00000000..f220ae91 --- /dev/null +++ b/slides/regularization/slides-regu-bias-variance.tex @@ -0,0 +1,45 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} + +\newcommand{\titlefigure}{figure_man/biasvariance_scheme.png} +\newcommand{\learninggoals}{ + \item Understand why overfitting happens + \item Know how overfitting can be avoided + \item Know regularized empirical risk minimization +} + +\title{Introduction to Machine Learning} +\date{} + +\begin{document} + +\lecturechapter{Bias-Variance} +\lecture{Introduction to Machine Learning} + +%\section{Motivation for Regularization} + +\begin{vbframe}{Bias-Variance} + +In this slide set, we will visualize the bias-variance trade-off. \\ +\lz + +First, we start with the DGP. Assume the true function $$f: [0, 1] \rightarrow \mathbb{R}, x\mapsto +\I_{\{x \geq 0.3\}}(x) - \I_{\{x \geq 0.6\}}(x).$$ + +Let the feature $x$ be uniformly distributed on $[0, 1]$. + +\framebreak + +\center +\vspace*{0.5cm} +\includegraphics[width=0.6\textwidth]{figure_man/biasvariance_scheme.png} \\ +\footnotesize{Hastie, The Elements of Statistical Learning, 2009 (p. 225)} + + +\end{vbframe} + + + +\endlecture +\end{document} diff --git a/slides/regularization/slides-regu-intro.tex b/slides/regularization/slides-regu-intro.tex index 729e856c..41d015a5 100644 --- a/slides/regularization/slides-regu-intro.tex +++ b/slides/regularization/slides-regu-intro.tex @@ -33,8 +33,8 @@ \end{vbframe} -\begin{vbframe}{Regularization for Invariance/Symmetry} -Prior knowledge often requires that model predictions should be invariant under certain input transformations.\\ +\begin{vbframe}{Regularization for Invariance} +Prior knowledge can also take the form that predictions should remain invariant under certain input transformations.\\ In image classification, label ``cat'' should hold regardless of position or size of relevant object (translation/scale invariance) \begin{enumerate}\setlength\itemsep{1.02em} \item \textbf{Pre-processing}: By computing invariant features under transformations, downstream models too will respect invariances @@ -49,8 +49,31 @@ \end{vbframe} +\begin{vbframe}{Recap: Overfitting} -\begin{vbframe}{Example: Overfitting} +Reducing overfitting is an important application of regularization, so let's first motivate it from that perspective. + +\begin{itemize} + \item Overfitting occurs when the model reflects noise or artifacts in training data which do not generalize (small train error at the cost of high test error) + \item Hence, predictions of overfitted models cannot be trusted to generalize beyond the training data +\end{itemize} +\lz +\begin{columns} +\begin{column}{0.5\textwidth} + \raggedright + Overfitted model\\ + \includegraphics[width=0.85\textwidth]{figure/eval_ofit_1o} +\end{column} +\begin{column}{0.5\textwidth} + \raggedright + Appropriate model\\ + \includegraphics[width=0.85\textwidth]{figure/eval_ofit_1a} +\end{column} +\end{columns} + +\end{vbframe} + +\begin{vbframe}{Example I: Overfitting} \begin{itemize} \item Assume we want to predict the daily maximum \textbf{ozone level} in LA given a data set containing $50$ observations. 
@@ -88,10 +111,10 @@ \end{vbframe} -\begin{vbframe}{Example: Overfitting} - -We train an overparameterized neural network and a support vector machine with RBF kernel on the Boston housing dataset. No form of regularization is imposed on the models. The target variable is house price. +\begin{vbframe}{Example II: Overfitting} +We train a shallow neural network with one hidden layer and 100 hidden units as well as an SVM with RBF kernel on a small regression task. No form of explicit regularization is imposed on the models. %The target variable is house price. +\vspace{0.2cm} \begin{table}[ht] \centering \begin{tabular}{rrr} @@ -103,14 +126,18 @@ \hline \end{tabular} \end{table} - -Both neural network and support vector machine perform significantly better on the training set. +\vspace{0.3cm} +\begin{itemize} + \item Both the neural network and the SVM perform significantly better on the training set + \item The shallow NN even achieves zero training error (interpolating) + \item Test error is significantly higher for both models, indicating overfitting +\end{itemize} \end{vbframe} \begin{vbframe}{Avoid Overfitting} -Why can \textbf{overfitting} happen? And how to avoid it? +Why can \textbf{overfitting} happen in practice? And how to avoid it? \lz \lz \begin{enumerate} @@ -138,7 +165,7 @@ \includegraphics[width=0.7\textwidth]{figure/avoid_overfitting_01.png}\\ \end{figure} -Good idea, but often not feasible in practice. +Good insight, but getting more data is often not feasible in practice. \framebreak diff --git a/slides/regularization/slides-regu-l0.tex b/slides/regularization/slides-regu-l0.tex index 812a628a..ca96dcfd 100644 --- a/slides/regularization/slides-regu-l0.tex +++ b/slides/regularization/slides-regu-l0.tex @@ -5,8 +5,8 @@ \newcommand{\titlefigure}{figure_man/lasso_ridge_hat.png} \newcommand{\learninggoals}{ - \item Know LQ norm regularization - \item Understand that L0 norm realization simply counts the number of non-zero parameters + \item Know Lq (quasi-)norm regularization + \item Understand that L0 regularization simply counts the number of non-zero parameters } \title{Introduction to Machine Learning} diff --git a/slides/regularization/slides-regu-l1l2.tex b/slides/regularization/slides-regu-l1l2.tex index 4581c8c8..76ee3881 100644 --- a/slides/regularization/slides-regu-l1l2.tex +++ b/slides/regularization/slides-regu-l1l2.tex @@ -24,7 +24,7 @@ \begin{itemize} \setlength{\itemsep}{1.3em} \item Linear models can also overfit if we operate in a high-dimensional space with not that many observations. \item The OLS estimator requires a full-rank design matrix. - \item For highly correlated features, the least-squares estimate becomes highly sensitive to random errors in the observed response, producing a large variance in the fit. + \item For highly correlated features, OLS becomes very sensitive to random errors in the observed response, producing a large variance in the fit. \item We now add a complexity penalty to the loss: $$ \riskrt = \sumin \left(\yi - \thetab^\top \xi \right)^2 + \lambda \cdot J(\thetab). 
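For the penalized risk shown above with the ridge penalty $J(\thetab) = \|\thetab\|_2^2$, the minimizer has the closed form $(X^\top X + \lambda I)^{-1} X^\top y$. A minimal R sketch that cross-checks this closed form against a generic numerical minimizer of the penalized empirical risk; simulated data, no intercept, illustration only, not part of the slide sources or the repository:

set.seed(1)
n = 50; p = 5; lambda = 2
X = matrix(rnorm(n * p), n, p)
y = X %*% c(2, -1, 0.5, 0, 0) + rnorm(n, sd = 0.5)

# Closed-form ridge solution: (X'X + lambda I)^{-1} X'y
theta_ridge = solve(crossprod(X) + lambda * diag(p), crossprod(X, y))

# Direct minimization of the penalized empirical risk
pen_risk = function(theta) sum((y - X %*% theta)^2) + lambda * sum(theta^2)
theta_opt = optim(rep(0, p), pen_risk, method = "BFGS")$par

max(abs(theta_ridge - theta_opt))  # close to 0: both minimize the same objective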
diff --git a/slides/regularization/slides-regu-ridge-deepdive.tex b/slides/regularization/slides-regu-ridge-deepdive.tex index 15548ef8..bda35cae 100644 --- a/slides/regularization/slides-regu-ridge-deepdive.tex +++ b/slides/regularization/slides-regu-ridge-deepdive.tex @@ -7,7 +7,7 @@ \newcommand{\learninggoals}{ \item Know interpretation of $L2$ regularization as row-augmentation \item Know interpretation of $L2$ regularization as minimizing risk under feature noise - %\item Derivation of the bias-variance tradeoff for Ridge regression + \item Bias-variance tradeoff for Ridge regression } \title{Introduction to Machine Learning}
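On the row-augmentation interpretation named in the ridge deep-dive's learning goals: ridge regression on $(X, y)$ coincides with OLS on data augmented by $\sqrt{\lambda}\, I_p$ pseudo-observations with response $0$. A minimal R sketch of this equivalence; simulated data, no intercept, illustration only, not part of the slide sources or the repository:

set.seed(1)
n = 50; p = 5; lambda = 2
X = matrix(rnorm(n * p), n, p)
y = X %*% c(2, -1, 0.5, 0, 0) + rnorm(n, sd = 0.5)

# Augment X with sqrt(lambda) * I_p rows and y with p zero responses
X_aug = rbind(X, sqrt(lambda) * diag(p))
y_aug = c(y, rep(0, p))

theta_ols_aug = solve(crossprod(X_aug), crossprod(X_aug, y_aug))          # OLS on augmented data
theta_ridge   = solve(crossprod(X) + lambda * diag(p), crossprod(X, y))   # ridge on original data
max(abs(theta_ols_aug - theta_ridge))  # numerically zero: same estimator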