Commit 5f64fab (parent e9049d6): 12 changed files with 320 additions and 18 deletions.
library(data.table)
library(mlr3misc)
library(mvtnorm)
library(mlr3)
library(mlr3learners)
library(mlr3viz)
library(ggplot2)
library(gridExtra)

set.seed(600000)
n = 100000

# simulate a 2-class Gaussian mixture in 2d; class 2 has a much larger variance in V1
mu1 = c(0, 3)
mu2 = c(3, 0)
s1 = matrix(c(1, 0.1, 0.1, 2), 2, 2, byrow = TRUE)
s2 = matrix(c(30, 0.3, 0.3, 1), 2, 2, byrow = TRUE)
d1 = as.data.table(rmvnorm(n = n / 2, mean = mu1, sigma = s1))
d1$class = 1
d2 = as.data.table(rmvnorm(n = n / 2, mean = mu2, sigma = s2))
d2$class = 2
dd = rbind(d1, d2)
dd$class = as.factor(dd$class)
oo = sample(n)
dd = dd[oo, ]
task = TaskClassif$new("2dgauss", dd, target = "class")

# small training set, very large test set to approximate the generalization error
trainsize = 200
trainset = 1:trainsize
testset = (trainsize + 1):n

# three learners: QDA matches the quadratic Bayes boundary, logistic regression
# underfits (linear boundary), the RBF-SVM with a huge gamma overfits
l1 = lrn("classif.qda", predict_type = "prob")
l2 = lrn("classif.log_reg", predict_type = "prob")
l3 = lrn("classif.svm", type = "C-classification", predict_type = "prob",
  kernel = "radial", gamma = 99, cost = 1)

# approximate the true decision boundary: fit QDA on all data and keep the
# grid points whose predicted probability is close to 0.5
l1$train(task)
r1 = range(dd[trainset, ]$V1)
r2 = range(dd[trainset, ]$V2)
r1seq = seq(r1[1], r1[2], length.out = 200)
r2seq = seq(r2[1], r2[2], length.out = 200)
d_grid = expand.grid(V1 = r1seq, V2 = r2seq)
pred_true = as.data.table(l1$predict_newdata(d_grid))
d_grid$prob = pred_true$prob.1
true_decb = d_grid[d_grid$prob > 0.47 & d_grid$prob < 0.53, ]

# train a learner on the training set, compute train/test error, and plot its
# prediction surface together with the approximate true decision boundary
make_plot = function(ll, file_postfix) {
  ll$train(task, row_ids = trainset)
  pred_train = ll$predict(task, row_ids = trainset)
  trainerr = pred_train$score(msr("classif.ce"))
  pred_test = ll$predict(task, row_ids = testset)
  testerr = pred_test$score(msr("classif.ce"))
  fname = sprintf("../figure/eval_ofit_1%s.pdf", file_postfix)
  # clone before filtering; filtering the shared task in place would drop the
  # test rows and break subsequent calls of make_plot()
  task_train = task$clone()$filter(rows = trainset)
  pl = plot_learner_prediction(ll, task_train) + guides(shape = FALSE, alpha = FALSE)
  pl = pl + ggtitle(sprintf("TrainErr=%.2f; TestErr=%.2f", trainerr, testerr))
  pl = pl + geom_point(data = true_decb, alpha = 0.5, size = 0.2)
  ggsave(plot = pl, filename = fname, width = 8, height = 6)
  return(pl)
}

p1 = make_plot(l1, file_postfix = "a")
p2 = make_plot(l2, file_postfix = "u")
p3 = make_plot(l3, file_postfix = "o")

# grid.arrange(p1, p2, p3)
# print(p2)
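
# Alternative sketch: the same test error can also be obtained via mlr3's
# resampling API with a custom train/test split, e.g. for the overfitting SVM l3:
res_custom = rsmp("custom")
res_custom$instantiate(task, train_sets = list(trainset), test_sets = list(testset))
rr = resample(task, l3, res_custom)
rr$aggregate(msr("classif.ce"))  # should match testerr for l3 in make_plot()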
\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer}
\input{../../style/preamble}
\input{../../latex-math/basic-math}
\input{../../latex-math/basic-ml}
\input{../../latex-math/ml-ensembles.tex}

\newcommand{\titlefigure}{figure_man/bagging.pdf}
\newcommand{\learninggoals}{
\item Understand that bagging can be seen as a form of regularization
\item Know which factors influence the effectiveness of bagging
}

\title{Introduction to Machine Learning}
\date{}

\begin{document}

\lecturechapter{Bagging as Regularization (Deep-Dive)}
\lecture{Introduction to Machine Learning}

\begin{vbframe}{Recap: What is bagging?}

\begin{itemize}
\item Bagging is short for \textbf{B}ootstrap \textbf{Agg}regation.
\item It is an \textbf{ensemble method}, i.e., it combines many models into one
big \enquote{meta-model}. Ensembles often work much better than their members alone would.
\item The components of an ensemble are called \textbf{base learners} (BLs).
% that improves instable / high variance learners by variance smoothing
\item In a \textbf{bagging} ensemble, all base learners are of the same type; the only difference between them is the data they are trained on.
\end{itemize}

\begin{center}
% FIGURE SOURCE: https://docs.google.com/presentation/d/1xodP6ayu1Gay6mMKgzVWYEFmSoeG5kNuqsaTkFFmd78/edit
\includegraphics[width=0.45\textwidth]{figure_man/bagging.pdf}
\end{center}

\framebreak
Specifically, we train base learners $\bl, m = 1, \dots, M$ on $M$ \textbf{bootstrap} samples of the training data $\D$:
\begin{itemize} \setlength{\itemsep}{1.0em}
\item Draw $n$ observations from $\D$ with replacement, $M$ times
\item Fit the base learner on each of the $M$ bootstrap samples to get models $\fh(x) = \blh, m = 1, \dots, M$
\item Aggregate the predictions of the $M$ fitted base learners into the ensemble model $\fMh$ via \textbf{averaging} (regression) or \textbf{majority voting} (classification)
%\item Posterior probs $\pikxh$ can be estimated as predicted class frequencies over the ensemble
\end{itemize}
\vspace{0.1cm}
{\small Bagging helps because the variability of a prediction averaged over many base learners is smaller than the variability of the prediction of a single such model. If the error of a BL is mostly due to (random) variability and not to structural reasons, bagging helps to reduce this variability.}

%\begin{center}
% FIGURE SOURCE: No source
%\includegraphics[width=0.6\textwidth]{figure_man/rf_majvot_averaging.png}
%\end{center}
%\end{vbframe}

% \begin{algorithm}[H]
% \small
% \setstretch{1.15}
% \caption*{Bagging algorithm}
% \begin{algorithmic}[1]
% \State {\bf Input: } Dataset $\D$, base learner, number of bootstraps $M$
% \For {$m = 1 \to M$}
% \State Draw a bootstrap sample $\D^{[m]}$ from $\D$.
% \State Train base learner on $\D^{[m]}$ to obtain model $\bl$
% \EndFor
% \State Aggregate the predictions of the $M$ estimators (via averaging or majority voting), to determine the bagging estimator:
% \begin{align*}
% \fM &= \frac{1}{M} \sum_{m=1}^M \bl \\
% \text{or}\quad \fM &= \argmax_{k \in \Yspace} \sum_{m=1}^M \I\left(\bl = k\right)
% \end{align*}
% \end{algorithmic}
% \end{algorithm}

\end{vbframe}
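
% Illustrative R sketch of the bagging procedure above (kept as LaTeX comments;
% "train_data" / "test_data" are placeholder data frames with a factor target "class"):
% library(rpart)
% M = 100
% n = nrow(train_data)
% models = lapply(seq_len(M), function(m) {
%   idx = sample(n, size = n, replace = TRUE)        # bootstrap sample
%   rpart(class ~ ., data = train_data[idx, ])       # fit base learner
% })
% # collect predictions of all M base learners on the test data
% preds = sapply(models, function(mod)
%   as.character(predict(mod, newdata = test_data, type = "class")))
% # aggregate by majority vote, one vote per base learner and observation
% pred_bagged = apply(preds, 1, function(p) names(which.max(table(p))))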

\begin{vbframe}{Why/when does Bagging help?}

%In one sentence:\\
%\lz

%Because the variability of the average of the predictions of many base learner models is smaller than the variability of the predictions from one such base learner model.\\

%If the error of a base learner model is mostly due to (random) variability and not due to structural reasons, combining many such base learners by bagging helps reducing this variability.
%\framebreak
\small Assume we use the quadratic loss and measure the instability of the ensemble with
\begin{scriptsize}
$\ambifM = \tfrac{1}{M}\sum^M_{m} \left(\bl - \fM \right)^2$:
\vskip -2em
\begin{align*}
\ambifM &= \tfrac{1}{M}\sum^M_{m} \left(\bl - \fM\right)^2 \\
&= \tfrac{1}{M}\sum^M_{m} \left(\left(\bl - y\right) + \left(y - \fM\right)\right)^2\\
&= \tfrac{1}{M}\sum^M_{m} L(y, \bl) + L(y, \fM) \underbrace{- 2 \left(y - \tfrac{1}{M}\sum^M_{m=1}\bl\right)\left(y - \fM\right)}_{- 2 L\left(y, \fM\right)} \\
&= \tfrac{1}{M}\sum^M_{m} L(y, \bl) - L\left(y, \fM\right) \\[-.5em]
\end{align*}
\end{scriptsize}
\vspace{-0.3cm}
So, taking the expected value over the data's distribution:
$${\scriptsize \Exy\left[L\left(y, \fM\right)\right] = \tfrac{1}{M}\sum^M_{m} \Exy\left[L\left(y, \bl \right)\right] - \Exy\left[\ambifM\right]}$$
$\Rightarrow$ The expected loss of the ensemble is lower than the average loss of a single BL, exactly by the expected instability (ambiguity) of the ensemble's BLs. The more accurate and diverse the BLs, the better.
\normalsize
\framebreak
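{\small \textbf{Quick sanity check} with made-up numbers: let $y = 3$, $M = 2$, and let the two base learners predict $2$ and $4$, so the ensemble predicts $\tfrac{2 + 4}{2} = 3$. The average base learner loss is $\tfrac{1}{2}\left((2-3)^2 + (4-3)^2\right) = 1$, the ensemble loss is $(3-3)^2 = 0$, and the ambiguity is $\tfrac{1}{2}\left((2-3)^2 + (4-3)^2\right) = 1$, so indeed the ensemble loss equals the average loss minus the ambiguity. Because the two base learners err in opposite directions, the ensemble error vanishes completely.}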
\end{vbframe}

\begin{vbframe}{Determinants of Bagging Effectiveness}

{\small How can we make $\Exy\left[\ambifM\right]$ as large as possible?}
\begin{scriptsize}

%\shortintertext{How to make $\Exy\left[\ambifM\right]$ as large as possible?}
$$\Exy\left[L\left(y, \fM \right)\right] = \tfrac{1}{M}\sum^M_{m} \Exy\left[L\left(y, \bl \right)\right] - \Exy\left[\ambifM\right]$$ \\
{\small For simplicity, assume $\Exy\left[\bl\right] = 0$ and $\var_{xy}\left[\bl\right] = \Exy\left[(\bl)^2\right] = \sigma^2$ for all $m$, and $\corr_{xy}\left[\bl, \bl{m'}\right] = \rho$ for all $m \neq m'$.}
\begin{align*}
\implies
\var_{xy}\left[\fM\right] &= \tfrac{1}{M} \sigma^2 + \tfrac{M-1}{M} \rho \sigma^2 \qquad\left(... = \Exy\left[(\fM)^2\right]\right)\\
\Exy\left[\ambifM\right] &= \tfrac{1}{M}\sum^M_{m} \Exy\left[\left(\bl - \fM\right)^2\right]\\
& = \tfrac{1}{M}\left(M \Exy\left[(\bl)^2\right] + M \Exy\left[(\fM)^2\right] -
2 M \Exy\left[\bl\fM\right]\right) \\
& = \sigma^2 + \Exy\left[(\fM)^2\right] - 2 \tfrac{1}{M}\sum^M_{m'} \underbrace{\Exy\left[\bl \bl{m'} \right]}_{\mathclap{\qquad\qquad\qquad\qquad= \cov_{xy}\left[\bl, \bl{m'} \right] + \Exy\left[\bl\right]\Exy\left[\bl{m'}\right]}} \\
&= \sigma^2 + \left(\tfrac{1}{M} \sigma^2 + \tfrac{M-1}{M} \rho \sigma^2\right) - 2\left(\tfrac{M-1}{M} \rho\sigma^2 + \tfrac{1}{M}\sigma^2 + 0 \cdot 0 \right)\\
&= \tfrac{M-1}{M} \sigma^2 (1-\rho)
\end{align*}
\end{scriptsize}

\begin{small}
\begin{align*}
\Exy\left[L\left(y, \fM\right)\right] &= \textcolor{blue}{\tfrac{1}{M}\sum^M_{m} \Exy\left[L\left(y, \bl \right)\right]} - \Exy\left[\ambifM\right]\\
\Exy\left[\ambifM\right] &\cong
\textcolor{purple}{\frac{M-1}{M}} \textcolor{cyan}{\var_{xy}\left[\bl\right]} \textcolor{violet}{\left(1 - \corr_{xy}\left[\bl, \bl{m'}\right]\right)}
\end{align*}
\end{small}
\begin{itemize}
\item[$\Rightarrow$] \textcolor{blue}{\textbf{better base learners}} are better {\small (... duh)}
\item[$\Rightarrow$] \textcolor{purple}{\textbf{more base learners}} are better {\small (theoretically, at least...)}
\item[$\Rightarrow$] \textcolor{cyan}{\textbf{more variable base learners}} are better {\small (as long as their risk stays the same, of course!)}
\item[$\Rightarrow$] \textcolor{violet}{\textbf{less correlation between base learners}} is better: bagging helps more if the base learners are wrong in different ways, so that their errors \enquote{cancel} each other out (see the limiting case below).
\end{itemize}
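{\small Plugging into the formulas above, in the limit $M \to \infty$ we get
$$\Exy\left[\ambifM\right] \to \sigma^2 (1 - \rho) \qquad \text{and} \qquad \var_{xy}\left[\fM\right] \to \rho \sigma^2,$$
so even an arbitrarily large ensemble cannot push its variance below $\rho \sigma^2$; beyond that point, only decorrelating the base learners helps.}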

\end{vbframe}

\begin{vbframe}{Bagging Summary}

\begin{itemize}
\item Basic idea: fit the same model repeatedly on many \textbf{bootstrap} replications of the training data set and \textbf{aggregate} the results
\item Gains in performance by reducing the variance of predictions, but (slightly) increases the bias: each base learner is fit on a bootstrap sample containing only about 63\% of the unique training observations, so it effectively sees less data.\\ Bagging is thus a \textbf{form of regularization}
\item Works best for unstable/high-variance BLs, where small changes in the training set can cause large changes in predictions:\\
e.g., CART, neural networks, step-wise/forward/backward variable selection for regression
\item Works best if BL predictions are only weakly correlated, i.e., they do not all make the same mistakes
\item Can degrade performance for stable methods like $k$-NN, LDA, Naive Bayes, and linear regression
\end{itemize}
\end{vbframe}

\endlecture
\end{document}
\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer}
\input{../../style/preamble}
\input{../../latex-math/basic-math}
\input{../../latex-math/basic-ml}

\newcommand{\titlefigure}{figure_man/biasvariance_scheme.png}
\newcommand{\learninggoals}{
\item Understand why overfitting happens
\item Know how overfitting can be avoided
\item Know regularized empirical risk minimization
}

\title{Introduction to Machine Learning}
\date{}

\begin{document}

\lecturechapter{Bias variance}
\lecture{Introduction to Machine Learning}

%\section{Motivation for Regularization}

\begin{vbframe}{Bias variance}

In this slide set, we will visualize the bias-variance trade-off. \\
\lz

We start with the data generating process (DGP). Assume the true function $$f: [0, 1] \rightarrow \mathbb{R}, \quad x \mapsto \I_{\{x \geq 0.3\}}(x) - \I_{\{x \geq 0.6\}}(x).$$

Let the feature $x$ be uniformly distributed on $[0, 1]$.
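
Written out, $f$ is the step function
$$f(x) = \begin{cases} 1 & \text{if } 0.3 \leq x < 0.6, \\ 0 & \text{otherwise}, \end{cases}$$
i.e., a single rectangular bump on $[0.3, 0.6)$.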

\framebreak

\center
\vspace*{0.5cm}
\includegraphics[width=0.6\textwidth]{figure_man/biasvariance_scheme.png} \\
\footnotesize{Hastie, The Elements of Statistical Learning, 2009 (p. 225)}

\end{vbframe}

\endlecture
\end{document}