regularization updates
ludwigbothmann committed Jan 16, 2024
1 parent e9049d6 commit 5f64fab
Showing 12 changed files with 320 additions and 18 deletions.
15 changes: 11 additions & 4 deletions slides/regularization/chapter-order.tex
@@ -32,17 +32,24 @@ \subsection{L0 Regularization}
\subsection{Nonlinear and Bayes}
\includepdf[pages=-]{../slides-pdf/slides-regu-nonlin-bayes.pdf}

\subsection{Geometric Analysis of L2-Regularization and Weight Decay}
\subsection{Geometric Analysis of L2 Regularization and Weight Decay}
\includepdf[pages=-]{../slides-pdf/slides-regu-geom-l2-wdecay.pdf}

\subsection{Geometric Analysis of L1-regularization}
\subsection{Geometric Analysis of L1 Regularization}
\includepdf[pages=-]{../slides-pdf/slides-regu-geom-l1.pdf}

\subsection{Early Stopping}
\includepdf[pages=-]{../slides-pdf/slides-regu-early-stopping.pdf}

\subsection{Details on Ridge Regression (Deep-Dive)}
\includepdf[pages=-]{../slides-pdf/slides-regu-ridge-deepdive.pdf}

\subsection{Soft-Thresholding and L1 Regularization (Deep-Dive)}
\includepdf[pages=-]{../slides-pdf/slides-regu-lasso-deepdive.pdf}

\subsection{Early Stopping}
\includepdf[pages=-]{../slides-pdf/slides-regu-early-stopping.pdf}
\subsection{Bagging as a Regularization Method (Deep-Dive)}
\includepdf[pages=-]{../slides-pdf/slides-regu-bagging-deepdive.pdf}




Binary file added slides/regularization/figure/eval_ofit_1a.pdf
Binary file not shown.
Binary file added slides/regularization/figure/eval_ofit_1o.pdf
Binary file not shown.
Binary file added slides/regularization/figure_man/bagging.pdf
Binary file not shown.
68 changes: 68 additions & 0 deletions slides/regularization/rsrc/fig-eval_ofit_1.R
@@ -0,0 +1,68 @@

# Simulate a two-class Gaussian mixture and visualize the decision boundaries of
# an appropriate (QDA), a linear (logistic regression), and a heavily
# overfitting (RBF-SVM with very large gamma) classifier.
library(data.table)
library(mlr3misc)
library(mvtnorm)
library(mlr3)
library(mlr3learners)
library(mlr3viz)
library(ggplot2)
library(gridExtra)

set.seed(600000)
n = 100000

# class means and covariances: class 2 is much more spread out along V1
mu1 = c(0, 3)
mu2 = c(3, 0)
s1 = matrix(c(1, 0.1, 0.1, 2), 2, 2, byrow = TRUE)
s2 = matrix(c(30, 0.3, 0.3, 1), 2, 2, byrow = TRUE)
d1 = as.data.table(rmvnorm(n = n/2, mean = mu1, sigma = s1))
d1$class = 1
d2 = as.data.table(rmvnorm(n = n/2, mean = mu2, sigma = s2))
d2$class = 2
dd = rbind(d1, d2)
dd$class = as.factor(dd$class)
oo = sample(n)
dd = dd[oo,]
task = TaskClassif$new("2dgauss", dd, target = "class")

trainsize = 200
trainset = 1:trainsize
testset = (trainsize+1):n

# three learners: QDA (well-specified), logistic regression (linear boundary),
# and an RBF-SVM with a deliberately huge gamma so that it overfits
l1 = lrn("classif.qda", predict_type = "prob")
l2 = lrn("classif.log_reg", predict_type = "prob")
l3 = lrn("classif.svm", type = "C-classification", predict_type = "prob",
  kernel = "radial", gamma = 99, cost = 1)

# approximate the true decision boundary: train QDA on the full data and keep
# grid points where the predicted probability is close to 0.5
l1$train(task)
r1 = range(dd[trainset, ]$V1)
r2 = range(dd[trainset, ]$V2)
r1seq = seq(r1[1], r1[2], length.out = 200)
r2seq = seq(r2[1], r2[2], length.out = 200)
d_grid = expand.grid(V1 = r1seq, V2 = r2seq)
pred_true = as.data.table(l1$predict_newdata(d_grid))
d_grid$prob = pred_true$prob.1
true_decb = d_grid[d_grid$prob > 0.47 & d_grid$prob < 0.53, ]


make_plot = function(ll, file_postfix) {
  ll$train(task, row_ids = trainset)
  pred_train = ll$predict(task, row_ids = trainset)
  trainerr = pred_train$score(msr("classif.ce"))
  pred_test = ll$predict(task, row_ids = testset)
  testerr = pred_test$score(msr("classif.ce"))
  fname = sprintf("../figure/eval_ofit_1%s.pdf", file_postfix)
  # clone before filtering: Task$filter() modifies the task in place, and the
  # shared task (including the test rows) is still needed for the other learners
  task_train = task$clone()$filter(rows = trainset)
  pl = plot_learner_prediction(ll, task_train) + guides(shape = "none", alpha = "none")
  pl = pl + ggtitle(sprintf("TrainErr=%.2f; TestErr=%.2f", trainerr, testerr))
  pl = pl + geom_point(data = true_decb, alpha = 0.5, size = 0.2)
  ggsave(plot = pl, filename = fname, width = 8, height = 6)
  return(pl)
}

p1 = make_plot(l1, file_postfix = "a")
p2 = make_plot(l2, file_postfix = "u")
p3 = make_plot(l3, file_postfix = "o")

#grid.arrange(p1, p2, p3)
#print(p2)

155 changes: 155 additions & 0 deletions slides/regularization/slides-regu-bagging-deepdive.tex
@@ -0,0 +1,155 @@
\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer}
\input{../../style/preamble}
\input{../../latex-math/basic-math}
\input{../../latex-math/basic-ml}
\input{../../latex-math/ml-ensembles.tex}

\newcommand{\titlefigure}{figure_man/bagging.pdf}
\newcommand{\learninggoals}{
\item Understand that bagging can be seen as a form of regularization
\item Know which factors influence the effectiveness of bagging
}

\title{Introduction to Machine Learning}
\date{}

\begin{document}

\lecturechapter{Bagging as Regularization (Deep-Dive)}
\lecture{Introduction to Machine Learning}



\begin{vbframe}{Recap: What is bagging?}

\begin{itemize}
\item Bagging is short for \textbf{B}ootstrap \textbf{Agg}regation.
\item It's an \textbf{ensemble method}, i.e., it combines many models into one
big \enquote{meta-model}. Ensembles often work much better than their members alone would.
\item The components of an ensemble are called \textbf{base learners} (BLs).
% that improves instable / high variance learners by variance smoothing
\item In a \textbf{bagging} ensemble, all base learners are of the same type. The only difference between the models is the data they are trained on.\\
\end{itemize}

\begin{center}
% FIGURE SOURCE: https://docs.google.com/presentation/d/1xodP6ayu1Gay6mMKgzVWYEFmSoeG5kNuqsaTkFFmd78/edit
\includegraphics[width=0.45\textwidth]{figure_man/bagging.pdf}
\end{center}

\framebreak
Specifically, we train base learners $\bl, m = 1, \dots, M$ on $M$ \textbf{bootstrap} samples of training data $\D$:
\begin{itemize} \setlength{\itemsep}{1.0em}
\item Draw $n$ observations from $\D$ with replacement
\item Fit the base learner on each of the $M$ bootstrap samples to get models $\fh(x) = \blh, m = 1, \dots, M$
\item Aggregate predictions of the $M$ fitted base learners to get ensemble model $\fMh$ via \textbf{averaging} (regression) or \textbf{majority voting} (classification)
%\item Posterior probs $\pikxh$ can be estimated as predicted class frequencies over the ensemble
\end{itemize}
\vspace{0.1cm}
{\small Bagging helps because the variability of the prediction averaged over many base learners is smaller than the variability of the predictions of any single such model. If the error of a BL is mostly due to (random) variability and not to structural reasons, bagging helps reduce this variability. (A short R sketch of the bagging procedure follows after this frame.)}


%\begin{center}
% FIGURE SOURCE: No source
%\includegraphics[width=0.6\textwidth]{figure_man/rf_majvot_averaging.png}
%\end{center}
%\end{vbframe}

% \begin{algorithm}[H]
% \small
% \setstretch{1.15}
% \caption*{Bagging algorithm}
% \begin{algorithmic}[1]
% \State {\bf Input: } Dataset $\D$, base learner, number of bootstraps $M$
% \For {$m = 1 \to M$}
% \State Draw a bootstrap sample $\D^{[m]}$ from $\D$.
% \State Train base learner on $\D^{[m]}$ to obtain model $\bl$
% \EndFor
% \State Aggregate the predictions of the $M$ estimators (via averaging or majority voting), to determine the bagging estimator:
% \begin{align*}
% \fM &= \frac{1}{M} \sum_{m=1}^M \bl \\
% \text{or}\quad \fM &= \argmax_{k \in \Yspace} \sum_{m=1}^M \I\left(\bl = k\right)
% \end{align*}
% \end{algorithmic}
% \end{algorithm}

\end{vbframe}
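
For concreteness, here is a minimal R sketch of the bagging procedure described above (not part of the slide deck): rpart trees as base learners on a simulated regression task. The dataset, the number of bootstraps M, and the helper name predict_bagged are purely illustrative choices.

# Minimal bagging sketch: bootstrap the training data, fit one tree per
# bootstrap sample, aggregate predictions by averaging (regression case).
library(rpart)

set.seed(1)
n = 300
x = runif(n)
y = as.numeric(x >= 0.3) - as.numeric(x >= 0.6) + rnorm(n, sd = 0.2)
train = data.frame(x = x, y = y)

M = 50
base_learners = lapply(seq_len(M), function(m) {
  idx = sample(n, n, replace = TRUE)        # draw bootstrap sample D^[m]
  rpart(y ~ x, data = train[idx, ])         # fit base learner on D^[m]
})

# ensemble prediction = average of the M base learner predictions
predict_bagged = function(models, newdata) {
  preds = sapply(models, predict, newdata = newdata)
  rowMeans(preds)
}

newx = data.frame(x = seq(0, 1, length.out = 11))
predict_bagged(base_learners, newx)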

\begin{vbframe}{Why/when does Bagging help?}

%In one sentence:\\
%\lz

%Because the variability of the average of the predictions of many base learner models is smaller than the variability of the predictions from one such base learner model.\\

%If the error of a base learner model is mostly due to (random) variability and not due to structural reasons, combining many such base learners by bagging helps reducing this variability.
%\framebreak
\small Assume we use quadratic loss and measure instability of the ensemble with
\begin{scriptsize}
$\ambifM = \tfrac{1}{M}\sum^M_{m} \left(\bl- \fM \right)^2$:
\vskip -2em
\begin{align*}
\ambifM &= \tfrac{1}{M}\sum^M_{m} \left(\bl- \fM\right)^2 \\
&= \tfrac{1}{M}\sum^M_{m} \left(\left(\bl - y\right) + \left(y - \fM\right)\right)^2\\
&= \tfrac{1}{M}\sum^M_{m} L(y, \bl) + L(y, \fM) \underbrace{- 2 \left(y - \tfrac{1}{M}\sum^M_{m=1}\bl\right)\left(y - \fM\right)}_{- 2 L\left(y, \fM\right)} \\[-.5em] \end{align*}
\end{scriptsize}
\vspace{-0.3cm}
So, if we take the expected value over the data's distribution:
$${\scriptsize \Exy\left[L\left(y, \fM\right)\right] = \tfrac{1}{M}\sum^M_{m} \Exy\left[L\left(y, \bl \right)\right] - \Exy\left[\ambifM\right]}$$
$\Rightarrow$ The expected loss of the ensemble is lower than the average loss of a single BL by the amount of instability in the ensemble's BLs. The more accurate and diverse the BLs, the better. (A numeric check of this identity follows after this frame.)
\normalsize
\framebreak
\end{vbframe}
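
As a quick sanity check of the squared-loss decomposition above, the following R snippet verifies per observation that the ensemble loss equals the average base learner loss minus the ambiguity; all numbers are simulated and purely illustrative.

# Check the ambiguity decomposition under squared loss:
# L(y, f^M) = (1/M) * sum_m L(y, bl^m) - ambiguity, per observation.
set.seed(42)
M = 10
y = rnorm(25)                                   # pseudo targets
preds = matrix(rnorm(25 * M), nrow = 25)        # pseudo base learner predictions
f_M = rowMeans(preds)                           # ensemble prediction

mean_bl_loss = rowMeans((preds - y)^2)          # average base learner loss
ambiguity = rowMeans((preds - f_M)^2)           # instability of the ensemble
ens_loss = (y - f_M)^2                          # ensemble loss

all.equal(ens_loss, mean_bl_loss - ambiguity)   # TRUE (up to numeric tolerance)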

\begin{vbframe}{Determinants of Bagging Effectiveness}

{\small How to make $\Exy\left[\ambifM\right]$ as large as possible?}
\begin{scriptsize}

%\shortintertext{How to make $\Exy\left[\ambifM\right]$ as large as possible?}
$$\Exy\left[L\left(y, \fM \right)\right] = \tfrac{1}{M}\sum^M_{m} \Exy\left[L\left(y, \bl \right)\right] - \Exy\left[\ambifM\right]$$ \\
{\small For simplicity, assume $\Exy\left[\bl\right] = 0$, $\var_{xy}\left[\bl\right] = \Exy\left[(\bl)^2\right] = \sigma^2$, $\corr_{xy}\left[\bl, \bl{m'}\right] = \rho$ for all $m, m'$.}
\begin{align*}
\implies
\var_{xy}\left[\fM\right] &= \tfrac{1}{M} \sigma^2 + \tfrac{M-1}{M} \rho \sigma^2 \qquad\left(... = \Exy\left[(\fM)^2\right]\right)\\
\Exy\left[\ambifM\right] &= \tfrac{1}{M}\sum^M_{m} \Exy\left[\left(\bl- \fM\right)^2\right]\\
& = \tfrac{1}{M}\left(M \Exy\left[(\bl)^2\right] + M \Exy\left[(\fM)^2\right] -
2 M \Exy\left[\bl\fM\right]\right) \\
& = \sigma^2 + \Exy\left[(\fM)^2\right] - 2 \tfrac{1}{M}\sum^M_{m'} \underbrace{\Exy\left[\bl \bl{m'} \right]}_{\mathclap{\qquad\qquad\qquad\qquad= \cov_{xy}\left[\bl, \bl{m'} \right] + \Exy\left[\bl\right]\Exy\left[\bl{m'}\right]}} \\
&= \sigma^2 + \left(\tfrac{1}{M} \sigma^2 + \tfrac{M-1}{M} \rho \sigma^2\right) - 2\left(\tfrac{M-1}{M} \rho\sigma^2 + \tfrac{1}{M}\sigma^2 + 0 \cdot 0 \right)\\
&= \tfrac{M-1}{M} \sigma^2 (1-\rho)
\end{align*}
\end{scriptsize}

\begin{small}
\begin{align*}
\Exy\left[L\left(y, \fM\right)\right] &= \textcolor{blue}{\tfrac{1}{M}\sum^M_{m} \Exy\left[L\left(y, \bl \right)\right]} - \Exy\left[\ambifM\right]\\
\Exy\left[\ambifM\right] &\cong
\textcolor{purple}{\frac{M-1}{M}} \textcolor{cyan}{\var_{xy}\left[\bl\right]} \textcolor{violet}{\left(1 - \corr_{xy}\left[\bl, \bl{m'}\right]\right)}
\end{align*}
\end{small}
\begin{itemize}
\item[$\Rightarrow$] \textcolor{blue}{\textbf{better base learners}} are better {\small (... duh)}
\item[$\Rightarrow$] \textcolor{purple}{\textbf{more base learners}} are better {\small (theoretically, at least...)}\\
\item[$\Rightarrow$] \textcolor{cyan}{\textbf{more variable base learners}} are better {\small(as long as their risk stays the same, of course!)}
\item[$\Rightarrow$] \textcolor{violet}{\textbf{less correlation between base learners}} is better:\\ bagging helps more if base learners are wrong in different ways so that their errors \enquote{cancel} each other out.\\
\end{itemize}


\end{vbframe}
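
The closed-form result above is easy to check by simulation. This R sketch draws equicorrelated pseudo base-learner predictions with mvtnorm (already used elsewhere in this repo) and compares the empirical ambiguity with (M-1)/M * sigma^2 * (1 - rho); the chosen M, sigma^2, and rho are arbitrary.

# Simulation check of E[ambiguity] = (M-1)/M * sigma^2 * (1 - rho)
library(mvtnorm)

set.seed(123)
M = 10
sigma2 = 2
rho = 0.6
Sigma = sigma2 * (rho + (1 - rho) * diag(M))    # equicorrelation covariance
preds = rmvnorm(100000, mean = rep(0, M), sigma = Sigma)

f_M = rowMeans(preds)
ambiguity = rowMeans((preds - f_M)^2)

mean(ambiguity)                                  # simulated E[ambiguity]
(M - 1) / M * sigma2 * (1 - rho)                 # theoretical value: 0.72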

\begin{vbframe}{Bagging Summary}

\begin{itemize}
\item Basic idea: fit the same model repeatedly on many \textbf{bootstrap} replications of the training data set and \textbf{aggregate} the results
\item Gains performance by reducing the variance of predictions, but (slightly) increases the bias, since the training data is reused many times and small mistakes can get amplified.\\ Bagging is thus a \textbf{form of regularization}
\item Works best for unstable/high-variance BLs, where small changes in training set can cause large changes in predictions:\\
e.g., CART, neural networks, step-wise/forward/backward variable selection for regression\\
\item Works best if BL predictions are only weakly correlated: they don't all make the same mistakes.
\item Can degrade performance for stable methods like $k$-NN, LDA, Naive Bayes, linear regression
\end{itemize}
\end{vbframe}



\endlecture
\end{document}
45 changes: 45 additions & 0 deletions slides/regularization/slides-regu-bias-variance.tex
@@ -0,0 +1,45 @@
\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer}
\input{../../style/preamble}
\input{../../latex-math/basic-math}
\input{../../latex-math/basic-ml}

\newcommand{\titlefigure}{figure_man/biasvariance_scheme.png}
\newcommand{\learninggoals}{
\item Understand why overfitting happens
\item Know how overfitting can be avoided
\item Know regularized empirical risk minimization
}

\title{Introduction to Machine Learning}
\date{}

\begin{document}

\lecturechapter{Bias-Variance Trade-Off}
\lecture{Introduction to Machine Learning}

%\section{Motivation for Regularization}

\begin{vbframe}{Bias-Variance Trade-Off}

In this slide set, we will visualize the bias-variance trade-off. \\
\lz

First, we start with the data generating process (DGP). Assume the true function is $$f: [0, 1] \rightarrow \mathbb{R}, \quad x \mapsto \I_{\{x \geq 0.3\}}(x) - \I_{\{x \geq 0.6\}}(x).$$

Let the feature $x$ be uniformly distributed on $[0, 1]$. (A short simulation sketch of this DGP follows after this frame.)

\framebreak

\center
\vspace*{0.5cm}
\includegraphics[width=0.6\textwidth]{figure_man/biasvariance_scheme.png} \\
\footnotesize{Hastie, The Elements of Statistical Learning, 2009 (p. 225)}


\end{vbframe}
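
A short R sketch of the DGP described in this frame; since the slide text breaks off, the sample size, noise level, and the polynomial learners used to illustrate the trade-off are illustrative assumptions, not taken from the slides.

# DGP sketch: f(x) = I(x >= 0.3) - I(x >= 0.6), x ~ U[0, 1], plus Gaussian noise
set.seed(1)
n = 100
f = function(x) as.numeric(x >= 0.3) - as.numeric(x >= 0.6)
x = runif(n)
y = f(x) + rnorm(n, sd = 0.1)

# polynomial fits of increasing degree illustrate the bias-variance trade-off:
# low degree = high bias, high degree = high variance
fits = lapply(c(1, 5, 15), function(d) lm(y ~ poly(x, degree = d)))
sapply(fits, function(m) mean(residuals(m)^2))   # training MSE shrinks with degree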



\endlecture
\end{document}
47 changes: 37 additions & 10 deletions slides/regularization/slides-regu-intro.tex
@@ -33,8 +33,8 @@

\end{vbframe}

\begin{vbframe}{Regularization for Invariance/Symmetry}
Prior knowledge often requires that model predictions should be invariant under certain input transformations.\\
\begin{vbframe}{Regularization for Invariance}
Prior knowledge can also be of the form that predictions should remain invariant under certain input transformations.\\
In image classification, the label ``cat'' should hold regardless of the position or size of the relevant object (translation/scale invariance).
\begin{enumerate}\setlength\itemsep{1.02em}
\item \textbf{Pre-processing}: by computing features that are invariant under these transformations, downstream models will respect the invariances as well
@@ -49,8 +49,31 @@

\end{vbframe}

\begin{vbframe}{Recap: Overfitting}

\begin{vbframe}{Example: Overfitting}
Reducing overfitting is an important application of regularization, so let's first motivate it from that perspective.

\begin{itemize}
\item Overfitting occurs when the model reflects noise or artifacts in the training data that do not generalize (small training error at the cost of high test error)
\item Hence, predictions of overfitted models cannot be trusted to generalize beyond the training data
\end{itemize}
\lz
\begin{columns}
\begin{column}{0.5\textwidth}
\raggedright
Overfitted model\\
\includegraphics[width=0.85\textwidth]{figure/eval_ofit_1o}
\end{column}
\begin{column}{0.5\textwidth}
\raggedright
Appropriate model\\
\includegraphics[width=0.85\textwidth]{figure/eval_ofit_1a}
\end{column}
\end{columns}

\end{vbframe}

\begin{vbframe}{Example I: Overfitting}

\begin{itemize}
\item Assume we want to predict the daily maximum \textbf{ozone level} in LA given a data set containing $50$ observations.
@@ -88,10 +111,10 @@

\end{vbframe}

\begin{vbframe}{Example: Overfitting}

We train an overparameterized neural network and a support vector machine with RBF kernel on the Boston housing dataset. No form of regularization is imposed on the models. The target variable is house price.
\begin{vbframe}{Example II: Overfitting}

We train a shallow neural network with one hidden layer and 100 hidden units as well as an SVM with RBF kernel on a small regression task. No form of explicit regularization is imposed on the models. (A rough code sketch of such a comparison follows after this frame.) %The target variable is house price.
\vspace{0.2cm}
\begin{table}[ht]
\centering
\begin{tabular}{rrr}
@@ -103,14 +126,18 @@
\hline
\end{tabular}
\end{table}

Both neural network and support vector machine perform significantly better on the training set.
\vspace{0.3cm}
\begin{itemize}
\item Both the neural network and the SVM perform significantly better on the training set
\item The shallow NN even achieves zero training error (interpolating)
\item Test error is significantly higher for both models, indicating overfitting
\end{itemize}

\end{vbframe}
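
A rough R sketch of the kind of comparison described in this frame: an unregularized single-hidden-layer network (100 units, nnet with decay = 0) versus an RBF-kernel SVM with e1071 defaults. The slides do not name the dataset or the exact settings, so the simulated data and all hyperparameters here are assumptions.

# Unregularized shallow NN vs. RBF-SVM on a small simulated regression task
library(nnet)
library(e1071)

set.seed(1)
n = 150
x = runif(n); y = sin(2 * pi * x) + rnorm(n, sd = 0.3)
train_idx = 1:50
dat = data.frame(x = x, y = y)

nn = nnet(y ~ x, data = dat[train_idx, ], size = 100, linout = TRUE,
          decay = 0, maxit = 2000, MaxNWts = 10000, trace = FALSE)
sv = svm(y ~ x, data = dat[train_idx, ], kernel = "radial")

mse = function(model, idx) mean((dat$y[idx] - predict(model, dat[idx, ]))^2)
c(nn_train = mse(nn, train_idx), nn_test = mse(nn, -train_idx),
  svm_train = mse(sv, train_idx), svm_test = mse(sv, -train_idx))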

\begin{vbframe}{Avoid Overfitting}

Why can \textbf{overfitting} happen? And how to avoid it?
Why can \textbf{overfitting} happen in practice? And how to avoid it?
\lz
\lz
\begin{enumerate}
@@ -138,7 +165,7 @@
\includegraphics[width=0.7\textwidth]{figure/avoid_overfitting_01.png}\\
\end{figure}

Good idea, but often not feasible in practice.
Good insight, but getting more data is often not feasible in practice.

\framebreak

4 changes: 2 additions & 2 deletions slides/regularization/slides-regu-l0.tex
@@ -5,8 +5,8 @@

\newcommand{\titlefigure}{figure_man/lasso_ridge_hat.png}
\newcommand{\learninggoals}{
\item Know LQ norm regularization
\item Understand that L0 norm realization simply counts the number of non-zero parameters
\item Know Lq (quasi-)norm regularization
\item Understand that L0 regularization simply counts the number of non-zero parameters
}

\title{Introduction to Machine Learning}
