diff --git a/slides/regularization/figure_man/bv_true_fun.png b/slides/regularization/figure_man/bv_true_fun.png new file mode 100644 index 00000000..85078c24 Binary files /dev/null and b/slides/regularization/figure_man/bv_true_fun.png differ diff --git a/slides/regularization/figure_man/lq-penalty-plots.png b/slides/regularization/figure_man/lq-penalty-plots.png new file mode 100644 index 00000000..3f04f42c Binary files /dev/null and b/slides/regularization/figure_man/lq-penalty-plots.png differ diff --git a/slides/regularization/rsrc/lq-penalty-plot.py b/slides/regularization/rsrc/lq-penalty-plot.py new file mode 100644 index 00000000..abe96e40 --- /dev/null +++ b/slides/regularization/rsrc/lq-penalty-plot.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue Jan 16 22:32:46 2024 + +@author: chris +""" + +import numpy as np +import matplotlib.pyplot as plt + +# Define the funs +def f1(x): return x**2 +def f2(x): return np.abs(x) +def f3(x): return np.abs(x)**(2/3) +def f4(x): return np.abs(x)**(1/2) +def f0(x): return np.where(np.abs(x)>=0.007, 1, 0) + +x_values = np.linspace(-2, 2, 400) + +fig, axes = plt.subplots(1, 5, figsize=(15, 4)) + +# Plot each function +for i, (func, q) in enumerate(zip([f1, f2, f3, f4, f0], ['q=2', 'q=1', 'q=2/3', 'q=1/2', 'q=0'])): + y_values = func(x_values) + axes[i].plot(x_values, y_values, color=f'C{i}') + axes[i].set_title(q, size=16) + axes[i].set_xlim(-1.2, 1.2) + axes[i].set_ylim(0, 1.1) + axes[i].label_outer() + axes[i].set_xlabel(r'$\theta$', size=20) + axes[i].set_ylabel('Penalty', size=16) + axes[i].tick_params(axis='both', which='major', labelsize=10) + axes[i].grid(False) + axes[i].spines['top'].set_visible(False) + axes[i].spines['right'].set_visible(False) + +plt.tight_layout() + +plt.show() diff --git a/slides/regularization/slides-regu-bias-variance.tex b/slides/regularization/slides-regu-bias-variance.tex index f220ae91..dccbb2c1 100644 --- a/slides/regularization/slides-regu-bias-variance.tex +++ 
b/slides/regularization/slides-regu-bias-variance.tex @@ -25,12 +25,26 @@ In this slide set, we will visualize the bias-variance trade-off. \\ \lz -First, we start with the DGP. Assume the true function $$f: [0, 1] \rightarrow \mathbb{R}, x\mapsto +\I_{\{x \geq 0.3\}}(x) - \I_{\{x \geq 0.6\}}(x).$$ - -Let the feature $x$ be uniformly d +First, we start with the DGP $\mathbb{P}_{xy}$. Assume the true function $$f: [0, 1] \rightarrow \mathbb{R}, x\mapsto +\I_{\{x \geq 0.3\}}(x) - \I_{\{x \geq 0.6\}}(x).$$ +Let the feature $x \sim \mathcal{U}([0, 1])$ and the target $y\vert x \sim \mathcal{N}(f(x), \sigma).$ +\begin{center} +\includegraphics[width=0.5\textwidth]{slides/regularization/figure_man/bv_true_fun.png} +\end{center} \framebreak +Obviously, $f$ is an element of the function family $$\mathcal{H} := \{f: [0, 1] \rightarrow \mathbb{R}\vert\; f \text{ is continuous except for at most 2 jump discontinuities}\}$$ + +To make our following discussion more formal, we introduce two distance functions. 
+\begin{itemize} + \item The distance between two functions $d:\mathcal{H}^2\rightarrow \mathbb{R}_{\geq 0}$ in $\mathcal{H}$ such that $$d(f_1, f_2) = \int_0^1(f_1(x) - f_2(x))^2dx.$$ + \item The distance between a function and $k$ observations $\overline{d}:\mathcal{H}\times([0,1]\times \mathbb{R})^k \rightarrow \mathbb{R}_{\geq 0}$ such that + $$\overline{d}(f, ((x_1, y_1), \dots, (x_k, y_k))) = \frac{1}{k}\sum^k_{i=1}(f(x_i) - y_i)^2. $$ +\end{itemize} + + +\framebreak + \center \vspace*{0.5cm} \includegraphics[width=0.6\textwidth]{figure_man/biasvariance_scheme.png} \\ diff --git a/slides/regularization/slides-regu-l0.tex b/slides/regularization/slides-regu-l0.tex index ca96dcfd..b2599b8b 100644 --- a/slides/regularization/slides-regu-l0.tex +++ b/slides/regularization/slides-regu-l0.tex @@ -17,7 +17,7 @@ \lecturechapter{L0 Regularization} \lecture{Introduction to Machine Learning} -\begin{vbframe}{LQ norm Regularization} +\begin{vbframe}{Lq norm Regularization} Besides $L1$ and $L2$ norm we could use any $Lq$ norm for regularization. @@ -39,11 +39,12 @@ \riskrt = \risket + \lambda \|\thetab\|_0 := \risket + \lambda \sum_j |\theta_j|^0. $$ \item Unlike the $L1$ and $L2$ norms, the $L0$ "norm" simply counts the number of non-zero parameters in the model. + \vspace{0.3cm} \begin{figure} \centering - \scalebox{0.8}{\includegraphics{figure_man/l0_norm.png}} - \tiny{\\ Credit: Christos Louizos} - \caption{\footnotesize $Lp$ norm penalties for a parameter $\thetab$ according to different values of $p$.} + \scalebox{0.99}{\includegraphics{figure_man/lq-penalty-plots.png}} + %\tiny{\\ Credit: Christos Louizos} + \caption{\footnotesize $Lq$ (quasi-)norm penalties for a scalar parameter $\thetab$ for different values of $q$.} \end{figure} \item For any parameter $\thetab$, the $L0$ penalty is zero for $\thetab = 0$ (defining $0^0 := 0$) and is constant for any $\thetab \neq 0$, no matter how large or small it is. 
\item $L0$ regularization induces sparsity in the parameter vector more aggressively than $L1$ regularization, but does not shrink concrete parameter values as $L1$ and $L2$ do.