
Commit

add reproduced lq penalty plot
ludwigbothmann committed Jan 16, 2024
1 parent 5f64fab commit ca32b67
Showing 5 changed files with 61 additions and 7 deletions.
(2 of the 5 changed files could not be displayed.)
39 changes: 39 additions & 0 deletions slides/regularization/rsrc/lq-penalty-plot.py
@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 16 22:32:46 2024
@author: chris
"""

import numpy as np
import matplotlib.pyplot as plt

# Define the Lq penalty functions |theta|^q for q = 2, 1, 2/3, 1/2
def f1(x): return x**2
def f2(x): return np.abs(x)
def f3(x): return np.abs(x)**(2/3)
def f4(x): return np.abs(x)**(1/2)
# L0 penalty: 0 at theta = 0 and 1 otherwise; the small threshold lies just above
# the grid points nearest the origin, so the plotted curve drops to 0 there
def f0(x): return np.where(np.abs(x) >= 0.007, 1, 0)

x_values = np.linspace(-2, 2, 400)

fig, axes = plt.subplots(1, 5, figsize=(15, 4))

# Plot each function
for i, (func, q) in enumerate(zip([f1, f2, f3, f4, f0], ['q=2', 'q=1', 'q=2/3', 'q=1/2', 'q=0'])):
    y_values = func(x_values)
    axes[i].plot(x_values, y_values, color=f'C{i}')
    axes[i].set_title(q, size=16)
    axes[i].set_xlim(-1.2, 1.2)
    axes[i].set_ylim(0, 1.1)
    axes[i].label_outer()
    axes[i].set_xlabel(r'$\theta$', size=20)
    axes[i].set_ylabel('Penalty', size=16)
    axes[i].tick_params(axis='both', which='major', labelsize=10)
    axes[i].grid(False)
    axes[i].spines['top'].set_visible(False)
    axes[i].spines['right'].set_visible(False)

plt.tight_layout()

plt.show()
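The updated slide below includes the plot as figure_man/lq-penalty-plots.png, while the script only calls plt.show(), so the figure was presumably exported separately. A hedged sketch of that export step (the filename and options are assumptions, not part of the committed script):

```python
# Assumed export step; filename, dpi and bbox settings are guesses.
fig.savefig("lq-penalty-plots.png", dpi=300, bbox_inches="tight")
```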
20 changes: 17 additions & 3 deletions slides/regularization/slides-regu-bias-variance.tex
@@ -25,12 +25,26 @@
In this slide set, we will visualize the bias-variance trade-off. \\
\lz

First, we start with the DGP. Assume the true function $$f: [0, 1] \rightarrow \mathbb{R}, x\mapsto +\I_{\{x \geq 0.3\}}(x) - \I_{\{x \geq 0.6\}}(x).$$

Let the feature $x$ be uniformly d
First, we start with the DGP $\mathbb{P}_{xy}$. Assume the true function $$f: [0, 1] \rightarrow \mathbb{R}, x\mapsto +\I_{\{x \geq 0.3\}}(x) - \I_{\{x \geq 0.6\}}(x).$$

Let the feature $x \sim \mathcal{U}([0, 1])$ and the target $y\vert x \sim \mathcal{N}(f(x), \sigma).$
\begin{center}
\includegraphics[width=0.5\textwidth]{slides/regularization/figure_man/bv_true_fun.png}
\end{center}
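For concreteness, a minimal Python sketch (not part of the slides) of simulating this DGP; the sample size and noise level are assumed values:

```python
import numpy as np

rng = np.random.default_rng(0)

def f(x):
    # true step function: jumps up at x = 0.3 and back down at x = 0.6
    return (x >= 0.3).astype(float) - (x >= 0.6).astype(float)

n, sigma = 50, 0.1            # assumed sample size and noise level
x = rng.uniform(0, 1, n)      # x ~ U([0, 1])
y = rng.normal(f(x), sigma)   # y | x ~ N(f(x), sigma)
```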
\framebreak

Obviously, $f$ is an element of the function family $$\mathcal{H} := \{f: [0, 1] \rightarrow \mathbb{R}\vert\; f \text{ is continuous except for at most 2 jump discontinuities}\}$$

To make the following discussion more formal, we introduce two distance functions.
\begin{itemize}
\item The distance between two functions in $\mathcal{H}$, $d:\mathcal{H}^2\rightarrow \mathbb{R}_{\geq 0}$, defined by $$d(f_1, f_2) = \int_0^1(f_1(x) - f_2(x))^2\,dx.$$
\item The distance between a function in $\mathcal{H}$ and $k$ observations, $\overline{d}:\mathcal{H}\times([0,1]\times \mathbb{R})^k \rightarrow \mathbb{R}_{\geq 0}$, defined by
$$\overline d(f, ((x_1, y_1), \dots, (x_k, y_k))) = \frac{1}{k}\sum^k_{i=1}(f(x_i) - y_i)^2.$$
\end{itemize}
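A minimal numerical sketch of these two distances (not part of the commit); the grid resolution is an arbitrary choice, and the integral over $[0,1]$ is approximated by the mean of the integrand on a uniform grid:

```python
import numpy as np

# d(f1, f2): squared L2 distance between two functions on [0, 1];
# since the interval has length 1, the integral is approximated by a grid mean.
def d(f1, f2, grid=np.linspace(0, 1, 10001)):
    return np.mean((f1(grid) - f2(grid))**2)

# d_bar(f, xs, ys): mean squared deviation between a function and k observations.
def d_bar(f, xs, ys):
    return np.mean((f(np.asarray(xs)) - np.asarray(ys))**2)
```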


\framebreak

\center
\vspace*{0.5cm}
\includegraphics[width=0.6\textwidth]{figure_man/biasvariance_scheme.png} \\
9 changes: 5 additions & 4 deletions slides/regularization/slides-regu-l0.tex
@@ -17,7 +17,7 @@
\lecturechapter{L0 Regularization}
\lecture{Introduction to Machine Learning}

\begin{vbframe}{LQ norm Regularization}
\begin{vbframe}{Lq norm Regularization}

Besides the $L1$ and $L2$ norms, we could use any $Lq$ (quasi-)norm for regularization.

@@ -39,11 +39,12 @@
\riskrt = \risket + \lambda \|\thetab\|_0 := \risket + \lambda \sum_j |\theta_j|^0.
$$
\item Unlike the $L1$ and $L2$ norms, the $L0$ "norm" simply counts the number of non-zero parameters in the model.
\vspace{0.3cm}
\begin{figure}
\centering
\scalebox{0.8}{\includegraphics{figure_man/l0_norm.png}}
\tiny{\\ Credit: Christos Louizos}
\caption{\footnotesize $Lp$ norm penalties for a parameter $\thetab$ according to different values of $p$.}
\scalebox{0.99}{\includegraphics{figure_man/lq-penalty-plots.png}}
%\tiny{\\ Credit: Christos Louizos}
\caption{\footnotesize $Lq$ (quasi-)norm penalties for a scalar parameter $\theta$ for different values of $q$.}
\end{figure}
\item For any parameter $\thetab$, the $L0$ penalty is zero for $\thetab = 0$ (defining $0^0 := 0$) and is constant for any $\thetab \neq 0$, no matter how large or small it is.
\item $L0$ regularization induces sparsity in the parameter vector more aggressively than $L1$ regularization but, unlike $L1$ and $L2$, it does not shrink the non-zero parameter values.
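To make the counting behaviour of the $L0$ penalty concrete, a small sketch (not part of the commit) comparing $\|\thetab\|_0$, $\|\thetab\|_1$ and $\|\thetab\|_2^2$ for an example parameter vector; the vector itself is made up for illustration:

```python
import numpy as np

theta = np.array([0.0, 3.0, 0.0, -0.5, 1.2])  # made-up parameter vector

l0 = np.count_nonzero(theta)   # L0 "norm": number of non-zero entries -> 3
l1 = np.sum(np.abs(theta))     # L1 norm -> 4.7
l2_sq = np.sum(theta**2)       # squared L2 norm -> 10.69

print(l0, l1, l2_sq)
```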
