diff --git a/slides/information-theory/slides-info-kl-ment.tex b/slides/information-theory/slides-info-kl-ment.tex
index cb43367c..faa9ad81 100644
--- a/slides/information-theory/slides-info-kl-ment.tex
+++ b/slides/information-theory/slides-info-kl-ment.tex
@@ -44,7 +44,7 @@
 \begin{vbframe}{Constructing the KL}
 \textbf{1) Locality} \\
 The constraint must only update the prior distribution in $D, i.e.,$ the region where it is active. \\
-\includegraphics[width=0.3\linewidth]{slides/information-theory/figure_man/kl_me_constraint.png} \\
+\includegraphics[width=0.3\linewidth]{figure_man/kl_me_constraint.png} \\
 \lz
 For this, it can be shown that the non-overlapping domains of $\mathcal{X}$ must contribute additively to the entropy, i.e.,
 $$S(p) = \int F(p(\xv), \xv) d\mu(\xv)$$
@@ -54,7 +54,7 @@

 \textbf{2) Invariance to coordinate system} \\
 \lz
- \includegraphics[width=0.5\linewidth]{slides/information-theory/figure_man/kl_me_cosy.png} \\
+ \includegraphics[width=0.5\linewidth]{figure_man/kl_me_cosy.png} \\
 Enforcing 2) results in
 $$S(p) = \int \bm{\Phi}\left(\frac{dp}{dm}(\xv)\right)dm(\xv)$$
 where $\bm{\Phi}$ is an unknown function, $m$ is another measure on $\mathcal{X}$ dominated by $\mu$ and $\frac{dp}{dm}$ the Radon–Nikodym derivative which becomes
diff --git a/slides/information-theory/slides-info-kl-ml.tex b/slides/information-theory/slides-info-kl-ml.tex
index 9fa5d05b..f4397d21 100644
--- a/slides/information-theory/slides-info-kl-ml.tex
+++ b/slides/information-theory/slides-info-kl-ml.tex
@@ -47,7 +47,7 @@
 \item \textbf{Feature selection}
 In feature selection, we want to choose features the target strongly depends on.

-\includegraphics[width=0.6\linewidth]{slides/information-theory/figure_man/kl_ml_mi.png}
+\includegraphics[width=0.6\linewidth]{figure_man/kl_ml_mi.png}

 We can measure dependency by measuring the similarity between $p(\mathbf{x}, y)$ and $p(\mathbf{x})\cdot p(y).$ \\
 We will later see that measuring this similarity with KL leads to the concept of mutual information.
@@ -72,7 +72,7 @@

 \begin{vbframe}{KL divergence}
-Divergences can be used to measure the similarity of distributions. \\ \lz \\For distributions $p, q$ they are defined such that
+Divergences can be used to measure the similarity of distributions. \lz For distributions $p, q$ they are defined such that
 \begin{enumerate}
 \item $D(p, q) \geq 0,$
 \item $D(p, q) = 0$ iff $p = q.$
@@ -97,7 +97,7 @@
 \\
 We have samples from the DGP $p(y|x)$ when we fit our ML model. \\
 \lz
- \\
+
 If we have a probabilistic ML model $q_{\bm{\phi}}$ and can specify $p(y|x)$ then the forward KL can be directly applied such that
 $$\E_{\xv \sim p_{\xv}}D_{KL}(p(\cdot|\xv) \| q_{\bm{\phi}}(\cdot|\xv)) = \E_{\xv \sim p_{\xv}}\E_{y \sim p_{y|\xv}}\log\left(\frac{p(y|\xv)}{q_{\bm{\phi}}(y|\xv)}\right).$$
 For example, if $p$ and $q_{\bm{\phi}}$ are Gaussians with the same $\sigma$, minimizing this expression is equivalent to L2 minimization. \\
diff --git a/slides/regularization/figure/reg_surfaces_l1_l2.png b/slides/regularization/figure/reg_surfaces_l1_l2.png
new file mode 100644
index 00000000..45e9f1ac
Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l1_l2.png differ
diff --git a/slides/regularization/rsrc/make_reg_surfaces.py b/slides/regularization/rsrc/make_reg_surfaces.py
index c44e67d1..349c5c18 100644
--- a/slides/regularization/rsrc/make_reg_surfaces.py
+++ b/slides/regularization/rsrc/make_reg_surfaces.py
@@ -32,7 +32,7 @@ def updated_objective(beta, x1, x2, y, lam, regularization):
 # Compute the Minima for each plot
 minima = {}
 regularizations = ['l1', 'l2']
-lambdas = [0, 0.5, 5]
+lambdas = [0, 1, 10]
 for reg in regularizations:
     for lam in lambdas:
         result = minimize(updated_objective, [0, 0], args=(x1, x2, y, lam, reg), method='L-BFGS-B')
@@ -44,7 +44,7 @@ def updated_objective(beta, x1, x2, y, lam, regularization):
 beta1_grid, beta2_grid = np.meshgrid(beta1_range, beta2_range)

 # Plotting
-fig, axes = plt.subplots(2, 3, subplot_kw={"projection": "3d", "facecolor": "white"}, figsize=(18, 12))
+fig, axes = plt.subplots(2, 3, subplot_kw={"projection": "3d"}, figsize=(18, 12))
 for i, reg in enumerate(regularizations):
     for j, lam in enumerate(lambdas):
         objective_values = np.array([updated_objective([b1, b2], x1, x2, y, lam, reg)
@@ -53,16 +53,10 @@ def updated_objective(beta, x1, x2, y, lam, regularization):

         ax = axes[i, j]
         ax.plot_surface(beta1_grid, beta2_grid, objective_values, cmap='viridis')
-        ax.set_title(f'Regularization: {reg.upper()}, Lambda: {lam}', fontsize=14) # Increased font size
-        ax.set_xlabel('Theta1', fontsize=10) # Increased font size
-        ax.set_ylabel('Theta2', fontsize=10) # Increased font size
-        ax.set_zlabel('Objective', fontsize=10) # Increased font size
-        ax.w_xaxis.pane.fill = False
-        ax.w_yaxis.pane.fill = False
-        ax.w_zaxis.pane.fill = False
-        ax.w_xaxis.pane.set_edgecolor('white')
-        ax.w_yaxis.pane.set_edgecolor('white')
-        ax.w_zaxis.pane.set_edgecolor('white')
+        ax.set_title(f'Regularization: {reg.upper()}, Lambda: {lam}', fontsize=20) # Increased font size
+        ax.set_xlabel('Theta 1', fontsize=14) # Increased font size
+        ax.set_ylabel('Theta 2', fontsize=14) # Increased font size
+        ax.set_zlabel('Emp. risk', fontsize=14) # Increased font size

         # Add the minima as a red dot
         min_beta1, min_beta2 = minima[(reg, lam)]
diff --git a/slides/regularization/slides-regu-l1l2.tex b/slides/regularization/slides-regu-l1l2.tex
index b80d52e4..bacb428e 100644
--- a/slides/regularization/slides-regu-l1l2.tex
+++ b/slides/regularization/slides-regu-l1l2.tex
@@ -174,10 +174,10 @@
 \end{vbframe}

 \begin{vbframe}{Loss Surfaces}
-Loss surfaces for increasing amounts of regularization. Ridge surface becomes more ellipsoidal, Lasso surface less smooth.
+Regularized empirical risk $\riskr$ using squared loss for $\lambda \uparrow$. $L1$ penalty makes non-smooth kinks at coordinate axes more pronounced, while $L2$ penalty warps $\riskr$ toward an elliptic paraboloid.
 \begin{figure}
-\includegraphics[width=0.8\textwidth]{figure/reg_surfaces.png}\\
+\includegraphics[width=0.8\textwidth]{figure/reg_surfaces_l1_l2.png}\\
 \end{figure}
 \end{vbframe}
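The make_reg_surfaces.py hunks above only change the lambda grid, the figure styling, and the axis labels; the body of updated_objective is not part of this diff. As a rough sketch of the kind of surface the new slide text describes (squared-loss empirical risk of a linear model plus an L1 or L2 penalty), under assumed toy data x1, x2, y and a hypothetical function name:

# Minimal sketch only: the loss and penalty below are assumptions about the
# shape of the plotted objective, not the actual body of updated_objective.
import numpy as np

def regularized_risk(beta, x1, x2, y, lam, regularization):
    """Squared-loss empirical risk of a linear model in (x1, x2),
    plus an L1 or L2 penalty on the coefficients, weighted by lam."""
    beta = np.asarray(beta, dtype=float)
    residuals = y - (beta[0] * x1 + beta[1] * x2)
    emp_risk = np.mean(residuals ** 2)
    if regularization == 'l1':
        penalty = np.sum(np.abs(beta))  # non-smooth kinks where a coordinate is zero
    else:  # 'l2'
        penalty = np.sum(beta ** 2)     # smooth term pulling the surface toward an elliptic paraboloid
    return emp_risk + lam * penalty

With lam = 0 both variants collapse to the plain empirical risk, matching the Lambda: 0 column of the new figure.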
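Relatedly, the slides-info-kl-ml.tex hunk above states that for Gaussians with the same $\sigma$ minimizing the forward KL is equivalent to L2 minimization. A short sketch of that step, assuming $p(\cdot|\xv) = \mathcal{N}(\mu(\xv), \sigma^2)$ and $q_{\bm{\phi}}(\cdot|\xv) = \mathcal{N}(f_{\bm{\phi}}(\xv), \sigma^2)$, where the mean functions $\mu$ and $f_{\bm{\phi}}$ are notation introduced here rather than in the slides:

$$D_{KL}(p(\cdot|\xv) \| q_{\bm{\phi}}(\cdot|\xv)) = \E_{y \sim p(\cdot|\xv)}\left[\frac{(y - f_{\bm{\phi}}(\xv))^2 - (y - \mu(\xv))^2}{2\sigma^2}\right] = \frac{(\mu(\xv) - f_{\bm{\phi}}(\xv))^2}{2\sigma^2},$$

since $\E_{y \sim p(\cdot|\xv)}\left[(y - f_{\bm{\phi}}(\xv))^2\right] = \sigma^2 + (\mu(\xv) - f_{\bm{\phi}}(\xv))^2$. Taking $\E_{\xv \sim p_{\xv}}$ and minimizing over $\bm{\phi}$ therefore minimizes the expected squared (L2) distance between the model mean and the true conditional mean.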