
Commit

update loss surface figure
ludwigbothmann committed Nov 29, 2023
1 parent 71c1c8e commit 9f0746c
Showing 5 changed files with 13 additions and 19 deletions.
4 changes: 2 additions & 2 deletions slides/information-theory/slides-info-kl-ment.tex
@@ -44,7 +44,7 @@
\begin{vbframe}{Constructing the KL}
\textbf{1) Locality} \\
The constraint must only update the prior distribution in $D$, i.e., the region where it is active. \\
\includegraphics[width=0.3\linewidth]{slides/information-theory/figure_man/kl_me_constraint.png} \\
\includegraphics[width=0.3\linewidth]{figure_man/kl_me_constraint.png} \\
\lz
For this, it can be shown that the non-overlapping domains of $\mathcal{X}$ must contribute additively to the entropy, i.e.,
$$S(p) = \int F(p(\xv), \xv) d\mu(\xv)$$
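As a side note (my own illustration, not part of the slide), splitting the integral over the disjoint pieces $D$ and $\mathcal{X} \setminus D$ makes the locality property explicit: a constraint supported on $D$ can only affect the first term,
$$S(p) = \int_{D} F(p(\xv), \xv)\, d\mu(\xv) + \int_{\mathcal{X} \setminus D} F(p(\xv), \xv)\, d\mu(\xv).$$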
@@ -54,7 +54,7 @@

\textbf{2) Invariance to coordinate system} \\
\lz
\includegraphics[width=0.5\linewidth]{slides/information-theory/figure_man/kl_me_cosy.png} \\
\includegraphics[width=0.5\linewidth]{figure_man/kl_me_cosy.png} \\
Enforcing 2) results in
$$S(p) = \int \bm{\Phi}\left(\frac{dp}{dm}(\xv)\right)dm(\xv)$$
where $\bm{\Phi}$ is an unknown function, $m$ is another measure on $\mathcal{X}$ dominated by $\mu$, and $\frac{dp}{dm}$ is the Radon–Nikodym derivative, which becomes
6 changes: 3 additions & 3 deletions slides/information-theory/slides-info-kl-ml.tex
@@ -47,7 +47,7 @@
\item \textbf{Feature selection}
In feature selection, we want to choose features the target strongly depends on.

\includegraphics[width=0.6\linewidth]{slides/information-theory/figure_man/kl_ml_mi.png}
\includegraphics[width=0.6\linewidth]{figure_man/kl_ml_mi.png}

We can quantify this dependency by measuring the similarity between $p(\mathbf{x}, y)$ and $p(\mathbf{x})\cdot p(y).$ \\
We will later see that measuring this similarity with KL leads to the concept of mutual information.
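As a quick numerical illustration (a hypothetical helper in the style of the repository's Python scripts, not code from this repo): the mutual information $I(X;Y) = D_{KL}(p(x,y) \| p(x) p(y))$ of a discrete joint distribution can be computed directly, and it is large for a strongly dependent feature and exactly zero for an independent one.

import numpy as np

def mutual_information(joint):
    """I(X;Y) = KL(p(x,y) || p(x)p(y)) in nats, from a 2D table of joint probabilities."""
    joint = np.asarray(joint, dtype=float)
    px = joint.sum(axis=1, keepdims=True)   # marginal p(x)
    py = joint.sum(axis=0, keepdims=True)   # marginal p(y)
    mask = joint > 0                        # convention: 0 * log 0 = 0
    return np.sum(joint[mask] * np.log(joint[mask] / (px * py)[mask]))

# Strongly dependent X and Y: MI is clearly positive (approx. 0.37 nats)
print(mutual_information([[0.45, 0.05],
                          [0.05, 0.45]]))

# Independent X and Y (joint = product of marginals): MI is 0
print(mutual_information([[0.25, 0.25],
                          [0.25, 0.25]]))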
@@ -72,7 +72,7 @@

\begin{vbframe}{KL divergence}

Divergences can be used to measure the similarity of distributions. \\ \lz \\For distributions $p, q$ they are defined such that
Divergences can be used to measure the similarity of distributions. \lz For distributions $p, q$ they are defined such that
\begin{enumerate}
\item $D(p, q) \geq 0,$
\item $D(p, q) = 0$ iff $p = q.$
@@ -97,7 +97,7 @@
\\ We have samples from the DGP $p(y|x)$ when we fit our ML model.
\\
\lz
\\

If we have a probabilistic ML model $q_{\bm{\phi}}$ and can specify $p(y|x)$, then the forward KL can be directly applied such that
$$\E_{\xv \sim p_{\xv}}D_{KL}(p(\cdot|\xv) \| q_{\bm{\phi}}(\cdot|\xv)) = \E_{\xv \sim p_{\xv}}\E_{y \sim p_{y|\xv}}\log\left(\frac{p(y|\xv)}{q_{\bm{\phi}}(y|\xv)}\right).$$
For example, if $p$ and $q_{\bm{\phi}}$ are Gaussians with the same $\sigma$, minimizing this expression is equivalent to L2 minimization. \\
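For completeness, a short check of the Gaussian claim (my own derivation, not part of the diff): with $p(\cdot|\xv) = \mathcal{N}(\mu_p(\xv), \sigma^2)$ and $q_{\bm{\phi}}(\cdot|\xv) = \mathcal{N}(\mu_{\bm{\phi}}(\xv), \sigma^2)$,
$$D_{KL}(p(\cdot|\xv) \| q_{\bm{\phi}}(\cdot|\xv)) = \E_{y \sim p(\cdot|\xv)}\left[\frac{(y-\mu_{\bm{\phi}}(\xv))^2 - (y-\mu_p(\xv))^2}{2\sigma^2}\right] = \frac{(\mu_p(\xv) - \mu_{\bm{\phi}}(\xv))^2}{2\sigma^2},$$
so minimizing the expected forward KL over $\bm{\phi}$ minimizes the expected squared distance between the two mean functions, i.e., L2 minimization.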
(The fifth changed file, presumably the updated loss surface figure, cannot be displayed in the diff view.)
18 changes: 6 additions & 12 deletions slides/regularization/rsrc/make_reg_surfaces.py
@@ -32,7 +32,7 @@ def updated_objective(beta, x1, x2, y, lam, regularization):
# Compute the Minima for each plot
minima = {}
regularizations = ['l1', 'l2']
lambdas = [0, 0.5, 5]
lambdas = [0, 1, 10]
for reg in regularizations:
for lam in lambdas:
result = minimize(updated_objective, [0, 0], args=(x1, x2, y, lam, reg), method='L-BFGS-B')
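The body of updated_objective lies outside the shown hunks; judging from its signature and the new slide caption (regularized empirical risk with squared loss), it plausibly has the following shape. This is a sketch under that assumption, not the repository's actual implementation.

import numpy as np  # the script already imports numpy for the grid and plotting code below

def updated_objective(beta, x1, x2, y, lam, regularization):
    # Squared-loss empirical risk of the linear model y ~ beta[0]*x1 + beta[1]*x2 ...
    residuals = y - (beta[0] * x1 + beta[1] * x2)
    risk = np.sum(residuals ** 2)
    # ... plus an L1 or L2 penalty weighted by lam (lam = 0 recovers the unregularized risk).
    if regularization == 'l1':
        penalty = np.sum(np.abs(beta))
    else:  # 'l2'
        penalty = np.sum(np.square(beta))
    return risk + lam * penalty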
@@ -44,7 +44,7 @@ def updated_objective(beta, x1, x2, y, lam, regularization):
beta1_grid, beta2_grid = np.meshgrid(beta1_range, beta2_range)

# Plotting
fig, axes = plt.subplots(2, 3, subplot_kw={"projection": "3d", "facecolor": "white"}, figsize=(18, 12))
fig, axes = plt.subplots(2, 3, subplot_kw={"projection": "3d"}, figsize=(18, 12))
for i, reg in enumerate(regularizations):
for j, lam in enumerate(lambdas):
objective_values = np.array([updated_objective([b1, b2], x1, x2, y, lam, reg)
@@ -53,16 +53,10 @@ def updated_objective(beta, x1, x2, y, lam, regularization):

ax = axes[i, j]
ax.plot_surface(beta1_grid, beta2_grid, objective_values, cmap='viridis')
ax.set_title(f'Regularization: {reg.upper()}, Lambda: {lam}', fontsize=14) # Increased font size
ax.set_xlabel('Theta1', fontsize=10) # Increased font size
ax.set_ylabel('Theta2', fontsize=10) # Increased font size
ax.set_zlabel('Objective', fontsize=10) # Increased font size
ax.w_xaxis.pane.fill = False
ax.w_yaxis.pane.fill = False
ax.w_zaxis.pane.fill = False
ax.w_xaxis.pane.set_edgecolor('white')
ax.w_yaxis.pane.set_edgecolor('white')
ax.w_zaxis.pane.set_edgecolor('white')
ax.set_title(f'Regularization: {reg.upper()}, Lambda: {lam}', fontsize=20) # Increased font size
ax.set_xlabel('Theta 1', fontsize=14) # Increased font size
ax.set_ylabel('Theta 2', fontsize=14) # Increased font size
ax.set_zlabel('Emp. risk', fontsize=14) # Increased font size

# Add the minima as a red dot
min_beta1, min_beta2 = minima[(reg, lam)]
4 changes: 2 additions & 2 deletions slides/regularization/slides-regu-l1l2.tex
@@ -174,10 +174,10 @@
\end{vbframe}

\begin{vbframe}{Loss Surfaces}
Loss surfaces for increasing amounts of regularization. Ridge surface becomes more ellipsoidal, Lasso surface less smooth.
Regularized empirical risk $\riskr$ using squared loss for $\lambda \uparrow$. The $L1$ penalty makes the non-smooth kinks at the coordinate axes more pronounced, while the $L2$ penalty warps $\riskr$ toward an elliptic paraboloid.

\begin{figure}
\includegraphics[width=0.8\textwidth]{figure/reg_surfaces.png}\\
\includegraphics[width=0.8\textwidth]{figure/reg_surfaces_l1_l2.png}\\
\end{figure}

\end{vbframe}
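Spelled out (my own rendering of the caption; notation for the observations assumed from the rest of the deck), the two families of surfaces in the figure are
$$\riskr(\bm{\theta}) = \sum_{i=1}^n \left(y^{(i)} - \bm{\theta}^\top \xv^{(i)}\right)^2 + \lambda \|\bm{\theta}\|_1 \quad (L1, \text{Lasso}), \qquad \riskr(\bm{\theta}) = \sum_{i=1}^n \left(y^{(i)} - \bm{\theta}^\top \xv^{(i)}\right)^2 + \lambda \|\bm{\theta}\|_2^2 \quad (L2, \text{Ridge}),$$
with larger $\lambda$ putting more weight on the penalty term.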
