update bayes plot + code and comment on L2 chunk plot
ludwigbothmann committed Jun 18, 2024
1 parent 74ed3f3 commit 919496e
Showing 7 changed files with 23 additions and 15 deletions.
4 changes: 2 additions & 2 deletions slides/regularization/chapter-order.tex
@@ -60,8 +60,8 @@ \subsection{Regularization: Perspectives on Ridge Regression (Deep-Dive)}
\subsection{Regularization: Soft-thresholding and L1 regularization (Deep-Dive)}
\includepdf[pages=-]{../../slides-pdf/slides-regu-lasso-deepdive.pdf}

-\subsection{Regularization: Bagging as a Regularization Method (Deep-Dive)}
-\includepdf[pages=-]{../../slides-pdf/slides-regu-bagging-deepdive.pdf}
+%\subsection{Regularization: Bagging as a Regularization Method (Deep-Dive)}
+%\includepdf[pages=-]{../../slides-pdf/slides-regu-bagging-deepdive.pdf}



Sorry, this file is invalid so it cannot be displayed.
24 changes: 15 additions & 9 deletions slides/regularization/rsrc/bayes-plot-posterior.R
@@ -1,17 +1,24 @@
### Simulate n samples from y = theta + eps where theta=1 and eps is N(0,1)
### Use 3 different values tau2 for Gaussian prior variances of theta
### Compute posterior of theta using empirical mean update formula
### compare MAP and ridge loss surface/minimizer

library(ggplot2)
library(ggpubr)

-set.seed(10)

# Define prior variances
tau2 <- c(0.25^2, 0.5^2, 2^2)
taus <- sqrt(tau2)

# Simulate data
+set.seed(42)
n <- 20
sigma2 <- 1
y <- rnorm(n, mean = 1, sd = sqrt(sigma2))
y_bar <- mean(y)

-# Generate x-axis sequence
+# Generate x-axis sequence for range of theta values
x_seq <- seq(-3, 3, length.out = 300)

# df for plotting priors
@@ -49,10 +56,10 @@ loss_df <- data.frame(
# Max loss to scale second y-axis
max_loss <- max(loss_df$loss)

-# Prior densities
+# Plot for prior densities
p1 <- ggplot(prior_df, aes(x = x, y = density, color = as.factor(tau))) +
geom_line(linewidth=1.5) +
-labs(title = "Prior Densities", y = "Density", x = "Theta", color = "Prior Std. Dev.") +
+labs(title = "Prior Densities", y = "Density", x = expression(theta), color = "Prior Std. Dev.") +
theme_minimal() +
theme(
plot.title = element_text(size = 20, hjust=0.5),
@@ -62,12 +69,12 @@ p1 <- ggplot(prior_df, aes(x = x, y = density, color = as.factor(tau))) +
legend.text = element_text(size = 16)
)

-# Posterior denisities + ridge risk
+# Plot for posterior densities + ridge risk
p2 <- ggplot(posterior_df, aes(x = x, y = density, color = as.factor(tau))) +
geom_line(linewidth = 1.5) +
geom_vline(data = data.frame(tau = as.factor(taus), xintercept = posterior_mean), aes(xintercept = xintercept, color = tau), linetype = "dashed", size = 1.2) +
geom_line(data = loss_df, aes(x = theta, y = loss / max_loss * max(posterior_df$density), color = as.factor(tau)), linetype = "solid", size=1) +
-labs(title = "Posterior Densities (MAP=Ridge)", y = "Density", x = "Theta", color = "Std. Dev.") +
+labs(title = "Posterior Densities (MAP=Ridge)", y = "Density", x = expression(theta), color = "Std. Dev.") +
scale_y_continuous(sec.axis = sec_axis(~ . / max(posterior_df$density) * max_loss, name = "Reg. Emp. Risk", breaks = scales::pretty_breaks())) +
theme_minimal() +
xlim(-1,3) +
@@ -79,8 +86,7 @@ p2 <- ggplot(posterior_df, aes(x = x, y = density, color = as.factor(tau))) +
legend.text = element_text(size = 16)
)

-# Combine plots
-library(ggpubr)
+# Combine plots and save
p.comb <- ggarrange(p1,p2, ncol=2, hjust=-0.5, common.legend=TRUE)
ggsave(filename = paste0("../figure/bayes-plot-posterior.png"), plot = p.comb, width = 12, height = 4)

print(p.comb)
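The elided middle of this script computes `posterior_mean`, presumably via the standard conjugate Gaussian update. A minimal standalone check (in Python for self-containment; all names here are illustrative, not taken from the repository) confirms that this update coincides with the ridge minimizer, which is the point of the plot:

```python
import numpy as np

# Same setup as the R script: n draws from N(1, sigma2), prior N(0, tau2).
rng = np.random.default_rng(42)
n, sigma2, tau2 = 20, 1.0, 0.5**2
y = rng.normal(loc=1.0, scale=np.sqrt(sigma2), size=n)
y_bar = y.mean()

# Conjugate Gaussian update: posterior mean of theta given y.
post_mean = (n * y_bar / sigma2) / (n / sigma2 + 1 / tau2)

# Ridge minimizer of sum_i (y_i - theta)^2 + lam * theta^2 with lam = sigma2 / tau2.
lam = sigma2 / tau2
ridge_min = n * y_bar / (n + lam)

assert np.isclose(post_mean, ridge_min)  # MAP = ridge solution
```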
3 changes: 2 additions & 1 deletion slides/regularization/slides-regu-bayes.tex
@@ -136,11 +136,12 @@
with Gaussian prior on $\theta$, so $\mathcal{N}(0, \tau^2)$ for $\tau \in \{0.25, 0.5, 2\}$
\item For $n=20$, posterior of $\theta$ and MAP can be calculated analytically
\item Plotting the $L2$ regularized empirical risk $\riskr(\theta) = \sum_{i=1}^{n} (y_i-\theta)^2+\lambda \theta^2$ with $\lambda = 1/\tau^2$ shows that ridge solution is identical with MAP
+\item In our simulation, the empirical mean is $\Bar{y}=0.94$, with shrinkage toward $0$ induced in the MAP
\end{itemize}
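The MAP = ridge claim in the bullets above can be spelled out in one step (a sketch in the slide's notation, with $\sigma^2 = 1$ as in the simulation; $\ydat$ and $\riskr$ as defined elsewhere in the deck):
\begin{align*}
\hat{\theta}_{\text{MAP}} &= \arg\max_{\theta}\; \left[\log p(\ydat \mid \theta) + \log p(\theta)\right] \\
 &= \arg\min_{\theta}\; \sum_{i=1}^{n} (y_i-\theta)^2 + \frac{1}{\tau^2}\,\theta^2 \;=\; \arg\min_{\theta}\; \riskr(\theta) \quad \text{for } \lambda = 1/\tau^2.
\end{align*}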

\begin{figure}
\centering
-\scalebox{1}{\includegraphics[width=0.9\textwidth]{figure_man/bayes-plot-posterior.png}}
+\scalebox{1}{\includegraphics[width=0.95\textwidth]{figure/bayes-plot-posterior.png}}
\end{figure}

% \item The conditional distribution of $\ydat$ in linear regression with
1 change: 1 addition & 0 deletions slides/regularization/slides-regu-l1vsl2.tex
@@ -194,6 +194,7 @@
%\item Grouping property = highly corr. features tend to have equal effects
\item $L1$ penalty is not \textit{strictly} convex. Hence, no unique solution exists if $x_4=x_5$, and sum of coefficients can be arbitrarily allocated to both features while remaining minimizers (no grouping property!):\\
For any solution $\thetah_{4,lasso},\thetah_{5,lasso}$, equivalent minimizers are given by
+\vspace{-0.1cm}
{\small $$\Tilde{\theta}_{4,lasso}=s\cdot(\thetah_{4,lasso}+\thetah_{5,lasso}) \,\,\text{and}\,\,\Tilde{\theta}_{5,lasso}=(1-s)\cdot(\thetah_{4,lasso}+\thetah_{5,lasso})\,\forall s\in[0,1]$$}
\end{itemize}
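The arbitrary reallocation described in the bullet above is easy to verify numerically; a minimal sketch (hypothetical data, not from the slides) checks that the lasso objective is constant along $s \in [0,1]$ when two features are identical:

```python
import numpy as np

# Hypothetical data with a duplicated feature (x4 = x5 = x).
rng = np.random.default_rng(0)
n = 50
x = rng.normal(size=n)
y = 2 * x + rng.normal(size=n)
lam = 0.5

def lasso_objective(t4, t5):
    # The fit term depends only on t4 + t5 because the two features are identical.
    resid = y - (t4 + t5) * x
    return np.sum(resid**2) + lam * (abs(t4) + abs(t5))

total = 1.2  # stand-in for theta4_hat + theta5_hat of some lasso solution
vals = [lasso_objective(s * total, (1 - s) * total) for s in np.linspace(0, 1, 11)]
assert max(vals) - min(vals) < 1e-9  # objective constant along the reallocation
```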
2 changes: 1 addition & 1 deletion slides/regularization/slides-regu-l2.tex
@@ -87,7 +87,7 @@
\includegraphics[width=0.8\textwidth]{figure/ridge_contours.png}
\end{figure}
\vspace{-0.2cm}
-Green = true minimizer of the unreg.objective and red = ridge solution.
+Green = true coefs of the DGP and red = ridge solution.

\framebreak

4 changes: 2 additions & 2 deletions slides/regularization/slides-regu-lasso-deepdive.tex
@@ -44,7 +44,7 @@
\lz
%For this, we assume we already know the sign of the minimizer and then derive conditions for which our assumption holds. \\
%\lz
-We now separately investigate $z_j$ for $\theta_j > 0 and \theta_j < 0.$\\
+We now separately investigate $z_j$ for $\theta_j > 0$ and $\theta_j < 0.$\\
\lz
NB: on these halflines $z_j$ is differentiable (with possible stationary point) since
\begin{itemize}
@@ -73,7 +73,7 @@
\end{minipage}
\newpage

-2) $\hat{\theta}_{\text{lasso},j} < 0:$ \\
+2) $\theta_j < 0:$ \\
\lz
\begin{minipage}{0.4\textwidth}
\includegraphics[width=5cm]{figure/th_l1_neg.pdf}
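The case analysis over $\theta_j > 0$ and $\theta_j < 0$ in this deep-dive yields the soft-thresholding operator; a minimal standalone sketch (the exact scaling of the threshold depends on the slides' parameterization of $\lambda$):

```python
def soft_threshold(rho, lam):
    """Soft-thresholding: combines the theta_j > 0 and theta_j < 0 cases."""
    if rho > lam:        # stationary point on the positive halfline
        return rho - lam
    if rho < -lam:       # stationary point on the negative halfline
        return rho + lam
    return 0.0           # otherwise the minimizer sits at the kink

assert soft_threshold(3.0, 1.0) == 2.0
assert soft_threshold(-3.0, 1.0) == -2.0
assert soft_threshold(0.5, 1.0) == 0.0  # coefficient shrunk exactly to zero
```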
