Commit
regu overhaul
ludwigbothmann committed Jan 31, 2024
1 parent 9893e8b commit e62d0ab
Showing 15 changed files with 474 additions and 315 deletions.
2 changes: 1 addition & 1 deletion slides/information-theory/slides-info-ml.tex
@@ -127,7 +127,7 @@

\begin{vbframe}{Entropy as prediction loss}
Assume the log-loss in a situation where we only model with a constant probability vector $\pi$. We know the optimal model under this loss:
$$\pik = \frac{n_k}{n} = \frac{\sumin [\yi = 1]}{n}$$
$$\pik = \frac{n_k}{n} = \frac{\sumin [\yi = k]}{n}$$

What is the (average) risk of this optimal constant model?
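A short sketch of the answer, assuming the usual multiclass log-loss $L(y, \pi) = -\sum_k [y = k] \log \pi_k$: plugging $\pik = n_k / n$ into the empirical risk gives
$$\frac{1}{n} \sumin \sum_k [\yi = k] \left(-\log \pik\right) = -\sum_k \frac{n_k}{n} \log \pik = -\sum_k \pik \log \pik,$$
which is exactly the entropy of the empirical class distribution.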

2 changes: 1 addition & 1 deletion slides/information-theory/slides-info-mutual-info.tex
@@ -228,7 +228,7 @@
\begin{aligned}
I(X ; Y) &= H(X) - H(X | Y) \\
I(X ; Y) &= H(Y) - H(Y | X) \\
I(X ; Y) &\leq \min\{H(X),H(Y)\} \\
I(X ; Y) &\leq \min\{H(X),H(Y)\} \text { if $X,Y$ are discrete RVs}\\
I(X ; Y) &= H(X) + H(Y) - H(X, Y) \\
I(X ; Y) &= I(Y ; X) \\
I(X ; X) &= H(X)\\
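A one-line justification for the added condition, sketched here for clarity: for discrete RVs, conditional entropy is non-negative, so
$$I(X ; Y) = H(X) - H(X | Y) \leq H(X) \quad \text{and} \quad I(X ; Y) = H(Y) - H(Y | X) \leq H(Y);$$
for continuous RVs, (conditional) differential entropy can be negative, so the bound need not hold.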
19 changes: 11 additions & 8 deletions slides/regularization/chapter-order.tex
@@ -14,11 +14,11 @@
\subsection{Introduction to Regularization}
\includepdf[pages=-]{../slides-pdf/slides-regu-intro.pdf}

\subsection{Ridge and Lasso Regression I}
\includepdf[pages=-]{../slides-pdf/slides-regu-l1l2.pdf}
\subsection{Ridge Regression}
\includepdf[pages=-]{../slides-pdf/slides-regu-l2.pdf}

\subsection{Ridge and Lasso Regression II}
\includepdf[pages=-]{../slides-pdf/slides-regu-l1l2-2.pdf}
\subsection{Lasso Regression}
\includepdf[pages=-]{../slides-pdf/slides-regu-l1.pdf}

\subsection{Lasso vs. Ridge Regression}
\includepdf[pages=-]{../slides-pdf/slides-regu-l1vsl2.pdf}
@@ -29,11 +29,14 @@ \subsection{Elastic Net and Regularization for GLMs}
%\subsection{Regularization for Underdetermined Problem}
%\includepdf[pages=-]{../slides-pdf/slides-regu-underdetermined.pdf}

\subsection{L0 Regularization}
\includepdf[pages=-]{../slides-pdf/slides-regu-l0.pdf}
\subsection{Other Types of Regularization}
\includepdf[pages=-]{../slides-pdf/slides-regu-others.pdf}

\subsection{Nonlinear and Bayes}
\includepdf[pages=-]{../slides-pdf/slides-regu-nonlin-bayes.pdf}
\subsection{Regularization in Non-Linear Models}
\includepdf[pages=-]{../slides-pdf/slides-regu-nonlin.pdf}

\subsection{Regularization and Bayesian Priors}
\includepdf[pages=-]{../slides-pdf/slides-regu-bayes.pdf}

\subsection{Geometric Analysis of L2 Regularization and Weight Decay}
\includepdf[pages=-]{../slides-pdf/slides-regu-geom-l2-wdecay.pdf}
Binary file not shown.
Binary file not shown.
56 changes: 56 additions & 0 deletions slides/regularization/rsrc/lasso-scad-mcp.R
@@ -0,0 +1,56 @@
library(ggplot2)

# Set the regularization parameters for demonstration
lambda <- 1
a <- 3.7 # For SCAD, typically > 2
gamma <- 3 # For MCP

# Lasso Penalty Function
lasso_penalty <- function(theta) {
lambda * abs(theta)
}

# SCAD Penalty Function
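# Three regions: lasso-like linear part for |theta| <= lambda, quadratic transition
# for lambda < |theta| <= a*lambda, constant (a + 1) * lambda^2 / 2 beyond a*lambda.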
scad_penalty <- function(theta) {
ifelse(abs(theta) <= lambda,
lambda * abs(theta),
ifelse(abs(theta) <= a * lambda,
(-theta^2 + 2 * a * lambda * abs(theta) - lambda^2) / (2 * (a - 1)),
(a + 1) * lambda^2 / 2))
}

# MCP Penalty Function
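# Quadratic part for |theta| <= gamma*lambda, then constant at gamma * lambda^2 / 2,
# i.e. no further penalization of large coefficients.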
mcp_penalty <- function(theta) {
ifelse(abs(theta) <= gamma * lambda,
lambda * abs(theta) - theta^2 / (2 * gamma),
0.5 * gamma * lambda^2)
}

# Create a sequence of theta values
theta_vals <- seq(-4, 4, by = 0.1)

# Create a data frame for plotting
penalties <- data.frame(
theta = theta_vals,
Lasso = sapply(theta_vals, lasso_penalty),
SCAD = sapply(theta_vals, scad_penalty),
MCP = sapply(theta_vals, mcp_penalty)
)

# Plot using ggplot2
ggplot(penalties, aes(x = theta)) +
geom_line(aes(y = Lasso, color = "Lasso"), linewidth=1.2) +
geom_line(aes(y = SCAD, color = "SCAD"), linewidth=1.2) +
geom_line(aes(y = MCP, color = "MCP"), linewidth=1.2) +
labs(title = "Lasso, SCAD, and MCP",
x = expression(theta),
y = "Penalty") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 18),
axis.title = element_text(size = 16),
axis.text = element_text(size = 13),
legend.title = element_blank(),
legend.text = element_text(size=13)
) +
scale_color_manual(values = c("Lasso" = "blue", "SCAD" = "red", "MCP" = "green"))
2 changes: 1 addition & 1 deletion slides/regularization/rsrc/soft-thresholding.R
@@ -31,7 +31,7 @@ p <- ggplot(data, aes(x = rho)) +
geom_line(aes(y = theta), color = 'blue', linetype = "solid", size=1.2) +
geom_line(aes(y = OLS), color = 'grey', linetype = "dashed", size=1.2) +
geom_line(aes(y = Ridge), color = 'red', linetype = "solid", size=1.2) +
labs(x = expression(theta[j]), y = expression(theta[j]), title = 'Lasso vs Ridge solution in terms of OLS (orthonormal design)') +
labs(x = expression(theta[j]), y = expression(theta[j]), title = 'Lasso vs Ridge solution in terms of OLS (orthonormal design, lambda=3)') +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 20),
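For context, a minimal sketch of the quantities this script presumably plots, assuming an orthonormal design with per-coordinate OLS estimate rho and lambda = 3 (the variable names below are illustrative, not taken from the script):

lambda <- 3
rho <- seq(-6, 6, by = 0.1)                            # per-coordinate OLS estimate
theta_lasso <- sign(rho) * pmax(abs(rho) - lambda, 0)  # soft-thresholding (Lasso)
theta_ridge <- rho / (1 + lambda)                      # Ridge shrinkage
# Lasso sets small coefficients exactly to zero; Ridge only scales them down.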
@@ -5,196 +5,18 @@

\newcommand{\titlefigure}{figure_man/bayes_reg.png}
\newcommand{\learninggoals}{
\item Understand that regularization and parameter shrinkage can be applied to non-linear models
\item Know structural risk minimization
\item Know how regularization risk minimization is same as MAP
in Bayesian perspective, where penalty corresponds to a parameter prior
\item Know how regularized risk minimization is the same as MAP in the Bayesian perspective
\item Know the correspondence of Gaussian/Laplace priors and $L2$/$L1$ regularization
}

\title{Introduction to Machine Learning}
\date{}

\begin{document}

\lecturechapter{Regularization in Non-Linear Models and Bayesian Priors}
\lecturechapter{Regularization and Bayesian Priors}
\lecture{Introduction to Machine Learning}

%-------------------------------------------------------------------------------

\begin{vbframe}{Summary: Regularized Risk Minimization}

If we had to define ML in only one line, this might be it:

$$
\min_{\thetab} \riskrt= \min_{\thetab} \left(\sumin \Lxyit + \lambda \cdot J(\thetab) \right)
$$

For the task at hand, we can choose:

\begin{itemize}
\item the \textbf{hypothesis space} of $f$, which determines how features can
influence the predicted $y$
\item the \textbf{loss} function $L$, which measures how errors should be treated
\item the \textbf{regularization} $J(\thetab)$, which encodes our inductive
bias and preference for certain simpler models
\end{itemize}

\vfill

By varying these choices, one can construct a huge number of different ML models.
Many ML models follow this construction principle or can be interpreted through
the lens of regularized risk minimization (a small code sketch of this template follows after this frame).

\end{vbframe}
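As a concrete illustration of this template, a minimal sketch in R, assuming squared loss, a linear model and an $L2$ penalty (illustrative code, not taken from the lecture material):

# Simulated data for a linear model
set.seed(1)
n <- 100; p <- 5
X <- matrix(rnorm(n * p), n, p)
theta_true <- c(2, -1, 0, 0, 0.5)
y <- X %*% theta_true + rnorm(n)

# Regularized risk: squared loss plus lambda * J(theta) with J(theta) = ||theta||_2^2
regularized_risk <- function(theta, lambda) {
  sum((y - X %*% theta)^2) + lambda * sum(theta^2)
}

# Minimize the regularized risk numerically; larger lambda shrinks the estimates
fit <- optim(par = rep(0, p), fn = regularized_risk, lambda = 10, method = "BFGS")
round(fit$par, 3)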

%-------------------------------------------------------------------------------

\begin{vbframe}{Regularization in Nonlinear Models}

\begin{itemize}
\item So far we have mainly considered regularization in LMs.
\item Regularization can also be applied to non-linear models (with numeric parameters), where it is
often important to prevent overfitting (see the sketch after this frame).
\item Here, we typically use $L2$ regularization, which
still results in parameter shrinkage and weight decay.
\item By adding regularization, prediction surfaces in regression and
classification become smoother.
\item Note: In the chapter on non-linear SVMs we will study the effects of
regularization on a non-linear model in detail.
\end{itemize}

\end{vbframe}
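A minimal sketch of the idea, assuming the nnet and mlbench packages (the exact setup behind the lecture figures is not reproduced here):

library(nnet)      # single-hidden-layer neural networks with L2 weight decay
library(mlbench)   # mlbench.spirals() for the spirals data

set.seed(1)
spirals <- mlbench.spirals(300, cycles = 1, sd = 0.05)
dat <- data.frame(spirals$x, class = spirals$classes)

# 'decay' is the L2 regularization parameter: larger values shrink the network
# weights and smooth the decision boundary.
fit_lo <- nnet(class ~ ., data = dat, size = 10, decay = 1e-4, maxit = 500, trace = FALSE)
fit_hi <- nnet(class ~ ., data = dat, size = 10, decay = 0.1,  maxit = 500, trace = FALSE)
c(lo = sum(fit_lo$wts^2), hi = sum(fit_hi$wts^2))   # squared weight norms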


%-------------------------------------------------------------------------------
%-------------------------------------------------------------------------------
\begin{frame}{Regularization in Nonlinear Models}

\small
\textbf{Setting}: Classification for the \texttt{spirals} data.
Neural network with a single hidden layer of 10 neurons and logistic
output activation, regularized with an $L2$ penalty term for $\lambda > 0$.
Varying $\lambda$ affects the smoothness of the decision boundary and the magnitude of
the network weights:

\vfill

\only<1>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-1.png}\end{center}}
\only<2>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-2.png}\end{center}}
\only<3>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-3.png}\end{center}}
\only<4>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-4.png}\end{center}}

%\only<5>{\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-5.png}}
%\only<6>{\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-6.png}}
\end{frame}

\begin{frame}{Regularization in Nonlinear Models}

The prevention of overfitting can also be seen in CV.
Same setting as before, but each $\lambda$ is now evaluated with
repeated CV (10 folds, 5 repetitions); a sketch of such an evaluation loop follows after this frame.

\begin{center}\includegraphics[width=0.7\textwidth]{figure/fig-regu-nonlin-srm-1.png}\end{center}

We see the typical U-shape with the sweet spot between overfitting (LHS, low $\lambda$) and
underfitting (RHS, high $\lambda$) in the middle.
\end{frame}
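A sketch of such an evaluation loop, reusing dat from the sketch above and plain (non-repeated) 10-fold CV with misclassification rate as the score (both simplifications relative to the lecture figure):

lambdas <- 10^seq(-5, 0, by = 1)
folds   <- sample(rep(1:10, length.out = nrow(dat)))

cv_error <- sapply(lambdas, function(lam) {
  mean(sapply(1:10, function(k) {
    fit  <- nnet(class ~ ., data = dat[folds != k, ], size = 10,
                 decay = lam, maxit = 500, trace = FALSE)
    pred <- predict(fit, newdata = dat[folds == k, ], type = "class")
    mean(pred != dat$class[folds == k])   # misclassification rate on the held-out fold
  }))
})
plot(log10(lambdas), cv_error, type = "b")  # typically traces out the U-shape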


%-------------------------------------------------------------------------------
\begin{vbframe} {Structural Risk Minimization}

\begin{itemize}
% \item Complex models generalize poorly (overfitting) if merely the empirical risk is optimized.
\item Thus far, we have only considered adding a complexity penalty to empirical risk minimization.
\item Instead, structural risk minimization (SRM) assumes that the hypothesis space $\Hspace$ can be decomposed into hypothesis classes of increasing complexity (size or capacity): $\Hspace = \cup_{k \geq 1 }\Hspace_{k}$.
\item Complexity parameters can be, e.g., the degree of the polynomials in linear models or the size of the hidden layers in neural networks.
\end{itemize}

\begin{center}
\includegraphics[width=0.5\textwidth]{figure_man/fig-regu-srm-1}
% FIGURE SOURCE:https://docs.google.com/drawings/d/1qFoFSyuY4glsNvgYgIZ96yRcznOdA5q3oogI5fVBQ1A/edit?usp=sharing
\end{center}

\framebreak


\begin{itemize}

\item SRM chooses the smallest $k$ such that the optimal model from $\Hspace_k$ found by ERM or RRM cannot be significantly
outperformed by a model from an $\Hspace_m$ with $m > k$ (one such selection rule is sketched after this frame).
\item In this way, the simplest model that minimizes the generalization bound can be chosen.
\item One challenge is choosing an adequate complexity measure, as multiple complexity measures exist for some models.
\end{itemize}

\begin{center}
\includegraphics[width=0.6\textwidth]{figure_man/fig-regu-srm-2}
% FIGURE SOURCE: https://docs.google.com/drawings/d/1mk_qVUbfOYwwmuE0AgmnPiNSMoX--pE_nZsWYND0IhQ/edit?usp=sharing
\end{center}

\end{vbframe}
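A sketch of one possible selection rule, reusing dat and folds from the sketches above and reading "not significantly outperformed" as a one-standard-error rule (this specific rule is an assumption, not taken from the lecture):

sizes <- 1:10
cv_by_size <- sapply(sizes, function(s) {
  errs <- sapply(1:10, function(k) {
    fit  <- nnet(class ~ ., data = dat[folds != k, ], size = s,
                 decay = 0.001, maxit = 500, trace = FALSE)
    pred <- predict(fit, newdata = dat[folds == k, ], type = "class")
    mean(pred != dat$class[folds == k])
  })
  c(mean = mean(errs), se = sd(errs) / sqrt(length(errs)))
})

best <- which.min(cv_by_size["mean", ])
# smallest capacity whose CV error is within one SE of the best one
min(sizes[cv_by_size["mean", ] <= cv_by_size["mean", best] + cv_by_size["se", best]])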

%-------------------------------------------------------------------------------
\begin{frame} {Structural Risk Minimization}

\small

\textbf{Setting}: Classification for the \texttt{spirals} data.
Neural network with a single hidden layer of $k$ neurons and logistic
output activation, $L2$-regularized with $\lambda = 0.001$.
Here, SRM and RRM are thus used together.
Varying the size of the hidden layer affects the smoothness of the decision boundary:


\vfill


\only<1>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-1.png}\end{center}}
\only<2>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-2.png}\end{center}}
\only<3>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-3.png}\end{center}}
\only<4>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-4.png}\end{center}}
\only<5>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-5.png}\end{center}}
\only<6>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-6.png}\end{center}}


\end{frame}

\begin{frame} {Structural Risk Minimization}
Again, we compare model complexity against the CV score.

\begin{center}\includegraphics[width=0.7\textwidth]{figure/fig-regu-nonlin-srm-2.png}\end{center}

A minimal model with good generalization seems to need about 6--8 hidden neurons.

\end{frame}


\begin{frame} {Structural Risk Minimization and RRM}

Note that ordinary RRM can also be interpreted through SRM if we rewrite the penalized ERM as a constrained ERM.

\begin{columns}
\begin{column}{0.5\textwidth}
\begin{eqnarray*}
\min_{\thetab} && \sumin \Lxyit \\
\text{s.t. } && \|\thetab\|_2^2 \leq t \\
\end{eqnarray*}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includegraphics[width=0.6\textwidth]{figure_man/ridge_hat.png}
\end{figure}
\end{column}
\end{columns}

\vspace{0.5cm}

We can interpret going through $\lambda$ from large to small as going through $t$ from small to large.
This constructs a series of ERM problems with hypothesis spaces $\Hspace_\lambda$,
where we constrain the norm of $\thetab$ to balls of growing radius (the correspondence is sketched below).
\end{frame}
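The correspondence can be made explicit via the Lagrangian (a standard argument, only sketched here): the constrained problem above has Lagrangian
$$\sumin \Lxyit + \lambda \left( \|\thetab\|_2^2 - t \right), \quad \lambda \geq 0,$$
so, under suitable regularity conditions, for every $t$ there is a $\lambda \geq 0$ such that the penalized problem $\min_{\thetab} \sumin \Lxyit + \lambda \|\thetab\|_2^2$ has the same solution, with small $t$ corresponding to large $\lambda$.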


%-------------------------------------------------------------------------------

% \section{Regularization from a Bayesian Perspective}
23 changes: 12 additions & 11 deletions slides/regularization/slides-regu-enetlogreg.tex
@@ -21,7 +21,7 @@

% \section{Elastic Net}

\begin{vbframe} {Elastic Net}
\begin{vbframe} {Elastic Net \citebutton{Zou and Hastie, 2005}{https://rss.onlinelibrary.wiley.com/doi/full/10.1111/j.1467-9868.2005.00503.x}}


Elastic Net combines the $L1$ and $L2$ penalties:
@@ -43,40 +43,41 @@
\framebreak
\footnotesize
Simulating 50 data sets with 100 observations each for two coefficient settings: \\


\vspace{-0.3cm}
$$\yv =\Xmat \boldsymbol{\theta}+ \epsilon, \quad \epsilon \sim N(0,1)$$
\begin{columns}
\vspace{-0.3cm}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{center}
\textbf{Ridge} performs better for: \\
{\footnotesize \textbf{Ridge} performs better for correlated features}: \\
$\boldsymbol{\theta}=(\underbrace{2,\ldots,2}_{5},\underbrace{0,\ldots,0}_{5})$\\
$ \operatorname{corr}(\Xmat_{i},\Xmat_{j})=0.8^{|i-j|}$ for all $i$ and $j$
\end{center}
\end{column}
\begin{column}{0.5\textwidth}
\begin{center}
\textbf{Lasso} performs better for: \\
{\footnotesize \textbf{Lasso} performs better for sparse truth/no correlation:} \\
$\boldsymbol{\theta}=(2, 2, 2,\underbrace{0,\ldots,0}_{7})$ \\
$\operatorname{corr}(\Xmat_{i},\Xmat_{j})= 0$ for all $i \neq j$, otherwise 1
\end{center}
\end{column}
\end{columns}

\begin{figure}
\includegraphics[width=1\textwidth]{figure/enet_lasso_ridge_mse.png}\\
\includegraphics[width=0.7\textwidth]{figure/enet_lasso_ridge_mse.png}\\
\end{figure}

{\normalsize $\implies$ Elastic Net handles both cases well}
\framebreak

\begin{figure}
\includegraphics[width=0.9\textwidth]{figure/enet_tradeoff.png}\\
\end{figure}


\normalsize
Since Elastic Net offers a compromise between Ridge and Lasso, it is suitable for both data situations.
\footnotesize
LHS: Ridge cannot perform variable selection, in contrast to Lasso/E-Net. Lasso ignores relevant features more often than E-Net (longer tails in the violin plot).\\
RHS: Ridge estimates of noise features hover around $0$, while Lasso/E-Net produce exact $0$s.
%Since Elastic Net offers a compromise between Ridge and Lasso, it is suitable for both data situations.

\end{vbframe}
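A minimal sketch of one such simulation run, assuming the glmnet and MASS packages; glmnet's alpha mixes the two penalties (alpha = 0 ridge, alpha = 1 lasso, values in between elastic net). The exact simulation code behind the figures is not reproduced here:

library(glmnet)   # ridge / lasso / elastic net via the 'alpha' mixing parameter
library(MASS)     # mvrnorm() for correlated Gaussian features

set.seed(1)
n <- 100; p <- 10
Sigma <- 0.8^abs(outer(1:p, 1:p, "-"))        # corr(X_i, X_j) = 0.8^|i - j|
X <- mvrnorm(n, mu = rep(0, p), Sigma = Sigma)
theta <- c(rep(2, 5), rep(0, 5))
y <- as.vector(X %*% theta + rnorm(n))

# Cross-validated fits for ridge, elastic net and lasso
fits <- lapply(c(ridge = 0, enet = 0.5, lasso = 1),
               function(a) cv.glmnet(X, y, alpha = a))
sapply(fits, function(f) min(f$cvm))          # compare CV mean squared errors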


