diff --git a/slides/information-theory/slides-info-ml.tex b/slides/information-theory/slides-info-ml.tex index 168c5e23..9e05f927 100644 --- a/slides/information-theory/slides-info-ml.tex +++ b/slides/information-theory/slides-info-ml.tex @@ -127,7 +127,7 @@ \begin{vbframe}{Entropy as prediction loss} Assume log-loss for a situation where you only model with a constant probability vector $\pi$. We know the optimal model under that loss: -$$\pik = \frac{n_k}{n} = \frac{\sumin [\yi = 1]}{n}$$ +$$\pik = \frac{n_k}{n} = \frac{\sumin [\yi = k]}{n}$$ What is the (average) risk of that minimal constant model? diff --git a/slides/information-theory/slides-info-mutual-info.tex b/slides/information-theory/slides-info-mutual-info.tex index 13f046b6..eef9932b 100644 --- a/slides/information-theory/slides-info-mutual-info.tex +++ b/slides/information-theory/slides-info-mutual-info.tex @@ -228,7 +228,7 @@ \begin{aligned} I(X ; Y) &= H(X) - H(X | Y) \\ I(X ; Y) &= H(Y) - H(Y | X) \\ -I(X ; Y) &\leq \min\{H(X),H(Y)\} \\ +I(X ; Y) &\leq \min\{H(X),H(Y)\} \text { if $X,Y$ are discrete RVs}\\ I(X ; Y) &= H(X) + H(Y) - H(X, Y) \\ I(X ; Y) &= I(Y ; X) \\ I(X ; X) &= H(X)\\ diff --git a/slides/regularization/chapter-order.tex b/slides/regularization/chapter-order.tex index da606196..3b48fb30 100644 --- a/slides/regularization/chapter-order.tex +++ b/slides/regularization/chapter-order.tex @@ -14,11 +14,11 @@ \subsection{Introduction to Regularization} \includepdf[pages=-]{../slides-pdf/slides-regu-intro.pdf} -\subsection{Ridge and Lasso Regression I} -\includepdf[pages=-]{../slides-pdf/slides-regu-l1l2.pdf} +\subsection{Ridge Regression} +\includepdf[pages=-]{../slides-pdf/slides-regu-l2.pdf} -\subsection{Ridge and Lasso Regression II} -\includepdf[pages=-]{../slides-pdf/slides-regu-l1l2-2.pdf} +\subsection{Lasso Regression} +\includepdf[pages=-]{../slides-pdf/slides-regu-l1.pdf} \subsection{Lasso vs. 
Ridge Regression} \includepdf[pages=-]{../slides-pdf/slides-regu-l1vsl2.pdf} @@ -29,11 +29,14 @@ \subsection{Elastic Net and Regularization for GLMs} %\subsection{Regularization for Underdetermined Problem} %\includepdf[pages=-]{../slides-pdf/slides-regu-underdetermined.pdf} -\subsection{L0 Regularization} -\includepdf[pages=-]{../slides-pdf/slides-regu-l0.pdf} +\subsection{Other Types of Regularization} +\includepdf[pages=-]{../slides-pdf/slides-regu-others.pdf} -\subsection{Nonlinear and Bayes} -\includepdf[pages=-]{../slides-pdf/slides-regu-nonlin-bayes.pdf} +\subsection{Regularization in Non-Linear Models} +\includepdf[pages=-]{../slides-pdf/slides-regu-nonlin.pdf} + +\subsection{Regularization and Bayesian Priors} +\includepdf[pages=-]{../slides-pdf/slides-regu-bayes.pdf} \subsection{Geometric Analysis of L2 Regularization and Weight Decay} \includepdf[pages=-]{../slides-pdf/slides-regu-geom-l2-wdecay.pdf} diff --git a/slides/regularization/figure_man/penalties-comparison.pdf b/slides/regularization/figure_man/penalties-comparison.pdf new file mode 100644 index 00000000..c1a54338 Binary files /dev/null and b/slides/regularization/figure_man/penalties-comparison.pdf differ diff --git a/slides/regularization/figure_man/soft-thresholding.pdf b/slides/regularization/figure_man/soft-thresholding.pdf new file mode 100644 index 00000000..12208bc0 Binary files /dev/null and b/slides/regularization/figure_man/soft-thresholding.pdf differ diff --git a/slides/regularization/rsrc/lasso-scad-mcp.R b/slides/regularization/rsrc/lasso-scad-mcp.R new file mode 100644 index 00000000..8c73179f --- /dev/null +++ b/slides/regularization/rsrc/lasso-scad-mcp.R @@ -0,0 +1,56 @@ +library(ggplot2) + +# Set the regularization parameters for demonstration +lambda <- 1 +a <- 3.7 # For SCAD, typically > 2 +gamma <- 3 # For MCP + +# Lasso Penalty Function +lasso_penalty <- function(theta) { + lambda * abs(theta) +} + +# SCAD Penalty Function +scad_penalty <- function(theta) { + ifelse(abs(theta) <= lambda, + lambda * abs(theta), + ifelse(abs(theta) <= a * lambda, + (-theta^2 + 2 * a * lambda * abs(theta) - lambda^2) / (2 * (a - 1)), + (a + 1) * lambda^2 / 2)) +} + +# MCP Penalty Function +mcp_penalty <- function(theta) { + ifelse(abs(theta) <= gamma * lambda, + lambda * abs(theta) - theta^2 / (2 * gamma), + 0.5 * gamma * lambda^2) +} + +# Create a sequence of theta values +theta_vals <- seq(-4, 4, by = 0.1) + +# Create a data frame for plotting +penalties <- data.frame( + theta = theta_vals, + Lasso = sapply(theta_vals, lasso_penalty), + SCAD = sapply(theta_vals, scad_penalty), + MCP = sapply(theta_vals, mcp_penalty) +) + +# Plot using ggplot2 +ggplot(penalties, aes(x = theta)) + + geom_line(aes(y = Lasso, color = "Lasso"), linewidth=1.2) + + geom_line(aes(y = SCAD, color = "SCAD"), linewidth=1.2) + + geom_line(aes(y = MCP, color = "MCP"), linewidth=1.2) + + labs(title = "Lasso, SCAD, and MCP", + x = expression(theta), + y = "Penalty") + + theme_minimal() + + theme( + plot.title = element_text(hjust = 0.5, size = 18), + axis.title = element_text(size = 16), + axis.text = element_text(size = 13), + legend.title = element_blank(), + legend.text = element_text(size=13) + ) + + scale_color_manual(values = c("Lasso" = "blue", "SCAD" = "red", "MCP" = "green")) \ No newline at end of file diff --git a/slides/regularization/rsrc/soft-thresholding.R b/slides/regularization/rsrc/soft-thresholding.R index c867f55a..5b96abba 100644 --- a/slides/regularization/rsrc/soft-thresholding.R +++ 
b/slides/regularization/rsrc/soft-thresholding.R @@ -31,7 +31,7 @@ p <- ggplot(data, aes(x = rho)) + geom_line(aes(y = theta), color = 'blue', linetype = "solid", size=1.2) + geom_line(aes(y = OLS), color = 'grey', linetype = "dashed", size=1.2) + geom_line(aes(y = Ridge), color = 'red', linetype = "solid", size=1.2) + - labs(x = expression(theta[j]), y = expression(theta[j]), title = 'Lasso vs Ridge solution in terms of OLS (orthonormal design)') + + labs(x = expression(theta[j]), y = expression(theta[j]), title = 'Lasso vs Ridge solution in terms of OLS (orthonormal design, lambda=3)') + theme_minimal() + theme( plot.title = element_text(hjust = 0.5, size = 20), diff --git a/slides/regularization/slides-regu-nonlin-bayes.tex b/slides/regularization/slides-regu-bayes.tex similarity index 57% rename from slides/regularization/slides-regu-nonlin-bayes.tex rename to slides/regularization/slides-regu-bayes.tex index 02bad29d..be227b51 100644 --- a/slides/regularization/slides-regu-nonlin-bayes.tex +++ b/slides/regularization/slides-regu-bayes.tex @@ -5,10 +5,8 @@ \newcommand{\titlefigure}{figure_man/bayes_reg.png} \newcommand{\learninggoals}{ - \item Understand that regularization and parameter shrinkage can be applied to non-linear models - \item Know structural risk minimization - \item Know how regularization risk minimization is same as MAP - in Bayesian perspective, where penalty corresponds to a parameter prior + \item Know how regularized risk minimization is same as MAP in Bayesian perspective + \item Know correspondence of Gaussian/Laplace priors and $L2$/$L1$ regularization } \title{Introduction to Machine Learning} @@ -16,185 +14,9 @@ \begin{document} -\lecturechapter{Regularization in Non-Linear Models and Bayesian Priors} +\lecturechapter{Regularization and Bayesian Priors} \lecture{Introduction to Machine Learning} -%------------------------------------------------------------------------------- - -\begin{vbframe}{Summary: Regularized Risk Minimization} - -If we should define ML in only one line, this might be it: - -$$ -\min_{\thetab} \riskrt= \min_{\thetab} \left(\sumin \Lxyit + \lambda \cdot J(\thetab) \right) -$$ - -We can choose for a task at hand: - -\begin{itemize} - \item the \textbf{hypothesis space} of $f$, which determines how features can - influence the predicted $y$ - \item the \textbf{loss} function $L$, which measures how errors should be treated - \item the \textbf{regularization} $J(\thetab)$, which encodes our inductive - bias and preference for certain simpler models -\end{itemize} - -\vfill - -By varying these choices one can construct a huge number of different ML models. -Many ML models follow this construction principle or can be interpreted through -the lens of regularized risk minimization. - -\end{vbframe} - -%------------------------------------------------------------------------------- - -\begin{vbframe}{Regularization in Nonlinear Models} - -\begin{itemize} - \item So far we have mainly considered regularization in LMs. - \item Can also be applied to non-linear models (with numeric parameters), where it is - often important to prevent overfitting. - \item Here, we typically use $L2$ regularization, which - still results in parameter shrinkage and weight decay. - \item By adding regularization, prediction surfaces in regression and - classification become smoother. - \item Note: In the chapter on non-linear SVMs we will study the effects of - regularization on a non-linear model in detail. 
-\end{itemize} - -\end{vbframe} - - -%------------------------------------------------------------------------------- -%------------------------------------------------------------------------------- -\begin{frame}{Regularization in Nonlinear Models} - -\small -\textbf{Setting}: Classification for the \texttt{spirals} data. -Neural network with single hidden layer containing 10 neurons and logistic -output activation, regularized with $L2$ penalty term for $\lambda > 0$. -Varying $\lambda$ affects smoothness of the decision boundary and magnitude of -network weights: - -\vfill - -\only<1>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-1.png}\end{center}} -\only<2>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-2.png}\end{center}} -\only<3>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-3.png}\end{center}} -\only<4>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-4.png}\end{center}} - -%\only<5>{\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-5.png}} -%\only<6>{\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-6.png}} -\end{frame} - -\begin{frame}{Regularization in Nonlinear Models} - -The prevention of overfitting can also be seen in CV. -Same settings as before, but each $\lambda$ is evaluated with -repeated CV (10 folds, 5 reps). - -\begin{center}\includegraphics[width=0.7\textwidth]{figure/fig-regu-nonlin-srm-1.png}\end{center} - -We see the typical U-shape with the sweet spot between overfitting (LHS, low $\lambda$) and -underfitting (RHS, high $\lambda$) in the middle. -\end{frame} - - -%------------------------------------------------------------------------------- -\begin{vbframe} {Structural Risk Minimization} - -\begin{itemize} - % \item Complex models generalize poorly (overfitting) if merely the empirical risk is optimized. - \item Thus far, we only considered adding a complexity penalty to empirical risk minimization. - \item Instead, structural risk minimization (SRM) assumes that the hypothesis space $\Hspace$ can be decomposed into increasingly complex hypotheses (size or capacity): $\Hspace = \cup_{k \geq 1 }\Hspace_{k}$. - \item Complexity parameters can be the, e.g. the degree of polynomials in linear models or the size of hidden layers in neural networks. -\end{itemize} - -\begin{center} -\includegraphics[width=0.5\textwidth]{figure_man/fig-regu-srm-1} -% FIGURE SOURCE:https://docs.google.com/drawings/d/1qFoFSyuY4glsNvgYgIZ96yRcznOdA5q3oogI5fVBQ1A/edit?usp=sharing -\end{center} - -\framebreak - - -\begin{itemize} - - \item SRM chooses the smallest $k$ such that the optimal model from $\Hspace_k$ found by ERM or RRM cannot significantly - be outperformed by a model from a $\Hspace_m$ with $m > k$. - \item By this, the simplest model can be chosen, which minimizes the generalization bound. - \item One challenge might be choosing an adequate complexity measure, as for some models, multiple complexity measures exist. -\end{itemize} - -\begin{center} -\includegraphics[width=0.6\textwidth]{figure_man/fig-regu-srm-2} -% FIGURE SOURCE: https://docs.google.com/drawings/d/1mk_qVUbfOYwwmuE0AgmnPiNSMoX--pE_nZsWYND0IhQ/edit?usp=sharing -\end{center} - -\end{vbframe} - -%------------------------------------------------------------------------------- -\begin{frame} {Structural Risk Minimization} - -\small - -\textbf{Setting}: Classification for the \texttt{spirals} data. 
-Neural network with single hidden layer containing $k$ neurons and logistic -output activation, L2 regularized with $\lambda = 0.001$. -So here SRM and RRM are both used. -Varying the size of the hidden layer affects smoothness of the decision boundary: - - -\vfill - - -\only<1>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-1.png}\end{center}} -\only<2>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-2.png}\end{center}} -\only<3>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-3.png}\end{center}} -\only<4>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-4.png}\end{center}} -\only<5>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-5.png}\end{center}} -\only<6>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-6.png}\end{center}} - - -\end{frame} - -\begin{frame} {Structural Risk Minimization} -Again, complexity vs CV score. - -\begin{center}\includegraphics[width=0.7\textwidth]{figure/fig-regu-nonlin-srm-2.png}\end{center} - -A minimal model with good generalization seems to have ca. 6-8 hidden neurons. - -\end{frame} - - -\begin{frame} {Structural Risk Minimization and RRM} - -Note that normal RRM can also be interpreted through SRM, if we rewrite the penalized ERM as constrained ERM. - -\begin{columns} -\begin{column}{0.5\textwidth} -\begin{eqnarray*} -\min_{\thetab} && \sumin \Lxyit \\ - \text{s.t. } && \|\thetab\|_2^2 \leq t \\ -\end{eqnarray*} -\end{column} -\begin{column}{0.5\textwidth} -\begin{figure} -\includegraphics[width=0.6\textwidth]{figure_man/ridge_hat.png} -\end{figure} -\end{column} -\end{columns} - -\vspace{0.5cm} - -We can interpret going through $\lambda$ from large to small as through $t$ from small to large. -This constructs a series of ERM problems with hypothesis spaces $\Hspace_\lambda$, -where we constrain the norm of $\thetab$ to unit balls of growing size. 
-\end{frame} - - %------------------------------------------------------------------------------- % \section{Regularization from a Bayesian Perspective} diff --git a/slides/regularization/slides-regu-enetlogreg.tex b/slides/regularization/slides-regu-enetlogreg.tex index 9f819fe4..59fa6393 100644 --- a/slides/regularization/slides-regu-enetlogreg.tex +++ b/slides/regularization/slides-regu-enetlogreg.tex @@ -21,7 +21,7 @@ % \section{Elastic Net} -\begin{vbframe} {Elastic Net} +\begin{vbframe} {Elastic Net \citebutton{Zou and Hastie, 2005}{https://rss.onlinelibrary.wiley.com/doi/full/10.1111/j.1467-9868.2005.00503.x}} Elastic Net combines the $L1$ and $L2$ penalties: @@ -43,21 +43,20 @@ \framebreak \footnotesize Simulating 50 data sets with 100 observations each for two coefficient settings: \\ - - +\vspace{-0.3cm} $$\yv =\Xmat \boldsymbol{\theta}+ \epsilon, \quad \epsilon \sim N(0,1)$$ - - \begin{columns} +\vspace{-0.3cm} +\begin{columns} \begin{column}{0.5\textwidth} \begin{center} -\textbf{Ridge} performs better for: \\ +{\footnotesize \textbf{Ridge} performs better for correlated features}: \\ $\boldsymbol{\theta}=(\underbrace{2,\ldots,2}_{5},\underbrace{0,\ldots,0}_{5})$\\ $ \operatorname{corr}(\Xmat_{i},\Xmat_{j})=0.8^{|i-j|}$ for all $i$ and $j$ \end{center} \end{column} \begin{column}{0.5\textwidth} \begin{center} -\textbf{Lasso} performs better for: \\ +{\footnotesize \textbf{Lasso} performs better for sparse truth/no correlation:} \\ $\boldsymbol{\theta}=(2, 2, 2,\underbrace{0,\ldots,0}_{7})$ \\ $\operatorname{corr}(\Xmat_{i},\Xmat_{j})= 0$ for all $i \neq j$, otherwise 1 \end{center} @@ -65,9 +64,9 @@ \end{columns} \begin{figure} -\includegraphics[width=1\textwidth]{figure/enet_lasso_ridge_mse.png}\\ +\includegraphics[width=0.7\textwidth]{figure/enet_lasso_ridge_mse.png}\\ \end{figure} - +{\normalsize $\implies$ Elastic Net handles both cases well} \framebreak \begin{figure} @@ -75,8 +74,10 @@ \end{figure} -\normalsize -Since Elastic Net offers a compromise between Ridge and Lasso, it is suitable for both data situations. +\footnotesize +LHS: Ridge can not perform variable selection compared to Lasso/E-Net. Lasso more often ignores relevant features than E-Net (longer tails in violin plot).\\ +RHS: Ridge estimates of noise features hover around $0$ while Lasso/E-Net produce $0$s. +%Since Elastic Net offers a compromise between Ridge and Lasso, it is suitable for both data situations. \end{vbframe} diff --git a/slides/regularization/slides-regu-l0.tex b/slides/regularization/slides-regu-l0.tex deleted file mode 100644 index b2599b8b..00000000 --- a/slides/regularization/slides-regu-l0.tex +++ /dev/null @@ -1,59 +0,0 @@ -\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} -\input{../../style/preamble} -\input{../../latex-math/basic-math} -\input{../../latex-math/basic-ml} - -\newcommand{\titlefigure}{figure_man/lasso_ridge_hat.png} -\newcommand{\learninggoals}{ - \item Know Lq (quasi-)norm regularization - \item Understand that L0 regularization simply counts the number of non-zero parameters -} - -\title{Introduction to Machine Learning} -\date{} - -\begin{document} - -\lecturechapter{L0 Regularization} -\lecture{Introduction to Machine Learning} - -\begin{vbframe}{Lq norm Regularization} - -Besides $L1$ and $L2$ norm we could use any $Lq$ norm for regularization. 
- -\begin{figure} - \scalebox{0.7}{\includegraphics{figure_man/lasso_ridge_hat.png}}\\ -%\includegraphics[height=2.3cm]{figure_man/contour.pdf} -\caption{\textit{Top:} Ridge and Lasso loss contours and feasible regions. - \textit{Bottom:} Different feasible region shapes for $Lq$ norms $\sum_j |\theta_j|^q$.} -\end{figure} - -\end{vbframe} - - -\begin{vbframe} {L0 regularization} - - \begin{itemize} - \item Consider the $L0$-regularized risk of a model $\fxt$ - $$ - \riskrt = \risket + \lambda \|\thetab\|_0 := \risket + \lambda \sum_j |\theta_j|^0. - $$ - \item Unlike the $L1$ and $L2$ norms, the $L0$ "norm" simply counts the number of non-zero parameters in the model. - \vspace{0.3cm} - \begin{figure} - \centering - \scalebox{0.99}{\includegraphics{figure_man/lq-penalty-plots.png}} - %\tiny{\\ Credit: Christos Louizos} - \caption{\footnotesize $Lq$ (quasi-)norm penalties for a scalar parameter $\thetab$ for different values of $q$} - \end{figure} - \item For any parameter $\thetab$, the $L0$ penalty is zero for $\thetab = 0$ (defining $0^0 := 0$) and is constant for any $\thetab \neq 0$, no matter how large or small it is. - \item $L0$ regularization induces sparsity in the parameter vector more aggressively than $L1$ regularization, but does not shrink concrete parameter values as L1 and L2 does. - \item Model selection criteria such as Akaike Information Criterion (AIC) and Bayesian Information Criterion (BIC) are special cases of $L0$ regularization (corresponding to specific values of $\lambda$). - \item The $L0$-regularized risk is neither continuous, differentiable or convex. - \item It is computationally hard to optimize (NP-hard) and likely intractable. - For smaller $n$ and $p$ we might be able to solve this nowadays directly, for larger scenarios efficient approximations of the $L0$ are still topic of current research. 
- \end{itemize} -\end{vbframe} - -\endlecture -\end{document} diff --git a/slides/regularization/slides-regu-l1l2-2.tex b/slides/regularization/slides-regu-l1.tex similarity index 75% rename from slides/regularization/slides-regu-l1l2-2.tex rename to slides/regularization/slides-regu-l1.tex index 9184ae28..6445d35b 100644 --- a/slides/regularization/slides-regu-l1l2-2.tex +++ b/slides/regularization/slides-regu-l1.tex @@ -5,9 +5,8 @@ \newcommand{\titlefigure}{figure/lin_reg_l1.png} \newcommand{\learninggoals}{ - \item Know the regularized linear model - \item Know Ridge regression ($L2$ penalty) \item Know Lasso regression ($L1$ penalty) + \item Know the properties of $L1$ regularization } \title{Introduction to Machine Learning} @@ -15,7 +14,7 @@ \begin{document} -\lecturechapter{Ridge and Lasso Regression II} +\lecturechapter{Lasso Regression} \lecture{Introduction to Machine Learning} % \section{Lasso Regression} @@ -24,9 +23,10 @@ Another shrinkage method is the so-called \textbf{Lasso regression} ({\scriptsize{least absolute shrinkage and selection operator}}), which uses an $L1$ penalty on $\thetab$: \vspace{0.4cm} -\begin{eqnarray*} -\thetah_{\text{Lasso}}= \argmin_{\thetab} \underbrace{\sumin \left(\yi - \thetab^T \xi\right)^2}_{\left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right)} + \lambda \|\thetab\|_1 -\end{eqnarray*} +\begin{align*} +\thetah_{\text{Lasso}}&= \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi\right)^2 + \lambda \sum_{j=1}^{p} \vert\theta_j\vert\\ +&= \argmin_{\thetab}\left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \|\thetab\|_1 +\end{align*} \vspace{0.4cm} @@ -36,13 +36,34 @@ \framebreak -Let $y=3x_{1} -2x_{2} +\epsilon $, $ \epsilon \sim N( 0,1)$. The true minimizer is $\theta ^{*} =( 3,-2)^{T}$. +Let $y=3x_{1} -2x_{2} +\epsilon $, $ \epsilon \sim N( 0,1)$. The true minimizer is $\theta ^{*} =( 3,-2)^{T}$. \\ +\vspace{0.1cm} +Left plot shows effect of $L1$ regularization, right plot shows corresponding with $L2$ for comparison: +\begin{columns} +\begin{column}{0.5\textwidth} +\lz +\begin{figure} +\includegraphics[width=0.99\textwidth]{figure/lin_reg_l1.png} +\end{figure} +\end{column} +\begin{column}{0.5\textwidth} +\lz \begin{figure} -\includegraphics[width=0.8\textwidth]{figure/lin_reg_l1.png} +\includegraphics[width=0.99\textwidth]{figure/lin_reg_l2.png} \end{figure} +\end{column} +\end{columns} + +%\begin{figure} +%\includegraphics[width=0.8\textwidth]{figure/lin_reg_l1.png} +%\end{figure} + +%\begin{figure} +%\includegraphics[width=0.8\textwidth]{figure/lin_reg_l2.png} +%\end{figure} -With increasing regularization, $\theta_{\textit{reg}}$ is pulled back to the origin. +With increasing regularization, $\hat{\theta}_{\textit{Lasso}}$ is pulled back to the origin, but takes a different ``route''. %\textbf{NB}: lasso=least absolute shrinkage and selection operator. @@ -53,7 +74,7 @@ \includegraphics[width=0.85\textwidth]{figure/lasso_contours.png} \end{figure} -Green marks true minimizer and red the estimates. +Green = true minimizer of the unreg.objective and red = lasso solution. \framebreak @@ -83,7 +104,7 @@ %Soft threshold ensures exact zeros, while $L2$ penalty shrinks uniformly. 
\vspace{-0.16cm} \begin{figure} -\includegraphics[width=0.52\textwidth]{figure_man/soft-threshold-ridge-ols.pdf}\\ +\includegraphics[width=0.5\textwidth]{figure_man/soft-thresholding.pdf}\\ \end{figure} \end{vbframe} diff --git a/slides/regularization/slides-regu-l1vsl2.tex b/slides/regularization/slides-regu-l1vsl2.tex index 4a898676..c0f021fa 100644 --- a/slides/regularization/slides-regu-l1vsl2.tex +++ b/slides/regularization/slides-regu-l1vsl2.tex @@ -32,7 +32,7 @@ \item \small{In both cases, the solution which minimizes $\riskrt$ is always a point on the boundary of the feasible region (for sufficiently large $\lambda$). \item As expected, $\hat{\thetab}_{\text{Lasso}}$ and $\hat{\thetab}_{\text{Ridge}}$ have smaller parameter norms than $\thetah$.} \item For Lasso, the solution likely touches vertices of the constraint region. This induces sparsity and is a form of variable selection. - \item In the $p>n$ case, the Lasso selects at most $n$ features (due to the nature of the convex optimization problem). + \item In the $p>n$ case, Lasso selects at most $n$ features \citebutton{Zou and Hastie, 2005}{https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-9868.2005.00503.x}. \end{itemize} @@ -128,7 +128,8 @@ \includegraphics[width=0.9\textwidth]{figure/regu_example_multicollinearity.png} -Consider $n=100$ simulated observations using $y = 0.2X_1 + 0.2X_2 + 0.2X_3 + 0.2X_4 + 0.2X_5 + \epsilon$, $\epsilon \sim \normal (0,1)$. $X_1$-$X_4$ are independently drawn from different normal distributions: $X_1, X_2, X_3, X_4 \sim \normal (0,2)$. While $X_1$-$X_4$ have pairwise correlation coefficients of 0, $X_4$ and $X_5$ are nearly perfectly correlated: $X_5 = X_4 + \delta, \delta \sim \normal (0,0.3), \rho(X_4, X_5) = 0.98. $ +Consider $n=100$ simulated observations using $y = 0.2X_1 + 0.2X_2 + 0.2X_3 + 0.2X_4 + 0.2X_5 + \epsilon$.\\ +$X_1$-$X_4$ are independent, but $X_4$ and $X_5$ are strongly correlated. \vspace{0.1cm} @@ -137,12 +138,12 @@ \end{vbframe} -\begin{vbframe}{Summarizing Comments} +\begin{vbframe}{Synopsis \citebutton{Tibshirani, 1996}{https://www.jstor.org/stable/2346178} \citebutton{Zou and Hastie, 2005}{https://rss.onlinelibrary.wiley.com/doi/full/10.1111/j.1467-9868.2005.00503.x}} \begin{itemize} -\item Neither one can be classified as overall better (no free lunch!) -\item Lasso is likely better if true underlying structure is sparse, so if only few features influence $y$. Ridge works well if there are many (weakly) influential features. +\item Neither one can be classified as overall better \item Lasso can set some coefficients to zero, thus performing variable selection, while Ridge regression usually leads to smaller estimated coefficients, but still dense parameter vectors $\thetab$. +\item Lasso is likely better if true underlying structure is sparse, so if only few features influence $y$. Ridge works well if there are many (weakly) influential features. \item Lasso has difficulties handling correlated predictors. For high correlation Ridge dominates Lasso in performance. \item For Lasso one of the correlated predictors will have a larger coefficient, while the rest are (nearly) zeroed. The respective feature is, however, selected randomly. \item For Ridge the coefficients of correlated features are similar. 
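The soft-thresholding figure and the synopsis above can be sanity-checked with a few lines of base R. The following is a minimal sketch (not a repo script) of the closed-form Lasso and Ridge solutions expressed in terms of the OLS coefficient under an orthonormal design; the exact threshold (lambda vs. lambda/2) depends on whether the squared loss carries a factor 1/2, so the scaling is illustrative only, with lambda = 3 chosen to mirror the updated title of rsrc/soft-thresholding.R.

```r
# Minimal sketch: Lasso soft-thresholding vs. Ridge shrinkage of an OLS
# coefficient under an orthonormal design (X'X = I). The threshold scaling
# (lambda vs. lambda/2) depends on the loss convention; lambda = 3 only
# mirrors the title of rsrc/soft-thresholding.R.
lambda <- 3

lasso_from_ols <- function(theta_ols, lambda) {
  # soft-threshold: exact zeros whenever |theta_ols| <= lambda
  sign(theta_ols) * pmax(abs(theta_ols) - lambda, 0)
}

ridge_from_ols <- function(theta_ols, lambda) {
  # uniform multiplicative shrinkage: never exactly zero
  theta_ols / (1 + lambda)
}

theta_ols <- seq(-6, 6, by = 1)
cbind(OLS   = theta_ols,
      Lasso = lasso_from_ols(theta_ols, lambda),
      Ridge = ridge_from_ols(theta_ols, lambda))
```

The Lasso column is exactly zero whenever the OLS coefficient lies within the threshold, while the Ridge column only rescales it: the sparsity vs. dense-shrinkage contrast summarized in the synopsis.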
diff --git a/slides/regularization/slides-regu-l1l2.tex b/slides/regularization/slides-regu-l2.tex similarity index 88% rename from slides/regularization/slides-regu-l1l2.tex rename to slides/regularization/slides-regu-l2.tex index c25035a1..9eb71fb6 100644 --- a/slides/regularization/slides-regu-l1l2.tex +++ b/slides/regularization/slides-regu-l2.tex @@ -3,7 +3,7 @@ \input{../../latex-math/basic-math} \input{../../latex-math/basic-ml} -\newcommand{\titlefigure}{figure_man/solution_path_l2.png} +\newcommand{\titlefigure}{figure/ridge_outside.png} \newcommand{\learninggoals}{ \item Know the regularized linear model \item Know Ridge regression ($L2$ penalty) @@ -16,7 +16,7 @@ \begin{document} -\lecturechapter{Ridge and Lasso Regression I} +\lecturechapter{Ridge Regression} \lecture{Introduction to Machine Learning} @@ -54,8 +54,8 @@ \vspace{0.2cm} \textbf{Ridge regression} uses a simple $L2$ penalty: \begin{eqnarray*} -\thetah_{\text{Ridge}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2 \\ -&=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \thetab^\top \thetab +\thetah_{\text{Ridge}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \sum_{j=1}^{p} \theta_j^2 \\ +&=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \underbrace{\thetab^\top \thetab}_{\|\thetab\|_2^2} \end{eqnarray*} Optimization is possible (as in the normal LM) in analytical form: @@ -70,17 +70,17 @@ \begin{figure} \includegraphics[width=0.8\textwidth]{figure/lin_reg_l2.png} \end{figure} - -With increasing regularization, $\theta_{\textit{reg}}$ is pulled back to the origin. +%\vspace{-0.2cm} +With increasing regularization, $\hat{\theta}_{\textit{Ridge}}$ is pulled back to the origin. \framebreak Contours of regularized objective for different $\lambda$ values. \begin{figure} -\includegraphics[width=0.85\textwidth]{figure/ridge_contours.png} +\includegraphics[width=0.8\textwidth]{figure/ridge_contours.png} \end{figure} - -Green marks true minimizer and red the estimates. +\vspace{-0.2cm} +Green = true minimizer of the unreg.objective and red = ridge solution. \framebreak @@ -104,18 +104,19 @@ \end{footnotesize} \framebreak - + \begin{columns} \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=\textwidth]{figure_man/solution_path_l2.png} +\includegraphics[width=\textwidth]{figure/ridge_inside.png} \end{figure} \end{column} \begin{column}{0.5\textwidth} \begin{footnotesize} \begin{itemize} + \item Inside constraints perspective: From origin, jump from contour line to contour line (better) until you become infeasible, stop before. \item We still optimize the $\risket$, but cannot leave a ball around the origin. \item $\risket$ grows monotonically if we move away from $\thetah$ (elliptic contours). \item Solution path moves from origin to border of feasible region with minimal $L_2$ distance. @@ -124,24 +125,6 @@ \end{column} \end{columns} -\framebreak - -\begin{columns} -\begin{column}{0.5\textwidth} -\lz -\begin{figure} -\includegraphics[width=\textwidth]{figure/ridge_inside.png} -\end{figure} -\end{column} - -\begin{column}{0.5\textwidth} -\begin{footnotesize} -\begin{itemize} - \item Inside constraints perspective: From origin, jump from contour line to contour line (better) until you become infeasible, stop before. 
-\end{itemize} -\end{footnotesize} -\end{column} -\end{columns} \framebreak @@ -149,7 +132,7 @@ \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=\textwidth]{figure_man/solution_path.png} +\includegraphics[width=\textwidth]{figure/ridge_outside.png} \end{figure} \end{column} @@ -157,7 +140,9 @@ \begin{footnotesize} \begin{itemize} - \item Solution path moves from unregularized estimate to feasible region of regularized objective with minimal $L_2$ distance. + \item Outside constraints perspective: From $\thetah$, jump from contour line to contour line (worse) until you become feasible, stop then. + \item So our new optimum will lie on the boundary of that ball. + \item Solution path moves from unregularized estimate to feasible region of regularized objective with minimal $L_2$ distance. \end{itemize} \end{footnotesize} \end{column} @@ -169,22 +154,23 @@ \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=\textwidth]{figure/ridge_outside.png} +\includegraphics[width=\textwidth]{slides/regularization/figure_man/solution-path-ridge-only.png} \end{figure} \end{column} \begin{column}{0.5\textwidth} +\lz \begin{footnotesize} \begin{itemize} - - \item Outside constraints perspective: From $\thetah$, jump from contour line to contour line (worse) until you become feasible, stop then. - \item So our new optimum will lie on the boundary of that ball. + \item Here we can see entire solution path for Ridge regression + \item Cyan contours indicate feasible regions induced by different $\lambda$s + \item Red contour lines indicate different levels of the unreg. objective + \item Ridge solution (red points) gets pulled toward origin for increasing $\lambda$ \end{itemize} \end{footnotesize} \end{column} \end{columns} - \end{vbframe} @@ -250,7 +236,7 @@ \includegraphics[width=0.7\textwidth]{figure/lin_reg_l1.png} \end{figure} -With increasing regularization, $\theta_{\textit{reg}}$ is pulled back to the origin. Contours = unreg. objective, dots = reg. solution for increasing $\lambda$. +With increasing regularization, $\theta_{\textit{Ridge}}$ is pulled back to the origin. Contours = unreg. objective, dots = reg. solution for increasing $\lambda$. %\textbf{NB}: lasso=least absolute shrinkage and selection operator. 
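The Ridge deck states the analytical solution theta_Ridge = (X'X + lambda*I)^{-1} X'y; the short, self-contained R sketch below (toy data and seed are illustrative assumptions, not repo code) makes the pull toward the origin for growing lambda concrete, matching the new ridge_inside/ridge_outside solution-path figures.

```r
# Analytical Ridge estimator (no intercept, as in the slides' toy example):
# theta_ridge(lambda) = (X'X + lambda * I)^{-1} X'y
set.seed(1)
n <- 100
X <- cbind(x1 = rnorm(n), x2 = rnorm(n))
y <- 3 * X[, "x1"] - 2 * X[, "x2"] + rnorm(n)  # y = 3*x1 - 2*x2 + eps, as on the slide

ridge_coef <- function(X, y, lambda) {
  solve(t(X) %*% X + lambda * diag(ncol(X)), t(X) %*% y)
}

sapply(c(0, 1, 10, 100, 1000), function(l) ridge_coef(X, y, l))
# columns move from roughly (3, -2) at lambda = 0 toward (0, 0) as lambda grows
```

At lambda = 0 this reproduces the OLS fit; as lambda grows, both coefficients shrink smoothly toward zero without ever becoming exactly zero.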
diff --git a/slides/regularization/slides-regu-nonlin.tex b/slides/regularization/slides-regu-nonlin.tex new file mode 100644 index 00000000..c632dab6 --- /dev/null +++ b/slides/regularization/slides-regu-nonlin.tex @@ -0,0 +1,197 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} + +\newcommand{\titlefigure}{figure/fig-regu-nonlin-2.png} +\newcommand{\learninggoals}{ + \item Understand that regularization and parameter shrinkage can be applied to non-linear models + \item Know structural risk minimization +} + +\title{Introduction to Machine Learning} +\date{} + +\begin{document} + +\lecturechapter{Regularization in Non-Linear Models and Structural Risk Minimization} +\lecture{Introduction to Machine Learning} + +%------------------------------------------------------------------------------- + +\begin{vbframe}{Summary: Regularized Risk Minimization} + +If we should define (supervised) ML in only one line, this might be it: + +$$ +\min_{\thetab} \riskrt= \min_{\thetab} \left(\sumin \Lxyit + \lambda \cdot J(\thetab) \right) +$$ + +We can choose for a task at hand: + +\begin{itemize} + \item the \textbf{hypothesis space} of $f$, which determines how features can + influence the predicted $y$ + \item the \textbf{loss} function $L$, which measures how errors should be treated + \item the \textbf{regularization} $J(\thetab)$, which encodes our inductive + bias and preference for certain simpler models +\end{itemize} + +\vfill + +By varying these choices one can construct a huge number of different ML models. +Many ML models follow this construction principle or can be interpreted through +the lens of regularized risk minimization. + +\end{vbframe} + +%------------------------------------------------------------------------------- + +\begin{vbframe}{Regularization in Nonlinear Models} + +\begin{itemize} + \item So far we have mainly considered regularization in LMs. + \item Can also be applied to non-linear models (with numeric parameters), where it is + often important to prevent overfitting. + \item Here, we typically use $L2$ regularization, which + still results in parameter shrinkage and weight decay. + \item Adding regularization is commonplace and sometimes crucial in non-linear methods such as NNs, SVMs, or boosting. + \item By adding regularization, prediction surfaces in regression and + classification become smoother. + \item Note: In the chapter on non-linear SVMs we will study the effects of + regularization on a non-linear model in detail. +\end{itemize} + +\end{vbframe} + + +%------------------------------------------------------------------------------- +%------------------------------------------------------------------------------- +\begin{frame}{Regularization in Nonlinear Models} + +\small +\textbf{Setting}: Classification for the \texttt{spirals} data. +Neural network with single hidden layer containing 10 neurons, regularized with $L2$. 
\\ + +Varying $\lambda$ affects smoothness of the decision boundary and magnitude of +network weights: + +\vfill + +\only<1>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-1.png}\end{center}} +\only<2>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-2.png}\end{center}} +\only<3>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-3.png}\end{center}} +\only<4>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-4.png}\end{center}} + +%\only<5>{\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-5.png}} +%\only<6>{\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-6.png}} +\end{frame} + +\begin{frame}{Regularization in Nonlinear Models} + +The prevention of overfitting can also be seen in CV. +Same settings as before, but each $\lambda$ is evaluated with +repeated CV (10 folds, 5 reps). + +\begin{center}\includegraphics[width=0.7\textwidth]{figure/fig-regu-nonlin-srm-1.png}\end{center} + +We see the typical U-shape with the sweet spot between overfitting (LHS, low $\lambda$) and +underfitting (RHS, high $\lambda$) in the middle. +\end{frame} + + +%------------------------------------------------------------------------------- +\begin{vbframe} {Structural Risk Minimization} + +\begin{itemize} + % \item Complex models generalize poorly (overfitting) if merely the empirical risk is optimized. + \item Thus far, we only considered adding a complexity penalty to empirical risk minimization. + \item Instead, structural risk minimization (SRM) assumes that the hypothesis space $\Hspace$ can be decomposed into increasingly complex hypotheses (size or capacity): $\Hspace = \cup_{k \geq 1 }\Hspace_{k}$. + \item Complexity parameters can be the, e.g. the degree of polynomials in linear models or the size of hidden layers in neural networks. +\end{itemize} + +\begin{center} +\includegraphics[width=0.5\textwidth]{figure_man/fig-regu-srm-1} +% FIGURE SOURCE:https://docs.google.com/drawings/d/1qFoFSyuY4glsNvgYgIZ96yRcznOdA5q3oogI5fVBQ1A/edit?usp=sharing +\end{center} + +\framebreak + + +\begin{itemize} + + \item SRM chooses the smallest $k$ such that the optimal model from $\Hspace_k$ found by ERM or RRM cannot significantly + be outperformed by a model from a $\Hspace_m$ with $m > k$. + \item By this, the simplest model can be chosen, which minimizes the generalization bound. + \item One challenge might be choosing an adequate complexity measure, as for some models, multiple complexity measures exist. +\end{itemize} + +\begin{center} +\includegraphics[width=0.6\textwidth]{figure_man/fig-regu-srm-2} +% FIGURE SOURCE: https://docs.google.com/drawings/d/1mk_qVUbfOYwwmuE0AgmnPiNSMoX--pE_nZsWYND0IhQ/edit?usp=sharing +\end{center} + +\end{vbframe} + +%------------------------------------------------------------------------------- +\begin{frame} {Structural Risk Minimization} + +\small + +\textbf{Setting}: Classification for the \texttt{spirals} data. +NN with 1 hidden layer, and fixed (small) L2 penalty. 
\\ +Varying the size of the hidden layer affects smoothness of the decision boundary: + + +\vfill + + +\only<1>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-1.png}\end{center}} +\only<2>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-2.png}\end{center}} +\only<3>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-3.png}\end{center}} +\only<4>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-4.png}\end{center}} +\only<5>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-5.png}\end{center}} +\only<6>{\begin{center}\includegraphics[width=0.5\textwidth]{figure/fig-regu-nonlin-size-6.png}\end{center}} + + +\end{frame} + +\begin{frame} {Structural Risk Minimization} +Again, complexity vs CV score. + +\begin{center}\includegraphics[width=0.7\textwidth]{figure/fig-regu-nonlin-srm-2.png}\end{center} + +A minimal model with good generalization seems to have ca. 6-8 hidden neurons. + +\end{frame} + + +\begin{frame} {Structural Risk Minimization and RRM} + +Note that normal RRM can also be interpreted through SRM, if we rewrite the penalized ERM as constrained ERM. + +\begin{columns} +\begin{column}{0.5\textwidth} +\begin{eqnarray*} +\min_{\thetab} && \sumin \Lxyit \\ + \text{s.t. } && \|\thetab\|_2^2 \leq t \\ +\end{eqnarray*} +\end{column} +\begin{column}{0.5\textwidth} +\begin{figure} +\includegraphics[width=0.6\textwidth]{figure_man/ridge_hat.png} +\end{figure} +\end{column} +\end{columns} + +\vspace{0.5cm} + +We can interpret going through $\lambda$ from large to small as through $t$ from small to large. +This constructs a series of ERM problems with hypothesis spaces $\Hspace_\lambda$, +where we constrain the norm of $\thetab$ to unit balls of growing size. 
+\end{frame}
+
+
+\endlecture
+\end{document}
+
diff --git a/slides/regularization/slides-regu-others.tex b/slides/regularization/slides-regu-others.tex
new file mode 100644
index 00000000..6062596f
--- /dev/null
+++ b/slides/regularization/slides-regu-others.tex
@@ -0,0 +1,131 @@
+\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer}
+\input{../../style/preamble}
+\input{../../latex-math/basic-math}
+\input{../../latex-math/basic-ml}
+
+\newcommand{\titlefigure}{figure_man/lasso_ridge_hat.png}
+\newcommand{\learninggoals}{
+  \item Know that $L1$/$L2$ regularization induces bias
+  \item Know Lq (quasi-)norm regularization
+  \item Understand that L0 regularization simply counts the number of non-zero parameters
+  \item Know SCAD and MCP
+}
+
+\title{Introduction to Machine Learning}
+\date{}
+
+\begin{document}
+
+\lecturechapter{Other Types of Regularizers}
+\lecture{Introduction to Machine Learning}
+
+\begin{vbframe}{Ridge and Lasso are biased estimators}
+Although ridge and lasso regression have many nice properties, they are biased estimators and the bias does not (necessarily) vanish as $n \rightarrow \infty$.\\
+\vspace{0.3cm}
+
+For example, in the orthonormal case ($\Xmat^{\top}\Xmat=\bm{I}$) the bias of the lasso is
+$$
+\begin{cases}\mathbb{E}\left|\widehat{\theta}_j-\theta_j\right|=0 & \text { if } \theta_j=0 \\ \mathbb{E}\left|\widehat{\theta}_j-\theta_j\right| \approx \left|\theta_j\right| & \text { if }\left|\theta_j\right| \in[0, \lambda] \\ \mathbb{E}\left|\widehat{\theta}_j-\theta_j\right| \approx \lambda & \text { if }\left|\theta_j\right|>\lambda\end{cases}
+$$
+\vspace{0.3cm}
+
+The bias of the lasso is thus about $\lambda$ for large $|\theta_j|$, i.e., even strong, relevant features remain shrunken by about $\lambda$.\\
+\vspace{0.2cm}
+To reduce the bias/shrinkage of regularized estimators, various penalties have been proposed, a few of which we briefly introduce now.
+
+\end{vbframe}
+
+\begin{vbframe}{$Lq$ regularization}
+Besides the $L1$ and $L2$ norms, we could use any $Lq$ (quasi-)norm for regularization.
+
+
+\begin{figure}
+  \scalebox{0.55}{\includegraphics{figure_man/lasso_ridge_hat.png}}\\
+%\includegraphics[height=2.3cm]{figure_man/contour.pdf}
+\caption{\textit{Top:} Ridge and Lasso loss contours and feasible regions.
+\textit{Bottom:} Different feasible region shapes for $Lq$ norms $\sum_j |\theta_j|^q$.}
+\end{figure}
+
+Note that for $q<1$ the penalty becomes non-convex (much harder to optimize!) and for $q>1$ no sparsity is obtained.
+
+\end{vbframe}
+
+
+\begin{vbframe}{L0 regularization}
+
+  \begin{itemize}
+    \item Consider the $L0$-regularized risk of a model $\fxt$
+    $$
+    \riskrt = \risket + \lambda \|\thetab\|_0 := \risket + \lambda \sum_j |\theta_j|^0.
+    $$
+    \item Unlike the $L1$ and $L2$ norms, the $L0$ "norm" simply counts the number of non-zero parameters in the model.
+    \vspace{0.3cm}
+    \begin{figure}
+      \centering
+      \scalebox{0.99}{\includegraphics{figure_man/lq-penalty-plots.png}}
+      %\tiny{\\ Credit: Christos Louizos}
+      \caption{\footnotesize $Lq$ (quasi-)norm penalties for a scalar parameter $\thetab$ for different values of $q$}
+    \end{figure}
+
+  \end{itemize}
+
+\end{vbframe}
+
+\begin{vbframe} {L0 regularization}
+
+  \begin{itemize}
+    \item For any parameter $\thetab$, the $L0$ penalty is zero for $\thetab = 0$ (defining $0^0 := 0$) and is constant for any $\thetab \neq 0$, no matter how large or small it is.
+    \item $L0$ regularization induces sparsity in the parameter vector more aggressively than $L1$ regularization, but does not shrink concrete parameter values as $L1$ and $L2$ do (it is unbiased).
+    \item Model selection criteria such as the Akaike Information Criterion (AIC) and the Bayesian Information Criterion (BIC) are special cases of $L0$ regularization (corresponding to specific values of $\lambda$).
+    \item The $L0$-regularized risk is neither continuous, differentiable, nor convex.
+    \item It is computationally hard to optimize (NP-hard) and likely intractable.
+    For smaller $n$ and $p$ we might be able to solve this directly nowadays; for larger scenarios, efficient approximations of the $L0$ penalty are still a topic of current research.
+  \end{itemize}
+\end{vbframe}
+
+\begin{vbframe}{SCAD \citebutton{Fan and Li, 2007}{https://www.tandfonline.com/doi/full/10.1080/00401706.2020.1801256?casa_token=JhnIrgzTysMAAAAA:Z216Mc0l0qPEBUOW7kL2W0NjHC9TxdU4J6RtVs6ME7MW3_rN7CwqXZMAjKUwZo2Qz5iPd-jzKc4ffA}}
+
+The SCAD ({\footnotesize{Smoothly Clipped Absolute Deviations}}) penalty is a non-convex regularizer with a piecewise definition; an additional hyperparameter $\gamma>2$ controls how fast the penalty tapers off:
+$$
+\text{SCAD}(\theta \mid \lambda, \gamma)= \begin{cases}\lambda|\theta| & \text { if }|\theta| \leq \lambda \\ \frac{2 \gamma \lambda|\theta|-\theta^2-\lambda^2}{2(\gamma-1)} & \text { if } \lambda<|\theta|<\gamma \lambda \\ \frac{\lambda^2(\gamma+1)}{2} & \text { if }|\theta| \geq \gamma \lambda\end{cases}
+$$
+
+The SCAD penalty
+\begin{enumerate}
+  \item coincides with the lasso for small values until $|\theta|=\lambda$,
+  \item then (smoothly) transitions to a quadratic up to $|\theta|=\gamma \lambda$,
+  \item remains constant for all $|\theta|>\gamma \lambda$.
+\end{enumerate}
+\vspace{0.3cm}
+As opposed to Lasso/Ridge regression, SCAD continuously relaxes the penalization rate as $|\theta|$ increases above $\lambda$. %SCAD is asymptotically unbiased due to the ``clipping'' of the penalty.
+
+
+\end{vbframe}
+
+\begin{vbframe}{MCP \citebutton{Zhang, 2010}{https://arxiv.org/pdf/1002.4734.pdf}}
+
+MCP ({\footnotesize{Minimax Concave Penalty}}) is another non-convex regularizer with a similar idea to SCAD, defined as follows (for $\gamma>1$):
+
+$$
+\text{MCP}(\theta \mid \lambda, \gamma)= \begin{cases}\lambda|\theta|-\frac{\theta^2}{2 \gamma}, & \text { if }|\theta| \leq \gamma \lambda \\ \frac{1}{2} \gamma \lambda^2, & \text { if }|\theta|>\gamma \lambda\end{cases}
+$$
+\vspace{0.3cm}
+\begin{itemize}\setlength{\itemsep}{1.3em}
+  \item As with SCAD, MCP starts by applying the same penalization rate as the lasso, then smoothly reduces the rate down to zero as $|\theta|$ increases
+  \item Different from SCAD, MCP immediately starts relaxing the penalization rate, while for SCAD the rate remains flat until $|\theta|>\lambda$
+  \item Both SCAD and MCP possess the oracle property: they can consistently select the true model as $n \to \infty$, while the lasso may fail
+\end{itemize}
+
+\end{vbframe}
+
+\begin{vbframe}{SCAD and MCP vs Lasso}
+
+\begin{figure}
+  \centering
+  \scalebox{0.95}{\includegraphics{figure_man/penalties-comparison.pdf}}
+  \caption{\footnotesize Lasso vs non-convex SCAD and MCP penalties for a scalar parameter $\thetab$}
+  \end{figure}
+\end{vbframe}
+
+\endlecture
+\end{document}
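As a numerical companion to the new rsrc/lasso-scad-mcp.R script, the sketch below (assuming the script is sourced from the repository root, so that lambda = 1, a = 3.7, gamma = 3 and the three penalty functions are in scope) checks the properties claimed on the SCAD/MCP slides: the non-convex penalties are continuous at their breakpoints and flatten out for large |theta|, whereas the lasso penalty keeps growing at rate lambda.

```r
# Assumes the working directory is the repository root, so that sourcing the
# new script brings lambda = 1, a = 3.7, gamma = 3 and the penalty functions
# lasso_penalty(), scad_penalty(), mcp_penalty() into scope.
source("slides/regularization/rsrc/lasso-scad-mcp.R")

# SCAD is continuous at its breakpoints |theta| = lambda and |theta| = a * lambda
all.equal(scad_penalty(lambda - 1e-8), scad_penalty(lambda + 1e-8), tolerance = 1e-6)
all.equal(scad_penalty(a * lambda - 1e-8), scad_penalty(a * lambda + 1e-8), tolerance = 1e-6)

# Flat tails: SCAD and MCP stop penalizing for large |theta|, the lasso does not
scad_penalty(10) - scad_penalty(5)     # 0 (constant beyond a * lambda)
mcp_penalty(10)  - mcp_penalty(5)      # 0 (constant beyond gamma * lambda)
lasso_penalty(10) - lasso_penalty(5)   # 5 * lambda
```

The flat tails are exactly why SCAD and MCP avoid the roughly constant bias of about lambda that the lasso imposes on large coefficients.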