diff --git a/slides/information-theory/slides-info-entropy.tex b/slides/information-theory/slides-info-entropy.tex index 5fd1aded..2c90f718 100644 --- a/slides/information-theory/slides-info-entropy.tex +++ b/slides/information-theory/slides-info-entropy.tex @@ -80,14 +80,14 @@ H(X) = - \E[\log_2(p(X))] &= -\sum_{x \in \Xspace} p(x) \log_2 p(x) \end{aligned} \end{equation*} - \begin{itemize} \setlength\itemsep{1.2em} \item \textbf{Definition:} Base $2$ means the information is measured in bits, but you can use any number $>1$ as base of the logarithm. \item \textbf{Note:} If $p(x) = 0$, then $p(x) \log_2 p(x)$ is taken to be zero, because $\lim _{p \rightarrow 0} p \log_2 p=0$. %for $x=0$. \item NB: $H$ is actually Greek capital letter \textbf{E}ta ($\eta$) for \textbf{e}ntropy -\item The negative log probabilities $\log_2 p(x)$ are called "Surprisal". +\item The negative log probabilities $-\log_2 p(x)$ are called ``surprisal''. +\item Less likely events are more surprising. The entropy of a distribution is its expected surprisal, which is highest when all events are equally likely. \end{itemize} diff --git a/slides/information-theory/slides-info-kl-ment.tex b/slides/information-theory/slides-info-kl-ment.tex index faa9ad81..d4d7b409 100644 --- a/slides/information-theory/slides-info-kl-ment.tex +++ b/slides/information-theory/slides-info-kl-ment.tex @@ -69,7 +69,7 @@ \\ $\Rightarrow m$ must be the prior distribution $q$, and our entropy measure must be understood relatively to this prior, so $S(p)$ becomes, in fact, $S(p\|q).$\\ \lz \textbf{3) Independent subsystems} \\ - \includegraphics[width=0.6\linewidth]{slides/information-theory/figure_man/kl_me_indep_sub.png} \\ + \includegraphics[width=0.6\linewidth]{figure_man/kl_me_indep_sub.png} \\ If the prior distribution defines a subsystem of $\mathcal{X}$ to be independent, then the priors can be independently updated, and the resulting posterior is just their product density. \framebreak diff --git a/slides/information-theory/slides-info-kl-ml.tex b/slides/information-theory/slides-info-kl-ml.tex index f4397d21..0e8e7151 100644 --- a/slides/information-theory/slides-info-kl-ml.tex +++ b/slides/information-theory/slides-info-kl-ml.tex @@ -22,7 +22,7 @@ \begin{vbframe} {Measuring Distribution Similarity in ML} \begin{itemize} \item Information theory provides tools (e.g., divergence measures) to quantify the similarity between probability distributions -\includegraphics[width=0.4\linewidth]{slides/information-theory/figure_man/kl_ml_dist_sim.png} +\includegraphics[width=0.4\linewidth]{figure_man/kl_ml_dist_sim.png} \item The most prominent divergence measure is the KL divergence \item In ML, measuring (and maximizing) the similarity between probability distributions is a ubiquitous concept, which will be shown in the following. \end{itemize} @@ -60,7 +60,7 @@ \item \textbf{Variational inference (VI)} Our data can also induce probability distributions: By Bayes' theorem it holds that the posterior density $$p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}) = \frac{p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})}{\int p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})d\bm{\theta}}.$$ However, computing this density analytically is usually intractable.
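To make the entropy and KL formulas above concrete, here is a minimal numpy sketch (illustrative only, not part of the slide sources; the two coin distributions are made-up examples). It also confirms the new bullet: the uniform distribution has the highest entropy.

import numpy as np

def entropy(p, base=2):
    # H(p) = -sum_x p(x) * log p(x), using the convention 0 * log(0) = 0
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return -np.sum(p * np.log(p)) / np.log(base)

def kl_divergence(p, q, base=2):
    # D_KL(p || q) = sum_x p(x) * log(p(x) / q(x)); assumes q(x) > 0 wherever p(x) > 0
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    mask = p > 0
    return np.sum(p[mask] * (np.log(p[mask]) - np.log(q[mask]))) / np.log(base)

print(entropy([0.5, 0.5]))                     # 1.0 bit: fair coin, maximal entropy
print(entropy([0.9, 0.1]))                     # ~0.47 bits: biased coin, lower expected surprisal
print(kl_divergence([0.5, 0.5], [0.9, 0.1]))   # ~0.74 bits: positive, and zero only if p = q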
-\includegraphics[width=0.99\linewidth]{slides/information-theory/figure_man/kl_ml_vi.png} +\includegraphics[width=0.99\linewidth]{figure_man/kl_ml_vi.png} In VI, we want to fit a density $q_{\bm{\phi}}$ with parameters $\bm{\phi}$ to $p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}).$ diff --git a/slides/regularization/figure/lasso_contours_cases.png b/slides/regularization/figure/lasso_contours_cases.png index 6bc8354f..aff79eb5 100644 Binary files a/slides/regularization/figure/lasso_contours_cases.png and b/slides/regularization/figure/lasso_contours_cases.png differ diff --git a/slides/regularization/figure/th_l1_neg.pdf b/slides/regularization/figure/th_l1_neg.pdf new file mode 100644 index 00000000..86c71ecc Binary files /dev/null and b/slides/regularization/figure/th_l1_neg.pdf differ diff --git a/slides/regularization/figure/th_l1_pos.pdf b/slides/regularization/figure/th_l1_pos.pdf new file mode 100644 index 00000000..31678084 Binary files /dev/null and b/slides/regularization/figure/th_l1_pos.pdf differ diff --git a/slides/regularization/figure/th_l1_zero.pdf b/slides/regularization/figure/th_l1_zero.pdf new file mode 100644 index 00000000..98b06139 Binary files /dev/null and b/slides/regularization/figure/th_l1_zero.pdf differ diff --git a/slides/regularization/figure_man/bias-variance-ridge.png b/slides/regularization/figure_man/bias-variance-ridge.png new file mode 100644 index 00000000..1b67caaf Binary files /dev/null and b/slides/regularization/figure_man/bias-variance-ridge.png differ diff --git a/slides/regularization/figure_man/cv-error-lambda-path.png b/slides/regularization/figure_man/cv-error-lambda-path.png new file mode 100644 index 00000000..50269c29 Binary files /dev/null and b/slides/regularization/figure_man/cv-error-lambda-path.png differ diff --git a/slides/regularization/rsrc/bias-var-decomp-ridge.py b/slides/regularization/rsrc/bias-var-decomp-ridge.py new file mode 100644 index 00000000..9c8d7a6f --- /dev/null +++ b/slides/regularization/rsrc/bias-var-decomp-ridge.py @@ -0,0 +1,94 @@ +import numpy as np +import matplotlib.pyplot as plt +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import Ridge +from sklearn.metrics import mean_squared_error + + +# Set the random seed for reproducibility +np.random.seed(0) + +# Define the true function and the number of datasets +true_function = lambda x: np.sin(x) +n_datasets = 100 # Number of datasets for training +n_samples = 100 +n_test_samples = 10000 +n_order = 8 +lambdas = np.exp(np.linspace(-6, 7, 25)) + +# Generate polynomial features +poly = PolynomialFeatures(degree=n_order, include_bias=False) + +# Initialize arrays to store the bias, variance, and error +bias_squared = np.zeros_like(lambdas) +variance = np.zeros_like(lambdas) +test_error = np.zeros_like(lambdas) + +# Generate shared x values for all datasets +x_shared = np.random.uniform(0, 1, n_samples).reshape(-1, 1) +x_shared_poly = poly.fit_transform(x_shared) + +# Generate test data +x_test = np.random.uniform(0, 1, n_test_samples).reshape(-1, 1) +y_test = true_function(x_test).reshape(-1, 1) + np.random.randn(n_test_samples,1) +x_test_poly = poly.transform(x_test) + +# Loop over the lambda values +for i, lambda_val in enumerate(lambdas): + # Initialize arrays to store predictions for each model + predictions = np.zeros((n_datasets, n_samples)) + + # Train and predict with n_datasets models + for j in range(n_datasets): + # Generate new y values for each dataset + epsilon = np.random.randn(n_samples, 1) + y = 
true_function(x_shared) + epsilon + + # Fit Ridge regression model + model = Ridge(alpha=lambda_val, fit_intercept=True) + model.fit(x_shared_poly, y) + predictions[j, :] = model.predict(x_shared_poly).flatten() + + # Calculate the average prediction for each x + average_prediction = np.mean(predictions, axis=0) + + # Compute itegrated bias^2 and variance using MC + bias_squared[i] = np.mean((average_prediction - true_function(x_shared).flatten()) ** 2) + variance[i] = np.mean(np.var(predictions, axis=0)) + +# Train a final model on a new dataset and compute test error for each lambda +for i, lambda_val in enumerate(lambdas): + # Generate new data for the final model + x_train_final = np.random.uniform(0, 1, n_samples).reshape(-1, 1) + y_train_final = true_function(x_train_final) + np.random.randn(n_samples, 1) + x_train_final_poly = poly.transform(x_train_final) + + # Fit the final model + model_final = Ridge(alpha=lambda_val, fit_intercept=True) + model_final.fit(x_train_final_poly, y_train_final) + + # Predict on the test set and compute the error + y_test_pred_final = model_final.predict(x_test_poly).flatten() + # The test error + test_error[i] = mean_squared_error(y_test, y_test_pred_final) + +# Plotting the results with two y-axes +fig, ax1 = plt.subplots(figsize=(12, 6)) + +# Plot bias^2 and variance on the primary y-axis +ax1.plot(np.log(lambdas), bias_squared, label='(bias)^2', color='red') +ax1.plot(np.log(lambdas), variance, label='variance', color='blue') +ax1.plot(np.log(lambdas), bias_squared + variance, label='(bias)^2 + variance', color='green') + +ax1.set_xlabel('ln(λ)', fontsize=16) +ax1.set_ylabel('(bias)^2, variance', fontsize=16) +ax1.legend(loc='upper left') + +# Create secondary y-axis for test error +ax2 = ax1.twinx() +ax2.plot(np.log(lambdas), test_error, label='test error', color='magenta', linestyle='--', alpha=.6) +ax2.set_ylabel('Test error on single dataset', fontsize=16) +ax2.legend(loc='upper right') + +plt.title('Bias-Variance Tradeoff with L2 Regularization', fontsize=20) +plt.show() diff --git a/slides/regularization/rsrc/cv-error-lambda-path.py b/slides/regularization/rsrc/cv-error-lambda-path.py new file mode 100644 index 00000000..7d2c3738 --- /dev/null +++ b/slides/regularization/rsrc/cv-error-lambda-path.py @@ -0,0 +1,52 @@ +from sklearn.datasets import load_wine +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import RidgeCV +import numpy as np +import matplotlib.pyplot as plt + +# Load wine dataset +X, y = load_wine(return_X_y=True) + +# Generating standard normal noise features +np.random.seed(42) +noise_features = np.random.normal(size=(X.shape[0], 75)) + +# Adding these noise features to the original dataset +X_extended = np.hstack((X, noise_features)) + +# Splitting the augmented dataset into training and test sets +X_train_ext, X_test_ext, y_train_ext, y_test_ext = train_test_split( + X_extended, y, test_size=0.2, random_state=42 +) + +# Standardizing the augmented dataset +scaler_ext = StandardScaler() +X_train_ext_scaled = scaler_ext.fit_transform(X_train_ext) +X_test_ext_scaled = scaler_ext.transform(X_test_ext) + +# Define a range of lambda (alpha) values +lambda_values = np.logspace(-4, 4, 50) + +# Performing Ridge Regression with Cross-Validation on the extended dataset +ridge_cv_ext = RidgeCV(alphas=lambda_values, store_cv_values=True) +ridge_cv_ext.fit(X_train_ext_scaled, y_train_ext) + +# Plotting the CV Curve for the extended dataset +mean_cv_scores_ext 
= np.mean(ridge_cv_ext.cv_values_, axis=0) + +# Finding the lambda value with the minimum CV score +min_lambda_index = np.argmin(mean_cv_scores_ext) +min_lambda_value = lambda_values[min_lambda_index] + +# Re-plotting with a vertical blue bar at the minimum CV score +plt.figure(figsize=(8, 6)) +plt.plot(lambda_values, mean_cv_scores_ext, marker='o', color='red') +plt.axvline(x=min_lambda_value, color='blue', linestyle='--', label=f'Min CV Score at λ={min_lambda_value:.4f}') +plt.xscale('log') +plt.xlabel('Lambda (Regularization strength)', fontsize = 14) +plt.ylabel('Generalization error', fontsize = 14) +#plt.title('Wine dataset with add. noise features', fontsize=12) +plt.title('Effect of L2 Regularization', fontsize = 16) +plt.legend() +plt.show() diff --git a/slides/regularization/rsrc/softthresholding_l1.R b/slides/regularization/rsrc/softthresholding_l1.R new file mode 100644 index 00000000..33dd6b43 --- /dev/null +++ b/slides/regularization/rsrc/softthresholding_l1.R @@ -0,0 +1,59 @@ +library(ggplot2) + +lambda = 2 + +fun1 <- function(x){ + return(x^2 + 3*abs(x)+ 1) +} + +fun2 <- function(x){ + return(0.5*(x-4)^2 + lambda*abs(x)+ 1) +} + +fun3 <- function(x){ + return(0.5*(x+4)^2 + lambda*abs(x)+ 1) +} + + +p1 <- ggplot() + + xlim(-7, 7) + + geom_function(fun = fun1) + + xlab(expression(theta)) + + ylab(expression(R[reg])) + + geom_vline(xintercept = 0, + linetype="dashed") + + theme_bw(base_size = 20) + +pdf("../figure/th_l1_zero.pdf") +print(p1) +dev.off() + +p2 <- ggplot() + + xlim(-7, 7) + + geom_function(fun = fun2) + + xlab(expression(theta)) + + ylab(expression(R[reg])) + + geom_vline(xintercept = 4 - lambda, + linetype="dashed") + + theme_bw(base_size = 20) + +pdf("../figure/th_l1_pos.pdf") +print(p2) +dev.off() + +p3 <- ggplot() + + xlim(-7, 7) + + geom_function(fun = fun3) + + xlab(expression(theta)) + + ylab(expression(R[reg])) + + geom_vline(xintercept = -4 + lambda, + linetype="dashed") + + theme_bw(base_size = 20) + + +pdf("../figure/th_l1_neg.pdf") +print(p3) +dev.off() + + + diff --git a/slides/regularization/slides-regu-intro.tex b/slides/regularization/slides-regu-intro.tex index 58150f3e..1a43b639 100644 --- a/slides/regularization/slides-regu-intro.tex +++ b/slides/regularization/slides-regu-intro.tex @@ -3,7 +3,7 @@ \input{../../latex-math/basic-math} \input{../../latex-math/basic-ml} -\newcommand{\titlefigure}{figure_man/biasvariance_scheme.png} +\newcommand{\titlefigure}{figure_man/bias-variance-ridge.png} \newcommand{\learninggoals}{ \item Understand why overfitting happens \item Know how overfitting can be avoided @@ -20,6 +20,19 @@ %\section{Motivation for Regularization} +\begin{vbframe}{What is Regularization?} + +Regularization comprises all methods that add preferences for specific solutions (\textbf{inductive bias}) to a model, usually in the context of ``low complexity'' priors (shrinkage and sparsity). By controlling complexity we can reduce overfitting and achieve an optimal bias-variance tradeoff. 
+\vspace{0.1cm} +\begin{itemize} +\setlength{\itemsep}{1.0em} + \item \textbf{Explicit regularization} methods define an explicit measure of model complexity and add it as a penalty to the empirical risk (e.g., $L1/L2$) + \item \textbf{Implicit regularization} includes removing outliers, early stopping, data augmentation, parameter sharing, dropout, and ensembling + \item \textbf{Structured regularization} methods incorporate structural prior knowledge over groups of parameters or subnetworks (e.g., the group lasso \citebutton{Yuan and Lin, 2005}{https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-9868.2005.00532.x}) +\end{itemize} + +\end{vbframe} + \begin{vbframe}{Example: Overfitting} diff --git a/slides/regularization/slides-regu-l1l2.tex b/slides/regularization/slides-regu-l1l2.tex index 67bb0502..38914b7a 100644 --- a/slides/regularization/slides-regu-l1l2.tex +++ b/slides/regularization/slides-regu-l1l2.tex @@ -143,28 +143,33 @@ \begin{vbframe}{Lasso Regression} -Another shrinkage method is the so-called \textbf{Lasso regression}, which uses an $L1$ penalty on $\thetab$: - +Another shrinkage method is the so-called \textbf{Lasso regression} ({\scriptsize{least absolute shrinkage and selection operator}}), which uses an $L1$ penalty on $\thetab$: +\vspace{-0.2cm} \begin{eqnarray*} -\thetah_{\text{Lasso}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi\right)^2 + \lambda \|\thetab\|_1 \\ - &=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \|\thetab\|_1. +\thetah_{\text{Lasso}}= \argmin_{\thetab} \underbrace{\sumin \left(\yi - \thetab^T \xi\right)^2}_{\left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right)} + \lambda \|\thetab\|_1 \end{eqnarray*} +Optimization is now much harder: $\riskrt$ is still convex, but in general there is no analytical solution and it is non-differentiable.\\ +\vspace{0.2cm} -Note that optimization now becomes much harder. $\riskrt$ is still convex, but we have moved from an optimization problem with an analytical solution towards a non-differentiable problem. +For the special case of an orthonormal design $\Xmat^{\top}\Xmat=\id$, we obtain a closed-form solution in terms of $\thetah_{\text{OLS}}=(\Xmat^{\top}\Xmat)^{-1}\Xmat^{\top}\yv=\Xmat^{\top}\yv$: +$$\thetah_{\text{Lasso}}=\text{sign}(\thetah_{\text{OLS}})(\vert \thetah_{\text{OLS}} \vert - \lambda)_{+}\quad(\text{sparsity})$$ -\lz +Comparing this to $\thetah_{\text{Ridge}}$, we see a different behavior as $\lambda$ increases: +$$\thetah_{\text{Ridge}}=\frac{\thetah_{\text{OLS}}}{1+\lambda}\quad (\text{no sparsity, uniform downscaling})$$ -Name: least absolute shrinkage and selection operator. + +%\textbf{NB}: lasso=least absolute shrinkage and selection operator. \framebreak We can also rewrite this as a constrained optimization problem. The penalty results in the constrained region to look like a diamond shape. \vspace{-0.2cm} \begin{eqnarray*} -\min_{\thetab} && \sumin \left(\yi - \fxit\right)^2\\ -\text{subject to: } && \|\thetab\|_1 \leq t \\ +\min_{\thetab} \sumin \left(\yi - \fxit\right)^2\, +\text{subject to: } \|\thetab\|_1 \leq t \end{eqnarray*} -\vspace{-0.2cm} +The kinks of the $L1$ constraint enforce sparse solutions: the loss contours typically first hit the constraint region at one of its sharp corners, which lie on the coordinate axes where (some) entries are zero.
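As a quick numerical illustration of the two closed forms above (orthonormal design assumed; the OLS coefficients and lambda are made up, and the snippet is not part of the slide sources), soft-thresholding sets small coefficients exactly to zero while Ridge only downscales them:

import numpy as np

theta_ols = np.array([3.0, -1.5, 0.4, -0.2])   # hypothetical OLS estimates under an orthonormal design
lam = 1.0

# Lasso: soft-thresholding sign(theta) * (|theta| - lambda)_+  -> exact zeros (sparsity)
theta_lasso = np.sign(theta_ols) * np.maximum(np.abs(theta_ols) - lam, 0.0)

# Ridge: uniform downscaling theta / (1 + lambda) -> shrinks, but no exact zeros
theta_ridge = theta_ols / (1.0 + lam)

print(theta_lasso)   # two of the four coefficients are exactly zero
print(theta_ridge)   # all coefficients stay nonzero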
+\vspace{-0.1cm} \begin{figure}%\includegraphics[width=0.3\textwidth]{figure_man/lasso_hat.png}\\ \includegraphics[width=0.95\textwidth]{figure/lasso_contours_cases.png}\\ \end{figure} diff --git a/slides/regularization/slides-regu-ridge-deepdive.tex b/slides/regularization/slides-regu-ridge-deepdive.tex new file mode 100644 index 00000000..b123ade8 --- /dev/null +++ b/slides/regularization/slides-regu-ridge-deepdive.tex @@ -0,0 +1,45 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} + +\newcommand{\titlefigure}{figure_man/bias-variance-ridge.png} +\newcommand{\learninggoals}{ + \item Know alternative interpretations of Ridge regression + \item Derivation of the bias-variance tradeoff for Ridge regression +} + +\title{Introduction to Machine Learning} +\date{} + +\begin{document} + +\lecturechapter{Ridge Regression Deep-Dive} +\lecture{Introduction to Machine Learning} + + + +\begin{vbframe}{Perspectives on $L2$ regularization} +We already saw that $L2$ regularization is equivalent to a constrained optimization problem: +\begin{eqnarray*} + \thetah_{\text{Ridge}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2 = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv\\ + %&=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \thetab^\top \thetab \\ + &=& \argmin_{\thetab} \sumin \left(\yi - \fxit\right)^2 \, + \text{s.t. } \|\thetab\|_2^2 \leq t + \end{eqnarray*} +We can also recover the Ridge estimator by performing least-squares on a \textbf{row-augmented} data set: Let \scriptsize{$\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix} + \yv \\ \bm{0}_{p} +\end{pmatrix}$.} \normalsize{Using the augmented data, the least-squares objective becomes} +\small{ +$$%\argmin_{\thetab} +\sum_{i=1}^{n+p} \left(\tilde{\yi} - \thetab^T \tilde{\xi} \right)^2 = %\argmin_{\thetab} +\sum_{i=1}^{n} \left(\yi - \thetab^T \xi \right)^2 + \sum_{j=1}^{p} \left(0 - \sqrt{\lambda} \theta_j \right)^2 %= \thetah_{\text{Ridge}} +=\sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2 +$$ +} +\normalsize{Thus the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$ is $\thetah_{\text{Ridge}}$.} +%$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$ +\end{vbframe} + +\endlecture +\end{document} diff --git a/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex b/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex index 5235f6aa..fdd75fad 100644 --- a/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex +++ b/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex @@ -3,9 +3,9 @@ \input{../../latex-math/basic-math} \input{../../latex-math/basic-ml} -\newcommand{\titlefigure}{figure/graddes_vs_weightdecay.png} +\newcommand{\titlefigure}{slides/regularization/figure/th_l1_pos.pdf} \newcommand{\learninggoals}{ - \item todo + \item Understand the relationship between soft-thresholding and L1 regularization } \title{Introduction to Machine Learning} @@ -21,7 +21,7 @@ \begin{vbframe}{Soft-thresholding and L1 regularization} In the lecture, we wanted to solve \[ - \min_{\thetab} \mathcal{\tilde R}_{\text{reg}}(\thetab) = \min_{\thetab}\mathcal{R}_{\text{emp}}(\thetah) + \sum_j \left[ \frac{1}{2} H_{j,j} (\theta_j - 
\hat{\theta}_j)^2 \right] + \sum_j \lambda |\theta_j|. + \min_{\thetab} \mathcal{\tilde R}_{\text{reg}}(\thetab) = \min_{\thetab}\mathcal{R}_{\text{emp}}(\thetah) + \sum_k \left[ \frac{1}{2} H_{k,k} (\theta_k - \hat{\theta}_k)^2 \right] + \sum_k \lambda |\theta_k|. \] This is a convex problem (since it is the sum of convex functions) for which, in general, no analytical solution exists. \\ \lz @@ -36,7 +36,7 @@ First, we will focus on the everywhere differentiable part: \begin{align*} -\frac{\partial}{\partial \thetab_j}\sum_j \left[\frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 \right] +\frac{\partial}{\partial \thetab_j}\sum_k \left[\frac{1}{2} H_{k,k} (\theta_k - \hat{\theta}_k)^2 \right] &= H_{j,j} (\theta_j - \hat{\theta}_j) \\ &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j \\ \end{align*} @@ -44,21 +44,51 @@ First, we consider the cases $\hat{\theta}_{\text{Lasso},j} > 0, \hat{\theta}_{\text{Lasso},j} < 0.$ \\ (Here $\frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}}$ exists) \\ \lz -1) $\hat{\theta}_{\text{Lasso},j} > 0:$ -$\frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} = H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j + \lambda \overset{!}{=} 0$ \\ -$\quad \Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j - -\frac{\lambda}{H_{j,j}} > 0 \iff \hat{\theta}_j > \frac{\lambda}{H_{j,j}}$\\ \framebreak -2) $\hat{\theta}_{\text{Lasso},j} < 0:$ -$\frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} = H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j - \lambda \overset{!}{=} 0$ \\ -$\quad \Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j - + \frac{\lambda}{H_{j,j}} < 0 \iff \hat{\theta}_j < -\frac{\lambda}{H_{j,j}}$\\ - \lz +1) $\hat{\theta}_{\text{Lasso},j} > 0:$ \\ +\lz +\begin{minipage}{0.4\textwidth} + \includegraphics[width=5cm]{slides/regularization/figure/th_l1_pos.pdf} +\end{minipage} +\hfill +\begin{minipage}{0.49\textwidth} +\begin{align*} + \frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j + \lambda \overset{!}{=} 0 \\ + &\Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j + -\frac{\lambda}{H_{j,j}} > 0 \\ + &\iff \hat{\theta}_j > \frac{\lambda}{H_{j,j}} +\end{align*} +\end{minipage} + \newpage + +2) $\hat{\theta}_{\text{Lasso},j} < 0:$ \\ +\lz +\begin{minipage}{0.4\textwidth} + \includegraphics[width=5cm]{slides/regularization/figure/th_l1_neg.pdf} +\end{minipage} +\hfill +\begin{minipage}{0.49\textwidth} +\begin{align*} + \frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j - \lambda \overset{!}{=} 0 \\ + &\Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j + +\frac{\lambda}{H_{j,j}} < 0 \\ + &\iff \hat{\theta}_j < -\frac{\lambda}{H_{j,j}} +\end{align*} +\end{minipage} + \newpage + +\begin{minipage}{0.4\textwidth} + \includegraphics[width=5cm]{slides/regularization/figure/th_l1_zero.pdf} +\end{minipage} +\hfill +\begin{minipage}{0.49\textwidth} $\Rightarrow$ If $\hat{\theta}_j \in [-\frac{\lambda}{H_{j,j}}, \frac{\lambda}{H_{j,j}}]$ then $\mathcal{\tilde R}_{\text{reg}}$ has no stationary point with $$\hat{\theta}_{\text{Lasso},j} < 0 \text{ or } \hat{\theta}_{\text{Lasso},j} > 0.$$ \\ -However, there must be at least one stationary point since $\mathcal{\tilde R}_{\text{reg}}$ is a regularized convex risk. \\ +However, there must be at least one stationary point since $\mathcal{\tilde R}_{\text{reg}}$ is a regularized convex risk. 
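Before the concluding case distinction of the Lasso derivation below, a quick numerical check of the row-augmentation identity from the Ridge deep-dive above; the data, dimensions, and lambda are made up, and the sketch is not part of the slide sources. It verifies that least squares on the augmented data reproduces the closed-form Ridge estimator:

import numpy as np

# Made-up data; lambda chosen arbitrarily
rng = np.random.default_rng(0)
n, p, lam = 50, 3, 2.0
X = rng.normal(size=(n, p))
y = rng.normal(size=n)

# Closed-form Ridge estimator: (X^T X + lambda * I)^{-1} X^T y
theta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# Ordinary least squares on the row-augmented data (X; sqrt(lambda) * I) and (y; 0)
X_aug = np.vstack([X, np.sqrt(lam) * np.eye(p)])
y_aug = np.concatenate([y, np.zeros(p)])
theta_aug, *_ = np.linalg.lstsq(X_aug, y_aug, rcond=None)

print(np.allclose(theta_ridge, theta_aug))   # True: both routes give the same estimator

Under these assumptions, sklearn's Ridge(alpha=lam, fit_intercept=False), as used in the scripts above, should yield the same coefficients.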
+\end{minipage} + \\ \begin{align*}\Rightarrow \hat{\theta}_{\text{Lasso},j} &= \begin{cases} \hat{\theta}_j + \frac{\lambda}{H_{j,j}} &, \text{if} \;\hat{\theta}_j < -\frac{\lambda}{H_{j,j}} \\