diff --git a/slides/information-theory/slides-info-entropy.tex b/slides/information-theory/slides-info-entropy.tex index 5fd1aded..2c90f718 100644 --- a/slides/information-theory/slides-info-entropy.tex +++ b/slides/information-theory/slides-info-entropy.tex @@ -80,14 +80,14 @@ H(X) = - \E[\log_2(p(X))] &= -\sum_{x \in \Xspace} p(x) \log_2 p(x) \end{aligned} \end{equation*} - \begin{itemize} \setlength\itemsep{1.2em} \item \textbf{Definition:} Base $2$ means the information is measured in bits, but you can use any number $>1$ as base of the logarithm. \item \textbf{Note:} If $p(x) = 0$, then $p(x) \log_2 p(x)$ is taken to be zero, because $\lim _{p \rightarrow 0} p \log_2 p=0$. %for $x=0$. \item NB: $H$ is actually Greek capital letter \textbf{E}ta ($\eta$) for \textbf{e}ntropy -\item The negative log probabilities $\log_2 p(x)$ are called "Surprisal". +\item The negative log probabilities $-\log_2 p(x)$ are called ``surprisal''. +\item Less likely events are more surprising. The entropy of a distribution is its expected surprisal, which is highest when all events are equally likely. \end{itemize} diff --git a/slides/information-theory/slides-info-kl-ment.tex b/slides/information-theory/slides-info-kl-ment.tex index faa9ad81..d4d7b409 100644 --- a/slides/information-theory/slides-info-kl-ment.tex +++ b/slides/information-theory/slides-info-kl-ment.tex @@ -69,7 +69,7 @@ \\ $\Rightarrow m$ must be the prior distribution $q$, and our entropy measure must be understood relatively to this prior, so $S(p)$ becomes, in fact, $S(p\|q).$\\ \lz \textbf{3) Independent subsystems} \\ - \includegraphics[width=0.6\linewidth]{slides/information-theory/figure_man/kl_me_indep_sub.png} \\ + \includegraphics[width=0.6\linewidth]{figure_man/kl_me_indep_sub.png} \\ If the prior distribution defines a subsystem of $\mathcal{X}$ to be independent, then the priors can be independently updated, and the resulting posterior is just their product density. \framebreak diff --git a/slides/information-theory/slides-info-kl-ml.tex b/slides/information-theory/slides-info-kl-ml.tex index f4397d21..0e8e7151 100644 --- a/slides/information-theory/slides-info-kl-ml.tex +++ b/slides/information-theory/slides-info-kl-ml.tex @@ -22,7 +22,7 @@ \begin{vbframe} {Measuring Distribution Similarity in ML} \begin{itemize} \item Information theory provides tools (e.g., divergence measures) to quantify the similarity between probability distributions -\includegraphics[width=0.4\linewidth]{slides/information-theory/figure_man/kl_ml_dist_sim.png} +\includegraphics[width=0.4\linewidth]{figure_man/kl_ml_dist_sim.png} \item The most prominent divergence measure is the KL divergence \item In ML, measuring (and maximizing) the similarity between probability distributions is a ubiquitous concept, which will be shown in the following. \end{itemize} @@ -60,7 +60,7 @@ \item \textbf{Variational inference (VI)} Our data can also induce probability distributions: By Bayes' theorem it holds that the posterior density $$p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}) = \frac{p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})}{\int p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})d\bm{\theta}}.$$ However, computing this density analytically is usually intractable.
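To make the entropy and KL formulas above concrete, here is a minimal numpy sketch (illustrative only, not part of the slide sources; the two coin distributions are made-up examples). It also confirms the new bullet: the uniform distribution has the highest entropy.

import numpy as np

def entropy(p, base=2):
    # H(p) = -sum_x p(x) * log p(x), using the convention 0 * log(0) = 0
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return -np.sum(p * np.log(p)) / np.log(base)

def kl_divergence(p, q, base=2):
    # D_KL(p || q) = sum_x p(x) * log(p(x) / q(x)); assumes q(x) > 0 wherever p(x) > 0
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    mask = p > 0
    return np.sum(p[mask] * (np.log(p[mask]) - np.log(q[mask]))) / np.log(base)

print(entropy([0.5, 0.5]))                     # 1.0 bit: fair coin, maximal entropy
print(entropy([0.9, 0.1]))                     # ~0.47 bits: biased coin, lower expected surprisal
print(kl_divergence([0.5, 0.5], [0.9, 0.1]))   # ~0.74 bits: positive, and zero only if p = q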
-\includegraphics[width=0.99\linewidth]{slides/information-theory/figure_man/kl_ml_vi.png} +\includegraphics[width=0.99\linewidth]{figure_man/kl_ml_vi.png} In VI, we want to fit a density $q_{\bm{\phi}}$ with parameters $\bm{\phi}$ to $p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}).$ diff --git a/slides/regularization/figure/lasso_contours_cases.png b/slides/regularization/figure/lasso_contours_cases.png index 6bc8354f..aff79eb5 100644 Binary files a/slides/regularization/figure/lasso_contours_cases.png and b/slides/regularization/figure/lasso_contours_cases.png differ diff --git a/slides/regularization/figure/th_l1_neg.pdf b/slides/regularization/figure/th_l1_neg.pdf new file mode 100644 index 00000000..86c71ecc Binary files /dev/null and b/slides/regularization/figure/th_l1_neg.pdf differ diff --git a/slides/regularization/figure/th_l1_pos.pdf b/slides/regularization/figure/th_l1_pos.pdf new file mode 100644 index 00000000..31678084 Binary files /dev/null and b/slides/regularization/figure/th_l1_pos.pdf differ diff --git a/slides/regularization/figure/th_l1_zero.pdf b/slides/regularization/figure/th_l1_zero.pdf new file mode 100644 index 00000000..98b06139 Binary files /dev/null and b/slides/regularization/figure/th_l1_zero.pdf differ diff --git a/slides/regularization/figure_man/bias-variance-ridge.png b/slides/regularization/figure_man/bias-variance-ridge.png new file mode 100644 index 00000000..1b67caaf Binary files /dev/null and b/slides/regularization/figure_man/bias-variance-ridge.png differ diff --git a/slides/regularization/figure_man/cv-error-lambda-path.png b/slides/regularization/figure_man/cv-error-lambda-path.png new file mode 100644 index 00000000..50269c29 Binary files /dev/null and b/slides/regularization/figure_man/cv-error-lambda-path.png differ diff --git a/slides/regularization/rsrc/bias-var-decomp-ridge.py b/slides/regularization/rsrc/bias-var-decomp-ridge.py new file mode 100644 index 00000000..9c8d7a6f --- /dev/null +++ b/slides/regularization/rsrc/bias-var-decomp-ridge.py @@ -0,0 +1,94 @@ +import numpy as np +import matplotlib.pyplot as plt +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import Ridge +from sklearn.metrics import mean_squared_error + + +# Set the random seed for reproducibility +np.random.seed(0) + +# Define the true function and the number of datasets +true_function = lambda x: np.sin(x) +n_datasets = 100 # Number of datasets for training +n_samples = 100 +n_test_samples = 10000 +n_order = 8 +lambdas = np.exp(np.linspace(-6, 7, 25)) + +# Generate polynomial features +poly = PolynomialFeatures(degree=n_order, include_bias=False) + +# Initialize arrays to store the bias, variance, and error +bias_squared = np.zeros_like(lambdas) +variance = np.zeros_like(lambdas) +test_error = np.zeros_like(lambdas) + +# Generate shared x values for all datasets +x_shared = np.random.uniform(0, 1, n_samples).reshape(-1, 1) +x_shared_poly = poly.fit_transform(x_shared) + +# Generate test data +x_test = np.random.uniform(0, 1, n_test_samples).reshape(-1, 1) +y_test = true_function(x_test).reshape(-1, 1) + np.random.randn(n_test_samples,1) +x_test_poly = poly.transform(x_test) + +# Loop over the lambda values +for i, lambda_val in enumerate(lambdas): + # Initialize arrays to store predictions for each model + predictions = np.zeros((n_datasets, n_samples)) + + # Train and predict with n_datasets models + for j in range(n_datasets): + # Generate new y values for each dataset + epsilon = np.random.randn(n_samples, 1) + y = 
true_function(x_shared) + epsilon + + # Fit Ridge regression model + model = Ridge(alpha=lambda_val, fit_intercept=True) + model.fit(x_shared_poly, y) + predictions[j, :] = model.predict(x_shared_poly).flatten() + + # Calculate the average prediction for each x + average_prediction = np.mean(predictions, axis=0) + + # Compute itegrated bias^2 and variance using MC + bias_squared[i] = np.mean((average_prediction - true_function(x_shared).flatten()) ** 2) + variance[i] = np.mean(np.var(predictions, axis=0)) + +# Train a final model on a new dataset and compute test error for each lambda +for i, lambda_val in enumerate(lambdas): + # Generate new data for the final model + x_train_final = np.random.uniform(0, 1, n_samples).reshape(-1, 1) + y_train_final = true_function(x_train_final) + np.random.randn(n_samples, 1) + x_train_final_poly = poly.transform(x_train_final) + + # Fit the final model + model_final = Ridge(alpha=lambda_val, fit_intercept=True) + model_final.fit(x_train_final_poly, y_train_final) + + # Predict on the test set and compute the error + y_test_pred_final = model_final.predict(x_test_poly).flatten() + # The test error + test_error[i] = mean_squared_error(y_test, y_test_pred_final) + +# Plotting the results with two y-axes +fig, ax1 = plt.subplots(figsize=(12, 6)) + +# Plot bias^2 and variance on the primary y-axis +ax1.plot(np.log(lambdas), bias_squared, label='(bias)^2', color='red') +ax1.plot(np.log(lambdas), variance, label='variance', color='blue') +ax1.plot(np.log(lambdas), bias_squared + variance, label='(bias)^2 + variance', color='green') + +ax1.set_xlabel('ln(λ)', fontsize=16) +ax1.set_ylabel('(bias)^2, variance', fontsize=16) +ax1.legend(loc='upper left') + +# Create secondary y-axis for test error +ax2 = ax1.twinx() +ax2.plot(np.log(lambdas), test_error, label='test error', color='magenta', linestyle='--', alpha=.6) +ax2.set_ylabel('Test error on single dataset', fontsize=16) +ax2.legend(loc='upper right') + +plt.title('Bias-Variance Tradeoff with L2 Regularization', fontsize=20) +plt.show() diff --git a/slides/regularization/rsrc/cv-error-lambda-path.py b/slides/regularization/rsrc/cv-error-lambda-path.py new file mode 100644 index 00000000..7d2c3738 --- /dev/null +++ b/slides/regularization/rsrc/cv-error-lambda-path.py @@ -0,0 +1,52 @@ +from sklearn.datasets import load_wine +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import RidgeCV +import numpy as np +import matplotlib.pyplot as plt + +# Load wine dataset +X, y = load_wine(return_X_y=True) + +# Generating standard normal noise features +np.random.seed(42) +noise_features = np.random.normal(size=(X.shape[0], 75)) + +# Adding these noise features to the original dataset +X_extended = np.hstack((X, noise_features)) + +# Splitting the augmented dataset into training and test sets +X_train_ext, X_test_ext, y_train_ext, y_test_ext = train_test_split( + X_extended, y, test_size=0.2, random_state=42 +) + +# Standardizing the augmented dataset +scaler_ext = StandardScaler() +X_train_ext_scaled = scaler_ext.fit_transform(X_train_ext) +X_test_ext_scaled = scaler_ext.transform(X_test_ext) + +# Define a range of lambda (alpha) values +lambda_values = np.logspace(-4, 4, 50) + +# Performing Ridge Regression with Cross-Validation on the extended dataset +ridge_cv_ext = RidgeCV(alphas=lambda_values, store_cv_values=True) +ridge_cv_ext.fit(X_train_ext_scaled, y_train_ext) + +# Plotting the CV Curve for the extended dataset +mean_cv_scores_ext 
= np.mean(ridge_cv_ext.cv_values_, axis=0) + +# Finding the lambda value with the minimum CV score +min_lambda_index = np.argmin(mean_cv_scores_ext) +min_lambda_value = lambda_values[min_lambda_index] + +# Re-plotting with a vertical blue bar at the minimum CV score +plt.figure(figsize=(8, 6)) +plt.plot(lambda_values, mean_cv_scores_ext, marker='o', color='red') +plt.axvline(x=min_lambda_value, color='blue', linestyle='--', label=f'Min CV Score at λ={min_lambda_value:.4f}') +plt.xscale('log') +plt.xlabel('Lambda (Regularization strength)', fontsize = 14) +plt.ylabel('Generalization error', fontsize = 14) +#plt.title('Wine dataset with add. noise features', fontsize=12) +plt.title('Effect of L2 Regularization', fontsize = 16) +plt.legend() +plt.show() diff --git a/slides/regularization/rsrc/softthresholding_l1.R b/slides/regularization/rsrc/softthresholding_l1.R new file mode 100644 index 00000000..33dd6b43 --- /dev/null +++ b/slides/regularization/rsrc/softthresholding_l1.R @@ -0,0 +1,59 @@ +library(ggplot2) + +lambda = 2 + +fun1 <- function(x){ + return(x^2 + 3*abs(x)+ 1) +} + +fun2 <- function(x){ + return(0.5*(x-4)^2 + lambda*abs(x)+ 1) +} + +fun3 <- function(x){ + return(0.5*(x+4)^2 + lambda*abs(x)+ 1) +} + + +p1 <- ggplot() + + xlim(-7, 7) + + geom_function(fun = fun1) + + xlab(expression(theta)) + + ylab(expression(R[reg])) + + geom_vline(xintercept = 0, + linetype="dashed") + + theme_bw(base_size = 20) + +pdf("../figure/th_l1_zero.pdf") +print(p1) +dev.off() + +p2 <- ggplot() + + xlim(-7, 7) + + geom_function(fun = fun2) + + xlab(expression(theta)) + + ylab(expression(R[reg])) + + geom_vline(xintercept = 4 - lambda, + linetype="dashed") + + theme_bw(base_size = 20) + +pdf("../figure/th_l1_pos.pdf") +print(p2) +dev.off() + +p3 <- ggplot() + + xlim(-7, 7) + + geom_function(fun = fun3) + + xlab(expression(theta)) + + ylab(expression(R[reg])) + + geom_vline(xintercept = -4 + lambda, + linetype="dashed") + + theme_bw(base_size = 20) + + +pdf("../figure/th_l1_neg.pdf") +print(p3) +dev.off() + + + diff --git a/slides/regularization/slides-regu-intro.tex b/slides/regularization/slides-regu-intro.tex index 58150f3e..1a43b639 100644 --- a/slides/regularization/slides-regu-intro.tex +++ b/slides/regularization/slides-regu-intro.tex @@ -3,7 +3,7 @@ \input{../../latex-math/basic-math} \input{../../latex-math/basic-ml} -\newcommand{\titlefigure}{figure_man/biasvariance_scheme.png} +\newcommand{\titlefigure}{figure_man/bias-variance-ridge.png} \newcommand{\learninggoals}{ \item Understand why overfitting happens \item Know how overfitting can be avoided @@ -20,6 +20,19 @@ %\section{Motivation for Regularization} +\begin{vbframe}{What is Regularization?} + +Regularization comprises all methods that add preferences for specific solutions (\textbf{inductive bias}) to a model, usually in the context of ``low complexity'' priors (shrinkage and sparsity). By controlling complexity we can reduce overfitting and achieve an optimal bias-variance tradeoff. 
+\vspace{0.1cm} +\begin{itemize} +\setlength{\itemsep}{1.0em} + \item \textbf{Explicit regularization} methods define an explicit measure of model complexity and add it as a penalty to the empirical risk (e.g., $L1/L2$) + \item \textbf{Implicit regularization} includes removing outliers, early stopping, data augmentation, parameter sharing, dropout, and ensembling + \item \textbf{Structured regularization} methods incorporate structural prior knowledge over groups of parameters or subnetworks (e.g., the group lasso \citebutton{Yuan and Lin, 2005}{https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-9868.2005.00532.x}) +\end{itemize} + +\end{vbframe} + \begin{vbframe}{Example: Overfitting} diff --git a/slides/regularization/slides-regu-l1l2.tex b/slides/regularization/slides-regu-l1l2.tex index 67bb0502..38914b7a 100644 --- a/slides/regularization/slides-regu-l1l2.tex +++ b/slides/regularization/slides-regu-l1l2.tex @@ -143,28 +143,33 @@ \begin{vbframe}{Lasso Regression} -Another shrinkage method is the so-called \textbf{Lasso regression}, which uses an $L1$ penalty on $\thetab$: - +Another shrinkage method is the so-called \textbf{Lasso regression} ({\scriptsize{least absolute shrinkage and selection operator}}), which uses an $L1$ penalty on $\thetab$: +\vspace{-0.2cm} \begin{eqnarray*} -\thetah_{\text{Lasso}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi\right)^2 + \lambda \|\thetab\|_1 \\ - &=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \|\thetab\|_1. +\thetah_{\text{Lasso}}= \argmin_{\thetab} \underbrace{\sumin \left(\yi - \thetab^T \xi\right)^2}_{\left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right)} + \lambda \|\thetab\|_1 \end{eqnarray*} +Optimization is now much harder: $\riskrt$ is still convex, but in general there is no analytical solution and it is non-differentiable.\\ +\vspace{0.2cm} -Note that optimization now becomes much harder. $\riskrt$ is still convex, but we have moved from an optimization problem with an analytical solution towards a non-differentiable problem. +For the special case of an orthonormal design $\Xmat^{\top}\Xmat=\id$, we obtain a closed-form solution in terms of $\thetah_{\text{OLS}}=(\Xmat^{\top}\Xmat)^{-1}\Xmat^{\top}\yv=\Xmat^{\top}\yv$: +$$\thetah_{\text{Lasso}}=\text{sign}(\thetah_{\text{OLS}})(\vert \thetah_{\text{OLS}} \vert - \lambda)_{+}\quad(\text{sparsity})$$ -\lz +Comparing this to $\thetah_{\text{Ridge}}$, we see a different behavior as $\lambda$ increases: +$$\thetah_{\text{Ridge}}=\frac{\thetah_{\text{OLS}}}{1+\lambda}\quad (\text{no sparsity, uniform downscaling})$$ -Name: least absolute shrinkage and selection operator. + +%\textbf{NB}: lasso=least absolute shrinkage and selection operator. \framebreak We can also rewrite this as a constrained optimization problem. The penalty results in the constrained region to look like a diamond shape. \vspace{-0.2cm} \begin{eqnarray*} -\min_{\thetab} && \sumin \left(\yi - \fxit\right)^2\\ -\text{subject to: } && \|\thetab\|_1 \leq t \\ +\min_{\thetab} \sumin \left(\yi - \fxit\right)^2\, +\text{subject to: } \|\thetab\|_1 \leq t \end{eqnarray*} -\vspace{-0.2cm} +The kinks of the $L1$ constraint enforce sparse solutions: the loss contours typically first hit the constraint region at one of its sharp corners, which lie on the coordinate axes where (some) entries are zero.
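As a quick numerical illustration of the two closed forms above (orthonormal design assumed; the OLS coefficients and lambda are made up, and the snippet is not part of the slide sources), soft-thresholding sets small coefficients exactly to zero while Ridge only downscales them:

import numpy as np

theta_ols = np.array([3.0, -1.5, 0.4, -0.2])   # hypothetical OLS estimates under an orthonormal design
lam = 1.0

# Lasso: soft-thresholding sign(theta) * (|theta| - lambda)_+  -> exact zeros (sparsity)
theta_lasso = np.sign(theta_ols) * np.maximum(np.abs(theta_ols) - lam, 0.0)

# Ridge: uniform downscaling theta / (1 + lambda) -> shrinks, but no exact zeros
theta_ridge = theta_ols / (1.0 + lam)

print(theta_lasso)   # two of the four coefficients are exactly zero
print(theta_ridge)   # all coefficients stay nonzero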
+\vspace{-0.1cm} \begin{figure}%\includegraphics[width=0.3\textwidth]{figure_man/lasso_hat.png}\\ \includegraphics[width=0.95\textwidth]{figure/lasso_contours_cases.png}\\ \end{figure} diff --git a/slides/regularization/slides-regu-ridge-deepdive.tex b/slides/regularization/slides-regu-ridge-deepdive.tex new file mode 100644 index 00000000..b123ade8 --- /dev/null +++ b/slides/regularization/slides-regu-ridge-deepdive.tex @@ -0,0 +1,45 @@ +\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer} +\input{../../style/preamble} +\input{../../latex-math/basic-math} +\input{../../latex-math/basic-ml} + +\newcommand{\titlefigure}{figure_man/bias-variance-ridge.png} +\newcommand{\learninggoals}{ + \item Know alternative interpretations of Ridge regression + \item Derivation of the bias-variance tradeoff for Ridge regression +} + +\title{Introduction to Machine Learning} +\date{} + +\begin{document} + +\lecturechapter{Ridge Regression Deep-Dive} +\lecture{Introduction to Machine Learning} + + + +\begin{vbframe}{Perspectives on $L2$ regularization} +We already saw that $L2$ regularization is equivalent to a constrained optimization problem: +\begin{eqnarray*} + \thetah_{\text{Ridge}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2 = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv\\ + %&=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \thetab^\top \thetab \\ + &=& \argmin_{\thetab} \sumin \left(\yi - \fxit\right)^2 \, + \text{s.t. } \|\thetab\|_2^2 \leq t + \end{eqnarray*} +We can also recover the Ridge estimator by performing least-squares on a \textbf{row-augmented} data set: Let \scriptsize{$\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix} + \yv \\ \bm{0}_{p} +\end{pmatrix}$.} \normalsize{Using the augmented data, the least-squares objective becomes} +\small{ +$$%\argmin_{\thetab} +\sum_{i=1}^{n+p} \left(\tilde{\yi} - \thetab^T \tilde{\xi} \right)^2 = %\argmin_{\thetab} +\sum_{i=1}^{n} \left(\yi - \thetab^T \xi \right)^2 + \sum_{j=1}^{p} \left(0 - \sqrt{\lambda} \theta_j \right)^2 %= \thetah_{\text{Ridge}} +=\sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2 +$$ +} +\normalsize{Thus the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$ is $\thetah_{\text{Ridge}}$.} +%$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$ +\end{vbframe} + +\endlecture +\end{document} diff --git a/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex b/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex index 5235f6aa..fdd75fad 100644 --- a/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex +++ b/slides/regularization/slides-regu-softthresholding-lasso-deepdive.tex @@ -3,9 +3,9 @@ \input{../../latex-math/basic-math} \input{../../latex-math/basic-ml} -\newcommand{\titlefigure}{figure/graddes_vs_weightdecay.png} +\newcommand{\titlefigure}{slides/regularization/figure/th_l1_pos.pdf} \newcommand{\learninggoals}{ - \item todo + \item Understand the relationship between soft-thresholding and L1 regularization } \title{Introduction to Machine Learning} @@ -21,7 +21,7 @@ \begin{vbframe}{Soft-thresholding and L1 regularization} In the lecture, we wanted to solve \[ - \min_{\thetab} \mathcal{\tilde R}_{\text{reg}}(\thetab) = \min_{\thetab}\mathcal{R}_{\text{emp}}(\thetah) + \sum_j \left[ \frac{1}{2} H_{j,j} (\theta_j - 
\hat{\theta}_j)^2 \right] + \sum_j \lambda |\theta_j|. + \min_{\thetab} \mathcal{\tilde R}_{\text{reg}}(\thetab) = \min_{\thetab}\mathcal{R}_{\text{emp}}(\thetah) + \sum_k \left[ \frac{1}{2} H_{k,k} (\theta_k - \hat{\theta}_k)^2 \right] + \sum_k \lambda |\theta_k|. \] This is a convex problem (since it is the sum of convex functions) for which, in general, no analytical solution exists. \\ \lz @@ -36,7 +36,7 @@ First, we will focus on the everywhere differentiable part: \begin{align*} -\frac{\partial}{\partial \thetab_j}\sum_j \left[\frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 \right] +\frac{\partial}{\partial \thetab_j}\sum_k \left[\frac{1}{2} H_{k,k} (\theta_k - \hat{\theta}_k)^2 \right] &= H_{j,j} (\theta_j - \hat{\theta}_j) \\ &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j \\ \end{align*} @@ -44,21 +44,51 @@ First, we consider the cases $\hat{\theta}_{\text{Lasso},j} > 0, \hat{\theta}_{\text{Lasso},j} < 0.$ \\ (Here $\frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}}$ exists) \\ \lz -1) $\hat{\theta}_{\text{Lasso},j} > 0:$ -$\frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} = H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j + \lambda \overset{!}{=} 0$ \\ -$\quad \Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j - -\frac{\lambda}{H_{j,j}} > 0 \iff \hat{\theta}_j > \frac{\lambda}{H_{j,j}}$\\ \framebreak -2) $\hat{\theta}_{\text{Lasso},j} < 0:$ -$\frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} = H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j - \lambda \overset{!}{=} 0$ \\ -$\quad \Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j - + \frac{\lambda}{H_{j,j}} < 0 \iff \hat{\theta}_j < -\frac{\lambda}{H_{j,j}}$\\ - \lz +1) $\hat{\theta}_{\text{Lasso},j} > 0:$ \\ +\lz +\begin{minipage}{0.4\textwidth} + \includegraphics[width=5cm]{slides/regularization/figure/th_l1_pos.pdf} +\end{minipage} +\hfill +\begin{minipage}{0.49\textwidth} +\begin{align*} + \frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j + \lambda \overset{!}{=} 0 \\ + &\Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j + -\frac{\lambda}{H_{j,j}} > 0 \\ + &\iff \hat{\theta}_j > \frac{\lambda}{H_{j,j}} +\end{align*} +\end{minipage} + \newpage + +2) $\hat{\theta}_{\text{Lasso},j} < 0:$ \\ +\lz +\begin{minipage}{0.4\textwidth} + \includegraphics[width=5cm]{slides/regularization/figure/th_l1_neg.pdf} +\end{minipage} +\hfill +\begin{minipage}{0.49\textwidth} +\begin{align*} + \frac{\partial}{\partial \thetab_j}\mathcal{\tilde R}_{\text{reg}} &= H_{j,j}\theta_j - H_{j,j} \hat{\theta}_j - \lambda \overset{!}{=} 0 \\ + &\Rightarrow \hat{\theta}_{\text{Lasso},j} = \hat{\theta}_j + +\frac{\lambda}{H_{j,j}} < 0 \\ + &\iff \hat{\theta}_j < -\frac{\lambda}{H_{j,j}} +\end{align*} +\end{minipage} + \newpage + +\begin{minipage}{0.4\textwidth} + \includegraphics[width=5cm]{slides/regularization/figure/th_l1_zero.pdf} +\end{minipage} +\hfill +\begin{minipage}{0.49\textwidth} $\Rightarrow$ If $\hat{\theta}_j \in [-\frac{\lambda}{H_{j,j}}, \frac{\lambda}{H_{j,j}}]$ then $\mathcal{\tilde R}_{\text{reg}}$ has no stationary point with $$\hat{\theta}_{\text{Lasso},j} < 0 \text{ or } \hat{\theta}_{\text{Lasso},j} > 0.$$ \\ -However, there must be at least one stationary point since $\mathcal{\tilde R}_{\text{reg}}$ is a regularized convex risk. \\ +However, there must be at least one stationary point since $\mathcal{\tilde R}_{\text{reg}}$ is a regularized convex risk. 
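Before the concluding case distinction of the Lasso derivation below, a quick numerical check of the row-augmentation identity from the Ridge deep-dive above; the data, dimensions, and lambda are made up, and the sketch is not part of the slide sources. It verifies that least squares on the augmented data reproduces the closed-form Ridge estimator:

import numpy as np

# Made-up data; lambda chosen arbitrarily
rng = np.random.default_rng(0)
n, p, lam = 50, 3, 2.0
X = rng.normal(size=(n, p))
y = rng.normal(size=n)

# Closed-form Ridge estimator: (X^T X + lambda * I)^{-1} X^T y
theta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# Ordinary least squares on the row-augmented data (X; sqrt(lambda) * I) and (y; 0)
X_aug = np.vstack([X, np.sqrt(lam) * np.eye(p)])
y_aug = np.concatenate([y, np.zeros(p)])
theta_aug, *_ = np.linalg.lstsq(X_aug, y_aug, rcond=None)

print(np.allclose(theta_ridge, theta_aug))   # True: both routes give the same estimator

Under these assumptions, sklearn's Ridge(alpha=lam, fit_intercept=False), as used in the scripts above, should yield the same coefficients.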
+\end{minipage} + \\ \begin{align*}\Rightarrow \hat{\theta}_{\text{Lasso},j} &= \begin{cases} \hat{\theta}_j + \frac{\lambda}{H_{j,j}} &, \text{if} \;\hat{\theta}_j < -\frac{\lambda}{H_{j,j}} \\