Commit d8109de (1 parent: a0c36e7), showing 16 changed files with 328 additions and 30 deletions.
@@ -0,0 +1,94 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Set the random seed for reproducibility
np.random.seed(0)

# Define the true function and the number of datasets
true_function = lambda x: np.sin(x)
n_datasets = 100  # Number of datasets for training
n_samples = 100
n_test_samples = 10000
n_order = 8
lambdas = np.exp(np.linspace(-6, 7, 25))

# Generate polynomial features
poly = PolynomialFeatures(degree=n_order, include_bias=False)

# Initialize arrays to store the bias, variance, and error
bias_squared = np.zeros_like(lambdas)
variance = np.zeros_like(lambdas)
test_error = np.zeros_like(lambdas)

# Generate shared x values for all datasets
x_shared = np.random.uniform(0, 1, n_samples).reshape(-1, 1)
x_shared_poly = poly.fit_transform(x_shared)

# Generate test data
x_test = np.random.uniform(0, 1, n_test_samples).reshape(-1, 1)
y_test = true_function(x_test).reshape(-1, 1) + np.random.randn(n_test_samples, 1)
x_test_poly = poly.transform(x_test)

# Loop over the lambda values
for i, lambda_val in enumerate(lambdas):
    # Initialize array to store predictions for each model
    predictions = np.zeros((n_datasets, n_samples))

    # Train and predict with n_datasets models
    for j in range(n_datasets):
        # Generate new y values for each dataset
        epsilon = np.random.randn(n_samples, 1)
        y = true_function(x_shared) + epsilon

        # Fit Ridge regression model
        model = Ridge(alpha=lambda_val, fit_intercept=True)
        model.fit(x_shared_poly, y)
        predictions[j, :] = model.predict(x_shared_poly).flatten()

    # Calculate the average prediction for each x
    average_prediction = np.mean(predictions, axis=0)

    # Compute integrated bias^2 and variance via Monte Carlo
    bias_squared[i] = np.mean((average_prediction - true_function(x_shared).flatten()) ** 2)
    variance[i] = np.mean(np.var(predictions, axis=0))

# Train a final model on a new dataset and compute test error for each lambda
for i, lambda_val in enumerate(lambdas):
    # Generate new data for the final model
    x_train_final = np.random.uniform(0, 1, n_samples).reshape(-1, 1)
    y_train_final = true_function(x_train_final) + np.random.randn(n_samples, 1)
    x_train_final_poly = poly.transform(x_train_final)

    # Fit the final model
    model_final = Ridge(alpha=lambda_val, fit_intercept=True)
    model_final.fit(x_train_final_poly, y_train_final)

    # Predict on the test set and compute the test error
    y_test_pred_final = model_final.predict(x_test_poly).flatten()
    test_error[i] = mean_squared_error(y_test, y_test_pred_final)

# Plotting the results with two y-axes
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot bias^2 and variance on the primary y-axis
ax1.plot(np.log(lambdas), bias_squared, label='(bias)^2', color='red')
ax1.plot(np.log(lambdas), variance, label='variance', color='blue')
ax1.plot(np.log(lambdas), bias_squared + variance, label='(bias)^2 + variance', color='green')

ax1.set_xlabel('ln(λ)', fontsize=16)
ax1.set_ylabel('(bias)^2, variance', fontsize=16)
ax1.legend(loc='upper left')

# Create secondary y-axis for test error
ax2 = ax1.twinx()
ax2.plot(np.log(lambdas), test_error, label='test error', color='magenta', linestyle='--', alpha=.6)
ax2.set_ylabel('Test error on single dataset', fontsize=16)
ax2.legend(loc='upper right')

plt.title('Bias-Variance Tradeoff with L2 Regularization', fontsize=20)
plt.show()
@@ -0,0 +1,52 @@
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
import numpy as np
import matplotlib.pyplot as plt

# Load wine dataset
X, y = load_wine(return_X_y=True)

# Generate standard normal noise features
np.random.seed(42)
noise_features = np.random.normal(size=(X.shape[0], 75))

# Add these noise features to the original dataset
X_extended = np.hstack((X, noise_features))

# Split the augmented dataset into training and test sets
X_train_ext, X_test_ext, y_train_ext, y_test_ext = train_test_split(
    X_extended, y, test_size=0.2, random_state=42
)

# Standardize the augmented dataset
scaler_ext = StandardScaler()
X_train_ext_scaled = scaler_ext.fit_transform(X_train_ext)
X_test_ext_scaled = scaler_ext.transform(X_test_ext)

# Define a range of lambda (alpha) values
lambda_values = np.logspace(-4, 4, 50)

# Perform Ridge regression with (leave-one-out) cross-validation on the extended dataset
# Note: newer scikit-learn versions rename store_cv_values / cv_values_ to store_cv_results / cv_results_
ridge_cv_ext = RidgeCV(alphas=lambda_values, store_cv_values=True)
ridge_cv_ext.fit(X_train_ext_scaled, y_train_ext)

# CV curve for the extended dataset: mean CV error per lambda
mean_cv_scores_ext = np.mean(ridge_cv_ext.cv_values_, axis=0)

# Find the lambda value with the minimum CV score
min_lambda_index = np.argmin(mean_cv_scores_ext)
min_lambda_value = lambda_values[min_lambda_index]

# Plot the CV curve with a vertical line at the minimum CV score
plt.figure(figsize=(8, 6))
plt.plot(lambda_values, mean_cv_scores_ext, marker='o', color='red')
plt.axvline(x=min_lambda_value, color='blue', linestyle='--', label=f'Min CV Score at λ={min_lambda_value:.4f}')
plt.xscale('log')
plt.xlabel('Lambda (Regularization strength)', fontsize=14)
plt.ylabel('Generalization error', fontsize=14)
#plt.title('Wine dataset with add. noise features', fontsize=12)
plt.title('Effect of L2 Regularization', fontsize=16)
plt.legend()
plt.show()
@@ -0,0 +1,59 @@
library(ggplot2)

lambda = 2

# L1-regularized quadratic objectives in a single parameter theta:
# the quadratic part plays the role of the unregularized risk,
# the absolute-value term is the L1 penalty.

# Unregularized minimum at 0: the regularized minimum stays at 0
fun1 <- function(x) {
  return(x^2 + 3 * abs(x) + 1)
}

# Unregularized minimum at +4: the regularized minimum is shifted to 4 - lambda
fun2 <- function(x) {
  return(0.5 * (x - 4)^2 + lambda * abs(x) + 1)
}

# Unregularized minimum at -4: the regularized minimum is shifted to -4 + lambda
fun3 <- function(x) {
  return(0.5 * (x + 4)^2 + lambda * abs(x) + 1)
}

p1 <- ggplot() +
  xlim(-7, 7) +
  geom_function(fun = fun1) +
  xlab(expression(theta)) +
  ylab(expression(R[reg])) +
  geom_vline(xintercept = 0,
             linetype = "dashed") +
  theme_bw(base_size = 20)

pdf("../figure/th_l1_zero.pdf")
print(p1)
dev.off()

p2 <- ggplot() +
  xlim(-7, 7) +
  geom_function(fun = fun2) +
  xlab(expression(theta)) +
  ylab(expression(R[reg])) +
  geom_vline(xintercept = 4 - lambda,
             linetype = "dashed") +
  theme_bw(base_size = 20)

pdf("../figure/th_l1_pos.pdf")
print(p2)
dev.off()

p3 <- ggplot() +
  xlim(-7, 7) +
  geom_function(fun = fun3) +
  xlab(expression(theta)) +
  ylab(expression(R[reg])) +
  geom_vline(xintercept = -4 + lambda,
             linetype = "dashed") +
  theme_bw(base_size = 20)

pdf("../figure/th_l1_neg.pdf")
print(p3)
dev.off()
@@ -0,0 +1,45 @@
\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer}
\input{../../style/preamble}
\input{../../latex-math/basic-math}
\input{../../latex-math/basic-ml}

\newcommand{\titlefigure}{figure_man/bias-variance-ridge.png}
\newcommand{\learninggoals}{
  \item Know alternative interpretations of Ridge regression
  \item Know the derivation of the bias-variance tradeoff for Ridge regression
}

\title{Introduction to Machine Learning}
\date{}

\begin{document}

\lecturechapter{Ridge Regression Deep-Dive}
\lecture{Introduction to Machine Learning}

\begin{vbframe}{Perspectives on $L2$ regularization}
We already saw that $L2$ regularization is equivalent to a constrained optimization problem:
\begin{eqnarray*}
\thetah_{\text{Ridge}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2 = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv\\
%&=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \thetab^\top \thetab \\
&=& \argmin_{\thetab} \sumin \left(\yi - \fxit\right)^2 \,
\text{s.t. } \|\thetab\|_2^2 \leq t
\end{eqnarray*}
We can also recover the Ridge estimator by performing least squares on a \textbf{row-augmented} data set: Let \scriptsize{$\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix}
\yv \\ \bm{0}_{p}
\end{pmatrix}$.} \normalsize{Using the augmented data, the least-squares objective becomes}
\small{
$$%\argmin_{\thetab}
\sum_{i=1}^{n+p} \left(\tilde{\yi} - \thetab^T \tilde{\xi} \right)^2 = %\argmin_{\thetab}
\sum_{i=1}^{n} \left(\yi - \thetab^T \xi \right)^2 + \sum_{j=1}^{p} \left(0 - \sqrt{\lambda} \theta_j \right)^2 %= \thetah_{\text{Ridge}}
=\sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2
$$
}
\normalsize{Thus the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$ is $\thetah_{\text{Ridge}}$.}
%$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$
\end{vbframe}

\endlecture
\end{document}
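
The row-augmentation argument on this slide is easy to verify numerically. Below is a minimal sketch (added here for illustration, not part of the commit), assuming a no-intercept model on synthetic data; the variable names are hypothetical and chosen only for this check.

import numpy as np

# Synthetic data; no intercept term, matching the formula on the slide
rng = np.random.default_rng(0)
n, p, lam = 50, 5, 2.0
X = rng.normal(size=(n, p))
y = X @ rng.normal(size=p) + rng.normal(size=n)

# Ridge closed form: (X'X + lambda * I)^{-1} X'y
theta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# Least squares on the row-augmented data: X stacked on sqrt(lambda) * I, y padded with zeros
X_aug = np.vstack([X, np.sqrt(lam) * np.eye(p)])
y_aug = np.concatenate([y, np.zeros(p)])
theta_aug, *_ = np.linalg.lstsq(X_aug, y_aug, rcond=None)

print(np.allclose(theta_ridge, theta_aug))  # expected: True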