
Commit

Merge overleaf-2023-11-02-1709 into main
ludwigbothmann authored Nov 2, 2023
2 parents a257c13 + c06d1c1 commit e812ba2
Showing 6 changed files with 35 additions and 34 deletions.
4 changes: 2 additions & 2 deletions slides/regularization/chapter-order.tex
@@ -23,8 +23,8 @@ \subsection{Lasso vs. Ridge Regression}
\subsection{Elastic Net and Regularization for GLMs}
\includepdf[pages=-]{../slides-pdf/slides-regu-enetlogreg.pdf}

\subsection{Regularization for Underdetermined Problem}
\includepdf[pages=-]{../slides-pdf/slides-regu-underdetermined.pdf}
%\subsection{Regularization for Underdetermined Problem}
%\includepdf[pages=-]{../slides-pdf/slides-regu-underdetermined.pdf}

\subsection{L0 Regularization}
\includepdf[pages=-]{../slides-pdf/slides-regu-l0.pdf}
23 changes: 11 additions & 12 deletions slides/regularization/slides-regu-enetlogreg.tex
@@ -25,28 +25,27 @@


Elastic Net combines the $L1$ and $L2$ penalties:

$$
\mathcal{R}_{\text{elnet}}(\thetab) = \sumin (\yi - \thetab^\top \xi)^2 + \lambda_1 \|\thetab\|_1 + \lambda_2 \|\thetab\|_2^2.
$$

\small{
\begin{align*}
\mathcal{R}_{\text{elnet}}(\thetab) &= \sumin (\yi - \thetab^\top \xi)^2 + \lambda_1 \|\thetab\|_1 + \lambda_2 \|\thetab\|_2^2 \\
&= \sumin (\yi - \thetab^\top \xi)^2 + \lambda \left( (1-\alpha) \|\thetab\|_1 + \alpha \|\thetab\|_2^2\right),\, \alpha=\frac{\lambda_2}{\lambda_1+\lambda_2}, \lambda=\lambda_1+\lambda_2
\end{align*}}
\begin{figure}
\includegraphics[width=0.6\textwidth]{figure/lasso_ridge_enet_2d.png}\\
\includegraphics[width=0.55\textwidth]{figure/lasso_ridge_enet_2d.png}\\
\end{figure}


\vspace{-0.2cm}
\begin{itemize}
\item Correlated predictors tend to be either selected or zeroed out together.
\item Correlated features tend to be either selected or zeroed out together.
\item Selection of more than $n$ features possible for $p>n$.
\end{itemize}
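To see why the two parametrizations above coincide, substitute $\alpha=\frac{\lambda_2}{\lambda_1+\lambda_2}$ and $\lambda=\lambda_1+\lambda_2$ (a short check, not part of the slide itself):
$$
\lambda (1-\alpha) = (\lambda_1+\lambda_2)\,\frac{\lambda_1}{\lambda_1+\lambda_2} = \lambda_1,
\qquad
\lambda \alpha = (\lambda_1+\lambda_2)\,\frac{\lambda_2}{\lambda_1+\lambda_2} = \lambda_2,
$$
so both forms describe the same penalty $\lambda_1 \|\thetab\|_1 + \lambda_2 \|\thetab\|_2^2$.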


\framebreak
\footnotesize
Simulating two examples with each 50 data sets and 100 observations each: \\
Simulating 50 data sets with 100 observations each for two coefficient settings: \\


$$\yv =\Xmat \boldsymbol{\beta}+\sigma \epsilon, \quad \epsilon \sim N(0,1), \quad \sigma = 1$$
$$\yv =\Xmat \boldsymbol{\beta}+ \epsilon, \quad \epsilon \sim N(0,1)$$

\begin{columns}
\begin{column}{0.5\textwidth}
@@ -90,7 +89,7 @@

\lz

Hence, we can, e.g., construct $L1$- or $L2$-penalized logistic regression to enable coefficient shrinkage and variable selection in this model.
Hence, we can construct, e.g., $L1$- or $L2$-penalized logistic regression to enable coefficient shrinkage and variable selection in this model class.
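As a minimal sketch of what such penalized logistic regression looks like in practice (assuming scikit-learn and a synthetic dataset; this is an illustration, not the course code; scikit-learn's C is the inverse of the penalty strength $\lambda$):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Synthetic binary classification data as a stand-in for any real dataset
X, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)
X = StandardScaler().fit_transform(X)  # standardize before penalizing

# L1-penalized logistic regression: shrinks coefficients and sets some exactly to zero
logreg_l1 = LogisticRegression(penalty="l1", solver="liblinear", C=0.1).fit(X, y)

# L2-penalized logistic regression: shrinks coefficients but keeps them dense
logreg_l2 = LogisticRegression(penalty="l2", solver="lbfgs", C=0.1).fit(X, y)

print("nonzero coefficients (L1):", int(np.sum(logreg_l1.coef_ != 0)))
print("nonzero coefficients (L2):", int(np.sum(logreg_l2.coef_ != 0)))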

% \lz
% We can add a regularizer to the risk of logistic regression
9 changes: 5 additions & 4 deletions slides/regularization/slides-regu-intro.tex
@@ -62,7 +62,8 @@ \section{Motivation for Regularization}
\begin{vbframe}{Avoid Overfitting}

Why can \textbf{overfitting} happen? And how to avoid it?

\lz
\lz
\begin{enumerate}
\item Not enough data \\
$\to$ collect \textbf{more data}
@@ -96,7 +97,7 @@ \section{Motivation for Regularization}

\lz

We try the simplest model we can think of: the constant model. For the $L2$ loss, the optimal constant model is
We try the simplest model we can think of: the constant model. For the $L2$ loss, the optimal constant model is the empirical mean

$$
\fxt = \frac{1}{n}\sumin \yi
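For completeness, a one-line derivation of this fact in the notation of the slides: minimize the empirical $L2$ risk over a constant $c$ and set the derivative to zero,
$$
\frac{\partial}{\partial c} \sumin (\yi - c)^2 = -2 \sumin (\yi - c) \overset{!}{=} 0
\quad \Longrightarrow \quad
c = \frac{1}{n} \sumin \yi.
$$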
@@ -163,7 +164,7 @@ \section{Motivation for Regularization}

\framebreak

Until now, we can either add a feature completely or not at all.
Until now, we can either include or exclude features in a binary fashion.

\lz

@@ -183,7 +184,7 @@ \section{Regularized Empirical Risk Minimization}
\begin{vbframe}{Regularized Empirical Risk Minimization}


Recall, empirical risk minimization with a complex hypothesis set tends to overfit. A major tool to handle overfitting is \textbf{regularization}.
Recall, empirical risk minimization with a complex hypothesis set tends to overfit. A major tool for handling overfitting is \textbf{regularization}.

\lz

10 changes: 5 additions & 5 deletions slides/regularization/slides-regu-l1l2.tex
@@ -23,8 +23,8 @@

\begin{itemize}
\item Linear models can also overfit if we operate in a high-dimensional space with not that many observations.
\item OLS usually require a full-rank design matrix.
\item When features are highly correlated, the least-squares estimate becomes highly sensitive to random errors in the observed response, producing a large variance in the fit.
\item The OLS estimator requires a full-rank design matrix.
\item For highly correlated features, the least-squares estimate becomes highly sensitive to random errors in the observed response, producing a large variance in the fit.
\item We now add a complexity penalty to the loss:
$$
\riskrt = \sumin \left(\yi - \thetab^\top \xi \right)^2 + \lambda \cdot J(\thetab).
@@ -86,7 +86,7 @@
\begin{footnotesize}
\begin{itemize}
\item We still optimize the $\risket$, but cannot leave a ball around the origin.
\item $\risket$ grows monotonically if we move away from $\thetah$.
\item $\risket$ grows monotonically if we move away from $\thetah$ (elliptic contours).
\item Inside constraints perspective: From origin, jump from contour line to contour line (better) until you become infeasible, stop before.
\item Outside constraints perspective: From $\thetah$, jump from contour line to contour line (worse) until you become feasible, stop then.
\item So our new optimum will lie on the boundary of that ball.
@@ -102,11 +102,11 @@

\begin{vbframe}{Example: Polynomial Ridge Regression}

True (unknown) function is \(f(x) = 5 + 2x +10x^2 - 2x^3 + \epsilon\) (in red).
Consider $y=f(x)+\epsilon$ where the true (unknown) function is \(f(x) = 5 + 2x +10x^2 - 2x^3\) (in red).

\lz

Let us consider a \(d\)th-order polynomial
We now fit the data using a \(d\)th-order polynomial
\[ f(x) = \theta_0 + \theta_1 x + \cdots + \theta_d x^d = \sum_{j = 0}^{d} \theta_j x^j\text{.} \]
Using model complexity $d = 10$ overfits:
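A minimal sketch of this setup (assuming scikit-learn, standard normal noise, and uniformly sampled inputs; the slide figures come from the course code, so this only illustrates the idea):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.default_rng(0)
x = rng.uniform(-2, 2, size=50)
y = 5 + 2 * x + 10 * x**2 - 2 * x**3 + rng.normal(0, 1, size=50)  # y = f(x) + eps

# Degree-10 polynomial: lambda near 0 behaves like the unregularized fit and overfits;
# larger lambda (called alpha in scikit-learn) shrinks the coefficients.
for lam in [1e-6, 0.1, 10.0]:
    model = make_pipeline(PolynomialFeatures(degree=10, include_bias=False), Ridge(alpha=lam))
    model.fit(x.reshape(-1, 1), y)
    print(lam, np.round(model.named_steps["ridge"].coef_, 2))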

16 changes: 8 additions & 8 deletions slides/regularization/slides-regu-l1vsl2.tex
@@ -42,7 +42,7 @@

\textbf{Example 1: Boston Housing} (a few features removed for readability) \\

We cannot overfit here with an unregularized linear model as the task is so low-dimensional. But we see how only Lasso shrinks to sparsely 0.
We cannot overfit here with an unregularized linear model as the task is so low-dimensional. But we see how only Lasso shrinks to exactly 0.

\begin{figure}
\includegraphics[width=0.9\textwidth]{figure/shrinkage_1.png}\\
@@ -53,7 +53,7 @@
\framebreak
\textbf{Example 2: High-dimensional simulated data} \\
We simulate a continuous, correlated dataset with 50 features, 100 observations $\xv^{(1)},\dots, \xv^{(100)} \overset{\text{i.i.d.}}{\sim} \normal \left(\mathbf{0}, \Sigma \right)$ and
$$ y = 10 \cdot (x_1 + x_2) + 5 \cdot (x_3 + x_4) + \sum_{j = 5}^{14} x_j + \epsilon $$
$$ y = 10 \cdot (x_1 + x_2) + 5 \cdot (x_3 + x_4) + 1 \cdot \sum_{j = 5}^{14} x_j + \epsilon $$
where $\epsilon \sim \normal \left(0, 1\right)$ and $ \forall k, l \in \{1, ..., 50\}$:
$$Cov(x_k, x_l) =
\begin{cases}
@@ -63,7 +63,7 @@
$$
Note that 36 of the 50 features are noise variables. \\
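A sketch of this data-generating process (the exact covariance entries sit in the collapsed part of the diff, so the correlation value below is a placeholder assumption, not the value used on the slide):

import numpy as np

rng = np.random.default_rng(0)
n, p, rho = 100, 50, 0.5      # rho is a placeholder value, not taken from the slide

# Equicorrelated covariance as a stand-in: 1 on the diagonal, rho everywhere else
Sigma = np.full((p, p), rho) + (1 - rho) * np.eye(p)
X = rng.multivariate_normal(np.zeros(p), Sigma, size=n)

# True coefficients: 10, 10, 5, 5, then ten 1s, then 36 zeros (noise features)
beta = np.concatenate([[10, 10, 5, 5], np.ones(10), np.zeros(36)])
y = X @ beta + rng.normal(0, 1, size=n)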
\framebreak
Coefficient histograms for different $\lambda$ values for Ridge and Lasso, on high-dimensional data along with the cross-validated MSE.
Coefficient histograms for different $\lambda$ values for Ridge and Lasso for simulated data along with the cross-validated MSE.

\begin{figure}
\includegraphics[width=0.9\textwidth]{figure/shrinkage_2.png}\\
@@ -74,8 +74,8 @@
\begin{vbframe}{Regularization and Feature Scaling}

\begin{itemize}
\item Note that very often we do not include $\theta_0$ in the penalty term $J(\thetab)$ (but this can be implementation-dependent).
\item These methods are typically not equivariant under scaling of the inputs, so one usually standardizes the features.
\item Typically we omit $\theta_0$ in the penalty term $J(\thetab)$ so that the ``infinitely'' regularized model is the constant model (but this can be implementation-dependent).
\item Penalty methods are typically not equivariant under scaling of the inputs, so one usually standardizes the features beforehand.
\item Note that for a normal LM, if we scale some features, we can simply "anti-scale" the coefficients in the same way; the risk does not change. For regularized models this is not so simple: if we scale features to smaller values, their coefficients have to become larger to compensate and are then penalized more heavily in $J(\thetab)$. Such a scaling would make some features less attractive without changing anything relevant in the data.

% \item While ridge regression usually leads to smaller estimated coefficients, but still dense $\thetab$ vectors,
@@ -127,7 +127,7 @@

\includegraphics[width=0.9\textwidth]{figure/regu_example_multicollinearity.png}

Fictional example for the model $y = 0.2X_1 + 0.2X_2 + 0.2X_3 + 0.2X_4 + 0.2X_5 + \epsilon$ of 100 observations, $\epsilon \sim \normal (0,1)$. $X_1$-$X_4$ are independently drawn from different normal distributions: $X_1, X_2, X_3, X_4 \sim \normal (0,2)$. While $X_1$-$X_4$ have pairwise correlation coefficients of 0, $X_4$ and $X_5$ are nearly perfectly correlated: $X_5 = X_4 + \delta, \delta \sim \normal (0,0.3), \rho(X_4, X_5) = 0.98. $
Consider $n=100$ simulated observations using $y = 0.2X_1 + 0.2X_2 + 0.2X_3 + 0.2X_4 + 0.2X_5 + \epsilon$, $\epsilon \sim \normal (0,1)$. $X_1$-$X_4$ are independently drawn from different normal distributions: $X_1, X_2, X_3, X_4 \sim \normal (0,2)$. While $X_1$-$X_4$ have pairwise correlation coefficients of 0, $X_4$ and $X_5$ are nearly perfectly correlated: $X_5 = X_4 + \delta, \delta \sim \normal (0,0.3), \rho(X_4, X_5) = 0.98. $
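A sketch reproducing this setup (assuming NumPy/scikit-learn and reading the second parameter of $\normal(\cdot,\cdot)$ as a standard deviation, which matches the stated $\rho(X_4, X_5) \approx 0.98$):

import numpy as np
from sklearn.linear_model import Lasso, Ridge

rng = np.random.default_rng(42)
n = 100

# X1-X4 independent; X5 is X4 plus small noise, so X4 and X5 are almost collinear
X14 = rng.normal(0, 2, size=(n, 4))
X5 = X14[:, 3] + rng.normal(0, 0.3, size=n)
X = np.column_stack([X14, X5])
y = 0.2 * X.sum(axis=1) + rng.normal(0, 1, size=n)

print("corr(X4, X5):", np.corrcoef(X[:, 3], X[:, 4])[0, 1].round(3))
print("ridge:", Ridge(alpha=1.0).fit(X, y).coef_.round(2))  # correlated features get similar coefficients
print("lasso:", Lasso(alpha=0.1).fit(X, y).coef_.round(2))  # one of X4/X5 tends to be (nearly) zeroed out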

\vspace{0.1cm}

@@ -140,8 +140,8 @@

\begin{itemize}
\item Neither one can be classified as overall better.
\item Lasso is likely better if the true underlying structure is sparse, so if only few features influence $y$. Ridge works well if there are many influential features.
\item Lasso can set some coefficients to zero, thus performing variable selection, while Ridge regression usually leads to smaller estimated coefficients, but still dense $\thetab$ vectors.
\item Lasso is likely better if the true underlying structure is sparse, so if only few features influence $y$. Ridge works well if there are many (weakly) influential features.
\item Lasso can set some coefficients to zero, thus performing variable selection, while Ridge regression usually leads to smaller estimated coefficients, but still dense parameter vectors $\thetab$.
\item Lasso has difficulties handling correlated predictors. For high correlation Ridge dominates Lasso in performance.
\item For Lasso one of the correlated predictors will have a larger coefficient, while the rest are (nearly) zeroed. The respective feature is, however, selected randomly.
\item For Ridge the coefficients of correlated features are similar.
7 changes: 4 additions & 3 deletions slides/regularization/slides-regu-underdetermined.tex
@@ -30,16 +30,17 @@
\item $\Xmat$ is square and has full rank. This is standard linear system solving and not relevant for us here.
\item $\Xmat$ has more rows than columns. The system is "overdetermined". We now try to solve
$\Xmat \thetab \approx \bm{y}$ by minimizing $|| \Xmat \thetab - \bm{y}||$. Ideally, this difference would be zero, but with more rows than columns this is often not possible. This is equivalent to linear regression!
\item $\Xmat$ has more columns than rows / linear dependence between columns exists. Now there are usually an infinite number of solutions. We have to define a "preference" for them to make the problem well-defined (sounds familiar?). Such problems are called $"$underdetermined$"$.
\item $\Xmat$ has more columns than rows / linear dependence between columns exists. Now there is usually an infinite number of solutions. We have to define a "preference" for them to make the problem well-defined (sounds familiar?). Such problems are called $"$underdetermined$"$.
\end{footnotesize}
\end{enumerate}
\end{itemize}

\framebreak
\begin{itemize}
\item A very old and well-known approach in underdetermined cases is to still reduce the problem to optimization by minimizing $|| \Xmat \thetab - \bm{y}||$, but adding a small positiv constant to the diagonal of $\Xmat^T \Xmat$.
\item A very old and well-known approach in underdetermined cases is to still reduce the problem to optimization by minimizing $|| \Xmat \thetab - \bm{y}||$, but adding a small positive constant to the diagonal of $\Xmat^T \Xmat$.
\item In optimization / numerical analysis this is known as \textbf{Tikhonov} regularization.
\item But as you should be able to see now: This is completely equivalent to Ridge regression!
\item But as you should be able to see now: This is completely equivalent to Ridge regression! Recall:
$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$
\end{itemize}
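A small numerical illustration of this equivalence (a sketch, assuming NumPy): for $p > n$ the matrix $\Xmat^T \Xmat$ is rank-deficient, but adding $\lambda \id$ to its diagonal makes the system uniquely solvable and yields exactly the Ridge estimator above.

import numpy as np

rng = np.random.default_rng(0)
n, p, lam = 20, 50, 1.0                    # more columns than rows: underdetermined
X = rng.normal(size=(n, p))
y = rng.normal(size=n)

XtX = X.T @ X
print(np.linalg.matrix_rank(XtX))          # at most n < p, so XtX is singular

# Tikhonov regularization = Ridge: adding lam to the diagonal makes the system solvable
theta_ridge = np.linalg.solve(XtX + lam * np.eye(p), X.T @ y)
print(theta_ridge.shape)                   # a unique solution despite p > n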

\framebreak
