Skip to content

Commit

Permalink
update svm pros cons and regu/inf theory changes
Browse files Browse the repository at this point in the history
  • Loading branch information
ludwigbothmann committed Dec 21, 2023
1 parent c52d781 commit 70d83f4
Show file tree
Hide file tree
Showing 12 changed files with 247 additions and 25 deletions.
1 change: 1 addition & 0 deletions slides/information-theory/slides-info-mutual-info.tex
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@
\begin{aligned}
I(X ; Y) &= H(X) - H(X | Y) \\
I(X ; Y) &= H(Y) - H(Y | X) \\
I(X ; Y) &\leq \min\{H(X),H(Y)\} \\
I(X ; Y) &= H(X) + H(Y) - H(X, Y) \\
I(X ; Y) &= I(Y ; X) \\
I(X ; X) &= H(X)\\
Expand Down
3 changes: 3 additions & 0 deletions slides/information-theory/slides-info-mutual-info2.tex
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
\textbf{Proof:}$\quad 0 \leq I(X ; Y)=H(X)-H(X | Y)$

Intuitively, the theorem says that knowing another random variable $Y$ can only reduce the uncertainty in $X$. Note that this is true only on the average.
\vspace{0.5cm}

\textbf{Remark}: We have $0 \leq I(X ; Y) \leq H(X)$, but since $H(X)$ itself has no universal upper bound, $I(X ; Y)$ is in general unbounded from above (it can take any value in $\mathbb{R}_{0}^{+}$).

\framebreak

Expand Down
18 changes: 9 additions & 9 deletions slides/information-theory/slides-info-sourcecoding.tex
Original file line number Diff line number Diff line change
Expand Up @@ -53,21 +53,21 @@
\scalebox{1.05}{\includegraphics{figure_man/length_same.png}}
\tiny{\\ Credit: Chris Olah}
\end{figure}
\item The length $L(x)$ is simply the number of bits in the corresponding codeword. In this example, all codewords have length 2.
\item Length $L(x)$ is simply number of bits in corresponding codeword. Here all codewords have length 2.
\end{itemize}
\framebreak
%\framebreak

\begin{figure}
\centering
\scalebox{1}{\includegraphics{figure_man/length_same.png}}
\end{figure}
% \begin{figure}
% \centering
% \scalebox{1}{\includegraphics{figure_man/length_same.png}}
% \end{figure}

\begin{itemize}
\item For this code, the expected length of a message emitted by the source is, naturally:
%\item For this code, the expected length of a message emitted by the source is, naturally:

$$\E[L(X)] = \frac{1}{2} \cdot 2 + \frac{1}{4} \cdot 2 + \frac{1}{8} \cdot 2 + \frac{1}{8} \cdot 2 = 2 \text{ bits.}$$
%$$\E[L(X)] = \frac{1}{2} \cdot 2 + \frac{1}{4} \cdot 2 + \frac{1}{8} \cdot 2 + \frac{1}{8} \cdot 2 = 2 \text{ bits.}$$

\item The area of a rectangle in the image on the right reflects the size of the corresponding term in the expectation.
\item Areas of the rectangles on the right reflect contributions to $\E[L(X)]$

\end{itemize}

Expand Down
77 changes: 77 additions & 0 deletions slides/nonlinear-svm/slides-nonlinsvm-uniapprox.tex
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,83 @@ \section{SVMs as Non-Parametric Models}

\end{vbframe}

\begin{vbframe}{SVM -- Pros \& Cons}

\begin{columns}[T, totalwidth=\textwidth]
\begin{column}{0.5\textwidth}
\textbf{Advantages}
\normalsize
\begin{itemize}
% \item High \textbf{accuracy}
\item Often \textbf{sparse} solution (w.r.t. observations)
\item Robust against overfitting (\textbf{regularized}); especially in
high-dimensional space
\item \textbf{Stable} solutions (w.r.t. changes in train data)\\
$\rightarrow$ Non-SV do not affect decision boundary
\item Convex optimization problem \\
$\rightarrow$ local minimum $\hat{=}$ global minimum
%\item \textbf{memory efficient} (only use non-SVs)
\end{itemize}

% \textbf{Advantages (nonlinear SVM)}
% \begin{itemize}
% \item Can learn \textbf{nonlinear decision boundaries}
% \item \textbf{Very flexible} due to custom kernels \\
% $\rightarrow$ RBF kernel yields local model \\
% $\rightarrow$ kernel for time series, strings etc.
% \end{itemize}
\end{column}

\begin{column}{0.5\textwidth}
\textbf{Disadvantages}
\normalsize
\begin{itemize}
\item \textbf{Long} training times $\rightarrow O(n^2 p + n^3)$
%\item \textbf{Limited scalability} to larger data sets
%\textcolor{blue}{\textbf{??}}
\item Confined to \textbf{linear model}
\item Restricted to \textbf{continuous features}
\item Optimization can also fail or get stuck
% \item Poor \textbf{interpretability}
%\item No handling of \textbf{missing} data
\end{itemize}

% \textbf{Disadvantages (nonlinear SVM)}
% \begin{itemize}
% \item Poor \textbf{interpretability} due to complex kernel
% \item \textbf{Not easy tunable} as it is highly important to choose the right kernel (which also introduces further hyperparameters)
% \end{itemize}
\end{column}
\end{columns}

\framebreak
\lz

\begin{columns}[t, totalwidth=\textwidth]
\begin{column}{0.5\textwidth}
\textbf{Advantages (nonlinear SVM)}
\begin{itemize}
\item Can learn \textbf{nonlin. decision boundaries}
\item \textbf{Very flexible} due to custom kernels \\
$\rightarrow$ RBF kernel yields local model \\
$\rightarrow$ kernel for time series, strings etc.
\end{itemize}
\end{column}

\begin{column}{0.5\textwidth}
\textbf{Disadvantages (nonlin. SVM)}
\begin{itemize}
\item Poor \textbf{interpretability} due to complex kernel
\item \textbf{Not easily tunable} as it is highly important to choose the right kernel (which also introduces further hyperparameters)
\end{itemize}
\end{column}
\end{columns}

% \conclbox{Very accurate solution for high-dimensional data that is linearly
% separable}

\end{vbframe}



\section{Kernels on Infinite-Dimensional Vector Spaces}
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
75 changes: 75 additions & 0 deletions slides/regularization/rsrc/make_ridge_vs_sgd_path.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# Set the random seed for reproducibility
np.random.seed(6)

# Function to generate data
def generate_data(n, p):
    """Simulate a linear-regression data set with Gaussian features and noise.

    Parameters
    ----------
    n : int
        Number of observations (rows of the design matrix).
    p : int
        Number of features (columns of the design matrix).

    Returns
    -------
    X : ndarray of shape (n, p)
        Standard-normal feature matrix.
    y : ndarray of shape (n,)
        Response, i.e. ``X @ true_coef`` plus standard-normal noise.
    true_coef : ndarray of shape (p,)
        Coefficients, equally spaced between -1 and 1.
    """
    # Draws come from NumPy's global random state; features are drawn before
    # the noise, so the result is reproducible under a fixed seed.
    design = np.random.normal(loc=0, scale=1, size=(n, p))
    beta = np.linspace(-1, 1, num=p)
    eps = np.random.normal(loc=0, scale=1, size=n)
    response = design @ beta + eps
    return design, response, beta

# Function to compute the ridge coefficients analytically
def compute_ridge_path(X, y, alphas):
    """Compute closed-form ridge coefficients for a sequence of penalties.

    For every ``alpha`` the linear system ``(X^T X + alpha * I) w = X^T y``
    is solved for ``w``.

    Parameters
    ----------
    X : ndarray of shape (n, p)
        Design matrix.
    y : ndarray of shape (n,)
        Response vector.
    alphas : iterable of float
        Ridge penalties, processed in the given order.

    Returns
    -------
    ndarray of shape (len(alphas) + 1, p)
        Row 0 is all zeros; row ``k + 1`` holds the ridge coefficients for the
        k-th entry of ``alphas``.
    """
    p = X.shape[1]
    # Both products are independent of alpha, so compute them only once.
    gram = X.T @ X
    moment = X.T @ y
    identity = np.identity(p)

    coefs = [np.zeros(p)]  # Start with a row of zeros
    for alpha in alphas:
        # Solve the system directly rather than forming an explicit inverse:
        # cheaper and numerically more stable for ill-conditioned systems.
        coefs.append(np.linalg.solve(gram + alpha * identity, moment))
    return np.array(coefs)

# Function to compute the optimization trajectory for SGD
def compute_sgd_trajectory(X, y, batch_size, learning_rate, n_iter):
    """Run mini-batch SGD on the squared loss and record the weights per epoch.

    Parameters
    ----------
    X : ndarray of shape (n, p)
        Design matrix.
    y : ndarray of shape (n,)
        Response vector.
    batch_size : int
        Number of rows per mini-batch.
    learning_rate : float
        Step size of each gradient update.
    n_iter : int
        Number of passes (epochs) over the data.

    Returns
    -------
    ndarray of shape (n_iter + 1, p)
        Row 0 is the zero initialization; row ``k`` holds the weights after
        the k-th epoch.
    """
    # Take the sample count from X itself; the previous version read a
    # module-level variable `n`, which only worked when the caller happened to
    # define a matching global.
    n_samples = X.shape[0]
    w = np.zeros(X.shape[1])
    coefs = [w.copy()]  # Start with a row of zeros
    for _ in range(n_iter):
        # Reshuffle the rows of X and y jointly at the start of every epoch.
        X_shuffled, y_shuffled = shuffle(X, y)
        for start in range(0, n_samples, batch_size):
            X_batch = X_shuffled[start:start + batch_size]
            y_batch = y_shuffled[start:start + batch_size]
            # Squared-loss gradient on the batch. Note that it is divided by
            # batch_size even if the final batch of an epoch is shorter.
            gradient = -2 * X_batch.T @ (y_batch - X_batch @ w) / batch_size
            w -= learning_rate * gradient
        # One snapshot per epoch, so the result has n_iter + 1 rows.
        coefs.append(w.copy())
    return np.array(coefs)

# Parameters
n = 100
p = 10
batch_size = 4
learning_rate = 0.01
n_iter = 50
# One x-position per stored SGD iterate (initial point plus n_iter epochs).
# The grid starts at 0.001 instead of 0 so the penalties below stay finite.
t_values = np.arange(0.001, n_iter + 1)
# Ridge penalty matched to each t via t = 1 / (learning_rate * lambda)
alphas = 1 / (learning_rate * t_values)

# Generate data
X, y, true_coef = generate_data(n, p)

# Compute the regularization path for ridge regression
ridge_coefs = compute_ridge_path(X, y, alphas)

# Compute the optimization trajectory for SGD
sgd_coefs = compute_sgd_trajectory(X, y, batch_size, learning_rate, n_iter)

# Plotting
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
# Regularization path for ridge regression
# Drop the leading all-zero row of ridge_coefs so there is one row per alpha.
# The x-axis is t = 1/(lr * lambda), i.e. t_values, which matches the axis
# label and puts both panels on the same scale (previously 1/alphas = lr * t
# was plotted under this label).
axs[0].plot(t_values, ridge_coefs[1:])
axs[0].set_xlabel('1/(lr * lambda)', fontsize=18)
axs[0].set_ylabel('Parameters', fontsize=18)
axs[0].set_title('Ridge Regression Path', fontsize=22)

# Optimization trajectory for SGD
# sgd_coefs has n_iter + 1 rows (initial zeros plus one per epoch), one per t
axs[1].plot(t_values, sgd_coefs)
axs[1].set_xlabel('iteration', fontsize=18)
axs[1].set_ylabel('Parameters', fontsize=18)
axs[1].set_title('SGD Trajectory', fontsize=22)

plt.tight_layout()
plt.show()

39 changes: 28 additions & 11 deletions slides/regularization/slides-regu-early-stopping.tex
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
\newcommand{\learninggoals}{
\item Know how early stopping works
\item Understand how early stopping acts as a regularizer
\item Know early stopping imitates $L2$ regularization in some cases
}

\title{Introduction to Machine Learning}
Expand Down Expand Up @@ -39,7 +40,9 @@
\item Use parameters of the previous step for the actual model.
\end{enumerate}
More sophisticated forms also apply cross-validation.
\framebreak
\end{vbframe}

\begin{vbframe}{Early Stopping and $L2$ \citebutton{Goodfellow et al., 2016, p. 249 ff.}{https://www.deeplearningbook.org/contents/regularization.html}}
\begin{table}
\begin{tabular}{p{4cm}|p{6cm}}
Strengths & Weaknesses \\
Expand All @@ -49,18 +52,20 @@
\hline
Applicable to almost any model without adjustment \note{of objective function, parameter space, training procedure} & Temporary copy of $\thetab$ (we have to save the whole model each time validation error improves) \\
\hline
Combinable with other regularization methods & Less data for training $\rightarrow$ include $\mathcal{D}_{\text{val}}$ afterwards
Combinable with other regularization methods & Less data for training $\rightarrow$ include $\mathcal{D}_{\text{val}}$ afterwards\\ \hline\hline
\end{tabular}
\end{table}
\begin{itemize}
\item Relation between optimal early-stopping iteration $T_{\text{stop}}$ and weight-decay penalization parameter $\lambda$ for step-size $\alpha$ (see Goodfellow et al. (2016) page 251-252 for proof):
\end{itemize}
\begin{equation*}
T_{\text{stop}} \approx \frac{1}{\alpha \lambda}
\Leftrightarrow \lambda \approx \frac{1}{T_{\text{stop}} \alpha}
\end{equation*}
\item For simple case of LM with squared loss and GD optim initialized at $\thetab=0$: Early stopping approximately corresponds to $L2$ regularization/WD, with %Relation between
optimal early-stopping iter $T_{\text{stop}}$ inversely proportional to $\lambda$ scaled by step-size $\alpha$

\end{itemize}
\begin{equation*}
T_{\text{stop}} \approx \frac{1}{\alpha \lambda}
\Leftrightarrow \lambda \approx \frac{1}{T_{\text{stop}} \alpha}
\end{equation*}
\begin{itemize}
\item Small $\lambda$ (low penalization) $\Rightarrow$ high $T_{\text{stop}}$ (complex model / lots of updates).
\item Small $\lambda$ (regu. $\downarrow$) $\Rightarrow$ large $T_{\text{stop}}$ (complexity $\uparrow$) and vice versa
\end{itemize}
\framebreak
% \begin{itemize}
Expand All @@ -75,14 +80,26 @@
\begin{figure}
\centering
\scalebox{0.75}{\includegraphics{figure_man/earlystop_int_hat.png}}
\tiny{\\ Credit: Goodfellow et al. (2016)\\}
\tiny{\\Goodfellow et al. (2016)\\}
\end{figure}

\footnotesize
\textbf{Figure:} An illustration of the effect of early stopping. \textit{Left:} The solid contour lines indicate the contours of the negative log-likelihood. The dashed line indicates the trajectory taken by SGD beginning from the origin. Rather than stopping at the point $\thetah$ that minimizes the risk, early stopping results in the trajectory stopping at an earlier point $\hat{\thetab}_{\text{Ridge}}$. \textit{Right:} An illustration of the effect of $L2$ regularization for comparison. The dashed circles indicate the contours of the $L2$ penalty which causes the minimum of the total cost to lie closer to the origin than the minimum of the unregularized cost.
\textbf{Figure:} Effect of early stopping. \textit{Left:} The solid lines indicate contours of the square loss objective. Dashed line indicates trajectory taken by GD initialized at origin. Instead of reaching minimizer $\thetah$, ES results in trajectory stopping earlier at $\hat{\thetab}_{\text{Ridge}}$. \textit{Right:} Effect of $L2$ regularization. Dashed circles indicate contours of $L2$ constraint which push minimizer of regularized cost closer to origin than minimizer of unregularized cost.
\end{vbframe}

\begin{vbframe}{SGD Trajectory and $L2$ \citebutton{Ali et al., 2020}{https://proceedings.mlr.press/v119/ali20a/ali20a.pdf}}
Solution path of the $L2$ regularized linear model closely matches the SGD trajectory of the unregularized LM initialized at $\thetab=0$
\lz
\begin{figure}
\centering
%\scalebox{0.75}
{\includegraphics{figure_man/ridge-vs-sgd-path.png}}
%\scriptsize{\\Ali et al. (2020)\\}
\end{figure}

\textbf{Caveat}: Initialization at the origin, which is almost never used in practice in ML/DL applications, is crucial for this equivalence to hold

\end{vbframe}

\endlecture
\end{document}
5 changes: 3 additions & 2 deletions slides/regularization/slides-regu-geom-l2-wdecay.tex
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,6 @@

The minimum of $\mathcal{\tilde R}_{\text{emp}}(\thetab)$ occurs where $\nabla_{\thetab}\mathcal{\tilde R}_{\text{emp}}(\thetab) = \bm{H}(\thetab - \thetah)$ is $0$.

\lz

Now we $L2$-regularize $\mathcal{\tilde R}_{\text{emp}}(\thetab)$, such that
\[
\mathcal{\tilde R}_{\text{reg}}(\thetab) = \mathcal{\tilde R}_{\text{emp}}(\thetab) + \frac{\lambda}{2} \|\thetab\|^2_2\]
Expand All @@ -104,6 +102,9 @@

% where $\id$ is the identity matrix.
This gives us a formula to see how the minimizer of the $L2$-regularized version is a transformation of the minimizer of the unpenalized version.
\vspace{0.2cm}

\textbf{Caveat}: Equivalence of weight decay and $L2$ regularization only holds for vanilla SGD (not e.g. Adam)


\framebreak
Expand Down
3 changes: 2 additions & 1 deletion slides/regularization/slides-regu-l1vsl2.tex
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@
\begin{itemize}
\item Typically we omit $\theta_0$ in the penalty term $J(\thetab)$ so that the ``infinitely'' regularized model is the constant model (but this can be implementation-dependent).
\item Penalty methods are typically not equivariant under scaling of the inputs, so one usually standardizes the features beforehand.
\item Note that for a normal LM, if you scale some features, we can simply "anti-scale" the coefficients the same way. The risk does not change. For regularized models this is not so simple. If you scale features to smaller values, coefficients have to become larger to counteract. They now are penalized more heavily in $J(\thetab)$. Such a scaling would make some features less attractive without changing anything relevant in the data.
\item Note that a normal LM has the inductive bias of rescaling equivariance, i.e., if you scale some features, we can simply ``anti-scale'' the coefficients the same way. The risk does not change.
\item While regularized LMs exhibit a low-complexity inductive bias, they lose this equivariance property: if you down-scale features, coefficients have to become larger to counteract. Then they are penalized more strongly in $J(\thetab)$, making some features less attractive without relevant changes in the data.

% \item While ridge regression usually leads to smaller estimated coefficients, but still dense $\thetab$ vectors,
% the Lasso will usually create a sparse $\thetab$ vector and can therefore be used for variable selection.
Expand Down
4 changes: 2 additions & 2 deletions slides/regularization/slides-regu-nonlin-bayes.tex
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
\newcommand{\learninggoals}{
\item Understand that regularization and parameter shrinkage can be applied to non-linear models
\item Know structural risk minimization
\item Know how regularization risk minimization is the same as MAP
in a Bayesian perspective, where the penalty corresponds to parameter prior.
\item Know how regularization risk minimization is same as MAP
in Bayesian perspective, where penalty corresponds to a parameter prior
}

\title{Introduction to Machine Learning}
Expand Down
47 changes: 47 additions & 0 deletions slides/regularization/slides-regu-ridge-deepdive.tex
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,53 @@
$\Longrightarrow$ Ridge regression on unperturbed features {\small $\xi$} turns out to be minimizing squared loss averaged over feature noise distribution!
\end{vbframe}

\begin{vbframe}{Bias-Variance Decomposition for Ridge}
For linear model $\yv = \Xmat \thetab + \bm{\varepsilon}$ with $\Xmat \in \mathbb{R}^{n \times p},\,\bm{\varepsilon} \sim (\bm{0},\sigma^2 \bm{I}_n)$, bias of ridge estimator $\thetah_{\text{Ridge}}$ is given by
\begin{equation*}
\begin{aligned}
\text{Bias}(\thetah_{\text{Ridge}}) := \mathbb{E}[\thetah_{\text{Ridge}}-\bm{\theta}] &= \mathbb{E}[(\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top}\bm{y}] - \bm{\theta}\\
&= \mathbb{E}[(\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top}(\bm{X}\bm{\theta}+\bm{\varepsilon})] - \bm{\theta} \\
&= (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \bm{X} \bm{\theta} + (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \underbrace{\mathbb{E}[\bm{\varepsilon}]}_{=0} - \bm{\theta} \\
&= (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \bm{X} \bm{\theta} - \bm{\theta} \\
&= \left[(\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} - (\bm{X}^{\top} \bm{X})^{-1} \right] \bm{X}^{\top} \bm{X} \bm{\theta}
\end{aligned}
\end{equation*}

\begin{itemize}
\item Last expression shows bias of Ridge estimator only vanishes for $\lambda=0$, which is simply (unbiased) OLS solution
\item It follows $\Vert \text{Bias}(\thetah_{\text{Ridge}})\Vert_2^2>0$ for all $\lambda>0$ (assuming $\bm{\theta} \neq \bm{0}$), later important
\end{itemize}

For the variance of $\thetah_{\text{Ridge}}$, we have
\begin{equation*}
\begin{aligned}
\text{Var}(\thetah_{\text{Ridge}}) &= \text{Var}\left((\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top}\bm{y}\right) \quad \quad \big| \;\; \text{apply} \;\; \text{Var}_u(\bm{A}\bm{u}) = \bm{A} \text{Var}(\bm{u}) \bm{A}^{\top} \\
&= (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \text{Var}(\bm{y}) \left( (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \right)^{\top} \\
&= (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \text{Var}(\bm{\varepsilon}) \bm{X} (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \\
&= (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \sigma^2 \bm{I}_n \bm{X} (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \\
&= \sigma^2 (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \bm{X} (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \\
\end{aligned}
\end{equation*}

\begin{itemize}
\item $\text{Var}(\thetah_{\text{Ridge}})$ is strictly smaller than $\text{Var}(\thetah_{\text{OLS}})=\sigma^2 (\Xmat^{\top}\Xmat)^{-1}$ for any $\lambda>0$, meaning matrix of their difference $\text{Var}(\thetah_{\text{OLS}})-\text{Var}(\thetah_{\text{Ridge}})$ is positive definite (bit tedious derivation)
\item This further means $\text{trace}\big({\text{Var}(\thetah_{\text{OLS}})}-{\text{Var}(\thetah_{\text{Ridge}})}\big)>0 \, \forall \lambda>0$
\end{itemize}

\framebreak

With bias and variance of the ridge estimator we can decompose its mean squared error as follows:

$$\text{MSE}(\thetah_{\text{Ridge}})=\Vert \text{Bias}(\thetah_{\text{Ridge}})\Vert_2^2 + \text{trace}\big(\text{Var}(\thetah_{\text{Ridge}})\big)$$

Comparing MSEs of $\thetah_{\text{Ridge}}$ and $\thetah_{\text{OLS}}$ and using $\text{Bias}(\thetah_{\text{OLS}})=0$ we find
$$\text{MSE}(\thetah_{\text{OLS}})-\text{MSE}(\thetah_{\text{Ridge}}) = \underbrace{\text{trace}\big({\text{Var}(\thetah_{\text{OLS}})}-{\text{Var}(\thetah_{\text{Ridge}})}\big)}_{>0} - \underbrace{\Vert \text{Bias}(\thetah_{\text{Ridge}})\Vert_2^2}_{>0}$$

Since both terms are positive, their difference is \textit{a priori} undetermined. \citebutton{Theobald, 1973}{https://www.jstor.org/stable/2984775} and \citebutton{Farebrother, 1976}{https://www.jstor.org/stable/2984971} prove there always exists some $\lambda^{\ast}>0$ so that
$$\text{MSE}(\thetah_{\text{OLS}})-\text{MSE}(\thetah_{\text{Ridge}})>0$$
Important theoretical result: While Gauss-Markov guarantees $\thetah_{\text{OLS}}$ is best linear unbiased estimator (BLUE), there are biased estimators with lower MSE.

\end{vbframe}



Expand Down

0 comments on commit 70d83f4

Please sign in to comment.