update svm pros cons and regu/inf theory changes

slds-lmu · Dec 21, 2023 · 70d83f4 · 70d83f4
1 parent c52d781
commit 70d83f4
Show file tree

Hide file tree

Showing 12 changed files with 247 additions and 25 deletions.
diff --git a/slides/information-theory/slides-info-mutual-info.tex b/slides/information-theory/slides-info-mutual-info.tex
@@ -229,6 +229,7 @@
 \begin{aligned}
 I(X ; Y) &= H(X) - H(X | Y) \\
 I(X ; Y) &= H(Y) - H(Y | X) \\
+I(X ; Y) &\leq \min\{H(X),H(Y)\} \\
 I(X ; Y) &= H(X) + H(Y) - H(X, Y) \\
 I(X ; Y) &= I(Y ; X) \\
 I(X ; X) &= H(X)\\

diff --git a/slides/information-theory/slides-info-mutual-info2.tex b/slides/information-theory/slides-info-mutual-info2.tex
@@ -39,6 +39,9 @@
 \textbf{Proof:}$\quad 0 \leq I(X ; Y)=H(X)-H(X | Y)$
 
 Intuitively, the theorem says that knowing another random variable $Y$ can only reduce the uncertainty in $X$. Note that this is true only on the average. 
+\vspace{0.5cm}
+
+\textbf{Remark}: Because $H(X)\geq H(X|Y)$ and $H(X)$ is only bounded from below, $I(X ; Y)$ is unbounded from above (lives in all of $\mathbb{R}_{0}^{+}$)
 
 \framebreak
 

diff --git a/slides/information-theory/slides-info-sourcecoding.tex b/slides/information-theory/slides-info-sourcecoding.tex
@@ -53,21 +53,21 @@
       \scalebox{1.05}{\includegraphics{figure_man/length_same.png}}
       \tiny{\\ Credit: Chris Olah}
   \end{figure}
-  \item The length $L(x)$ is simply the number of bits in the corresponding codeword. In this example, all codewords have length 2.
+  \item Length $L(x)$ is simply number of bits in corresponding codeword. Here all codewords have length 2.
 \end{itemize}
-\framebreak
+%\framebreak
 
-  \begin{figure}
-    \centering
-      \scalebox{1}{\includegraphics{figure_man/length_same.png}}
-  \end{figure}
+%  \begin{figure}
+%    \centering
+%      \scalebox{1}{\includegraphics{figure_man/length_same.png}}
+%  \end{figure}
 
   \begin{itemize}
-    \item For this code, the expected length of a message emitted by the source is, naturally:
+    %\item For this code, the expected length of a message emitted by the source is, naturally:
 
-      $$\E[L(X)] = \frac{1}{2} \cdot 2 + \frac{1}{4} \cdot 2 + \frac{1}{8} \cdot 2 + \frac{1}{8} \cdot 2 = 2 \text{ bits.}$$
+      %$$\E[L(X)] = \frac{1}{2} \cdot 2 + \frac{1}{4} \cdot 2 + \frac{1}{8} \cdot 2 + \frac{1}{8} \cdot 2 = 2 \text{ bits.}$$
 
-    \item The area of a rectangle in the image on the right reflects the size of the corresponding term in the expectation.
+    \item Areas of rectangles on the right reflect contributions to $\E[L(X)]$
 
   \end{itemize}
 

diff --git a/slides/nonlinear-svm/slides-nonlinsvm-uniapprox.tex b/slides/nonlinear-svm/slides-nonlinsvm-uniapprox.tex
@@ -115,6 +115,83 @@ \section{SVMs as Non-Parametric Models}
 
 \end{vbframe}
 
+\begin{vbframe}{SVM -- Pros \& Cons}
+
+\begin{columns}[T, totalwidth=\textwidth]
+  \begin{column}{0.5\textwidth}
+    \textbf{Advantages}
+    \normalsize
+    \begin{itemize}
+      % \item High \textbf{accuracy}
+      \item Often \textbf{sparse} solution (w.r.t. observations)
+      \item Robust against overfitting (\textbf{regularized}); especially in 
+      high-dimensional space
+      \item \textbf{Stable} solutions (w.r.t. changes in train data)\\
+      $\rightarrow$ Non-SV do not affect decision boundary
+      \item Convex optimization problem \\
+      $\rightarrow$ local minimum $\hat{=}$ global minimum
+      %\item \textbf{memory efficient} (only use non-SVs)
+    \end{itemize}
+
+    % \textbf{Advantages (nonlinear SVM)}
+    % \begin{itemize}
+    %    \item Can learn \textbf{nonlinear decision boundaries}
+    %    \item \textbf{Very flexible} due to custom kernels \\
+    %    $\rightarrow$ RBF kernel yields local model \\
+    %    $\rightarrow$ kernel for time series, strings etc.
+    % \end{itemize}
+  \end{column}
+
+  \begin{column}{0.5\textwidth}
+    \textbf{Disadvantages}
+    \normalsize
+    \begin{itemize}
+      \item \textbf{Long} training times $\rightarrow O(n^2 p + n^3)$
+      %\item \textbf{Limited scalability} to larger data sets 
+      %\textcolor{blue}{\textbf{??}}
+      \item Confined to \textbf{linear model}
+      \item Restricted to \textbf{continuous features}
+      \item Optimization can also fail or get stuck
+      % \item Poor \textbf{interpretability}
+      %\item No handling of \textbf{missing} data
+    \end{itemize}
+
+    %     \textbf{Disadvantages (nonlinear SVM)}
+    % \begin{itemize}
+    %    \item Poor \textbf{interpretability} due to complex kernel
+    %    \item \textbf{Not easy tunable} as it is highly important to choose the right kernel (which also introduces further hyperparameters)
+    % \end{itemize}
+  \end{column}
+\end{columns}
+
+\framebreak
+\lz 
+
+\begin{columns}[t, totalwidth=\textwidth]
+  \begin{column}{0.5\textwidth}    
+    \textbf{Advantages (nonlinear SVM)}
+    \begin{itemize}
+       \item Can learn \textbf{nonlin. decision boundaries}
+       \item \textbf{Very flexible} due to custom kernels \\
+       $\rightarrow$ RBF kernel yields local model \\
+       $\rightarrow$ kernel for time series, strings etc.
+    \end{itemize}
+  \end{column}
+
+  \begin{column}{0.5\textwidth}
+    \textbf{Disadvantages (nonlin. SVM)}
+    \begin{itemize}
+       \item Poor \textbf{interpretability} due to complex kernel
+       \item \textbf{Not easily tunable} as it is highly important to choose the right kernel (which also introduces further hyperparameters)
+    \end{itemize}
+  \end{column}
+\end{columns}
+
+% \conclbox{Very accurate solution for high-dimensional data that is linearly 
+% separable}
+
+\end{vbframe}
+
 
 
 \section{Kernels on Infinite-Dimensional Vector Spaces}

diff --git a/slides/regularization/figure_man/path-ridge-vs-sgd-ali2020.png b/slides/regularization/figure_man/path-ridge-vs-sgd-ali2020.png
diff --git a/slides/regularization/figure_man/ridge-vs-sgd-path.png b/slides/regularization/figure_man/ridge-vs-sgd-path.png
diff --git a/slides/regularization/rsrc/make_ridge_vs_sgd_path.py b/slides/regularization/rsrc/make_ridge_vs_sgd_path.py
@@ -0,0 +1,75 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.utils import shuffle
+
+# Set the random seed for reproducibility
+np.random.seed(6)
+
+# Function to generate data
+def generate_data(n, p):
+    X = np.random.normal(0, 1, (n, p))
+    true_coef = np.linspace(-1, 1, p)
+    noise = np.random.normal(0, 1, n)
+    y = X.dot(true_coef) + noise
+    return X, y, true_coef
+
+# Function to compute the ridge coefficients analytically
+def compute_ridge_path(X, y, alphas):
+    coefs = [np.zeros(X.shape[1])]  # Start with a row of zeros
+    n, p = X.shape
+    for alpha in alphas:
+        ridge_coefs = np.linalg.inv(X.T @ X + alpha * np.identity(p)) @ X.T @ y
+        coefs.append(ridge_coefs)
+    return np.array(coefs)
+
+# Function to compute the optimization trajectory for SGD
+def compute_sgd_trajectory(X, y, batch_size, learning_rate, n_iter):
+    w = np.zeros(X.shape[1])
+    coefs = [w.copy()]  # Start with a row of zeros
+    for i in range(n_iter):
+        X_shuffled, y_shuffled = shuffle(X, y)
+        for j in range(0, n, batch_size):
+            X_batch = X_shuffled[j:j+batch_size]
+            y_batch = y_shuffled[j:j+batch_size]
+            gradient = -2 * X_batch.T @ (y_batch - X_batch @ w) / batch_size
+            w -= learning_rate * gradient
+        coefs.append(w.copy())
+    return np.array(coefs)
+
+# Parameters
+n = 100
+p = 10
+batch_size = 4
+learning_rate = 0.01
+n_iter = 50
+t_values = np.arange(0.001, n_iter + 1)  # Include 0 in t_values for the zero coefficients
+alphas = 1/(learning_rate * t_values[0:])  # Exclude 0 to avoid division by zero
+
+# Generate data
+X, y, true_coef = generate_data(n, p)
+
+# Compute the regularization path for ridge regression
+ridge_coefs = compute_ridge_path(X, y, alphas)
+
+# Compute the optimization trajectory for SGD
+sgd_coefs = compute_sgd_trajectory(X, y, batch_size, learning_rate, n_iter)
+
+# Plotting
+fig, axs = plt.subplots(1, 2, figsize=(14, 5))
+# Regularization path for ridge regression
+# Skip the first element (0) in t_values for plotting to match dimensions with ridge_coefs
+axs[0].plot(1/alphas, ridge_coefs[1:])
+axs[0].set_xlabel('1/(lr * lambda)', fontsize=18)
+axs[0].set_ylabel('Parameters', fontsize=18)
+axs[0].set_title('Ridge Regression Path', fontsize=22)
+
+# Optimization trajectory for SGD
+# Use t_values for x-axis to include the initial zero coefficients
+axs[1].plot(t_values, sgd_coefs)
+axs[1].set_xlabel('iteration', fontsize=18)
+axs[1].set_ylabel('Parameters', fontsize=18)
+axs[1].set_title('SGD Trajectory', fontsize=22)
+
+plt.tight_layout()
+plt.show()
+
diff --git a/slides/regularization/slides-regu-early-stopping.tex b/slides/regularization/slides-regu-early-stopping.tex
@@ -7,6 +7,7 @@
 \newcommand{\learninggoals}{
   \item Know how early stopping works 
   \item Understand how early stopping acts as a regularizer
+  \item Know early stopping imitates $L2$ regularization in some cases
 }
 
 \title{Introduction to Machine Learning}
@@ -39,7 +40,9 @@
     \item Use parameters of the previous step for the actual model.
   \end{enumerate}
   More sophisticated forms also apply cross-validation.
-\framebreak
+\end{vbframe}
+
+\begin{vbframe}{Early Stopping and $L2$ \citebutton{Goodfellow et al., 2016, p. 249 ff.}{https://www.deeplearningbook.org/contents/regularization.html}}
   \begin{table}
     \begin{tabular}{p{4cm}|p{6cm}}
     Strengths & Weaknesses \\
@@ -49,18 +52,20 @@
     \hline
     Applicable to almost any model without adjustment \note{of objective function, parameter space, training procedure} & Temporary copy of $\thetab$ (we have to save the whole model each time validation error improves) \\
     \hline
-    Combinable with other regularization methods & Less data for training $\rightarrow$ include $\mathcal{D}_{\text{val}}$ afterwards
+    Combinable with other regularization methods & Less data for training $\rightarrow$ include $\mathcal{D}_{\text{val}}$ afterwards\\ \hline\hline
     \end{tabular}
   \end{table}
   \begin{itemize}
-    \item Relation between optimal early-stopping iteration $T_{\text{stop}}$ and weight-decay penalization parameter $\lambda$ for step-size $\alpha$ (see Goodfellow et al. (2016) page 251-252 for proof):
-  \end{itemize}
-    \begin{equation*}
-      T_{\text{stop}} \approx \frac{1}{\alpha \lambda} 
-        \Leftrightarrow \lambda \approx \frac{1}{T_{\text{stop}} \alpha}
-    \end{equation*}
+    \item For simple case of LM with squared loss and GD optim initialized at $\thetab=0$: Early stopping has exact correspondence with $L2$ regularization/WD: %Relation between
+    optimal early-stopping iter $T_{\text{stop}}$ inversely proportional to  $\lambda$ scaled by step-size $\alpha$
+
+ \end{itemize}
+\begin{equation*}
+T_{\text{stop}} \approx \frac{1}{\alpha \lambda} 
+\Leftrightarrow \lambda \approx \frac{1}{T_{\text{stop}} \alpha}
+\end{equation*}
   \begin{itemize}
-    \item Small $\lambda$ (low penalization) $\Rightarrow$ high $T_{\text{stop}}$ (complex model / lots of updates).
+    \item Small $\lambda$ (regu. $\downarrow$) $\Rightarrow$ large $T_{\text{stop}}$ (complexity $\uparrow$) and vice versa
   \end{itemize}
 \framebreak
   % \begin{itemize}
@@ -75,14 +80,26 @@
   \begin{figure}
     \centering
       \scalebox{0.75}{\includegraphics{figure_man/earlystop_int_hat.png}}
-      \tiny{\\ Credit: Goodfellow et al. (2016)\\}
+      \tiny{\\Goodfellow et al. (2016)\\}
   \end{figure}
 
 \footnotesize 
-\textbf{Figure:} An illustration of the effect of early stopping. \textit{Left:} The solid contour lines indicate the contours of the negative log-likelihood. The dashed line indicates the trajectory taken by SGD beginning from the origin. Rather than stopping at the point $\thetah$ that minimizes the risk, early stopping results in the trajectory stopping at an earlier point $\hat{\thetab}_{\text{Ridge}}$. \textit{Right:} An illustration of the effect of $L2$ regularization for comparison. The dashed circles indicate the contours of the $L2$ penalty which causes the minimum of the total cost to lie closer to the origin than the minimum of the unregularized cost.
+\textbf{Figure:} Effect of early stopping. \textit{Left:} The solid lines indicate contours of the square loss objective. Dashed line indicates trajectory taken by GD initialized at origin. Instead of reaching minimizer $\thetah$, ES results in trajectory stopping earlier at $\hat{\thetab}_{\text{Ridge}}$. \textit{Right:} Effect of $L2$ regularization. Dashed circles indicate contours of $L2$ constraint which push minimizer of regularized cost closer to origin than minimizer of unregularized cost.
 \end{vbframe}
 
+\begin{vbframe}{SGD Trajectory and $L2$ \citebutton{Ali et al., 2020}{https://proceedings.mlr.press/v119/ali20a/ali20a.pdf}}
+Solution path for $L2$ regularized linear model closely matches SGD trajectory of unregularized LM initialized at $\thetab=0$
+\lz
+  \begin{figure}
+    \centering
+      %\scalebox{0.75}
+      {\includegraphics{figure_man/ridge-vs-sgd-path.png}}
+      %\scriptsize{\\Ali et al. (2020)\\}
+  \end{figure}
 
+\textbf{Caveat}: Initialization at the origin is crucial for this equivalence to hold, but it is almost never used in practice in ML/DL applications
+
+\end{vbframe}
 
 \endlecture
 \end{document}
diff --git a/slides/regularization/slides-regu-geom-l2-wdecay.tex b/slides/regularization/slides-regu-geom-l2-wdecay.tex
@@ -88,8 +88,6 @@
 
 The minimum of $\mathcal{\tilde R}_{\text{emp}}(\thetab)$ occurs where $\nabla_{\thetab}\mathcal{\tilde R}_{\text{emp}}(\thetab) = \bm{H}(\thetab - \thetah)$ is $0$.
 
-\lz
-
 Now we $L2$-regularize $\mathcal{\tilde R}_{\text{emp}}(\thetab)$, such that 
 \[
 \mathcal{\tilde R}_{\text{reg}}(\thetab) = \mathcal{\tilde R}_{\text{emp}}(\thetab) + \frac{\lambda}{2} \|\thetab\|^2_2\]
@@ -104,6 +102,9 @@
 
 % where $\id$ is the identity matrix.
 This gives us a formula to see how the minimizer of the $L2$-regularized version is a transformation of the minimizer of the unpenalized version.
+\vspace{0.2cm}
+
+\textbf{Caveat}: Equivalence of weight decay and $L2$ regularization only holds for vanilla SGD (not e.g. Adam)
 
 
 \framebreak

diff --git a/slides/regularization/slides-regu-l1vsl2.tex b/slides/regularization/slides-regu-l1vsl2.tex
@@ -76,7 +76,8 @@
   \begin{itemize}
     \item Typically we omit $\theta_0$ in the penalty term $J(\thetab)$ so that the ``infinitely'' regularized model is the constant model (but this can be implementation-dependent).
     \item Penalty methods are typically not equivariant under scaling of the inputs, so one usually standardizes the features beforehand. 
-    \item Note that for a normal LM, if you scale some features, we can simply "anti-scale" the coefficients the same way. The risk does not change. For regularized models this is not so simple. If you scale features to smaller values, coefficients have to become larger to counteract. They now are penalized more heavily in $J(\thetab)$. Such a scaling would make some features less attractive without changing anything relevant in the data.
+    \item Note a normal LM has the inductive bias of rescaling equivariance, i.e., if we scale some features, we can simply ``anti-scale'' the coefficients the same way. The risk does not change.
+    \item While regularized LMs exhibit low-complexity inductive bias, they lose this equivariance property: if you down-scale features, coefficients have to become larger to counteract. Then they are penalized more strongly in $J(\thetab)$, making some features less attractive without relevant changes in data.
 
     % \item While ridge regression usually leads to smaller estimated coefficients, but still dense $\thetab$ vectors,
     %   the Lasso will usually create a sparse $\thetab$ vector and can therefore be used for variable selection.

diff --git a/slides/regularization/slides-regu-nonlin-bayes.tex b/slides/regularization/slides-regu-nonlin-bayes.tex
@@ -7,8 +7,8 @@
 \newcommand{\learninggoals}{
   \item Understand that regularization and parameter shrinkage can be applied to non-linear models
   \item Know structural risk minimization 
-  \item Know how regularization risk minimization is the same as MAP 
-      in a Bayesian perspective, where the penalty corresponds to parameter prior.
+  \item Know how regularized risk minimization is the same as MAP 
+      in Bayesian perspective, where penalty corresponds to a parameter prior
 }
 
 \title{Introduction to Machine Learning}

diff --git a/slides/regularization/slides-regu-ridge-deepdive.tex b/slides/regularization/slides-regu-ridge-deepdive.tex
@@ -67,6 +67,53 @@
 $\Longrightarrow$ Ridge regression on unperturbed features {\small $\xi$} turns out to be minimizing squared loss averaged over feature noise distribution!
 \end{vbframe}
 
+\begin{vbframe}{Bias-Variance Decomposition for Ridge}
+For linear model $\yv = \Xmat \thetab + \bm{\varepsilon}$ with $\Xmat \in \mathbb{R}^{n \times p},\,\bm{\varepsilon} \sim (\bm{0},\sigma^2 \bm{I}_n)$, bias of ridge estimator $\thetah_{\text{Ridge}}$ is given by 
+\begin{equation*}
+    \begin{aligned}
+        \text{Bias}(\thetah_{\text{Ridge}}) := \mathbb{E}[\thetah_{\text{Ridge}}-\bm{\theta}] &= \mathbb{E}[(\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top}\bm{y}] - \bm{\theta}\\
+        &= \mathbb{E}[(\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top}(\bm{X}\bm{\theta}+\bm{\varepsilon})] - \bm{\theta} \\
+        &= (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \bm{X} \bm{\theta} + (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \underbrace{\mathbb{E}[\bm{\varepsilon}]}_{=0} - \bm{\theta} \\
+        &= (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \bm{X} \bm{\theta} - \bm{\theta} \\
+        &= \left[(\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} - (\bm{X}^{\top} \bm{X})^{-1} \right] \bm{X}^{\top} \bm{X} \bm{\theta}
+        \end{aligned}
+    \end{equation*}
+
+\begin{itemize}
+    \item Last expression shows bias of Ridge estimator only vanishes for $\lambda=0$, which is simply (unbiased) OLS solution
+    \item It follows $\Vert \text{Bias}(\thetah_{\text{Ridge}})\Vert_2^2>0$ for all $\lambda>0$ (whenever $\bm{\theta}\neq\bm{0}$), later important
+\end{itemize}
+
+    For the variance of $\thetah_{\text{Ridge}}$, we have
+    \begin{equation*}
+        \begin{aligned}
+            \text{Var}(\thetah_{\text{Ridge}})  &= \text{Var}\left((\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top}\bm{y}\right) \quad \quad \big| \;\; \text{apply} \;\; \text{Var}_u(\bm{A}\bm{u}) = \bm{A} \text{Var}(\bm{u}) \bm{A}^{\top} \\
+            &= (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \text{Var}(\bm{y}) \left( (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \right)^{\top} \\
+            &= (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \text{Var}(\bm{\varepsilon}) \bm{X} (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1}  \\
+            &=  (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \sigma^2 \bm{I}_n \bm{X} (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1}  \\
+            &= \sigma^2 (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top} \bm{X} (\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1}  \\
+        \end{aligned}
+    \end{equation*}
+
+\begin{itemize}
+    \item $\text{Var}(\thetah_{\text{Ridge}})$ is strictly smaller than $\text{Var}(\thetah_{\text{OLS}})=\sigma^2 (\Xmat^{\top}\Xmat)^{-1}$ for any $\lambda>0$, meaning matrix of their difference $\text{Var}(\thetah_{\text{OLS}})-\text{Var}(\thetah_{\text{Ridge}})$ is positive definite (bit tedious derivation)
+    \item This further means $\text{trace}\big({\text{Var}(\thetah_{\text{OLS}})}-{\text{Var}(\thetah_{\text{Ridge}})}\big)>0 \, \forall \lambda>0$
+\end{itemize}
+
+\framebreak
+
+With bias and variance of the ridge estimator we can decompose its mean squared error as follows:
+
+$$\text{MSE}(\thetah_{\text{Ridge}})=\Vert \text{Bias}(\thetah_{\text{Ridge}})\Vert_2^2 + \text{trace}\big(\text{Var}(\thetah_{\text{Ridge}})\big)$$
+
+Comparing MSEs of $\thetah_{\text{Ridge}}$ and $\thetah_{\text{OLS}}$ and using $\text{Bias}(\thetah_{\text{OLS}})=0$ we find 
+$$\text{MSE}(\thetah_{\text{OLS}})-\text{MSE}(\thetah_{\text{Ridge}}) = \underbrace{\text{trace}\big({\text{Var}(\thetah_{\text{OLS}})}-{\text{Var}(\thetah_{\text{Ridge}})}\big)}_{>0} - \underbrace{\Vert \text{Bias}(\thetah_{\text{Ridge}})\Vert_2^2}_{>0}$$
+
+Since both terms are positive, their difference is \textit{a priori} undetermined. \citebutton{Theobald, 1973}{https://www.jstor.org/stable/2984775} and \citebutton{Farebrother, 1976}{https://www.jstor.org/stable/2984971} prove there always exists some $\lambda^{\ast}>0$ so that
+$$\text{MSE}(\thetah_{\text{OLS}})-\text{MSE}(\thetah_{\text{Ridge}})>0$$
+Important theoretical result: While Gauss-Markov guarantees $\thetah_{\text{OLS}}$ is best linear unbiased estimator (BLUE) there are biased estimators with lower MSE.
+
+\end{vbframe}