From d5ae5a4df6242d8c526b9315a2dc8623df3dded0 Mon Sep 17 00:00:00 2001 From: Lukas Burk Date: Wed, 23 Oct 2024 13:24:00 +0200 Subject: [PATCH] latex-math update: rename thetab -> thetav --- cheatsheets/cheatsheet_sl.tex | 8 +- cheatsheets/cheatsheet_sl_2.tex | 32 +++--- cheatsheets/cheatsheet_sl_3.tex | 18 ++-- .../ex_rnw/ex_connection_mle_erm.Rnw | 6 +- exercises/advriskmin/ex_rnw/ex_glm_optim.Rnw | 12 +-- .../ex_rnw/sol_connection_mle_erm.Rnw | 18 ++-- exercises/advriskmin/ex_rnw/sol_glm_optim.Rnw | 34 +++--- .../ex_rnw/ex_multiclass_hinge_loss.Rnw | 6 +- .../ex_rnw/dd_Lasso_scaling.Rnw | 44 ++++---- .../ex_rnw/ex_l0_regularization.Rnw | 10 +- .../ex_rnw/ex_lasso_regularization.Rnw | 10 +- .../ex_rnw/ic_lasso_regularization.Rnw | 10 +- .../ex_rnw/sol_l0_regularization.Rnw | 24 ++--- .../ex_rnw/sol_lasso_regularization.Rnw | 18 ++-- exercises/svm/ex_rnw/ex_linsvm_hardmargin.Rnw | 8 +- exercises/svm/ex_rnw/ex_linsvm_regression.Rnw | 26 ++--- .../ex_rnw/ex_svm_kernelized_multiclass.Rnw | 14 +-- exercises/svm/ex_rnw/ic_linsvm_hardmargin.Rnw | 8 +- .../svm/ex_rnw/sol_linsvm_hardmargin.Rnw | 20 ++-- .../svm/ex_rnw/sol_linsvm_regression.Rnw | 82 +++++++------- .../ex_rnw/sol_svm_kernelized_multiclass.Rnw | 42 ++++---- ...es-advriskmin-classification-bernoulli.tex | 2 +- .../slides-advriskmin-logreg-deepdive.tex | 28 ++--- .../slides-advriskmin-losses-properties.tex | 62 +++++------ .../slides-advriskmin-max-likelihood-l2.tex | 18 ++-- ...slides-advriskmin-max-likelihood-other.tex | 10 +- .../slides-advriskmin-pseudo-residuals.tex | 14 +-- ...s-advriskmin-regression-further-losses.tex | 18 ++-- .../slides-advriskmin-risk-minimizer.tex | 4 +- .../componentwise_gradient_boosting.tex | 4 +- slides/boosting/cheatsheet_new/cheatsheet.tex | 4 +- .../boosting/slides-boosting-cwb-basics.tex | 6 +- .../boosting/slides-boosting-cwb-basics2.tex | 8 +- ...des-boosting-gradient-boosting-concept.tex | 4 +- ...ides-boosting-regression-illustrations.tex | 18 ++-- slides/boosting/tex/cwb-algo-short.tex | 4 +- .../slides-fs-introduction.tex | 2 +- slides/gaussian-processes/slides-gp-basic.tex | 12 +-- .../gaussian-processes/slides-gp-bayes-lm.tex | 46 ++++---- .../gaussian-processes/slides-gp-training.tex | 22 ++-- slides/information-theory/slides-info-ml.tex | 34 +++--- slides/linear-svm/slides-linsvm-erm.tex | 10 +- .../slides-linsvm-hard-margin-dual.tex | 92 ++++++++-------- .../linear-svm/slides-linsvm-hard-margin.tex | 102 +++++++++--------- .../linear-svm/slides-linsvm-optimization.tex | 10 +- .../linear-svm/slides-linsvm-soft-margin.tex | 16 +-- slides/multiclass/slides-mc-losses.tex | 2 +- .../slides-mc-softmax-regression.tex | 22 ++-- .../slides-nonlinsvm-rkhs-repr.tex | 24 ++--- .../regularization/slides-bias-var-ridge.tex | 8 +- slides/regularization/slides-regu-bayes.tex | 84 +++++++-------- .../slides-regu-early-stopping.tex | 6 +- .../regularization/slides-regu-enetlogreg.tex | 16 +-- slides/regularization/slides-regu-geom-l1.tex | 14 +-- slides/regularization/slides-regu-geom-l2.tex | 34 +++--- slides/regularization/slides-regu-intro.tex | 6 +- slides/regularization/slides-regu-l1.tex | 24 ++--- slides/regularization/slides-regu-l1vsl2.tex | 14 +-- .../regularization/slides-regu-l2-nonlin.tex | 50 ++++----- slides/regularization/slides-regu-l2.tex | 26 ++--- .../slides-regu-lasso-deepdive.tex | 4 +- slides/regularization/slides-regu-nonlin.tex | 10 +- slides/regularization/slides-regu-others.tex | 10 +- .../slides-regu-ridge-deepdive.tex | 30 +++--- .../regularization/slides-regu-wd-vs-l2.tex | 18 
++-- 65 files changed, 701 insertions(+), 701 deletions(-) diff --git a/cheatsheets/cheatsheet_sl.tex b/cheatsheets/cheatsheet_sl.tex index f4f70a3a..ab3074ac 100644 --- a/cheatsheets/cheatsheet_sl.tex +++ b/cheatsheets/cheatsheet_sl.tex @@ -236,8 +236,8 @@ \begin{tabular}{c|cc} & Logistic Regression & Softmax Regression \\ \hline $\Yspace$ & $\{0, 1\}$ & $\{1, 2, ..., g\}$ \\[0.5cm] - Discriminant fun. & $f(\xv) = \thetab^\top \xv$ & $f_k(\xv) = \thetab_{k}^{\top} \xv, k = 1, \ldots, g$ \\[0.5cm] - Probabilities & $\pi(\xv) = \frac{1}{1 + \exp\left(-\thetab^\top \xv\right)}$ & $\pi_k(\xv) = \frac{\exp(\thetab_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetab_j^\top \xv) }$ \\[0.5cm] + Discriminant fun. & $f(\xv) = \thetav^\top \xv$ & $f_k(\xv) = \thetav_{k}^{\top} \xv, k = 1, \ldots, g$ \\[0.5cm] + Probabilities & $\pi(\xv) = \frac{1}{1 + \exp\left(-\thetav^\top \xv\right)}$ & $\pi_k(\xv) = \frac{\exp(\thetav_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetav_j^\top \xv) }$ \\[0.5cm] $L(y, \pix)$ & Bernoulli / logarithmic loss & Multiclass logarithmic loss\\[-0.3cm] & $-y \log \left(\pix\right) - (1 - y) \log \left(1 - \pix\right)$ & $ - \sum_{k = 1}^g [y = k] \log\left(\pi_k(\xv)\right)$ \\ \end{tabular} @@ -444,12 +444,12 @@ %\begin{myblock}{Components of Learning} %\textbf{Learning = Hypothesis space + Risk + Optimization} \\ -%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetab \in \Theta} +%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetav \in \Theta} %\risket$ % % \textbf{Learning &= Hypothesis space &+ Risk &+ Optimization} \\ -% &= $\Hspace &+ \risket &+ \argmin_{\thetab \in \Theta} \risket$ +% &= $\Hspace &+ \risket &+ \argmin_{\thetav \in \Theta} \risket$ % % \textbf{Hypothesis space: } Defines (and restricts!) what kind of model $f$ % can be learned from the data. diff --git a/cheatsheets/cheatsheet_sl_2.tex b/cheatsheets/cheatsheet_sl_2.tex index 32c8b9fa..27575571 100644 --- a/cheatsheets/cheatsheet_sl_2.tex +++ b/cheatsheets/cheatsheet_sl_2.tex @@ -169,21 +169,21 @@ % \item $J(f)$ is the \textbf{complexity/roughness penalty} or \textbf{regularizer}. \item $\lambda > 0$ is the \textbf{complexity control} parameter. - \item For parameterized hypotheses: $\riskrt = \risket + \lambda \cdot J(\thetab)$. + \item For parameterized hypotheses: $\riskrt = \risket + \lambda \cdot J(\thetav)$. \end{itemize} % Tackles the trade-off: \emph{maximizing} the fit (minimizing the train loss) vs.\ \emph{minimizing} the complexity of the model. 
\\ % - Regularization in the linear model ($\fx = \thetab^\top \xv$): + Regularization in the linear model ($\fx = \thetav^\top \xv$): % \begin{itemize} \setlength{\itemindent}{+.3in} % - \item Ridge regression: $J(\thetab) = \|\thetab\|_2^2 = \thetab^\top \thetab.$ - \item Lasso regression: $J(\thetab) = \|\thetab\|_1 = \sum_{j=1}^p |\theta_j|.$ - \item Elastic net regression: $J(\thetab) = (\|\thetab\|_2^2, \|\thetab\|_1)^\top$ and $\lambda=(\lambda_1,\lambda_2).$ - \item L0 regression: $J(\thetab) = \|\thetab\|_0 = \sum_{j=1}^p |\theta_j|^0.$ + \item Ridge regression: $J(\thetav) = \|\thetav\|_2^2 = \thetav^\top \thetav.$ + \item Lasso regression: $J(\thetav) = \|\thetav\|_1 = \sum_{j=1}^p |\theta_j|.$ + \item Elastic net regression: $J(\thetav) = (\|\thetav\|_2^2, \|\thetav\|_1)^\top$ and $\lambda=(\lambda_1,\lambda_2).$ + \item L0 regression: $J(\thetav) = \|\thetav\|_0 = \sum_{j=1}^p |\theta_j|^0.$ % \end{itemize} % @@ -204,7 +204,7 @@ % Signed distance to the separating hyperplane: $$ - d \left(f, \xi \right) = \frac{\yi \fxi}{\|\thetab\|} = \yi \frac{\thetab^T \xi + \theta_0}{\|\thetab\|} + d \left(f, \xi \right) = \frac{\yi \fxi}{\|\thetav\|} = \yi \frac{\thetav^T \xi + \theta_0}{\|\thetav\|} $$ Distance of $f$ to the whole dataset $\D:$ $ @@ -214,13 +214,13 @@ \textbf{Primal linear hard-margin SVM:} % \begin{eqnarray*} - & \min\limits_{\thetab, \theta_0} \quad & \frac{1}{2} \|\thetab\|^2 \\ - & \text{s.t.} & \,\,\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset + & \min\limits_{\thetav, \theta_0} \quad & \frac{1}{2} \|\thetav\|^2 \\ + & \text{s.t.} & \,\,\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset \end{eqnarray*} % Support vectors: All instances $(\xi, \yi)$ with minimal margin $\yi \fxi = 1$, fulfilling the inequality constraints with equality. - All have distance of $\gamma = 1 / \|\thetab\|$ from the separating hyperplane. + All have distance of $\gamma = 1 / \|\thetav\|$ from the separating hyperplane. \textbf{Dual linear hard-margin SVM:} % @@ -233,7 +233,7 @@ Solution (if existing): % $$ - \thetah = \sum\nolimits_{i=1}^n \hat \alpha_i \yi \xi, \quad \theta_0 = \yi - \scp{\thetab}{\xi}. + \thetah = \sum\nolimits_{i=1}^n \hat \alpha_i \yi \xi, \quad \theta_0 = \yi - \scp{\thetav}{\xi}. 
$$ % \end{myblock} @@ -256,8 +256,8 @@ % \textbf{Primal linear soft-margin SVM:} \begin{eqnarray*} - & \min\limits_{\thetab, \thetab_0,\sli} & \frac{1}{2} \|\thetab\|^2 + C \sum_{i=1}^n \sli \\ - & \text{s.t.} & \,\, \yi \left( \scp{\thetab}{\xi} + \thetab_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ + & \min\limits_{\thetav, \thetav_0,\sli} & \frac{1}{2} \|\thetav\|^2 + C \sum_{i=1}^n \sli \\ + & \text{s.t.} & \,\, \yi \left( \scp{\thetav}{\xi} + \thetav_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ & \text{and} & \,\, \sli \geq 0 \quad \forall\, i \in \nset,\\ \end{eqnarray*} % @@ -285,7 +285,7 @@ % Regularized empirical risk minimization representation: % - $$ \risket = \frac{1}{2} \|\thetab\|^2 + C \sumin \Lxyi ;\; \Lyf = \max(1-yf, 0)$$ + $$ \risket = \frac{1}{2} \|\thetav\|^2 + C \sumin \Lxyi ;\; \Lyf = \max(1-yf, 0)$$ % \end{myblock} @@ -366,12 +366,12 @@ %\begin{myblock}{Components of Learning} %\textbf{Learning = Hypothesis space + Risk + Optimization} \\ - %\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetab \in \Theta} + %\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetav \in \Theta} %\risket$ % % \textbf{Learning &= Hypothesis space &+ Risk &+ Optimization} \\ - % &= $\Hspace &+ \risket &+ \argmin_{\thetab \in \Theta} \risket$ + % &= $\Hspace &+ \risket &+ \argmin_{\thetav \in \Theta} \risket$ % % \textbf{Hypothesis space: } Defines (and restricts!) what kind of model $f$ % can be learned from the data. diff --git a/cheatsheets/cheatsheet_sl_3.tex b/cheatsheets/cheatsheet_sl_3.tex index 393f9ae7..1da7debb 100644 --- a/cheatsheets/cheatsheet_sl_3.tex +++ b/cheatsheets/cheatsheet_sl_3.tex @@ -109,23 +109,23 @@ Bayesian Linear Model: % \begin{eqnarray*} - \yi &=& \fxi + \epsi = \thetab^T \xi + \epsi, \quad \text{for } i \in \{1, \ldots, n\} + \yi &=& \fxi + \epsi = \thetav^T \xi + \epsi, \quad \text{for } i \in \{1, \ldots, n\} \end{eqnarray*} % where $\epsi \sim \mathcal{N}(0, \sigma^2).$ % - Parameter vector $\thetab$ is stochastic and follows a distribution.\\ + Parameter vector $\thetav$ is stochastic and follows a distribution.\\ % Gaussian variant: % \begin{itemize} \setlength{\itemindent}{+.3in} - \item Prior distribution: $\thetab \sim \mathcal{N}(\zero, \tau^2 \id_p)$ + \item Prior distribution: $\thetav \sim \mathcal{N}(\zero, \tau^2 \id_p)$ \item Posterior distribution: $ - \thetab ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1}) + \thetav ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1}) $ with $\bm{A}:= \sigma^{-2}\Xmat^\top\Xmat + \frac{1}{\tau^2} \id_p$ - \item Predictive distribution of $y_* = \thetab^\top \xv_*$ for a new observations $\xv_*$: + \item Predictive distribution of $y_* = \thetav^\top \xv_*$ for a new observations $\xv_*$: $$ y_* ~|~ \Xmat, \yv, \xv_* \sim \mathcal{N}(\sigma^{-2}\yv^\top \Xmat \Amat^{-1}\xv_*, \xv_*^\top\Amat^{-1}\xv_*) $$ @@ -141,8 +141,8 @@ \begin{tabular}{cc} \textbf{Weight-Space View} & \textbf{Function-Space View} \vspace{4mm}\\ Parameterize functions & \vspace{1mm}\\ - \footnotesize Example: $\fxt = \thetab^\top \xv$ & \vspace{3mm}\\ - Define distributions on $\thetab$ & Define distributions on $f$ \vspace{4mm}\\ + \footnotesize Example: $\fxt = \thetav^\top \xv$ & \vspace{3mm}\\ + Define distributions on $\thetav$ & Define distributions on $f$ \vspace{4mm}\\ Inference in parameter space $\Theta$ & Inference in function space $\Hspace$ \end{tabular} \end{table} @@ -393,12 +393,12 @@ 
%\begin{myblock}{Components of Learning} %\textbf{Learning = Hypothesis space + Risk + Optimization} \\ - %\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetab \in \Theta} + %\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetav \in \Theta} %\risket$ % % \textbf{Learning &= Hypothesis space &+ Risk &+ Optimization} \\ - % &= $\Hspace &+ \risket &+ \argmin_{\thetab \in \Theta} \risket$ + % &= $\Hspace &+ \risket &+ \argmin_{\thetav \in \Theta} \risket$ % % \textbf{Hypothesis space: } Defines (and restricts!) what kind of model $f$ % can be learned from the data. diff --git a/exercises/advriskmin/ex_rnw/ex_connection_mle_erm.Rnw b/exercises/advriskmin/ex_rnw/ex_connection_mle_erm.Rnw index 68f29098..398f1be7 100644 --- a/exercises/advriskmin/ex_rnw/ex_connection_mle_erm.Rnw +++ b/exercises/advriskmin/ex_rnw/ex_connection_mle_erm.Rnw @@ -32,12 +32,12 @@ where $\eps^{(1)},\ldots,\eps^{(n)}$ are iid with distribution $\mathcal{N}(0, \ % \begin{equation*} \begin{split} - \Hspace = \{f(\cdot~|~ \thetab): \Xspace \to \R \ ~|~ & f(\cdot~|~ \thetab) \text{ belongs to a certain - functional family parameterized by } \thetab \in \Theta \}, + \Hspace = \{f(\cdot~|~ \thetav): \Xspace \to \R \ ~|~ & f(\cdot~|~ \thetav) \text{ belongs to a certain + functional family parameterized by } \thetav \in \Theta \}, \end{split} \end{equation*} % - where $\thetab = (\theta_1, \theta_2, \ldots, \theta_d)$ is a parameter vector, which is an element of a \textbf{parameter space} + where $\thetav = (\theta_1, \theta_2, \ldots, \theta_d)$ is a parameter vector, which is an element of a \textbf{parameter space} $\Theta$. % Based on your findings in (a), establish a relationship between minimizing the negative log-likelihood for $(\xv^{(1)},z^{(1)}),\ldots,(\xv^{(n)},z^{(n)})$ and empirical loss minimization over $\Hspace$ of the generalized L2-loss function of Exercise sheet 1, i.e., $\Lxy= \big(m(y)-m(\fx)\big)^2.$ diff --git a/exercises/advriskmin/ex_rnw/ex_glm_optim.Rnw b/exercises/advriskmin/ex_rnw/ex_glm_optim.Rnw index f887bc92..a29124e4 100644 --- a/exercises/advriskmin/ex_rnw/ex_glm_optim.Rnw +++ b/exercises/advriskmin/ex_rnw/ex_glm_optim.Rnw @@ -12,18 +12,18 @@ current competitor and market data. \{1, 2, \dots, n\}, i \neq j$, with sample size $n$. \begin{itemize} \item Argue which of the following distributions from the one-parametric exponential family is most suitable for the underlying use case: normal, Bernoulli, gamma or Poisson. - \item Write down the probability distribution of the chosen distribution depending on $\thetab$ assuming a log link function. + \item Write down the probability distribution of the chosen distribution depending on $\thetav$ assuming a log link function. \end{itemize} %The GLM models the target as a linear function of the features - %with Gaussian error term: $\ydat = \Xmat \thetab + \epsilon$, \\ + %with Gaussian error term: $\ydat = \Xmat \thetav + \epsilon$, \\ %$\epsilon \sim N(\bm{0}, \mathit{diag}(\sigma^2)), ~~ \sigma > 0$. % Furthermore, you have reason to believe that the effect of mileage might be % non-linear, so you decide to include this quantity logarithmically (using the % natural logarithm). \item State the hypothesis space for the corresponding model class. - For this, assume the parameter vector $\thetab$ to include the intercept + For this, assume the parameter vector $\thetav$ to include the intercept coefficient. \item Which parameters need to be learned? Define the corresponding parameter space $\Theta$. 
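As an aside between the two hunks of ex_glm_optim.Rnw: the exercise asks for a count model with a log link, which the accompanying solution file (sol_glm_optim.Rnw, further down in this patch) identifies as Poisson with $\lambda(\xv) = \exp(\thetav^\top \xv)$. The following R chunk is a minimal illustrative sketch of that model class, not part of the patch; the sample size, coefficient values, and feature ranges are made up.

<<glm-loglink-sketch, eval=FALSE>>=
# Illustrative sketch: simulate counts from a log-link Poisson model
# y ~ Poisson(exp(theta^T x)) and recover theta by minimizing the NLL.
set.seed(1)
n <- 200
X <- cbind(1, runif(n), runif(n))          # intercept plus two generic features
theta_true <- c(1.5, -0.8, 0.4)
lambda <- exp(drop(X %*% theta_true))      # log link: log(lambda(x)) = theta^T x
y <- rpois(n, lambda)

# Poisson NLL up to the theta-free log(y!) term (cf. sol_glm_optim.Rnw below)
nll <- function(theta, X, y) sum(exp(drop(X %*% theta)) - y * drop(X %*% theta))

# ERM with the NLL loss recovers theta_true approximately
optim(par = rep(0, 3), fn = nll, X = X, y = y, method = "BFGS")$par
@

Dropping the $\log(\yi!)$ term mirrors the proportionality argument made in the solution: it does not depend on $\thetav$ and therefore leaves the $\argmin$ unchanged.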
@@ -32,15 +32,15 @@ current competitor and market data. likelihood estimation (MLE). %The likelihood for the LM is given by: % \[ - % \ell(\thetab) = - \frac{n}{2} \log(2 \sigma^2 \pi) - \frac{1}{2 \sigma^2} - % (\ydat - \Xmat \thetab)^T(\ydat - \Xmat \thetab) + % \ell(\thetav) = - \frac{n}{2} \log(2 \sigma^2 \pi) - \frac{1}{2 \sigma^2} + % (\ydat - \Xmat \thetav)^T(\ydat - \Xmat \thetav) % \] % \\ % &= \left( \frac{1}{2 \pi \sigma^2} \right)^{\frac{n}{2}} \exp \left(- % \frac{1}{2 \sigma^2} \sumin \left(\yi - \thetat \xi \right)^2 \right) \\ % &= \left( \frac{1}{2 \pi \sigma^2} \right)^{\frac{n}{2}} \exp \left(- - % \frac{1}{2 \sigma^2} \| \ydat - \Xmat \thetab \|^2 \right) + % \frac{1}{2 \sigma^2} \| \ydat - \Xmat \thetav \|^2 \right) Describe how you can make use of the likelihood in empirical risk minimization (ERM) and write down the likelihood as well as the resulting empirical risk. %\item Now you need to optimize this risk to find the best parameters, diff --git a/exercises/advriskmin/ex_rnw/sol_connection_mle_erm.Rnw b/exercises/advriskmin/ex_rnw/sol_connection_mle_erm.Rnw index 46e9f596..87047bcf 100644 --- a/exercises/advriskmin/ex_rnw/sol_connection_mle_erm.Rnw +++ b/exercises/advriskmin/ex_rnw/sol_connection_mle_erm.Rnw @@ -10,26 +10,26 @@ The likelihood for $(\xv^{(1)},z^{(1)}),\ldots,(\xv^{(n)},z^{(n)})$ is % \begin{eqnarray*} - \LL(\thetab) &=& \prod_{i=1}^n \pdf\left(z^{(i)} ~\bigg|~ \fxit, \sigma^2\right) \\ &\propto& \exp\left(-\frac{1}{2\sigma^2}\sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2\right)\,. + \LL(\thetav) &=& \prod_{i=1}^n \pdf\left(z^{(i)} ~\bigg|~ \fxit, \sigma^2\right) \\ &\propto& \exp\left(-\frac{1}{2\sigma^2}\sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2\right)\,. \end{eqnarray*} % So, the negative log-likelihood for $(\xv^{(1)},z^{(1)}),\ldots,(\xv^{(n)},z^{(n)})$ is % \begin{eqnarray*} - - \loglt &=& - \log\left(\LL(\thetab)\right) \\ + - \loglt &=& - \log\left(\LL(\thetav)\right) \\ &=& - \log\left( \exp\left(-\frac{1}{2\sigma^2} \sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2\right) \right) \\ &\propto& \sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2 \\ &=& \sumin \left[ m(\yi) - m\left(\fxit\right)\right]^2. \end{eqnarray*} % - Thus, the negative log-likelihood for a parameter $\thetab$ is proportional to the empirical risk of a hypothesis $f(\cdot ~|~ \thetab)$ w.r.t. the generalized L2-loss function of Exercise sheet 1, i.e., $\Lxy= \big(m(y)-m(\fx)\big)^2.$ + Thus, the negative log-likelihood for a parameter $\thetav$ is proportional to the empirical risk of a hypothesis $f(\cdot ~|~ \thetav)$ w.r.t. 
the generalized L2-loss function of Exercise sheet 1, i.e., $\Lxy= \big(m(y)-m(\fx)\big)^2.$ % \item First, we specify the feature space: $\Xspace = \{1\} \times \R,$ i.e., any feature $\xv \in \Xspace$ is of the form $\xv=(x_1,x_2)^\top = (1,x_2)^\top$ for some $x_2\in \R.$ % According to the exercise we use $m(x)=\log(x),$ whose inverse is $m^{-1}(x)=\exp(x).$ % -Let us rewrite Forbes' conjectured model $ y = \theta_1 \exp(\theta_2 x + \eps)$ into $y = m^{-1} \left( m(f(\xv~|~ \thetab)) + \eps \right),$ for some suitable hypothesis $f(\xv~|~ \thetab):$ +Let us rewrite Forbes' conjectured model $ y = \theta_1 \exp(\theta_2 x + \eps)$ into $y = m^{-1} \left( m(f(\xv~|~ \thetav)) + \eps \right),$ for some suitable hypothesis $f(\xv~|~ \thetav):$ % \begin{align*} % @@ -47,13 +47,13 @@ Let us rewrite Forbes' conjectured model $ y = \theta_1 \exp(\theta_2 x + \eps) % \end{align*} % -With this, we see that $f(\xv~|~ \thetab) = \theta_1 x_1 \exp(\theta_2 x_2) = \theta_1 \exp(\theta_2 x_2)$ is a suitable functional form for the hypotheses. +With this, we see that $f(\xv~|~ \thetav) = \theta_1 x_1 \exp(\theta_2 x_2) = \theta_1 \exp(\theta_2 x_2)$ is a suitable functional form for the hypotheses. % Thus, we use as our parameter space $\Theta = \R_+ \times \R$ which gives rise to the hypothesis space % \begin{equation*} \begin{split} - \Hspace = \{f(\xv~|~ \thetab) = \theta_1 x_1 \exp(\theta_2 x_2) ~|~ \thetab \in \Theta \}. + \Hspace = \{f(\xv~|~ \thetav) = \theta_1 x_1 \exp(\theta_2 x_2) ~|~ \thetav \in \Theta \}. \end{split} \end{equation*} % @@ -76,15 +76,15 @@ A suitable hypothesis space is then % \begin{equation*} \begin{split} - \Hspace = \{f(\xv~|~ \thetab) = \log(\theta_1) x_1 + \theta_2 x_2 ~|~ \thetab \in \Theta \}, + \Hspace = \{f(\xv~|~ \thetav) = \log(\theta_1) x_1 + \theta_2 x_2 ~|~ \thetav \in \Theta \}, \end{split} \end{equation*} % -which are the linear functions\footnote{Note that $\log(\theta_1)$ can be any value in $\R.$} $\xv^\top \thetab$ of features in $\Xspace.$ +which are the linear functions\footnote{Note that $\log(\theta_1)$ can be any value in $\R.$} $\xv^\top \thetav$ of features in $\Xspace.$ % The empirical risk minimizer in this case is specified by the parameter % -$$(\log(\hat{\theta}_1),\hat{\theta}_2)^\top = \thetabh=\left(\Xmat^T \Xmat\right)^{-1}\Xmat^T \bm{z}, \qquad \bm{z} = (\log y^{(1)},\ldots,\log y^{(n)})^\top,$$ +$$(\log(\hat{\theta}_1),\hat{\theta}_2)^\top = \thetavh=\left(\Xmat^T \Xmat\right)^{-1}\Xmat^T \bm{z}, \qquad \bm{z} = (\log y^{(1)},\ldots,\log y^{(n)})^\top,$$ % (see \href{https://slds-lmu.github.io/i2ml/chapters/02_supervised_regression/02-02-linearmodel/}{Chapter 02.02 of I2ML}) which for this simple case is: % diff --git a/exercises/advriskmin/ex_rnw/sol_glm_optim.Rnw b/exercises/advriskmin/ex_rnw/sol_glm_optim.Rnw index 8d29151a..887c6b22 100644 --- a/exercises/advriskmin/ex_rnw/sol_glm_optim.Rnw +++ b/exercises/advriskmin/ex_rnw/sol_glm_optim.Rnw @@ -5,10 +5,10 @@ \item The normal and gamma distribution assume a continuous target variable, however, we do have a discrete target variable. Since the target variable (number of cars) represents a count variable, Bernoulli (taking only the values 0 and 1) is not a suitable choice in this context. It follows that the only reasonable choice of the given distributions is the Poisson distribution. The Poisson distribution depends on the parameter $\lambda$, where the expected value is given by $E(Y|\xv) = \lambda ( \xv)$. - The log link function is given by $\log(\lambda(\xv)) = \thetab^T \xv$. 
+ The log link function is given by $\log(\lambda(\xv)) = \thetav^T \xv$. Following from that the probability function is given by $$ - P(Y = y) = \frac{\exp{(-\lambda(\xv))} \cdot (\lambda(\xv))^{y}}{y!} = \frac{\exp{(-\exp{(\thetab^T \xv)})} \cdot \exp({\thetab^T \xv})^{y}}{y!} + P(Y = y) = \frac{\exp{(-\lambda(\xv))} \cdot (\lambda(\xv))^{y}}{y!} = \frac{\exp{(-\exp{(\thetav^T \xv)})} \cdot \exp({\thetav^T \xv})^{y}}{y!} $$ for $y \in \N_0$. @@ -16,11 +16,11 @@ \item We can write the hypothesis space as: \begin{flalign*} - \Hspace = \{\fxt = \exp(\thetab^T \xv) ~|~ \thetab \in \R^3 \} + \Hspace = \{\fxt = \exp(\thetav^T \xv) ~|~ \thetav \in \R^3 \} = \{\fxt = \exp(\theta_0 + \theta_1 x_1 + \theta_2 x_2) ~|~ (\theta_0, \theta_1, \theta_2) \in \R^3 \}. % \Hspace &= \{ f: (\R_{0}^{+})^2 \rightarrow \R ~|~ - % \fx = \theta_0 + \thetab^T \xv, ~ (\theta_0, \thetab) \in \R^3 \} \\ + % \fx = \theta_0 + \thetav^T \xv, ~ (\theta_0, \thetav) \in \R^3 \} \\ % &= \{ f: (\R_{0}^{+})^2 \rightarrow \R ~|~ % \fx = \theta_0 + \theta_{\text{age}} x_{\text{age}} + % \theta_{\text{mileage}} x_{\text{mileage}}, ~ (\theta_0, @@ -28,7 +28,7 @@ \end{flalign*} Note the \textbf{slight abuse of notation} here: in the lecture, we first - define $\thetab$ to only consist of the feature coefficients, with $\xv$ + define $\thetav$ to only consist of the feature coefficients, with $\xv$ likewise being the plain feature vector. For the sake of simplicity, however, it is more convenient to append the intercept coefficient to the vector of feature coefficients. This does not change our model formulation, but we have @@ -44,8 +44,8 @@ \item The likelihood for the Poisson distribution is defined by: \begin{flalign*} - \LL(\thetab | \xv) &= - \prodin \frac{\exp{(-\exp{(\thetab^T \xi)})} \cdot (\exp{(\thetab^T \xi)})^{\yi}}{\yi!} + \LL(\thetav | \xv) &= + \prodin \frac{\exp{(-\exp{(\thetav^T \xi)})} \cdot (\exp{(\thetav^T \xi)})^{\yi}}{\yi!} \end{flalign*} @@ -66,22 +66,22 @@ Let's put these reflections to practice: \begin{flalign*} - L_{NLL}\left (\yi, f\left( \xi | \thetab \right) \right) - &= - \log \LL(\thetab | \xi) \\ - &= - \ell(\thetab | \xi) \\ - &= - \log \frac{\exp{(-\exp{(\thetab^T \xi)})} \cdot (\exp{(\thetab^T \xi)})^{\yi}}{\yi!} \\ - &= \exp{(\thetab^T \xi)} - \yi (\thetab^T \xi) + \log(\yi!) + L_{NLL}\left (\yi, f\left( \xi | \thetav \right) \right) + &= - \log \LL(\thetav | \xi) \\ + &= - \ell(\thetav | \xi) \\ + &= - \log \frac{\exp{(-\exp{(\thetav^T \xi)})} \cdot (\exp{(\thetav^T \xi)})^{\yi}}{\yi!} \\ + &= \exp{(\thetav^T \xi)} - \yi (\thetav^T \xi) + \log(\yi!) \end{flalign*} \begin{flalign*} - \risket &= \sumin - \ell(\thetab | \xi) \\ - &= \sumin L_{NLL}\left (\yi, f\left( \xi | \thetab \right) \right) \\ - &= \sumin \exp{(\thetab^T \xi)} - \yi (\thetab^T \xi) + \log(\yi!) \\ - &\propto \sumin \exp{(\thetab^T \xi)} - \yi (\thetab^T \xi) \\ + \risket &= \sumin - \ell(\thetav | \xi) \\ + &= \sumin L_{NLL}\left (\yi, f\left( \xi | \thetav \right) \right) \\ + &= \sumin \exp{(\thetav^T \xi)} - \yi (\thetav^T \xi) + \log(\yi!) \\ + &\propto \sumin \exp{(\thetav^T \xi)} - \yi (\thetav^T \xi) \\ \end{flalign*} As we are only interested in the feature coefficients here, we neglect all - irrelevant terms that do not depend on $\thetab$ as they have no effect on + irrelevant terms that do not depend on $\thetav$ as they have no effect on the solution (i.e., the $\argmin$ of $\risket$). 
This is what the proportional sign $\propto$, often used in contexts of optimization and Bayesian statistics, means: we keep diff --git a/exercises/multiclass/ex_rnw/ex_multiclass_hinge_loss.Rnw b/exercises/multiclass/ex_rnw/ex_multiclass_hinge_loss.Rnw index 4fca714c..18452277 100644 --- a/exercises/multiclass/ex_rnw/ex_multiclass_hinge_loss.Rnw +++ b/exercises/multiclass/ex_rnw/ex_multiclass_hinge_loss.Rnw @@ -47,7 +47,7 @@ Show that the upper bound in (b) coincides with the binary hinge loss $L(y,\fx)= Can we say something similar for the alternative multiclass hinge loss in (b)? % % -\item Now consider the case in which the score functions are linear, i.e., $f_k(\xv)=\thetab_k^\top \xv$ for each $k \in \Yspace.$ +\item Now consider the case in which the score functions are linear, i.e., $f_k(\xv)=\thetav_k^\top \xv$ for each $k \in \Yspace.$ % What is the difference between % @@ -60,9 +60,9 @@ What is the difference between \end{itemize} % % -%\item Denote by $\thetab= (\thetab_1^\top, \ldots, \thetab_g^\top)^\top \in \R^{g\cdot p}$ the ``stacked'' parameter vector of the score functions obtained by stacking the parameters of the score functions. +%\item Denote by $\thetav= (\thetav_1^\top, \ldots, \thetav_g^\top)^\top \in \R^{g\cdot p}$ the ``stacked'' parameter vector of the score functions obtained by stacking the parameters of the score functions. %% -%Show that the multiclass hinge loss is convex with respect to $\thetab,$ i.e., $\thetab \mapsto \max\limits_{k} \left( \thetab_k^\top \xv - \thetab_y^\top \xv + \mathds{1}_{\{ y \ne k \}} \right)$ is convex. +%Show that the multiclass hinge loss is convex with respect to $\thetav,$ i.e., $\thetav \mapsto \max\limits_{k} \left( \thetav_k^\top \xv - \thetav_y^\top \xv + \mathds{1}_{\{ y \ne k \}} \right)$ is convex. % %\emph{Hint:} Use the following two facts: %\begin{itemize} diff --git a/exercises/regularization/ex_rnw/dd_Lasso_scaling.Rnw b/exercises/regularization/ex_rnw/dd_Lasso_scaling.Rnw index fa4c6cd6..dfa6edd5 100644 --- a/exercises/regularization/ex_rnw/dd_Lasso_scaling.Rnw +++ b/exercises/regularization/ex_rnw/dd_Lasso_scaling.Rnw @@ -4,7 +4,7 @@ Consider the regression learning setting, i.e., $\mathcal{Y} = \R,$ and feature % Let the hypothesis space be the linear models: % -$$ \Hspace = \{ \fx = \thetab^\top \xv ~|~ \thetab \in \R^p \}. $$ +$$ \Hspace = \{ \fx = \thetav^\top \xv ~|~ \thetav \in \R^p \}. $$ % %Suppose your loss function of interest is the L2 loss $\Lxy= \frac12 \big(y-\fx\big)^2.$ % @@ -66,23 +66,23 @@ and 0 & s_2 & \ldots & 0 \\ \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & \ldots & s_p - \end{pmatrix} \underbrace{( {\Xmat}^T \Xmat )^{-1} {\Xmat}^T \yv}_{= \thetabh^X} + \end{pmatrix} \underbrace{( {\Xmat}^T \Xmat )^{-1} {\Xmat}^T \yv}_{= \thetavh^X} % = \begin{pmatrix} - s_1 \thetabh^X_1\\ - s_2 \thetabh^X_2\\ + s_1 \thetavh^X_1\\ + s_2 \thetavh^X_2\\ \vdots \\ - s_p \thetabh^X_p + s_p \thetavh^X_p \end{pmatrix}. % \end{align} % -So, $\thetabh^X$ is the minimizer of the unregularized empirical risk (w.r.t.\ the L2 loss) if we wouldn't rescale the features and accordingly use the original $\xv$ data. +So, $\thetavh^X$ is the minimizer of the unregularized empirical risk (w.r.t.\ the L2 loss) if we wouldn't rescale the features and accordingly use the original $\xv$ data. 
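Before the deep dive turns to the regularized risk for the scaled data (next paragraph), here is a small numeric check of the scaling relations used above; it is an illustrative sketch only, not part of the patch, and it assumes, as in the related Lasso exercises in this patch, that $\Xmat$ has orthonormal columns (obtained here via a QR decomposition).

<<lasso-scaling-check, eval=FALSE>>=
# Illustrative check: with X^T X = I and Z = X diag(s), the OLS coefficients
# satisfy thetahat^X = X^T y, Z^T Z = diag(s^2), and Z^T y = s * thetahat^X.
set.seed(1)
n <- 50; p <- 3
X <- qr.Q(qr(matrix(rnorm(n * p), n, p)))   # orthonormal columns: t(X) %*% X = I
y <- rnorm(n)
s <- c(0.5, 2, 10)                          # made-up scaling factors s_1, ..., s_p
Z <- X %*% diag(s)                          # feature j rescaled by s_j

theta_X <- drop(t(X) %*% y)                 # = (X^T X)^{-1} X^T y since X^T X = I

all.equal(t(Z) %*% Z, diag(s^2))            # TRUE up to numerical error
all.equal(drop(t(Z) %*% y), s * theta_X)    # TRUE up to numerical error
@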
Let us now specify the Lasso regularization risk for the scaled data (the $\mathbf{z}$ data): % $$ -\mathcal{R}_{\text{reg}}^{\mathbf{Z}}(\thetab) = \frac12 \sum_{i=1}^n \left( \yi - \thetab^\top \mathbf{z}^{(i)} \right)^2 + \lambda \sum_{i=1}^p |\theta_i|. +\mathcal{R}_{\text{reg}}^{\mathbf{Z}}(\thetav) = \frac12 \sum_{i=1}^n \left( \yi - \thetav^\top \mathbf{z}^{(i)} \right)^2 + \lambda \sum_{i=1}^p |\theta_i|. $$ \clearpage % @@ -90,39 +90,39 @@ $$ % \item We can show that % - $$ \argmin_{\thetab} \mathcal{R}_{\text{reg}}^{\mathbf{Z}}(\thetab) = \argmin_{\thetab} \sum_{i=1}^p - s_i \thetah_{i}^X \theta_i + \frac{s_i^2 \theta_i^2}{2} + \lambda|\theta_i|$$ + $$ \argmin_{\thetav} \mathcal{R}_{\text{reg}}^{\mathbf{Z}}(\thetav) = \argmin_{\thetav} \sum_{i=1}^p - s_i \thetah_{i}^X \theta_i + \frac{s_i^2 \theta_i^2}{2} + \lambda|\theta_i|$$ % holds as follows: % \begin{align*} % - \argmin_{\thetab} \mathcal{R}_{\text{reg}}^{\mathbf{Z}}(\thetab) + \argmin_{\thetav} \mathcal{R}_{\text{reg}}^{\mathbf{Z}}(\thetav) % - &= \argmin_{\thetab} \frac12 \sum_{i=1}^n \left( \yi - \thetab^\top \mathbf{z}^{(i)} \right)^2 + \lambda \sum_{i=1}^p |\theta_i| \\ + &= \argmin_{\thetav} \frac12 \sum_{i=1}^n \left( \yi - \thetav^\top \mathbf{z}^{(i)} \right)^2 + \lambda \sum_{i=1}^p |\theta_i| \\ % - &= \argmin_{\thetab} \frac12 \| \yv - \mathbf{Z} \thetab \|_2^2 + \lambda \sum_{i=1}^p |\theta_i| \\ + &= \argmin_{\thetav} \frac12 \| \yv - \mathbf{Z} \thetav \|_2^2 + \lambda \sum_{i=1}^p |\theta_i| \\ % - &= \argmin_{\thetab} \frac12 (\yv - \mathbf{Z} \thetab)^\top(\yv - \mathbf{Z} \thetab) + \lambda \sum_{i=1}^p |\theta_i| \\ + &= \argmin_{\thetav} \frac12 (\yv - \mathbf{Z} \thetav)^\top(\yv - \mathbf{Z} \thetav) + \lambda \sum_{i=1}^p |\theta_i| \\ % - &= \argmin_{\thetab} \frac12 \yv^\top \yv -\yv^\top \mathbf{Z} \thetab + \frac12 \thetab^\top \mathbf{Z}^\top \mathbf{Z} \thetab + \lambda \sum_{i=1}^p |\theta_i| \\ + &= \argmin_{\thetav} \frac12 \yv^\top \yv -\yv^\top \mathbf{Z} \thetav + \frac12 \thetav^\top \mathbf{Z}^\top \mathbf{Z} \thetav + \lambda \sum_{i=1}^p |\theta_i| \\ % - &= \argmin_{\thetab} -\yv^\top \mathbf{Z} \thetab + \frac12 \thetab^\top \mathbf{Z}^\top \mathbf{Z} \thetab + \lambda \sum_{i=1}^p |\theta_i| \tag{ $ \yv^\top \yv$ does not depend on $\thetab$ }\\ + &= \argmin_{\thetav} -\yv^\top \mathbf{Z} \thetav + \frac12 \thetav^\top \mathbf{Z}^\top \mathbf{Z} \thetav + \lambda \sum_{i=1}^p |\theta_i| \tag{ $ \yv^\top \yv$ does not depend on $\thetav$ }\\ % - &= \argmin_{\thetab} -\yv^\top \mathbf{Z} \thetab + \frac12 \thetab^\top \begin{pmatrix} + &= \argmin_{\thetav} -\yv^\top \mathbf{Z} \thetav + \frac12 \thetav^\top \begin{pmatrix} s_1^2 & 0 & \ldots & 0 \\ 0 & s_2^2 & \ldots & 0 \\ 0 & 0 & \ldots & s_p^2 - \end{pmatrix} \thetab + \lambda \sum_{i=1}^p |\theta_i| \tag{By \eqref{z_product} }\\ + \end{pmatrix} \thetav + \lambda \sum_{i=1}^p |\theta_i| \tag{By \eqref{z_product} }\\ % - &= \argmin_{\thetab} -(s_1 \thetabh^X_1, s_2 \thetabh^X_2, + &= \argmin_{\thetav} -(s_1 \thetavh^X_1, s_2 \thetavh^X_2, \ldots , - s_p \thetabh^X_p )^\top \thetab + \frac12 \thetab^\top \begin{pmatrix} + s_p \thetavh^X_p )^\top \thetav + \frac12 \thetav^\top \begin{pmatrix} s_1^2 & 0 & \ldots & 0 \\ 0 & s_2^2 & \ldots & 0 \\ 0 & 0 & \ldots & s_p^2 - \end{pmatrix} \thetab + \lambda \sum_{i=1}^p |\theta_i| \tag{By \eqref{y_projection} }\\ + \end{pmatrix} \thetav + \lambda \sum_{i=1}^p |\theta_i| \tag{By \eqref{y_projection} }\\ % - &= \argmin_{\thetab} \sum_{i=1}^p - s_i \thetah_{i}^X \theta_i + \frac{s_i^2 
\theta_i^2}{2} + \lambda |\theta_i| \tag{Writing out the inner products}\\ + &= \argmin_{\thetav} \sum_{i=1}^p - s_i \thetah_{i}^X \theta_i + \frac{s_i^2 \theta_i^2}{2} + \lambda |\theta_i| \tag{Writing out the inner products}\\ % \end{align*} % @@ -156,13 +156,13 @@ $$ % \end{itemize} % -Thus, the minimizer of $\mathcal{R}_{\text{reg}}^{\mathbf{Z}}(\thetab) $ is $\thetabh^Z_{\text{Lasso}} = ( \thetah^Z_{\text{Lasso},1}, \ldots, \thetah^Z_{\text{Lasso},p} )^\top $ with +Thus, the minimizer of $\mathcal{R}_{\text{reg}}^{\mathbf{Z}}(\thetav) $ is $\thetavh^Z_{\text{Lasso}} = ( \thetah^Z_{\text{Lasso},1}, \ldots, \thetah^Z_{\text{Lasso},p} )^\top $ with % $$ \thetah^Z_{\text{Lasso},i} = sgn(\thetah_{i}^X) \max\left\{ \frac{|\thetah_{i}^X|}{s_i} - \frac{\lambda}{s_i^2} ,0 \right\}, \quad i=1,\ldots,p, $$ % while the minimizer of the $L_1$-regularized empirical risk for the original $\xv$ data, i.e., $$ -\riskrt = \risket + \lambda \|\thetab\|_1 = \frac12 \sum_{i=1}^n \left( \yi - \thetab^\top \xi \right)^2 + \lambda \sum_{i=1}^p |\theta_i| +\riskrt = \risket + \lambda \|\thetav\|_1 = \frac12 \sum_{i=1}^n \left( \yi - \thetav^\top \xi \right)^2 + \lambda \sum_{i=1}^p |\theta_i| $$ % is\footnote{This is what we show on in-class exercise sheet 09.} $\thetah_{\text{Lasso}} = (\thetah_{\text{Lasso},1},\ldots,\thetah_{\text{Lasso},p})^\top$ where diff --git a/exercises/regularization/ex_rnw/ex_l0_regularization.Rnw b/exercises/regularization/ex_rnw/ex_l0_regularization.Rnw index 9657a314..9e781d85 100644 --- a/exercises/regularization/ex_rnw/ex_l0_regularization.Rnw +++ b/exercises/regularization/ex_rnw/ex_l0_regularization.Rnw @@ -4,13 +4,13 @@ Consider the regression learning setting, i.e., $\mathcal{Y} = \R,$ and feature % Let the hypothesis space be the linear models: % -$$ \Hspace = \{ \fx = \thetab^\top \xv ~|~ \thetab \in \R^p \}. $$ +$$ \Hspace = \{ \fx = \thetav^\top \xv ~|~ \thetav \in \R^p \}. $$ % Suppose your loss function of interest is the L2 loss $\Lxy= \frac12 \big(y-\fx\big)^2.$ % Consider the $L_0$-regularized empirical risk of a model $\fxt:$ $$ - \riskrt = \risket + \lambda \|\thetab\|_0 = \frac12 \sum_{i=1}^n \left( \yi - \thetab^\top \xi \right)^2 + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0}. + \riskrt = \risket + \lambda \|\thetav\|_0 = \frac12 \sum_{i=1}^n \left( \yi - \thetav^\top \xi \right)^2 + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0}. $$ % Assume that ${\Xmat}^T \Xmat = \id,$ which holds if $\Xmat$ has orthonormal columns. @@ -19,7 +19,7 @@ Show that the minimizer $\thetah_{\text{L0}} = (\thetah_{\text{L0},1},\ldots,\th % $$ \thetah_{\text{L0},i} = \thetah_{i} \mathds{1}_{|\thetah_{i}| > \sqrt{2\lambda}}, \quad i=1,\ldots,p, $$ % -where $\thetabh = (\thetah_{1},\ldots,\thetah_{p})^\top = ({\Xmat}^T \Xmat)^{-1} \Xmat^T\yv$ is the minimizer of the unregularized empirical risk (w.r.t.\ the L2 loss). +where $\thetavh = (\thetah_{1},\ldots,\thetah_{p})^\top = ({\Xmat}^T \Xmat)^{-1} \Xmat^T\yv$ is the minimizer of the unregularized empirical risk (w.r.t.\ the L2 loss). 
For this purpose, use the following steps: @@ -28,12 +28,12 @@ For this purpose, use the following steps: % \item [(i)] Derive that % - $$ \argmin_{\thetab} \riskrt = \argmin_{\thetab} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda \mathds{1}_{|\theta_i| \neq 0}.$$ + $$ \argmin_{\thetav} \riskrt = \argmin_{\thetav} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda \mathds{1}_{|\theta_i| \neq 0}.$$ % \item [(ii)] Note that the minimization problem on the right-hand side of (i) can be written as $\sum_{i=1}^p g_i(\theta_i),$ where $$ g_i(\theta) = - \thetah_{i}\theta + \frac{\theta^2}{2} + \lambda \mathds{1}_{|\theta| \neq 0}. $$ % - What is the advantage of this representation if we seek to find the $\thetab$ with entries $\theta_1,\ldots,\theta_p$ minimizing $\riskrt?$ + What is the advantage of this representation if we seek to find the $\thetav$ with entries $\theta_1,\ldots,\theta_p$ minimizing $\riskrt?$ % \item [(iii)] Consider first the case that $|\thetah_i|>\sqrt{2\lambda}$ and infer that for the minimizer $\theta^*_i$ of $g_i$ it must hold that $\theta^*_i=\thetah_i.$ diff --git a/exercises/regularization/ex_rnw/ex_lasso_regularization.Rnw b/exercises/regularization/ex_rnw/ex_lasso_regularization.Rnw index 16936442..e5563ef7 100644 --- a/exercises/regularization/ex_rnw/ex_lasso_regularization.Rnw +++ b/exercises/regularization/ex_rnw/ex_lasso_regularization.Rnw @@ -3,11 +3,11 @@ Consider the regression learning setting, i.e., $\mathcal{Y} = \R,$ and feature Let the hypothesis space be the linear models: $$ - \Hspace = \{ \fx = \thetab^\top \Xmat ~|~ \thetab \in \R^p \}. + \Hspace = \{ \fx = \thetav^\top \Xmat ~|~ \thetav \in \R^p \}. $$ Suppose your loss function of interest is the L2 loss $\Lxy= \frac12 \big(y-\fx\big)^2.$ Consider the $L_1$-regularized empirical risk of a model $\fxt$ (i.e., Lasso regression): $$ - \riskrt = \risket + \lambda \|\thetab\|_1 = \frac12 \sum_{i=1}^n \left( \yi - \thetab^\top \Xmat^{(i)} \right)^2 + \lambda \sum_{i=1}^p |\theta_i|. + \riskrt = \risket + \lambda \|\thetav\|_1 = \frac12 \sum_{i=1}^n \left( \yi - \thetav^\top \Xmat^{(i)} \right)^2 + \lambda \sum_{i=1}^p |\theta_i|. $$ Assume that ${\Xmat}^T \Xmat = \id,$ which holds if $\Xmat$ has orthonormal columns. Show that the minimizer $\thetah_{\text{Lasso}} = (\thetah_{\text{Lasso},1},\ldots,\thetah_{\text{Lasso},p})^\top$ is given by @@ -15,13 +15,13 @@ $$ \thetah_{\text{Lasso},i} = sgn(\thetah_{i}) \max\{|\thetah_{i}| - \lambda,0 \}, \quad i=1,\ldots,p, $$ -where $\thetabh = (\thetah_{1},\ldots,\thetah_{p})^\top = ({\Xmat}^T \Xmat)^{-1} \Xmat^T\yv$ is the minimizer of the unregularized empirical risk (w.r.t.\ the L2 loss). For this purpose, use the following steps: +where $\thetavh = (\thetah_{1},\ldots,\thetah_{p})^\top = ({\Xmat}^T \Xmat)^{-1} \Xmat^T\yv$ is the minimizer of the unregularized empirical risk (w.r.t.\ the L2 loss). For this purpose, use the following steps: \begin{enumerate} \item Derive that $$ - \argmin_{\thetab} \riskrt = \argmin_{\thetab} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda|\theta_i|. + \argmin_{\thetav} \riskrt = \argmin_{\thetav} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda|\theta_i|. $$ \item @@ -31,7 +31,7 @@ where $\thetabh = (\thetah_{1},\ldots,\thetah_{p})^\top = ({\Xmat}^T \Xmat)^{-1} g_i(\theta_i) = - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda |\theta_i|. 
$$ - What is the advantage of this representation if we seek to find the $\thetab$ with entries $\theta_1,\ldots,\theta_p$ minimizing $\riskrt?$ + What is the advantage of this representation if we seek to find the $\thetav$ with entries $\theta_1,\ldots,\theta_p$ minimizing $\riskrt?$ diff --git a/exercises/regularization/ex_rnw/ic_lasso_regularization.Rnw b/exercises/regularization/ex_rnw/ic_lasso_regularization.Rnw index 85e08e6d..eff118c0 100644 --- a/exercises/regularization/ex_rnw/ic_lasso_regularization.Rnw +++ b/exercises/regularization/ex_rnw/ic_lasso_regularization.Rnw @@ -3,11 +3,11 @@ Consider the regression learning setting, i.e., $\mathcal{Y} = \R,$ and feature Let the hypothesis space be the linear models: $$ - \Hspace = \{ \fx = \thetab^\top \Xmat ~|~ \thetab \in \R^p \}. + \Hspace = \{ \fx = \thetav^\top \Xmat ~|~ \thetav \in \R^p \}. $$ Suppose your loss function of interest is the L2 loss $\Lxy= \frac12 \big(y-\fx\big)^2.$ Consider the $L_1$-regularized empirical risk of a model $\fxt$ (i.e., Lasso regression): $$ - \riskrt = \risket + \lambda \|\thetab\|_1 = \frac12 \sum_{i=1}^n \left( \yi - \thetab^\top \Xmat^{(i)} \right)^2 + \lambda \sum_{i=1}^p |\theta_i|. + \riskrt = \risket + \lambda \|\thetav\|_1 = \frac12 \sum_{i=1}^n \left( \yi - \thetav^\top \Xmat^{(i)} \right)^2 + \lambda \sum_{i=1}^p |\theta_i|. $$ Assume that ${\Xmat}^T \Xmat = \id,$ which holds if $\Xmat$ has orthonormal columns. Show that the minimizer $\thetah_{\text{Lasso}} = (\thetah_{\text{Lasso},1},\ldots,\thetah_{\text{Lasso},p})^\top$ is given by @@ -15,13 +15,13 @@ $$ \thetah_{\text{Lasso},i} = sgn(\thetah_{i}) \max\{|\thetah_{i}| - \lambda,0 \}, \quad i=1,\ldots,p, $$ -where $\thetabh = (\thetah_{1},\ldots,\thetah_{p})^\top = ({\Xmat}^T \Xmat)^{-1} \Xmat^T\yv$ is the minimizer of the unregularized empirical risk (w.r.t.\ the L2 loss). For this purpose, use the following steps: +where $\thetavh = (\thetah_{1},\ldots,\thetah_{p})^\top = ({\Xmat}^T \Xmat)^{-1} \Xmat^T\yv$ is the minimizer of the unregularized empirical risk (w.r.t.\ the L2 loss). For this purpose, use the following steps: \begin{enumerate} \item Derive that $$ - \argmin_{\thetab} \riskrt = \argmin_{\thetab} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda|\theta_i|. + \argmin_{\thetav} \riskrt = \argmin_{\thetav} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda|\theta_i|. $$ \lz @@ -43,7 +43,7 @@ where $\thetabh = (\thetah_{1},\ldots,\thetah_{p})^\top = ({\Xmat}^T \Xmat)^{-1} g_i(\theta_i) = - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda |\theta_i|. 
$$ - What is the advantage of this representation if we seek to find the $\thetab$ with entries $\theta_1,\ldots,\theta_p$ minimizing $\riskrt?$ + What is the advantage of this representation if we seek to find the $\thetav$ with entries $\theta_1,\ldots,\theta_p$ minimizing $\riskrt?$ \lz \lz diff --git a/exercises/regularization/ex_rnw/sol_l0_regularization.Rnw b/exercises/regularization/ex_rnw/sol_l0_regularization.Rnw index d02baab5..98d3c1f2 100644 --- a/exercises/regularization/ex_rnw/sol_l0_regularization.Rnw +++ b/exercises/regularization/ex_rnw/sol_l0_regularization.Rnw @@ -5,36 +5,36 @@ % \item We can show % -$$\argmin_{\thetab} \riskrt = \argmin_{\thetab} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda \mathds{1}_{|\theta_i| \neq 0}$$ +$$\argmin_{\thetav} \riskrt = \argmin_{\thetav} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda \mathds{1}_{|\theta_i| \neq 0}$$ % as follows: % \begin{align*} % - \argmin_{\thetab} \riskrt + \argmin_{\thetav} \riskrt % - &= \argmin_{\thetab} \frac12 \sum_{i=1}^n \left( \yi - \thetab^\top \xi \right)^2 + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \\ + &= \argmin_{\thetav} \frac12 \sum_{i=1}^n \left( \yi - \thetav^\top \xi \right)^2 + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \\ % - &= \argmin_{\thetab} \frac12 \| \yv - \Xmat \thetab \|_2^2 + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \\ + &= \argmin_{\thetav} \frac12 \| \yv - \Xmat \thetav \|_2^2 + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \\ % - &= \argmin_{\thetab} \frac12 (\yv - \Xmat \thetab)^\top(\yv - \Xmat \thetab) + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \\ + &= \argmin_{\thetav} \frac12 (\yv - \Xmat \thetav)^\top(\yv - \Xmat \thetav) + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \\ % - &= \argmin_{\thetab} \frac12 \yv^\top \yv -\yv^\top \Xmat \thetab + \frac12 \thetab^\top \Xmat^\top \Xmat \thetab + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \\ + &= \argmin_{\thetav} \frac12 \yv^\top \yv -\yv^\top \Xmat \thetav + \frac12 \thetav^\top \Xmat^\top \Xmat \thetav + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \\ % - &= \argmin_{\thetab} -\yv^\top \Xmat \thetab + \frac12 \thetab^\top \Xmat^\top \Xmat \thetab + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \tag{ $ \yv^\top \yv$ does not depend on $\thetab$ }\\ + &= \argmin_{\thetav} -\yv^\top \Xmat \thetav + \frac12 \thetav^\top \Xmat^\top \Xmat \thetav + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \tag{ $ \yv^\top \yv$ does not depend on $\thetav$ }\\ % - &= \argmin_{\thetab} -\yv^\top \Xmat \thetab + \frac12 \thetab^\top\thetab + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \tag{By assumption $ \Xmat^\top \Xmat = \id$ }\\ + &= \argmin_{\thetav} -\yv^\top \Xmat \thetav + \frac12 \thetav^\top\thetav + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \tag{By assumption $ \Xmat^\top \Xmat = \id$ }\\ % - &= \argmin_{\thetab} -\thetabh^\top \thetab + \frac12 \thetab^\top\thetab + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \tag{By assumption $ \Xmat^\top \Xmat = \id$ so that $ \thetabh = ({\Xmat}^T \Xmat)^{-1} \Xmat^T\yv = \Xmat^T\yv$ }\\ + &= \argmin_{\thetav} -\thetavh^\top \thetav + \frac12 \thetav^\top\thetav + \lambda \sum_{i=1}^p \mathds{1}_{|\theta_i| \neq 0} \tag{By assumption $ \Xmat^\top \Xmat = \id$ so that $ \thetavh = ({\Xmat}^T \Xmat)^{-1} \Xmat^T\yv = \Xmat^T\yv$ }\\ % - &= \argmin_{\thetab} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda \mathds{1}_{|\theta_i| \neq 0} 
\tag{Writing out the inner products}\\ + &= \argmin_{\thetav} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda \mathds{1}_{|\theta_i| \neq 0} \tag{Writing out the inner products}\\ % \end{align*} % \item Note that the minimization problem on the right-hand side of the previous math display can be written as $\sum_{i=1}^p g_i(\theta_i),$ where $$ g_i(\theta) = - \thetah_{i}\theta + \frac{\theta^2}{2} + \lambda \mathds{1}_{|\theta| \neq 0}. $$ % -The advantage of this representation, if we are interested in finding the $\thetab$ with entries $\theta_1,\ldots,\theta_p$ minimizing $\riskrt,$ is that we can minimize each $g_i$ separately to obtain the optimal entries. +The advantage of this representation, if we are interested in finding the $\thetav$ with entries $\theta_1,\ldots,\theta_p$ minimizing $\riskrt,$ is that we can minimize each $g_i$ separately to obtain the optimal entries. % \item Consider first the case that $|\thetah_i|>\sqrt{2\lambda}.$ % @@ -135,7 +135,7 @@ Hence, the minimizer $\theta^*_i$ of $g_i$ is $\theta^*_i=0,$ which can be writt % Since % -$$\min_{\thetab} \riskrt = \min_{\thetab} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda \mathds{1}_{|\theta_i| \neq 0} = \min_{\thetab} \sum_{i=1}^p g_i(\theta_i),$$ +$$\min_{\thetav} \riskrt = \min_{\thetav} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda \mathds{1}_{|\theta_i| \neq 0} = \min_{\thetav} \sum_{i=1}^p g_i(\theta_i),$$ % we conclude that % diff --git a/exercises/regularization/ex_rnw/sol_lasso_regularization.Rnw b/exercises/regularization/ex_rnw/sol_lasso_regularization.Rnw index 083dfd97..4ffe4d48 100644 --- a/exercises/regularization/ex_rnw/sol_lasso_regularization.Rnw +++ b/exercises/regularization/ex_rnw/sol_lasso_regularization.Rnw @@ -6,26 +6,26 @@ \begin{equation}\label{eq:solve_b} \begin{aligned} - \thetabh &= \left(\underbrace{{\Xmat}^T \Xmat}_{\id}\right)^{-1} \Xmat^T\yv \\ - \thetabh &= \Xmat^T \yv + \thetavh &= \left(\underbrace{{\Xmat}^T \Xmat}_{\id}\right)^{-1} \Xmat^T\yv \\ + \thetavh &= \Xmat^T \yv \end{aligned} \end{equation} We will now use the result of eq. \ref{eq:solve_b} to show that: \begin{equation} \begin{aligned} - \argmin_{\thetab} \riskrt &= \argmin_{\thetab} \frac12 \sum_{i=1}^n \left( \yi - \thetab^\top \Xmat^{(i)} \right)^2 + \lambda \sum_{i=1}^p |\theta_i| \\ - &= \argmin_{\thetab} \frac12 \| \yv - \Xmat \thetab \|_2^2 + \lambda \|\thetab\|_1 \\ - &= \argmin_{\thetab} \frac12 ( \yv - \Xmat \thetab)^T ( \yv - \Xmat \thetab) + \lambda \|\thetab\|_1 \\ - &= \argmin_{\thetab} \frac12 \left(\underbrace{\yv^T \yv}_{indep. \ of \ \thetab} - 2 \underbrace{\yv^T \Xmat}_{\thetab^T} \thetab + \thetab^T \underbrace{\Xmat^T \Xmat}_{\id} \thetab \right) + \lambda \|\thetab\|_1 \\ - &= \argmin_{\thetab} - \thetabh^T \thetab + \frac{\thetab^T \thetab}{2} + \lambda \|\thetab\|_1 \\ - &= \argmin_{\thetab} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda|\theta_i|. + \argmin_{\thetav} \riskrt &= \argmin_{\thetav} \frac12 \sum_{i=1}^n \left( \yi - \thetav^\top \Xmat^{(i)} \right)^2 + \lambda \sum_{i=1}^p |\theta_i| \\ + &= \argmin_{\thetav} \frac12 \| \yv - \Xmat \thetav \|_2^2 + \lambda \|\thetav\|_1 \\ + &= \argmin_{\thetav} \frac12 ( \yv - \Xmat \thetav)^T ( \yv - \Xmat \thetav) + \lambda \|\thetav\|_1 \\ + &= \argmin_{\thetav} \frac12 \left(\underbrace{\yv^T \yv}_{indep. 
\ of \ \thetav} - 2 \underbrace{\yv^T \Xmat}_{\thetav^T} \thetav + \thetav^T \underbrace{\Xmat^T \Xmat}_{\id} \thetav \right) + \lambda \|\thetav\|_1 \\ + &= \argmin_{\thetav} - \thetavh^T \thetav + \frac{\thetav^T \thetav}{2} + \lambda \|\thetav\|_1 \\ + &= \argmin_{\thetav} \sum_{i=1}^p - \thetah_{i}\theta_i + \frac{\theta_i^2}{2} + \lambda|\theta_i|. \end{aligned} \end{equation} \item - The advantage of this representation if we are interested in finding $\thetab$ is that we can optimize each $g_i(\theta_i)$ separately to get optimal entries for $\theta_1,\dots,\theta_p$. + The advantage of this representation if we are interested in finding $\thetav$ is that we can optimize each $g_i(\theta_i)$ separately to get optimal entries for $\theta_1,\dots,\theta_p$. \item diff --git a/exercises/svm/ex_rnw/ex_linsvm_hardmargin.Rnw b/exercises/svm/ex_rnw/ex_linsvm_hardmargin.Rnw index 9f4e4286..da517554 100644 --- a/exercises/svm/ex_rnw/ex_linsvm_hardmargin.Rnw +++ b/exercises/svm/ex_rnw/ex_linsvm_hardmargin.Rnw @@ -4,8 +4,8 @@ \begin{equation*} \label{eq:softmargin} \begin{aligned} - & \min_{\mathbf{\thetab}, \theta_0} - & &\frac{1}{2} ||\mathbf{\thetab}||^2 \\ + & \min_{\mathbf{\thetav}, \theta_0} + & &\frac{1}{2} ||\mathbf{\thetav}||^2 \\ & \text{s.t. :} & & \svmhplane \geq 1 \end{aligned} @@ -38,8 +38,8 @@ \begin{itemize} \item $\gamma$ - \item $ \| \thetab \|$ - \item $ \thetab $ + \item $ \| \thetav \|$ + \item $ \thetav $ \item $ \theta_0 $ \item Determine which points are support vectors. \end{itemize} diff --git a/exercises/svm/ex_rnw/ex_linsvm_regression.Rnw b/exercises/svm/ex_rnw/ex_linsvm_regression.Rnw index 41dc4c10..30247e3e 100644 --- a/exercises/svm/ex_rnw/ex_linsvm_regression.Rnw +++ b/exercises/svm/ex_rnw/ex_linsvm_regression.Rnw @@ -1,16 +1,16 @@ - \renewcommand{\fxt}{f(\xv ~|~ \theta_0,\thetab)} + \renewcommand{\fxt}{f(\xv ~|~ \theta_0,\thetav)} % - For the data set $\D = \Dset$ with $\yi \in \R,$ assume that for a fixed $\eps>0$ all observations are within the $\eps$-tube around $\fxt = \thetab^\top \xv + \theta_0$ for any $(\theta_0,\thetab)^\top \in \tilde \Theta$, i.e., + For the data set $\D = \Dset$ with $\yi \in \R,$ assume that for a fixed $\eps>0$ all observations are within the $\eps$-tube around $\fxt = \thetav^\top \xv + \theta_0$ for any $(\theta_0,\thetav)^\top \in \tilde \Theta$, i.e., % - $$ \yi \in \left[f(\xi ~|~ \theta_0,\thetab)-\eps,~ f(\xi ~|~ \theta_0,\thetab)+\eps\right], \quad \forall i\in\{1,\ldots,n\},~ \forall (\theta_0,\thetab)^\top \in \tilde \Theta, $$ + $$ \yi \in \left[f(\xi ~|~ \theta_0,\thetav)-\eps,~ f(\xi ~|~ \theta_0,\thetav)+\eps\right], \quad \forall i\in\{1,\ldots,n\},~ \forall (\theta_0,\thetav)^\top \in \tilde \Theta, $$ % where $\tilde \Theta \subset \R^{p+1}$ is some non-empty parameter subset. % Let % $$ - d_\eps \left(f(\cdot~|~\theta_0,\thetab), \xi \right) := \eps - |\yi - f(\xi ~|~ \theta_0,\thetab)| = \eps - | \yi - \thetab^\top \xi - \theta_0| + d_\eps \left(f(\cdot~|~\theta_0,\thetav), \xi \right) := \eps - |\yi - f(\xi ~|~ \theta_0,\thetav)| = \eps - | \yi - \thetav^\top \xi - \theta_0| $$ % be the (signed) $\eps$-distance of the prediction error. @@ -18,22 +18,22 @@ The maximal $\eps$-distance of the prediction error of $f$ to the whole data set $\D$ is $$ - \gamma_\eps = \max\limits_{i=1,\ldots,n} \Big\{ d_\eps \left(f(\cdot~|~\theta_0,\thetab), \xi \right) \Big\}. + \gamma_\eps = \max\limits_{i=1,\ldots,n} \Big\{ d_\eps \left(f(\cdot~|~\theta_0,\thetav), \xi \right) \Big\}. 
$$ % \begin{enumerate} % - \item Let $(\theta_0,\thetab)^\top \in \R^{p+1}$ be arbitrary. Which (type of) values does $d_\eps \left(f(\cdot~|~\theta_0,\thetab),\xi \right)$ have if $\yi$ is + \item Let $(\theta_0,\thetav)^\top \in \R^{p+1}$ be arbitrary. Which (type of) values does $d_\eps \left(f(\cdot~|~\theta_0,\thetav),\xi \right)$ have if $\yi$ is % \begin{itemize} % - \item within the $\eps$-tube around $f(\xi ~|~ \theta_0,\thetab)?$ + \item within the $\eps$-tube around $f(\xi ~|~ \theta_0,\thetav)?$ % - \item not within the $\eps$-tube around $f(\xi ~|~ \theta_0,\thetab)?$ + \item not within the $\eps$-tube around $f(\xi ~|~ \theta_0,\thetav)?$ % \end{itemize} % - What would be a desirable choice of the parameters $(\theta_0,\thetab)^\top$ with respect to $\gamma_\eps?$ + What would be a desirable choice of the parameters $(\theta_0,\thetav)^\top$ with respect to $\gamma_\eps?$ % Is the choice of the parameters unique in general? % @@ -41,17 +41,17 @@ % \begin{eqnarray*} % - & \min\limits_{\thetab, \theta_0} \quad & \frac{1}{2} \|\thetab\|^2 \\ + & \min\limits_{\thetav, \theta_0} \quad & \frac{1}{2} \|\thetav\|^2 \\ % - & \text{s.t.} & \,\, \eps -\yi + \thetab^\top\xi + \theta_0 \geq 0 \quad \forall\, i \in \nset \\ + & \text{s.t.} & \,\, \eps -\yi + \thetav^\top\xi + \theta_0 \geq 0 \quad \forall\, i \in \nset \\ % - & \text{and} & \,\, \eps + \yi - \thetab^\top\xi - \theta_0 \geq 0 \quad \forall\, i \in \nset. + & \text{and} & \,\, \eps + \yi - \thetav^\top\xi - \theta_0 \geq 0 \quad \forall\, i \in \nset. % \end{eqnarray*} % is a suitable optimization problem for the desired choice in (a). % - \item Derive the Lagrange function $L(\thetab, \theta_0, \alphav)$ of the optimization problem as well as its dual form. + \item Derive the Lagrange function $L(\thetav, \theta_0, \alphav)$ of the optimization problem as well as its dual form. % \item Find the stationary points of $L.$ What can be inferred from the solution of the dual problem? 
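Appended to the $\eps$-tube exercise above as an illustrative aside (not part of the patch): a toy computation of the signed $\eps$-distance $d_\eps$ from part (a); the data and the candidate parameters are made up.

<<eps-tube-sketch, eval=FALSE>>=
# Illustrative sketch: signed eps-distances d_eps(f, x^(i)) = eps - |y^(i) - f(x^(i))|
# for a toy 1-d data set and a candidate linear model f(x) = theta * x + theta_0.
eps <- 0.5
x <- c(-0.6, -0.2, 0.0, 0.3, 0.8)
y <- c(-0.5, -0.1, 0.1, 0.4, 0.7)
theta_0 <- 0.05; theta <- 0.9

f_hat <- theta * x + theta_0
d_eps <- eps - abs(y - f_hat)

# y^(i) lies inside the eps-tube around f(x^(i)) exactly when d_eps >= 0
data.frame(x, y, f_hat, d_eps, inside_tube = d_eps >= 0)
@

For this toy candidate all $d_\eps$ values are positive, so it would satisfy both inequality constraints of the optimization problem in part (b).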
% diff --git a/exercises/svm/ex_rnw/ex_svm_kernelized_multiclass.Rnw b/exercises/svm/ex_rnw/ex_svm_kernelized_multiclass.Rnw index 672365de..0efc60a7 100644 --- a/exercises/svm/ex_rnw/ex_svm_kernelized_multiclass.Rnw +++ b/exercises/svm/ex_rnw/ex_svm_kernelized_multiclass.Rnw @@ -1,5 +1,5 @@ \newcommand{\betab}{\bm{\beta}} - \renewcommand{\fxt}{f(\xv ~|~ \theta_0,\thetab)} + \renewcommand{\fxt}{f(\xv ~|~ \theta_0,\thetav)} % For the data set $\D = \Dset$ with $\yi \in \Yspace=\{-1,1\},$ assume we are provided with a suitable feature map $\phi:\Xspace \to \Phi,$ where $\Phi \subset \R^d.$ % @@ -8,8 +8,8 @@ \vspace*{-0.5cm} \begin{eqnarray*} - & \min\limits_{\thetab, \theta_0,\sli} & \frac{1}{2} \thetab^\top \thetab + C \sum_{i=1}^n \sli \\ - & \text{s.t.} & \,\, \yi \left( \scp{\thetab}{\phi\left(\xi\right)} + \theta_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ + & \min\limits_{\thetav, \theta_0,\sli} & \frac{1}{2} \thetav^\top \thetav + C \sum_{i=1}^n \sli \\ + & \text{s.t.} & \,\, \yi \left( \scp{\thetav}{\phi\left(\xi\right)} + \theta_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ & \text{and} & \,\, \sli \geq 0 \quad \forall\, i \in \nset, \end{eqnarray*} % @@ -19,15 +19,15 @@ % \item Argue that this is equivalent to the following ERM problem: % - $$ \risket = \frac{1}{2} \|\thetab\|^2 + C \sumin \max(1-\yi (\thetab^\top \phi(\xi) + \theta_0), 0),$$ + $$ \risket = \frac{1}{2} \|\thetav\|^2 + C \sumin \max(1-\yi (\thetav^\top \phi(\xi) + \theta_0), 0),$$ % i.e., the regularized ERM problem for the hinge loss for the hypothesis space % - $$ \Hspace = \{ f:\Phi \to \R ~|~ f(\mathbf{z}) = \thetab^\top \mathbf{z} + \theta_0 \quad \thetab \in \R^d,\theta_0\in\R \}. $$ + $$ \Hspace = \{ f:\Phi \to \R ~|~ f(\mathbf{z}) = \thetav^\top \mathbf{z} + \theta_0 \quad \thetav \in \R^d,\theta_0\in\R \}. $$ % \item Now assume we deal with a multiclass classification problem with a data set $\D = \Dset$ such that $\yi \in \Yspace=\{1,\ldots,g\}$ for each $i \in \nset.$ In this case, we can derive a similar regularized ERM problem by using the multiclass hinge loss (see Exercise Sheet 4 (b)): % - $$ \risket = \frac{1}{2} \|\thetab\|^2 + C \sumin \sum_{y\neq \yi} \max(1 + \thetab^\top\psi(\xi,y) - \thetab^\top\psi(\xi,\yi) , 0),$$ + $$ \risket = \frac{1}{2} \|\thetav\|^2 + C \sumin \sum_{y\neq \yi} \max(1 + \thetav^\top\psi(\xi,y) - \thetav^\top\psi(\xi,\yi) , 0),$$ % where $\psi: \Xspace \times \Yspace \to \R^d$ is suitable (multiclass) feature map. 
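As an aside to the hunk above, an illustrative sketch (not part of the patch) of the inner sum of this regularized ERM objective, written in terms of the class scores $f_y = \thetav^\top \psi(\xv, y)$; the score values in the example are made up.

<<multiclass-hinge-sketch, eval=FALSE>>=
# Illustrative sketch: per-observation multiclass hinge loss
#   sum_{y != y_i} max(1 + f_y - f_{y_i}, 0),  with f_y = theta^T psi(x, y).
multiclass_hinge <- function(scores, true_class) {
  margins <- 1 + scores - scores[true_class]   # 1 + f_y - f_{y_i} for every class y
  sum(pmax(margins[-true_class], 0))           # sum over all y != y_i
}

# g = 3 classes, true class 2: class 1 clears the margin of 1, class 3 does not
multiclass_hinge(scores = c(-0.5, 1.0, 0.7), true_class = 2)   # 0.7
@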
% @@ -53,5 +53,5 @@ % Here, $(\bm{K}\betab)_{(i-1)g+y}$ denotes the $\big((i-1)g+y\big)$-th entry of the vector $\bm{K}\betab.$ - \emph{Hint:} The representer theorem tells us that for the solution $\thetab^*$ (if it exists) of $\risket$ it holds that $\thetab^* \in \spn\{ (\psi(\xv^{(i)},y))_{i=1,\ldots,n,y=1,\ldots,g} \}.$ + \emph{Hint:} The representer theorem tells us that for the solution $\thetav^*$ (if it exists) of $\risket$ it holds that $\thetav^* \in \spn\{ (\psi(\xv^{(i)},y))_{i=1,\ldots,n,y=1,\ldots,g} \}.$ \end{enumerate} \ No newline at end of file diff --git a/exercises/svm/ex_rnw/ic_linsvm_hardmargin.Rnw b/exercises/svm/ex_rnw/ic_linsvm_hardmargin.Rnw index d7b1cf63..246a36f0 100644 --- a/exercises/svm/ex_rnw/ic_linsvm_hardmargin.Rnw +++ b/exercises/svm/ex_rnw/ic_linsvm_hardmargin.Rnw @@ -4,8 +4,8 @@ \begin{equation*} \label{eq:softmargin} \begin{aligned} - & \min_{\mathbf{\thetab}, \theta_0} - & &\frac{1}{2} ||\mathbf{\thetab}||^2 \\ + & \min_{\mathbf{\thetav}, \theta_0} + & &\frac{1}{2} ||\mathbf{\thetav}||^2 \\ & \text{s.t. :} & & \svmhplane \geq 1 \end{aligned} @@ -42,12 +42,12 @@ \lz \lz \lz - \item $ \| \thetab \|$ + \item $ \| \thetav \|$ \lz \lz \lz \lz - \item $ \thetab $ + \item $ \thetav $ \lz \lz \lz diff --git a/exercises/svm/ex_rnw/sol_linsvm_hardmargin.Rnw b/exercises/svm/ex_rnw/sol_linsvm_hardmargin.Rnw index e3d110b3..f0353ab7 100644 --- a/exercises/svm/ex_rnw/sol_linsvm_hardmargin.Rnw +++ b/exercises/svm/ex_rnw/sol_linsvm_hardmargin.Rnw @@ -13,10 +13,10 @@ The maximum margin is achieved when the hyperplane lies exactly between these two points, with a value of $\gamma =1$. \item - The norm of $\thetab$ can be calculated as : + The norm of $\thetav$ can be calculated as : $$ - \| \thetab \| = \frac{1}{\gamma} = \frac{1}{1} = 1 + \| \thetav \| = \frac{1}{\gamma} = \frac{1}{1} = 1 $$ \item @@ -26,7 +26,7 @@ x_1 = 5 $$ - Using the formula $ \thetab^T \xv + \theta_0 = 0 $, we obtain that $\theta_1$ = 1, $\theta_2 = 0$ and $\theta_0=-5$ + Using the formula $ \thetav^T \xv + \theta_0 = 0 $, we obtain that $\theta_1$ = 1, $\theta_2 = 0$ and $\theta_0=-5$ \item The support vectors are the ones that determine the margins, in this case: $(4,1)^T$ and $(6,1)^T$ @@ -92,8 +92,8 @@ text(4, 4.25, expression(theta), pos = 1, cex = 1.5,col='chartreuse4') @ - The safety margin $\gamma$ and thus the norm of $\thetab$ remain the same. - However, the direction of $\thetab$ changed because of the rotation. + The safety margin $\gamma$ and thus the norm of $\thetav$ remain the same. + However, the direction of $\thetav$ changed because of the rotation. The support vectors are still the same as before, but rotated. \item @@ -153,8 +153,8 @@ text(5.1, 3.5, expression(theta), pos = 1, cex = 1.5,col='chartreuse4') @ - The margin $\gamma$ is now smaller, and thus $\| \thetab \|$ is bigger. - The direction of $\thetab$ and $\theta_0$ remain the same. + The margin $\gamma$ is now smaller, and thus $\| \thetav \|$ is bigger. + The direction of $\thetav$ and $\theta_0$ remain the same. The support vectors are now $(4,1)^T$ and $(5.5,1)^T$. \item @@ -181,8 +181,8 @@ text(5.1, 3.5, expression(theta), pos = 1, cex = 1.5,col='chartreuse4') @ - The margin $\gamma$ increases, and thus $\| \thetab \|$ will decrease. - The direction of $\thetab$ remains the same but $\theta_0$ changes. + The margin $\gamma$ increases, and thus $\| \thetav \|$ will decrease. + The direction of $\thetav$ remains the same but $\theta_0$ changes. The new Support vectors are $(4,1)^T$ and $(6.5,1)^T$. 
\end{itemize} @@ -195,7 +195,7 @@ \begin{table}[h!] \centering \begin{tabular}{c|c|c|c|c} -$\gamma$ & $ \| \thetab \| $ & $ \thetab $ & $\theta_0$ & SV \\ \hline +$\gamma$ & $ \| \thetav \| $ & $ \thetav $ & $\theta_0$ & SV \\ \hline 1 & 1 & $\frac{1}{\sqrt{2}} (1,1)^T$ & -5 & Rotation ($(4,1)^T, (6,1)^T$) \\ 1 & 1 & $(1,0)^T$ & -7 & $(6,1)^T, (8,1)^T $ \\ 0.75 & 4/3 & $ (4/3,0)^T$ & -6.33 & $(4,1)^T, (5.5,1)^T $ \\ diff --git a/exercises/svm/ex_rnw/sol_linsvm_regression.Rnw b/exercises/svm/ex_rnw/sol_linsvm_regression.Rnw index ddb72bd8..17463c7f 100644 --- a/exercises/svm/ex_rnw/sol_linsvm_regression.Rnw +++ b/exercises/svm/ex_rnw/sol_linsvm_regression.Rnw @@ -1,26 +1,26 @@ - \renewcommand{\fxt}{f(\xv ~|~ \theta_0,\thetab)} + \renewcommand{\fxt}{f(\xv ~|~ \theta_0,\thetav)} % \begin{enumerate} % - \item Regarding the values of $d_\eps \left(f(\cdot~|~\theta_0,\thetab),\xi \right)$ for an outcome $\yi$ we have that: + \item Regarding the values of $d_\eps \left(f(\cdot~|~\theta_0,\thetav),\xi \right)$ for an outcome $\yi$ we have that: % \begin{itemize} % - \item If $\yi$ is within the $\eps$-tube around $f(\xi ~|~ \theta_0,\thetab),$ then $d_\eps \left(f(\cdot~|~\theta_0,\thetab),\xi \right)\geq 0.$ The largest possible value of $d_\eps \left(f(\cdot~|~\theta_0,\thetab),\xi \right)$ is $\eps,$ which corresponds to a perfect prediction for that point, i.e., $\yi = f(\xi ~|~ \theta_0,\thetab).$ + \item If $\yi$ is within the $\eps$-tube around $f(\xi ~|~ \theta_0,\thetav),$ then $d_\eps \left(f(\cdot~|~\theta_0,\thetav),\xi \right)\geq 0.$ The largest possible value of $d_\eps \left(f(\cdot~|~\theta_0,\thetav),\xi \right)$ is $\eps,$ which corresponds to a perfect prediction for that point, i.e., $\yi = f(\xi ~|~ \theta_0,\thetav).$ % - \item If $\yi$ is not within the $\eps$-tube around $f(\xi ~|~ \theta_0,\thetab),$ then $d_\eps \left(f(\cdot~|~\theta_0,\thetab),\xi \right)<0.$ + \item If $\yi$ is not within the $\eps$-tube around $f(\xi ~|~ \theta_0,\thetav),$ then $d_\eps \left(f(\cdot~|~\theta_0,\thetav),\xi \right)<0.$ % \end{itemize} % - A desirable choice of the parameters $(\theta_0,\thetab)^\top$ with respect to $\gamma_\eps$ would be such that $\gamma_\eps$ is maximized, as this would make sure that the prediction errors are as far away as possible from the $\eps$-boundaries, but still within the $\eps$-tube. + A desirable choice of the parameters $(\theta_0,\thetav)^\top$ with respect to $\gamma_\eps$ would be such that $\gamma_\eps$ is maximized, as this would make sure that the prediction errors are as far away as possible from the $\eps$-boundaries, but still within the $\eps$-tube. \begin{minipage}{0.45\textwidth} % - The choice of the parameters $(\theta_0,\thetab)^\top$ is not unique, as the plot on the right shows for $\eps=0.5.$ + The choice of the parameters $(\theta_0,\thetav)^\top$ is not unique, as the plot on the right shows for $\eps=0.5.$ % - Both the black and the green model have $\gamma_\eps = \eps,$ since $d_\eps \left(f(\cdot~|~\theta_0,\thetab), -0.2 \right) = \eps = {\color{green} d_\eps \left(f(\cdot~|~\theta_0,\thetab), 0 \right)}$ + Both the black and the green model have $\gamma_\eps = \eps,$ since $d_\eps \left(f(\cdot~|~\theta_0,\thetav), -0.2 \right) = \eps = {\color{green} d_\eps \left(f(\cdot~|~\theta_0,\thetav), 0 \right)}$ and we cannot find another model such that its $\eps$-tube covers the outcomes. 
% \end{minipage} @@ -38,13 +38,13 @@ % \begin{eqnarray*} % - & \max\limits_{\thetab, \theta_0} & \gamma_\eps \\ + & \max\limits_{\thetav, \theta_0} & \gamma_\eps \\ % - & \text{s.t.} & \,\, d_\eps \left(f(\cdot~|~\theta_0,\thetab), \xi \right) \geq 0 \quad \forall\, i \in \nset. + & \text{s.t.} & \,\, d_\eps \left(f(\cdot~|~\theta_0,\thetav), \xi \right) \geq 0 \quad \forall\, i \in \nset. % \end{eqnarray*} % - The constraints mean that we require that any instance $i$ should have a positive $\eps$-distance of the prediction error for $f(\xi ~|~ \theta_0,\thetab).$ + The constraints mean that we require that any instance $i$ should have a positive $\eps$-distance of the prediction error for $f(\xi ~|~ \theta_0,\thetav).$ % In other words, the differences between the predictions and the outcomes should be at most $\eps$ and within the $\eps$-tube of the predictions. % @@ -52,9 +52,9 @@ % \begin{eqnarray*} % - & \max\limits_{\thetab, \theta_0} & \gamma_\eps \\ + & \max\limits_{\thetav, \theta_0} & \gamma_\eps \\ % - & \text{s.t.} & \,\, \eps - | \yi - \thetab^\top \xi - \theta_0| \geq 0 \quad \forall\, i \in \nset. + & \text{s.t.} & \,\, \eps - | \yi - \thetav^\top \xi - \theta_0| \geq 0 \quad \forall\, i \in \nset. % \end{eqnarray*} % @@ -62,25 +62,25 @@ % \begin{eqnarray*} % - & \max\limits_{\thetab, \theta_0} & \gamma_\eps \\ + & \max\limits_{\thetav, \theta_0} & \gamma_\eps \\ % % - & \text{s.t.} & \,\, \eps -\yi + \thetab^\top\xi + \theta_0 \geq 0 \quad \forall\, i \in \nset \\ + & \text{s.t.} & \,\, \eps -\yi + \thetav^\top\xi + \theta_0 \geq 0 \quad \forall\, i \in \nset \\ % - & \text{and} & \,\, \eps + \yi - \thetab^\top\xi - \theta_0 \geq 0 \quad \forall\, i \in \nset. + & \text{and} & \,\, \eps + \yi - \thetav^\top\xi - \theta_0 \geq 0 \quad \forall\, i \in \nset. % % \end{eqnarray*} % - As we have seen before the solution might not be unique, so that we make the reference choice $\gamma_\eps = C/\|\thetab\|$ for some constant $C>0,$ leading to + As we have seen before the solution might not be unique, so that we make the reference choice $\gamma_\eps = C/\|\thetav\|$ for some constant $C>0,$ leading to % \begin{eqnarray*} % - & \min\limits_{\thetab, \theta_0} \quad & C \|\thetab\|^2 \\ + & \min\limits_{\thetav, \theta_0} \quad & C \|\thetav\|^2 \\ % - & \text{s.t.} & \,\, \eps -\yi + \thetab^\top\xi + \theta_0 \geq 0 \quad \forall\, i \in \nset \\ + & \text{s.t.} & \,\, \eps -\yi + \thetav^\top\xi + \theta_0 \geq 0 \quad \forall\, i \in \nset \\ % - & \text{and} & \,\, \eps + \yi - \thetab^\top\xi - \theta_0 \geq 0 \quad \forall\, i \in \nset. + & \text{and} & \,\, \eps + \yi - \thetav^\top\xi - \theta_0 \geq 0 \quad \forall\, i \in \nset. % \end{eqnarray*} % @@ -93,21 +93,21 @@ \small \begin{eqnarray*} - &L(\thetab, \theta_0, \alphav,\tilde \alphav) = & \frac{1}{2}\|\thetab\|^2 - \sum_{i=1}^n \alpha_i \left[ \eps - \yi + \left( \thetab^\top\xi + \theta_0 \right) \right] - \sum_{i=1}^n \tilde\alpha_i \left[\eps - \left( \thetab^\top\xi + \theta_0 \right) + \yi \right]\\ + &L(\thetav, \theta_0, \alphav,\tilde \alphav) = & \frac{1}{2}\|\thetav\|^2 - \sum_{i=1}^n \alpha_i \left[ \eps - \yi + \left( \thetav^\top\xi + \theta_0 \right) \right] - \sum_{i=1}^n \tilde\alpha_i \left[\eps - \left( \thetav^\top\xi + \theta_0 \right) + \yi \right]\\ & \text{s.t.} & \,\, \alpha_i,\tilde \alpha_i \ge 0 \quad \forall\, i \in \nset. 
\end{eqnarray*} \normalsize The \textbf{dual} form of this problem is - $$\max\limits_{\alphav,\tilde\alphav} \min\limits_{\thetab, \theta_0} L(\thetab, \theta_0,\alphav,\tilde \alphav).$$ + $$\max\limits_{\alphav,\tilde\alphav} \min\limits_{\thetav, \theta_0} L(\thetav, \theta_0,\alphav,\tilde \alphav).$$ % - \item The stationary points of $L$ can be derived by setting the derivative of the Lagrangian function to 0 and solve with respect to the corresponding term of interest, i.e., for $\thetab:$ + \item The stationary points of $L$ can be derived by setting the derivative of the Lagrangian function to 0 and solve with respect to the corresponding term of interest, i.e., for $\thetav:$ % \begin{align*} % - \nabla_{\thetab} L(\thetab, \theta_0, \alphav,\tilde \alphav) = \thetab - \sum_{i=1}^n \alpha_i \xi + \sum_{i=1}^n \tilde\alpha_i &\xi \stackrel{!}{=} 0 \\ + \nabla_{\thetav} L(\thetav, \theta_0, \alphav,\tilde \alphav) = \thetav - \sum_{i=1}^n \alpha_i \xi + \sum_{i=1}^n \tilde\alpha_i &\xi \stackrel{!}{=} 0 \\ % - &\Leftrightarrow \thetab = \sum_{i=1}^n (\alpha_i - \tilde \alpha_i) \xi. + &\Leftrightarrow \thetav = \sum_{i=1}^n (\alpha_i - \tilde \alpha_i) \xi. % \end{align*} % @@ -115,15 +115,15 @@ % \begin{align*} % - \nabla_{\theta_0} L(\thetab, \theta_0, \alphav,\tilde \alphav) = - \sum_{i=1}^n \alpha_i + \sum_{i=1}^n \tilde\alpha_i & \stackrel{!}{=} 0 \\ + \nabla_{\theta_0} L(\thetav, \theta_0, \alphav,\tilde \alphav) = - \sum_{i=1}^n \alpha_i + \sum_{i=1}^n \tilde\alpha_i & \stackrel{!}{=} 0 \\ % &\Leftrightarrow 0 = \sum_{i=1}^n (\alpha_i - \tilde \alpha_i) . % \end{align*} % - If $(\thetab, \theta_0, \alphav, \tilde{\alphav})$ fulfills the KKT conditions (stationarity, primal/dual feasibility, complementary slackness), it solves both the primal and dual problem (strong duality). + If $(\thetav, \theta_0, \alphav, \tilde{\alphav})$ fulfills the KKT conditions (stationarity, primal/dual feasibility, complementary slackness), it solves both the primal and dual problem (strong duality). % - Under these conditions, and if we solve the dual problem and obtain $\alphavh$ or $\widetilde{\alphavh}$, we know that $\thetab$ is a linear combination of our data points: + Under these conditions, and if we solve the dual problem and obtain $\alphavh$ or $\widetilde{\alphavh}$, we know that $\thetav$ is a linear combination of our data points: % $$ \thetah = \sumin ( \alphah_i - \widetilde{\alphah_i}) \xi @@ -133,22 +133,22 @@ % \begin{align*} % - &\alphah_i \left[ \eps - \yi + \left( \thetab^\top\xi + \theta_0 \right) \right] = 0 \quad \forall ~ i \in \{1, ..., n \}, \\ + &\alphah_i \left[ \eps - \yi + \left( \thetav^\top\xi + \theta_0 \right) \right] = 0 \quad \forall ~ i \in \{1, ..., n \}, \\ % - & \widetilde{\alphah_i} \left[\eps - \left( \thetab^\top\xi + \theta_0 \right) + \yi \right] = 0 \quad \forall ~ i \in \{1, ..., n \}. + & \widetilde{\alphah_i} \left[\eps - \left( \thetav^\top\xi + \theta_0 \right) + \yi \right] = 0 \quad \forall ~ i \in \{1, ..., n \}. % \end{align*} - So either $\alphah_i = 0$, or $\alphah_i > 0$, then $\eps = \yi - \left( \thetab^\top\xi + \theta_0 \right) $, and $(\xi, \yi)$ is exactly on the boundary of the $\eps$-tube of the prediction and $ \thetab^\top\xi + \theta_0 $ underestimates $\yi$ (by exactly $\eps$). 
+ So either $\alphah_i = 0$, or $\alphah_i > 0$, then $\eps = \yi - \left( \thetav^\top\xi + \theta_0 \right) $, and $(\xi, \yi)$ is exactly on the boundary of the $\eps$-tube of the prediction and $ \thetav^\top\xi + \theta_0 $ underestimates $\yi$ (by exactly $\eps$). % - Similarly, it holds either $\widetilde{\alphah_i} = 0$, or $\widetilde{\alphah_i} > 0$, then $\eps = \left( \thetab^\top\xi + \theta_0 \right) - \yi $, and $(\xi, \yi)$ is exactly on the boundary of the $\eps$-tube of the prediction and $ \thetab^\top\xi + \theta_0 $ overestimates $\yi$ (by exactly $\eps$). + Similarly, it holds either $\widetilde{\alphah_i} = 0$, or $\widetilde{\alphah_i} > 0$, then $\eps = \left( \thetav^\top\xi + \theta_0 \right) - \yi $, and $(\xi, \yi)$ is exactly on the boundary of the $\eps$-tube of the prediction and $ \thetav^\top\xi + \theta_0 $ overestimates $\yi$ (by exactly $\eps$). % For the bias term $\theta_0$ we infer that % - $$ \theta_0 = \yi - \thetab^\top\xi -\eps$$ + $$ \theta_0 = \yi - \thetav^\top\xi -\eps$$ % in the case $\alphah_i > 0$ and % - $$ \theta_0 = \yi - \thetab^\top\xi + \eps$$ + $$ \theta_0 = \yi - \thetav^\top\xi + \eps$$ % in the case $\widetilde{\alphah_i} > 0.$ % @@ -157,32 +157,32 @@ \begin{eqnarray*} % % - & \,\, \eps -\yi + \thetab^\top\xi + \theta_0 \geq \sli \quad \forall\, i \in \nset \\ + & \,\, \eps -\yi + \thetav^\top\xi + \theta_0 \geq \sli \quad \forall\, i \in \nset \\ % - & \text{and} \,\, \eps + \yi - \thetab^\top\xi - \theta_0 \geq \widetilde{\sli} \quad \forall\, i \in \nset. + & \text{and} \,\, \eps + \yi - \thetav^\top\xi - \theta_0 \geq \widetilde{\sli} \quad \forall\, i \in \nset. % \end{eqnarray*} % - We minimize then a weighted sum of $\|\thetab\|^2$ and the sum of the slack variables: + We minimize then a weighted sum of $\|\thetav\|^2$ and the sum of the slack variables: \begin{eqnarray*} % - & \min\limits_{\thetab, \theta_0} \quad & \frac{1}{2} \|\thetab\|^2 + C \sum_{i=1}^n \sli + \widetilde{\sli} \\ + & \min\limits_{\thetav, \theta_0} \quad & \frac{1}{2} \|\thetav\|^2 + C \sum_{i=1}^n \sli + \widetilde{\sli} \\ % - & \text{s.t.} & \,\, \eps -\yi + \thetab^\top\xi + \theta_0 \geq - \sli\quad \forall\, i \in \nset \\ + & \text{s.t.} & \,\, \eps -\yi + \thetav^\top\xi + \theta_0 \geq - \sli\quad \forall\, i \in \nset \\ % - & \text{and} & \,\, \eps + \yi - \thetab^\top\xi - \theta_0 \geq - \widetilde{\sli} \quad \forall\, i \in \nset, \\ + & \text{and} & \,\, \eps + \yi - \thetav^\top\xi - \theta_0 \geq - \widetilde{\sli} \quad \forall\, i \in \nset, \\ % & &\sli,\widetilde{\sli} \geq 0 \quad \forall\, i \in \nset. % \end{eqnarray*} % - \item In the optimum, the inequalities will hold with equality (as we minimize the slacks), so $\sli = \eps -\yi + \thetab^\top\xi + \theta_0$ and $ \widetilde{\sli} = \eps + \yi - \thetab^\top\xi - \theta_0$, but the lowest value $\sli$ and $\widetilde{\sli}$ can take is 0. + \item In the optimum, the inequalities will hold with equality (as we minimize the slacks), so $\sli = \eps -\yi + \thetav^\top\xi + \theta_0$ and $ \widetilde{\sli} = \eps + \yi - \thetav^\top\xi - \theta_0$, but the lowest value $\sli$ and $\widetilde{\sli}$ can take is 0. % So we can rewrite the above: % \begin{align*} - \frac{1}{2} \|\thetab\|^2 + C \sumin \Lxyi ;\; \Lxy = + \frac{1}{2} \|\thetav\|^2 + C \sumin \Lxyi ;\; \Lxy = \begin{cases} 0, & \text{ if } |y - \fx| \leq \eps, \\ |y - \fx| -\eps, & \text{ else. 
} diff --git a/exercises/svm/ex_rnw/sol_svm_kernelized_multiclass.Rnw b/exercises/svm/ex_rnw/sol_svm_kernelized_multiclass.Rnw index be9d3857..6f212722 100644 --- a/exercises/svm/ex_rnw/sol_svm_kernelized_multiclass.Rnw +++ b/exercises/svm/ex_rnw/sol_svm_kernelized_multiclass.Rnw @@ -1,23 +1,23 @@ \newcommand{\betab}{\bm{\beta}} -\renewcommand{\fxt}{f(\xv ~|~ \theta_0,\thetab)} +\renewcommand{\fxt}{f(\xv ~|~ \theta_0,\thetav)} % \begin{enumerate} % \item We consider the following constrained optimization problem: % \begin{eqnarray*} - & \min\limits_{\thetab, \theta_0,\sli} & \frac{1}{2} \thetab^\top \thetab + C \sum_{i=1}^n \sli \\ - & \text{s.t.} & \,\, \yi \left( \scp{\thetab}{\phi\left(\xi\right)} + \theta_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ + & \min\limits_{\thetav, \theta_0,\sli} & \frac{1}{2} \thetav^\top \thetav + C \sum_{i=1}^n \sli \\ + & \text{s.t.} & \,\, \yi \left( \scp{\thetav}{\phi\left(\xi\right)} + \theta_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ & \text{and} & \,\, \sli \geq 0 \quad \forall\, i \in \nset. \end{eqnarray*} % - In the optimum, the inequalities will hold with equality (as we minimize the slacks), so $\sli = 1 - \yi \left( \scp{\thetab}{\phi\left(\xi\right)} + \theta_0 \right)$, but the lowest value $\sli$ can take is 0 (we do no get a bonus for points beyond the margin on the correct side). + In the optimum, the inequalities will hold with equality (as we minimize the slacks), so $\sli = 1 - \yi \left( \scp{\thetav}{\phi\left(\xi\right)} + \theta_0 \right)$, but the lowest value $\sli$ can take is 0 (we do no get a bonus for points beyond the margin on the correct side). % So we can rewrite the above: % \begin{align*} % - \frac{1}{2} \|\thetab\|^2 + C \sumin \max(1-\yi (\thetab^\top \phi(\xi) + \theta_0), 0). + \frac{1}{2} \|\thetav\|^2 + C \sumin \max(1-\yi (\thetav^\top \phi(\xi) + \theta_0), 0). % \end{align*} % @@ -25,26 +25,26 @@ % \item % - Let $\psi(\xv,y) = \frac12y\tilde{\phi}(x),$ with $\tilde{\phi}(x):=(1,\phi(x))^{\top}$, where $\phi$ is the feature map of the regularized binary ERM problem in (a). Further, define $\tilde{\thetab}:=(\theta_0,\thetab)^{\top}$, i.e. we absorb the intercept into the feature map and parameter vector for convenience. + Let $\psi(\xv,y) = \frac12y\tilde{\phi}(x),$ with $\tilde{\phi}(x):=(1,\phi(x))^{\top}$, where $\phi$ is the feature map of the regularized binary ERM problem in (a). Further, define $\tilde{\thetav}:=(\theta_0,\thetav)^{\top}$, i.e. we absorb the intercept into the feature map and parameter vector for convenience. 
% Now, if $y \neq \yi$ it holds that $y = - \yi,$ so that % \begin{align*} % - 1 + \tilde{\thetab}^\top \psi(\xi,y) &- \tilde{\thetab}^\top \psi (\xi,\yi) \\ + 1 + \tilde{\thetav}^\top \psi(\xi,y) &- \tilde{\thetav}^\top \psi (\xi,\yi) \\ % - &= 1 + \frac12 y \tilde{\thetab}^\top\tilde{\phi}(\xi) - \frac12 \yi \tilde{\thetab}^\top\tilde{\phi}(\xi) \tag{Definition of $\psi$}\\ + &= 1 + \frac12 y \tilde{\thetav}^\top\tilde{\phi}(\xi) - \frac12 \yi \tilde{\thetav}^\top\tilde{\phi}(\xi) \tag{Definition of $\psi$}\\ % - &= 1 + \frac12 \left( y - \yi \right) \tilde{\thetab}^\top \tilde{\phi}(\xi) \tag{Distributivity}\\ + &= 1 + \frac12 \left( y - \yi \right) \tilde{\thetav}^\top \tilde{\phi}(\xi) \tag{Distributivity}\\ % &= \begin{cases} % - 1 + \tilde{\thetab}^\top \tilde{\phi}(\xi), & \mbox{if } \yi=-1 \\ - 1 - \tilde{\thetab}^\top \tilde{\phi}(\xi), & \mbox{if } \yi=+1 \\ + 1 + \tilde{\thetav}^\top \tilde{\phi}(\xi), & \mbox{if } \yi=-1 \\ + 1 - \tilde{\thetav}^\top \tilde{\phi}(\xi), & \mbox{if } \yi=+1 \\ % \end{cases} \tag{Since $y = - \yi$}\\ % - &= 1 - \yi \tilde{\thetab}^\top \tilde{\phi}(\xi). + &= 1 - \yi \tilde{\thetav}^\top \tilde{\phi}(\xi). % \end{align*} % @@ -54,19 +54,19 @@ % \risket % - &= \frac{1}{2} \|\thetab\|^2 + C \sumin \sum_{y\neq \yi} \max(1 + \tilde{\thetab}^\top\psi(\xi,y) - \tilde{\thetab}^\top\psi(\xi,\yi) , 0) \\ + &= \frac{1}{2} \|\thetav\|^2 + C \sumin \sum_{y\neq \yi} \max(1 + \tilde{\thetav}^\top\psi(\xi,y) - \tilde{\thetav}^\top\psi(\xi,\yi) , 0) \\ % - &= \frac{1}{2} \|\thetab\|^2 + C \sumin \max(1 + \tilde{\thetab}^\top\psi(\xi,-\yi) - \tilde{\thetab}^\top\psi(\xi,\yi) , 0) \tag{$y \neq \yi$ implies $y = - \yi$} \\ + &= \frac{1}{2} \|\thetav\|^2 + C \sumin \max(1 + \tilde{\thetav}^\top\psi(\xi,-\yi) - \tilde{\thetav}^\top\psi(\xi,\yi) , 0) \tag{$y \neq \yi$ implies $y = - \yi$} \\ % - &= \frac{1}{2} \|\thetab\|^2 + C \sumin \max(1 - \yi \tilde{\thetab}^\top \tilde{\phi}(\xi) , 0) \\ + &= \frac{1}{2} \|\thetav\|^2 + C \sumin \max(1 - \yi \tilde{\thetav}^\top \tilde{\phi}(\xi) , 0) \\ % - &= \frac{1}{2} \|\thetab\|^2 + C \sumin \max(1 - \yi (\thetab^\top \phi(\xi)+\theta_0) , 0) . + &= \frac{1}{2} \|\thetav\|^2 + C \sumin \max(1 - \yi (\thetav^\top \phi(\xi)+\theta_0) , 0) . % \end{align*} % - \item The representer theorem tells us that for the solution $\thetab^*$ (if it exists) of $\risket$ it holds that $\thetab^* \in \spn\{ (\psi(\xv^{(i)},y))_{i=1,\ldots,n,y=1,\ldots,g} \}.$ + \item The representer theorem tells us that for the solution $\thetav^*$ (if it exists) of $\risket$ it holds that $\thetav^* \in \spn\{ (\psi(\xv^{(i)},y))_{i=1,\ldots,n,y=1,\ldots,g} \}.$ % - This means that $\thetab$ has to be a linear combination of $(\psi(\xv^{(i)},y))_{i=1,\ldots,n,y=1,\ldots,g},$ so that we can write $\thetab = \Xmat^\top \betab$ for $\betab \in \R^{ng}$ and + This means that $\thetav$ has to be a linear combination of $(\psi(\xv^{(i)},y))_{i=1,\ldots,n,y=1,\ldots,g},$ so that we can write $\thetav = \Xmat^\top \betab$ for $\betab \in \R^{ng}$ and % $$ \Xmat = \begin{pmatrix} \psi(\xv^{(1)},1)^\top \\ @@ -80,13 +80,13 @@ % For $\bm{K} = \Xmat \Xmat^\top$ we obtain that % - $$ \|\thetab\|^2 = \thetab^\top \thetab = (\Xmat^\top \betab)^\top \Xmat^\top \betab = \betab^\top \Xmat \Xmat^\top \betab = \betab^\top \bm{K} \betab. $$ + $$ \|\thetav\|^2 = \thetav^\top \thetav = (\Xmat^\top \betab)^\top \Xmat^\top \betab = \betab^\top \Xmat \Xmat^\top \betab = \betab^\top \bm{K} \betab. 
$$ % Further, it holds that % \begin{align*} % - \thetab^\top\psi(\xi,y) - \thetab^\top\psi(\xi,\yi) + \thetav^\top\psi(\xi,y) - \thetav^\top\psi(\xi,\yi) % &= \betab^\top \Xmat \psi(\xi,y) - \betab^\top \Xmat \psi(\xi,\yi) \\ % @@ -107,7 +107,7 @@ % \risket % - &= \frac{1}{2} \|\thetab\|^2 + C \sumin \sum_{y\neq \yi} \max(1 + \thetab^\top\psi(\xi,y) - \thetab^\top\psi(\xi,\yi) , 0) \\ + &= \frac{1}{2} \|\thetav\|^2 + C \sumin \sum_{y\neq \yi} \max(1 + \thetav^\top\psi(\xi,y) - \thetav^\top\psi(\xi,\yi) , 0) \\ % &= \frac{1}{2} \betab^\top \bm{K} \betab + \sumin \sum_{y\neq \yi} \max\left(1 + (\bm{K}\betab)_{(i-1)g+y} - (\bm{K}\betab)_{(i-1)g+\yi} ) ~ , ~ 0\right). % diff --git a/slides/advriskmin/slides-advriskmin-classification-bernoulli.tex b/slides/advriskmin/slides-advriskmin-classification-bernoulli.tex index 4fa1aba3..44225a47 100644 --- a/slides/advriskmin/slides-advriskmin-classification-bernoulli.tex +++ b/slides/advriskmin/slides-advriskmin-classification-bernoulli.tex @@ -262,7 +262,7 @@ % for $\pix$ in the hypothesis space % \begin{eqnarray*} -% \Hspace = \left\{\pi: \Xspace \to [0, 1] ~|~\pix = s(\thetab^\top \xv)\right\} +% \Hspace = \left\{\pi: \Xspace \to [0, 1] ~|~\pix = s(\thetav^\top \xv)\right\} % \end{eqnarray*} % with $s(.)$ again being the logistic function. diff --git a/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex b/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex index 96872c71..7aa68d68 100644 --- a/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex +++ b/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex @@ -41,7 +41,7 @@ -\sum^n_{i=1} \yi\log\left(\pixit\right) + \left(1-\yi\log(1-\pixit\right) \end{eqnarray*} -with respect to $\thetab$ where the probabilistic classifier +with respect to $\thetav$ where the probabilistic classifier \begin{eqnarray*} \pixit & = & @@ -52,7 +52,7 @@ \vspace*{0.5cm} -NB: Note that $\frac{\partial}{\partial f} s(f) = s(f)(1-s(f))$ and $\frac{\partial \fxit}{\partial \thetab} = \left(\xi\right)^\top.$ +NB: Note that $\frac{\partial}{\partial f} s(f) = s(f)(1-s(f))$ and $\frac{\partial \fxit}{\partial \thetav} = \left(\xi\right)^\top.$ \end{vbframe} @@ -64,13 +64,13 @@ {\small \begin{align*} - \frac{\partial}{\partial\thetab}\riske & = - -\sumin \frac{\partial}{\partial\pixit }\yi\log(\pixit)\frac{\partial\pixit}{\partial \thetab} + \\ - & \qquad \frac{\partial}{\partial\pixit} (1-\yi)\log(1-\pixit)\frac{\partial\pixit}{\partial \thetab}\\ + \frac{\partial}{\partial\thetav}\riske & = + -\sumin \frac{\partial}{\partial\pixit }\yi\log(\pixit)\frac{\partial\pixit}{\partial \thetav} + \\ + & \qquad \frac{\partial}{\partial\pixit} (1-\yi)\log(1-\pixit)\frac{\partial\pixit}{\partial \thetav}\\ & = - -\sumin \frac{\yi}{\pixit}\frac{\partial\pixit}{\partial \thetab} - \frac{1-\yi}{1-\pixit}\frac{\partial\pixit}{\partial \thetab}\\ + -\sumin \frac{\yi}{\pixit}\frac{\partial\pixit}{\partial \thetav} - \frac{1-\yi}{1-\pixit}\frac{\partial\pixit}{\partial \thetav}\\ &= - -\sumin \left(\frac{\yi}{\pixit} - \frac{1-\yi}{1-\pixit}\right)\frac{\partial s(\fxit)}{\partial \fxit}\frac{\partial \fxit}{\partial\thetab}\\ + -\sumin \left(\frac{\yi}{\pixit} - \frac{1-\yi}{1-\pixit}\right)\frac{\partial s(\fxit)}{\partial \fxit}\frac{\partial \fxit}{\partial\thetav}\\ &= -\sum^n_{i=1} \left(\yi(1-\pixit) - (1-\yi)\pixit \right)\left(\xi\right)^\top.\\ \end{align*} @@ -82,7 +82,7 @@ \quad &=& \sumin \left(\pixit - \yi\right)\left(\xi\right)^\top\\ \quad &=& - \left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)^\top\mathbf{X}\\ 
+ \left(\pi(\mathbf{X}\vert\;\thetav) - \mathbf{y}\right)^\top\mathbf{X}\\ \end{align*} where $\mathbf{X} = \left( @@ -90,14 +90,14 @@ \xi[n]\right)^\top \in \R^{n\times d}, \mathbf{y} = \left( \yi[1], \dots, \yi[n] -\right)^\top,$ \\ $\pi(\mathbf{X}\vert\;\thetab) = \left( +\right)^\top,$ \\ $\pi(\mathbf{X}\vert\;\thetav) = \left( \pixit[1], \dots, \pixit[n] \right)^\top \in \R^{n}$. \vspace*{1cm} -$\implies$ The gradient $\nabla_{\thetab}\riske = \left(\frac{\partial}{\partial\thetab}\riske\right)^\top = \mathbf{X}^\top\left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)$ +$\implies$ The gradient $\nabla_{\thetav}\riske = \left(\frac{\partial}{\partial\thetav}\riske\right)^\top = \mathbf{X}^\top\left(\pi(\mathbf{X}\vert\;\thetav) - \mathbf{y}\right)$ \vspace*{1cm} @@ -112,8 +112,8 @@ {\small \begin{align*} - \nabla^2_{\thetab}\riske = \frac{\partial^2}{\partial{\thetab^\top}\partial\thetab}\riske & = - \frac{\partial}{\partial{\thetab^\top}} \sumin \left(\pixit - \yi\right)\left(\xi\right)^\top\\ + \nabla^2_{\thetav}\riske = \frac{\partial^2}{\partial{\thetav^\top}\partial\thetav}\riske & = + \frac{\partial}{\partial{\thetav^\top}} \sumin \left(\pixit - \yi\right)\left(\xi\right)^\top\\ & = \sum^n_{i=1}\xi \left(\pixit\left(1-\pixit\right)\right)\left(\xi\right)^\top\\ & = @@ -139,11 +139,11 @@ \vspace*{0.3cm} With this, we get for any $\mathbf{w} \in \mathbb{R}^d$ that -$$\mathbf{w}^\top \nabla^2_{\thetab}\riske \mathbf{w} = \mathbf{w}^\top \mathbf{X}^\top \bar{\mathbf{D}}^\top \bar{\mathbf{D}}\mathbf{X} \mathbf{w} = (\bar{\mathbf{D}}\mathbf{X} \mathbf{w})^\top\bar{\mathbf{D}}\mathbf{X} \mathbf{w} = \Vert \bar{\mathbf{D}}\mathbf{X} \mathbf{w} \Vert^2_2 \geq 0$$ +$$\mathbf{w}^\top \nabla^2_{\thetav}\riske \mathbf{w} = \mathbf{w}^\top \mathbf{X}^\top \bar{\mathbf{D}}^\top \bar{\mathbf{D}}\mathbf{X} \mathbf{w} = (\bar{\mathbf{D}}\mathbf{X} \mathbf{w})^\top\bar{\mathbf{D}}\mathbf{X} \mathbf{w} = \Vert \bar{\mathbf{D}}\mathbf{X} \mathbf{w} \Vert^2_2 \geq 0$$ since obviously $\mathbf{D} = \bar{\mathbf{D}}^\top \bar{\mathbf{D}}.$ \\ \vspace*{0.3cm} -$\Rightarrow \nabla^2_{\thetab}\riske$ is positive semi-definite $\Rightarrow \riske$ is convex. +$\Rightarrow \nabla^2_{\thetav}\riske$ is positive semi-definite $\Rightarrow \riske$ is convex. \end{vbframe} diff --git a/slides/advriskmin/slides-advriskmin-losses-properties.tex b/slides/advriskmin/slides-advriskmin-losses-properties.tex index 31d45e62..f2378554 100644 --- a/slides/advriskmin/slides-advriskmin-losses-properties.tex +++ b/slides/advriskmin/slides-advriskmin-losses-properties.tex @@ -228,8 +228,8 @@ % \begin{itemize} % \small % \item Problem: Lasso has a non-differentiable -% objective function $$\riskrt = \| \yv - \Xmat \thetab \|^2_2 -% + \lambda \| \thetab \|_1 ~ \in \mathcal{C}^0,$$ +% objective function $$\riskrt = \| \yv - \Xmat \thetav \|^2_2 +% + \lambda \| \thetav \|_1 ~ \in \mathcal{C}^0,$$ % but many optimization methods are derivative-based, e.g., % \begin{itemize} % \small @@ -260,7 +260,7 @@ % \tiny Example: $y = x_1 + 1.2 x_2 + \epsilon$. \textit{Left:} unpenalized % objective, \textit{middle:} $L1$ penalty, \textit{right:} penalized objective -% (all as functions of $\thetab$). We see how the $L1$ penalty nudges the optimum +% (all as functions of $\thetav$). We see how the $L1$ penalty nudges the optimum % towards (0, 0) and compromises the original objective's smoothness. 
\end{vbframe} @@ -274,10 +274,10 @@ \setlength\itemsep{1.2em} \item A function $\risket$ is convex if $$ - \riske\left(t \cdot \thetab + (1 - t) \cdot \tilde \thetab\right) \le t \cdot - \riske\left(\thetab\right) + (1 - t) \cdot \riske\left(\tilde \thetab \right) + \riske\left(t \cdot \thetav + (1 - t) \cdot \tilde \thetav\right) \le t \cdot + \riske\left(\thetav\right) + (1 - t) \cdot \riske\left(\tilde \thetav \right) $$ - $\forall$ $t \in [0, 1], ~\thetab, \tilde \thetab \in \Theta$\\ + $\forall$ $t \in [0, 1], ~\thetav, \tilde \thetav \in \Theta$\\ (strictly convex if the above holds with strict inequality). \item In optimization, convex problems have a number of convenient properties. E.g., all local minima are global. \vspace{0.2cm }\\ @@ -298,7 +298,7 @@ \item If we model our data using an exponential family distribution, we always get convex losses \begin{itemize} %\footnotesize - \item For $\fxt$ linear in $\thetab$, linear/logistic/softmax/poisson/$\ldots$ regression are convex problems (all GLMs)! + \item For $\fxt$ linear in $\thetav$, linear/logistic/softmax/poisson/$\ldots$ regression are convex problems (all GLMs)! \end{itemize} \end{itemize} @@ -354,12 +354,12 @@ \end{minipage}% \begin{itemize} - \item $f$ linear in $\thetab$, e.g., + \item $f$ linear in $\thetav$, e.g., \textbf{logistic regression} with $\fxt = \thx$ %\begin{itemize} %\small - \item Data perfectly separable by our learner, so we can find $\thetab$: - $$ \yi \fxit = \yi \thetab^T \xi > 0 ~~ \forall \xi$$ + \item Data perfectly separable by our learner, so we can find $\thetav$: + $$ \yi \fxit = \yi \thetav^T \xi > 0 ~~ \forall \xi$$ % \\ \vspace{0.1cm} \scriptsize %as every $\xi$ is correctly classified: $\fxit < 0$ for %$\yi = -1$, $> 0$ for $\yi = 1$ \small @@ -369,7 +369,7 @@ % \vspace{0.1cm} % \begin{itemize} % \small - % \item[$\Rightarrow$] $f \left( \xv ~|~ a \cdot \thetab \right) = + % \item[$\Rightarrow$] $f \left( \xv ~|~ a \cdot \thetav \right) = % a \cdot \fxt$ for $a > 1$ % \end{itemize} %\end{itemize} @@ -377,41 +377,41 @@ % \vfill % With optimization, e.g., gradient descent, we can always find a set of -% parameters $\thetab^\prime$ that fully separates the data. +% parameters $\thetav^\prime$ that fully separates the data. %\framebreak %\begin{itemize} %\small % \item In optimization, e.g., with gradient descent, we can always find a set - %of parameters $\thetab^\prime$ that classifies all samples perfectly. + %of parameters $\thetav^\prime$ that classifies all samples perfectly. 
%But taking a closer look at $\risket$, we find that the same can be - %achieved with $2 \cdot \thetab^\prime$ -- and at lower risk: + %achieved with $2 \cdot \thetav^\prime$ -- and at lower risk: -\item Can now a construct a strictly better $\thetab$ +\item Can now a construct a strictly better $\thetav$ -$$ \riske(2 \cdot \thetab) = \sumin - % \log \left( 1 + \exp \left( - |2 \thetab^T \xi| \right)\right) \\ - L \left( 2 \yi \thetab^T \xi +$$ \riske(2 \cdot \thetav) = \sumin + % \log \left( 1 + \exp \left( - |2 \thetav^T \xi| \right)\right) \\ + L \left( 2 \yi \thetav^T \xi \right) < \risket $$ -\item As ||$\thetab$|| increases, sum strictly decreases, as argument of L is strictly larger +\item As ||$\thetav$|| increases, sum strictly decreases, as argument of L is strictly larger % \begin{flalign* - %\riske(2 \cdot \thetab) &= \sumin - % \log \left( 1 + \exp \left( - |2 \thetab^T \xi| \right)\right) \\ - % L \left( \left \rvert f\left( \xi ~|~ 2 \cdot \thetab \right) \right \rvert + %\riske(2 \cdot \thetav) &= \sumin + % \log \left( 1 + \exp \left( - |2 \thetav^T \xi| \right)\right) \\ + % L \left( \left \rvert f\left( \xi ~|~ 2 \cdot \thetav \right) \right \rvert % \right) %5 = \sumin L \left(2 \cdot \left \rvert %\fxit \right \rvert \right) \\ %5 &< \sumin L \left( \left \rvert \fxit \right \rvert \right) = \risket % \end{flalign*} \item We can iterate that, so there is no local (or global) optimum, and no numerical procedure can converge -% \item This actually holds true for every $a \cdot \thetab$ with $a > 1$. +% \item This actually holds true for every $a \cdot \thetav$ with $a > 1$. % \begin{itemize} % \small -% \item[$\Rightarrow$] By increasing $\| \thetab \|$, our loss becomes smaller +% \item[$\Rightarrow$] By increasing $\| \thetav \|$, our loss becomes smaller % and smaller, and optimization runs infinitely. % \item[$\Rightarrow$] This is a consequence of the above assumptions and % sometimes encountered in logistic regression and linear support vector @@ -477,23 +477,23 @@ % \item In the case of complete separation, we have % \footnotesize % \begin{flalign*} -% \risket &= \sumin \log \left( 1 + \exp \left( - \yi \thetab^T \xi \right) +% \risket &= \sumin \log \left( 1 + \exp \left( - \yi \thetav^T \xi \right) % \right) \\ &= -% \sumin \log \left( 1 + \exp \left( - | \thetab^T \xi| \right) +% \sumin \log \left( 1 + \exp \left( - | \thetav^T \xi| \right) % \right), % \end{flalign*} % \small -% as every observation is correctly classified (i.e., $\thetab^T \xi < 0$ \\for -% $\yi = -1$ and $\thetab^T \xi > 0$ for $\yi = 1$). +% as every observation is correctly classified (i.e., $\thetav^T \xi < 0$ \\for +% $\yi = -1$ and $\thetav^T \xi > 0$ for $\yi = 1$). % \end{itemize} % % \framebreak % % \begin{itemize} % \small -% \item $\risket$ thus monotonically decreases in $\thetab$: if a parameter -% vector $\thetab^\prime$ is able to classify the samples perfectly, then -% $2\thetab^\prime$ also classifies the samples perfectly, and at lower risk. +% \item $\risket$ thus monotonically decreases in $\thetav$: if a parameter +% vector $\thetav^\prime$ is able to classify the samples perfectly, then +% $2\thetav^\prime$ also classifies the samples perfectly, and at lower risk. % \item Geometrically, this translates to an ever steeper slope of the % logistic/softmax function, leading to increasingly sharp discrimination and % infinitely running optimization. 
diff --git a/slides/advriskmin/slides-advriskmin-max-likelihood-l2.tex b/slides/advriskmin/slides-advriskmin-max-likelihood-l2.tex index 15ee779d..35e36f6b 100644 --- a/slides/advriskmin/slides-advriskmin-max-likelihood-l2.tex +++ b/slides/advriskmin/slides-advriskmin-max-likelihood-l2.tex @@ -33,7 +33,7 @@ Let's consider regression from a maximum likelihood perspective. Assume: $$ - y~|~ \xv \sim p(y~|~\xv, \thetab) + y~|~ \xv \sim p(y~|~\xv, \thetav) $$ \vspace{0.5cm} @@ -52,12 +52,12 @@ \includegraphics[width = 0.75\textwidth]{figure/ftrue.pdf} \end{minipage} -where $\ftrue$ has params $\thetab$ and $\eps$ a RV that follows some distribution $\P_\eps$, with $\E[\eps] = 0$. Also, assume $\epsilon \perp \!\!\! \perp \xv$. +where $\ftrue$ has params $\thetav$ and $\eps$ a RV that follows some distribution $\P_\eps$, with $\E[\eps] = 0$. Also, assume $\epsilon \perp \!\!\! \perp \xv$. \framebreak -From a statistics / maximum-likelihood perspective, we assume (or we pretend) we know the underlying distribution $p(y~|~\xv, \thetab)$. +From a statistics / maximum-likelihood perspective, we assume (or we pretend) we know the underlying distribution $p(y~|~\xv, \thetav)$. \begin{itemize} \item Then, given i.i.d data $ @@ -116,7 +116,7 @@ The likelihood is then \begin{eqnarray*} -\LL(\thetab) &=& \prodin \pdf\left(\yi ~\bigg|~ \fxit, \sigma^2\right) \\ &\propto& \prodin \exp\left(-\frac{1}{2\sigma^2} \left(\yi - \fxit\right)^2\right) +\LL(\thetav) &=& \prodin \pdf\left(\yi ~\bigg|~ \fxit, \sigma^2\right) \\ &\propto& \prodin \exp\left(-\frac{1}{2\sigma^2} \left(\yi - \fxit\right)^2\right) \end{eqnarray*} \framebreak @@ -124,7 +124,7 @@ Easy to see: minimizing neg. negative log-likelihood with Gaussian errors is the same as ERM with $L2$-loss: \begin{eqnarray*} -- \loglt &=& - \log\left(\LL(\thetab)\right) \\ +- \loglt &=& - \log\left(\LL(\thetav)\right) \\ &=& - \log\left(\prodin \exp\left(-\frac{1}{2\sigma^2} \left(\yi - \fxit\right)^2\right)\right) \\ &\propto& \sumin \left(\yi - \fxit\right)^2 \end{eqnarray*} @@ -148,10 +148,10 @@ \begin{vbframe}{Distributions and losses} \begin{itemize} -\item For every error distribution $\P_\eps$ we can derive an equivalent loss function, which leads to the same point estimator for the parameter vector $\thetab$ as maximum-likelihood. Formally, +\item For every error distribution $\P_\eps$ we can derive an equivalent loss function, which leads to the same point estimator for the parameter vector $\thetav$ as maximum-likelihood. Formally, \begin{itemize} - \item $\thetah \in \argmax_{\thetab} \LL(\thetab) \implies \thetah \in \argmin_{\thetab}-\log(\LL(\thetab))$ %\implies \thetah \in \argmin_{\thetab}-\log(\LL(\thetab))$ - %\item $\thetah \in \argmax_{\thetab} \log\left(\LL(\thetab)\right) \implies $ + \item $\thetah \in \argmax_{\thetav} \LL(\thetav) \implies \thetah \in \argmin_{\thetav}-\log(\LL(\thetav))$ %\implies \thetah \in \argmin_{\thetav}-\log(\LL(\thetav))$ + %\item $\thetah \in \argmax_{\thetav} \log\left(\LL(\thetav)\right) \implies $ \end{itemize} %\lz \item \textbf{But}: The other way around does not always work: We cannot derive a corresponding pdf or error distribution for every loss function -- the Hinge loss is one prominent example, for which some probabilistic interpretation is still possible however, see \citelink{SOLLICH1999NINTH}. @@ -159,7 +159,7 @@ \framebreak When does the reverse direction hold? 
-\item If we can write the loss as $L(y,\fx)=L(y-\fx)=L(r)$ for $r \in \mathbb{R}$, then minimizing $L(y-\fx)$ is equivalent to maximizing a conditional log-likelihood $\log(p(y-f(\xv|\thetab))$ if +\item If we can write the loss as $L(y,\fx)=L(y-\fx)=L(r)$ for $r \in \mathbb{R}$, then minimizing $L(y-\fx)$ is equivalent to maximizing a conditional log-likelihood $\log(p(y-f(\xv|\thetav))$ if \begin{itemize} \setlength{\itemsep}{1.2em} \item $\log(p(r))$ is affine trafo of $L$ (undoing the $\propto$): diff --git a/slides/advriskmin/slides-advriskmin-max-likelihood-other.tex b/slides/advriskmin/slides-advriskmin-max-likelihood-other.tex index 32c8132a..8c3f4818 100644 --- a/slides/advriskmin/slides-advriskmin-max-likelihood-other.tex +++ b/slides/advriskmin/slides-advriskmin-max-likelihood-other.tex @@ -52,7 +52,7 @@ The likelihood is then \begin{eqnarray*} -\LL(\thetab) &=& \prod_{i=1}^n \pdf\left(\yi ~\bigg|~ \fxit, \sigma\right) \\ &\propto& \exp\left(-\frac{1}{\sigma}\sumin \left|\yi - \fxit\right|\right)\,. +\LL(\thetav) &=& \prod_{i=1}^n \pdf\left(\yi ~\bigg|~ \fxit, \sigma\right) \\ &\propto& \exp\left(-\frac{1}{\sigma}\sumin \left|\yi - \fxit\right|\right)\,. \end{eqnarray*} The negative log-likelihood is $$ @@ -127,7 +127,7 @@ % \vspace*{-0.3cm} % \begin{eqnarray*} -% \Hspace = \left\{f: \Xspace \to \R ~|~\fx = \thetab^\top \xv\right\} +% \Hspace = \left\{f: \Xspace \to \R ~|~\fx = \thetav^\top \xv\right\} % \end{eqnarray*} % Scores are afterwards transformed into probabilities by the logistic function $\pix = \left(1 + \exp(- \fx)\right)^{-1}$ @@ -256,20 +256,20 @@ % by \emph{squared error} as our \emph{loss function} in \emph{risk minimization}: % $$ -% \riske(\thetab) = SSE(\thetab) = \sumin \Lxyit = \sumin \left(\yi - \theta^T \xi\right)^2 +% \riske(\thetav) = SSE(\thetav) = \sumin \Lxyit = \sumin \left(\yi - \theta^T \xi\right)^2 % $$ % NB: We assume here and from now on that $\theta_0$ is included in $\theta$. % Using matrix notation the empirical risk can be written as % $$ -% SSE(\thetab) = (\ydat - \Xmat\thetab)^T(\ydat - \Xmat\thetab). +% SSE(\thetav) = (\ydat - \Xmat\thetav)^T(\ydat - \Xmat\thetav). % $$ % Differentiating w.r.t $\theta$ yields the so-called \emph{normal equations}: % $$ -% \Xmat^T(\ydat - \Xmat\thetab) = 0 +% \Xmat^T(\ydat - \Xmat\thetav) = 0 % $$ % The optimal $\theta$ is % $$ diff --git a/slides/advriskmin/slides-advriskmin-pseudo-residuals.tex b/slides/advriskmin/slides-advriskmin-pseudo-residuals.tex index c7747d35..4c0c2b44 100644 --- a/slides/advriskmin/slides-advriskmin-pseudo-residuals.tex +++ b/slides/advriskmin/slides-advriskmin-pseudo-residuals.tex @@ -102,7 +102,7 @@ \begin{itemize} \item In GD, we move in the direction of the negative gradient by updating the parameters: $$ - \thetab^{[t + 1]} = \thetab^{[t]} - \alpha^{[t]} \cdot \nabla_{\thetab} \left.\risket\right|_{\thetab = \thetab^{[t]}} + \thetav^{[t + 1]} = \thetav^{[t]} - \alpha^{[t]} \cdot \nabla_{\thetav} \left.\risket\right|_{\thetav = \thetav^{[t]}} $$ % with step size $\alpha^{[t]}$. \item This can be seen as approximating the unexplained information (measured by the loss) through a model update. 
@@ -110,10 +110,10 @@ \item Using the chain rule: % mwe see that the pseudo-residuals are input to the update direction \begin{eqnarray*} - \nabla_{\thetab} \risket &=&\sumin \left.\frac{\partial L\left(\yi, f\right)}{\partial f} \right|_{f = \fxit} + \nabla_{\thetav} \risket &=&\sumin \left.\frac{\partial L\left(\yi, f\right)}{\partial f} \right|_{f = \fxit} % _{= - \tilde r^{(i)}} - \cdot \nabla_{\thetab} \fxit \\ - &=& - \sumin \tilde r^{(i)} \cdot \nabla_{\thetab} \fxit. + \cdot \nabla_{\thetav} \fxit \\ + &=& - \sumin \tilde r^{(i)} \cdot \nabla_{\thetav} \fxit. \end{eqnarray*} \item Hence the update is determined by a loss-optimal directional change of the model output and a loss-independent derivate of $f$. @@ -121,11 +121,11 @@ % The unexplained information -- the negative gradient -- can be thought of as residuals, which is therefore also called pseudo-residuals. \end{itemize} -% For risk minimization, the update rule for the parameter $\thetab$ is +% For risk minimization, the update rule for the parameter $\thetav$ is % \begin{footnotesize} % \begin{eqnarray*} -% \thetab^{[t+1]} &\leftarrow & \thetab^{[t]} - \alpha^{[t]} \sumin \nabla_{\thetab} \left. \Lxyit \right|_{\thetab = \thetab^{[t]}} \\ -% \thetab^{[t+1]} &\leftarrow & \thetab^{[t]} + \alpha^{[t]} \sumin \tilde r^{(i)} \cdot \left. \nabla_{\thetab} \fxit \right|_{\thetab = \thetab^{[t]}} +% \thetav^{[t+1]} &\leftarrow & \thetav^{[t]} - \alpha^{[t]} \sumin \nabla_{\thetav} \left. \Lxyit \right|_{\thetav = \thetav^{[t]}} \\ +% \thetav^{[t+1]} &\leftarrow & \thetav^{[t]} + \alpha^{[t]} \sumin \tilde r^{(i)} \cdot \left. \nabla_{\thetav} \fxit \right|_{\thetav = \thetav^{[t]}} % \end{eqnarray*} % \end{footnotesize} % $\alpha^{[t]} \in [0,1]$ is called \enquote{learning rate} in this context. diff --git a/slides/advriskmin/slides-advriskmin-regression-further-losses.tex b/slides/advriskmin/slides-advriskmin-regression-further-losses.tex index c75673ea..d6c2f3bf 100644 --- a/slides/advriskmin/slides-advriskmin-regression-further-losses.tex +++ b/slides/advriskmin/slides-advriskmin-regression-further-losses.tex @@ -140,7 +140,7 @@ \begin{figure} \centering \scalebox{0.99}{\includegraphics{figure_man/logcosh-derivation.png}} - %\caption{\footnotesize lasso vs non-convex SCAD and MCP penalties for scalar parameter $\thetab$} + %\caption{\footnotesize lasso vs non-convex SCAD and MCP penalties for scalar parameter $\thetav$} \end{figure} \end{column} @@ -179,7 +179,7 @@ \begin{figure} \centering \scalebox{1}{\includegraphics{figure_man/cosh-gaussian-densities.png}} - %\caption{\footnotesize lasso vs non-convex SCAD and MCP penalties for scalar parameter $\thetab$} + %\caption{\footnotesize lasso vs non-convex SCAD and MCP penalties for scalar parameter $\thetav$} \end{figure} \end{column} @@ -282,13 +282,13 @@ %% \begin{vbframe}{$\epsilon$-insensitive Loss: Optimal Constant} %% % Derive Constant model and eps-insens loss -%What is the optimal constant model $f = \thetab$ w.r.t. the $\epsilon$-insensitive loss $\Lyf = |y - f| ~ \mathds{1}_{ \left\{|y - f| > \epsilon \right\}}$? +%What is the optimal constant model $f = \thetav$ w.r.t. the $\epsilon$-insensitive loss $\Lyf = |y - f| ~ \mathds{1}_{ \left\{|y - f| > \epsilon \right\}}$? 
%\vspace{-0.2cm} %\begin{eqnarray*} -%\hat \thetab&=& \argmin_{\thetab\in \R}\sumin \Lyfi \\ -%&=& \argmin_{\thetab\in \R} \sum_{i \in I_\eps} \left| \yi - \thetab \right| - \eps \\ -%&=& \argmin_{\thetab\in \R} \sum_{i \in I_\eps} \left| \yi - \thetab \right| - \sum_{i \in I_\eps} \eps \\ +%\hat \thetav&=& \argmin_{\thetav\in \R}\sumin \Lyfi \\ +%&=& \argmin_{\thetav\in \R} \sum_{i \in I_\eps} \left| \yi - \thetav \right| - \eps \\ +%&=& \argmin_{\thetav\in \R} \sum_{i \in I_\eps} \left| \yi - \thetav \right| - \sum_{i \in I_\eps} \eps \\ %&=& \text{median}\left(\left\{\yi ~|~ i \in I_\eps\right\}\right) - |I_\eps| \cdot \eps \\ %\end{eqnarray*} @@ -338,12 +338,12 @@ % \framebreak % BB: this proof cannot be true, it simply states what it claimed??? -% What is the optimal constant model $f = \thetab$ w.r.t.\ the quantile loss? +% What is the optimal constant model $f = \thetav$ w.r.t.\ the quantile loss? % \vspace{-0.2cm} % \begin{eqnarray*} -% \thetah&=& \argmin_{\thetab\in \R}\sumin \Lxyi \\ +% \thetah&=& \argmin_{\thetav\in \R}\sumin \Lxyi \\ % \Leftrightarrow\quad -% \thetah &=& \argmin_{\thetab \in \R}\left\{ (1 - \alpha) \sum_{\yi<\thetab} \left|\yi-\thetab\right| + \alpha \sum_{\yi \geq\thetab} \left|\yi-\thetab\right|\right\} \\ +% \thetah &=& \argmin_{\thetav \in \R}\left\{ (1 - \alpha) \sum_{\yi<\thetav} \left|\yi-\thetav\right| + \alpha \sum_{\yi \geq\thetav} \left|\yi-\thetav\right|\right\} \\ % \Leftrightarrow\quad \thetah &=& Q_\alpha(\{\yi\}) % \end{eqnarray*} diff --git a/slides/advriskmin/slides-advriskmin-risk-minimizer.tex b/slides/advriskmin/slides-advriskmin-risk-minimizer.tex index a37ed2b5..e96624a9 100644 --- a/slides/advriskmin/slides-advriskmin-risk-minimizer.tex +++ b/slides/advriskmin/slides-advriskmin-risk-minimizer.tex @@ -59,10 +59,10 @@ \begin{vbframe}{Two short examples} \textbf{Regression with linear model:}\\ \begin{itemize} - \item Model: $f(\xv) = \thetab^\top \xv + \theta_0$ + \item Model: $f(\xv) = \thetav^\top \xv + \theta_0$ \item Squared loss: $\Lyf = \left(y-f\right)^2$ - \item Hypothesis space: $$\Hspace_{\text{lin}} = \left\{ \xv \mapsto \thetab^\top \xv + \theta_0 : \thetab \in \mathbb{R}^d, \theta_0 \in \mathbb{R} \right\}$$ + \item Hypothesis space: $$\Hspace_{\text{lin}} = \left\{ \xv \mapsto \thetav^\top \xv + \theta_0 : \thetav \in \mathbb{R}^d, \theta_0 \in \mathbb{R} \right\}$$ \end{itemize} \vspace{0.3cm} diff --git a/slides/boosting/algorithms/componentwise_gradient_boosting.tex b/slides/boosting/algorithms/componentwise_gradient_boosting.tex index 95911461..bdfa224f 100644 --- a/slides/boosting/algorithms/componentwise_gradient_boosting.tex +++ b/slides/boosting/algorithms/componentwise_gradient_boosting.tex @@ -10,8 +10,8 @@ \color{algocol} \For {$j= 1\to J$} \State Fit regression base learner $b_j \in \mathcal{B}_j$ to the vector of pseudo-residuals $\rmm$: - \State $\thetamh_j = \argmin_{\thetab \in \bm{\Theta_j}} \sum \limits_{i=1}^n - (\rmi - b_j(\xi, \thetab))^2$ + \State $\thetamh_j = \argmin_{\thetav \in \bm{\Theta_j}} \sum \limits_{i=1}^n + (\rmi - b_j(\xi, \thetav))^2$ \EndFor \State $j^{[m]} = \argmin_{j} \sum \limits_{i=1}^n (\rmi - \hat{b}_j(\xi, \thetamh_j))^2$ \color{lightgray} diff --git a/slides/boosting/cheatsheet_new/cheatsheet.tex b/slides/boosting/cheatsheet_new/cheatsheet.tex index 0beededb..fd62a47b 100644 --- a/slides/boosting/cheatsheet_new/cheatsheet.tex +++ b/slides/boosting/cheatsheet_new/cheatsheet.tex @@ -285,12 +285,12 @@ % \end{codebox} % \hspace*{1ex} % \begin{codebox} -% $\min\limits_{\thetab \in 
\Theta} \risket$ : Empirical Risk Minimization (ERM) Problem +% $\min\limits_{\thetav \in \Theta} \risket$ : Empirical Risk Minimization (ERM) Problem % \end{codebox} % \hspace*{1ex} \textbf{Note:} We often consider the ERM problem in ML as an example of an optimization problem. In this case, we switch from the general optimization notation to the ML notation: % \begin{itemize} % \item We optimize the function $\risket$ (instead of $f$); $f$ instead denotes the ML model. -% \item We optimize over $\thetab \in \Theta$ (instead over $\xv \in \mathcal{S}$). +% \item We optimize over $\thetav \in \Theta$ (instead over $\xv \in \mathcal{S}$). % \end{itemize} % All further notation changes accordingly. \\ % \hspace*{1ex} diff --git a/slides/boosting/slides-boosting-cwb-basics.tex b/slides/boosting/slides-boosting-cwb-basics.tex index b26f08fa..1f6d6481 100644 --- a/slides/boosting/slides-boosting-cwb-basics.tex +++ b/slides/boosting/slides-boosting-cwb-basics.tex @@ -142,8 +142,8 @@ Two BLs of the same type can simply be added by adding up their parameter vectors: $$ - b_j(\xv, \thetab^{[1]}) + b_j(\xv, \thetab^{[2]}) = - b_j(\xv, \thetab^{[1]} + \thetab^{[2]}). + b_j(\xv, \thetav^{[1]}) + b_j(\xv, \thetav^{[2]}) = + b_j(\xv, \thetav^{[1]} + \thetav^{[2]}). $$ %% Figure: rsrc/fig-compboost-add.R \begin{center} @@ -167,7 +167,7 @@ \end{center} \vspace*{0.1cm} -Thus, if $\{ b_j(\xv, \thetab^{[1]}), b_j(\xv, \thetab^{[2]}) \} \in \mathcal{B}_j$, then $b_j(\xv, \thetab^{[1]} + \thetab^{[2]}) \in \mathcal{B}_j$. +Thus, if $\{ b_j(\xv, \thetav^{[1]}), b_j(\xv, \thetav^{[2]}) \} \in \mathcal{B}_j$, then $b_j(\xv, \thetav^{[1]} + \thetav^{[2]}) \in \mathcal{B}_j$. \end{vbframe} diff --git a/slides/boosting/slides-boosting-cwb-basics2.tex b/slides/boosting/slides-boosting-cwb-basics2.tex index 6736260d..5083bd12 100644 --- a/slides/boosting/slides-boosting-cwb-basics2.tex +++ b/slides/boosting/slides-boosting-cwb-basics2.tex @@ -33,8 +33,8 @@ \begin{itemize} \item One base learner to simultaneously estimate all categories: - $$b_j(x_j | \thetab_j) = \sum_{g=1}^G \theta_{j,g}\mathds{1}_{\{g = x_j\}} = (\mathds{1}_{\{x_j = 1\}}, ..., \mathds{1}_{\{x_j = G\}}) \thetab_j$$ - Hence, $b_j$ incorporates a one-hot encoded feature with group means $\thetab\in\R^G$ as estimators. + $$b_j(x_j | \thetav_j) = \sum_{g=1}^G \theta_{j,g}\mathds{1}_{\{g = x_j\}} = (\mathds{1}_{\{x_j = 1\}}, ..., \mathds{1}_{\{x_j = G\}}) \thetav_j$$ + Hence, $b_j$ incorporates a one-hot encoded feature with group means $\thetav\in\R^G$ as estimators. 
\item One binary base learner per category: @@ -52,9 +52,9 @@ Much faster estimation compared to using individual binary BLs \item - Explicit solution of $\thetabh = \argmin_{\thetab\in\R^G}\sumin (\yi - b_j(x^{(i)}_j | \thetab))^2$: + Explicit solution of $\thetavh = \argmin_{\thetav\in\R^G}\sumin (\yi - b_j(x^{(i)}_j | \thetav))^2$: $$ - %\thetabh = (\thetah_1, \dots, \thetah_G)^T,\ + %\thetavh = (\thetah_1, \dots, \thetah_G)^T,\ \thetah_g = n_g^{-1}\sumin \yi \mathds{1}_{\{x^{(i)}_j = g\}} $$ diff --git a/slides/boosting/slides-boosting-gradient-boosting-concept.tex b/slides/boosting/slides-boosting-gradient-boosting-concept.tex index 8a2a8c52..1a988498 100644 --- a/slides/boosting/slides-boosting-gradient-boosting-concept.tex +++ b/slides/boosting/slides-boosting-gradient-boosting-concept.tex @@ -125,7 +125,7 @@ $$ \riskef = \sum_{i=1}^n L\left(\yi,\fxi \right) = -\sum_{i=1}^n L\left(\yi, \sum_{m=1}^M \alpha^{[m]} b(\xi, \thetab^{[m]}) %\blxt +\sum_{i=1}^n L\left(\yi, \sum_{m=1}^M \alpha^{[m]} b(\xi, \thetav^{[m]}) %\blxt \right) $$ @@ -427,7 +427,7 @@ %% %% The pseudo-residuals are calculated exactly as stated above, %% then we fit a regression model $b(\bm{x}, \thetam)$ to them: -%% $$ \thetamh = \argmin_{\thetab} \sum_{i=1}^n (\rmi - b(\xi, \thetab))^2 $$ +%% $$ \thetamh = \argmin_{\thetav} \sum_{i=1}^n (\rmi - b(\xi, \thetav))^2 $$ %% So, evaluated on the training data, %% our $b(x, \thetam)$ corresponds as closely as possible to the negative %% loss function gradient and generalizes to the whole space. diff --git a/slides/boosting/slides-boosting-regression-illustrations.tex b/slides/boosting/slides-boosting-regression-illustrations.tex index ee3c5ba3..643ddfcc 100644 --- a/slides/boosting/slides-boosting-regression-illustrations.tex +++ b/slides/boosting/slides-boosting-regression-illustrations.tex @@ -340,29 +340,29 @@ % % \item Equivalent to fitting LM via GD. % \item Here, a grad step in param space equals a grad step in function space. 
% \item GD update for LMs: -% $\thetab^{[m+1]} \leftarrow \thetam - \alpha \cdot \nabla_{\thetam} +% $\thetav^{[m+1]} \leftarrow \thetam - \alpha \cdot \nabla_{\thetam} % \riske(\thetam) = % \thetam + \alpha (-\Xmat^T \yv + \Xmat^T\Xmat \thetam)$ \\ % \item Now let's fit LM against PRs in GB\\ % NB: adding a linear BL to an LM simply sums params % \begin{eqnarray*} % \footnotesize -% \frac{\partial}{\partial \thetab^{[m+1]}} -% \left \| (\yv - \Xmat \thetam) - \Xmat \thetab^{[m+1]} \right \|^2_2 +% \frac{\partial}{\partial \thetav^{[m+1]}} +% \left \| (\yv - \Xmat \thetam) - \Xmat \thetav^{[m+1]} \right \|^2_2 % &=& 0 \\ % -2\Xmat^T (\yv - \Xmat \thetam) + 2\Xmat^T\Xmat -% \thetab^{[m+1]} &=& 0 \\ -% \thetab^{[m+1]} &=& (\Xmat^T\Xmat)^{-1} \Xmat^T +% \thetav^{[m+1]} &=& 0 \\ +% \thetav^{[m+1]} &=& (\Xmat^T\Xmat)^{-1} \Xmat^T % (\yv - \Xmat \thetam) \\ -% \thetab^{[m+1]} &=& (\Xmat^T\Xmat)^{-1} \Xmat^T \yv +% \thetav^{[m+1]} &=& (\Xmat^T\Xmat)^{-1} \Xmat^T \yv % - (\Xmat^T\Xmat)^{-1} \Xmat^T \Xmat \thetam \\ -% \thetab^{[m+1]} &=& (\Xmat^T\Xmat)^{-1} \Xmat^T \yv +% \thetav^{[m+1]} &=& (\Xmat^T\Xmat)^{-1} \Xmat^T \yv % - \thetam % \quad \quad \textcolor{gray}{\rvert \cdot (-\Xmat^T\Xmat)} % \\ -% \thetab^{[m+1]} &=& - \Xmat^T \yv + \Xmat^T\Xmat +% \thetav^{[m+1]} &=& - \Xmat^T \yv + \Xmat^T\Xmat % \thetam % \end{eqnarray*} -% $\Rightarrow \fh^{[m+1]} = \Xmat \tilde \thetab^{[m+1]} = +% $\Rightarrow \fh^{[m+1]} = \Xmat \tilde \thetav^{[m+1]} = % \Xmat \left( \thetam + \alpha (-\Xmat^T \yv + % \Xmat^T\Xmat \thetam) \right)$ % \end{itemize} diff --git a/slides/boosting/tex/cwb-algo-short.tex b/slides/boosting/tex/cwb-algo-short.tex index aa2ea090..e065bd4e 100644 --- a/slides/boosting/tex/cwb-algo-short.tex +++ b/slides/boosting/tex/cwb-algo-short.tex @@ -7,8 +7,8 @@ \For {$j= 1\to J$} \State Fit regression base learner $b_j \in \mathcal{B}_j$ to the vector of pseudo-residuals $\rmm$: - \State $\thetamh_j = \argmin_{\thetab \in \bm{\Theta_j}} \sum \limits_{i=1}^n - (\rmi - b_j(\xi, \thetab))^2$ + \State $\thetamh_j = \argmin_{\thetav \in \bm{\Theta_j}} \sum \limits_{i=1}^n + (\rmi - b_j(\xi, \thetav))^2$ \EndFor \State $j^{[m]} = \argmin_{j} \sum \limits_{i=1}^n (\rmi - \hat{b}_j(\xi, \thetamh_j))^2$ diff --git a/slides/feature-selection/slides-fs-introduction.tex b/slides/feature-selection/slides-fs-introduction.tex index 20ceffc1..77a6b50c 100644 --- a/slides/feature-selection/slides-fs-introduction.tex +++ b/slides/feature-selection/slides-fs-introduction.tex @@ -187,7 +187,7 @@ \textbf{Example: embedded method (Lasso)} regularizing model params with $L1$ penalty %in the empirical risk enables ``automatic" feature selection: \vspace{-0.28cm} - $ \riskrt = \risket + \lambda \|\thetab\|_1 = \sumin \left(\yi - \thetab^\top \xi \right)^2 +\lambda \sum_{j=1}^p |\theta_j| $ + $ \riskrt = \risket + \lambda \|\thetav\|_1 = \sumin \left(\yi - \thetav^\top \xi \right)^2 +\lambda \sum_{j=1}^p |\theta_j| $ %are very popular for high-dimensional data. %\item The penalty shrinks the coefficients towards 0 in the final model. %\item Many (improved) variants: group LASSO, adaptive LASSO, ElasticNet, ... 
diff --git a/slides/gaussian-processes/slides-gp-basic.tex b/slides/gaussian-processes/slides-gp-basic.tex index 5d0bf339..d21b84e2 100644 --- a/slides/gaussian-processes/slides-gp-basic.tex +++ b/slides/gaussian-processes/slides-gp-basic.tex @@ -53,11 +53,11 @@ \begin{itemize} \item Until now we considered a hypothesis space $\Hspace$ of parameterized functions $\fxt$ (in particular, the space of linear functions). - \item Using Bayesian inference, we derived distributions for $\thetab$ after having observed data $\D$. - \item Prior believes about the parameter are expressed via a prior distribution $q(\thetab)$, which is updated according to Bayes' rule + \item Using Bayesian inference, we derived distributions for $\thetav$ after having observed data $\D$. + \item Prior believes about the parameter are expressed via a prior distribution $q(\thetav)$, which is updated according to Bayes' rule $$ - \underbrace{p(\thetab | \Xmat, \yv)}_{\text{posterior}} = \frac{\overbrace{p(\yv | \Xmat, \thetab)}^{\text{likelihood}}\overbrace{q(\thetab)}^{\text{prior}}}{\underbrace{p(\yv|\Xmat)}_{\text{marginal}}}. + \underbrace{p(\thetav | \Xmat, \yv)}_{\text{posterior}} = \frac{\overbrace{p(\yv | \Xmat, \thetav)}^{\text{likelihood}}\overbrace{q(\thetav)}^{\text{prior}}}{\underbrace{p(\yv|\Xmat)}_{\text{marginal}}}. $$ \end{itemize} @@ -69,7 +69,7 @@ Let us change our point of view: \begin{itemize} - \item Instead of \enquote{searching} for a parameter $\thetab$ in the parameter space, we directly search in a space of \enquote{allowed} functions $\Hspace$. + \item Instead of \enquote{searching} for a parameter $\thetav$ in the parameter space, we directly search in a space of \enquote{allowed} functions $\Hspace$. \item We still use Bayesian inference, but instead specifying a prior distribution over a parameter, we specify a prior distribution \textbf{over functions} and update it according to the data points we have observed. \end{itemize} @@ -119,8 +119,8 @@ \begin{tabular}{cc} \textbf{Weight-Space View} & \textbf{Function-Space View} \vspace{4mm}\\ Parameterize functions & \vspace{1mm}\\ - \footnotesize Example: $\fxt = \thetab^\top \xv$ & \vspace{3mm}\\ - Define distributions on $\thetab$ & Define distributions on $f$ \vspace{4mm}\\ + \footnotesize Example: $\fxt = \thetav^\top \xv$ & \vspace{3mm}\\ + Define distributions on $\thetav$ & Define distributions on $f$ \vspace{4mm}\\ Inference in parameter space $\Theta$ & Inference in function space $\Hspace$ \end{tabular} \end{table} diff --git a/slides/gaussian-processes/slides-gp-bayes-lm.tex b/slides/gaussian-processes/slides-gp-bayes-lm.tex index 757651af..5ce3753d 100644 --- a/slides/gaussian-processes/slides-gp-bayes-lm.tex +++ b/slides/gaussian-processes/slides-gp-bayes-lm.tex @@ -60,40 +60,40 @@ The linear regression model is defined as $$ -y = \fx + \epsilon = \thetab^T \xv + \epsilon +y = \fx + \epsilon = \thetav^T \xv + \epsilon $$ or on the data: \begin{eqnarray*} -\yi &=& \fxi + \epsi = \thetab^T \xi + \epsi, \quad \text{for } i \in \{1, \ldots, n\} +\yi &=& \fxi + \epsi = \thetav^T \xi + \epsi, \quad \text{for } i \in \{1, \ldots, n\} \end{eqnarray*} -We now assume (from a Bayesian perspective) that also our parameter vector $\thetab$ is stochastic and follows a distribution. +We now assume (from a Bayesian perspective) that also our parameter vector $\thetav$ is stochastic and follows a distribution. The observed values $\yi$ differ from the function values $\fxi$ by some additive noise, which is assumed to be i.i.d. 
Gaussian $$ \epsi \sim \mathcal{N}(0, \sigma^2)$$ -and independent of $\xv$ and $\thetab$. +and independent of $\xv$ and $\thetav$. \framebreak -Let us assume we have \textbf{prior beliefs} about the parameter $\thetab$ that are represented in a prior distribution $\thetab \sim \mathcal{N}(\zero, \tau^2 \id_p).$ +Let us assume we have \textbf{prior beliefs} about the parameter $\thetav$ that are represented in a prior distribution $\thetav \sim \mathcal{N}(\zero, \tau^2 \id_p).$ \lz Whenever data points are observed, we update the parameters' prior distribution according to Bayes' rule $$ -\underbrace{p(\thetab | \Xmat, \yv)}_{\text{posterior}} = \frac{\overbrace{p(\yv | \Xmat, \thetab)}^{\text{likelihood}}\overbrace{q(\thetab)}^{\text{prior}}}{\underbrace{p(\yv|\Xmat)}_{\text{marginal}}}. +\underbrace{p(\thetav | \Xmat, \yv)}_{\text{posterior}} = \frac{\overbrace{p(\yv | \Xmat, \thetav)}^{\text{likelihood}}\overbrace{q(\thetav)}^{\text{prior}}}{\underbrace{p(\yv|\Xmat)}_{\text{marginal}}}. $$ \framebreak -The posterior distribution of the parameter $\thetab$ is again normal distributed (the Gaussian family is self-conjugate): +The posterior distribution of the parameter $\thetav$ is again normal distributed (the Gaussian family is self-conjugate): $$ -\thetab ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1}) +\thetav ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1}) $$ with $\bm{A}:= \sigma^{-2}\Xmat^\top\Xmat + \frac{1}{\tau^2} \id_p$. @@ -101,7 +101,7 @@ \lz \begin{footnotesize} -\textbf{Note:} If the posterior distribution $p(\thetab~|~\Xmat, \yv)$ are in the same probability distribution family as the prior $q(\thetab)$ w.r.t. a specific likelihood function $p(\yv~|~\Xmat, \thetab)$, they are called \textbf{conjugate distributions}. The prior is then called a \textbf{conjugate prior} for the likelihood. The Gaussian family is self-conjugate: Choosing a Gaussian prior for a Gaussian Likelihood ensures that the posterior is Gaussian. +\textbf{Note:} If the posterior distribution $p(\thetav~|~\Xmat, \yv)$ are in the same probability distribution family as the prior $q(\thetav)$ w.r.t. a specific likelihood function $p(\yv~|~\Xmat, \thetav)$, they are called \textbf{conjugate distributions}. The prior is then called a \textbf{conjugate prior} for the likelihood. The Gaussian family is self-conjugate: Choosing a Gaussian prior for a Gaussian Likelihood ensures that the posterior is Gaussian. \end{footnotesize} \framebreak @@ -124,17 +124,17 @@ \textbf{Proof:}\\ We want to show that \begin{itemize} - \item for a Gaussian prior on $\thetab \sim \mathcal{N}(\zero, \tau^2 \id_p)$ - \item for a Gaussian Likelihood $y ~|~ \Xmat, \thetab \sim \mathcal{N}(\Xmat^\top \thetab, \sigma^2 \id_n)$ + \item for a Gaussian prior on $\thetav \sim \mathcal{N}(\zero, \tau^2 \id_p)$ + \item for a Gaussian Likelihood $y ~|~ \Xmat, \thetav \sim \mathcal{N}(\Xmat^\top \thetav, \sigma^2 \id_n)$ \end{itemize} the resulting posterior is Gaussian $\mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1})$ with $\bm{A}:= \sigma^{-2}\Xmat^\top\Xmat + \frac{1}{\tau^2} \id_p$. 
Plugging in Bayes' rule and multiplying out yields \begin{eqnarray*} -p(\thetab | \Xmat, \yv) &\propto& p(\yv | \Xmat, \thetab) q(\thetab) \propto \exp\biggl[-\frac{1}{2\sigma^2}(\yv - \Xmat\thetab)^\top(\yv - \Xmat\thetab)-\frac{1}{2\tau^2}\thetab^\top\thetab\biggr] \\ -&=& \exp\biggl[-\frac{1}{2}\biggl(\underbrace{\sigma^{-2}\yv^\top\yv}_{\text{doesn't depend on } \thetab} - 2 \sigma^{-2} \yv^\top \Xmat \thetab + \sigma^{-2}\thetab^\top \Xmat^\top \Xmat \thetab + \tau^{-2} \thetab^\top\thetab \biggr)\biggr] \\ -&\propto& \exp\biggl[-\frac{1}{2}\biggl(\sigma^{-2}\thetab^\top \Xmat^\top \Xmat \thetab + \tau^{-2} \thetab^\top\thetab - 2 \sigma^{-2} \yv^\top \Xmat \thetab \biggr)\biggr] \\ -&=& \exp\biggl[-\frac{1}{2}\thetab^\top\underbrace{\biggl(\sigma^{-2} \Xmat^\top \Xmat + \tau^{-2} \id_p \biggr)}_{:= \Amat} \thetab + \textcolor{red}{\sigma^{-2} \yv^\top \Xmat \thetab}\biggr] +p(\thetav | \Xmat, \yv) &\propto& p(\yv | \Xmat, \thetav) q(\thetav) \propto \exp\biggl[-\frac{1}{2\sigma^2}(\yv - \Xmat\thetav)^\top(\yv - \Xmat\thetav)-\frac{1}{2\tau^2}\thetav^\top\thetav\biggr] \\ +&=& \exp\biggl[-\frac{1}{2}\biggl(\underbrace{\sigma^{-2}\yv^\top\yv}_{\text{doesn't depend on } \thetav} - 2 \sigma^{-2} \yv^\top \Xmat \thetav + \sigma^{-2}\thetav^\top \Xmat^\top \Xmat \thetav + \tau^{-2} \thetav^\top\thetav \biggr)\biggr] \\ +&\propto& \exp\biggl[-\frac{1}{2}\biggl(\sigma^{-2}\thetav^\top \Xmat^\top \Xmat \thetav + \tau^{-2} \thetav^\top\thetav - 2 \sigma^{-2} \yv^\top \Xmat \thetav \biggr)\biggr] \\ +&=& \exp\biggl[-\frac{1}{2}\thetav^\top\underbrace{\biggl(\sigma^{-2} \Xmat^\top \Xmat + \tau^{-2} \id_p \biggr)}_{:= \Amat} \thetav + \textcolor{red}{\sigma^{-2} \yv^\top \Xmat \thetav}\biggr] \end{eqnarray*} This expression resembles a normal density - except for the term in red! @@ -147,11 +147,11 @@ We subtract a (not yet defined) constant $c$ while compensating for this change by adding the respective terms (\enquote{adding $0$}), emphasized in green: \begin{eqnarray*} - p(\thetab | \Xmat, \yv) &\propto& \exp\biggl[-\frac{1}{2}(\thetab \textcolor{green}{- c})^\top\Amat (\thetab \textcolor{green}{- c}) \textcolor{green}{- c^\top \Amat \thetab} + \underbrace{\textcolor{green}{\frac{1}{2}c^\top\Amat c}}_{\text{doesn't depend on } \thetab} +\sigma^{-2} \yv^\top \Xmat \thetab\biggr] \\ - &\propto& \exp\biggl[-\frac{1}{2}(\thetab \textcolor{green}{- c})^\top\Amat (\thetab \textcolor{green}{- c}) \textcolor{green}{- c^\top \Amat \thetab} +\sigma^{-2} \yv^\top \Xmat \thetab\biggr] + p(\thetav | \Xmat, \yv) &\propto& \exp\biggl[-\frac{1}{2}(\thetav \textcolor{green}{- c})^\top\Amat (\thetav \textcolor{green}{- c}) \textcolor{green}{- c^\top \Amat \thetav} + \underbrace{\textcolor{green}{\frac{1}{2}c^\top\Amat c}}_{\text{doesn't depend on } \thetav} +\sigma^{-2} \yv^\top \Xmat \thetav\biggr] \\ + &\propto& \exp\biggl[-\frac{1}{2}(\thetav \textcolor{green}{- c})^\top\Amat (\thetav \textcolor{green}{- c}) \textcolor{green}{- c^\top \Amat \thetav} +\sigma^{-2} \yv^\top \Xmat \thetav\biggr] \end{eqnarray*} -If we choose $c$ such that $- c^\top \Amat \thetab +\sigma^{-2} \yv^\top \Xmat \thetab = 0$, the posterior is normal with mean $c$ and covariance matrix $\Amat^{-1}$. Taking into account that $\Amat$ is symmetric, this is if we choose +If we choose $c$ such that $- c^\top \Amat \thetav +\sigma^{-2} \yv^\top \Xmat \thetav = 0$, the posterior is normal with mean $c$ and covariance matrix $\Amat^{-1}$. 
Taking into account that $\Amat$ is symmetric, this is if we choose \begin{eqnarray*} && \sigma^{-2} \yv^\top \Xmat = c^\top\Amat \\ @@ -168,10 +168,10 @@ Based on the posterior distribution $$ -\thetab ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1}) +\thetav ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1}) $$ -we can derive the predictive distribution for a new observations $\xv_*$. The predictive distribution for the Bayesian linear model, i.e. the distribution of $\thetab^\top \xv_*$, is +we can derive the predictive distribution for a new observations $\xv_*$. The predictive distribution for the Bayesian linear model, i.e. the distribution of $\thetav^\top \xv_*$, is $$ y_* ~|~ \Xmat, \yv, \xv_* \sim \mathcal{N}(\sigma^{-2}\yv^\top \Xmat \Amat^{-1}\xv_*, \xv_*^\top\Amat^{-1}\xv_*) @@ -197,9 +197,9 @@ \begin{vbframe}{Summary: The Bayesian Linear Model} \begin{itemize} - \item By switching to a Bayesian perspective, we do not only have point estimates for the parameter $\thetab$, but whole \textbf{distributions} - \item From the posterior distribution of $\thetab$, we can derive a predictive distribution for $y_* = \thetab^\top \xv_*$. - \item We can perform online updates: Whenever datapoints are observed, we can update the \textbf{posterior distribution} of $\thetab$ + \item By switching to a Bayesian perspective, we do not only have point estimates for the parameter $\thetav$, but whole \textbf{distributions} + \item From the posterior distribution of $\thetav$, we can derive a predictive distribution for $y_* = \thetav^\top \xv_*$. + \item We can perform online updates: Whenever datapoints are observed, we can update the \textbf{posterior distribution} of $\thetav$ \end{itemize} Next, we want to develop a theory for general shape functions, and not only for linear function. diff --git a/slides/gaussian-processes/slides-gp-training.tex b/slides/gaussian-processes/slides-gp-training.tex index 988afa1c..981c7f37 100644 --- a/slides/gaussian-processes/slides-gp-training.tex +++ b/slides/gaussian-processes/slides-gp-training.tex @@ -61,17 +61,17 @@ $$ y = \fx + \eps, ~ \eps \sim \mathcal{N}\left(0, \sigma^2\right), $$ -where $\fx \sim \mathcal{GP}\left(\bm{0}, k\left(\xv, \xv^\prime | \thetab \right)\right)$. +where $\fx \sim \mathcal{GP}\left(\bm{0}, k\left(\xv, \xv^\prime | \thetav \right)\right)$. \lz Observing $\bm{y} \sim \mathcal{N}\left(\bm{0}, \bm{K} + \sigma^2 \id\right)$, the marginal log-likelihood (or evidence) is \begin{eqnarray*} -\log p(\bm{y} ~|~ \bm{X}, \thetab) &=& \log \left[\left(2 \pi\right)^{-n / 2} |\bm{K}_y|^{-1 / 2} \exp\left(- \frac{1}{2} \bm{y}^\top \bm{K}_y^{-1} \bm{y}\right) \right]\\ +\log p(\bm{y} ~|~ \bm{X}, \thetav) &=& \log \left[\left(2 \pi\right)^{-n / 2} |\bm{K}_y|^{-1 / 2} \exp\left(- \frac{1}{2} \bm{y}^\top \bm{K}_y^{-1} \bm{y}\right) \right]\\ &=& -\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y} - \frac{1}{2} \log \left| \bm{K}_y \right| - \frac{n}{2} \log 2\pi. \end{eqnarray*} -with $\bm{K}_y:=\bm{K} + \sigma^2 \id$ and $\thetab$ denoting the hyperparameters (the parameters of the covariance function). +with $\bm{K}_y:=\bm{K} + \sigma^2 \id$ and $\thetav$ denoting the hyperparameters (the parameters of the covariance function). 
\framebreak @@ -102,7 +102,7 @@ \begin{itemize} \item data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, \item the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$, and - \item the overall value of the marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetab)$ + \item the overall value of the marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetav)$ \end{itemize} behave for increasing value of $\ls$. \end{itemize} @@ -114,7 +114,7 @@ \end{figure} \begin{footnotesize} - The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetab)$ behave for increasing values of $\ls$. + The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetav)$ behave for increasing values of $\ls$. \end{footnotesize} @@ -125,7 +125,7 @@ \end{figure} \begin{footnotesize} - The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetab)$ behave for increasing values of $\ls$.\\ + The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetav)$ behave for increasing values of $\ls$.\\ A small $\ls$ results in a good fit, but a high complexity penalty (low $- \frac{1}{2} \log \left| \bm{K}_y \right|$). \end{footnotesize} @@ -136,7 +136,7 @@ \end{figure} \begin{footnotesize} - The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetab)$ behave for increasing values of $\ls$.\\ + The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetav)$ behave for increasing values of $\ls$.\\ A large $\ls$ results in a poor fit. \end{footnotesize} @@ -147,7 +147,7 @@ \end{figure} \begin{footnotesize} - The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetab)$ behave for increasing values of $\ls$.\\ + The left plot shows how values of the data fit $-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y}$, the complexity penalty $- \frac{1}{2} \log \left| \bm{K}_y \right|$ (high value means less penalization) and the overall marginal likelihood $\log p(\bm{y} ~|~ \bm{X}, \thetav)$ behave for increasing values of $\ls$.\\ The maximizer of the log-likelihood, $\ls = 0.5$, balances complexity and fit. 
\end{footnotesize} @@ -160,13 +160,13 @@ \begin{footnotesize} \begin{eqnarray*} -\frac{\partial}{\partial\theta_j} \log p(\bm{y} ~|~ \bm{X}, \thetab) &=& \frac{\partial}{\partial\theta_j} \left(-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y} - \frac{1}{2} \log \left| \bm{K}_y \right| - \frac{n}{2} \log 2\pi\right) \\ -&=&\frac{1}{2} \yv^\top \bm{K}^{-1} \frac{\partial \bm{K}}{\partial \theta_j}\bm{K}^{-1} \yv - \frac{1}{2} \text{tr}\left(\bm{K}^{-1} \frac{\partial \bm{K}}{\partial \thetab} \right) \\ +\frac{\partial}{\partial\theta_j} \log p(\bm{y} ~|~ \bm{X}, \thetav) &=& \frac{\partial}{\partial\theta_j} \left(-\frac{1}{2}\bm{y}^T\bm{K}_y^{-1} \bm{y} - \frac{1}{2} \log \left| \bm{K}_y \right| - \frac{n}{2} \log 2\pi\right) \\ +&=&\frac{1}{2} \yv^\top \bm{K}^{-1} \frac{\partial \bm{K}}{\partial \theta_j}\bm{K}^{-1} \yv - \frac{1}{2} \text{tr}\left(\bm{K}^{-1} \frac{\partial \bm{K}}{\partial \thetav} \right) \\ &=& \frac{1}{2} \text{tr}\left((\bm{K}^{-1}\bm{y}\bm{y}^T\bm{K}^{-1} - \bm{K}^{-1})\frac{\partial\bm{K}}{\partial\theta_j}\right) \end{eqnarray*} \end{footnotesize} -using $\frac{\partial}{\partial \theta_j} \bm{K}^{-1} = - \bm{K}^{-1} \frac{\partial \bm{K}}{\partial \theta_j}\bm{K}^{-1}$ and $\frac{\partial}{\partial \thetab} \log |\bm{K}| = \text{tr}\left(\bm{K}^{-1} \frac{\partial \bm{K}}{\partial \thetab} \right)$. +using $\frac{\partial}{\partial \theta_j} \bm{K}^{-1} = - \bm{K}^{-1} \frac{\partial \bm{K}}{\partial \theta_j}\bm{K}^{-1}$ and $\frac{\partial}{\partial \thetav} \log |\bm{K}| = \text{tr}\left(\bm{K}^{-1} \frac{\partial \bm{K}}{\partial \thetav} \right)$. \framebreak diff --git a/slides/information-theory/slides-info-ml.tex b/slides/information-theory/slides-info-ml.tex index 4779ecac..fefe8be8 100644 --- a/slides/information-theory/slides-info-ml.tex +++ b/slides/information-theory/slides-info-ml.tex @@ -20,20 +20,20 @@ } \begin{vbframe}{KL vs Maximum Likelihood} -Minimizing KL between the true distribution $p(x)$ and approximating model $q(x|\thetab)$ is equivalent to maximizing the log-likelihood. +Minimizing KL between the true distribution $p(x)$ and approximating model $q(x|\thetav)$ is equivalent to maximizing the log-likelihood. \begin{align*} - D_{KL}(p \| q_{\thetab}) &= \E_{X \sim p} \left[ \log \frac{p(x)}{q(x|\thetab)}\right] \\ - &= \E_{X \sim p} \log p(x) - \E_{X \sim p} \log q(x|\thetab) + D_{KL}(p \| q_{\thetav}) &= \E_{X \sim p} \left[ \log \frac{p(x)}{q(x|\thetav)}\right] \\ + &= \E_{X \sim p} \log p(x) - \E_{X \sim p} \log q(x|\thetav) \end{align*} - as first term above does not depend on $\thetab$. + as first term above does not depend on $\thetav$. %and the second term we could also as a def for CE! Therefore, \begin{align*} - \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) &= \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab)\\ - &= \argmax_{\thetab} \E_{X \sim p} \log q(x|\thetab) + \argmin_{\thetav} D_{KL}(p \| q_{\thetav}) &= \argmin_{\thetav} -\E_{X \sim p} \log q(x|\thetav)\\ + &= \argmax_{\thetav} \E_{X \sim p} \log q(x|\thetav) \end{align*} For a finite dataset of $n$ samples from $p$, this is approximated as - $$\argmax_{\thetab} \E_{X \sim p} \log q(x|\thetab) \approx \argmax_{\thetab} \frac{1}{n} \sumin \log q(\xi|\thetab)\,.$$ + $$\argmax_{\thetav} \E_{X \sim p} \log q(x|\thetav) \approx \argmax_{\thetav} \frac{1}{n} \sumin \log q(\xi|\thetav)\,.$$ This also directly implies an equivalence to risk minimization! 
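[Illustrative aside, not part of the patch: a small numeric check of the KL / maximum-likelihood equivalence derived in the hunk above, under the assumed setup p = N(0, 1) and a Gaussian model family q(. | mu) = N(mu, 1); numpy and scipy are assumed available, and the grid and sample size are arbitrary.]

import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
x = rng.normal(loc=0.0, scale=1.0, size=5000)        # samples from the "true" p = N(0, 1)

mus = np.linspace(-1.0, 1.0, 201)                    # candidate models q(. | mu) = N(mu, 1)
avg_loglik = np.array([norm.logpdf(x, loc=mu).mean() for mu in mus])

# E_p[log p(X)] does not depend on mu, so the Monte-Carlo KL estimate is that constant minus avg_loglik
kl_est = norm.logpdf(x, loc=0.0).mean() - avg_loglik

print(mus[np.argmax(avg_loglik)], mus[np.argmin(kl_est)])   # same arg-optimum, close to 0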
@@ -42,7 +42,7 @@ \begin{vbframe}{KL vs Cross-Entropy} From this here we can see much more: -$$ \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) = \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab) = \argmin_{\thetab} H(p \| q_{\thetab}) $$ +$$ \argmin_{\thetav} D_{KL}(p \| q_{\thetav}) = \argmin_{\thetav} -\E_{X \sim p} \log q(x|\thetav) = \argmin_{\thetav} H(p \| q_{\thetav}) $$ \begin{itemize} \item So minimizing KL is the same as minimizing CE, is the same as maximum likelihood! \item We could now motivate CE as the "relevant" term that you have to minimize when you minimize KL - after you drop $\E_p \log p(x)$, which is simply the neg. entropy H(p)! @@ -52,7 +52,7 @@ \begin{vbframe}{KL vs Cross-Entropy Example} Let $p(x)=N(0,1)$ and $q(x)=LP(0,\sigma)$ and consider again -$$ \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) = \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab) = \argmin_{\thetab} H(p \| q_{\thetab}) $$ +$$ \argmin_{\thetav} D_{KL}(p \| q_{\thetav}) = \argmin_{\thetav} -\E_{X \sim p} \log q(x|\thetav) = \argmin_{\thetav} H(p \| q_{\thetav}) $$ \begin{center} \includegraphics[width=1\textwidth]{figure/kl_ce_comparison.png} @@ -66,7 +66,7 @@ \item Consider a multi-class classification task with dataset $\D = \Dset$. % \item More concretely, let us assume this is an image classification task where each $\xi$ is an image and $\yi$ is the corresponding label. \item For $g$ classes, each $\yi$ can be one-hot-encoded as a vector $d^{(i)}$ of length $g$. $d^{(i)}$ can be interpreted as a categorical distribution which puts all its probability mass on the true label $\yi$ of $\xi$. - \item $\pi(\xv^{(i)}|\thetab)$ is the probability output vector of the model, and also a categorical distribution over the classes. + \item $\pi(\xv^{(i)}|\thetav)$ is the probability output vector of the model, and also a categorical distribution over the classes. \end{itemize} \lz @@ -78,18 +78,18 @@ \framebreak -To train the model, we minimize KL between $d^{(i)}$ and $\pi(\xv^{(i)}|\thetab)$ : -$$ \argmin_{\thetab} \sum_{i=1}^n D_{KL} (d^{(i)} \| \pi(\xv^{(i)}|\thetab)) = \argmin_{\thetab} \sum_{i=1}^n H(d^{(i)} \| \pi(\xv^{(i)}|\thetab)) $$ - % where the entropy $H(d^{(i)})$ was dropped because it is not a function of $\thetab$. +To train the model, we minimize KL between $d^{(i)}$ and $\pi(\xv^{(i)}|\thetav)$ : +$$ \argmin_{\thetav} \sum_{i=1}^n D_{KL} (d^{(i)} \| \pi(\xv^{(i)}|\thetav)) = \argmin_{\thetav} \sum_{i=1}^n H(d^{(i)} \| \pi(\xv^{(i)}|\thetav)) $$ + % where the entropy $H(d^{(i)})$ was dropped because it is not a function of $\thetav$. We see that this is equivalent to log-loss risk minimization! 
\begin{footnotesize} \begin{equation*} \begin{split} - R &= \sumin H(d^{(i)} \| \pi_k(\xv^{(i)}|\thetab)) \\ - &= \sumin \left( - \sum_k d^{(i)}_k \log\pi_k(\xv^{(i)}|\thetab) \right) \\ - & = \sumin \underbrace{ \left( -\sum_{k = 1}^g [\yi = k]\log \pi_{k}(\xv^{(i)}|\thetab) \right) }_{\text{log loss}} \\ - & = \sumin (-\log\pi_{y^{(i)}}(\xv^{(i)}|\thetab)) + R &= \sumin H(d^{(i)} \| \pi_k(\xv^{(i)}|\thetav)) \\ + &= \sumin \left( - \sum_k d^{(i)}_k \log\pi_k(\xv^{(i)}|\thetav) \right) \\ + & = \sumin \underbrace{ \left( -\sum_{k = 1}^g [\yi = k]\log \pi_{k}(\xv^{(i)}|\thetav) \right) }_{\text{log loss}} \\ + & = \sumin (-\log\pi_{y^{(i)}}(\xv^{(i)}|\thetav)) \end{split} \end{equation*} \end{footnotesize} diff --git a/slides/linear-svm/slides-linsvm-erm.tex b/slides/linear-svm/slides-linsvm-erm.tex index ac48ad59..7b869175 100644 --- a/slides/linear-svm/slides-linsvm-erm.tex +++ b/slides/linear-svm/slides-linsvm-erm.tex @@ -41,15 +41,15 @@ We derived this QP for the soft-margin SVM: \begin{eqnarray*} - & \min\limits_{\thetab, \thetab_0,\sli} & \frac{1}{2} \|\thetab\|^2 + C \sum_{i=1}^n \sli \\ - & \text{s.t.} & \,\, \yi \left( \scp{\thetab}{\xi} + \thetab_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ + & \min\limits_{\thetav, \thetav_0,\sli} & \frac{1}{2} \|\thetav\|^2 + C \sum_{i=1}^n \sli \\ + & \text{s.t.} & \,\, \yi \left( \scp{\thetav}{\xi} + \thetav_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ & \text{and} & \,\, \sli \geq 0 \quad \forall\, i \in \nset. \end{eqnarray*} In the optimum, the inequalities will hold with equality (as we minimize the slacks), so $\sli = 1 - \yi \fxi$, but the lowest value $\sli$ can take is 0 (we do no get a bonus for points beyond the margin on the correct side). So we can rewrite the above: \begin{align*} - \frac{1}{2} \|\thetab\|^2 + C \sumin \Lxyi ;\; \Lyf = + \frac{1}{2} \|\thetav\|^2 + C \sumin \Lxyi ;\; \Lyf = \begin{cases} 1 - y f & \text{ if } y f \leq 1 \\ 0 & \text{ if } y f > 1 @@ -58,7 +58,7 @@ We can also write $\Lyf = \max(1-yf, 0)$. \framebreak - $$ \risket = \frac{1}{2} \|\thetab\|^2 + C \sumin \Lxyi ;\; \Lyf = \max(1-yf, 0)$$ + $$ \risket = \frac{1}{2} \|\thetav\|^2 + C \sumin \Lxyi ;\; \Lyf = \max(1-yf, 0)$$ \begin{itemize} \item This now obviously L2-regularized empirical risk minimization. \item Actually, a lot of ERM theory was established when Vapnik (co-)invented the SVM in the beginning of the 90s. @@ -74,7 +74,7 @@ \framebreak - $$ \frac{1}{2} \|\thetab\|^2 + C \sumin \Lxyi ;\; \Lyf = \max(1-yf, 0)$$ + $$ \frac{1}{2} \|\thetav\|^2 + C \sumin \Lxyi ;\; \Lyf = \max(1-yf, 0)$$ \begin{itemize} \item The ERM interpretation does not require any of the terms -- the loss or the regularizer -- to be geometrically meaningful. \item The above form is a very compact form to define the convex optimization problem of the SVM. diff --git a/slides/linear-svm/slides-linsvm-hard-margin-dual.tex b/slides/linear-svm/slides-linsvm-hard-margin-dual.tex index e17c0402..6ceee1f1 100644 --- a/slides/linear-svm/slides-linsvm-hard-margin-dual.tex +++ b/slides/linear-svm/slides-linsvm-hard-margin-dual.tex @@ -32,30 +32,30 @@ %\vspace*{-0.5cm} % % \begin{align*} -% \min\limits_{\thetab} &\, f(\thetab) \\ -% \text{s.t. } & \quad g_i(\thetab) \le 0 \quad\forall\, i \in \{1, ..., k \} &\quad(\text{inequality constraints}) \\ -% h_j(\thetab) & = 0 \quad\forall\, j \in \{1, ..., l \} &\quad(\text{equality constraints}) +% \min\limits_{\thetav} &\, f(\thetav) \\ +% \text{s.t. 
} & \quad g_i(\thetav) \le 0 \quad\forall\, i \in \{1, ..., k \} &\quad(\text{inequality constraints}) \\ +% h_j(\thetav) & = 0 \quad\forall\, j \in \{1, ..., l \} &\quad(\text{equality constraints}) % \end{align*} % %This is called the \textbf{primal form}. \\ -%Let $\fh_P$ be the minimal value of $f(\thetab)$ satisfying all of the constraints. +%Let $\fh_P$ be the minimal value of $f(\thetav)$ satisfying all of the constraints. % %\framebreak % %The corresponding \textbf{Lagrangian} is % %$$ -%L(\thetab, \alpha, \beta) = f(\thetab) + \sum_{i=1}^k \alpha_i g_i(\thetab) + \sum_{j=1}^l \beta_j h_j(\thetab) +%L(\thetav, \alpha, \beta) = f(\thetav) + \sum_{i=1}^k \alpha_i g_i(\thetav) + \sum_{j=1}^l \beta_j h_j(\thetav) %$$ %with the \textbf{Lagrange Multipliers} $\alpha_i, \beta_j \ge 0$. %\lz -%Minimizing the Lagrangian is minimizing $f(\thetab)$ under the constraints:\\ -%Since $\frac{\partial L(\thetab, \alpha, \beta)}{\partial \beta_j}= h_j(\thetab)$, -%setting the partial derivative of $L(\thetab, \beta)$ w.r.t. $\beta_j$ to 0 just restates the constraints.\\ -%Since $\frac{\partial L(\thetab, \alpha, \beta)}{\partial \thetab} = \nabla f(\thetab) + \sum_{i=1}^k \alpha_i \nabla g_i(\thetab) + \sum_{j=1}^l \beta_j \nabla h_j(\thetab)$, setting the partial derivative of $L(\%theta, \beta)$ w.r.t. $\thetab$ to 0 also aligns the gradients of the objective and constraint functions (see following slides).\\ +%Minimizing the Lagrangian is minimizing $f(\thetav)$ under the constraints:\\ +%Since $\frac{\partial L(\thetav, \alpha, \beta)}{\partial \beta_j}= h_j(\thetav)$, +%setting the partial derivative of $L(\thetav, \beta)$ w.r.t. $\beta_j$ to 0 just restates the constraints.\\ +%Since $\frac{\partial L(\thetav, \alpha, \beta)}{\partial \thetav} = \nabla f(\thetav) + \sum_{i=1}^k \alpha_i \nabla g_i(\thetav) + \sum_{j=1}^l \beta_j \nabla h_j(\thetav)$, setting the partial derivative of $L(\%theta, \beta)$ w.r.t. $\thetav$ to 0 also aligns the gradients of the objective and constraint functions (see following slides).\\ %\end{vbframe} @@ -69,18 +69,18 @@ %\vspace*{-0.5cm} % % \begin{eqnarray*} -% \min\limits_{\thetab_1, \thetab_2} & \thetab_1 + \thetab_2 & \\ -% \text{s.t.} & \thetab_1^2 + \thetab_2^2 & = 1 +% \min\limits_{\thetav_1, \thetav_2} & \thetav_1 + \thetav_2 & \\ +% \text{s.t.} & \thetav_1^2 + \thetav_2^2 & = 1 % \end{eqnarray*} % %\lz % %The Lagrangian function is %$$ -%L(\thetab, \beta) = f(\thetab) + \beta h(\thetab) = (\thetab_1 + \thetab_2) + \beta(\thetab_1^2 + \thetab_2^2 - 1) +%L(\thetav, \beta) = f(\thetav) + \beta h(\thetav) = (\thetav_1 + \thetav_2) + \beta(\thetav_1^2 + \thetav_2^2 - 1) %$$ % -%You can think of the optimization problem $L(\thetab, \beta) \to \min$ as moving the (linear) contours of the objective forward and backward until you find a point $(\thetab_1, \thetab_2)$ that fulfills the %constraint \emph{and} minimizes the objective function or as moving around the feasible region given by the constraint until you are at the minimum of the objective function. +%You can think of the optimization problem $L(\thetav, \beta) \to \min$ as moving the (linear) contours of the objective forward and backward until you find a point $(\thetav_1, \thetav_2)$ that fulfills the %constraint \emph{and} minimizes the objective function or as moving around the feasible region given by the constraint until you are at the minimum of the objective function. 
% % \center % \includegraphics{figure_man/optimization/constraints_violated.pdf} @@ -128,17 +128,17 @@ % \framebreak % % \flushleft -% At the optima, the contour lines of the objective function $f(\thetab)$ are tangential to the constraint $h(\thetab)$. +% At the optima, the contour lines of the objective function $f(\thetav)$ are tangential to the constraint $h(\thetav)$. % % \lz % -% This means that the gradients of $f(\thetab)$ and $h(\thetab)$ are parallel (since the gradient is orthogonal to the contour line), so $\nabla f(\thetab) \propto \nabla h(\thetab)$, so: +% This means that the gradients of $f(\thetav)$ and $h(\thetav)$ are parallel (since the gradient is orthogonal to the contour line), so $\nabla f(\thetav) \propto \nabla h(\thetav)$, so: % % \begin{align*} -% \exists \beta: \nabla f(\thetab) &= - \beta \nabla h(\thetab) \text{ for optimal $\thetab$}\\ +% \exists \beta: \nabla f(\thetav) &= - \beta \nabla h(\thetav) \text{ for optimal $\thetav$}\\ % \intertext{which is equivalent to} -% \nabla f(\thetab) + \beta \nabla h(\thetab) &= 0 \\ -% \text{i.e., }\qquad \frac{\partial}{\partial \thetab} L(\thetab, \beta) &= 0, +% \nabla f(\thetav) + \beta \nabla h(\thetav) &= 0 \\ +% \text{i.e., }\qquad \frac{\partial}{\partial \thetav} L(\thetav, \beta) &= 0, % \end{align*} % so finding a minimum of the Lagrangian solves the constrained optimization problem.\\ % This idea is extended to more complex objective functions and constraints. @@ -150,11 +150,11 @@ % % Using the Lagrangian, we can write the primal problem equivalently as % $$ -% \min\limits_{\thetab} \max\limits_{\alpha, \beta} L(\thetab, \alpha, \beta). +% \min\limits_{\thetav} \max\limits_{\alpha, \beta} L(\thetav, \alpha, \beta). % $$ -% For any given $\thetab$:\\ -% If $\thetab$ satisfies the constraints, $L(\thetab, \alpha, \beta) \equiv f(\thetab)$ so $\alpha, \beta$ are irrelevant.\linebreak -% If $\thetab$ doesn't, we want $L(\thetab, \alpha, \beta) \to \infty$, because the constraints \emph{need} to be satisfied. +% For any given $\thetav$:\\ +% If $\thetav$ satisfies the constraints, $L(\thetav, \alpha, \beta) \equiv f(\thetav)$ so $\alpha, \beta$ are irrelevant.\linebreak +% If $\thetav$ doesn't, we want $L(\thetav, \alpha, \beta) \to \infty$, because the constraints \emph{need} to be satisfied. % % \framebreak % @@ -162,9 +162,9 @@ % We obtain the \textbf{dual} optimization problem by switching $\max$ and $\min$, i. e. % % $$ -% \max\limits_{\alpha, \beta} \min\limits_{\thetab} L(\thetab, \alpha, \beta). +% \max\limits_{\alpha, \beta} \min\limits_{\thetav} L(\thetav, \alpha, \beta). % $$ -% Since $L(\thetab, \alpha, \beta)$ is a simple sum over $\alpha, \beta$ for fixed $\thetab$, the +% Since $L(\thetav, \alpha, \beta)$ is a simple sum over $\alpha, \beta$ for fixed $\thetav$, the % optimzation here is much easier. % % Let $\fh_D$ be the optimal value of the dual problem. 
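[Illustrative aside, not part of the patch: the commented-out duality example in the hunks above uses the toy problem min theta_1 + theta_2 subject to theta_1^2 + theta_2^2 = 1. A numeric sketch with scipy's SLSQP solver, where the starting point is an arbitrary choice; the analytic optimum obtained from the Lagrangian stationarity condition plus the constraint is theta = (-1/sqrt(2), -1/sqrt(2)).]

import numpy as np
from scipy.optimize import minimize

# min theta_1 + theta_2   s.t.   theta_1^2 + theta_2^2 = 1
obj = lambda th: th[0] + th[1]
con = {"type": "eq", "fun": lambda th: th[0] ** 2 + th[1] ** 2 - 1.0}
res = minimize(obj, x0=np.array([1.0, 0.0]), method="SLSQP", constraints=[con])

print(res.x)                 # approx. [-0.7071, -0.7071]
print(-1 / np.sqrt(2))       # value predicted by the Lagrangian stationarity + constraint conditions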
@@ -180,14 +180,14 @@ % % \framebreak % -% In case of strong duality, a necessary and sufficient condition for a solution to the primal and dual problem $\thetabh, \hat \alpha, \hat \beta$ are the \textbf{Karush-Kuhn-Tucker-Conditions} +% In case of strong duality, a necessary and sufficient condition for a solution to the primal and dual problem $\thetavh, \hat \alpha, \hat \beta$ are the \textbf{Karush-Kuhn-Tucker-Conditions} % % \begin{align*} -% \left.\fp{L(\thetab, \alpha, \beta)}{\thetab}\right|_{(\thetabh, \hat\alpha, \hat\beta)} &= 0 &\text{(Stationarity)}\\ -% g_i(\thetabh) &\le 0 \quad \forall ~ i \in \{1, ..., k \} &\text{(Primal feasibility I)}\\ -% h_j(\thetabh) &= 0 \quad \forall ~ j \in \{1, ..., l \} & \text{(Primal feasibility II)}\\ +% \left.\fp{L(\thetav, \alpha, \beta)}{\thetav}\right|_{(\thetavh, \hat\alpha, \hat\beta)} &= 0 &\text{(Stationarity)}\\ +% g_i(\thetavh) &\le 0 \quad \forall ~ i \in \{1, ..., k \} &\text{(Primal feasibility I)}\\ +% h_j(\thetavh) &= 0 \quad \forall ~ j \in \{1, ..., l \} & \text{(Primal feasibility II)}\\ % \hat \alpha_i &\ge 0 \quad \forall ~ i \in \{1, ..., k \} &\text{(Dual feasibility)}\\ -% \hat \alpha_i g_i(\thetabh) &= 0 \quad \forall ~ i \in \{1, ..., k \} &\text{(Complementarity)} +% \hat \alpha_i g_i(\thetavh) &= 0 \quad \forall ~ i \in \{1, ..., k \} &\text{(Complementarity)} % \end{align*} % % Particularly for SVMs, the KKT conditions are necessary and sufficient for a solution. @@ -198,8 +198,8 @@ We before derived the primal quadratic program for the hard margin SVM. We could directly solve this, but traditionally the SVM is solved in the dual and this has some advantages. In any case, many algorithms and derivations are based on it, so we need to know it. \begin{eqnarray*} - & \min\limits_{\thetab, \theta_0} \quad & \frac{1}{2} \|\thetab\|^2 \\ - & \text{s.t.} & \,\,\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset. + & \min\limits_{\thetav, \theta_0} \quad & \frac{1}{2} \|\thetav\|^2 \\ + & \text{s.t.} & \,\,\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset. \end{eqnarray*} @@ -220,25 +220,25 @@ \small \begin{eqnarray*} -&L(\thetab, \theta_0, \alphav) = & \frac{1}{2}\|\thetab\|^2 - \sum_{i=1}^n \alpha_i \left[\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) - 1\right]\\ +&L(\thetav, \theta_0, \alphav) = & \frac{1}{2}\|\thetav\|^2 - \sum_{i=1}^n \alpha_i \left[\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) - 1\right]\\ & \text{s.t.} & \,\, \alpha_i \ge 0 \quad \forall\, i \in \nset. \end{eqnarray*} \small The \textbf{dual} form of this problem is -$$\max\limits_{\alpha} \min\limits_{\thetab, \theta_0} L(\thetab, \theta_0,\alphav).$$ +$$\max\limits_{\alpha} \min\limits_{\thetav, \theta_0} L(\thetav, \theta_0,\alphav).$$ \framebreak -Notice how the (p+1) decision variables $(\thetab,\theta_0)$ have become $n$ decisions variables $\alphav$, as constraints turned into variables and vice versa. +Notice how the (p+1) decision variables $(\thetav,\theta_0)$ have become $n$ decisions variables $\alphav$, as constraints turned into variables and vice versa. Now every data point has an associated non-negative weight. 
\begin{eqnarray*} -&L(\thetab, \theta_0, \alphav) = & \frac{1}{2}\|\thetab\|^2 - \sum_{i=1}^n \alpha_i \left[\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) - 1\right]\\ +&L(\thetav, \theta_0, \alphav) = & \frac{1}{2}\|\thetav\|^2 - \sum_{i=1}^n \alpha_i \left[\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) - 1\right]\\ & \text{s.t.} & \,\, \alpha_i \ge 0 \quad \forall\, i \in \nset. \end{eqnarray*} -We find the stationary point of $L(\thetab, \theta_0,\alphav)$ w.r.t. $\thetab, \theta_0$ and obtain +We find the stationary point of $L(\thetav, \theta_0,\alphav)$ w.r.t. $\thetav, \theta_0$ and obtain \begin{eqnarray*} - \thetab & = & \sum_{i=1}^n \alpha_i \yi \xi, \\ + \thetav & = & \sum_{i=1}^n \alpha_i \yi \xi, \\ 0 & = & \sum_{i=1}^n \alpha_i \yi \quad \forall\, i \in \nset.\\ \end{eqnarray*} @@ -246,7 +246,7 @@ \framebreak By inserting these expressions -% 1/2 \sum_{i,j}\alpha_i\alpha_j y_i y_j - \sum_{i,j}\alpha_i\alpha_j y_i y_j - \sum_i alpha_i y_i \thetab0 + \sum_i \alpha_i +% 1/2 \sum_{i,j}\alpha_i\alpha_j y_i y_j - \sum_{i,j}\alpha_i\alpha_j y_i y_j - \sum_i alpha_i y_i \thetav0 + \sum_i \alpha_i \& simplifying we obtain the dual problem \vspace*{-0.5cm} @@ -269,10 +269,10 @@ \framebreak -If $(\thetab, \theta_0, \alphav)$ fulfills the KKT conditions (stationarity, primal/dual feasibility, complementary slackness), it solves both the primal and dual problem (strong duality). +If $(\thetav, \theta_0, \alphav)$ fulfills the KKT conditions (stationarity, primal/dual feasibility, complementary slackness), it solves both the primal and dual problem (strong duality). -Under these conditions, and if we solve the dual problem and obtain $\alphavh$, we know that $\thetab$ is a linear combination of our data points: +Under these conditions, and if we solve the dual problem and obtain $\alphavh$, we know that $\thetav$ is a linear combination of our data points: $$ \thetah = \sumin \alphah_i \yi \xi @@ -281,7 +281,7 @@ Complementary slackness means: $$ -\alphah_i \left[\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) - 1\right] = 0 \quad \forall ~ i \in \{1, ..., n \}. +\alphah_i \left[\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) - 1\right] = 0 \quad \forall ~ i \in \{1, ..., n \}. $$ \framebreak @@ -290,16 +290,16 @@ \thetah = \sumin \alphah_i \yi \xi $$ $$ -\alphah_i \left[\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) - 1\right] = 0 \quad \forall ~ i \in \{1, ..., n \}. +\alphah_i \left[\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) - 1\right] = 0 \quad \forall ~ i \in \{1, ..., n \}. $$ \begin{itemize} \item So either $\alphah_i = 0$, and is not active in the linear combination, - or $\alphah_i > 0$, then $\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) = 1$, and $(\xi, \yi)$ has minimal margin and is a support vector! - \item We see that we can directly extract the support vectors from the dual variables and the $\thetab$ solution only depends on them. + or $\alphah_i > 0$, then $\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) = 1$, and $(\xi, \yi)$ has minimal margin and is a support vector! + \item We see that we can directly extract the support vectors from the dual variables and the $\thetav$ solution only depends on them. \item We can reconstruct the bias term $\theta_0$ from any support vector: $$ - \theta_0 = \yi - \scp{\thetab}{\xi}. + \theta_0 = \yi - \scp{\thetav}{\xi}. $$ \end{itemize} @@ -309,7 +309,7 @@ \begin{itemize} \item SVs are defined to be points with $\alphah_i > 0$. In the case of hard margin linear SVM, the SVs are on the edge of margin. 
\item However, not all points on edge of margin are necessarily SVs. - \item In other words, it is possible that both $\alphah_i = 0$ and $\yi \left(\langle\thetab, \xi \rangle \right) - 1 = 0$ hold. + \item In other words, it is possible that both $\alphah_i = 0$ and $\yi \left(\langle\thetav, \xi \rangle \right) - 1 = 0$ hold. \end{itemize} \begin{minipage}[t]{0.4\columnwidth} diff --git a/slides/linear-svm/slides-linsvm-hard-margin.tex b/slides/linear-svm/slides-linsvm-hard-margin.tex index 7cb430b4..926a656c 100644 --- a/slides/linear-svm/slides-linsvm-hard-margin.tex +++ b/slides/linear-svm/slides-linsvm-hard-margin.tex @@ -61,12 +61,12 @@ % \begin{vbframe}{Recall: Hyperplanes} -% A hyperplane in $\Xspace = \R^p$ is a $p-1$ dimensional linear subspace defined by a normal vector $\thetab$ (usually with $||\thetab|| = 1$), perpendicular to the hyperplane, and an offset $\theta_0$. +% A hyperplane in $\Xspace = \R^p$ is a $p-1$ dimensional linear subspace defined by a normal vector $\thetav$ (usually with $||\thetav|| = 1$), perpendicular to the hyperplane, and an offset $\theta_0$. % \lz -% For $\fx := \scp{\thetab}{\xv} + \theta_0$, the hyperplane is defined as +% For $\fx := \scp{\thetav}{\xv} + \theta_0$, the hyperplane is defined as % \vspace{-0.3cm} % $$ -% \{\xv \in \Xspace: \scp{\thetab}{\xv} + \theta_0 = 0 \} = \{\xv \in \Xspace ~|~ \fx = 0 \} +% \{\xv \in \Xspace: \scp{\thetav}{\xv} + \theta_0 = 0 \} = \{\xv \in \Xspace ~|~ \fx = 0 \} % $$ % \begin{center} % \includegraphics[width=3cm]{figure_man/introduction/hyperplane2d.pdf} ~~~~~ @@ -79,18 +79,18 @@ % \framebreak -% Positive halfspace: $\phantom{i}\{x \in \Xspace : f(x) > 0\}$ (in direction of $\thetab$)\\ +% Positive halfspace: $\phantom{i}\{x \in \Xspace : f(x) > 0\}$ (in direction of $\thetav$)\\ % Negative halfspace: $\{x \in \Xspace : f(x) < 0\}$ % % \vspace{-0.5cm} % The distance between point $x$ and hyperplane is -% $$d(f, x) = \frac{|\scp{\thetab}{x} + \theta_0|}{\|\thetab\|} = \frac{|f(x)|}{||\thetab||},$$\\ -% i.e., $d(f, 0) = |\theta_0| / ||\thetab||$. +% $$d(f, x) = \frac{|\scp{\thetav}{x} + \theta_0|}{\|\thetav\|} = \frac{|f(x)|}{||\thetav||},$$\\ +% i.e., $d(f, 0) = |\theta_0| / ||\thetav||$. -% For unit length $\thetab$, these simplify to $$d(f, x) = |f(x)|$$ and $d(f, 0) = |\theta_0|$ . +% For unit length $\thetav$, these simplify to $$d(f, x) = |f(x)|$$ and $d(f, 0) = |\theta_0|$ . -% % \frac{|\scp{\thetab}{x} + \theta_0|}{\|\thetab\|}. +% % \frac{|\scp{\thetav}{x} + \theta_0|}{\|\thetav\|}. % % $f(x)$ @@ -104,7 +104,7 @@ For labeled data $\D = \Dset$, with $\yi \in \{-1, +1\}$: \begin{itemize} - \item Assume linear separation by $\fx = \thetab^\top \xv + \theta_0$, such that all $+$-observations are in the positive halfspace + \item Assume linear separation by $\fx = \thetav^\top \xv + \theta_0$, such that all $+$-observations are in the positive halfspace $$ \phantom{i}\{\xv \in \Xspace: \fx > 0\} @@ -118,13 +118,13 @@ \item For a linear separating hyperplane, we have $$ - \yi \underbrace{\left(\thetab^\top \xi + \theta_0\right)}_{= \fxi} > 0 \quad \forall i \in \{1, 2, ..., n\}. + \yi \underbrace{\left(\thetav^\top \xi + \theta_0\right)}_{= \fxi} > 0 \quad \forall i \in \{1, 2, ..., n\}. 
$$ \item % For correctly classified points $\left(\xi, \yi\right)$, $$ - d \left(f, \xi \right) = \frac{\yi \fxi}{\|\thetab\|} = \yi \frac{\thetab^T \xi + \theta_0}{\|\thetab\|} + d \left(f, \xi \right) = \frac{\yi \fxi}{\|\thetav\|} = \yi \frac{\thetav^T \xi + \theta_0}{\|\thetav\|} $$ computes the (signed) distance to the separating hyperplane $\fx = 0$, positive for correct classifications, negative for incorrect. @@ -160,12 +160,12 @@ We formulate the desired property of a large \enquote{safety margin} as an optimization problem: \begin{eqnarray*} - & \max\limits_{\thetab, \theta_0} & \gamma \\ + & \max\limits_{\thetav, \theta_0} & \gamma \\ & \text{s.t.} & \,\, d \left(f, \xi \right) \geq \gamma \quad \forall\, i \in \nset. \end{eqnarray*} \begin{itemize} - \item The constraints mean: We require that any instance $i$ should have a \enquote{safety} distance of at least $\gamma$ from the decision boundary defined by $f (= \thetab^T \xv + \theta_0) = 0$. + \item The constraints mean: We require that any instance $i$ should have a \enquote{safety} distance of at least $\gamma$ from the decision boundary defined by $f (= \thetav^T \xv + \theta_0) = 0$. \item Our objective is to maximize the \enquote{safety} distance. \end{itemize} @@ -177,17 +177,17 @@ We reformulate the problem: \begin{eqnarray*} - & \max \limits_{\thetab, \theta_0} & \gamma \\ - & \text{s.t.} & \,\, \frac{\yi \left( \scp{\thetab}{\xi} + \theta_0 \right)}{\|\thetab\|} \geq \gamma \quad \forall\, i \in \nset. + & \max \limits_{\thetav, \theta_0} & \gamma \\ + & \text{s.t.} & \,\, \frac{\yi \left( \scp{\thetav}{\xi} + \theta_0 \right)}{\|\thetav\|} \geq \gamma \quad \forall\, i \in \nset. \end{eqnarray*} \begin{itemize} - \item The inequality is rearranged by multiplying both sides with $\|\thetab\|$: + \item The inequality is rearranged by multiplying both sides with $\|\thetav\|$: \end{itemize} \begin{eqnarray*} - & \max \limits_{\thetab, \theta_0} & \gamma \\ - & \text{s.t.} & \,\,\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq \|\thetab\| \gamma \quad \forall\, i \in \nset. + & \max \limits_{\thetav, \theta_0} & \gamma \\ + & \text{s.t.} & \,\,\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq \|\thetav\| \gamma \quad \forall\, i \in \nset. \end{eqnarray*} \framebreak @@ -195,35 +195,35 @@ \begin{itemize} \item Note that the same hyperplane does not have a unique representation: $$ - \{\xv \in \Xspace ~|~ \thetab^\top \xv = 0\} = \{\xv \in \Xspace ~|~ c \cdot \thetab^\top \xv = 0\} + \{\xv \in \Xspace ~|~ \thetav^\top \xv = 0\} = \{\xv \in \Xspace ~|~ c \cdot \thetav^\top \xv = 0\} $$ for arbitrary $c \ne 0$. - \item To ensure uniqueness of the solution, we make a reference choice -- we only consider hyperplanes with $\|\thetab\| = 1 / \gamma$: + \item To ensure uniqueness of the solution, we make a reference choice -- we only consider hyperplanes with $\|\thetav\| = 1 / \gamma$: \end{itemize} \begin{eqnarray*} - & \max \limits_{\thetab, \theta_0} & \gamma \\ - & \text{s.t.} & \,\,\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset. + & \max \limits_{\thetav, \theta_0} & \gamma \\ + & \text{s.t.} & \,\,\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset. 
\end{eqnarray*} \framebreak \begin{itemize} - \item Substituting $\gamma = 1 / \|\thetab\|$ in the objective yields: + \item Substituting $\gamma = 1 / \|\thetav\|$ in the objective yields: \end{itemize} \begin{eqnarray*} - & \max \limits_{\thetab, \theta_0} & \frac{1}{\|\thetab\|} \\ - & \text{s.t.} & \,\,\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset. + & \max \limits_{\thetav, \theta_0} & \frac{1}{\|\thetav\|} \\ + & \text{s.t.} & \,\,\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset. \end{eqnarray*} \begin{itemize} - \item Maximizing $1 / \|\thetab\|$ is the same as minimizing $\|\thetab\|$, which is the same as minimizing $\frac{1}{2}\|\thetab\|^2$: + \item Maximizing $1 / \|\thetav\|$ is the same as minimizing $\|\thetav\|$, which is the same as minimizing $\frac{1}{2}\|\thetav\|^2$: \end{itemize} \begin{eqnarray*} - & \min\limits_{\thetab, \theta_0} \quad & \frac{1}{2} \|\thetab\|^2 \\ - & \text{s.t.} & \,\,\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset. + & \min\limits_{\thetav, \theta_0} \quad & \frac{1}{2} \|\thetav\|^2 \\ + & \text{s.t.} & \,\,\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset. \end{eqnarray*} \end{vbframe} @@ -233,8 +233,8 @@ We derived the following optimization problem: \begin{eqnarray*} - & \min\limits_{\thetab, \theta_0} \quad & \frac{1}{2} \|\thetab\|^2 \\ - & \text{s.t.} & \,\,\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset. + & \min\limits_{\thetav, \theta_0} \quad & \frac{1}{2} \|\thetav\|^2 \\ + & \text{s.t.} & \,\,\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset. \end{eqnarray*} This turns out to be a \textbf{convex optimization problem} -- particularly, a \textbf{quadratic program}: The objective function is quadratic, and the constraints are linear inequalities. 
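[Illustrative aside, not part of the patch: the hard-margin quadratic program stated in the hunk above, solved on hypothetical linearly separable toy data with cvxpy (assumed installed); the blob centers, seed and tolerance are arbitrary assumptions. The active constraints recover the support vectors, which lie at distance 1/||theta|| from the hyperplane.]

import cvxpy as cp
import numpy as np

rng = np.random.default_rng(1)
n, p = 40, 2
X = np.vstack([rng.normal(loc=+3.0, size=(n // 2, p)),      # two well-separated Gaussian blobs
               rng.normal(loc=-3.0, size=(n // 2, p))])
y = np.hstack([np.ones(n // 2), -np.ones(n // 2)])

theta = cp.Variable(p)
theta0 = cp.Variable()
constraints = [cp.multiply(y, X @ theta + theta0) >= 1]      # y_i (<theta, x_i> + theta_0) >= 1
problem = cp.Problem(cp.Minimize(0.5 * cp.sum_squares(theta)), constraints)
problem.solve()

margin = 1.0 / np.linalg.norm(theta.value)                   # gamma = 1 / ||theta||
support = np.isclose(y * (X @ theta.value + theta0.value), 1.0, atol=1e-4)
print(margin, np.where(support)[0])                          # support vectors sit exactly on the margin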
@@ -251,24 +251,24 @@ % \frame{ % \frametitle{Maximum margin separation} % \begin{eqnarray*} -% \only<1-5> {& \max \limits_{\thetab, \theta_0} & \gamma} -% \only<6->{& \min\limits_{\thetab, \theta_0} \quad & \frac{1}{2} \|\thetab\|^2} \\ +% \only<1-5> {& \max \limits_{\thetav, \theta_0} & \gamma} +% \only<6->{& \min\limits_{\thetav, \theta_0} \quad & \frac{1}{2} \|\thetav\|^2} \\ % \only<1>{& \text{s.t.} & \,\, d(f, \xi) \geq \gamma \quad \forall\, i \in \nset} -% \only<2>{& \text{s.t.} & \,\, \frac{\yi \left( \scp{\thetab}{\xi} + \theta_0 \right)}{\|\thetab\|} \geq \gamma \quad \forall\, i \in \nset} \\ -% \only<3-4> {& \text{s.t.} & \,\,\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq \|\thetab\| \gamma \quad \forall\, i \in \nset} \\ -% \only<5-> {& \text{s.t.} & \,\,\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset} +% \only<2>{& \text{s.t.} & \,\, \frac{\yi \left( \scp{\thetav}{\xi} + \theta_0 \right)}{\|\thetav\|} \geq \gamma \quad \forall\, i \in \nset} \\ +% \only<3-4> {& \text{s.t.} & \,\,\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq \|\thetav\| \gamma \quad \forall\, i \in \nset} \\ +% \only<5-> {& \text{s.t.} & \,\,\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset} % \end{eqnarray*} % \vspace{-1em} % \pause % % \begin{itemize} -% % \item<1-> We can get rid of the $\|\thetab\|=1$ constraint by dividing the inequality constraint by $\|\thetab\|$ +% % \item<1-> We can get rid of the $\|\thetav\|=1$ constraint by dividing the inequality constraint by $\|\thetav\|$ % \pause -% \item<3-> The inequality is rearranged by multiplying both sides with $\|\thetab\|$ +% \item<3-> The inequality is rearranged by multiplying both sides with $\|\thetav\|$ % \pause -% \item<4-> Remember: As we assume linear separability, any positively scaled $\thetab, \theta_0$ satisfies the constraint, too \\ -% \item<5-> We substitute $\|\thetab\| = \frac{1}{\gamma} \Leftrightarrow \gamma = 1/\|\thetab\|$ -% \item<6-> Maximizing $\gamma$ is the same as minimizing $\|\thetab\|$ which is the same as minimizing $\frac{1}{2}\|\thetab\|^2$ +% \item<4-> Remember: As we assume linear separability, any positively scaled $\thetav, \theta_0$ satisfies the constraint, too \\ +% \item<5-> We substitute $\|\thetav\| = \frac{1}{\gamma} \Leftrightarrow \gamma = 1/\|\thetav\|$ +% \item<6-> Maximizing $\gamma$ is the same as minimizing $\|\thetav\|$ which is the same as minimizing $\frac{1}{2}\|\thetav\|^2$ % % \item There are efficient ``off-the-shelf'' algorithms for solving % % such problems. 
% \end{itemize} @@ -281,13 +281,13 @@ % % Now we will reformulate the optimization problem: % \begin{eqnarray*} -% \only<1> {& \max \limits_{\thetab, \theta_0} & \gamma} -% \only<2-3> {& \min\limits_{\thetab, \theta_0} & 1 / (2 \gamma^2)} -% \only<4->{& \min\limits_{\thetab, \theta_0} \quad & \frac{1}{2} \|\thetab\|^2} \\ -% \only<1-2>{& \text{s.t.} & \,\, \yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq \gamma \quad \forall\, i \in \nset} -% \only<3->{& \text{s.t.} & \,\, \yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset} \\ -% \only<-2>{& \quad & \|\thetab\| = 1} -% \only<3>{& \quad & \|\thetab\| = 1/\gamma} +% \only<1> {& \max \limits_{\thetav, \theta_0} & \gamma} +% \only<2-3> {& \min\limits_{\thetav, \theta_0} & 1 / (2 \gamma^2)} +% \only<4->{& \min\limits_{\thetav, \theta_0} \quad & \frac{1}{2} \|\thetav\|^2} \\ +% \only<1-2>{& \text{s.t.} & \,\, \yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq \gamma \quad \forall\, i \in \nset} +% \only<3->{& \text{s.t.} & \,\, \yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset} \\ +% \only<-2>{& \quad & \|\thetav\| = 1} +% \only<3>{& \quad & \|\thetav\| = 1/\gamma} % \only<4->{& \enspace & \enspace} % \end{eqnarray*} % \vspace{-1em} @@ -295,11 +295,11 @@ % \begin{itemize} % \item Maximizing $\gamma$ is the same as minimizing $1 / (2 \gamma^2)$. % \pause -% \item The solution $(\thetab, \theta_0)$ can be scaled without changing the classifier. We scale it with the factor $1 / \gamma$. +% \item The solution $(\thetav, \theta_0)$ can be scaled without changing the classifier. We scale it with the factor $1 / \gamma$. % \pause % \item The second constraint can be used to eliminate $\gamma$ from % the optimization problem. -% It still holds $\gamma = 1 / \|\thetab\|$. +% It still holds $\gamma = 1 / \|\thetav\|$. % \pause % \item This turns out to be a convex optimization problem. % This particular form is called a \textbf{quadratic program}: @@ -330,7 +330,7 @@ $\yi \fxi = 1$, fulfilling the inequality constraints with equality. \item They are called \textbf{support vectors (SVs)}. They are located exactly at - a distance of $\gamma = 1 / \|\thetab\|$ from the separating hyperplane. + a distance of $\gamma = 1 / \|\thetav\|$ from the separating hyperplane. \item It is already geometrically obvious that the solution does not depend on the non-SVs! We could delete them from the data and would arrive at the same solution. \vspace{0.5cm} @@ -373,7 +373,7 @@ % It can be shown that the solution of a hard-margin support vector machine can be written as follows: % % $$ -% \thetah = \sumin \alpha_i \yi \xi \quad \text{ and }\quad \theta_0 = y^{(i^\star)} - \scp{\thetab}{\xv^{(i^\star)}}, +% \thetah = \sumin \alpha_i \yi \xi \quad \text{ and }\quad \theta_0 = y^{(i^\star)} - \scp{\thetav}{\xv^{(i^\star)}}, % $$ % % where $(\xv^{(i^\star)}, y^{(i^\star)})$ can be any support vector. 
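[Illustrative aside, not part of the patch: a plain subgradient-descent sketch for the regularized hinge-loss (ERM) form of the soft-margin SVM from the slides-linsvm-erm hunk further above. Lambda, the learning rate, the iteration count and the toy data are arbitrary assumptions; averaging the loss by 1/n instead of using the constant C only rescales the regularization constant.]

import numpy as np

rng = np.random.default_rng(2)
n, p = 200, 2
X = np.vstack([rng.normal(loc=+1.0, size=(n // 2, p)),
               rng.normal(loc=-1.0, size=(n // 2, p))])
y = np.hstack([np.ones(n // 2), -np.ones(n // 2)])

lam, lr = 0.01, 0.1
theta, theta0 = np.zeros(p), 0.0
for _ in range(500):                                         # full-batch subgradient descent
    f = X @ theta + theta0
    active = (y * f < 1).astype(float)                       # margin violations: hinge loss is active there
    g_theta = lam * theta - (active * y) @ X / n             # subgradient of lam/2 ||theta||^2 + mean hinge loss
    g_theta0 = -(active * y).mean()
    theta -= lr * g_theta
    theta0 -= lr * g_theta0

print(theta, theta0, np.mean(np.sign(X @ theta + theta0) == y))   # learned params and training accuracy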
diff --git a/slides/linear-svm/slides-linsvm-optimization.tex b/slides/linear-svm/slides-linsvm-optimization.tex index 80a8fa41..a7cf2c49 100644 --- a/slides/linear-svm/slides-linsvm-optimization.tex +++ b/slides/linear-svm/slides-linsvm-optimization.tex @@ -39,9 +39,9 @@ Unconstrained formulation of soft-margin SVM: % \begin{eqnarray*} $$ -\min\limits_{\thetab, \theta_0} \quad \frac{\lambda}{2} \|\thetab\|^2 + \sumin \Lxyit +\min\limits_{\thetav, \theta_0} \quad \frac{\lambda}{2} \|\thetav\|^2 + \sumin \Lxyit $$ -where $\Lyf = \max(0, 1 - y f)$ and $\fxt = \thetab^T \xv + \theta_0$.\\ +where $\Lyf = \max(0, 1 - y f)$ and $\fxt = \thetav^T \xv + \theta_0$.\\ (We inconsequentially changed the regularization constant.) \vspace*{2mm} @@ -84,9 +84,9 @@ Approximate the risk by a stochastic 1-sample version: \vspace{-0.3cm} -$$ \frac{\lambda}{2} \|\thetab\|^2 + \Lxyit $$ -With: $\fxt = \thetab^T \xv + \theta_0$ and $\Lyf = \max(0, 1 - y f)$\\ -The subgradient for $\thetab$ is $\lambda \thetab - \yi \xi \I_{yf < 1}$ +$$ \frac{\lambda}{2} \|\thetav\|^2 + \Lxyit $$ +With: $\fxt = \thetav^T \xv + \theta_0$ and $\Lyf = \max(0, 1 - y f)$\\ +The subgradient for $\thetav$ is $\lambda \thetav - \yi \xi \I_{yf < 1}$ \vspace{-0.1cm} diff --git a/slides/linear-svm/slides-linsvm-soft-margin.tex b/slides/linear-svm/slides-linsvm-soft-margin.tex index 1f7ce532..a7b61f29 100644 --- a/slides/linear-svm/slides-linsvm-soft-margin.tex +++ b/slides/linear-svm/slides-linsvm-soft-margin.tex @@ -42,7 +42,7 @@ \item We still want a large margin for most of the examples. \item We allow violations of the margin constraints via slack vars $\sli \geq 0$ $$ - \yi \left( \scp{\thetab}{\xi} + \thetab_0 \right) \geq 1 - \sli + \yi \left( \scp{\thetav}{\xi} + \thetav_0 \right) \geq 1 - \sli $$ \item Even for separable data, a decision boundary with a few violations and a large average margin may be preferable to one without any violations and a small average margin. \end{itemize} @@ -75,7 +75,7 @@ \end{enumerate} \item Let's minimize a weighted sum of them: $ - \frac{1}{2} \|\thetab\|^2 + C \sum_{i=1}^n \sli + \frac{1}{2} \|\thetav\|^2 + C \sum_{i=1}^n \sli $ \item Constant $C > 0$ controls the relative importance of the two parts. 
% \item It represents the relative weight assigned to either having a large @@ -94,8 +94,8 @@ The linear \textbf{soft-margin} SVM is the convex quadratic program: \begin{eqnarray*} - & \min\limits_{\thetab, \thetab_0,\sli} & \frac{1}{2} \|\thetab\|^2 + C \sum_{i=1}^n \sli \\ - & \text{s.t.} & \,\, \yi \left( \scp{\thetab}{\xi} + \thetab_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ + & \min\limits_{\thetav, \thetav_0,\sli} & \frac{1}{2} \|\thetav\|^2 + C \sum_{i=1}^n \sli \\ + & \text{s.t.} & \,\, \yi \left( \scp{\thetav}{\xi} + \thetav_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ & \text{and} & \,\, \sli \geq 0 \quad \forall\, i \in \nset.\\ \end{eqnarray*} @@ -116,17 +116,17 @@ \small The Lagrange function of the soft-margin SVM is given by: \begin{align*} - \mathcal{L}(\thetab, \theta_0, \bm{\sl}, \bm{\alpha}, \bm{\mu}) &= \frac{1}{2}\left\Vert\thetab\right\Vert^2_2 + C\sumin\sli - \sumin \alpha_i\left(\yi \left( \scp{\thetab}{\xi} + \thetab_0 \right) -1 + \sli\right) \\ + \mathcal{L}(\thetav, \theta_0, \bm{\sl}, \bm{\alpha}, \bm{\mu}) &= \frac{1}{2}\left\Vert\thetav\right\Vert^2_2 + C\sumin\sli - \sumin \alpha_i\left(\yi \left( \scp{\thetav}{\xi} + \thetav_0 \right) -1 + \sli\right) \\ & \quad - \sumin \mu_i\sli \quad\text{ with Lagrange multipliers $\bm{\alpha}$ and $\bm{\mu}.$} \end{align*} The KKT conditions for $i=1,\dots, n$ are: \begin{eqnarray*} \alpha_i \geq 0, &\quad\quad&\mu_i \geq 0, \\ - \yi \left( \scp{\thetab}{\xi} + \thetab_0 \right) -1 + \sli \geq 0, && \sli \geq 0, \\ - \alpha_i\left(\yi \left( \scp{\thetab}{\xi} + \thetab_0 \right) -1 + \sli\right) = 0, && \sli\mu_i = 0. + \yi \left( \scp{\thetav}{\xi} + \thetav_0 \right) -1 + \sli \geq 0, && \sli \geq 0, \\ + \alpha_i\left(\yi \left( \scp{\thetav}{\xi} + \thetav_0 \right) -1 + \sli\right) = 0, && \sli\mu_i = 0. \end{eqnarray*} With these, we derive (see our optimization course) that \\ -$\thetab = \sumin \alpha_i\yi\xi, \quad 0 = \sumin\alpha_i\yi, \quad \alpha_i = C - \mu_i\quad \forall i=1,\dots, n.$ +$\thetav = \sumin \alpha_i\yi\xi, \quad 0 = \sumin\alpha_i\yi, \quad \alpha_i = C - \mu_i\quad \forall i=1,\dots, n.$ \end{vbframe} \begin{vbframe}{Soft-margin SVM dual form} diff --git a/slides/multiclass/slides-mc-losses.tex b/slides/multiclass/slides-mc-losses.tex index 5ea2ebc5..09a208fb 100644 --- a/slides/multiclass/slides-mc-losses.tex +++ b/slides/multiclass/slides-mc-losses.tex @@ -129,7 +129,7 @@ \section{MC Brier Score} \vspace*{-0.5cm} \begin{footnotesize} \begin{eqnarray*} - \thetab = \argminlim_{\thetab \in \R^g, \sum \theta_k = 1} \risket \quad \text{ with } \quad \risket = \left(\sumin \sum_{k = 1}^g \left(\mathds{1}_{\{\yi = k\}} - \theta_k\right)^2\right) + \thetav = \argminlim_{\thetav \in \R^g, \sum \theta_k = 1} \risket \quad \text{ with } \quad \risket = \left(\sumin \sum_{k = 1}^g \left(\mathds{1}_{\{\yi = k\}} - \theta_k\right)^2\right) \end{eqnarray*} \end{footnotesize} We solve this by setting the derivative w.r.t. 
$\theta_k$ to 0 diff --git a/slides/multiclass/slides-mc-softmax-regression.tex b/slides/multiclass/slides-mc-softmax-regression.tex index 4213deba..4649b5ac 100644 --- a/slides/multiclass/slides-mc-softmax-regression.tex +++ b/slides/multiclass/slides-mc-softmax-regression.tex @@ -25,7 +25,7 @@ \vspace*{-0.3cm} \begin{eqnarray*} - \Hspace = \left\{\pi: \Xspace \to \R ~|~\pix = s(\thetab^\top \xv)\right\}\,, + \Hspace = \left\{\pi: \Xspace \to \R ~|~\pix = s(\thetav^\top \xv)\right\}\,, \end{eqnarray*} with the Bernoulli (logarithmic) loss: @@ -37,7 +37,7 @@ \vfill \begin{footnotesize} - \textbf{Remark:} We suppress the intercept term for better readability. The intercept term can be easily included via $\thetab^\top \tilde\xv$, $\thetab \in \R^{p + 1}$, $\tilde\xv = (1, \xv)$. + \textbf{Remark:} We suppress the intercept term for better readability. The intercept term can be easily included via $\thetav^\top \tilde\xv$, $\thetav \in \R^{p + 1}$, $\tilde\xv = (1, \xv)$. \end{footnotesize} \end{vbframe} @@ -49,13 +49,13 @@ \begin{itemize} \item Instead of a single linear discriminant function we have $g$ linear discriminant functions $$ - f_k(\xv) = \thetab_k^\top \xv, \quad k = 1, 2, ..., g, + f_k(\xv) = \thetav_k^\top \xv, \quad k = 1, 2, ..., g, $$ each indicating the confidence in class $k$. \item The $g$ score functions are transformed into $g$ probability functions by the \textbf{softmax} function $s:\R^g \to \R^g$ $$ - \pi_k(\xv) = s(\fx)_k = \frac{\exp(\thetab_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetab_j^\top \xv) }\,, + \pi_k(\xv) = s(\fx)_k = \frac{\exp(\thetav_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetav_j^\top \xv) }\,, $$ instead of the \textbf{logistic} function for $g = 2$. The probabilities are well-defined: $\sum \pi_k(\xv) = 1$ and $\pi_k(\xv) \in [0, 1]$ for all $k$. @@ -70,8 +70,8 @@ \item Furthermore, it is invariant to constant offsets in the input: \end{itemize} $$ - s(\fx + \mathbf{c}) = \frac{\exp(\thetab_k^\top \xv + c)}{\sum_{j = 1}^g \exp(\thetab_j^\top \xv + c)} = - \frac{\exp(\thetab_k^\top \xv)\cdot \exp(c)}{\sum_{j = 1}^g \exp(\thetab_j^\top \xv) \cdot \exp(c)} = + s(\fx + \mathbf{c}) = \frac{\exp(\thetav_k^\top \xv + c)}{\sum_{j = 1}^g \exp(\thetav_j^\top \xv + c)} = + \frac{\exp(\thetav_k^\top \xv)\cdot \exp(c)}{\sum_{j = 1}^g \exp(\thetav_j^\top \xv) \cdot \exp(c)} = s(\fx) $$ @@ -87,8 +87,8 @@ \begin{tabular}{ccc} & Logistic Regression & Softmax Regression \\ \hline $\Yspace$ & $\{0, 1\}$ & $\{1, 2, ..., g\}$ \\[0.5cm] -Discriminant fun. & $f(\xv) = \thetab^\top \xv$ & $f_k(\xv) = \thetab_{k}^{\top} \xv, k = 1, 2, ..., g$ \\[0.5cm] -Probabilities & $\pi(\xv) = \frac{1}{1 + \exp\left(-\thetab^\top \xv\right)}$ & $\pi_k(\xv) = \frac{\exp(\thetab_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetab_j^\top \xv) }$ \\[0.5cm] +Discriminant fun. & $f(\xv) = \thetav^\top \xv$ & $f_k(\xv) = \thetav_{k}^{\top} \xv, k = 1, 2, ..., g$ \\[0.5cm] +Probabilities & $\pi(\xv) = \frac{1}{1 + \exp\left(-\thetav^\top \xv\right)}$ & $\pi_k(\xv) = \frac{\exp(\thetav_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetav_j^\top \xv) }$ \\[0.5cm] $L(y, \pix)$ & Bernoulli / logarithmic loss & Multiclass logarithmic loss\\[-0.3cm] & $-y \log \left(\pix\right) - (1 - y) \log \left(1 - \pix\right)$ & $ - \sum_{k = 1}^g [y = k] \log\left(\pi_k(\xv)\right)$ \\ \end{tabular} @@ -126,11 +126,11 @@ % \item For linear $\fxt = \theta^T \xv$ this is also called \emph{softmax regression}. \item Softmax regression has an unusual property in that it has a \enquote{redundant} set of parameters. 
If we subtract a fixed vector - from all $\thetab_k$, the predictions do not change at all. + from all $\thetav_k$, the predictions do not change at all. Hence, our model is \enquote{over-parameterized}. For any hypothesis we might fit, there are multiple parameter vectors that give rise to exactly the same hypothesis function. This also implies that the minimizer of $\risket$ above is not unique! - Hence, a numerical trick is to set $\thetab_g = 0$ and only optimize the other $\thetab_k$. This does not restrict our hypothesis space, but the constrained problem is now convex, i.e., there exists exactly one parameter vector for every hypothesis. + Hence, a numerical trick is to set $\thetav_g = 0$ and only optimize the other $\thetav_k$. This does not restrict our hypothesis space, but the constrained problem is now convex, i.e., there exists exactly one parameter vector for every hypothesis. \item A similar approach is used in many ML models: multiclass LDA, naive Bayes, neural networks and boosting. @@ -149,7 +149,7 @@ \item a rank-preserving function, i.e. the ranks among the elements of the vector $\bm{z}$ are the same as among the elements of $s(\bm{z})$. This is because softmax transforms all scores by taking the $\exp(\cdot)$ (rank-preserving) and divides each element by \textbf{the same} normalizing constant. \end{itemize} -Thus, the softmax function has a unique inverse function $s^{-1}: \R^g \to \R^g$ that is also monotonic and rank-preserving. Applying $s_k^{-1}$ to $\pi_k(\xv) = \frac{\exp(\thetab_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetab_j^\top \xv)}$ gives us $f_k(\xv) = \thetab_k^\top \xv$. Thus, softmax regression is a linear classifier. +Thus, the softmax function has a unique inverse function $s^{-1}: \R^g \to \R^g$ that is also monotonic and rank-preserving. Applying $s_k^{-1}$ to $\pi_k(\xv) = \frac{\exp(\thetav_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetav_j^\top \xv)}$ gives us $f_k(\xv) = \thetav_k^\top \xv$. Thus, softmax regression is a linear classifier. \end{itemize} \end{vbframe} diff --git a/slides/nonlinear-svm/slides-nonlinsvm-rkhs-repr.tex b/slides/nonlinear-svm/slides-nonlinsvm-rkhs-repr.tex index f0041336..a54d6ceb 100644 --- a/slides/nonlinear-svm/slides-nonlinsvm-rkhs-repr.tex +++ b/slides/nonlinear-svm/slides-nonlinsvm-rkhs-repr.tex @@ -139,8 +139,8 @@ \vspace*{-0.5cm} \begin{eqnarray*} - & \min\limits_{\thetab, \theta_0,\sli} & \frac{1}{2} \thetab^\top \thetab + C \sum_{i=1}^n \sli \\ - & \text{s.t.} & \,\, \yi \left( \scp{\thetab}{\phi\left(\xi\right)} + \theta_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ + & \min\limits_{\thetav, \theta_0,\sli} & \frac{1}{2} \thetav^\top \thetav + C \sum_{i=1}^n \sli \\ + & \text{s.t.} & \,\, \yi \left( \scp{\thetav}{\phi\left(\xi\right)} + \theta_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\ & \text{and} & \,\, \sli \geq 0 \quad \forall\, i \in \nset\\ \end{eqnarray*} @@ -149,7 +149,7 @@ can be written as \begin{eqnarray*} - \thetab &=& \sum_{j = 1}^n \beta_j \phi\left(\xv^{(j)}\right) + \thetav &=& \sum_{j = 1}^n \beta_j \phi\left(\xv^{(j)}\right) \end{eqnarray*} for $\beta_j \in \R$. 
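A minimal R sketch of the point made above: once $\thetav = \sum_{j} \beta_j \phi\left(\xv^{(j)}\right)$, the decision function $f(\xv) = \scp{\thetav}{\phi(\xv)} + \theta_0$ needs only kernel evaluations $k\left(\xv^{(j)}, \xv\right)$. The toy data, the expansion coefficients $\beta_j$, the intercept, and the RBF kernel are assumptions for illustration, not values from the slides.

<<svm-kernel-decision>>=
# Sketch: with theta = sum_j beta_j * phi(x^(j)), the decision function
# f(x) = <theta, phi(x)> + theta_0 reduces to a sum of kernel evaluations.
rbf_kernel <- function(x, z, sigma = 1) exp(-sum((x - z)^2) / (2 * sigma^2))

svm_decision <- function(x, X, beta, theta0, kernel = rbf_kernel) {
  k_vals <- vapply(seq_len(nrow(X)), function(j) kernel(X[j, ], x), numeric(1))
  sum(beta * k_vals) + theta0   # f(x) = sum_j beta_j k(x^(j), x) + theta_0
}

set.seed(1)
X      <- matrix(rnorm(10 * 2), nrow = 10)  # 10 toy training inputs in R^2
beta   <- rnorm(10)                         # hypothetical expansion coefficients
theta0 <- 0.5                               # hypothetical intercept
svm_decision(c(0, 0), X, beta, theta0)      # score for a new point
@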
@@ -159,29 +159,29 @@ \begin{vbframe}{Representer Theorem} \textbf{Theorem} (Representer Theorem):\\ - The solution $\thetab, \theta_0$ of the support vector machine optimization problem fulfills $\thetab \in V = \spn\big\{\phi\left(\xv^{(1)}\right), \dots, \phi\left(\xv^{(n)}\right)\big\}$.\\ + The solution $\thetav, \theta_0$ of the support vector machine optimization problem fulfills $\thetav \in V = \spn\big\{\phi\left(\xv^{(1)}\right), \dots, \phi\left(\xv^{(n)}\right)\big\}$.\\ \vspace*{0.2cm} \begin{footnotesize} \textbf{Proof:} Let $V^\perp$ denote the space orthogonal to $V$, - so that $\HS = V \oplus V^\perp$. The vector $\thetab$ has a + so that $\HS = V \oplus V^\perp$. The vector $\thetav$ has a unique decomposition into components $\bm{v} \in V$ and $\bm{v} ^\perp \in V^\perp$, - so that $\bm{v} + \bm{v} ^\perp = \thetab$.\\[0.5em] + so that $\bm{v} + \bm{v} ^\perp = \thetav$.\\[0.5em] - The regularizer becomes $\|\thetab\|^2 = \|\bm{v} \|^2 + \|\bm{v} ^\perp\|^2$. - The constraints $\yi \left( \scp{\thetab}{\phi\left(\xi\right)} + \theta_0\right) \geq 1 - \sli$ + The regularizer becomes $\|\thetav\|^2 = \|\bm{v} \|^2 + \|\bm{v} ^\perp\|^2$. + The constraints $\yi \left( \scp{\thetav}{\phi\left(\xi\right)} + \theta_0\right) \geq 1 - \sli$ do not depend on $\bm{v} ^\perp$ at all: %, since $v^\perp$ is orthogonal to all $k\left(\xi, \cdot\right)$: $$ - \scp{\thetab}{\phi\left(\xi\right)} = \scp{\bm{v} }{\phi\left(\xi\right)} + \underbrace{\scp{\bm{v}^\perp}{\phi\left(\xi \right)}}_{= 0} + \scp{\thetav}{\phi\left(\xi\right)} = \scp{\bm{v} }{\phi\left(\xi\right)} + \underbrace{\scp{\bm{v}^\perp}{\phi\left(\xi \right)}}_{= 0} \enspace ~ \forall i \in \{1, 2, ..., n\}. $$ Thus, we have two independent optimization problems, namely the standard SVM problem for $v$ and the unconstrained minimization problem of $\|v^\perp\|^2$ for $v^\perp$, with obvious solution - $v^\perp = 0$. Thus, $\thetab = v \in V$. + $v^\perp = 0$. Thus, $\thetav = v \in V$. \end{footnotesize} \framebreak @@ -194,9 +194,9 @@ training set. \item More explicitly, we can assume the form \begin{footnotesize} - $$ \thetab = \sum_{j=1}^n \beta_j \cdot \phi\left(\xv^{(j)}\right) $$ + $$ \thetav = \sum_{j=1}^n \beta_j \cdot \phi\left(\xv^{(j)}\right) $$ \end{footnotesize} - for the weight vector $\thetab\in \HS$. + for the weight vector $\thetav\in \HS$. 
\item The SVM prediction on $\xv \in \Xspace$ can be computed as \begin{footnotesize} $$ diff --git a/slides/regularization/slides-bias-var-ridge.tex b/slides/regularization/slides-bias-var-ridge.tex index 29c06803..7b217338 100644 --- a/slides/regularization/slides-bias-var-ridge.tex +++ b/slides/regularization/slides-bias-var-ridge.tex @@ -21,7 +21,7 @@ \begin{vbframe}{Bias-Variance Decomposition for Ridge} -For a linear model $\yv = \Xmat \thetab + \bm{\varepsilon}$ with fixed design $\Xmat \in \mathbb{R}^{n \times p}\,\text{and}\,\bm{\varepsilon} \sim (\bm{0},\sigma^2 \bm{I}_n)$, bias of ridge estimator $\thetah_{\text{ridge}}$ is given by +For a linear model $\yv = \Xmat \thetav + \bm{\varepsilon}$ with fixed design $\Xmat \in \mathbb{R}^{n \times p}\,\text{and}\,\bm{\varepsilon} \sim (\bm{0},\sigma^2 \bm{I}_n)$, bias of ridge estimator $\thetah_{\text{ridge}}$ is given by \begin{equation*} \begin{aligned} \text{Bias}(\thetah_{\text{ridge}}) := \mathbb{E}[\thetah_{\text{ridge}}-\bm{\theta}] &= \mathbb{E}[(\bm{X}^{\top}\bm{X} +\lambda\bm{I}_p )^{-1} \bm{X}^{\top}\bm{y}] - \bm{\theta}\\ @@ -70,15 +70,15 @@ \begin{vbframe}{Bias-Variance in Predictions for ridge} -In supervised learning, our goal is typically not to learn an unknown parameter $\thetab$, but to learn a function $f(\xv)$ that can predict $y$ given $\xv$. +In supervised learning, our goal is typically not to learn an unknown parameter $\thetav$, but to learn a function $f(\xv)$ that can predict $y$ given $\xv$. -%Assume that our targets are generated by $y=f(\xv)+\varepsilon$ where $\varepsilon \sim (0,\sigma^2)$ and $f(\xv)=\thetab^{\top}\xv$. +%Assume that our targets are generated by $y=f(\xv)+\varepsilon$ where $\varepsilon \sim (0,\sigma^2)$ and $f(\xv)=\thetav^{\top}\xv$. \vspace{0.2cm} The bias and variance of predictions $\hat{f}:=\fh(\xv)=\thetah_{\text{ridge}}^{\top} \xv$ is obtained as: \begin{align*} -\text{Bias}(\fh)&=\E[\hat{f}-f]=\E[\thetah_{\text{ridge}}^{\top} \xv - \thetab^{\top} \xv]=\E[\thetah_{\text{ridge}}-\thetab]^{\top} \xv\\ +\text{Bias}(\fh)&=\E[\hat{f}-f]=\E[\thetah_{\text{ridge}}^{\top} \xv - \thetav^{\top} \xv]=\E[\thetah_{\text{ridge}}-\thetav]^{\top} \xv\\ &= \text{Bias}(\thetah_{\text{ridge}})^{\top} \xv \\ \text{Var}(\fh)&=\text{Var}(\thetah_{\text{ridge}}^{\top} \xv) = \xv^{\top} \text{Var}(\thetah_{\text{ridge}}) \xv \end{align*} diff --git a/slides/regularization/slides-regu-bayes.tex b/slides/regularization/slides-regu-bayes.tex index 8fdf7714..11e30d74 100644 --- a/slides/regularization/slides-regu-bayes.tex +++ b/slides/regularization/slides-regu-bayes.tex @@ -33,31 +33,31 @@ \lz -Assume we have a parameterized distribution $p(y | \thetab, \xv)$ for our data and -a prior $q(\thetab)$ over our param space, all in Bayesian framework. +Assume we have a parameterized distribution $p(y | \thetav, \xv)$ for our data and +a prior $q(\thetav)$ over our param space, all in Bayesian framework. \lz \lz From Bayes theorem: $$ -p(\thetab | \xv, y) = \frac{p(y | \thetab, \xv) q(\thetab) }{p(y | \xv)} \propto -p(y | \thetab, \xv) q(\thetab) +p(\thetav | \xv, y) = \frac{p(y | \thetav, \xv) q(\thetav) }{p(y | \xv)} \propto +p(y | \thetav, \xv) q(\thetav) $$ \framebreak -The maximum a posteriori (MAP) estimator of $\thetab$ is now the minimizer of +The maximum a posteriori (MAP) estimator of $\thetav$ is now the minimizer of $$ -- \log p\left(y ~|~ \thetab, \xv\right) - \log q(\thetab). +- \log p\left(y ~|~ \thetav, \xv\right) - \log q(\thetav). 
$$ \begin{itemize} - \item Again, we identify the loss $\Lxyt$ with $-\log(p(y | \thetab, \xv))$. - \item If $q(\thetab)$ is constant (i.e., we used a uniform, non-informative + \item Again, we identify the loss $\Lxyt$ with $-\log(p(y | \thetav, \xv))$. + \item If $q(\thetav)$ is constant (i.e., we used a uniform, non-informative prior), the second term is irrelevant and we arrive at ERM. - \item If not, we can identify $J(\thetab) \propto -\log(q(\thetab))$, i.e., + \item If not, we can identify $J(\thetav) \propto -\log(q(\thetav))$, i.e., the log-prior corresponds to the regularizer, and the additional $\lambda$, which controls the strength of our penalty, usually influences the peakedness / inverse variance / strength of our prior. \end{itemize} @@ -97,27 +97,27 @@ \begin{itemize} \small - \item Gaussian prior $\mathcal{N}_d(\bm{0}, \mathit{diag}(\tau^2))$ with uncorrelated components for $\thetab$: + \item Gaussian prior $\mathcal{N}_d(\bm{0}, \mathit{diag}(\tau^2))$ with uncorrelated components for $\thetav$: \begin{footnotesize} - $$q(\thetab) = \prod_{j = 1}^d \phi_{0,\tau^2}(\theta_j) + $$q(\thetav) = \prod_{j = 1}^d \phi_{0,\tau^2}(\theta_j) = (2\pi\tau^2)^{-\frac{d}{2}} \exp \left( - \frac{1}{2\tau^2} \sum_{j = 1}^d \theta_j^2 \right)$$ \end{footnotesize} \item MAP: \begin{footnotesize} \begin{eqnarray*} - \thetah^{\text{MAP}} &=& \argmin_{\thetab} \left( - - \log p\left(y ~|~ \thetab, \xv \right) - \log q(\thetab) + \thetah^{\text{MAP}} &=& \argmin_{\thetav} \left( + - \log p\left(y ~|~ \thetav, \xv \right) - \log q(\thetav) \right) \\ - &=& \argmin_{\thetab} \left( - - \log p\left(y ~|~ \thetab, \xv \right) + \tfrac{d}{2} \log(2\pi \tau^2) + + &=& \argmin_{\thetav} \left( + - \log p\left(y ~|~ \thetav, \xv \right) + \tfrac{d}{2} \log(2\pi \tau^2) + \frac{1}{2\tau^2} \sum_{j = 1}^d \theta_j^2 \right) \\ - % &=& \argmin_{\thetab} \left( - % - \log p\left(\xv ~|~ \thetab\right) + \frac{1}{2\tau^2} {\thetab}^T\thetab + % &=& \argmin_{\thetav} \left( + % - \log p\left(\xv ~|~ \thetav\right) + \frac{1}{2\tau^2} {\thetav}^T\thetav % \right) \\ - &=& \argmin_{\thetab} \left( - - \log p\left(y ~|~ \thetab, \xv \right) + \frac{1}{2\tau^2} \| \thetab \|_2^2 + &=& \argmin_{\thetav} \left( + - \log p\left(y ~|~ \thetav, \xv \right) + \frac{1}{2\tau^2} \| \thetav \|_2^2 \right) \end{eqnarray*} \end{footnotesize} @@ -148,49 +148,49 @@ % Gaussian errors $\epsilon^{(i)} \sim \mathcal{N}(0, \sigma^2) ~ \forall i \in % \setn$, $\sigma > 0$, is also Gaussian: % \begin{footnotesize} - % $$- \log p\left(\ydat ~|~ \xv, \thetab\right) = \frac{n}{2} \log (2\pi + % $$- \log p\left(\ydat ~|~ \xv, \thetav\right) = \frac{n}{2} \log (2\pi % \sigma^2) + \frac{1}{2\sigma^2} \sumin \left(\yi - \fxit \right)^2.$$ % \end{footnotesize} -% &=& \argmin_{\thetab} \left( +% &=& \argmin_{\thetav} \left( % \tfrac{n}{2} \log (2\pi \sigma^2) + \frac{1}{2\sigma^2} \sumin \left(\yi - % \fxit \right)^2 + \tfrac{p}{2} \log(2\pi \tau^2) + % \frac{1}{2\tau^2} \sumjp \theta_j^2 % \right) \\ -% &=& \argmin_{\thetab} \left( \frac{1}{\sigma^2} \sumin \left(\yi - +% &=& \argmin_{\thetav} \left( \frac{1}{\sigma^2} \sumin \left(\yi - % \fxit \right)^2 + \frac{1}{\tau^2} \sumjp \theta_j^2 \right) \\ -% &=& \argmin_{\thetab} \left( -% \frac{1}{\sigma^2} \left(\ydat - \Xmat \thetab\right)^\top \left(\ydat - \Xmat -% \thetab\right) + \frac{1}{\tau^2} {\thetab}^T\thetab +% &=& \argmin_{\thetav} \left( +% \frac{1}{\sigma^2} \left(\ydat - \Xmat \thetav\right)^\top \left(\ydat - \Xmat +% \thetav\right) + \frac{1}{\tau^2} {\thetav}^T\thetav 
% \right) % \begin{eqnarray*} -% \thetah^{\text{MAP}} &=& argmin_{\thetab} \left( -% - \log p\left(\ydat ~|~ \xv, \thetab\right) - \log q(\thetab) +% \thetah^{\text{MAP}} &=& argmin_{\thetav} \left( +% - \log p\left(\ydat ~|~ \xv, \thetav\right) - \log q(\thetav) % \right) \\ % -% &=& \argmin_{\thetab} \left( +% &=& \argmin_{\thetav} \left( % \tfrac{n}{2} \log (2\pi \sigma^2) + \frac{1}{2\sigma^2} \sumin \left(\yi - % \fxit \right)^2 + \tfrac{p}{2} \log(2\pi \tau^2) + % \frac{1}{2\tau^2} \sumjp \theta_j^2 % \right) \\ -% &=& \argmin_{\thetab} \left( \frac{1}{\sigma^2} \sumin \left(\yi - +% &=& \argmin_{\thetav} \left( \frac{1}{\sigma^2} \sumin \left(\yi - % \fxit \right)^2 + \frac{1}{\tau^2} \sumjp \theta_j^2 \right) \\ -% &=& \argmin_{\thetab} \left( -% \frac{1}{\sigma^2} \left(\ydat - \Xmat \thetab\right)^\top \left(\ydat - \Xmat -% \thetab\right) + \frac{1}{\tau^2} {\thetab}^T\thetab +% &=& \argmin_{\thetav} \left( +% \frac{1}{\sigma^2} \left(\ydat - \Xmat \thetav\right)^\top \left(\ydat - \Xmat +% \thetav\right) + \frac{1}{\tau^2} {\thetav}^T\thetav % \right) % -% &=& \argmin_{\thetab} \left( +% &=& \argmin_{\thetav} \left( % \tfrac{n}{2} \log (2\pi \sigma^2) + \frac{1}{2\sigma^2} \sumin \left(\yi - % \fxit \right)^2 + \tfrac{p}{2} \log(2\pi \tau^2) + % \frac{1}{2\tau^2} \sumjp \theta_j^2 % \right) \\ -% &=& \argmin_{\thetab} \left( \frac{1}{\sigma^2} \sumin \left(\yi - +% &=& \argmin_{\thetav} \left( \frac{1}{\sigma^2} \sumin \left(\yi - % \fxit \right)^2 + \frac{1}{\tau^2} \sumjp \theta_j^2 \right) \\ -% &=& \argmin_{\thetab} \left( -% \frac{1}{\sigma^2} \left(\ydat - \Xmat \thetab\right)^\top \left(\ydat - \Xmat -% \thetab\right) + \frac{1}{\tau^2} {\thetab}^T\thetab +% &=& \argmin_{\thetav} \left( +% \frac{1}{\sigma^2} \left(\ydat - \Xmat \thetav\right)^\top \left(\ydat - \Xmat +% \thetav\right) + \frac{1}{\tau^2} {\thetav}^T\thetav % \right) % \end{eqnarray*} @@ -201,13 +201,13 @@ % % \begin{scriptsize} % \begin{eqnarray*} -% 0 &=& \frac{1}{\lambda \tau^2} \left( - {\Xmat}^T \ydat + \thetab {\Xmat}^T \Xmat -% \right) + \frac{\lambda}{\sigma^2} \thetab +% 0 &=& \frac{1}{\lambda \tau^2} \left( - {\Xmat}^T \ydat + \thetav {\Xmat}^T \Xmat +% \right) + \frac{\lambda}{\sigma^2} \thetav % \quad \Leftrightarrow \quad 0 = \frac{\sigma^2}{\tau^2} \left( - {\Xmat}^T \ydat -% + \thetab {\Xmat}^T \Xmat \right) + \lambda^2 \thetab \\ -% 0 &=& - {\Xmat}^T \ydat + \thetab {\Xmat}^T \Xmat + \lambda \thetab +% + \thetav {\Xmat}^T \Xmat \right) + \lambda^2 \thetav \\ +% 0 &=& - {\Xmat}^T \ydat + \thetav {\Xmat}^T \Xmat + \lambda \thetav % \quad \Leftrightarrow \quad -% \thetab(\Xmat^T \Xmat + \lambda \id) = {\Xmat}^T \ydat +% \thetav(\Xmat^T \Xmat + \lambda \id) = {\Xmat}^T \ydat % \end{eqnarray*} % \end{scriptsize} % diff --git a/slides/regularization/slides-regu-early-stopping.tex b/slides/regularization/slides-regu-early-stopping.tex index c2f647de..8f70d6e9 100644 --- a/slides/regularization/slides-regu-early-stopping.tex +++ b/slides/regularization/slides-regu-early-stopping.tex @@ -65,13 +65,13 @@ \hline Effective and simple & Periodical evaluation of validation error\\ \hline - Applicable to almost any model without adjustment \note{of objective function, parameter space, training procedure} & Temporary copy of $\thetab$ (we have to save the whole model each time validation error improves) \\ + Applicable to almost any model without adjustment \note{of objective function, parameter space, training procedure} & Temporary copy of $\thetav$ (we have to save the whole model each time validation 
error improves) \\ \hline Combinable with other regularization methods & Less data for training $\rightarrow$ include $\mathcal{D}_{\text{val}}$ afterwards\\ \hline\hline \end{tabular} \end{table} \begin{itemize} - \item For simple case of LM with squared loss and GD optim initialized at $\thetab=0$: Early stopping has exact correspondence with $L2$ regularization/WD: %Relation between + \item For simple case of LM with squared loss and GD optim initialized at $\thetav=0$: Early stopping has exact correspondence with $L2$ regularization/WD: %Relation between optimal early-stopping iter $T_{\text{stop}}$ inversely proportional to $\lambda$ scaled by step-size $\alpha$ \end{itemize} @@ -101,7 +101,7 @@ \end{vbframe} \begin{vbframe}{SGD Trajectory and $L2$ \citelink{ALI2020}} -Solution paths for $L2$ regularized linear model closely matches SGD trajectory of unregularized LM initialized at $\thetab=0$ +Solution paths for $L2$ regularized linear model closely matches SGD trajectory of unregularized LM initialized at $\thetav=0$ \lz \begin{figure} \centering diff --git a/slides/regularization/slides-regu-enetlogreg.tex b/slides/regularization/slides-regu-enetlogreg.tex index 0ecd7555..3ae4bb57 100644 --- a/slides/regularization/slides-regu-enetlogreg.tex +++ b/slides/regularization/slides-regu-enetlogreg.tex @@ -16,8 +16,8 @@ \vspace{-0.7cm} \small{ \begin{align*} -\mathcal{R}_{\text{elnet}}(\thetab) &= \sumin (\yi - \thetab^\top \xi)^2 + \lambda_1 \|\thetab\|_1 + \lambda_2 \|\thetab\|_2^2 \\ -&= \sumin (\yi - \thetab^\top \xi)^2 + \lambda \left( (1-\alpha) \|\thetab\|_1 + \alpha \|\thetab\|_2^2\right),\, \alpha=\frac{\lambda_2}{\lambda_1+\lambda_2}, \lambda=\lambda_1+\lambda_2 +\mathcal{R}_{\text{elnet}}(\thetav) &= \sumin (\yi - \thetav^\top \xi)^2 + \lambda_1 \|\thetav\|_1 + \lambda_2 \|\thetav\|_2^2 \\ +&= \sumin (\yi - \thetav^\top \xi)^2 + \lambda \left( (1-\alpha) \|\thetav\|_1 + \alpha \|\thetav\|_2^2\right),\, \alpha=\frac{\lambda_2}{\lambda_1+\lambda_2}, \lambda=\lambda_1+\lambda_2 \end{align*}} \begin{figure} \vspace{-0.3cm} @@ -35,20 +35,20 @@ \begin{vbframe} {Simulated Example} \footnotesize -5-fold CV with $n_{train}=100$ and 20 repetitions with $n_{test}=10000$ for setups: $y =\xv^T \thetab + \epsilon; \quad \epsilon \sim N(0,0.1^2); +5-fold CV with $n_{train}=100$ and 20 repetitions with $n_{test}=10000$ for setups: $y =\xv^T \thetav + \epsilon; \quad \epsilon \sim N(0,0.1^2); \quad \xv \sim N(0, \Sigma); \quad\Sigma_{k,l}=0.8^{|k-l|}$: \vspace{-0.3cm} \begin{columns} \begin{column}{0.5\textwidth} \begin{center} {\footnotesize \textbf{Lasso} better for sparse features:} \\ -$\thetab=(\underbrace{1,\ldots,1}_{5},\underbrace{0,\ldots,0}_{495})$\\ +$\thetav=(\underbrace{1,\ldots,1}_{5},\underbrace{0,\ldots,0}_{495})$\\ \end{center} \end{column} \begin{column}{0.5\textwidth} \begin{center} {\footnotesize \textbf{Ridge} better for dense features:} \\ -$\thetab=(\underbrace{1,\ldots,1,1,\ldots,1}_{500})$ \\ +$\thetav=(\underbrace{1,\ldots,1,1,\ldots,1}_{500})$ \\ \end{center} \end{column} \end{columns} @@ -85,9 +85,9 @@ %\begin{align*} -%\riskrt &= \risket + \lambda \cdot J(\thetab) \\ -%&= \sumin \mathsf{log} \left[1 + \exp \left(-\yi f\left(\left.\xi~\right|~ \thetab\right)\right)\right] + \lambda \cdot J(\thetab) \\ -%&= \sumin \mathsf{log}\left[1 + \mathsf{exp}\left(-2\yi f\left(\left.\xi~\right|~ \thetab\right)\right)\right] + \lambda \cdot J(\thetab) +%\riskrt &= \risket + \lambda \cdot J(\thetav) \\ +%&= \sumin \mathsf{log} \left[1 + \exp \left(-\yi f\left(\left.\xi~\right|~ 
\thetav\right)\right)\right] + \lambda \cdot J(\thetav) \\ +%&= \sumin \mathsf{log}\left[1 + \mathsf{exp}\left(-2\yi f\left(\left.\xi~\right|~ \thetav\right)\right)\right] + \lambda \cdot J(\thetav) %\end{align*} \item Now: LR with polynomial features for $x_1, x_2$ up to degree 7 and L2 penalty on 2D ``circle data'' below diff --git a/slides/regularization/slides-regu-geom-l1.tex b/slides/regularization/slides-regu-geom-l1.tex index 44131cf3..50eb3b62 100644 --- a/slides/regularization/slides-regu-geom-l1.tex +++ b/slides/regularization/slides-regu-geom-l1.tex @@ -21,21 +21,21 @@ \item The L1-regularized risk of a model $\fxt$ is \[ - \mathcal{R}_{\text{reg}}(\thetab) = \mathcal{R}_{\text{emp}}(\thetab) + \sum_j \lambda |\theta_j| + \mathcal{R}_{\text{reg}}(\thetav) = \mathcal{R}_{\text{emp}}(\thetav) + \sum_j \lambda |\theta_j| \] and the (sub-)gradient is: - %$$\nabla_{\theta} \mathcal{R}_{\text{reg}}(\thetab) = + %$$\nabla_{\theta} \mathcal{R}_{\text{reg}}(\thetav) = - $$ \nabla_{\theta} \risket + \lambda \sign(\thetab) $$ + $$ \nabla_{\theta} \risket + \lambda \sign(\thetav) $$ \item Unlike in $L2$, contribution to grad. doesn't scale with $\theta_j$ elements. - \item Again: quadratic Taylor approximation of $\mathcal{R}_{\text{emp}}(\thetab)$ around its minimizer $\thetah$, then regularize: + \item Again: quadratic Taylor approximation of $\mathcal{R}_{\text{emp}}(\thetav)$ around its minimizer $\thetah$, then regularize: %\item Again, this is an orthonormal design. r example, if the input features for a linear regression task have been decorrelated using PCA. \end{itemize} - $$\mathcal{\tilde R}_{\text{reg}}(\thetab) = \mathcal{R}_{\text{emp}}(\thetah) +\ \frac{1}{2} (\thetab - \thetah)^T \bm{H} (\thetab - \thetah) + \sum_j \lambda |\theta_j|$$ + $$\mathcal{\tilde R}_{\text{reg}}(\thetav) = \mathcal{R}_{\text{emp}}(\thetah) +\ \frac{1}{2} (\thetav - \thetah)^T \bm{H} (\thetav - \thetah) + \sum_j \lambda |\theta_j|$$ \framebreak @@ -43,8 +43,8 @@ \item To cheat and simplify, we assume the $\bm{H}$ is diagonal, with $H_{j,j} \geq 0$ - \item Now $\mathcal{\tilde R}_{\text{reg}}(\thetab)$ decomposes into sum over params $\theta_j$ (separable!): - $$\mathcal{\tilde R}_{\text{reg}}(\thetab) = \mathcal{R}_{\text{emp}}(\thetah) + \sum_j \left[ \frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 \right] + \sum_j \lambda |\theta_j|$$ + \item Now $\mathcal{\tilde R}_{\text{reg}}(\thetav)$ decomposes into sum over params $\theta_j$ (separable!): + $$\mathcal{\tilde R}_{\text{reg}}(\thetav) = \mathcal{R}_{\text{emp}}(\thetah) + \sum_j \left[ \frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 \right] + \sum_j \lambda |\theta_j|$$ % where $\thetah$ is the minimizer of the unregularized risk $\risket$. 
\item We can minimize analytically: \begin{align*}\hat{\theta}_{\text{lasso},j} &= \sign(\hat{\theta}_j) \max \left\{ |\hat{\theta}_j| - \frac{\lambda}{H_{j,j}},0 \right\} \\ diff --git a/slides/regularization/slides-regu-geom-l2.tex b/slides/regularization/slides-regu-geom-l2.tex index 0997c083..f5e7f32e 100644 --- a/slides/regularization/slides-regu-geom-l2.tex +++ b/slides/regularization/slides-regu-geom-l2.tex @@ -18,7 +18,7 @@ Quadratic Taylor approx of the unregularized objective $\risket$ \\ around its minimizer $\thetah$: -$$ \mathcal{\tilde R}_{\text{emp}}(\thetab)= \mathcal{R}_{\text{emp}}(\thetah) + \nabla_{\thetab} \mathcal{R}_{\text{emp}}(\thetah)\cdot(\thetab - \thetah) + \ \frac{1}{2} (\thetab - \thetah)^T \bm{H} (\thetab - \thetah) $$ +$$ \mathcal{\tilde R}_{\text{emp}}(\thetav)= \mathcal{R}_{\text{emp}}(\thetah) + \nabla_{\thetav} \mathcal{R}_{\text{emp}}(\thetah)\cdot(\thetav - \thetah) + \ \frac{1}{2} (\thetav - \thetah)^T \bm{H} (\thetav - \thetah) $$ where $\bm{H}$ is the Hessian of $\risket$ at $\thetah$ @@ -26,14 +26,14 @@ We notice: -% Because $\thetah = \argmin_{\thetab}\risket$, +% Because $\thetah = \argmin_{\thetav}\risket$, \begin{itemize} \item First-order term is 0, because gradient must be $0$ at minimizer \item $\bm{H}$ is positive semidefinite, because we are at the minimizer \end{itemize} -$$ \mathcal{\tilde R}_{\text{emp}}(\thetab)= \mathcal{R}_{\text{emp}}(\thetah) + \ \frac{1}{2} (\thetab - \thetah)^T \bm{H} (\thetab - \thetah) $$ +$$ \mathcal{\tilde R}_{\text{emp}}(\thetav)= \mathcal{R}_{\text{emp}}(\thetah) + \ \frac{1}{2} (\thetav - \thetah)^T \bm{H} (\thetav - \thetah) $$ \lz @@ -42,18 +42,18 @@ \normalsize -The minimum of $\mathcal{\tilde R}_{\text{emp}}(\thetab)$ occurs where $\nabla_{\thetab}\mathcal{\tilde R}_{\text{emp}}(\thetab) = \bm{H}(\thetab - \thetah)$ is $0$. +The minimum of $\mathcal{\tilde R}_{\text{emp}}(\thetav)$ occurs where $\nabla_{\thetav}\mathcal{\tilde R}_{\text{emp}}(\thetav) = \bm{H}(\thetav - \thetah)$ is $0$. -Now we $L2$-regularize $\mathcal{\tilde R}_{\text{emp}}(\thetab)$, such that +Now we $L2$-regularize $\mathcal{\tilde R}_{\text{emp}}(\thetav)$, such that \[ -\mathcal{\tilde R}_{\text{reg}}(\thetab) = \mathcal{\tilde R}_{\text{emp}}(\thetab) + \frac{\lambda}{2} \|\thetab\|^2_2\] -and solve this approximation of $\riskr$ for the minimizer $\hat{\thetab}_{\text{ridge}}$: +\mathcal{\tilde R}_{\text{reg}}(\thetav) = \mathcal{\tilde R}_{\text{emp}}(\thetav) + \frac{\lambda}{2} \|\thetav\|^2_2\] +and solve this approximation of $\riskr$ for the minimizer $\hat{\thetav}_{\text{ridge}}$: \begin{align*} - \nabla_{\thetab}\mathcal{\tilde R}_{\text{reg}}(\thetab) = 0\\ -% \lambda \thetab + \nabla_{\thetab}\mathcal{\tilde R}_{\text{emp}}(\thetab) = 0\\ - \lambda \thetab + \bm{H}(\thetab - \thetah) = 0\\ - (\bm{H} + \lambda \id) \thetab = \bm{H} \thetah\\ - \hat{\thetab}_{\text{ridge}} = (\bm{H} + \lambda \id)^{-1}\bm{H} \thetah + \nabla_{\thetav}\mathcal{\tilde R}_{\text{reg}}(\thetav) = 0\\ +% \lambda \thetav + \nabla_{\thetav}\mathcal{\tilde R}_{\text{emp}}(\thetav) = 0\\ + \lambda \thetav + \bm{H}(\thetav - \thetah) = 0\\ + (\bm{H} + \lambda \id) \thetav = \bm{H} \thetah\\ + \hat{\thetav}_{\text{ridge}} = (\bm{H} + \lambda \id)^{-1}\bm{H} \thetah \end{align*} % where $\id$ is the identity matrix. @@ -66,12 +66,12 @@ \framebreak \begin{itemize} - \item As $\lambda$ approaches $0$, the regularized solution $\hat{\thetab}_{\text{ridge}}$ approaches $\thetah$. What happens as $\lambda$ grows? 
+ \item As $\lambda$ approaches $0$, the regularized solution $\hat{\thetav}_{\text{ridge}}$ approaches $\thetah$. What happens as $\lambda$ grows? \item Because $\bm{H}$ is a real symmetric matrix, it can be decomposed as $\bm{H} = \bm{Q} \bm{\Sigma} \bm{Q}^\top$, where $\bm{\Sigma}$ is a diagonal matrix of eigenvalues and $\bm{Q}$ is an orthonormal basis of eigenvectors. \item Rewriting the transformation formula with this: \begin{equation*} \begin{aligned} - \hat{\thetab}_{\text{ridge}} &=\left(\bm{Q} \bm{\Sigma} \bm{Q}^{\top}+\lambda \id\right)^{-1} \bm{Q} \bm{\Sigma} \bm{Q}^{\top} \thetah \\ + \hat{\thetav}_{\text{ridge}} &=\left(\bm{Q} \bm{\Sigma} \bm{Q}^{\top}+\lambda \id\right)^{-1} \bm{Q} \bm{\Sigma} \bm{Q}^{\top} \thetah \\ &=\left[\bm{Q}(\bm{\Sigma}+\lambda \id) \bm{Q}^{\top}\right]^{-1} \bm{Q} \bm{\Sigma} \bm{Q}^{\top} \thetah \\ &=\bm{Q}(\bm{\Sigma} + \lambda \id)^{-1} \bm{\Sigma} \bm{Q}^{\top} \thetah \end{aligned} @@ -117,7 +117,7 @@ % \item Along directions where the eigenvalues of $\bm{H}$ are relatively large, for example, where $\sigma_j >> \lambda$, the effect of regularization is quite small. % \item On the other hand, components with $\sigma_j << \lambda$ will be shrunk to have nearly zero magnitude. % \item In other words, only directions along which the parameters contribute significantly to reducing the objective function are preserved relatively intact. -% \item In the other directions, a small eigenvalue of the Hessian means that moving in this direction will not significantly increase the gradient. For such unimportant directions, the corresponding components of $\thetab$ are decayed away. +% \item In the other directions, a small eigenvalue of the Hessian means that moving in this direction will not significantly increase the gradient. For such unimportant directions, the corresponding components of $\thetav$ are decayed away. % \end{itemize} % \framebreak @@ -132,7 +132,7 @@ \item Along directions where eigenvals of $\bm{H}$ are relatively large, e.g., $\sigma_j >> \lambda$, effect of regularization is small. \item Components / directions with $\sigma_j << \lambda$ are strongly shrunken. \item So: Directions along which parameters contribute strongly to objective are preserved relatively intact. - \item In other directions, small eigenvalue of Hessian means that moving in this direction will not decrease objective much. For such unimportant directions, corresponding components of $\thetab$ are decayed away. + \item In other directions, small eigenvalue of Hessian means that moving in this direction will not decrease objective much. For such unimportant directions, corresponding components of $\thetav$ are decayed away. %In the direction corresponding to smaller eigenvalue of Hessian of $\risket$, the objective function does not increase much when moving away from $\thetah$. Therefore, the regularizer has a strong effect on this direction and towards this, $\theta$ is pulled close to the origin. %\item In the second direction, the corresponding eigenvalue is large indicating high curvature. The objective function is very sensitive to movement along this direction and, as a result, the position of $\theta$ towards this is less affected by the regularization. \end{itemize} @@ -145,7 +145,7 @@ \begin{figure} \centering \scalebox{0.8}{\includegraphics{figure/l2_reg_hess_04.png}} - %\caption{\tiny The solid ellipses represent the contours of the unregularized objective and the dashed circles represent the contours of the $L2$ penalty. 
At $\hat{\thetab}_{\text{ridge}}$, the competing objectives reach an equilibrium.} + %\caption{\tiny The solid ellipses represent the contours of the unregularized objective and the dashed circles represent the contours of the $L2$ penalty. At $\hat{\thetav}_{\text{ridge}}$, the competing objectives reach an equilibrium.} \end{figure} \end{column} diff --git a/slides/regularization/slides-regu-intro.tex b/slides/regularization/slides-regu-intro.tex index a2e6ca7a..037b44f2 100644 --- a/slides/regularization/slides-regu-intro.tex +++ b/slides/regularization/slides-regu-intro.tex @@ -67,7 +67,7 @@ \item LM with all features (L2 loss) %$$ -%\fxt = \thetab^T\xv = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + ... + %\theta_{12} x_{12} +%\fxt = \thetav^T\xv = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + ... + %\theta_{12} x_{12} %$$ \item MSE evaluation under $10 \times 10$ REP-CV @@ -230,9 +230,9 @@ \item $\lambda$ is hard to set manually and is usually selected via CV - \item As for $\riske$, $\riskr$ and $J$ are often defined in terms of $\thetab$: \\ + \item As for $\riske$, $\riskr$ and $J$ are often defined in terms of $\thetav$: \\ - $$\riskrt = \risket + \lambda \cdot J(\thetab)$$ + $$\riskrt = \risket + \lambda \cdot J(\thetav)$$ \end{itemize} diff --git a/slides/regularization/slides-regu-l1.tex b/slides/regularization/slides-regu-l1.tex index 0af32f9f..91c18f2f 100644 --- a/slides/regularization/slides-regu-l1.tex +++ b/slides/regularization/slides-regu-l1.tex @@ -19,11 +19,11 @@ \begin{vbframe}{Lasso Regression} -Another shrinkage method is the so-called \textbf{lasso regression} ({\scriptsize{least absolute shrinkage and selection operator}}), which uses an $L1$ penalty on $\thetab$: +Another shrinkage method is the so-called \textbf{lasso regression} ({\scriptsize{least absolute shrinkage and selection operator}}), which uses an $L1$ penalty on $\thetav$: \vspace{0.4cm} \begin{align*} -\thetah_{\text{lasso}}&= \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi\right)^2 + \lambda \sum_{j=1}^{p} \vert\theta_j\vert\\ -&= \argmin_{\thetab}\left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \|\thetab\|_1 +\thetah_{\text{lasso}}&= \argmin_{\thetav} \sumin \left(\yi - \thetav^T \xi\right)^2 + \lambda \sum_{j=1}^{p} \vert\theta_j\vert\\ +&= \argmin_{\thetav}\left(\yv - \Xmat \thetav\right)^\top \left(\yv - \Xmat \thetav\right) + \lambda \|\thetav\|_1 \end{align*} \vspace{0.4cm} @@ -102,8 +102,8 @@ We can also rewrite this as a constrained optimization problem. The penalty results in the constrained region to look like a diamond shape. \vspace{-0.2cm} \begin{eqnarray*} -\min_{\thetab} \sumin \left(\yi - \fxit\right)^2\, -\text{subject to: } \|\thetab\|_1 \leq t +\min_{\thetav} \sumin \left(\yi - \fxit\right)^2\, +\text{subject to: } \|\thetav\|_1 \leq t \end{eqnarray*} The kinks in $L1$ enforce sparse solutions because ``the loss contours first hit the sharp corners of the constraint'' at coordinate axes where (some) entries are zero. \vspace{-0.1cm} @@ -154,23 +154,23 @@ \begin{vbframe}{Support Recovery of Lasso \citelink{ZHAO2006}} \begin{small} -When can lasso select true support of $\thetab$, i.e., only the non-zero parameters? \\ +When can lasso select true support of $\thetav$, i.e., only the non-zero parameters? 
\\ Can be formalized as sign-consistency: \begin{align*} -\mathbb{P}\big(\text{sign}(\thetah)=\text{sign}(\thetab)\big) \to 1 \, \text{as} \, n \to \infty \quad (\text{where}\,\text{sign}(0):=0) +\mathbb{P}\big(\text{sign}(\thetah)=\text{sign}(\thetav)\big) \to 1 \, \text{as} \, n \to \infty \quad (\text{where}\,\text{sign}(0):=0) \end{align*} \\ %\vspace{0.1cm} -Suppose the true DGP given a partition into subvectors $\thetab=(\thetab_1, \thetab_2)$ is +Suppose the true DGP given a partition into subvectors $\thetav=(\thetav_1, \thetav_2)$ is \begin{align*} - \bm{Y}=\Xmat\thetab + \bm{\varepsilon} = \Xmat_1 \thetab_1 + \Xmat_2 \thetab_2 + \bm{\varepsilon}\,\text{with}\,\bm{\varepsilon}\sim (0,\sigma^2 \id) + \bm{Y}=\Xmat\thetav + \bm{\varepsilon} = \Xmat_1 \thetav_1 + \Xmat_2 \thetav_2 + \bm{\varepsilon}\,\text{with}\,\bm{\varepsilon}\sim (0,\sigma^2 \id) \end{align*} -%$\bm{Y}=\Xmat\thetab + \bm{\varepsilon}$ with $\bm{\varepsilon}\sim (0,\sigma^2 \id)$ -and only $\thetab_1$ is non-zero. +%$\bm{Y}=\Xmat\thetav + \bm{\varepsilon}$ with $\bm{\varepsilon}\sim (0,\sigma^2 \id)$ +and only $\thetav_1$ is non-zero. \vspace{0.1cm} Let $\Xmat_1$ denote the $n \times q$ matrix with the relevant features and $\Xmat_2$ the matrix of noise features. It can be shown that $\thetah_{lasso}$ is sign consistent under an \textbf{irrepresentable condition}: \begin{align*} - \vert (\Xmat_2^{\top} \Xmat_1)(\Xmat_1^{\top} \Xmat_1)^{-1} \text{sign}(\thetab_1)\vert < \bm{1} \,\, (\text{element-wise}) + \vert (\Xmat_2^{\top} \Xmat_1)(\Xmat_1^{\top} \Xmat_1)^{-1} \text{sign}(\thetav_1)\vert < \bm{1} \,\, (\text{element-wise}) \end{align*} In fact, lasso can only be sign-consistent if this condition holds. diff --git a/slides/regularization/slides-regu-l1vsl2.tex b/slides/regularization/slides-regu-l1vsl2.tex index c0f3df5f..01094c42 100644 --- a/slides/regularization/slides-regu-l1vsl2.tex +++ b/slides/regularization/slides-regu-l1vsl2.tex @@ -18,7 +18,7 @@ \begin{vbframe}{Lasso vs. ridge Geometry} $$ - \min_{\thetab} \sumin \left(\yi - \fxit\right)^2 \qquad \text{ s.t. } \|\thetab\|_p^p \leq t + \min_{\thetav} \sumin \left(\yi - \fxit\right)^2 \qquad \text{ s.t. } \|\thetav\|_p^p \leq t $$ \vspace{-0.5cm} \begin{figure} @@ -28,7 +28,7 @@ \begin{itemize} \item \small{In both cases (and for sufficiently large $\lambda$), the solution which minimizes $\riskrt$ is always a point on the boundary of the feasible region. - \item As expected, $\hat{\thetab}_{\text{lasso}}$ and $\hat{\thetab}_{\text{ridge}}$ have smaller parameter norms than $\thetah$.} + \item As expected, $\hat{\thetav}_{\text{lasso}}$ and $\hat{\thetav}_{\text{ridge}}$ have smaller parameter norms than $\thetah$.} \item For lasso, solution likely touches a vertex of constraint region. \\ Induces sparsity and is a form of variable selection. \item For $p>n$: lasso selects at most $n$ features \citelink{ZOUHASTIE}. @@ -69,13 +69,13 @@ \begin{vbframe}{Regularization and Feature Scaling} \begin{itemize} - \item Typically we omit $\theta_0$ in penalty $J(\thetab)$ so that the ``infinitely'' regularized model is the constant model (but can be implementation-dependent). + \item Typically we omit $\theta_0$ in penalty $J(\thetav)$ so that the ``infinitely'' regularized model is the constant model (but can be implementation-dependent). \item Unregularized LM has \textbf{rescaling equivariance}, if you scale some features, can simply "anti-scale" coefs and risk does not change. 
- \item Not true for Reg-LM: if you down-scale features, coeffs become larger to counteract. They are then penalized stronger in $J(\thetab)$, making them less attractive without any relevenat reason. + \item Not true for Reg-LM: if you down-scale features, coeffs become larger to counteract. They are then penalized stronger in $J(\thetav)$, making them less attractive without any relevenat reason. \item \textbf{So: usually standardize features in regularized models, whether linear or non-linear!} - % \item While ridge regression usually leads to smaller estimated coefficients, but still dense $\thetab$ vectors, - % the lasso will usually create a sparse $\thetab$ vector and can therefore be used for variable selection. + % \item While ridge regression usually leads to smaller estimated coefficients, but still dense $\thetav$ vectors, + % the lasso will usually create a sparse $\thetav$ vector and can therefore be used for variable selection. %\item SVMs combine (usually) hinge loss with L2-regularization. But also for SMVs this concept is generalized to different losses and different penalties. \end{itemize} @@ -85,7 +85,7 @@ \footnotesize{ \begin{itemize} - \item Let the DGP be $y = \sum_{j=1}^{5} \theta_j x_{j} +\varepsilon$ for $\thetab=(1,2,3,4,5)^\top$, $\varepsilon \sim \mathcal{N}(0,1)$ %and $n=100$ + \item Let the DGP be $y = \sum_{j=1}^{5} \theta_j x_{j} +\varepsilon$ for $\thetav=(1,2,3,4,5)^\top$, $\varepsilon \sim \mathcal{N}(0,1)$ %and $n=100$ \item Suppose $x_5$ was measured in $m$ but we change the unit to $cm$ ($\Tilde{x}_5=100 \cdot x_5$): \end{itemize} \vspace{-0.4cm} diff --git a/slides/regularization/slides-regu-l2-nonlin.tex b/slides/regularization/slides-regu-l2-nonlin.tex index da2065f6..420f753a 100644 --- a/slides/regularization/slides-regu-l2-nonlin.tex +++ b/slides/regularization/slides-regu-l2-nonlin.tex @@ -28,7 +28,7 @@ If we should define (supervised) ML in only one line, this might be it: \[ -\min_{\thetab} \riskrt= \min_{\thetab} \left(\sumin \Lxyit + \lambda \cdot J(\thetab) \right) +\min_{\thetav} \riskrt= \min_{\thetav} \left(\sumin \Lxyit + \lambda \cdot J(\thetav) \right) \] We can choose for a task at hand: @@ -37,7 +37,7 @@ \item the \textbf{hypothesis space} of $f$, which determines how features can influence the predicted $y$ \item the \textbf{loss} function $L$, which measures how errors should be treated - \item the \textbf{regularization} $J(\thetab)$, which encodes our inductive + \item the \textbf{regularization} $J(\thetav)$, which encodes our inductive bias and preference for certain simpler models \end{itemize} @@ -53,15 +53,15 @@ For neural networks, the regularized loss function is: \[ -\riskrt = \frac{1}{n} \sum_{i=1}^{n} \Lxyit + \lambda \cdot J(\thetab) +\riskrt = \frac{1}{n} \sum_{i=1}^{n} \Lxyit + \lambda \cdot J(\thetav) \] where: \begin{itemize} - \item \( L(f(x_i; \thetab), y_i) \) is the loss function. - \item \( f(x_i; \thetab) \) is the neural network's prediction. - \item \( J(\thetab) \) is the regularization term (e.g., \( \|\thetab\|_2^2 \) for L2 regularization). + \item \( L(f(x_i; \thetav), y_i) \) is the loss function. + \item \( f(x_i; \thetav) \) is the neural network's prediction. + \item \( J(\thetav) \) is the regularization term (e.g., \( \|\thetav\|_2^2 \) for L2 regularization). \item \( \lambda \) is the regularization parameter. 
\end{itemize} @@ -74,19 +74,19 @@ \end{vbframe} \begin{vbframe}{Formal Bounds} -Consider a neural network with parameters \(\thetab\) trained with L2 regularization: +Consider a neural network with parameters \(\thetav\) trained with L2 regularization: \[ -\|\thetab\|_2^2 = \sum_{j=1}^{p} \theta_j^2 +\|\thetav\|_2^2 = \sum_{j=1}^{p} \theta_j^2 \] The regularized loss function becomes: \[ -\riskrt = \frac{1}{n} \sum_{i=1}^{n} \Lxyit + \lambda \|\thetab\|_2^2 +\riskrt = \frac{1}{n} \sum_{i=1}^{n} \Lxyit + \lambda \|\thetav\|_2^2 \] -To bound the variance term, note that the regularization term \( \lambda \|\thetab\|_2^2 \) constrains the parameters: +To bound the variance term, note that the regularization term \( \lambda \|\thetav\|_2^2 \) constrains the parameters: \begin{itemize} \item Without regularization (\(\lambda = 0\)), the parameters can grow large, leading to high variance. @@ -96,7 +96,7 @@ Formally, the variance of the model can be bounded as follows: \[ -\text{Var}(\hat{\thetab}_{\text{Reg}}) \leq \frac{\sigma^2}{\lambda} +\text{Var}(\hat{\thetav}_{\text{Reg}}) \leq \frac{\sigma^2}{\lambda} \] where \(\sigma^2\) is the noise variance. As \(\lambda\) increases, the bound on the variance decreases. @@ -109,7 +109,7 @@ The regularized loss function is: \[ -\riskrt = \frac{1}{n} \sum_{i=1}^{n} \Lxyit + \lambda \|\thetab\|_2^2 +\riskrt = \frac{1}{n} \sum_{i=1}^{n} \Lxyit + \lambda \|\thetav\|_2^2 \] \textbf{Bias-Variance Decomposition:} @@ -121,22 +121,22 @@ \textbf{Step-by-Step Derivation:} \begin{itemize} - \item Model the Neural Network Parameters: \( \hat{\thetab} = \thetab^* + \epsilon \) - \item Apply Regularization: \( \hat{\thetab}_{\text{Reg}} = \arg \min_{\thetab} \left\{ \frac{1}{n} \sum_{i=1}^{n} \Lxyit + \lambda \|\thetab\|_2^2 \right\} \) - \item Analyzing the Variance: \( \text{Var}(\hat{\thetab}_{\text{Reg}}) \approx (I(\thetab) + 2\lambda I)^{-1} \sigma^2 \) + \item Model the Neural Network Parameters: \( \hat{\thetav} = \thetav^* + \epsilon \) + \item Apply Regularization: \( \hat{\thetav}_{\text{Reg}} = \arg \min_{\thetav} \left\{ \frac{1}{n} \sum_{i=1}^{n} \Lxyit + \lambda \|\thetav\|_2^2 \right\} \) + \item Analyzing the Variance: \( \text{Var}(\hat{\thetav}_{\text{Reg}}) \approx (I(\thetav) + 2\lambda I)^{-1} \sigma^2 \) \end{itemize} \textbf{Bounding the Variance:} Given the properties of the Hessian matrix \(H\): \[ -\text{Var}(\hat{\thetab}_{\text{Reg}}) \leq \frac{\sigma^2}{2\lambda} I +\text{Var}(\hat{\thetav}_{\text{Reg}}) \leq \frac{\sigma^2}{2\lambda} I \] The variance of the neural network prediction is bounded by: \[ -\text{Var}(f(x; \hat{\thetab}_{\text{Reg}})) \leq \frac{\sigma^2}{2\lambda} \|\nabla_{\thetab} f(x; \hat{\thetab}_{\text{Reg}})\|^2 +\text{Var}(f(x; \hat{\thetav}_{\text{Reg}})) \leq \frac{\sigma^2}{2\lambda} \|\nabla_{\thetav} f(x; \hat{\thetav}_{\text{Reg}})\|^2 \] \textbf{Conclusion:} @@ -150,25 +150,25 @@ Regularization introduces bias by shrinking the parameter estimates towards zero: \[ -\text{Bias}(f(x)) = E[f(x; \hat{\thetab}_{\text{Reg}})] - f^*(x) +\text{Bias}(f(x)) = E[f(x; \hat{\thetav}_{\text{Reg}})] - f^*(x) \] Using a linear approximation: \[ -E[f(x; \hat{\thetab}_{\text{Reg}})] \approx f(x; \thetab^*) - \lambda \nabla_{\thetab} f(x; \thetab^*)^T H^{-1} \thetab^* +E[f(x; \hat{\thetav}_{\text{Reg}})] \approx f(x; \thetav^*) - \lambda \nabla_{\thetav} f(x; \thetav^*)^T H^{-1} \thetav^* \] Thus, the bias is: \[ -\text{Bias}(f(x)) = -\lambda \nabla_{\thetab} f(x; \thetab^*)^T H^{-1} \thetab^* +\text{Bias}(f(x)) = -\lambda 
\nabla_{\thetav} f(x; \thetav^*)^T H^{-1} \thetav^* \] \textbf{Combined Bias and Variance Analysis:} \begin{itemize} - \item \textbf{Bias:} \( \text{Bias}^2(f(x)) = (\lambda \nabla_{\thetab} f(x; \thetab^*)^T H^{-1} \thetab^*)^2 \) - \item \textbf{Variance:} \( \text{Var}(f(x; \hat{\thetab}_{\text{Reg}})) \leq \frac{\sigma^2}{2\lambda} \|\nabla_{\thetab} f(x; \hat{\thetab}_{\text{Reg}})\|^2 \) + \item \textbf{Bias:} \( \text{Bias}^2(f(x)) = (\lambda \nabla_{\thetav} f(x; \thetav^*)^T H^{-1} \thetav^*)^2 \) + \item \textbf{Variance:} \( \text{Var}(f(x; \hat{\thetav}_{\text{Reg}})) \leq \frac{\sigma^2}{2\lambda} \|\nabla_{\thetav} f(x; \hat{\thetav}_{\text{Reg}})\|^2 \) \end{itemize} \end{vbframe} @@ -199,14 +199,14 @@ For linear models, it's well-established that some \(\lambda > 0\) can balance the increase in bias against the reduction in variance, leading to a net decrease in MSE. For non-linear models, the situation is more complex: \begin{itemize} - \item The relationship between model parameters \(\thetab\), the regularization term, and the model output \(f(x; \thetab)\) is non-linear. + \item The relationship between model parameters \(\thetav\), the regularization term, and the model output \(f(x; \thetav)\) is non-linear. \item The effects of changing \(\lambda\) on the bias and variance terms are not straightforward and depend heavily on the specific form of the non-linear model and the data distribution. \end{itemize} Proving analytically that there exists a \(\lambda > 0\) such that the regularized model always outperforms the unregularized model in terms of MSE for general non-linear models involves: \begin{itemize} \item Detailed understanding of how changes in \(\lambda\) affect the bias and variance for the specific type of non-linear model. - \item Possibly making assumptions about the smoothness, continuity, or differentiability of the model function \(f\) with respect to both \(x\) and \(\thetab\). + \item Possibly making assumptions about the smoothness, continuity, or differentiability of the model function \(f\) with respect to both \(x\) and \(\thetav\). \end{itemize} @@ -224,7 +224,7 @@ \vspace{0.2cm} \begin{itemize} \item A diagonal linear network with one hidden layer and one output unit can be written as $f(x|\bm{u},\bm{v}) = (\bm{u} \odot \bm{v})^{\top} \bm{x}$ - \item optimizing the network with $L2$ regularization $\lambda$ and MSE loss has multiple global minima that coincide with the lasso solution for the collapsed parameter $\thetab:=\bm{u}\odot \bm{v}$ using $2\lambda$ + \item optimizing the network with $L2$ regularization $\lambda$ and MSE loss has multiple global minima that coincide with the lasso solution for the collapsed parameter $\thetav:=\bm{u}\odot \bm{v}$ using $2\lambda$ \item Since there is no existence theorem (of a $\lambda^*$ that reduces the MSE over OLS) for lasso compared to ridge regression, there can not be one for L2 regularized DNNs in general. 
\item For fully-connected linear networks using $L$ weight matrices $f(x|W_L,\ldots,W_1)=W_L \cdot \ldots \cdot W_1 x$, adding $L2$ regularization with $\lambda$ to all $W_l$ produces equivalent minma to Schatten $2/L$-norm regularization of the the collapsed linear predictor $\Bar{W}x:=W_L \cdot \ldots \cdot W_1x$ with strength $L\lambda$ \item I am fairly certain there is also no existence theorem for non-convex Schatten $2/L$-norm regularization, their success depends strongly on the low-rank nature of the problem diff --git a/slides/regularization/slides-regu-l2.tex b/slides/regularization/slides-regu-l2.tex index a707c92f..f4891dc3 100644 --- a/slides/regularization/slides-regu-l2.tex +++ b/slides/regularization/slides-regu-l2.tex @@ -34,7 +34,7 @@ \item For highly correlated features, OLS becomes sensitive to random errors in response, results in large variance in fit \item We now add a complexity penalty to the loss: $$ - \riskrt = \sumin \left(\yi - \thetab^\top \xi \right)^2 + \lambda \cdot J(\thetab). + \riskrt = \sumin \left(\yi - \thetav^\top \xi \right)^2 + \lambda \cdot J(\thetav). $$ \end{itemize} @@ -56,12 +56,12 @@ \begin{vbframe}{Ridge Regression / L2 Penalty} Intuitive measure of model complexity is deviation from 0-origin; coeffs then have no or a weak effect. -So we measure $J(\thetab)$ through a vector norm, shrinking coeffs closer to 0.\\ +So we measure $J(\thetav)$ through a vector norm, shrinking coeffs closer to 0.\\ \vspace{0.2cm} \begin{eqnarray*} -\thetah_{\text{ridge}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \sum_{j=1}^{p} \theta_j^2 \\ -%&=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \thetab^\top \thetab \\ -&=& \argmin_{\thetab} \| \yv - \Xmat \thetab \|_2^2 + \lambda \|\thetab\|_2^2 +\thetah_{\text{ridge}} &=& \argmin_{\thetav} \sumin \left(\yi - \thetav^T \xi \right)^2 + \lambda \sum_{j=1}^{p} \theta_j^2 \\ +%&=& \argmin_{\thetav} \left(\yv - \Xmat \thetav\right)^\top \left(\yv - \Xmat \thetav\right) + \lambda \thetav^\top \thetav \\ +&=& \argmin_{\thetav} \| \yv - \Xmat \thetav \|_2^2 + \lambda \|\thetav\|_2^2 \end{eqnarray*} Can still analytically solve this: @@ -71,7 +71,7 @@ \framebreak -Let $y=3x_{1} -2x_{2} +\epsilon $, $ \epsilon \sim N( 0,1)$. The true minimizer is $\theta ^{*} =( 3,-2)^{T}$, with $ \thetah_{\text{ridge}} = \argmin_{\thetab} \|\yv - \Xmat \thetab\|^2 + \lambda \|\thetab\|^2 $. +Let $y=3x_{1} -2x_{2} +\epsilon $, $ \epsilon \sim N( 0,1)$. The true minimizer is $\theta ^{*} =( 3,-2)^{T}$, with $ \thetah_{\text{ridge}} = \argmin_{\thetav} \|\yv - \Xmat \thetav\|^2 + \lambda \|\thetav\|^2 $. \begin{figure} \includegraphics[width=0.8\textwidth]{figure/lin_model_regu_02.png} @@ -81,7 +81,7 @@ \framebreak Contours of regularized objective for different $\lambda$ values.\\ -$ \thetah_{\text{ridge}} = \argmin_{\thetab} \|\yv - \Xmat \thetab\|^2 + \lambda \|\thetab\|^2 $. +$ \thetah_{\text{ridge}} = \argmin_{\thetav} \|\yv - \Xmat \thetav\|^2 + \lambda \|\thetav\|^2 $. \begin{figure} \includegraphics[width=0.8\textwidth]{figure/reg_contours_02.png} @@ -96,8 +96,8 @@ \vspace{-0.5cm} \begin{eqnarray*} -\min_{\thetab} && \sumin \left(\yi - \fxit\right)^2 \\ - \text{s.t. } && \|\thetab\|_2^2 \leq t \\ +\min_{\thetav} && \sumin \left(\yi - \fxit\right)^2 \\ + \text{s.t. 
} && \|\thetav\|_2^2 \leq t \\ \end{eqnarray*} \vspace{-1.0cm} @@ -227,10 +227,10 @@ \begin{comment} \begin{vbframe}{Lasso Regression} -Another shrinkage method is the so-called \textbf{Lasso regression} ({\scriptsize{least absolute shrinkage and selection operator}}), which uses an $L1$ penalty on $\thetab$: +Another shrinkage method is the so-called \textbf{Lasso regression} ({\scriptsize{least absolute shrinkage and selection operator}}), which uses an $L1$ penalty on $\thetav$: \vspace{-0.2cm} \begin{eqnarray*} -\thetah_{\text{Lasso}}= \argmin_{\thetab} \underbrace{\sumin \left(\yi - \thetab^T \xi\right)^2}_{\left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right)} + \lambda \|\thetab\|_1 +\thetah_{\text{Lasso}}= \argmin_{\thetav} \underbrace{\sumin \left(\yi - \thetav^T \xi\right)^2}_{\left(\yv - \Xmat \thetav\right)^\top \left(\yv - \Xmat \thetav\right)} + \lambda \|\thetav\|_1 \end{eqnarray*} Optimization is much harder now. $\riskrt$ is still convex, but in general there is no analytical solution and it is non-differentiable.\\ \vspace{0.2cm} @@ -260,8 +260,8 @@ We can also rewrite this as a constrained optimization problem. The penalty results in the constrained region to look like a diamond shape. \vspace{-0.2cm} \begin{eqnarray*} -\min_{\thetab} \sumin \left(\yi - \fxit\right)^2\, -\text{subject to: } \|\thetab\|_1 \leq t +\min_{\thetav} \sumin \left(\yi - \fxit\right)^2\, +\text{subject to: } \|\thetav\|_1 \leq t \end{eqnarray*} The kinks in $L1$ enforce sparse solutions because ``the loss contours first hit the sharp corners of the constraint'' at coordinate axes where (some) entries are zero. \vspace{-0.1cm} diff --git a/slides/regularization/slides-regu-lasso-deepdive.tex b/slides/regularization/slides-regu-lasso-deepdive.tex index 0a4062b8..9f6cc281 100644 --- a/slides/regularization/slides-regu-lasso-deepdive.tex +++ b/slides/regularization/slides-regu-lasso-deepdive.tex @@ -25,11 +25,11 @@ \begin{vbframe}{Soft-thresholding and L1 regularization} In the lecture, we wanted to solve \[ - \min_{\thetab} \mathcal{\tilde R}_{\text{reg}}(\thetab) = \min_{\thetab}\mathcal{R}_{\text{emp}}(\thetah) + \sum_j \left[ \frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 \right] + \sum_j \lambda |\theta_j| + \min_{\thetav} \mathcal{\tilde R}_{\text{reg}}(\thetav) = \min_{\thetav}\mathcal{R}_{\text{emp}}(\thetah) + \sum_j \left[ \frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 \right] + \sum_j \lambda |\theta_j| \] with $H_{j,j} \geq 0, \lambda > 0$. Note that we can separate the dimensions, i.e., -\[\mathcal{\tilde R}_{\text{reg}}(\thetab) = \sum_j z_j(\theta_j) \text{ with } z_j(\theta_j) = \frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 + \lambda |\theta_j|.\] +\[\mathcal{\tilde R}_{\text{reg}}(\thetav) = \sum_j z_j(\theta_j) \text{ with } z_j(\theta_j) = \frac{1}{2} H_{j,j} (\theta_j - \hat{\theta}_j)^2 + \lambda |\theta_j|.\] Hence, we can minimize each $z_j$ separately to find the global minimum. 
\\ \lz diff --git a/slides/regularization/slides-regu-nonlin.tex b/slides/regularization/slides-regu-nonlin.tex index b359ce23..ea468153 100644 --- a/slides/regularization/slides-regu-nonlin.tex +++ b/slides/regularization/slides-regu-nonlin.tex @@ -31,7 +31,7 @@ If we define (supervised) ML in one line, this might be it: $$ -\min_{\thetab} \riskrt= \min_{\thetab} \left(\sumin \Lxyit + \lambda \cdot J(\thetab) \right) +\min_{\thetav} \riskrt= \min_{\thetav} \left(\sumin \Lxyit + \lambda \cdot J(\thetav) \right) $$ Can choose for task at hand: @@ -39,7 +39,7 @@ \begin{itemize} \item \textbf{hypothesis space} of $f$, controls how features influence prediction \item \textbf{loss} function $L$, measures how errors are treated - \item \textbf{regularizer} $J(\thetab)$, encodes inductive + \item \textbf{regularizer} $J(\thetav)$, encodes inductive bias \end{itemize} @@ -245,8 +245,8 @@ \begin{columns} \begin{column}{0.5\textwidth} \begin{eqnarray*} -\min_{\thetab} && \sumin \Lxyit \\ - \text{s.t. } && \|\thetab\|_2^2 \leq t \\ +\min_{\thetav} && \sumin \Lxyit \\ + \text{s.t. } && \|\thetav\|_2^2 \leq t \\ \end{eqnarray*} \end{column} \begin{column}{0.5\textwidth} @@ -260,7 +260,7 @@ Can interpret going through $\lambda$ from large to small as through $t$ from small to large. Constructs series of ERM problems with hypothesis spaces $\Hspace_\lambda$, -where we constrain norm of $\thetab$ to unit balls of growing sizes. +where we constrain norm of $\thetav$ to unit balls of growing sizes. \end{frame} diff --git a/slides/regularization/slides-regu-others.tex b/slides/regularization/slides-regu-others.tex index 7f4210be..7831fb92 100644 --- a/slides/regularization/slides-regu-others.tex +++ b/slides/regularization/slides-regu-others.tex @@ -33,7 +33,7 @@ \begin{vbframe}{$Lq$ regularization \citelink{FU2000}} -Besides $L1$/$L2$ we could use any $Lq$ (quasi-)norm penalty $\lambda \Vert \thetab \Vert_q^q$ +Besides $L1$/$L2$ we could use any $Lq$ (quasi-)norm penalty $\lambda \Vert \thetav \Vert_q^q$ \begin{figure} \scalebox{0.5}{\includegraphics{figure_man/lasso_ridge_hat.png}}\\ @@ -56,7 +56,7 @@ \begin{vbframe}{L0 regularization} \vspace{-0.3cm} $$ -\riskrt = \risket + \lambda \|\thetab\|_0 := \risket + \lambda \sum_j |\theta_j|^0. +\riskrt = \risket + \lambda \|\thetav\|_0 := \risket + \lambda \sum_j |\theta_j|^0. 
$$ \vspace{-0.3cm} \begin{figure} @@ -107,7 +107,7 @@ \begin{figure} \centering \scalebox{0.99}{\includegraphics{figure/nc_penalties_comparison.png}} - %\caption{\footnotesize lasso vs non-convex SCAD and MCP penalties for scalar parameter $\thetab$} + %\caption{\footnotesize lasso vs non-convex SCAD and MCP penalties for scalar parameter $\thetav$} \end{figure} \end{column} @@ -143,7 +143,7 @@ \begin{figure} \centering \scalebox{0.99}{\includegraphics{figure/nc_penalties_comparison.png}} - %\caption{\footnotesize lasso vs non-convex SCAD and MCP penalties for scalar parameter $\thetab$} + %\caption{\footnotesize lasso vs non-convex SCAD and MCP penalties for scalar parameter $\thetav$} \end{figure} \end{column} @@ -159,7 +159,7 @@ \vspace{0.15cm} We simulate $n=100$ samples from the following DGP: {\small -$$y = \xv^{\top} \thetab + \varepsilon\,,\quad \thetab =(4,-4,-2,2,0,\ldots,0)^{\top} \in \mathbb{R}^{1500}, \quad x_j,\varepsilon \sim \mathcal{N}(0,1)$$ +$$y = \xv^{\top} \thetav + \varepsilon\,,\quad \thetav =(4,-4,-2,2,0,\ldots,0)^{\top} \in \mathbb{R}^{1500}, \quad x_j,\varepsilon \sim \mathcal{N}(0,1)$$ } \vspace{-1cm} diff --git a/slides/regularization/slides-regu-ridge-deepdive.tex b/slides/regularization/slides-regu-ridge-deepdive.tex index 72a4648f..0c5496e2 100644 --- a/slides/regularization/slides-regu-ridge-deepdive.tex +++ b/slides/regularization/slides-regu-ridge-deepdive.tex @@ -29,16 +29,16 @@ \begin{itemize}\setlength\itemsep{0.8em} \item We know that it is equivalent to a constrained optimization problem: \begin{eqnarray*} - \thetah_{\text{ridge}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2 = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv\\ + \thetah_{\text{ridge}} &=& \argmin_{\thetav} \sumin \left(\yi - \thetav^T \xi \right)^2 + \lambda \|\thetav\|_2^2 = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv\\ \end{eqnarray*} For some $t$ depending on $\lambda$ this is equivalent to: \begin{eqnarray*} - %&=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \thetab^\top \thetab \\ - \thetah_{\text{ridge}}&=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi\right)^2 \, - \text{s.t. } \|\thetab\|_2^2 \leq t + %&=& \argmin_{\thetav} \left(\yv - \Xmat \thetav\right)^\top \left(\yv - \Xmat \thetav\right) + \lambda \thetav^\top \thetav \\ + \thetah_{\text{ridge}}&=& \argmin_{\thetav} \sumin \left(\yi - \thetav^T \xi\right)^2 \, + \text{s.t. } \|\thetav\|_2^2 \leq t \end{eqnarray*} \item Bayesian interpretation of ridge regression: For additive Gaussian errors $\mathcal{N}(0,\sigma^2)$ and i.i.d. normal priors $\theta_j \sim \mathcal{N}(0,\tau^{2})$, the resulting MAP estimate is $\thetah_{\text{ridge}}$ with $\lambda=\frac{\sigma^2}{\tau^2}$: - $$\thetah_{\text{MAP}}=\argmax_{\theta} \log[p(\yv|\Xmat,\thetab)p(\thetab)] = \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \frac{\sigma^2}{\tau^2} \|\thetab\|_2^2$$ + $$\thetah_{\text{MAP}}=\argmax_{\theta} \log[p(\yv|\Xmat,\thetav)p(\thetav)] = \argmin_{\thetav} \sumin \left(\yi - \thetav^T \xi \right)^2 + \frac{\sigma^2}{\tau^2} \|\thetav\|_2^2$$ \end{itemize} } \end{vbframe} @@ -47,16 +47,16 @@ We can also recover the ridge estimator by performing least-squares on a \textbf{row-augmented} data set: Let $\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix} \yv \\ \bm{0}_{p} \end{pmatrix}$. \\ -With the augmented data, the unreg. 
+With the augmented data, the unreg. least-squares solution $\tilde{\thetav}$ is:
 {\small
 \begin{eqnarray*}
-\tilde{\thetab} &=& \argmin_{\thetab}
-\sum_{i=1}^{n+p} \left(\tilde{\yi} - \thetab^T \tilde{\xi} \right)^2 \\ &=& \argmin_{\thetab}
-\sum_{i=1}^{n} \left(\yi - \thetab^T \xi \right)^2 + \sum_{j=1}^{p} \left(0 - \sqrt{\lambda} \theta_j \right)^2 \\ %= \thetah_{\text{ridge}}
-&=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2
+\tilde{\thetav} &=& \argmin_{\thetav}
+\sum_{i=1}^{n+p} \left(\tilde{\yi} - \thetav^T \tilde{\xi} \right)^2 \\ &=& \argmin_{\thetav}
+\sum_{i=1}^{n} \left(\yi - \thetav^T \xi \right)^2 + \sum_{j=1}^{p} \left(0 - \sqrt{\lambda} \theta_j \right)^2 \\ %= \thetah_{\text{ridge}}
+&=& \argmin_{\thetav} \sumin \left(\yi - \thetav^T \xi \right)^2 + \lambda \|\thetav\|_2^2
 \end{eqnarray*}
 }
-$\Longrightarrow$ $\thetah_{\text{ridge}}$ is the least-squares solution $\tilde{\thetab}$ but using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$!\\
+$\Longrightarrow$ $\thetah_{\text{ridge}}$ is the least-squares solution $\tilde{\thetav}$ but using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$!\\
 \lz
@@ -70,16 +70,16 @@
 We assume no specifc distribution. Now minimize risk with L2 loss, we define it slightly different than usual, as here our data $\xi$, $\yi$ are fixed, but we integrate over the random permutations $\bm{\delta}$:
-$$\riskt:= \mathbb{E}_{\bm{\delta}}\Big[{\textstyle \sumin}(\yi-\thetab^{\top}\tilde{\xv}^{(i)})^2\Big] = \mathbb{E}_{\bm{\delta}}\Big[{\textstyle \sumin}(\yi-\thetab^{\top}(\xi+\bm{\delta}^{(i)}))^2\Big]\,\,\Big|\, \text{expand}$$
+$$\riskt:= \mathbb{E}_{\bm{\delta}}\Big[{\textstyle \sumin}(\yi-\thetav^{\top}\tilde{\xv}^{(i)})^2\Big] = \mathbb{E}_{\bm{\delta}}\Big[{\textstyle \sumin}(\yi-\thetav^{\top}(\xi+\bm{\delta}^{(i)}))^2\Big]\,\,\Big|\, \text{expand}$$
 \vspace{-0.2cm}
 %Expanding, we obtain
-$$\riskt = \mathbb{E}_{\bm{\delta}}\Big[{\textstyle \sumin}\big((\yi-\thetab^{\top}\xi)^2 - 2 \thetab^{\top}\bm{\delta}^{(i)}(\yi-\thetab^{\top}\xi) + \thetab^{\top}\bm{\delta}^{(i)}\bm{{\delta}}^{(i) \top}\thetab\big)\Big]$$
+$$\riskt = \mathbb{E}_{\bm{\delta}}\Big[{\textstyle \sumin}\big((\yi-\thetav^{\top}\xi)^2 - 2 \thetav^{\top}\bm{\delta}^{(i)}(\yi-\thetav^{\top}\xi) + \thetav^{\top}\bm{\delta}^{(i)}\bm{{\delta}}^{(i) \top}\thetav\big)\Big]$$
 By linearity of expectation, $\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}]=\bm{0}_p$ and $\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}]=\lambda \id_p$, this is
 \vspace{-0.2cm}
 %
-\begin{align*}\riskt&={\textstyle \sumin}\big((\yi-\thetab^{\top}\xi)^2 - 2 \thetab^{\top}\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}](\yi-\thetab^{\top}\xi) + \thetab^{\top}\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}]\thetab \big) \\
-&= {\textstyle \sumin}(\yi-\thetab^{\top}\xi)^2+\lambda \Vert \thetab \Vert_2^2
+\begin{align*}\riskt&={\textstyle \sumin}\big((\yi-\thetav^{\top}\xi)^2 - 2 \thetav^{\top}\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}](\yi-\thetav^{\top}\xi) + \thetav^{\top}\mathbb{E}_{\bm{\delta}}[\bm{\delta}^{(i)}\bm{\delta}^{(i)\top}]\thetav \big) \\
+&= {\textstyle \sumin}(\yi-\thetav^{\top}\xi)^2+\lambda \Vert \thetav \Vert_2^2
 \end{align*}
 $\Longrightarrow$ Ridge regression on unperturbed features {\small $\xi$} turns out to be the same as minimizing squared loss averaged over feature noise distribution!
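The row-augmentation identity in the hunk above lends itself to a quick numerical check. The following numpy sketch is not part of the patch; n, p, lam and the simulated X, y are made-up illustrative values. It verifies that the closed-form ridge estimator (X^T X + lambda I)^{-1} X^T y coincides with unregularized least squares on the augmented data (X stacked on sqrt(lambda) I_p, y stacked on 0_p):

# Sketch: ridge closed form vs. OLS on row-augmented data (illustrative values only).
import numpy as np

rng = np.random.default_rng(0)
n, p, lam = 50, 5, 2.0                       # made-up sample size, dimension, penalty
X = rng.normal(size=(n, p))
y = X @ rng.normal(size=p) + rng.normal(scale=0.5, size=n)

# Closed-form ridge estimator (X^T X + lambda I)^{-1} X^T y
theta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# Unregularized least squares on the augmented data (X; sqrt(lambda) I_p), (y; 0_p)
X_aug = np.vstack([X, np.sqrt(lam) * np.eye(p)])
y_aug = np.concatenate([y, np.zeros(p)])
theta_aug, *_ = np.linalg.lstsq(X_aug, y_aug, rcond=None)

print(np.allclose(theta_ridge, theta_aug))   # True: both estimators agree

A practical consequence of this trick is that any plain least-squares routine can fit a ridge model by simply stacking sqrt(lambda) I_p below X and zeros below y.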
diff --git a/slides/regularization/slides-regu-wd-vs-l2.tex b/slides/regularization/slides-regu-wd-vs-l2.tex
index 4d40045e..d87d9054 100644
--- a/slides/regularization/slides-regu-wd-vs-l2.tex
+++ b/slides/regularization/slides-regu-wd-vs-l2.tex
@@ -20,24 +20,24 @@
 Let's optimize $L2$-regularized risk of a model $\fxt$
 \vspace{-0.2cm}
 \[
-\min_{\thetab} \riskrt = \min_{\thetab} \risket + \frac{\lambda}{2} \|\thetab\|^2_2
+\min_{\thetav} \riskrt = \min_{\thetav} \risket + \frac{\lambda}{2} \|\thetav\|^2_2
 \]
 by GD. The gradient is
 \[
-\nabla_{\thetab} \riskrt = \nabla_{\thetab} \risket + \lambda \thetab
+\nabla_{\thetav} \riskrt = \nabla_{\thetav} \risket + \lambda \thetav
 \]
-We iteratively update $\thetab$ by step size \(\alpha\) times the
+We iteratively update $\thetav$ by step size \(\alpha\) times the
 negative gradient
 \vspace{-0.2cm}
 \begin{align*}
-\thetab^{[\text{new}]} &= \thetab^{[\text{old}]} - \alpha \left(\nabla_{\thetab} \riske(\thetab^{[\text{old}]}) + \lambda \thetab^{[\text{old}]}\right) \\&=
-\thetab^{[\text{old}]} (1 - \alpha \lambda) - \alpha \nabla_{\thetab} \riske(\thetab^{[\text{old}]})
+\thetav^{[\text{new}]} &= \thetav^{[\text{old}]} - \alpha \left(\nabla_{\thetav} \riske(\thetav^{[\text{old}]}) + \lambda \thetav^{[\text{old}]}\right) \\&=
+\thetav^{[\text{old}]} (1 - \alpha \lambda) - \alpha \nabla_{\thetav} \riske(\thetav^{[\text{old}]})
 \end{align*}
 {\small
-We see how $\thetab^{[old]}$ decays in magnitude -- for small $\alpha$ and $\lambda$ -- before we do the gradient step. Performing the decay directly, under this name, is a very well-known technique in DL - and simply $L2$ regularization in disguise (for GD).
+We see how $\thetav^{[old]}$ decays in magnitude -- for small $\alpha$ and $\lambda$ -- before we do the gradient step. Performing the decay directly, under this name, is a very well-known technique in DL - and simply $L2$ regularization in disguise (for GD).
 }
 \framebreak
@@ -66,9 +66,9 @@
 \textbf{Caveat}: Equivalence of weight decay and $L2$ only holds for (S)GD!
 \begin{itemize}\setlength{\itemsep}{0.5em}
- \item \citelink{HANSON1988} originally define WD ``decoupled'' from gradient-updates {\footnotesize $\alpha \nabla_{\thetab} \riske(\thetab^{[\text{old}]})$} as
- {\footnotesize $\thetab^{[\text{new}]} =
- \thetab^{[\text{old}]} (1 - \lambda') - \alpha \nabla_{\thetab} \riske(\thetab^{[\text{old}]})$}
+ \item \citelink{HANSON1988} originally define WD ``decoupled'' from gradient-updates {\footnotesize $\alpha \nabla_{\thetav} \riske(\thetav^{[\text{old}]})$} as
+ {\footnotesize $\thetav^{[\text{new}]} =
+ \thetav^{[\text{old}]} (1 - \lambda') - \alpha \nabla_{\thetav} \riske(\thetav^{[\text{old}]})$}
 \item This is equivalent to modern WD/$L2$ (last slide) using reparameterization $\lambda'=\alpha \lambda$
 % \item Using this we see the WD is decoupled from the gradient updates
 \item Consequence: if there is optimal $\lambda'$, then optimal $L2$ penalty is tightly coupled to $\alpha$ as $\lambda=\lambda'/ \alpha$ (and vice versa)
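The reparameterization argument in this hunk can also be checked numerically. The following numpy sketch is not part of the patch; the data, step size alpha and penalty lambda are made-up illustrations. It runs plain gradient descent on the L2-regularized empirical risk next to the decoupled weight-decay update with lambda' = alpha * lambda and shows that the two iterate sequences coincide, in line with the caveat that the equivalence holds for (S)GD:

# Sketch: GD on L2-regularized risk vs. decoupled weight decay with lambda' = alpha * lambda.
import numpy as np

rng = np.random.default_rng(1)
n, p = 100, 3                                 # made-up data set
X = rng.normal(size=(n, p))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=n)

def grad_emp_risk(theta):
    # Gradient of the (unregularized) mean squared error empirical risk
    return 2 * X.T @ (X @ theta - y) / n

alpha, lam = 0.05, 0.1                        # step size and L2 penalty (illustrative)
lam_prime = alpha * lam                       # reparameterized decay factor

theta_l2 = np.zeros(p)                        # GD on R_emp(theta) + (lambda/2) * ||theta||_2^2
theta_wd = np.zeros(p)                        # decoupled weight decay update
for _ in range(200):
    theta_l2 = theta_l2 - alpha * (grad_emp_risk(theta_l2) + lam * theta_l2)
    theta_wd = theta_wd * (1 - lam_prime) - alpha * grad_emp_risk(theta_wd)

print(np.allclose(theta_l2, theta_wd))        # True: identical iterates under plain GD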