latex-math update: rename thetab -> thetav
jemus42 committed Oct 23, 2024
1 parent 3af2ac0 commit d5ae5a4
Showing 65 changed files with 701 additions and 701 deletions.
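
The rename itself is mechanical: every occurrence of the \thetab macro family in the repository's .tex and .Rnw sources becomes \thetav, and derived names such as \thetabh become \thetavh (see the last diff hunk below). A minimal sketch of how such a bulk substitution could be scripted, assuming Python and a plain substring swap (the file extensions are taken from the file list below; the script is illustrative, not necessarily how this commit was produced):

import pathlib

# Swap the \thetab macro prefix for \thetav in every .tex and .Rnw file below
# the current directory. Derived macros such as \thetabh pick up the new
# spelling (\thetavh) through the same substring replacement.
for pattern in ("*.tex", "*.Rnw"):
    for path in pathlib.Path(".").rglob(pattern):
        if not path.is_file():
            continue
        text = path.read_text(encoding="utf-8")
        new_text = text.replace("\\thetab", "\\thetav")
        if new_text != text:
            path.write_text(new_text, encoding="utf-8")
            print(f"updated {path}")

Anchoring on the shared thetab prefix rather than the exact macro name is what lets a single pass also carry \thetabh over to \thetavh, matching the diffs shown below.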
8 changes: 4 additions & 4 deletions cheatsheets/cheatsheet_sl.tex
@@ -236,8 +236,8 @@
\begin{tabular}{c|cc}
& Logistic Regression & Softmax Regression \\ \hline
$\Yspace$ & $\{0, 1\}$ & $\{1, 2, ..., g\}$ \\[0.5cm]
-Discriminant fun. & $f(\xv) = \thetab^\top \xv$ & $f_k(\xv) = \thetab_{k}^{\top} \xv, k = 1, \ldots, g$ \\[0.5cm]
-Probabilities & $\pi(\xv) = \frac{1}{1 + \exp\left(-\thetab^\top \xv\right)}$ & $\pi_k(\xv) = \frac{\exp(\thetab_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetab_j^\top \xv) }$ \\[0.5cm]
+Discriminant fun. & $f(\xv) = \thetav^\top \xv$ & $f_k(\xv) = \thetav_{k}^{\top} \xv, k = 1, \ldots, g$ \\[0.5cm]
+Probabilities & $\pi(\xv) = \frac{1}{1 + \exp\left(-\thetav^\top \xv\right)}$ & $\pi_k(\xv) = \frac{\exp(\thetav_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetav_j^\top \xv) }$ \\[0.5cm]
$L(y, \pix)$ & Bernoulli / logarithmic loss & Multiclass logarithmic loss\\[-0.3cm]
& $-y \log \left(\pix\right) - (1 - y) \log \left(1 - \pix\right)$ & $ - \sum_{k = 1}^g [y = k] \log\left(\pi_k(\xv)\right)$ \\
\end{tabular}
@@ -444,12 +444,12 @@
%\begin{myblock}{Components of Learning}

%\textbf{Learning = Hypothesis space + Risk + Optimization} \\
-%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetab \in \Theta}
+%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetav \in \Theta}
%\risket$

%
% \textbf{Learning &= Hypothesis space &+ Risk &+ Optimization} \\
-% &= $\Hspace &+ \risket &+ \argmin_{\thetab \in \Theta} \risket$
+% &= $\Hspace &+ \risket &+ \argmin_{\thetav \in \Theta} \risket$
%
% \textbf{Hypothesis space: } Defines (and restricts!) what kind of model $f$
% can be learned from the data.
32 changes: 16 additions & 16 deletions cheatsheets/cheatsheet_sl_2.tex
@@ -169,21 +169,21 @@
%
\item $J(f)$ is the \textbf{complexity/roughness penalty} or \textbf{regularizer}.
\item $\lambda > 0$ is the \textbf{complexity control} parameter.
-\item For parameterized hypotheses: $\riskrt = \risket + \lambda \cdot J(\thetab)$.
+\item For parameterized hypotheses: $\riskrt = \risket + \lambda \cdot J(\thetav)$.
\end{itemize}
%
Tackles the trade-off: \emph{maximizing} the fit (minimizing the train loss) vs.\ \emph{minimizing} the complexity of the model. \\

%
-Regularization in the linear model ($\fx = \thetab^\top \xv$):
+Regularization in the linear model ($\fx = \thetav^\top \xv$):
%
\begin{itemize}
\setlength{\itemindent}{+.3in}
%
-\item Ridge regression: $J(\thetab) = \|\thetab\|_2^2 = \thetab^\top \thetab.$
-\item Lasso regression: $J(\thetab) = \|\thetab\|_1 = \sum_{j=1}^p |\theta_j|.$
-\item Elastic net regression: $J(\thetab) = (\|\thetab\|_2^2, \|\thetab\|_1)^\top$ and $\lambda=(\lambda_1,\lambda_2).$
-\item L0 regression: $J(\thetab) = \|\thetab\|_0 = \sum_{j=1}^p |\theta_j|^0.$
+\item Ridge regression: $J(\thetav) = \|\thetav\|_2^2 = \thetav^\top \thetav.$
+\item Lasso regression: $J(\thetav) = \|\thetav\|_1 = \sum_{j=1}^p |\theta_j|.$
+\item Elastic net regression: $J(\thetav) = (\|\thetav\|_2^2, \|\thetav\|_1)^\top$ and $\lambda=(\lambda_1,\lambda_2).$
+\item L0 regression: $J(\thetav) = \|\thetav\|_0 = \sum_{j=1}^p |\theta_j|^0.$
%
\end{itemize}
%
@@ -204,7 +204,7 @@
%
Signed distance to the separating hyperplane:
$$
-d \left(f, \xi \right) = \frac{\yi \fxi}{\|\thetab\|} = \yi \frac{\thetab^T \xi + \theta_0}{\|\thetab\|}
+d \left(f, \xi \right) = \frac{\yi \fxi}{\|\thetav\|} = \yi \frac{\thetav^T \xi + \theta_0}{\|\thetav\|}
$$
Distance of $f$ to the whole dataset $\D:$
$
@@ -214,13 +214,13 @@
\textbf{Primal linear hard-margin SVM:}
%
\begin{eqnarray*}
-& \min\limits_{\thetab, \theta_0} \quad & \frac{1}{2} \|\thetab\|^2 \\
-& \text{s.t.} & \,\,\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset
+& \min\limits_{\thetav, \theta_0} \quad & \frac{1}{2} \|\thetav\|^2 \\
+& \text{s.t.} & \,\,\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset
\end{eqnarray*}
%
Support vectors: All instances $(\xi, \yi)$ with minimal margin
$\yi \fxi = 1$, fulfilling the inequality constraints with equality.
-All have distance of $\gamma = 1 / \|\thetab\|$ from the separating hyperplane.
+All have distance of $\gamma = 1 / \|\thetav\|$ from the separating hyperplane.

\textbf{Dual linear hard-margin SVM:}
%
@@ -233,7 +233,7 @@
Solution (if existing):
%
$$
-\thetah = \sum\nolimits_{i=1}^n \hat \alpha_i \yi \xi, \quad \theta_0 = \yi - \scp{\thetab}{\xi}.
+\thetah = \sum\nolimits_{i=1}^n \hat \alpha_i \yi \xi, \quad \theta_0 = \yi - \scp{\thetav}{\xi}.
$$
%
\end{myblock}
@@ -256,8 +256,8 @@
%
\textbf{Primal linear soft-margin SVM:}
\begin{eqnarray*}
-& \min\limits_{\thetab, \thetab_0,\sli} & \frac{1}{2} \|\thetab\|^2 + C \sum_{i=1}^n \sli \\
-& \text{s.t.} & \,\, \yi \left( \scp{\thetab}{\xi} + \thetab_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\
+& \min\limits_{\thetav, \thetav_0,\sli} & \frac{1}{2} \|\thetav\|^2 + C \sum_{i=1}^n \sli \\
+& \text{s.t.} & \,\, \yi \left( \scp{\thetav}{\xi} + \thetav_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\
& \text{and} & \,\, \sli \geq 0 \quad \forall\, i \in \nset,\\
\end{eqnarray*}
%
@@ -285,7 +285,7 @@
%
Regularized empirical risk minimization representation:
%
-$$ \risket = \frac{1}{2} \|\thetab\|^2 + C \sumin \Lxyi ;\; \Lyf = \max(1-yf, 0)$$
+$$ \risket = \frac{1}{2} \|\thetav\|^2 + C \sumin \Lxyi ;\; \Lyf = \max(1-yf, 0)$$
%

\end{myblock}
@@ -366,12 +366,12 @@
%\begin{myblock}{Components of Learning}

%\textbf{Learning = Hypothesis space + Risk + Optimization} \\
-%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetab \in \Theta}
+%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetav \in \Theta}
%\risket$

%
% \textbf{Learning &= Hypothesis space &+ Risk &+ Optimization} \\
-% &= $\Hspace &+ \risket &+ \argmin_{\thetab \in \Theta} \risket$
+% &= $\Hspace &+ \risket &+ \argmin_{\thetav \in \Theta} \risket$
%
% \textbf{Hypothesis space: } Defines (and restricts!) what kind of model $f$
% can be learned from the data.
18 changes: 9 additions & 9 deletions cheatsheets/cheatsheet_sl_3.tex
@@ -109,23 +109,23 @@
Bayesian Linear Model:
%
\begin{eqnarray*}
-\yi &=& \fxi + \epsi = \thetab^T \xi + \epsi, \quad \text{for } i \in \{1, \ldots, n\}
+\yi &=& \fxi + \epsi = \thetav^T \xi + \epsi, \quad \text{for } i \in \{1, \ldots, n\}
\end{eqnarray*}
%
where $\epsi \sim \mathcal{N}(0, \sigma^2).$
%
-Parameter vector $\thetab$ is stochastic and follows a distribution.\\
+Parameter vector $\thetav$ is stochastic and follows a distribution.\\
%

Gaussian variant:
%
\begin{itemize}
\setlength{\itemindent}{+.3in}
-\item Prior distribution: $\thetab \sim \mathcal{N}(\zero, \tau^2 \id_p)$
+\item Prior distribution: $\thetav \sim \mathcal{N}(\zero, \tau^2 \id_p)$
\item Posterior distribution: $
-\thetab ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1})
+\thetav ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1})
$ with $\bm{A}:= \sigma^{-2}\Xmat^\top\Xmat + \frac{1}{\tau^2} \id_p$
-\item Predictive distribution of $y_* = \thetab^\top \xv_*$ for a new observations $\xv_*$:
+\item Predictive distribution of $y_* = \thetav^\top \xv_*$ for a new observations $\xv_*$:
$$
y_* ~|~ \Xmat, \yv, \xv_* \sim \mathcal{N}(\sigma^{-2}\yv^\top \Xmat \Amat^{-1}\xv_*, \xv_*^\top\Amat^{-1}\xv_*)
$$
@@ -141,8 +141,8 @@
\begin{tabular}{cc}
\textbf{Weight-Space View} & \textbf{Function-Space View} \vspace{4mm}\\
Parameterize functions & \vspace{1mm}\\
-\footnotesize Example: $\fxt = \thetab^\top \xv$ & \vspace{3mm}\\
-Define distributions on $\thetab$ & Define distributions on $f$ \vspace{4mm}\\
+\footnotesize Example: $\fxt = \thetav^\top \xv$ & \vspace{3mm}\\
+Define distributions on $\thetav$ & Define distributions on $f$ \vspace{4mm}\\
Inference in parameter space $\Theta$ & Inference in function space $\Hspace$
\end{tabular}
\end{table}
@@ -393,12 +393,12 @@
%\begin{myblock}{Components of Learning}

%\textbf{Learning = Hypothesis space + Risk + Optimization} \\
-%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetab \in \Theta}
+%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetav \in \Theta}
%\risket$

%
% \textbf{Learning &= Hypothesis space &+ Risk &+ Optimization} \\
-% &= $\Hspace &+ \risket &+ \argmin_{\thetab \in \Theta} \risket$
+% &= $\Hspace &+ \risket &+ \argmin_{\thetav \in \Theta} \risket$
%
% \textbf{Hypothesis space: } Defines (and restricts!) what kind of model $f$
% can be learned from the data.
6 changes: 3 additions & 3 deletions exercises/advriskmin/ex_rnw/ex_connection_mle_erm.Rnw
@@ -32,12 +32,12 @@ where $\eps^{(1)},\ldots,\eps^{(n)}$ are iid with distribution $\mathcal{N}(0, \
%
\begin{equation*}
\begin{split}
-\Hspace = \{f(\cdot~|~ \thetab): \Xspace \to \R \ ~|~ & f(\cdot~|~ \thetab) \text{ belongs to a certain
-functional family parameterized by } \thetab \in \Theta \},
+\Hspace = \{f(\cdot~|~ \thetav): \Xspace \to \R \ ~|~ & f(\cdot~|~ \thetav) \text{ belongs to a certain
+functional family parameterized by } \thetav \in \Theta \},
\end{split}
\end{equation*}
%
-where $\thetab = (\theta_1, \theta_2, \ldots, \theta_d)$ is a parameter vector, which is an element of a \textbf{parameter space}
+where $\thetav = (\theta_1, \theta_2, \ldots, \theta_d)$ is a parameter vector, which is an element of a \textbf{parameter space}
$\Theta$.
%
Based on your findings in (a), establish a relationship between minimizing the negative log-likelihood for $(\xv^{(1)},z^{(1)}),\ldots,(\xv^{(n)},z^{(n)})$ and empirical loss minimization over $\Hspace$ of the generalized L2-loss function of Exercise sheet 1, i.e., $\Lxy= \big(m(y)-m(\fx)\big)^2.$
12 changes: 6 additions & 6 deletions exercises/advriskmin/ex_rnw/ex_glm_optim.Rnw
@@ -12,18 +12,18 @@ current competitor and market data.
\{1, 2, \dots, n\}, i \neq j$, with sample size $n$.
\begin{itemize}
\item Argue which of the following distributions from the one-parametric exponential family is most suitable for the underlying use case: normal, Bernoulli, gamma or Poisson.
-\item Write down the probability distribution of the chosen distribution depending on $\thetab$ assuming a log link function.
+\item Write down the probability distribution of the chosen distribution depending on $\thetav$ assuming a log link function.
\end{itemize}

%The GLM models the target as a linear function of the features
-%with Gaussian error term: $\ydat = \Xmat \thetab + \epsilon$, \\
+%with Gaussian error term: $\ydat = \Xmat \thetav + \epsilon$, \\
%$\epsilon \sim N(\bm{0}, \mathit{diag}(\sigma^2)), ~~ \sigma > 0$.
% Furthermore, you have reason to believe that the effect of mileage might be
% non-linear, so you decide to include this quantity logarithmically (using the
% natural logarithm).

\item State the hypothesis space for the corresponding model class.
-For this, assume the parameter vector $\thetab$ to include the intercept
+For this, assume the parameter vector $\thetav$ to include the intercept
coefficient.
\item Which parameters need to be learned?
Define the corresponding parameter space $\Theta$.
@@ -32,15 +32,15 @@
likelihood estimation (MLE).
%The likelihood for the LM is given by:
% \[
-% \ell(\thetab) = - \frac{n}{2} \log(2 \sigma^2 \pi) - \frac{1}{2 \sigma^2}
-% (\ydat - \Xmat \thetab)^T(\ydat - \Xmat \thetab)
+% \ell(\thetav) = - \frac{n}{2} \log(2 \sigma^2 \pi) - \frac{1}{2 \sigma^2}
+% (\ydat - \Xmat \thetav)^T(\ydat - \Xmat \thetav)
% \]

% \\
% &= \left( \frac{1}{2 \pi \sigma^2} \right)^{\frac{n}{2}} \exp \left(-
% \frac{1}{2 \sigma^2} \sumin \left(\yi - \thetat \xi \right)^2 \right) \\
% &= \left( \frac{1}{2 \pi \sigma^2} \right)^{\frac{n}{2}} \exp \left(-
-% \frac{1}{2 \sigma^2} \| \ydat - \Xmat \thetab \|^2 \right)
+% \frac{1}{2 \sigma^2} \| \ydat - \Xmat \thetav \|^2 \right)
Describe how you can make use of the likelihood in empirical risk minimization
(ERM) and write down the likelihood as well as the resulting empirical risk.
%\item Now you need to optimize this risk to find the best parameters,
18 changes: 9 additions & 9 deletions exercises/advriskmin/ex_rnw/sol_connection_mle_erm.Rnw
@@ -10,26 +10,26 @@
The likelihood for $(\xv^{(1)},z^{(1)}),\ldots,(\xv^{(n)},z^{(n)})$ is
%
\begin{eqnarray*}
-\LL(\thetab) &=& \prod_{i=1}^n \pdf\left(z^{(i)} ~\bigg|~ \fxit, \sigma^2\right) \\ &\propto& \exp\left(-\frac{1}{2\sigma^2}\sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2\right)\,.
+\LL(\thetav) &=& \prod_{i=1}^n \pdf\left(z^{(i)} ~\bigg|~ \fxit, \sigma^2\right) \\ &\propto& \exp\left(-\frac{1}{2\sigma^2}\sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2\right)\,.
\end{eqnarray*}
%
So, the negative log-likelihood for $(\xv^{(1)},z^{(1)}),\ldots,(\xv^{(n)},z^{(n)})$ is
%
\begin{eqnarray*}
-- \loglt &=& - \log\left(\LL(\thetab)\right) \\
+- \loglt &=& - \log\left(\LL(\thetav)\right) \\
&=& - \log\left( \exp\left(-\frac{1}{2\sigma^2} \sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2\right) \right) \\
&\propto& \sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2 \\
&=& \sumin \left[ m(\yi) - m\left(\fxit\right)\right]^2.
\end{eqnarray*}
%
-Thus, the negative log-likelihood for a parameter $\thetab$ is proportional to the empirical risk of a hypothesis $f(\cdot ~|~ \thetab)$ w.r.t. the generalized L2-loss function of Exercise sheet 1, i.e., $\Lxy= \big(m(y)-m(\fx)\big)^2.$
+Thus, the negative log-likelihood for a parameter $\thetav$ is proportional to the empirical risk of a hypothesis $f(\cdot ~|~ \thetav)$ w.r.t. the generalized L2-loss function of Exercise sheet 1, i.e., $\Lxy= \big(m(y)-m(\fx)\big)^2.$
%

\item First, we specify the feature space: $\Xspace = \{1\} \times \R,$ i.e., any feature $\xv \in \Xspace$ is of the form $\xv=(x_1,x_2)^\top = (1,x_2)^\top$ for some $x_2\in \R.$
%
According to the exercise we use $m(x)=\log(x),$ whose inverse is $m^{-1}(x)=\exp(x).$
%
-Let us rewrite Forbes' conjectured model $ y = \theta_1 \exp(\theta_2 x + \eps)$ into $y = m^{-1} \left( m(f(\xv~|~ \thetab)) + \eps \right),$ for some suitable hypothesis $f(\xv~|~ \thetab):$
+Let us rewrite Forbes' conjectured model $ y = \theta_1 \exp(\theta_2 x + \eps)$ into $y = m^{-1} \left( m(f(\xv~|~ \thetav)) + \eps \right),$ for some suitable hypothesis $f(\xv~|~ \thetav):$
%
\begin{align*}
%
@@ -47,13 +47,13 @@
%
\end{align*}
%
-With this, we see that $f(\xv~|~ \thetab) = \theta_1 x_1 \exp(\theta_2 x_2) = \theta_1 \exp(\theta_2 x_2)$ is a suitable functional form for the hypotheses.
+With this, we see that $f(\xv~|~ \thetav) = \theta_1 x_1 \exp(\theta_2 x_2) = \theta_1 \exp(\theta_2 x_2)$ is a suitable functional form for the hypotheses.
%
Thus, we use as our parameter space $\Theta = \R_+ \times \R$ which gives rise to the hypothesis space
%
\begin{equation*}
\begin{split}
-\Hspace = \{f(\xv~|~ \thetab) = \theta_1 x_1 \exp(\theta_2 x_2) ~|~ \thetab \in \Theta \}.
+\Hspace = \{f(\xv~|~ \thetav) = \theta_1 x_1 \exp(\theta_2 x_2) ~|~ \thetav \in \Theta \}.
\end{split}
\end{equation*}
%
@@ -76,15 +76,15 @@ A suitable hypothesis space is then
%
\begin{equation*}
\begin{split}
-\Hspace = \{f(\xv~|~ \thetab) = \log(\theta_1) x_1 + \theta_2 x_2 ~|~ \thetab \in \Theta \},
+\Hspace = \{f(\xv~|~ \thetav) = \log(\theta_1) x_1 + \theta_2 x_2 ~|~ \thetav \in \Theta \},
\end{split}
\end{equation*}
%
-which are the linear functions\footnote{Note that $\log(\theta_1)$ can be any value in $\R.$} $\xv^\top \thetab$ of features in $\Xspace.$
+which are the linear functions\footnote{Note that $\log(\theta_1)$ can be any value in $\R.$} $\xv^\top \thetav$ of features in $\Xspace.$
%
The empirical risk minimizer in this case is specified by the parameter
%
-$$(\log(\hat{\theta}_1),\hat{\theta}_2)^\top = \thetabh=\left(\Xmat^T \Xmat\right)^{-1}\Xmat^T \bm{z}, \qquad \bm{z} = (\log y^{(1)},\ldots,\log y^{(n)})^\top,$$
+$$(\log(\hat{\theta}_1),\hat{\theta}_2)^\top = \thetavh=\left(\Xmat^T \Xmat\right)^{-1}\Xmat^T \bm{z}, \qquad \bm{z} = (\log y^{(1)},\ldots,\log y^{(n)})^\top,$$
%
(see \href{https://slds-lmu.github.io/i2ml/chapters/02_supervised_regression/02-02-linearmodel/}{Chapter 02.02 of I2ML}) which for this simple case is:
%
