latex-math update: rename thetab -> thetav
jemus42 committed Oct 23, 2024
1 parent 3af2ac0 commit d5ae5a4
Showing 65 changed files with 701 additions and 701 deletions.
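
The rename itself is mechanical: every occurrence of the \thetab macro family in the repository's .tex and .Rnw sources becomes \thetav, and derived names such as \thetabh become \thetavh (see the last diff hunk below). A minimal sketch of how such a bulk substitution could be scripted, assuming Python and a plain substring swap (the file extensions are taken from the file list below; the script is illustrative, not necessarily how this commit was produced):

import pathlib

# Swap the \thetab macro prefix for \thetav in every .tex and .Rnw file below
# the current directory. Derived macros such as \thetabh pick up the new
# spelling (\thetavh) through the same substring replacement.
for pattern in ("*.tex", "*.Rnw"):
    for path in pathlib.Path(".").rglob(pattern):
        if not path.is_file():
            continue
        text = path.read_text(encoding="utf-8")
        new_text = text.replace("\\thetab", "\\thetav")
        if new_text != text:
            path.write_text(new_text, encoding="utf-8")
            print(f"updated {path}")

Anchoring on the shared thetab prefix rather than the exact macro name is what lets a single pass also carry \thetabh over to \thetavh, matching the diffs shown below.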
8 changes: 4 additions & 4 deletions cheatsheets/cheatsheet_sl.tex
@@ -236,8 +236,8 @@
\begin{tabular}{c|cc}
& Logistic Regression & Softmax Regression \\ \hline
$\Yspace$ & $\{0, 1\}$ & $\{1, 2, ..., g\}$ \\[0.5cm]
-Discriminant fun. & $f(\xv) = \thetab^\top \xv$ & $f_k(\xv) = \thetab_{k}^{\top} \xv, k = 1, \ldots, g$ \\[0.5cm]
-Probabilities & $\pi(\xv) = \frac{1}{1 + \exp\left(-\thetab^\top \xv\right)}$ & $\pi_k(\xv) = \frac{\exp(\thetab_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetab_j^\top \xv) }$ \\[0.5cm]
+Discriminant fun. & $f(\xv) = \thetav^\top \xv$ & $f_k(\xv) = \thetav_{k}^{\top} \xv, k = 1, \ldots, g$ \\[0.5cm]
+Probabilities & $\pi(\xv) = \frac{1}{1 + \exp\left(-\thetav^\top \xv\right)}$ & $\pi_k(\xv) = \frac{\exp(\thetav_k^\top \xv)}{\sum_{j = 1}^g \exp(\thetav_j^\top \xv) }$ \\[0.5cm]
$L(y, \pix)$ & Bernoulli / logarithmic loss & Multiclass logarithmic loss\\[-0.3cm]
& $-y \log \left(\pix\right) - (1 - y) \log \left(1 - \pix\right)$ & $ - \sum_{k = 1}^g [y = k] \log\left(\pi_k(\xv)\right)$ \\
\end{tabular}
@@ -444,12 +444,12 @@
%\begin{myblock}{Components of Learning}

%\textbf{Learning = Hypothesis space + Risk + Optimization} \\
-%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetab \in \Theta}
+%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetav \in \Theta}
%\risket$

%
% \textbf{Learning &= Hypothesis space &+ Risk &+ Optimization} \\
-% &= $\Hspace &+ \risket &+ \argmin_{\thetab \in \Theta} \risket$
+% &= $\Hspace &+ \risket &+ \argmin_{\thetav \in \Theta} \risket$
%
% \textbf{Hypothesis space: } Defines (and restricts!) what kind of model $f$
% can be learned from the data.
32 changes: 16 additions & 16 deletions cheatsheets/cheatsheet_sl_2.tex
@@ -169,21 +169,21 @@
%
\item $J(f)$ is the \textbf{complexity/roughness penalty} or \textbf{regularizer}.
\item $\lambda > 0$ is the \textbf{complexity control} parameter.
-\item For parameterized hypotheses: $\riskrt = \risket + \lambda \cdot J(\thetab)$.
+\item For parameterized hypotheses: $\riskrt = \risket + \lambda \cdot J(\thetav)$.
\end{itemize}
%
Tackles the trade-off: \emph{maximizing} the fit (minimizing the train loss) vs.\ \emph{minimizing} the complexity of the model. \\

%
-Regularization in the linear model ($\fx = \thetab^\top \xv$):
+Regularization in the linear model ($\fx = \thetav^\top \xv$):
%
\begin{itemize}
\setlength{\itemindent}{+.3in}
%
-\item Ridge regression: $J(\thetab) = \|\thetab\|_2^2 = \thetab^\top \thetab.$
-\item Lasso regression: $J(\thetab) = \|\thetab\|_1 = \sum_{j=1}^p |\theta_j|.$
-\item Elastic net regression: $J(\thetab) = (\|\thetab\|_2^2, \|\thetab\|_1)^\top$ and $\lambda=(\lambda_1,\lambda_2).$
-\item L0 regression: $J(\thetab) = \|\thetab\|_0 = \sum_{j=1}^p |\theta_j|^0.$
+\item Ridge regression: $J(\thetav) = \|\thetav\|_2^2 = \thetav^\top \thetav.$
+\item Lasso regression: $J(\thetav) = \|\thetav\|_1 = \sum_{j=1}^p |\theta_j|.$
+\item Elastic net regression: $J(\thetav) = (\|\thetav\|_2^2, \|\thetav\|_1)^\top$ and $\lambda=(\lambda_1,\lambda_2).$
+\item L0 regression: $J(\thetav) = \|\thetav\|_0 = \sum_{j=1}^p |\theta_j|^0.$
%
\end{itemize}
%
@@ -204,7 +204,7 @@
%
Signed distance to the separating hyperplane:
$$
-d \left(f, \xi \right) = \frac{\yi \fxi}{\|\thetab\|} = \yi \frac{\thetab^T \xi + \theta_0}{\|\thetab\|}
+d \left(f, \xi \right) = \frac{\yi \fxi}{\|\thetav\|} = \yi \frac{\thetav^T \xi + \theta_0}{\|\thetav\|}
$$
Distance of $f$ to the whole dataset $\D:$
$
@@ -214,13 +214,13 @@
\textbf{Primal linear hard-margin SVM:}
%
\begin{eqnarray*}
-& \min\limits_{\thetab, \theta_0} \quad & \frac{1}{2} \|\thetab\|^2 \\
-& \text{s.t.} & \,\,\yi \left( \scp{\thetab}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset
+& \min\limits_{\thetav, \theta_0} \quad & \frac{1}{2} \|\thetav\|^2 \\
+& \text{s.t.} & \,\,\yi \left( \scp{\thetav}{\xi} + \theta_0 \right) \geq 1 \quad \forall\, i \in \nset
\end{eqnarray*}
%
Support vectors: All instances $(\xi, \yi)$ with minimal margin
$\yi \fxi = 1$, fulfilling the inequality constraints with equality.
-All have distance of $\gamma = 1 / \|\thetab\|$ from the separating hyperplane.
+All have distance of $\gamma = 1 / \|\thetav\|$ from the separating hyperplane.

\textbf{Dual linear hard-margin SVM:}
%
@@ -233,7 +233,7 @@
Solution (if existing):
%
$$
-\thetah = \sum\nolimits_{i=1}^n \hat \alpha_i \yi \xi, \quad \theta_0 = \yi - \scp{\thetab}{\xi}.
+\thetah = \sum\nolimits_{i=1}^n \hat \alpha_i \yi \xi, \quad \theta_0 = \yi - \scp{\thetav}{\xi}.
$$
%
\end{myblock}
@@ -256,8 +256,8 @@
%
\textbf{Primal linear soft-margin SVM:}
\begin{eqnarray*}
-& \min\limits_{\thetab, \thetab_0,\sli} & \frac{1}{2} \|\thetab\|^2 + C \sum_{i=1}^n \sli \\
-& \text{s.t.} & \,\, \yi \left( \scp{\thetab}{\xi} + \thetab_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\
+& \min\limits_{\thetav, \thetav_0,\sli} & \frac{1}{2} \|\thetav\|^2 + C \sum_{i=1}^n \sli \\
+& \text{s.t.} & \,\, \yi \left( \scp{\thetav}{\xi} + \thetav_0 \right) \geq 1 - \sli \quad \forall\, i \in \nset,\\
& \text{and} & \,\, \sli \geq 0 \quad \forall\, i \in \nset,\\
\end{eqnarray*}
%
@@ -285,7 +285,7 @@
%
Regularized empirical risk minimization representation:
%
-$$ \risket = \frac{1}{2} \|\thetab\|^2 + C \sumin \Lxyi ;\; \Lyf = \max(1-yf, 0)$$
+$$ \risket = \frac{1}{2} \|\thetav\|^2 + C \sumin \Lxyi ;\; \Lyf = \max(1-yf, 0)$$
%

\end{myblock}
@@ -366,12 +366,12 @@
%\begin{myblock}{Components of Learning}

%\textbf{Learning = Hypothesis space + Risk + Optimization} \\
-%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetab \in \Theta}
+%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetav \in \Theta}
%\risket$

%
% \textbf{Learning &= Hypothesis space &+ Risk &+ Optimization} \\
-% &= $\Hspace &+ \risket &+ \argmin_{\thetab \in \Theta} \risket$
+% &= $\Hspace &+ \risket &+ \argmin_{\thetav \in \Theta} \risket$
%
% \textbf{Hypothesis space: } Defines (and restricts!) what kind of model $f$
% can be learned from the data.
18 changes: 9 additions & 9 deletions cheatsheets/cheatsheet_sl_3.tex
@@ -109,23 +109,23 @@
Bayesian Linear Model:
%
\begin{eqnarray*}
-\yi &=& \fxi + \epsi = \thetab^T \xi + \epsi, \quad \text{for } i \in \{1, \ldots, n\}
+\yi &=& \fxi + \epsi = \thetav^T \xi + \epsi, \quad \text{for } i \in \{1, \ldots, n\}
\end{eqnarray*}
%
where $\epsi \sim \mathcal{N}(0, \sigma^2).$
%
-Parameter vector $\thetab$ is stochastic and follows a distribution.\\
+Parameter vector $\thetav$ is stochastic and follows a distribution.\\
%

Gaussian variant:
%
\begin{itemize}
\setlength{\itemindent}{+.3in}
-\item Prior distribution: $\thetab \sim \mathcal{N}(\zero, \tau^2 \id_p)$
+\item Prior distribution: $\thetav \sim \mathcal{N}(\zero, \tau^2 \id_p)$
\item Posterior distribution: $
-\thetab ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1})
+\thetav ~|~ \Xmat, \yv \sim \mathcal{N}(\sigma^{-2}\bm{A}^{-1}\Xmat^\top\yv, \bm{A}^{-1})
$ with $\bm{A}:= \sigma^{-2}\Xmat^\top\Xmat + \frac{1}{\tau^2} \id_p$
-\item Predictive distribution of $y_* = \thetab^\top \xv_*$ for a new observations $\xv_*$:
+\item Predictive distribution of $y_* = \thetav^\top \xv_*$ for a new observations $\xv_*$:
$$
y_* ~|~ \Xmat, \yv, \xv_* \sim \mathcal{N}(\sigma^{-2}\yv^\top \Xmat \Amat^{-1}\xv_*, \xv_*^\top\Amat^{-1}\xv_*)
$$
@@ -141,8 +141,8 @@
\begin{tabular}{cc}
\textbf{Weight-Space View} & \textbf{Function-Space View} \vspace{4mm}\\
Parameterize functions & \vspace{1mm}\\
-\footnotesize Example: $\fxt = \thetab^\top \xv$ & \vspace{3mm}\\
-Define distributions on $\thetab$ & Define distributions on $f$ \vspace{4mm}\\
+\footnotesize Example: $\fxt = \thetav^\top \xv$ & \vspace{3mm}\\
+Define distributions on $\thetav$ & Define distributions on $f$ \vspace{4mm}\\
Inference in parameter space $\Theta$ & Inference in function space $\Hspace$
\end{tabular}
\end{table}
@@ -393,12 +393,12 @@
%\begin{myblock}{Components of Learning}

%\textbf{Learning = Hypothesis space + Risk + Optimization} \\
-%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetab \in \Theta}
+%\phantom{\textbf{Learning}} \textbf{= }$ \Hspace + \risket + \argmin_{\thetav \in \Theta}
%\risket$

%
% \textbf{Learning &= Hypothesis space &+ Risk &+ Optimization} \\
-% &= $\Hspace &+ \risket &+ \argmin_{\thetab \in \Theta} \risket$
+% &= $\Hspace &+ \risket &+ \argmin_{\thetav \in \Theta} \risket$
%
% \textbf{Hypothesis space: } Defines (and restricts!) what kind of model $f$
% can be learned from the data.
6 changes: 3 additions & 3 deletions exercises/advriskmin/ex_rnw/ex_connection_mle_erm.Rnw
@@ -32,12 +32,12 @@ where $\eps^{(1)},\ldots,\eps^{(n)}$ are iid with distribution $\mathcal{N}(0, \
%
\begin{equation*}
\begin{split}
-\Hspace = \{f(\cdot~|~ \thetab): \Xspace \to \R \ ~|~ & f(\cdot~|~ \thetab) \text{ belongs to a certain
-functional family parameterized by } \thetab \in \Theta \},
+\Hspace = \{f(\cdot~|~ \thetav): \Xspace \to \R \ ~|~ & f(\cdot~|~ \thetav) \text{ belongs to a certain
+functional family parameterized by } \thetav \in \Theta \},
\end{split}
\end{equation*}
%
-where $\thetab = (\theta_1, \theta_2, \ldots, \theta_d)$ is a parameter vector, which is an element of a \textbf{parameter space}
+where $\thetav = (\theta_1, \theta_2, \ldots, \theta_d)$ is a parameter vector, which is an element of a \textbf{parameter space}
$\Theta$.
%
Based on your findings in (a), establish a relationship between minimizing the negative log-likelihood for $(\xv^{(1)},z^{(1)}),\ldots,(\xv^{(n)},z^{(n)})$ and empirical loss minimization over $\Hspace$ of the generalized L2-loss function of Exercise sheet 1, i.e., $\Lxy= \big(m(y)-m(\fx)\big)^2.$
12 changes: 6 additions & 6 deletions exercises/advriskmin/ex_rnw/ex_glm_optim.Rnw
@@ -12,18 +12,18 @@ current competitor and market data.
\{1, 2, \dots, n\}, i \neq j$, with sample size $n$.
\begin{itemize}
\item Argue which of the following distributions from the one-parametric exponential family is most suitable for the underlying use case: normal, Bernoulli, gamma or Poisson.
-\item Write down the probability distribution of the chosen distribution depending on $\thetab$ assuming a log link function.
+\item Write down the probability distribution of the chosen distribution depending on $\thetav$ assuming a log link function.
\end{itemize}

%The GLM models the target as a linear function of the features
-%with Gaussian error term: $\ydat = \Xmat \thetab + \epsilon$, \\
+%with Gaussian error term: $\ydat = \Xmat \thetav + \epsilon$, \\
%$\epsilon \sim N(\bm{0}, \mathit{diag}(\sigma^2)), ~~ \sigma > 0$.
% Furthermore, you have reason to believe that the effect of mileage might be
% non-linear, so you decide to include this quantity logarithmically (using the
% natural logarithm).

\item State the hypothesis space for the corresponding model class.
-For this, assume the parameter vector $\thetab$ to include the intercept
+For this, assume the parameter vector $\thetav$ to include the intercept
coefficient.
\item Which parameters need to be learned?
Define the corresponding parameter space $\Theta$.
@@ -32,15 +32,15 @@
likelihood estimation (MLE).
%The likelihood for the LM is given by:
% \[
-% \ell(\thetab) = - \frac{n}{2} \log(2 \sigma^2 \pi) - \frac{1}{2 \sigma^2}
-% (\ydat - \Xmat \thetab)^T(\ydat - \Xmat \thetab)
+% \ell(\thetav) = - \frac{n}{2} \log(2 \sigma^2 \pi) - \frac{1}{2 \sigma^2}
+% (\ydat - \Xmat \thetav)^T(\ydat - \Xmat \thetav)
% \]

% \\
% &= \left( \frac{1}{2 \pi \sigma^2} \right)^{\frac{n}{2}} \exp \left(-
% \frac{1}{2 \sigma^2} \sumin \left(\yi - \thetat \xi \right)^2 \right) \\
% &= \left( \frac{1}{2 \pi \sigma^2} \right)^{\frac{n}{2}} \exp \left(-
-% \frac{1}{2 \sigma^2} \| \ydat - \Xmat \thetab \|^2 \right)
+% \frac{1}{2 \sigma^2} \| \ydat - \Xmat \thetav \|^2 \right)
Describe how you can make use of the likelihood in empirical risk minimization
(ERM) and write down the likelihood as well as the resulting empirical risk.
%\item Now you need to optimize this risk to find the best parameters,
18 changes: 9 additions & 9 deletions exercises/advriskmin/ex_rnw/sol_connection_mle_erm.Rnw
@@ -10,26 +10,26 @@
The likelihood for $(\xv^{(1)},z^{(1)}),\ldots,(\xv^{(n)},z^{(n)})$ is
%
\begin{eqnarray*}
-\LL(\thetab) &=& \prod_{i=1}^n \pdf\left(z^{(i)} ~\bigg|~ \fxit, \sigma^2\right) \\ &\propto& \exp\left(-\frac{1}{2\sigma^2}\sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2\right)\,.
+\LL(\thetav) &=& \prod_{i=1}^n \pdf\left(z^{(i)} ~\bigg|~ \fxit, \sigma^2\right) \\ &\propto& \exp\left(-\frac{1}{2\sigma^2}\sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2\right)\,.
\end{eqnarray*}
%
So, the negative log-likelihood for $(\xv^{(1)},z^{(1)}),\ldots,(\xv^{(n)},z^{(n)})$ is
%
\begin{eqnarray*}
-- \loglt &=& - \log\left(\LL(\thetab)\right) \\
+- \loglt &=& - \log\left(\LL(\thetav)\right) \\
&=& - \log\left( \exp\left(-\frac{1}{2\sigma^2} \sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2\right) \right) \\
&\propto& \sumin \left[z^{(i)} - m\left(\fxit\right)\right]^2 \\
&=& \sumin \left[ m(\yi) - m\left(\fxit\right)\right]^2.
\end{eqnarray*}
%
-Thus, the negative log-likelihood for a parameter $\thetab$ is proportional to the empirical risk of a hypothesis $f(\cdot ~|~ \thetab)$ w.r.t. the generalized L2-loss function of Exercise sheet 1, i.e., $\Lxy= \big(m(y)-m(\fx)\big)^2.$
+Thus, the negative log-likelihood for a parameter $\thetav$ is proportional to the empirical risk of a hypothesis $f(\cdot ~|~ \thetav)$ w.r.t. the generalized L2-loss function of Exercise sheet 1, i.e., $\Lxy= \big(m(y)-m(\fx)\big)^2.$
%

\item First, we specify the feature space: $\Xspace = \{1\} \times \R,$ i.e., any feature $\xv \in \Xspace$ is of the form $\xv=(x_1,x_2)^\top = (1,x_2)^\top$ for some $x_2\in \R.$
%
According to the exercise we use $m(x)=\log(x),$ whose inverse is $m^{-1}(x)=\exp(x).$
%
-Let us rewrite Forbes' conjectured model $ y = \theta_1 \exp(\theta_2 x + \eps)$ into $y = m^{-1} \left( m(f(\xv~|~ \thetab)) + \eps \right),$ for some suitable hypothesis $f(\xv~|~ \thetab):$
+Let us rewrite Forbes' conjectured model $ y = \theta_1 \exp(\theta_2 x + \eps)$ into $y = m^{-1} \left( m(f(\xv~|~ \thetav)) + \eps \right),$ for some suitable hypothesis $f(\xv~|~ \thetav):$
%
\begin{align*}
%
@@ -47,13 +47,13 @@
%
\end{align*}
%
-With this, we see that $f(\xv~|~ \thetab) = \theta_1 x_1 \exp(\theta_2 x_2) = \theta_1 \exp(\theta_2 x_2)$ is a suitable functional form for the hypotheses.
+With this, we see that $f(\xv~|~ \thetav) = \theta_1 x_1 \exp(\theta_2 x_2) = \theta_1 \exp(\theta_2 x_2)$ is a suitable functional form for the hypotheses.
%
Thus, we use as our parameter space $\Theta = \R_+ \times \R$ which gives rise to the hypothesis space
%
\begin{equation*}
\begin{split}
-\Hspace = \{f(\xv~|~ \thetab) = \theta_1 x_1 \exp(\theta_2 x_2) ~|~ \thetab \in \Theta \}.
+\Hspace = \{f(\xv~|~ \thetav) = \theta_1 x_1 \exp(\theta_2 x_2) ~|~ \thetav \in \Theta \}.
\end{split}
\end{equation*}
%
@@ -76,15 +76,15 @@ A suitable hypothesis space is then
%
\begin{equation*}
\begin{split}
-\Hspace = \{f(\xv~|~ \thetab) = \log(\theta_1) x_1 + \theta_2 x_2 ~|~ \thetab \in \Theta \},
+\Hspace = \{f(\xv~|~ \thetav) = \log(\theta_1) x_1 + \theta_2 x_2 ~|~ \thetav \in \Theta \},
\end{split}
\end{equation*}
%
-which are the linear functions\footnote{Note that $\log(\theta_1)$ can be any value in $\R.$} $\xv^\top \thetab$ of features in $\Xspace.$
+which are the linear functions\footnote{Note that $\log(\theta_1)$ can be any value in $\R.$} $\xv^\top \thetav$ of features in $\Xspace.$
%
The empirical risk minimizer in this case is specified by the parameter
%
-$$(\log(\hat{\theta}_1),\hat{\theta}_2)^\top = \thetabh=\left(\Xmat^T \Xmat\right)^{-1}\Xmat^T \bm{z}, \qquad \bm{z} = (\log y^{(1)},\ldots,\log y^{(n)})^\top,$$
+$$(\log(\hat{\theta}_1),\hat{\theta}_2)^\top = \thetavh=\left(\Xmat^T \Xmat\right)^{-1}\Xmat^T \bm{z}, \qquad \bm{z} = (\log y^{(1)},\ldots,\log y^{(n)})^\top,$$
%
(see \href{https://slds-lmu.github.io/i2ml/chapters/02_supervised_regression/02-02-linearmodel/}{Chapter 02.02 of I2ML}) which for this simple case is:
%
