
Commit

Merge overleaf-2025-02-11-0934 into main
ludwigbothmann authored Feb 11, 2025
2 parents c0b8996 + 5d4c8b4 commit d58ba20
Showing 5 changed files with 12 additions and 12 deletions.
14 changes: 7 additions & 7 deletions slides/advriskmin/slides-advriskmin-risk-minimizer.tex
@@ -80,19 +80,19 @@

\begin{itemize}
\item Let's assume some RV $z \in \Yspace$ for a label
-\item z not RV $y$, because we want to fiddle with its distribution
+\item z not the same as $y$, as we want to fiddle with its distribution
\item Assume z has distribution Q, so $z \sim Q$
-\item We can now consider $\argmin_c \E_{z \sim Q}[L(z, c)]$\\
-so the score-constant which loss-minimally approximates z
+\item We now consider $\argmin_c \E_{z \sim Q}[L(z, c)]$\\
+What is the score-constant which approximates z with minimal loss?
\end{itemize}

\lz

We will consider 3 cases for Q
\begin{itemize}
-\item $Q = P_y$, simply our labels and their marginal distribution in $\Pxy$
-\item $Q = P_{y | x = x}$, conditional label distribution at point $x = \tilde{x}$
+\item $Q = P_y$, (uncond.) distribution of labels y, marginal of $\Pxy$
\item $Q = P_n$, the empirical product distribution for data $y_1, \ldots, y_n$
+\item $Q = P_{y | x = \tilde{x}}$, conditional label distribution at point $x = \tilde{x}$
\end{itemize}

\lz
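For reference, the three choices of Q in the hunk above correspond to three versions of the same constant-model objective. A short sketch in plain LaTeX (not part of the commit; the slide macros are replaced by standard notation):

\begin{align*}
Q = P_y: &\quad \hat{c} = \arg\min_c \; \mathbb{E}_{y \sim P_y}\big[L(y, c)\big] \\
Q = P_n: &\quad \hat{c} = \arg\min_c \; \tfrac{1}{n}\sum_{i=1}^n L\big(y^{(i)}, c\big) \\
Q = P_{y \mid x = \tilde{x}}: &\quad \hat{c} = \arg\min_c \; \mathbb{E}\big[L(y, c) \mid x = \tilde{x}\big]
\end{align*}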
@@ -104,7 +104,7 @@
\end{vbframe}


-\begin{vbframe}{Optimal Constant Model}
+\begin{vbframe}{Unconditional: Optimal Constant Model}

\begin{itemize}
{\footnotesize
@@ -131,7 +131,7 @@
\end{vbframe}


-\begin{vbframe}{Optimal Constant Model}
+\begin{vbframe}{Unconditional: Optimal Constant Model}

\begin{itemize}
\item Let's start with the simplest case, L2 loss
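The slide continues with the L2 loss as the simplest case. As a reminder of where that leads, a standard derivation sketch (not shown in this diff):

\begin{align*}
\arg\min_c \; \mathbb{E}_{z \sim Q}\big[(z - c)^2\big]
  &= \arg\min_c \Big( \mathbb{E}[z^2] - 2c\,\mathbb{E}[z] + c^2 \Big), \\
\frac{d}{dc}\Big( \mathbb{E}[z^2] - 2c\,\mathbb{E}[z] + c^2 \Big)
  &= -2\,\mathbb{E}[z] + 2c \overset{!}{=} 0
  \quad\Rightarrow\quad c^\ast = \mathbb{E}_{z \sim Q}[z],
\end{align*}

so for $Q = P_n$ the optimal constant model is the sample mean $\bar{y} = \frac{1}{n}\sum_{i=1}^n y^{(i)}$.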
@@ -5,10 +5,10 @@
\begin{algorithmic}[1]
\State Initialize $\hat{f}^{[0]}(\xv)$ with loss optimal constant model%
\For{$m = 1 \to M$}
-\State $(\hat{\alpha^{[m]}}, \thetamh) = \argmin \limits_{\alpha, \bm{\theta}} \sum \limits_{i=1}^n
+\State $(\hat{\alpha}^{[m]}, \thetamh) = \argmin \limits_{\alpha, \bm{\theta}} \sum \limits_{i=1}^n
L\left(\yi, \fmdh\left(\xi\right) + \alpha b\left(\xi, \bm{\theta}\right)\right)$
\vspace{1.5ex}
-\State Update $\fmh(\xv) \gets \fmdh(\xv) + \hat{\alpha^{[m]}} b\left(\xv, \thetamh\right)$
+\State Update $\fmh(\xv) \gets \fmdh(\xv) + \hat{\alpha}^{[m]} b\left(\xv, \thetamh\right)$
\EndFor
\end{algorithmic}
\end{center}
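A minimal Python sketch of the forward stagewise loop shown in the hunk above, specialized to L2 loss and depth-1 regression trees as base learners b(x, theta); function and variable names are illustrative and not taken from the repository:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

def boost(X, y, M=100):
    """Forward stagewise additive modeling, L2-loss sketch."""
    # Loss-optimal constant model for L2 loss: the mean of y.
    f0 = float(np.mean(y))
    pred = np.full(len(y), f0)
    learners, alphas = [], []

    for m in range(M):
        # For L2 loss, minimizing sum_i L(y_i, f(x_i) + a * b(x_i, theta))
        # over theta reduces to fitting b to the current residuals.
        residuals = y - pred
        b = DecisionTreeRegressor(max_depth=1).fit(X, residuals)
        b_pred = b.predict(X)

        # Line search for alpha; closed form for L2: <r, b> / <b, b>.
        alpha = residuals @ b_pred / (b_pred @ b_pred + 1e-12)

        # Update f^[m] = f^[m-1] + alpha^[m] * b(., theta^[m]).
        pred = pred + alpha * b_pred
        learners.append(b)
        alphas.append(alpha)

    return f0, learners, alphas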
2 changes: 1 addition & 1 deletion slides/boosting/slides-boosting-cwb-basics2.tex
@@ -116,7 +116,7 @@
\item {\footnotesize Assume linear base learners $b_j(\xv) = \theta_{j1} + \theta_{j2} x_j$. %with
%one intercept $\theta_{j1}$ %per base learner
%and slope $\theta_{j2}$.
-If base learner $\hat{b}_j$ with parameter $\thetamh[1] = (\hat{\theta}_{j1}^{[1]}, \hat{\theta}_{j1}^{[1]})$ is selected in first iteration, model intercept is updated to $\fm[0](\xv) + \hat{\theta}_{j1}^{[1]}$.}
+If base learner $\hat{b}_j$ with parameter $\thetamh[1] = (\hat{\theta}_{j1}^{[1]}, \hat{\theta}_{j2}^{[1]})$ is selected in first iteration, model intercept is updated to $\fm[0](\xv) + \hat{\theta}_{j1}^{[1]}$.}
\item {\footnotesize During training, intercept is adjusted $M$ times to yield $\fm[0](\xv) + \sum\limits_{m=1}^M %\hat{\theta}^{[m]}_{j^{[m]}1}
\hat{\theta}^{[m]}_{j1}$}
\end{itemize}
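To spell out the intercept update mentioned in this hunk, a one-step sketch in plain notation (the slide macros are replaced by standard symbols; $\hat{f}^{[0]}$ is the constant initial model):

\begin{align*}
\hat{f}^{[1]}(\mathbf{x})
  = \hat{f}^{[0]}(\mathbf{x}) + \hat{\theta}_{j1}^{[1]} + \hat{\theta}_{j2}^{[1]} x_j
  = \underbrace{\big(\hat{f}^{[0]}(\mathbf{x}) + \hat{\theta}_{j1}^{[1]}\big)}_{\text{new intercept}}
    + \hat{\theta}_{j2}^{[1]} x_j .
\end{align*}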
2 changes: 1 addition & 1 deletion slides/gaussian-processes/slides-gp-basic.tex
@@ -562,7 +562,7 @@ \section{Gaussian Processes as Indexed Family}
\vspace*{0.5cm}
\begin{itemize}
\item $T = \{1, \dots, m\}$, $Y_t$'s are RVs: Indexed family is a random vector. \vspace*{0.2cm}
-\item $T = \{1, \dots, m\}$, $Y_t$'s are RVs: Indexed family is a stochastic process in discrete time \vspace*{0.2cm}
+\item $T = \N$, $Y_t$'s are RVs: Indexed family is a stochastic process in discrete time \vspace*{0.2cm}
\item $T = \Z^2$, $Y_t$'s are RVs: Indexed family is a 2D-random walk.
\end{itemize}
\end{minipage}\hfill
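As a small illustration of the first case in the hunk above (finite index set, so the indexed family is a random vector), a Python sketch that draws one realization of a GP restricted to T = {1, ..., m}; the squared-exponential kernel and all names are illustrative assumptions, not from the slides:

import numpy as np

def se_kernel(s, t, length_scale=1.0):
    """Squared-exponential covariance k(s, t)."""
    return np.exp(-0.5 * ((s - t) / length_scale) ** 2)

# Finite index set T = {1, ..., m}: the indexed family (Y_1, ..., Y_m)
# is a random vector with a multivariate normal distribution.
m = 5
T = np.arange(1, m + 1, dtype=float)
K = se_kernel(T[:, None], T[None, :])   # m x m covariance matrix
rng = np.random.default_rng(0)
y = rng.multivariate_normal(mean=np.zeros(m), cov=K)
print(y)   # one realization of the random vector (Y_1, ..., Y_m)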
2 changes: 1 addition & 1 deletion slides/multiclass/slides-mc-softmax-regression.tex
@@ -130,7 +130,7 @@
Hence, our model is \enquote{over-parameterized}. For any hypothesis we might fit,
there are multiple parameter vectors that give rise to exactly the same hypothesis function.
This also implies that the minimizer of $\risket$ above is not unique!
-Hence, a numerical trick is to set $\thetav_g = 0$ and only optimize the other $\thetav_k$. This does not restrict our hypothesis space, but the constrained problem is now convex, i.e., there exists exactly one parameter vector for every hypothesis.
+Hence, a numerical trick is to set $\thetav_g = 0$ and only optimize the other $\thetav_k$. This does not restrict our hypothesis space, but the constrained problem is now strictly convex, i.e., there exists exactly one parameter vector for every hypothesis.

\item A similar approach is used in many ML models: multiclass LDA, naive Bayes, neural networks and boosting.

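To make the over-parameterization argument in this hunk concrete, a short sketch in generic softmax notation (an illustration, not copied from the slides): shifting every parameter vector by the same $\mathbf{v}$ leaves the hypothesis unchanged,

\begin{align*}
\pi_k(\mathbf{x})
  = \frac{\exp\big((\boldsymbol{\theta}_k - \mathbf{v})^\top \mathbf{x}\big)}
         {\sum_j \exp\big((\boldsymbol{\theta}_j - \mathbf{v})^\top \mathbf{x}\big)}
  = \frac{\exp(\boldsymbol{\theta}_k^\top \mathbf{x})\,\exp(-\mathbf{v}^\top \mathbf{x})}
         {\sum_j \exp(\boldsymbol{\theta}_j^\top \mathbf{x})\,\exp(-\mathbf{v}^\top \mathbf{x})}
  = \frac{\exp(\boldsymbol{\theta}_k^\top \mathbf{x})}{\sum_j \exp(\boldsymbol{\theta}_j^\top \mathbf{x})},
\end{align*}

so choosing $\mathbf{v} = \boldsymbol{\theta}_g$ (i.e., setting $\boldsymbol{\theta}_g = \mathbf{0}$) removes the redundancy without shrinking the hypothesis space.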
