From 00c3e058cf4445324f3504269cd483b3d48d3876 Mon Sep 17 00:00:00 2001
From: ludwigbothmann <46222472+ludwigbothmann@users.noreply.github.com>
Date: Fri, 16 Aug 2024 14:33:06 +0200
Subject: [PATCH] post-meeting updates for advriskmin

---
 slides/advriskmin/references.bib                 |  7 ++
 .../slides-advriskmin-risk-minimizer.tex         | 75 +++++++++++--------
 2 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/slides/advriskmin/references.bib b/slides/advriskmin/references.bib
index 7874fb33..27e9c43b 100644
--- a/slides/advriskmin/references.bib
+++ b/slides/advriskmin/references.bib
@@ -1,3 +1,10 @@
+@article{BROWN2024BIAS,
+  title={Bias/Variance is not the same as Approximation/Estimation},
+  author={Brown, Gavin and Ali, Riccardo},
+  year={2024},
+  journal={Transactions on Machine Learning Research}
+}
+
 @inproceedings{SOLLICH1999NINTH,
   author={Sollich, P.},
   booktitle={1999 Ninth International Conference on Artificial Neural Networks ICANN 99. (Conf. Publ. No. 470)},
diff --git a/slides/advriskmin/slides-advriskmin-risk-minimizer.tex b/slides/advriskmin/slides-advriskmin-risk-minimizer.tex
index 01ac9f78..da11eb13 100644
--- a/slides/advriskmin/slides-advriskmin-risk-minimizer.tex
+++ b/slides/advriskmin/slides-advriskmin-risk-minimizer.tex
@@ -59,19 +59,19 @@
 \begin{vbframe}{Two short examples}

 \textbf{Regression with linear model:}\\
 \begin{itemize}
- \item Model: $f(\xi) = \thetab^\top \xi + \theta_0$
+ \item Model: $f(\xv) = \thetab^\top \xv + \theta_0$
 \item Squared loss:
- $L(\yi, f(\xi)) = \left(\yi - f(\xi)\right)^2$
- \item Hypothesis space: $$\Hspace_{\text{lin}} = \left\{ \xi \mapsto \thetab^\top \xi + \theta_0 : \thetab \in \mathbb{R}^d, \theta_0 \in \mathbb{R} \right\}$$
+ $\Lyf = \left(y-f\right)^2$
+ \item Hypothesis space: $$\Hspace_{\text{lin}} = \left\{ \xv \mapsto \thetab^\top \xv + \theta_0 : \thetab \in \mathbb{R}^d, \theta_0 \in \mathbb{R} \right\}$$
 \end{itemize}

 \vspace{0.3cm}

 \textbf{Binary classification with shallow MLP:}\\
 \begin{itemize}
- \item Model: $f(\xi) = \bm{w}_2^{\top} \text{ReLU}(\bm{W}_1 \xi + \bm{b}_1) + b_2$
- \item Binary cross-entropy loss: $L(\yi, f(\xi)) = -(\yi\log(p^{(i)})+(1-\yi)\log(1-p^{(i)}))$\\ where $p^{(i)} = \sigma(f(\xi))$ (logistic sigmoid)
- \item Hypothesis space: {\small $$\Hspace_{\text{MLP}} = \left\{ \xi \mapsto \bm{w}_2^{\top} \text{ReLU}(\bm{W}_1 \xi + \bm{b}_1) + b_2: \mathbf{W}_1 \in \mathbb{R}^{h \times d}, \mathbf{b}_1 \in \mathbb{R}^h, \mathbf{w}_2 \in \mathbb{R}^h, b_2 \in \mathbb{R} \right\}$$}
+ \item Model: $f(\xv) = \pi(\xv) = \sigma(\bm{w}_2^{\top} \text{ReLU}(\bm{W}_1 \xv + \bm{b}_1) + b_2)$
+ \item Binary cross-entropy loss: $\Lpiv = -(y\log(\pi)+(1-y)\log(1-\pi))$
+ \item Hypothesis space: {\small $$\Hspace_{\text{MLP}} = \left\{ \xv \mapsto \sigma(\bm{w}_2^{\top} \text{ReLU}(\bm{W}_1 \xv + \bm{b}_1) + b_2): \mathbf{W}_1 \in \mathbb{R}^{h \times d}, \mathbf{b}_1 \in \mathbb{R}^h, \mathbf{w}_2 \in \mathbb{R}^h, b_2 \in \mathbb{R} \right\}$$}
 \end{itemize}

 \end{vbframe}

@@ -79,25 +79,25 @@
 \begin{vbframe}{Optimal constants for a loss}

 \begin{itemize}
-\item Let's assume some RV $Z \in \Yspace$ for a label
-\item Z not $Y$, because we want to fiddle with its distribution
-\item Assume Z has distribution Q, so $Z \sim Q$
-\item We can now consider $\argmin_c \E_{Z \sim Q}[L(Z, c)]$\\
-so the score-constant which loss-minimally approximates Z
+\item Let's assume some RV $z \in \Yspace$ for a label
+\item We write $z$, not $y$, because we want to fiddle with its distribution
+\item Assume $z$ has distribution $Q$, so $z \sim Q$
+\item We can now consider $\argmin_c \E_{z \sim Q}[L(z, c)]$\\
+i.e., the constant score that approximates $z$ with minimal expected loss
 \end{itemize}

 \lz

 We will consider 3 cases for Q

 \begin{itemize}
-\item $Q = P_Y$, simply our labels and their marginal distribution in $\Pxy$
-\item $Q = P_{Y | X = x}$, conditional label distribution at point $X = x$
+\item $Q = P_y$, simply our labels and their marginal distribution in $\Pxy$
+\item $Q = P_{y | \xv}$, the conditional label distribution at a given point $\xv$
 \item $Q = P_n$, the empirical product distribution for data $y_1, \ldots, y_n$
 \end{itemize}

 \lz

-If we can solve $\argmin_c \E_{Z \sim Q}[L(Z, c)]$ for any $Q$,
+If we can solve $\argmin_c \E_{z \sim Q}[L(z, c)]$ for any $Q$,
 we will get multiple useful results!

@@ -139,16 +139,16 @@
 \end{itemize}

-$$ \argmin \E [L(Z, c)] = $$
-$$ \argmin \E [(Z - c)^2] = $$
-$$ \argmin \E [Z^2] - 2cE[Z] + c^2 = $$
-$$ E[Z] $$
+$$ \argmin_c \E [L(z, c)] = $$
+$$ \argmin_c \E [(z - c)^2] = $$
+$$ \argmin_c \left( \E [z^2] - 2c\E[z] + c^2 \right) = $$
+$$ \E[z] $$

 \begin{itemize}
-\item Using $Q = P_Y$, this means that, given we know the label distribution,
-the best constant is $c = E[Y]$.
+\item Using $Q = P_y$, this means that, given we know the label distribution,
+the best constant is $c = \E[y]$.
 \item If we only have data $y_1, \ldots y_n$
-$\argmin \E_{Z \sim P_n} [(Z - c)^2] = \E_{Z \sim P_n}[Z] = \frac{1}{n} \sumin \yi = \bar{y}$
+$\argmin_c \E_{z \sim P_n} [(z - c)^2] = \E_{z \sim P_n}[z] = \frac{1}{n} \sumin \yi = \bar{y}$
 \item And we want to find and optimal constant model for

@@ -177,7 +177,7 @@
 Let us assume we are in an \enquote{ideal world}:

 \begin{itemize}
-  \item The hypothesis space $\Hspace$ is unrestricted. We can choose any $f: \Xspace \to \R^g$.
+  \item The hypothesis space $\Hspace = \Hspace_{all}$ is unrestricted. We can choose any measurable $f: \Xspace \to \R^g$.
   \item We also assume an ideal optimizer; the risk minimization can always be solved perfectly and efficiently.
   \item We know $\Pxy$.

@@ -192,14 +192,23 @@
 is called the \textbf{risk minimizer}, \textbf{population minimizer} or \textbf{Bayes optimal model}.

 \begin{eqnarray*}
- \fbayes &=& \argmin_{f: \Xspace \to \R^g} \risk_L\left(f\right) = \argmin_{f: \Xspace \to \R^g}\Exy\left[\Lxy\right]\\ &=& \argmin_{f: \Xspace \to \R^g}\int \Lxy \text{d}\Pxy.
+ \fbayes_{\Hspace_{all}} &=& \argmin_{f \in \Hspace_{all}} \risk\left(f\right) = \argmin_{f: \Xspace \to \R^g}\Exy\left[\Lxy\right]\\ &=& \argmin_{f: \Xspace \to \R^g}\int \Lxy \text{d}\Pxy.
 \end{eqnarray*}

-% Note that we search over an unrestricted hypothesis space (that is over all possible functions $f: \Xspace \to \R^g$)!
+The resulting risk is called the \textbf{Bayes risk}: $\riskbayes = \risk(\fbayes)$

-\lz
+\lz
+
+Note: if we leave out the hypothesis space in the subscript, the intended one should be clear from the context.\\
+
+Similarly, we define the risk minimizer over some $\Hspace \subset \Hspace_{all}$ as

-The resulting risk is called \textbf{Bayes risk}: $\riskbayes_{L} = \risk_L(\fbayes)$
+\begin{eqnarray*}
+ \fbayes_{\Hspace} &=& \argmin_{f \in \Hspace} \risk\left(f\right)
+\end{eqnarray*}
+
+
+% Note that we search over an unrestricted hypothesis space (that is over all possible functions $f: \Xspace \to \R^g$)!
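A note on the squared-loss derivation in the hunk above: the chain of $\argmin_c$ displays goes from the expanded quadratic straight to $\E[z]$. The missing step is the first-order condition, sketched below in the deck's notation (the local name $c^*$ and the align* environment are not taken from the slides):

% Sketch: the objective is a convex quadratic in c, so the first-order condition
% already determines the unique minimizer.
\begin{align*}
  \frac{\partial}{\partial c}\, \E_{z \sim Q}\left[(z - c)^2\right]
    &= -2\,\E[z] + 2c \overset{!}{=} 0
    \quad \Longrightarrow \quad c^* = \E[z].
\end{align*}
% For Q = P_n this gives c^* = \frac{1}{n} \sumin \yi = \bar{y}, matching the slide.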
@@ -230,7 +239,7 @@
 \begin{frame}[t]{optimal point-wise predictions}

 To derive the risk minimizer, observe that by law of total expectation
-$$ \risk_L(f) = \E_{xy} \left[\Lxy\right]
+$$ \risk(f) = \E_{xy} \left[\Lxy\right]
 = \E_x \left[\E_{y|x}\left[\Lxy~|~\xv = \xv\right]\right].$$

 \begin{itemize}

@@ -275,11 +284,11 @@
 \begin{vbframe}{Estimation and Approximation Error}

-\textbf{Goal of learning: } Train a model $\hat f$ for which the true risk $\risk_L\left(\hat f\right)$ is close to the Bayes risk $\riskbayes_L$. In other words, we want the \textbf{Bayes regret}
+\textbf{Goal of learning: } Train a model $\hat f$ for which the true risk $\risk\left(\hat f\right)$ is close to the Bayes risk $\riskbayes$. In other words, we want the \textbf{Bayes regret}

 $$
- \risk_L\left(\hat f\right) - \riskbayes_{L}
+ \risk\left(\hat f\right) - \riskbayes
 $$

 to be as low as possible.

@@ -289,7 +298,7 @@
 The Bayes regret can be decomposed as follows:

 \begin{eqnarray*}
- \risk_L\left(\hat f\right) - \riskbayes_{L} &=& \underbrace{\left[\risk_L\left(\hat f\right) - \inf_{f \in \Hspace} \risk_L(f)\right]}_{\text{estimation error}} + \underbrace{\left[\inf_{f \in \Hspace} \risk_L(f) - \riskbayes_{L}\right]}_{\text{approximation error}}
+ \risk\left(\hat f\right) - \riskbayes &=& \underbrace{\left[\risk\left(\hat f\right) - \inf_{f \in \Hspace} \risk(f)\right]}_{\text{estimation error}} + \underbrace{\left[\inf_{f \in \Hspace} \risk(f) - \riskbayes\right]}_{\text{approximation error}}
 \end{eqnarray*}

 \framebreak

@@ -301,8 +310,8 @@
 \end{center}

 \begin{itemize}
- \item $\risk_L\left(\hat f\right) - \inf_{f \in \Hspace} \risk(f)$ is the \textbf{estimation error}. We fit $\hat f$ via empirical risk minimization and (usually) use approximate optimization, so we usually do not find the optimal $f \in \Hspace$.
- \item $\inf_{f \in \Hspace} \risk_L(f) - \riskbayes_{L}$ is the \textbf{approximation error}. We need to restrict to a hypothesis space $\Hspace$ which might not even contain the Bayes optimal model $\fbayes$.
+ \item $\risk\left(\hat f\right) - \inf_{f \in \Hspace} \risk(f)$ is the \textbf{estimation error}. We fit $\hat f$ via empirical risk minimization and (usually) use approximate optimization, so we typically do not find the optimal $f \in \Hspace$.
+ \item $\inf_{f \in \Hspace} \risk(f) - \riskbayes$ is the \textbf{approximation error}. We need to restrict to a hypothesis space $\Hspace$, which might not even contain the Bayes optimal model $\fbayes$.
 \end{itemize}

 \end{vbframe}

@@ -322,7 +331,7 @@
 The learning method $\ind$ is said to be \textbf{consistent} w.r.t. a certain distribution $\Pxy$ if the risk of the estimated model $\hat f$ converges in probability ( \enquote{$\overset{p}{\longrightarrow}$}) to the Bayes risk $\riskbayes$ when $n_\text{train}$ goes to $\infty$:

 $$
- \risk\left(\ind\left(\Dtrain\right)\right) \overset{p}{\longrightarrow} \riskbayes_L \quad \text{for } n_\text{train} \to \infty.
+ \risk\left(\ind\left(\Dtrain\right)\right) \overset{p}{\longrightarrow} \riskbayes \quad \text{for } n_\text{train} \to \infty.
 $$

 \vfill
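To connect the point-wise view in the first hunk above with the optimal-constant result, here is a short worked case for the squared loss, written as a sketch in the deck's macros ($\fbayes$, $\riskbayes$, $\xv$); the conditional-variance form of the Bayes risk is the standard identity for this loss and is not stated in the patch itself:

% Sketch (squared loss): minimize the inner expectation point-wise, i.e. choose the best
% constant under Q = P_{y|x}; by the optimal-constant result this is the conditional mean,
% and the risk that remains is the expected conditional variance.
\begin{align*}
  \fbayes(\xv) &= \argmin_c \E_{y|x}\left[ (y - c)^2 ~|~ \xv \right] = \E\left[ y ~|~ \xv \right],\\
  \riskbayes   &= \E_x\left[ \mathrm{Var}\left( y ~|~ \xv \right) \right].
\end{align*}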
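The estimation/approximation decomposition in the Bayes-regret hunk holds simply by adding and subtracting $\inf_{f \in \Hspace} \risk(f)$; spelling that out, together with the sign of each bracket, may help readers. A sketch, assuming $\hat f \in \Hspace$ and a finite infimum:

% Sketch: insert -inf + inf between the two risks; the first bracket is the estimation
% error (>= 0 because \hat f \in \Hspace), the second the approximation error
% (>= 0 because \Hspace \subset \Hspace_{all}).
\begin{align*}
  \risk(\hat f) - \riskbayes
    &= \Big[ \risk(\hat f) - \inf_{f \in \Hspace} \risk(f) \Big]
     + \Big[ \inf_{f \in \Hspace} \risk(f) - \riskbayes \Big],
  \qquad \text{both brackets} \geq 0.
\end{align*}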