diff --git a/slides/advriskmin/figure/quantile-regression.pdf b/slides/advriskmin/figure/quantile-regression.pdf
new file mode 100644
index 00000000..f2926106
Binary files /dev/null and b/slides/advriskmin/figure/quantile-regression.pdf differ
diff --git a/slides/advriskmin/figure/telephone-data.pdf b/slides/advriskmin/figure/telephone-data.pdf
new file mode 100644
index 00000000..15054624
Binary files /dev/null and b/slides/advriskmin/figure/telephone-data.pdf differ
diff --git a/slides/advriskmin/figure_man/expected-risk-decomp.png b/slides/advriskmin/figure_man/expected-risk-decomp.png
new file mode 100644
index 00000000..1d1493cd
Binary files /dev/null and b/slides/advriskmin/figure_man/expected-risk-decomp.png differ
diff --git a/slides/advriskmin/rsrc/quantile-regression.R b/slides/advriskmin/rsrc/quantile-regression.R
new file mode 100644
index 00000000..4c4dbc46
--- /dev/null
+++ b/slides/advriskmin/rsrc/quantile-regression.R
@@ -0,0 +1,69 @@
+### This script produces a plot illustrating ERM under the pinball loss.
+### Data: simulated univariate heteroskedastic regression y = theta0 + theta1*x + eps(x),
+### where eps(x) is normally distributed with mean 0 and standard deviation 0.5 + 0.5*x,
+### x is drawn uniformly from [0, 10], theta0 = 1, theta1 = 0.2, n = 200 samples.
+
+rm(list = ls())
+library(quantreg)
+library(ggplot2)
+
+# Data simulation
+set.seed(123)
+n <- 200
+x <- runif(n, 0, 10)
+theta0 <- 1
+theta1 <- 0.2
+eps <- rnorm(n, mean = 0, sd = 0.5 + 0.5 * x)
+y <- theta0 + theta1 * x + eps
+simu.dat <- data.frame(x = x, y = y)
+
+# Pinball loss for the 0.05 quantile
+mod.5 <- rq(y ~ x, tau = 0.05, data = simu.dat)
+res.5 <- mod.5$coefficients
+resids.5 <- mod.5$residuals
+preds.5 <- predict(mod.5)
+# Pinball loss for the 0.5 quantile (median regression)
+mod.50 <- rq(y ~ x, tau = 0.5, data = simu.dat)
+res.50 <- mod.50$coefficients
+resids.50 <- mod.50$residuals
+preds.50 <- predict(mod.50)
+# Pinball loss for the 0.95 quantile
+mod.95 <- rq(y ~ x, tau = 0.95, data = simu.dat)
+res.95 <- mod.95$coefficients
+resids.95 <- mod.95$residuals
+preds.95 <- predict(mod.95)
+
+# Aggregate results
+res.dat <- simu.dat
+res.dat$resid_5 <- resids.5
+res.dat$resid_50 <- resids.50
+res.dat$resid_95 <- resids.95
+res.dat$pred_5 <- preds.5
+res.dat$pred_50 <- preds.50
+res.dat$pred_95 <- preds.95
+
+# Calculate the true conditional mean
+res.dat$true_mean <- theta0 + theta1 * res.dat$x
+
+p <- ggplot(res.dat, aes(x = x)) +
+  geom_line(aes(y = pred_5, color = "q=0.05"), size = 1.6, alpha = 1) +
+  geom_line(aes(y = pred_50, color = "q=0.5"), size = 1.6, alpha = 1) +
+  geom_line(aes(y = pred_95, color = "q=0.95"), size = 1.6, alpha = 1) +
+  geom_line(aes(y = true_mean, linetype = "True mean"), color = "black", size = 1.6) +
+  geom_point(aes(y = y), color = "darkgrey", size = 2, alpha = 1) +
+  labs(y = "y", x = "x", color = "Quantile", linetype = "") +
+  scale_linetype_manual(values = c("True mean" = "dashed")) +
+  theme_minimal() +
+  theme(text = element_text(size = 12),
+        axis.title = element_text(size = rel(2)),
+        axis.text = element_text(size = rel(2)),
+        legend.title = element_text(size = rel(1.5)),
+        legend.text = element_text(size = rel(1.3))) +
+  guides(color = guide_legend(order = 1), linetype = guide_legend(order = 2))
+
+# Print plot
+print(p)
+
+# Save figure
+ggsave(filename = "../figure/quantile-regression.pdf", plot = p)
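+
+# Optional sanity check (illustrative sketch, not needed for the figure):
+# the pinball loss can also be minimized directly with optim(); the result
+# should be close to the rq() fit for tau = 0.5, and about 50% of the
+# observations should fall below the fitted median line.
+pinball <- function(r, tau) ifelse(r >= 0, tau * r, (tau - 1) * r)
+f.pin <- function(b, tau) sum(pinball(y - (b[1] + b[2] * x), tau))
+res.check <- optim(c(0, 0), f.pin, tau = 0.5, method = "Nelder-Mead")$par
+print(rbind(rq = res.50, optim = res.check))
+print(mean(resids.50 < 0))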
diff --git a/slides/advriskmin/rsrc/telephone-data.R b/slides/advriskmin/rsrc/telephone-data.R
new file mode 100644
index 00000000..3e6da80e
--- /dev/null
+++ b/slides/advriskmin/rsrc/telephone-data.R
@@ -0,0 +1,93 @@
+### This script produces a plot comparing various loss functions for robust regression.
+### Dataset: telephone data (number of calls per year in Belgium, in tens of millions).
+### Losses: L2, L1, log-cosh, Huber
+
+rm(list = ls())
+
+library(Rfit)
+library(hqreg)
+library(MASS)
+library(quantreg)
+library(CVXR)
+library(ggplot2)
+
+set.seed(123)
+
+# Data
+tel.dat <- Rfit::telephone
+tel.dat$year <- as.numeric(tel.dat$year)
+
+# L2 loss (least squares)
+fm0 <- lm(calls ~ year, tel.dat)
+X <- model.matrix(fm0)
+res.l2 <- fm0$coefficients
+resids.l2 <- fm0$residuals
+preds.l2 <- predict(fm0)
+
+# Log-cosh regression (direct optimization, started at the OLS solution)
+f <- function(b) with(tel.dat, sum(log(cosh(calls - X %*% b))))
+res.optim <- optim(coef(fm0), f, method = "Nelder-Mead")
+res.logcosh <- res.optim$par
+resids.logcosh <- with(tel.dat, calls - X %*% res.logcosh)
+preds.logcosh <- as.numeric(with(tel.dat, X %*% res.logcosh))
+
+# L1 loss robust regression via quantreg (tau = 0.5 is median regression)
+#mod.l1 <- rq(calls ~ year, tau = 0.5, data = tel.dat)
+#res.l1 <- mod.l1$coefficients
+#resids.l1 <- mod.l1$residuals
+#preds.l1 <- predict(mod.l1)
+
+# L1 loss robust regression (manual optimization)
+f.l1 <- function(b) with(tel.dat, sum(abs(calls - X %*% b)))
+res.optim.l1 <- optim(coef(fm0), f.l1, method = "Nelder-Mead")
+res.l1.manual <- res.optim.l1$par
+resids.l1.manual <- with(tel.dat, calls - X %*% res.l1.manual)
+preds.l1.manual <- as.numeric(with(tel.dat, X %*% res.l1.manual))
+
+# Huber loss robust regression using CVXR
+beta <- Variable(2)
+obj <- sum(CVXR::huber(tel.dat$calls - X %*% beta, 1))
+prob <- Problem(Minimize(obj))
+result <- solve(prob)
+res.huber <- result$getValue(beta)
+names(res.huber) <- names(res.l2)
+resids.huber <- with(tel.dat, calls - X %*% res.huber)
+preds.huber <- as.numeric(with(tel.dat, X %*% res.huber))
+
+# Aggregate results
+res.tel <- tel.dat
+res.tel$resid_l2 <- resids.l2
+#res.tel$resid_l1 <- resids.l1
+res.tel$resid_l1_manual <- resids.l1.manual
+res.tel$resid_huber <- resids.huber
+res.tel$resid_logcosh <- resids.logcosh
+res.tel$pred_l2 <- preds.l2
+#res.tel$pred_l1 <- preds.l1
+res.tel$pred_l1_manual <- preds.l1.manual
+res.tel$pred_huber <- preds.huber
+res.tel$pred_logcosh <- preds.logcosh
+
+#coef.b0 <- c(res.l2[1], res.l1[1], res.huber[1], res.logcosh[1])
+#coef.b1 <- c(res.l2[2], res.l1[2], res.huber[2], res.logcosh[2])
+
+# Plot results for telephone data
+p <- ggplot(res.tel, aes(x = year)) +
+  geom_line(aes(y = pred_l2, color = "L2 (OLS)"), size = 1.6, alpha = 1) +
+  #geom_line(aes(y = pred_l1, color = "L1"), size = 1.4) +
+  geom_line(aes(y = pred_l1_manual, color = "L1 (man)"), size = 1.6, alpha = 1) +
+  geom_line(aes(y = pred_huber, color = "Huber"), size = 1.6, alpha = 1) +
+  geom_line(aes(y = pred_logcosh, color = "Log-Cosh"), size = 1.6, alpha = 1) +
+  geom_point(aes(y = calls), color = "black", size = 4, alpha = 1) +
+  labs(y = "Calls (in 10 mio.)", x = "Year", color = "Loss") +
+  theme_minimal() +
+  theme(text = element_text(size = 12),
+        axis.title = element_text(size = rel(2)),
+        axis.text = element_text(size = rel(2)),
+        legend.title = element_text(size = rel(1.5)),
+        legend.text = element_text(size = rel(1.3)))
+
+# Print plot
+print(p)
+
+# Save figure
+ggsave(filename = "../figure/telephone-data.pdf", plot = p)
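+
+# Optional cross-check (illustrative sketch): minimize the Huber loss directly
+# with optim() and compare to the CVXR solution. Note: CVXR::huber(r, M) equals
+# r^2 for |r| <= M and 2*M*|r| - M^2 otherwise, i.e. twice the textbook Huber
+# loss -- a constant factor that does not change the minimizer.
+huber.loss <- function(r, delta = 1) ifelse(abs(r) <= delta, 0.5 * r^2, delta * (abs(r) - 0.5 * delta))
+f.huber <- function(b) with(tel.dat, sum(huber.loss(calls - X %*% b)))
+res.huber.check <- optim(coef(fm0), f.huber, method = "Nelder-Mead")$par
+print(cbind(cvxr = as.numeric(res.huber), optim = res.huber.check))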
diff --git a/slides/advriskmin/slides-advriskmin-bias-variance-decomposition.tex b/slides/advriskmin/slides-advriskmin-bias-variance-decomposition.tex
index de304400..b602e271 100644
--- a/slides/advriskmin/slides-advriskmin-bias-variance-decomposition.tex
+++ b/slides/advriskmin/slides-advriskmin-bias-variance-decomposition.tex
@@ -11,7 +11,6 @@
 \titlemeta{% Chunk title (example: CART, Forests, Boosting, ...), can be empty
   Advanced Risk Minimization
 }{% Lecture title
-  Advanced Risk Minimization:\\
   Bias-Variance Decomposition
 }{% Relative path to title page image: Can be empty but must not start with slides/
   figure/bias_variance_decomposition-linear_model_bias.png
@@ -40,7 +39,7 @@
 with zero-mean homoskedastic error $\epsilon \sim (0, \sigma^2)$ independent of $\xv$.
 
 \vspace{0.1cm}
-{\scriptsize ${\ast}$ Similar decomp also exist for other losses expressible as Bregman divergences (e.g. cross-entropy). The $0/1$ loss e.g. has no such decomposition (\cite{BROWN2024BIAS})}
+{\scriptsize ${\ast}$ Similar decomps also exist for other losses expressible as Bregman divergences (e.g., cross-entropy). One exception is the $0/1$ loss (\cite{BROWN2024BIAS}).}
 
 \framebreak
 
@@ -255,24 +254,27 @@
 
 \end{vbframe}
 
-\begin{vbframe}{Approximation an Estimation \citelink{BROWN2024BIAS} }
-The Bias-Variance decomposition is often confused or equated with the related (but different) decompo of \textbf{excess risk} into \textbf{estimation} and \textbf{approximation} error.
+\begin{vbframe}{Approximation and Estimation \citelink{BROWN2024BIAS}}
+The Bias-Variance decomp is often confused or equated with the related (but different) decomp of \textbf{excess risk} into \textbf{estimation} and \textbf{approximation} error.
 
 \begin{eqnarray*}
 \underbrace{\risk(\hat f_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{excess risk}} &=& \underbrace{\risk(\hat f_{\Hspace}) - \risk(\fbayes_{\Hspace})}_{\text{estimation error}} + \underbrace{\risk(\fbayes_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{approx. error}}
 \end{eqnarray*}
 
+Both are commonly described using the same figure and analogies.
+\vspace{-0.1cm}
 \begin{figure}
     \centering
-    \includegraphics[width = 0.75\textwidth]{figure_man/biasvar-vs-estapprox-tradeoff.png}
+    \includegraphics[width = 0.78\textwidth]{figure_man/biasvar-vs-estapprox-tradeoff.png}
     \tiny{\\ Credit: \cite{BROWN2024BIAS}}
 \end{figure}
+
 \end{vbframe}
 
 \framebreak
 
-\begin{vbframe}{Approximation/Estimation Error}
+\begin{vbframe}{Approx./Estimation Error \citelink{BROWN2024BIAS}}
 
 \begin{itemize}
   \item The approx. error is a structural property of $\Hspace$.
@@ -280,10 +282,10 @@
   \item Estimation error arises because we choose $f \in \Hspace$ with limited training data using $\riske$ instead of $\risk$
 \end{itemize}
 
-$\fh_{\Hspace}$ assumes we found a global minimizer of $\riske$, which is often impossible.
+Writing $\fh_{\Hspace} \in \argmin_{f \in \Hspace} \riske(f)$ assumes we can find a global minimizer of $\riske$, which is often impossible (e.g., for DNNs).
 \vspace{0.2cm}
 
-In practice, optimizing $\riske$ gives us our 'best guess' $\tilde{f}_{\Hspace} \in \Hspace$ of $\fh_{\Hspace}$. We can now decompose its excess risk as
+In practice, optimizing $\riske$ gives us our 'best guess' $\tilde{f}_{\Hspace} \in \Hspace$ of $\fh_{\Hspace}$. We can now decompose its excess risk more finely as
 \begin{eqnarray*}
 \underbrace{\risk(\tilde{f}_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{excess risk}} &=& \underbrace{\risk(\tilde{f}_{\Hspace}) - \risk(\fh_{\Hspace})}_{\text{optim. error}} + \underbrace{\risk(\hat{f}_{\Hspace}) - \risk(\fbayes_{\Hspace})}_{\text{estimation error}} + \underbrace{\risk(\fbayes_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{approx. error}}
@@ -291,6 +293,42 @@
 Note that the optimization error can be negative, even though $\riske(\tilde{f}_{\Hspace}) \geq \riske(\fh_{\Hspace})$ always holds.
 
+\framebreak
+
+We can decompose the estimation error further by defining the \textit{centroid} model, or 'systematic' model part.\\
+\vspace{0.15cm}
+
+Given $\fh_{\Hspace} \in \argmin_{f \in \Hspace} \riske(f)$, the centroid model under squared loss is the mean prediction at each $\xv$ over all $\D_n$: $f^{\circ}_{\Hspace} := \E_{\D_n \sim \Pxy^n}[\fh_{\Hspace}]$.
+\vspace{0.15cm}
+
+With $f^{\circ}_{\Hspace}$ we can decompose the expected estimation error as
+
+\begin{eqnarray*}
+  \underbrace{\E_{\D_n \sim \Pxy^n}\left[\risk(\hat{f}_{\Hspace}) - \risk(\fbayes_{\Hspace})\right]}_{\text{expected estimation error}} &=& \underbrace{\E_{\D_n \sim \Pxy^n}\left[\risk(\hat{f}_{\Hspace}) - \risk(f^{\circ}_{\Hspace})\right]}_{\text{estimation variance}} + \underbrace{\risk(f^{\circ}_{\Hspace}) - \risk(\fbayes_{\Hspace})}_{\text{estimation bias}}
+\end{eqnarray*}
+
+\begin{itemize}
+  \item The estimation bias measures the distance of the centroid model from the risk minimizer over $\Hspace$
+  \item The estimation variance measures the spread of the ERM solutions around the centroid model, induced by the randomness of $\D_n$
+\end{itemize}
+
+\framebreak
+
+{\small We can now connect the derived quantities back to bias and variance and see how the two decompositions differ: bias is not just the approximation error, and variance is not the estimation error.}
+\vspace{-0.2cm}
+
+$$ \text{bias} = \text{approximation error} + \text{estimation bias} $$
+$$ \text{variance} = \text{optimization error} + \text{estimation variance} $$
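+\vspace{0.1cm}
+{\small Summing the two identities recovers the finer excess risk decomposition from above, in expectation over $\D_n$ (cf. the figure below):}
+$$ \underbrace{\E_{\D_n \sim \Pxy^n}\left[\risk(\tilde{f}_{\Hspace})\right] - \risk(\fbayes_{\Hspace_{all}})}_{\text{expected excess risk}} = \underbrace{\text{optim. error} + \text{est. variance}}_{\text{variance}} + \underbrace{\text{est. bias} + \text{approx. error}}_{\text{bias}} $$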
+\vspace{-0.4cm}
+\begin{figure}
+    \centering
+    \includegraphics[width = 0.5\textwidth]{figure_man/expected-risk-decomp.png}
+    \tiny{\\ Credit: \cite{BROWN2024BIAS}}
+\end{figure}
+
+{\scriptsize \textbf{NB}: For the special case of the LM with squared loss (OLS), both the optim. error and the estimation bias are zero. Hence both decompositions agree there.}
+
 \end{vbframe}
diff --git a/slides/advriskmin/slides-advriskmin-regression-further-losses.tex b/slides/advriskmin/slides-advriskmin-regression-further-losses.tex
index 53b23108..b62aae40 100644
--- a/slides/advriskmin/slides-advriskmin-regression-further-losses.tex
+++ b/slides/advriskmin/slides-advriskmin-regression-further-losses.tex
@@ -214,6 +214,18 @@
 
 \end{vbframe}
 
+\begin{vbframe}{Telephone data}
+
+{\small We now illustrate the effect of using robust loss functions. The telephone data set contains the number of calls (in units of 10 million) made in Belgium between 1950 and 1973 ($n = 24$). The outliers stem from a six-year period during which a different measurement system was used without re-calibration.}
+
+\vspace{0.2cm}
+
+\begin{center}
+\includegraphics[width = 0.6\textwidth]{figure/telephone-data.pdf}
+\end{center}
+
+\end{vbframe}
+
 \begin{comment}
 
 \begin{vbframe}{Log-Barrier Loss}
@@ -369,6 +381,18 @@
 
 % where $Q_\alpha(\cdot)$ computes the empirical $\alpha$-quantile of $\{\yi\}, i = 1, ..., n$.
 
+\framebreak
+
+We simulate $n = 200$ samples from a heteroskedastic LM using the DGP $y = 1 + 0.2x + \varepsilon$, where $\varepsilon \sim \mathcal{N}\left(0, (0.5 + 0.5x)^2\right)$ (i.e., the standard deviation grows linearly in $x$) and $x \sim \mathcal{U}[0, 10]$.\\
+
+Using the quantile loss, we estimate the conditional $\alpha$-quantiles for $\alpha \in \{0.05, 0.5, 0.95\}$.
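+
+{\small As a reminder, one common parameterization of the pinball (quantile) loss is
+$$ L_\alpha(y, \fx) = \begin{cases} \alpha \, (y - \fx) & \mbox{if } y \geq \fx \\ (1 - \alpha) \, (\fx - y) & \mbox{if } y < \fx, \end{cases} $$
+whose point-wise risk minimizer is the conditional $\alpha$-quantile.}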
+
+\vspace{-0.2cm}
+
+\begin{center}
+\includegraphics[width = 0.62\textwidth]{figure/quantile-regression.pdf}
+\end{center}
+
 \end{vbframe}
 
diff --git a/slides/advriskmin/slides-advriskmin-risk-minimizer.tex b/slides/advriskmin/slides-advriskmin-risk-minimizer.tex
index fa9fa55e..a37ed2b5 100644
--- a/slides/advriskmin/slides-advriskmin-risk-minimizer.tex
+++ b/slides/advriskmin/slides-advriskmin-risk-minimizer.tex
@@ -91,7 +91,7 @@
 We will consider 3 cases for Q
 
 \begin{itemize}
 \item $Q = P_y$, simply our labels and their marginal distribution in $\Pxy$
-\item $Q = P_{y | x = x}$, conditional label distribution at point $x = x$
+\item $Q = P_{y | x = \tilde{x}}$, conditional label distribution at point $x = \tilde{x}$
 \item $Q = P_n$, the empirical product distribution for data $y_1, \ldots, y_n$
 \end{itemize}
 
@@ -192,10 +192,10 @@
 is called the \textbf{risk minimizer}, \textbf{population minimizer} or \textbf{Bayes optimal model}.
 
 \begin{eqnarray*}
-	\fbayes_{\Hspace_{all}} &=& \argmin_{f \in \Hspace_{all}} \risk\left(f\right) = \argmin_{f: \Xspace \to \R^g}\Exy\left[\Lxy\right]\\ &=& \argmin_{f: \Xspace \to \R^g}\int \Lxy \text{d}\Pxy.
+	\fbayes_{\Hspace_{all}} &=& \argmin_{f \in \Hspace_{all}} \risk\left(f\right) = \argmin_{f \in \Hspace_{all}}\Exy\left[\Lxy\right]\\ &=& \argmin_{f \in \Hspace_{all}}\int \Lxy \text{d}\Pxy.
 \end{eqnarray*}
 
-The resulting risk is called \textbf{Bayes risk}: $\riskbayes_{} = \risk(\fbayes)$
+The resulting risk is called \textbf{Bayes risk}: $\riskbayes = \risk(\fbayes_{\Hspace_{all}})$
 
 \lz
 
@@ -240,14 +240,14 @@
 To derive the risk minimizer, observe that by law of total expectation
 $$
 	\risk(f) = \E_{xy} \left[\Lxy\right]
-	= \E_x \left[\E_{y|x}\left[\Lxy~|~\xv = \xv\right]\right].$$
+	= \E_x \left[\E_{y|x}\left[\Lxy~|~\xv\right]\right].$$
 
 \begin{itemize}
 \item We can choose $\fx$ as we want (unrestricted hypothesis space, no assumed functional form)
 \item Hence, for a fixed value $\xv \in \Xspace$ we can select \textbf{any} value $c$ we want to predict. So we construct the \textbf{point-wise optimizer}
 % (we are not restricted by any functional form, e.g., a linear function)
 % \item Instead of looking for the optimal $f$ in function space (which is impossible), we compute the \textbf{point-wise optimizer} for every $\xv \in \Xspace$.
-	$$\fxbayes = \mbox{argmin}_c \E_{y|x}\left[L(y, c)~|~ \xv = \xv \right] \quad \forall \xv \in \Xspace.$$
+	$$\fbayes(\tilde{\xv}) = \mbox{argmin}_c \E_{y|x}\left[L(y, c)~|~ \xv = \tilde{\xv} \right] \quad \forall \tilde{\xv} \in \Xspace.$$
 \end{itemize}
 
 \begin{center}