
Commit

added two illustrations to robust loss chunk
ludwigbothmann committed Aug 21, 2024
1 parent a2a2014 commit 4f60468
Showing 8 changed files with 237 additions and 13 deletions.
Binary file added slides/advriskmin/figure/quantile-regression.pdf
Binary file not shown.
Binary file added slides/advriskmin/figure/telephone-data.pdf
Binary file not shown.
69 changes: 69 additions & 0 deletions slides/advriskmin/rsrc/quantile-regression.R
@@ -0,0 +1,69 @@
### This script produces a plot illustrating the ERM under pinball losses
### Data: simulated univariate heteroskedastic regression y=theta0+theta1*x+eps(x)
### eps(x) is normally distributed with mean 0 and standard deviation 0.5+0.5*x
### x is chosen uniformly in [0,10], theta0=1, theta1=0.2, n=200 samples

rm(list=ls())
library(quantreg)
library(ggplot2)

# Data simulation
set.seed(123)
n <- 200
x <- runif(n, 0, 10)
theta0 <- 1
theta1 <- 0.2
eps <- rnorm(n, mean = 0, sd = 0.5+0.5*x)
y <- theta0 + theta1 * x + eps
simu.dat <- data.frame(x = x, y = y)

# Pinball loss for 0.05 quantile
mod.5 <- rq(y ~ x, tau = 0.05, data = simu.dat)
res.5 <- mod.5$coefficients
resids.5 <- mod.5$residuals
preds.5 <- predict(mod.5)
# Pinball loss for 0.5 quantile
mod.50 <- rq(y ~ x, tau = 0.5, data = simu.dat)
res.50 <- mod.50$coefficients
resids.50 <- mod.50$residuals
preds.50 <- predict(mod.50)
# Pinball loss for 0.95 quantile
mod.95 <- rq(y ~ x, tau = 0.95, data = simu.dat)
res.95 <- mod.95$coefficients
resids.95 <- mod.95$residuals
preds.95 <- predict(mod.95)
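
# Illustrative sketch (not part of the original script; 'pinball_loss' and 'fit_pinball'
# are made-up names): the pinball loss that rq() minimizes for quantile tau, written out
# explicitly. Minimizing it directly should roughly reproduce the rq() coefficients above.
pinball_loss <- function(resid, tau) ifelse(resid >= 0, tau * resid, (tau - 1) * resid)
fit_pinball <- function(tau, data) {
  obj <- function(b) sum(pinball_loss(data$y - (b[1] + b[2] * data$x), tau))
  optim(c(0, 0), obj, method = "Nelder-Mead")$par  # returns c(intercept, slope)
}
# e.g. fit_pinball(0.5, simu.dat) should be close to coef(mod.50)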

# Aggregate results
res.dat <- simu.dat
res.dat$resid_5 <- resids.5
res.dat$resid_50 <- resids.50
res.dat$resid_95 <- resids.95
res.dat$pred_5 <- preds.5
res.dat$pred_50 <- preds.50
res.dat$pred_95 <- preds.95

# Calculate true conditional mean
res.dat$true_mean <- theta0 + theta1 * res.dat$x


p <- ggplot(res.dat, aes(x = x)) +
geom_line(aes(y = pred_5, color = "q=0.05"), size = 1.6, alpha=1) +
geom_line(aes(y = pred_50, color = "q=0.5"), size = 1.6, alpha=1) +
geom_line(aes(y = pred_95, color = "q=0.95"), size = 1.6, alpha=1) +
geom_line(aes(y = true_mean, linetype = "True mean"), color = "black", size = 1.6) +
geom_point(aes(y = y), color = "darkgrey", size = 2, alpha=1) +
labs(y = "y", x = "x", color = "Quantile", linetype = "") +
scale_linetype_manual(values = c("True mean" = "dashed")) +
theme_minimal() +
theme(text = element_text(size = 12),
axis.title = element_text(size = rel(2)),
axis.text = element_text(size = rel(2)),
legend.title = element_text(size = rel(1.5)),
legend.text = element_text(size = rel(1.3))) +
guides(color = guide_legend(order = 1), linetype = guide_legend(order = 2))
# Print plot
print(p)

# Save figure
ggsave(filename = "../figure/quantile-regression.pdf", plot = p)

93 changes: 93 additions & 0 deletions slides/advriskmin/rsrc/telephone-data.R
@@ -0,0 +1,93 @@
### This script produces a plot comparing various loss functions for robust regression
### Dataset: telephone data (number of calls per year in Belgium)
### Losses: L2, L1, log-cosh, Huber loss

rm(list=ls())

library(Rfit)
library(hqreg)
library(MASS)
library(quantreg)
library(CVXR)
library(ggplot2)

set.seed(123)

# Data
tel.dat <- Rfit::telephone
tel.dat$year <- as.numeric(tel.dat$year)

# L2 loss (least-squares)
fm0 <- lm(calls ~ year, tel.dat)
X <- model.matrix(fm0)
res.l2 <- fm0$coefficients
resids.l2 <- fm0$residuals
preds.l2 <- predict(fm0)

# Log-cosh regression
f <- function(b) with(tel.dat, sum(log(cosh(calls - X %*% b))))
res.optim <- optim(coef(fm0), f, method = "Nelder-Mead", control=list(alpha=1.0, beta=0.5, gamma=2.0))
res.logcosh <- res.optim$par
resids.logcosh <- with(tel.dat, calls - X %*% res.logcosh)
preds.logcosh <- as.numeric(with(tel.dat, X %*% res.logcosh))
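
# Side note (illustration only, not from the original script): log-cosh behaves like
# 0.5*r^2 for small residuals and like |r| - log(2) for large ones, which is why it
# acts as a smooth, Huber-like robust loss.
r.check <- c(0.1, 5)
cbind(logcosh = log(cosh(r.check)), quad = 0.5 * r.check^2, lin = abs(r.check) - log(2))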

# L1 loss robust regression
#mod.l1 <- rq(formula = as.formula("calls ~ year"), tau=0.5, data=tel.dat)
#res.l1 <- mod.l1$coefficients
#resids.l1 <- mod.l1$residuals
#preds.l1 <- predict(mod.l1)

# L1 loss robust regression (manual optimization)
f.l1 <- function(b) with(tel.dat, sum(abs(calls - X %*% b)))
res.optim.l1 <- optim(coef(fm0), f.l1, method = "Nelder-Mead", control=list(alpha=1.0, beta=0.5, gamma=2.0))
res.l1.manual <- res.optim.l1$par
resids.l1.manual <- with(tel.dat, calls - X %*% res.l1.manual)
preds.l1.manual <- as.numeric(with(tel.dat, X %*% res.l1.manual))

# Huber loss robust regression using CVXR
beta <- Variable(2)
obj <- sum(CVXR::huber(tel.dat$calls - X %*% beta, 1))
prob <- Problem(Minimize(obj))
result <- solve(prob)
res.huber <- result$getValue(beta)
names(res.huber) <- names(res.l2)
resids.huber <- with(tel.dat, calls - X %*% res.huber)
preds.huber <- as.numeric(with(tel.dat, X %*% res.huber))
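
# Illustrative sketch (not part of the original script; 'huber_loss' is a made-up name):
# the classical Huber loss with threshold delta, quadratic near zero and linear in the
# tails. Up to an overall constant factor this should match the objective encoded by
# CVXR::huber() above, so minimizing either gives essentially the same line.
huber_loss <- function(resid, delta = 1) {
  ifelse(abs(resid) <= delta,
         0.5 * resid^2,                       # quadratic part for small residuals
         delta * (abs(resid) - 0.5 * delta))  # linear part for large residuals
}
# e.g. sum(huber_loss(resids.huber)) evaluates this loss at the CVXR solution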

# aggregate results data
res.tel <- tel.dat
res.tel$resid_l2 <- resids.l2
#res.tel$resid_l1 <- resids.l1
res.tel$resid_l1_manual <- resids.l1.manual
res.tel$resid_huber <- resids.huber
res.tel$resid_logcosh <- resids.logcosh
res.tel$pred_l2 <- preds.l2
#res.tel$pred_l1 <- preds.l1
res.tel$pred_l1_manual <- preds.l1.manual
res.tel$pred_huber <- preds.huber
res.tel$pred_logcosh <- preds.logcosh

#coef.b0 <- c(res.l2[1], res.l1[1], res.huber[1], res.logcosh[1])
#coef.b1 <- c(res.l2[2], res.l1[2], res.huber[2], res.logcosh[2])

# Plot results for telephone data
p <- ggplot(res.tel, aes(x = year)) +
geom_line(aes(y = pred_l2, color = "L2 (OLS)"), size = 1.6, alpha=1) +
#geom_line(aes(y = pred_l1, color = "L1"), size = 1.4) +
geom_line(aes(y = pred_l1_manual, color = "L1 (man)"), size = 1.6, alpha=1) +
geom_line(aes(y = pred_huber, color = "Huber"), size = 1.6, alpha=1) +
geom_line(aes(y = pred_logcosh, color = "Log-Cosh"), size = 1.6, alpha=1) +
geom_point(aes(y = calls), color = "black", size = 4, alpha=1) +
labs(y = "Calls (in mio.)", x = "Year", color = "Loss") +
theme_minimal() +
theme(text = element_text(size = 12),
axis.title = element_text(size = rel(2)),
axis.text = element_text(size = rel(2)),
legend.title = element_text(size = rel(1.5)),
legend.text = element_text(size = rel(1.3)))

# Print plot
print(p)

# Save figure
ggsave(filename = "../figure/telephone-data.pdf", plot = p)
@@ -11,7 +11,6 @@
\titlemeta{% Chunk title (example: CART, Forests, Boosting, ...), can be empty
Advanced Risk Minimization
}{% Lecture title
Advanced Risk Minimization:\\
Bias-Variance Decomposition
}{% Relative path to title page image: Can be empty but must not start with slides/
figure/bias_variance_decomposition-linear_model_bias.png
@@ -40,7 +39,7 @@
with zero-mean homoskedastic error $\epsilon \sim (0, \sigma^2)$ independent of $\xv$.
\vspace{0.1cm}

{\scriptsize ${\ast}$ Similar decomp also exist for other losses expressible as Bregman divergences (e.g. cross-entropy). The $0/1$ loss e.g. has no such decomposition (\cite{BROWN2024BIAS})}
{\scriptsize ${\ast}$ Similar decomps also exist for other losses expressible as Bregman divergences (e.g. cross-entropy). One exception is the $0/1$ loss (\cite{BROWN2024BIAS})}

\framebreak

@@ -255,42 +254,81 @@
\end{vbframe}


\begin{vbframe}{Approximation an Estimation \citelink{BROWN2024BIAS} }
The Bias-Variance decomposition is often confused or equated with the related (but different) decompo of \textbf{excess risk} into \textbf{estimation} and \textbf{approximation} error.
\begin{vbframe}{Approximation and Estimation \citelink{BROWN2024BIAS} }
The Bias-Variance decomp is often confused or equated with the related (but different) decomp of \textbf{excess risk} into \textbf{estimation} and \textbf{approximation} error.

\begin{eqnarray*}
\underbrace{\risk(\hat f_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{excess risk}} &=& \underbrace{\risk(\hat f_{\Hspace}) - \risk(\fbayes_{\Hspace})}_{\text{estimation error}} + \underbrace{\risk(\fbayes_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{approx. error}}
\end{eqnarray*}

Both are commonly described using the same figure and analogies:
\vspace{-0.1cm}
\begin{figure}
\centering
\includegraphics[width = 0.75\textwidth]{figure_man/biasvar-vs-estapprox-tradeoff.png}
\includegraphics[width = 0.78\textwidth]{figure_man/biasvar-vs-estapprox-tradeoff.png}
\tiny{\\ Credit: \cite{BROWN2024BIAS}}
\end{figure}


\end{vbframe}

\framebreak

\begin{vbframe}{Approximation/Estimation Error}
\begin{vbframe}{Approx./Estimation Error \citelink{BROWN2024BIAS}}

\begin{itemize}
\item The approx. error is a structural property of $\Hspace$.
\item The estimation error is random, since $\fh$ depends on the training data
\item It arises because we choose $f \in \Hspace$ from limited training data by optimizing $\riske$ instead of $\risk$
\end{itemize}

$\fh_{\Hspace}$ assumes we found a global minimizer of $\riske$, which is often impossible.
Writing $\fh_{\Hspace} \in \arg\min_{f \in \Hspace} \riske(f)$ assumes we can find a global minimizer of $\riske$, which is often infeasible in practice (e.g. for DNNs).
\vspace{0.2cm}

In practice, optimizing $\riske$ gives us our 'best guess' $\tilde{f}_{\Hspace} \in \Hspace$ of $\fh_{\Hspace}$. We can now decompose its excess risk as
In practice, optimizing $\riske$ gives us our ``best guess'' $\tilde{f}_{\Hspace} \in \Hspace$ of $\fh_{\Hspace}$. We can now decompose its excess risk more finely as

\begin{eqnarray*}
\underbrace{\risk(\tilde{f}_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{excess risk}} &=& \underbrace{\risk(\tilde{f}_{\Hspace}) - \risk(\fh_{\Hspace})}_{\text{optim. error}} + \underbrace{\risk(\hat{f}_{\Hspace}) - \risk(\fbayes_{\Hspace})}_{\text{estimation error}} + \underbrace{\risk(\fbayes_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{approx. error}}
\end{eqnarray*}

Note that the optimization error can be negative, even though $\riske(\tilde{f}_{\Hspace}) \geq \riske(\fh_{\Hspace})$ always holds: $\fh_{\Hspace}$ minimizes the empirical risk, not the true risk, so $\tilde{f}_{\Hspace}$ can happen to generalize better.

\framebreak

We can decompose the estimation error further by defining the \textit{centroid} model, also called the ``systematic'' part of the model.\\
\vspace{0.15cm}

Given $\fh_{\Hspace} \in \arg\min_{f \in \Hspace} \riske(f)$, the centroid model under squared loss is the mean prediction at each $x$, averaged over all data sets $\D_n$: $f^{\circ}_{\Hspace} := \E_{\D_n \sim \Pxy^n}[\fh_{\Hspace}]$.
\vspace{0.15cm}

With $f^{\circ}_{\Hspace}$ we can decompose the expected estimation error as

\begin{eqnarray*}
\underbrace{\E_{\D_n \sim \Pxy^n}\left[\risk(\hat{f}_{\Hspace}) - \risk(f ^{\ast}_{\Hspace})\right]}_{\text{expected estimation error}} &=& \underbrace{\E_{\D_n \sim \Pxy^n}\left[\risk(\hat{f}_{\Hspace}) - \risk(f^{\circ}_{\Hspace}) \right]}_{\text{estimation variance}} + \underbrace{\risk(f^{\circ}_{\Hspace})-\risk(f^{\ast}_{\Hspace})}_{\text{estimation bias}}
\end{eqnarray*}

\begin{itemize}
\item The estimation bias measures the distance of the centroid model to the risk minimizer over $\Hspace$
\item The estimation variance measures the spread of the ERM solutions around the centroid model, induced by the randomness of $\D_n$
\end{itemize}
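
To make the centroid model concrete, here is a minimal R sketch (an illustration with made-up settings, not taken from the slides or from \cite{BROWN2024BIAS}): it refits a linear learner on many fresh data sets from a simple DGP and averages the predictions point-wise, which approximates $f^{\circ}_{\Hspace}$; under squared loss, the average point-wise variance of the fits approximates the expected estimation variance (averaging over a fixed grid of x values for simplicity).

set.seed(1)
x_grid   <- seq(0, 10, length.out = 50)                  # evaluation grid for x
pred_mat <- replicate(500, {                             # 500 fresh data sets D_n
  x <- runif(100, 0, 10)
  y <- 1 + 0.2 * x + rnorm(100)                          # simple homoskedastic linear DGP
  predict(lm(y ~ x), newdata = data.frame(x = x_grid))   # fitted learner for this D_n
})
centroid_pred <- rowMeans(pred_mat)                      # point-wise mean prediction: centroid model
est_variance  <- mean(apply(pred_mat, 1, var))           # avg. point-wise variance of the fits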

\framebreak

{\small We can now connect the derived quantities back to bias and variance and see how they differ: the bias is not simply the approximation error, and the variance is not simply the estimation error.}
\vspace{-0.2cm}

$$ \text{bias} = \text{approximation error} + \text{estimation bias} $$
$$\text{variance} = \text{optimization error} + \text{estimation variance}$$
% variance = estimation variance + optimization error
\vspace{-0.4cm}
\begin{figure}
\centering
\includegraphics[width = 0.5\textwidth]{slides/advriskmin/figure_man/expected-risk-decomp.png}
\tiny{\\ Credit: \cite{BROWN2024BIAS}}
\end{figure}

{\scriptsize \textbf{NB}: For the special case of the LM with squared loss (OLS), both the optim. error and the estimation bias are zero. Hence the two decompositions agree there.}


\end{vbframe}

24 changes: 24 additions & 0 deletions slides/advriskmin/slides-advriskmin-regression-further-losses.tex
@@ -214,6 +214,18 @@

\end{vbframe}

\begin{vbframe}{Telephone data}

{\small We now illustrate the effect of using robust loss functions. The telephone data set contains the number of calls (in tens of millions) made in Belgium between 1950 and 1973 ($n=24$). The outliers are due to a change in the recording system that went uncorrected for six years.}

\vspace{0.2cm}

\begin{center}
\includegraphics[width = 0.6\textwidth]{figure/telephone-data.pdf}
\end{center}

\end{vbframe}

\begin{comment}
\begin{vbframe}{Log-Barrier Loss}

@@ -369,6 +381,18 @@

% where $Q_\alpha(\cdot)$ computes the empirical $\alpha$-quantile of $\{\yi\}, i = 1, ..., n$.

\framebreak

We simulate $n=200$ samples from a heteroskedastic LM using the DGP $y = 1+0.2x+\varepsilon$, where $\varepsilon \sim \mathcal{N}(0, \sigma(x)^2)$ with $\sigma(x) = 0.5+0.5x$ and $x \sim \mathcal{U}[0,10]$.\\

Using the quantile loss, we estimate the conditional $\alpha$-quantiles for $\alpha \in \{0.05, 0.5, 0.95\}$.

\vspace{-0.2cm}

\begin{center}
\includegraphics[width = 0.62\textwidth]{slides/advriskmin/figure/quantile-regression.pdf}
\end{center}

\end{vbframe}


10 changes: 5 additions & 5 deletions slides/advriskmin/slides-advriskmin-risk-minimizer.tex
@@ -91,7 +91,7 @@
We will consider 3 cases for $Q$:
\begin{itemize}
\item $Q = P_y$, simply our labels and their marginal distribution in $\Pxy$
\item $Q = P_{y | x = x}$, conditional label distribution at point $x = x$
\item $Q = P_{y | x = \tilde{x}}$, conditional label distribution at point $x = \tilde{x}$
\item $Q = P_n$, the empirical product distribution for data $y_1, \ldots, y_n$
\end{itemize}

@@ -192,10 +192,10 @@
is called the \textbf{risk minimizer}, \textbf{population minimizer} or \textbf{Bayes optimal model}.

\begin{eqnarray*}
\fbayes_{\Hspace_{all}} &=& \argmin_{f \in \Hspace_{all}} \risk\left(f\right) = \argmin_{f: \Xspace \to \R^g}\Exy\left[\Lxy\right]\\ &=& \argmin_{f: \Xspace \to \R^g}\int \Lxy \text{d}\Pxy.
\fbayes_{\Hspace_{all}} &=& \argmin_{f \in \Hspace_{all}} \risk\left(f\right) = \argmin_{f \in \Hspace_{all}}\Exy\left[\Lxy\right]\\ &=& \argmin_{f \in \Hspace_{all}}\int \Lxy \text{d}\Pxy.
\end{eqnarray*}

The resulting risk is called \textbf{Bayes risk}: $\riskbayes_{} = \risk(\fbayes)$
The resulting risk is called \textbf{Bayes risk}: $\riskbayes_{} = \risk(\fbayes_{\Hspace_{all}})$

\lz

@@ -240,14 +240,14 @@

To derive the risk minimizer, observe that by the law of total expectation
$$ \risk(f) = \E_{xy} \left[\Lxy\right]
= \E_x \left[\E_{y|x}\left[\Lxy~|~\xv = \xv\right]\right].$$
= \E_x \left[\E_{y|x}\left[\Lxy~|~\xv\right]\right].$$

\begin{itemize}
\item We can choose $\fx$ as we want (unrestricted hypothesis space, no assumed functional form)
\item Hence, for a fixed value $\tilde{\xv} \in \Xspace$ we can select \textbf{any} value $c$ we want to predict. So we construct the \textbf{point-wise optimizer}
% (we are not restricted by any functional form, e.g., a linear function)
% \item Instead of looking for the optimal $f$ in function space (which is impossible), we compute the \textbf{point-wise optimizer} for every $\xv \in \Xspace$.
$$\fxbayes = \mbox{argmin}_c \E_{y|x}\left[L(y, c)~|~ \xv = \xv \right] \quad \forall \xv \in \Xspace.$$
$$\fbayes(\tilde{\xv}) = \mbox{argmin}_c \E_{y|x}\left[L(y, c)~|~ \xv = \tilde{\xv} \right] $$
\end{itemize}
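
For instance, under the squared loss $L(y, c) = (y - c)^2$ this point-wise construction yields the well-known result that the risk minimizer is the conditional mean (stated here as a concrete instance):

$$\fbayes(\tilde{\xv}) = \mbox{argmin}_c \E_{y|x}\left[(y - c)^2 ~|~ \xv = \tilde{\xv} \right] = \E_{y|x}\left[y ~|~ \xv = \tilde{\xv} \right]$$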

\begin{center}
