
Commit

added two illustrations to robust loss chunk
ludwigbothmann committed Aug 21, 2024
1 parent a2a2014 commit 4f60468
Showing 8 changed files with 237 additions and 13 deletions.
Binary file added slides/advriskmin/figure/quantile-regression.pdf
Binary file not shown.
Binary file added slides/advriskmin/figure/telephone-data.pdf
Binary file not shown.
69 changes: 69 additions & 0 deletions slides/advriskmin/rsrc/quantile-regression.R
@@ -0,0 +1,69 @@
### This script produces a plot illustrating the ERM under pinball losses
### Data: simulated univariate heteroskedastic regression y=theta0+theta1*x+eps(x)
### eps(x) is normally distributed with mean 0 and standard deviation 0.5+0.5*x
### x is chosen uniformly in [0,10], theta0=1, theta1=0.2, n=200 samples

rm(list=ls())
library(quantreg)
library(ggplot2)

# Data simulation
set.seed(123)
n <- 200
x <- runif(n, 0, 10)
theta0 <- 1
theta1 <- 0.2
eps <- rnorm(n, mean = 0, sd = 0.5+0.5*x)
y <- theta0 + theta1 * x + eps
simu.dat <- data.frame(x = x, y = y)

# Pinball loss for 0.05 quantile
mod.5 <- rq(y ~ x, tau = 0.05, data = simu.dat)
res.5 <- mod.5$coefficients
resids.5 <- mod.5$residuals
preds.5 <- predict(mod.5)
# Pinball loss for 0.5 quantile
mod.50 <- rq(y ~ x, tau = 0.5, data = simu.dat)
res.50 <- mod.50$coefficients
resids.50 <- mod.50$residuals
preds.50 <- predict(mod.50)
# Pinball loss for 0.95 quantile
mod.95 <- rq(y ~ x, tau = 0.95, data = simu.dat)
res.95 <- mod.95$coefficients
resids.95 <- mod.95$residuals
preds.95 <- predict(mod.95)
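
# Illustrative sketch (not part of the original script; 'pinball_loss' and 'fit_pinball'
# are made-up names): the pinball loss that rq() minimizes for quantile tau, written out
# explicitly. Minimizing it directly should roughly reproduce the rq() coefficients above.
pinball_loss <- function(resid, tau) ifelse(resid >= 0, tau * resid, (tau - 1) * resid)
fit_pinball <- function(tau, data) {
  obj <- function(b) sum(pinball_loss(data$y - (b[1] + b[2] * data$x), tau))
  optim(c(0, 0), obj, method = "Nelder-Mead")$par  # returns c(intercept, slope)
}
# e.g. fit_pinball(0.5, simu.dat) should be close to coef(mod.50)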

# Aggregate results
res.dat <- simu.dat
res.dat$resid_5 <- resids.5
res.dat$resid_50 <- resids.50
res.dat$resid_95 <- resids.95
res.dat$pred_5 <- preds.5
res.dat$pred_50 <- preds.50
res.dat$pred_95 <- preds.95

# Calculate true conditional mean
res.dat$true_mean <- theta0 + theta1 * res.dat$x


p <- ggplot(res.dat, aes(x = x)) +
geom_line(aes(y = pred_5, color = "q=0.05"), size = 1.6, alpha=1) +
geom_line(aes(y = pred_50, color = "q=0.5"), size = 1.6, alpha=1) +
geom_line(aes(y = pred_95, color = "q=0.95"), size = 1.6, alpha=1) +
geom_line(aes(y = true_mean, linetype = "True mean"), color = "black", size = 1.6) +
geom_point(aes(y = y), color = "darkgrey", size = 2, alpha=1) +
labs(y = "y", x = "x", color = "Quantile", linetype = "") +
scale_linetype_manual(values = c("True mean" = "dashed")) +
theme_minimal() +
theme(text = element_text(size = 12),
axis.title = element_text(size = rel(2)),
axis.text = element_text(size = rel(2)),
legend.title = element_text(size = rel(1.5)),
legend.text = element_text(size = rel(1.3))) +
guides(color = guide_legend(order = 1), linetype = guide_legend(order = 2))
# Print plot
print(p)

# Save figure
ggsave(filename = "../figure/quantile-regression.pdf", plot = p)

93 changes: 93 additions & 0 deletions slides/advriskmin/rsrc/telephone-data.R
@@ -0,0 +1,93 @@
### This script produces a plot comparing various loss functions for robust regression
### Dataset: telephone data (number of calls per year in Belgium)
### Losses: L2, L1, log-cosh, Huber loss

rm(list=ls())

library(Rfit)
library(hqreg)
library(MASS)
library(quantreg)
library(CVXR)
library(ggplot2)

set.seed(123)

# Data
tel.dat <- Rfit::telephone
tel.dat$year <- as.numeric(tel.dat$year)

# L2 loss (least-squares)
fm0 <- lm(calls ~ year, tel.dat)
X <- model.matrix(fm0)
res.l2 <- fm0$coefficients
resids.l2 <- fm0$residuals
preds.l2 <- predict(fm0)

# Log-cosh regression
f <- function(b) with(tel.dat, sum(log(cosh(calls - X %*% b))))
res.optim <- optim(coef(fm0), f, method = "Nelder-Mead", control=list(alpha=1.0, beta=0.5, gamma=2.0))
res.logcosh <- res.optim$par
resids.logcosh <- with(tel.dat, calls - X %*% res.logcosh)
preds.logcosh <- as.numeric(with(tel.dat, X %*% res.logcosh))
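
# Side note (illustration only, not from the original script): log-cosh behaves like
# 0.5*r^2 for small residuals and like |r| - log(2) for large ones, which is why it
# acts as a smooth, Huber-like robust loss.
r.check <- c(0.1, 5)
cbind(logcosh = log(cosh(r.check)), quad = 0.5 * r.check^2, lin = abs(r.check) - log(2))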

# L1 loss robust regression
#mod.l1 <- rq(formula = as.formula("calls ~ year"), tau=0.5, data=tel.dat)
#res.l1 <- mod.l1$coefficients
#resids.l1 <- mod.l1$residuals
#preds.l1 <- predict(mod.l1)

# L1 loss robust regression (manual optimization)
f.l1 <- function(b) with(tel.dat, sum(abs(calls - X %*% b)))
res.optim.l1 <- optim(coef(fm0), f.l1, method = "Nelder-Mead", control=list(alpha=1.0, beta=0.5, gamma=2.0))
res.l1.manual <- res.optim.l1$par
resids.l1.manual <- with(tel.dat, calls - X %*% res.l1.manual)
preds.l1.manual <- as.numeric(with(tel.dat, X %*% res.l1.manual))

# Huber loss robust regression using CVXR
beta <- Variable(2)
obj <- sum(CVXR::huber(tel.dat$calls - X %*% beta, 1))
prob <- Problem(Minimize(obj))
result <- solve(prob)
res.huber <- result$getValue(beta)
names(res.huber) <- names(res.l2)
resids.huber <- with(tel.dat, calls - X %*% res.huber)
preds.huber <- as.numeric(with(tel.dat, X %*% res.huber))
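
# Illustrative sketch (not part of the original script; 'huber_loss' is a made-up name):
# the classical Huber loss with threshold delta, quadratic near zero and linear in the
# tails. Up to an overall constant factor this should match the objective encoded by
# CVXR::huber() above, so minimizing either gives essentially the same line.
huber_loss <- function(resid, delta = 1) {
  ifelse(abs(resid) <= delta,
         0.5 * resid^2,                       # quadratic part for small residuals
         delta * (abs(resid) - 0.5 * delta))  # linear part for large residuals
}
# e.g. sum(huber_loss(resids.huber)) evaluates this loss at the CVXR solution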

# aggregate results data
res.tel <- tel.dat
res.tel$resid_l2 <- resids.l2
#res.tel$resid_l1 <- resids.l1
res.tel$resid_l1_manual <- resids.l1.manual
res.tel$resid_huber <- resids.huber
res.tel$resid_logcosh <- resids.logcosh
res.tel$pred_l2 <- preds.l2
#res.tel$pred_l1 <- preds.l1
res.tel$pred_l1_manual <- preds.l1.manual
res.tel$pred_huber <- preds.huber
res.tel$pred_logcosh <- preds.logcosh

#coef.b0 <- c(res.l2[1], res.l1[1], res.huber[1], res.logcosh[1])
#coef.b1 <- c(res.l2[2], res.l1[2], res.huber[2], res.logcosh[2])

# Plot results for telephone data
p <- ggplot(res.tel, aes(x = year)) +
geom_line(aes(y = pred_l2, color = "L2 (OLS)"), size = 1.6, alpha=1) +
#geom_line(aes(y = pred_l1, color = "L1"), size = 1.4) +
geom_line(aes(y = pred_l1_manual, color = "L1 (man)"), size = 1.6, alpha=1) +
geom_line(aes(y = pred_huber, color = "Huber"), size = 1.6, alpha=1) +
geom_line(aes(y = pred_logcosh, color = "Log-Cosh"), size = 1.6, alpha=1) +
geom_point(aes(y = calls), color = "black", size = 4, alpha=1) +
labs(y = "Calls (in mio.)", x = "Year", color = "Loss") +
theme_minimal() +
theme(text = element_text(size = 12),
axis.title = element_text(size = rel(2)),
axis.text = element_text(size = rel(2)),
legend.title = element_text(size = rel(1.5)),
legend.text = element_text(size = rel(1.3)))

# Print plot
print(p)

# Save figure
ggsave(filename = "../figure/telephone-data.pdf", plot = p)
@@ -11,7 +11,6 @@
\titlemeta{% Chunk title (example: CART, Forests, Boosting, ...), can be empty
Advanced Risk Minimization
}{% Lecture title
Advanced Risk Minimization:\\
Bias-Variance Decomposition
}{% Relative path to title page image: Can be empty but must not start with slides/
figure/bias_variance_decomposition-linear_model_bias.png
@@ -40,7 +39,7 @@
with zero-mean homoskedastic error $\epsilon \sim (0, \sigma^2)$ independent of $\xv$.
\vspace{0.1cm}

{\scriptsize ${\ast}$ Similar decomp also exist for other losses expressible as Bregman divergences (e.g. cross-entropy). The $0/1$ loss e.g. has no such decomposition (\cite{BROWN2024BIAS})}
{\scriptsize ${\ast}$ Similar decomps also exist for other losses expressible as Bregman divergences (e.g. cross-entropy). One exception is the $0/1$ loss (\cite{BROWN2024BIAS})}

\framebreak

@@ -255,42 +254,81 @@
\end{vbframe}


\begin{vbframe}{Approximation an Estimation \citelink{BROWN2024BIAS} }
The Bias-Variance decomposition is often confused or equated with the related (but different) decompo of \textbf{excess risk} into \textbf{estimation} and \textbf{approximation} error.
\begin{vbframe}{Approximation and Estimation \citelink{BROWN2024BIAS} }
The Bias-Variance decomp is often confused or equated with the related (but different) decomp of \textbf{excess risk} into \textbf{estimation} and \textbf{approximation} error.

\begin{eqnarray*}
\underbrace{\risk(\hat f_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{excess risk}} &=& \underbrace{\risk(\hat f_{\Hspace}) - \risk(\fbayes_{\Hspace})}_{\text{estimation error}} + \underbrace{\risk(\fbayes_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{approx. error}}
\end{eqnarray*}

Both are commonly described using the same figure and analogies:
\vspace{-0.1cm}
\begin{figure}
\centering
\includegraphics[width = 0.75\textwidth]{figure_man/biasvar-vs-estapprox-tradeoff.png}
\includegraphics[width = 0.78\textwidth]{figure_man/biasvar-vs-estapprox-tradeoff.png}
\tiny{\\ Credit: \cite{BROWN2024BIAS}}
\end{figure}


\end{vbframe}

\framebreak

\begin{vbframe}{Approximation/Estimation Error}
\begin{vbframe}{Approx./Estimation Error \citelink{BROWN2024BIAS}}

\begin{itemize}
\item The approx. error is a structural property of $\Hspace$.
\item The estimation error is random, since $\fh$ depends on the training data
\item It arises because we choose $f \in \Hspace$ from limited training data by optimizing $\riske$ instead of $\risk$
\end{itemize}

$\fh_{\Hspace}$ assumes we found a global minimizer of $\riske$, which is often impossible.
Writing $\fh_{\Hspace} \in \arg\min_{f \in \Hspace} \riske(f)$ assumes we can find a global minimizer of $\riske$, which is often infeasible in practice (e.g. for DNNs).
\vspace{0.2cm}

In practice, optimizing $\riske$ gives us our 'best guess' $\tilde{f}_{\Hspace} \in \Hspace$ of $\fh_{\Hspace}$. We can now decompose its excess risk as
In practice, optimizing $\riske$ gives us our ``best guess'' $\tilde{f}_{\Hspace} \in \Hspace$ of $\fh_{\Hspace}$. We can now decompose its excess risk more finely as

\begin{eqnarray*}
\underbrace{\risk(\tilde{f}_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{excess risk}} &=& \underbrace{\risk(\tilde{f}_{\Hspace}) - \risk(\fh_{\Hspace})}_{\text{optim. error}} + \underbrace{\risk(\hat{f}_{\Hspace}) - \risk(\fbayes_{\Hspace})}_{\text{estimation error}} + \underbrace{\risk(\fbayes_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{approx. error}}
\end{eqnarray*}

Note that the optimization error can be negative, even though $\riske(\tilde{f}_{\Hspace}) \geq \riske(\fh_{\Hspace})$ always holds: $\fh_{\Hspace}$ minimizes the empirical risk, not the true risk, so $\tilde{f}_{\Hspace}$ can happen to generalize better.

\framebreak

We can decompose the estimation error further by defining the \textit{centroid} model, also called the ``systematic'' part of the model.\\
\vspace{0.15cm}

Given $\fh_{\Hspace} \in \arg\min_{f \in \Hspace} \riske(f)$, the centroid model under squared loss is the mean prediction at each $x$, averaged over all data sets $\D_n$: $f^{\circ}_{\Hspace} := \E_{\D_n \sim \Pxy^n}[\fh_{\Hspace}]$.
\vspace{0.15cm}

With $f^{\circ}_{\Hspace}$ we can decompose the expected estimation error as

\begin{eqnarray*}
\underbrace{\E_{\D_n \sim \Pxy^n}\left[\risk(\hat{f}_{\Hspace}) - \risk(f ^{\ast}_{\Hspace})\right]}_{\text{expected estimation error}} &=& \underbrace{\E_{\D_n \sim \Pxy^n}\left[\risk(\hat{f}_{\Hspace}) - \risk(f^{\circ}_{\Hspace}) \right]}_{\text{estimation variance}} + \underbrace{\risk(f^{\circ}_{\Hspace})-\risk(f^{\ast}_{\Hspace})}_{\text{estimation bias}}
\end{eqnarray*}

\begin{itemize}
\item The estimation bias measures the distance of the centroid model to the risk minimizer over $\Hspace$
\item The estimation variance measures the spread of the ERM solutions around the centroid model, induced by the randomness of $\D_n$
\end{itemize}
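
To make the centroid model concrete, here is a minimal R sketch (an illustration with made-up settings, not taken from the slides or from \cite{BROWN2024BIAS}): it refits a linear learner on many fresh data sets from a simple DGP and averages the predictions point-wise, which approximates $f^{\circ}_{\Hspace}$; under squared loss, the average point-wise variance of the fits approximates the expected estimation variance (averaging over a fixed grid of x values for simplicity).

set.seed(1)
x_grid   <- seq(0, 10, length.out = 50)                  # evaluation grid for x
pred_mat <- replicate(500, {                             # 500 fresh data sets D_n
  x <- runif(100, 0, 10)
  y <- 1 + 0.2 * x + rnorm(100)                          # simple homoskedastic linear DGP
  predict(lm(y ~ x), newdata = data.frame(x = x_grid))   # fitted learner for this D_n
})
centroid_pred <- rowMeans(pred_mat)                      # point-wise mean prediction: centroid model
est_variance  <- mean(apply(pred_mat, 1, var))           # avg. point-wise variance of the fits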

\framebreak

{\small We can now connect the derived quantities back to bias and variance and see how they differ: the bias is not simply the approximation error, and the variance is not simply the estimation error.}
\vspace{-0.2cm}

$$ \text{bias} = \text{approximation error} + \text{estimation bias} $$
$$\text{variance} = \text{optimization error} + \text{estimation variance}$$
% variance = estimation variance + optimization error
\vspace{-0.4cm}
\begin{figure}
\centering
\includegraphics[width = 0.5\textwidth]{slides/advriskmin/figure_man/expected-risk-decomp.png}
\tiny{\\ Credit: \cite{BROWN2024BIAS}}
\end{figure}

{\scriptsize \textbf{NB}: For the special case of the LM with squared loss (OLS), both the optim. error and the estimation bias are zero. Hence the two decompositions agree there.}


\end{vbframe}

24 changes: 24 additions & 0 deletions slides/advriskmin/slides-advriskmin-regression-further-losses.tex
@@ -214,6 +214,18 @@

\end{vbframe}

\begin{vbframe}{Telephone data}

{\small We now illustrate the effect of using robust loss functions. The telephone data set contains the number of calls (in tens of millions) made in Belgium between 1950 and 1973 ($n=24$). The outliers are due to a change in the recording system that went uncorrected for six years.}

\vspace{0.2cm}

\begin{center}
\includegraphics[width = 0.6\textwidth]{figure/telephone-data.pdf}
\end{center}

\end{vbframe}

\begin{comment}
\begin{vbframe}{Log-Barrier Loss}

@@ -369,6 +381,18 @@

% where $Q_\alpha(\cdot)$ computes the empirical $\alpha$-quantile of $\{\yi\}, i = 1, ..., n$.

\framebreak

We simulate $n=200$ samples from a heteroskedastic LM using the DGP $y = 1+0.2x+\varepsilon$, where $\varepsilon \sim \mathcal{N}(0, \sigma(x)^2)$ with $\sigma(x) = 0.5+0.5x$ and $x \sim \mathcal{U}[0,10]$.\\

Using the quantile loss, we estimate the conditional $\alpha$-quantiles for $\alpha \in \{0.05, 0.5, 0.95\}$.

\vspace{-0.2cm}

\begin{center}
\includegraphics[width = 0.62\textwidth]{slides/advriskmin/figure/quantile-regression.pdf}
\end{center}

\end{vbframe}


10 changes: 5 additions & 5 deletions slides/advriskmin/slides-advriskmin-risk-minimizer.tex
@@ -91,7 +91,7 @@
We will consider 3 cases for $Q$:
\begin{itemize}
\item $Q = P_y$, simply our labels and their marginal distribution in $\Pxy$
\item $Q = P_{y | x = x}$, conditional label distribution at point $x = x$
\item $Q = P_{y | x = \tilde{x}}$, conditional label distribution at point $x = \tilde{x}$
\item $Q = P_n$, the empirical product distribution for data $y_1, \ldots, y_n$
\end{itemize}

@@ -192,10 +192,10 @@
is called the \textbf{risk minimizer}, \textbf{population minimizer} or \textbf{Bayes optimal model}.

\begin{eqnarray*}
\fbayes_{\Hspace_{all}} &=& \argmin_{f \in \Hspace_{all}} \risk\left(f\right) = \argmin_{f: \Xspace \to \R^g}\Exy\left[\Lxy\right]\\ &=& \argmin_{f: \Xspace \to \R^g}\int \Lxy \text{d}\Pxy.
\fbayes_{\Hspace_{all}} &=& \argmin_{f \in \Hspace_{all}} \risk\left(f\right) = \argmin_{f \in \Hspace_{all}}\Exy\left[\Lxy\right]\\ &=& \argmin_{f \in \Hspace_{all}}\int \Lxy \text{d}\Pxy.
\end{eqnarray*}

The resulting risk is called \textbf{Bayes risk}: $\riskbayes_{} = \risk(\fbayes)$
The resulting risk is called \textbf{Bayes risk}: $\riskbayes_{} = \risk(\fbayes_{\Hspace_{all}})$

\lz

@@ -240,14 +240,14 @@

To derive the risk minimizer, observe that by the law of total expectation
$$ \risk(f) = \E_{xy} \left[\Lxy\right]
= \E_x \left[\E_{y|x}\left[\Lxy~|~\xv = \xv\right]\right].$$
= \E_x \left[\E_{y|x}\left[\Lxy~|~\xv\right]\right].$$

\begin{itemize}
\item We can choose $\fx$ as we want (unrestricted hypothesis space, no assumed functional form)
\item Hence, for a fixed value $\tilde{\xv} \in \Xspace$ we can select \textbf{any} value $c$ we want to predict. So we construct the \textbf{point-wise optimizer}
% (we are not restricted by any functional form, e.g., a linear function)
% \item Instead of looking for the optimal $f$ in function space (which is impossible), we compute the \textbf{point-wise optimizer} for every $\xv \in \Xspace$.
$$\fxbayes = \mbox{argmin}_c \E_{y|x}\left[L(y, c)~|~ \xv = \xv \right] \quad \forall \xv \in \Xspace.$$
$$\fbayes(\tilde{\xv}) = \mbox{argmin}_c \E_{y|x}\left[L(y, c)~|~ \xv = \tilde{\xv} \right] $$
\end{itemize}
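
For instance, under the squared loss $L(y, c) = (y - c)^2$ this point-wise construction yields the well-known result that the risk minimizer is the conditional mean (stated here as a concrete instance):

$$\fbayes(\tilde{\xv}) = \mbox{argmin}_c \E_{y|x}\left[(y - c)^2 ~|~ \xv = \tilde{\xv} \right] = \E_{y|x}\left[y ~|~ \xv = \tilde{\xv} \right]$$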

\begin{center}
