diff --git a/slides/advriskmin/chapter-order.tex b/slides/advriskmin/chapter-order.tex index 0e57ba28..b75a7788 100644 --- a/slides/advriskmin/chapter-order.tex +++ b/slides/advriskmin/chapter-order.tex @@ -41,6 +41,9 @@ \subsection{Bernoulli Loss} \subsection{Logistic Regression (Deep-Dive)} \includepdf[pages=-]{../../slides-pdf/slides-advriskmin-logreg-deepdive.pdf} +\subsection{Proper Scoring Rules} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-proper-scoring-rules.pdf} + \subsection{Brier Score} \includepdf[pages=-]{../../slides-pdf/slides-advriskmin-classification-brier.pdf} diff --git a/slides/advriskmin/slides-advriskmin-bias-variance-decomposition.tex b/slides/advriskmin/slides-advriskmin-bias-variance-decomposition.tex index 8de381b4..4267d5a2 100644 --- a/slides/advriskmin/slides-advriskmin-bias-variance-decomposition.tex +++ b/slides/advriskmin/slides-advriskmin-bias-variance-decomposition.tex @@ -256,6 +256,7 @@ \begin{vbframe}{Approximation and Estimation \citelink{BROWN2024BIAS} } The Bias-Variance decomp is often confused or equated with the related (but different) decomp of \textbf{excess risk} into \textbf{estimation} and \textbf{approximation} error. +\vspace{-0.3cm} \begin{eqnarray*} \underbrace{\risk(\hat f_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{excess risk}} &=& \underbrace{\risk(\hat f_{\Hspace}) - \risk(\fbayes_{\Hspace})}_{\text{estimation error}} + \underbrace{\risk(\fbayes_{\Hspace}) - \risk(\fbayes_{\Hspace_{all}})}_{\text{approx. error}} @@ -265,14 +266,15 @@ \vspace{-0.1cm} \begin{figure} \centering - \includegraphics[width = 0.78\textwidth]{figure_man/biasvar-vs-estapprox-tradeoff.png} + \includegraphics[width = 0.7\textwidth]{figure_man/biasvar-vs-estapprox-tradeoff.png} \tiny{\\ Credit: \cite{BROWN2024BIAS}} \end{figure} +{\footnotesize \textbf{NB}: The bias-variance decomp.
only holds for certain losses, while the above decomposition is universal.} -+end{vbframe} +\end{vbframe} -\framebreak +%\framebreak \begin{vbframe}{Approx./Estimation Error \citelink{BROWN2024BIAS}} diff --git a/slides/regularization/figure/enet_lasso_ridge_r2.png b/slides/regularization/figure/enet_lasso_ridge_r2.png index dba18c85..7baf9932 100644 Binary files a/slides/regularization/figure/enet_lasso_ridge_r2.png and b/slides/regularization/figure/enet_lasso_ridge_r2.png differ diff --git a/slides/regularization/figure/enet_tradeoff.png b/slides/regularization/figure/enet_tradeoff.png index a2ef3cfa..03751ab6 100644 Binary files a/slides/regularization/figure/enet_tradeoff.png and b/slides/regularization/figure/enet_tradeoff.png differ diff --git a/slides/regularization/rsrc/enet_exp.R b/slides/regularization/rsrc/enet_exp.R index 6605261d..a7eb9efa 100644 --- a/slides/regularization/rsrc/enet_exp.R +++ b/slides/regularization/rsrc/enet_exp.R @@ -1,113 +1,113 @@ -# ------------------------------------------------------------------------------ -# enetlogreg - -# DATA: generate regression data y = X(n*q ~Normal)*beta(q=500) + eps(n ~Normal) -# (1) beta is sparse, only 5 is non-zero out of 500 -# (2) beta is non-sparse -# then calculate R-squared with enet, lasso and ridge regression -# ------------------------------------------------------------------------------ - -library(mlr3) -library(glmnet) -library(mlr3learners) -library(mlr3tuning) -library(mlr3misc) -library(pracma) -library(mvtnorm) -library(future) - -set.seed(123) - -# DATA ------------------------------------------------------------------------- - -n_train = 100 -n_test = 10000 -n = n_train + n_test -n_reps = 20 -n_folds = 5 -gs1_grid = 30 -gs2_grid = c(10, 20) -p = 500 -q_seq = c(5, 500) -x_corr = 0.8 - -# Initialize grid search tuners -tuner1 = tnr("grid_search", resolution = gs1_grid) # Tuner for lambda only -tuner2 = tnr("grid_search", # Tuner for both alpha and lambda - param_resolutions = c(alpha = gs2_grid[1], lambda = gs2_grid[2]) -) - -inner = rsmp("cv", folds = n_folds) -mm = msr("regr.mse") - -l1 = lrn("regr.glmnet", alpha = 0, id = "ridge") -l2 = lrn("regr.glmnet", alpha = 1, id = "lasso") -l3 = lrn("regr.glmnet", id = "enet") - -ss1 = ps( - lambda = p_dbl(1e-3, 1e2, logscale = TRUE) # Log-scaled lambda search space -) - -l1 = auto_tuner(tuner1, l1, inner, mm, search_space = ss1) -l2 = auto_tuner(tuner1, l2, inner, mm, search_space = ss1) - -ss2 = ps( - alpha = p_dbl(0, 1), - lambda = p_dbl(1e-3, 1e2, logscale = TRUE) -) # Search space - -l3 = auto_tuner(tuner2, l3, inner, mm, search_space = ss2) - -mylearners = list(l1, l2, l3) - -myrsmp = rsmp("holdout", ratio = n_train / n) -# lrn_order = c("LM", "ridge", "lasso") - -# FUNC ------------------------------------------------------------------------- - -# Simulate data based on the given parameters and return regression task -make_simul_data = function(rep_i, q) { - sigma = x_corr^(0:(p-1)) - sigma = Toeplitz(sigma) - X = rmvnorm(n = n, sigma = sigma) - eps = rnorm(n = n, sd = 0.1) - theta = c(rep(1, q), rep(0, p-q)) - y = X %*% theta + eps - d = as.data.frame(X) - colnames(d) = sprintf("x%03i", 1:p) - d$y = y - tt = as_task_regr(d, target = "y", id = sprintf("q:%i", q)) - return(tt) -} - -# Function to run benchmarking -run_bm = function(n_reps) { - simul_grid = expand.grid(q = q_seq, rep_i = 1:n_reps) - mytasks = lapply(1:nrow(simul_grid), function(i) { - row = simul_grid[i,] - make_simul_data(rep_i = row$rep_i, q = row$q) - }) - bg = benchmark_grid(mytasks, 
mylearners, myrsmp) - bmr = benchmark(bg, store_models = TRUE) - ba = bmr$aggregate(msr("regr.rsq")) - list(bmr = bmr, ba = ba)# detailed and aggregated benchmark result -} - -# DATA ------------------------------------------------------------------------- - -# Execute benchmarking in parallel using multiple cores -plan("multicore") -z = run_bm(n_reps) -ba = z$ba -bmr = z$bmr - -# Extract and save model coefficients (betas) -nn = length(bmr$uhashes) -betas = lapply(1:nn, function(i){ - at = bmr$resample_results$resample_result[[i]]$learners[[1]] - gmod = at$learner$model - as.numeric(gmod$beta) -}) -ba$betas = betas -ba$resample_result = NULL -save(file = "enet_exp.RData", bmr_aggr = ba) +# ------------------------------------------------------------------------------ +# enetlogreg + +# DATA: generate regression data y = X(n*p ~Normal)*theta(p=500) + eps(n ~Normal) +# (1) theta is sparse: only 5 of the 500 entries are non-zero +# (2) theta is dense: all 500 entries are non-zero +# then calculate R-squared with enet, lasso and ridge regression +# ------------------------------------------------------------------------------ + +library(mlr3) +library(glmnet) +library(mlr3learners) +library(mlr3tuning) +library(mlr3misc) +library(pracma) +library(mvtnorm) +library(future) + +set.seed(123) + +# DATA ------------------------------------------------------------------------- + +n_train = 100 +n_test = 10000 +n = n_train + n_test +n_reps = 20 +n_folds = 5 +gs1_grid = 30 +gs2_grid = c(10, 20) +p = 500 +q_seq = c(5, 500) +x_corr = 0.8 + +# Initialize grid search tuners +tuner1 = tnr("grid_search", resolution = gs1_grid) # Tuner for lambda only +tuner2 = tnr("grid_search", # Tuner for both alpha and lambda + param_resolutions = c(alpha = gs2_grid[1], lambda = gs2_grid[2]) +) + +inner = rsmp("cv", folds = n_folds) +mm = msr("regr.mse") + +l1 = lrn("regr.glmnet", alpha = 0, id = "ridge") +l2 = lrn("regr.glmnet", alpha = 1, id = "lasso") +l3 = lrn("regr.glmnet", id = "enet") + +ss1 = ps( + lambda = p_dbl(1e-3, 1e2, logscale = TRUE) # Log-scaled lambda search space +) + +l1 = auto_tuner(tuner1, l1, inner, mm, search_space = ss1) +l2 = auto_tuner(tuner1, l2, inner, mm, search_space = ss1) + +ss2 = ps( + alpha = p_dbl(0, 1), + lambda = p_dbl(1e-3, 1e2, logscale = TRUE) +) # Search space + +l3 = auto_tuner(tuner2, l3, inner, mm, search_space = ss2) + +mylearners = list(l1, l2, l3) + +myrsmp = rsmp("holdout", ratio = n_train / n) +# lrn_order = c("LM", "ridge", "lasso") + +# FUNC ------------------------------------------------------------------------- + +# Simulate data based on the given parameters and return regression task +make_simul_data = function(rep_i, q) { + sigma = x_corr^(0:(p-1)) + sigma = Toeplitz(sigma) + X = rmvnorm(n = n, sigma = sigma) + eps = rnorm(n = n, sd = 0.1) + theta = c(rep(1, q), rep(0, p-q)) + y = X %*% theta + eps + d = as.data.frame(X) + colnames(d) = sprintf("x%03i", 1:p) + d$y = y + tt = as_task_regr(d, target = "y", id = sprintf("q:%i", q)) + return(tt) +} + +# Function to run benchmarking +run_bm = function(n_reps) { + simul_grid = expand.grid(q = q_seq, rep_i = 1:n_reps) + mytasks = lapply(1:nrow(simul_grid), function(i) { + row = simul_grid[i,] + make_simul_data(rep_i = row$rep_i, q = row$q) + }) + bg = benchmark_grid(mytasks, mylearners, myrsmp) + bmr = benchmark(bg, store_models = TRUE) + ba = bmr$aggregate(msr("regr.rsq")) + list(bmr = bmr, ba = ba)# detailed and aggregated benchmark result +} + +# BENCHMARK ---------------------------------------------------------------------- + +# Execute benchmarking in
parallel using multiple cores +plan("multicore") +z = run_bm(n_reps) +ba = z$ba +bmr = z$bmr + +# Extract and save model coefficients (betas) +nn = length(bmr$uhashes) +betas = lapply(1:nn, function(i){ + at = bmr$resample_results$resample_result[[i]]$learners[[1]] + gmod = at$learner$model + as.numeric(gmod$beta) +}) +ba$betas = betas +ba$resample_result = NULL +save(file = "enet_exp.RData", bmr_aggr = ba) diff --git a/slides/regularization/rsrc/enet_lasso_ridge_r2.R b/slides/regularization/rsrc/enet_lasso_ridge_r2.R index 550c8a30..c6c680d3 100644 --- a/slides/regularization/rsrc/enet_lasso_ridge_r2.R +++ b/slides/regularization/rsrc/enet_lasso_ridge_r2.R @@ -1,53 +1,53 @@ -# ------------------------------------------------------------------------------ -# enetlogreg - -# FIG: boxplot of R-squared for elasticnet, lasso and ridge -# LEFT: linear model with 5 non-Zero coefficients (sparse) -# RIGHT: linear model with 500 non-Zero coefficients -# ------------------------------------------------------------------------------ - -library(ggplot2) -library(gridExtra) -load("enet_exp.RData") - -# PLOT ------------------------------------------------------------------------- - -q_values <- sapply(ba$task_id, function(task) { - as.numeric(sub("q:(\\d+)", "\\1", task)) -}) - -performance_df <- as.data.frame(ba) -performance_df$q <- q_values -performance_df$learner_id <- as.factor(gsub("\\.tuned", "", performance_df$learner_id)) - -# linear model with sparse features -df_5 <- performance_df[performance_df['q']==5,] - -p1 <- ggplot(data = df_5, aes(x = regr.rsq, y = learner_id)) + - geom_boxplot() + - coord_flip() + - ylab("") + - labs(title="sparse") + - xlab("R-squared")+ - xlim(0.5,1)+ - theme_minimal(base_size = 10) + - theme(legend.position="none", - axis.title.x=element_blank()) - -# linear model with non-sparse features -df_500 <- performance_df[performance_df['q']==500,] - -p2 <- ggplot(data = df_500, aes(x = regr.rsq, y = learner_id)) + - geom_boxplot() + - coord_flip() + - ylab("") + - xlab("R-squared")+ - labs(title="non-sparse") + - xlim(0.5,1)+ - theme_minimal(base_size = 10) + - theme(legend.position="none", - axis.title.x=element_blank()) - -p <- grid.arrange(p1, p2, nrow= 1) - -ggsave("../figure/enet_lasso_ridge_r2.png", plot = p, width = 6, height = 2) +# ------------------------------------------------------------------------------ +# enetlogreg + +# FIG: boxplot of R-squared for elasticnet, lasso and ridge +# LEFT: linear model with 5 non-zero coefficients (sparse) +# RIGHT: linear model with 500 non-zero coefficients (dense) +# ------------------------------------------------------------------------------ + +library(ggplot2) +library(gridExtra) +load("enet_exp.RData") + +# PLOT ------------------------------------------------------------------------- + +q_values <- sapply(ba$task_id, function(task) { + as.numeric(sub("q:(\\d+)", "\\1", task)) +}) + +performance_df <- as.data.frame(ba) +performance_df$q <- q_values +performance_df$learner_id <- as.factor(gsub("\\.tuned", "", performance_df$learner_id)) + +# linear model with sparse features +df_5 <- performance_df[performance_df['q']==5,] + +p1 <- ggplot(data = df_5, aes(x = regr.rsq, y = learner_id)) + + geom_boxplot() + + coord_flip() + + ylab("") + + labs(title="sparse") + + xlab("R-squared")+ + xlim(0.95,1)+ + theme_minimal(base_size = 10) + + theme(legend.position="none", + axis.title.x=element_blank()) + +# linear model with dense features +df_500 <- performance_df[performance_df['q']==500,] + +p2 <- ggplot(data = df_500, aes(x =
regr.rsq, y = learner_id)) + + geom_boxplot() + + coord_flip() + + ylab("") + + xlab("R-squared")+ + labs(title="dense") + + xlim(0.5,1)+ + theme_minimal(base_size = 10) + + theme(legend.position="none", + axis.title.x=element_blank()) + +p <- grid.arrange(p1, p2, nrow= 1) + +ggsave("../figure/enet_lasso_ridge_r2.png", plot = p, width = 6, height = 2) diff --git a/slides/regularization/rsrc/enet_tradeoff.R b/slides/regularization/rsrc/enet_tradeoff.R index d39c2153..bf5ac2ea 100644 --- a/slides/regularization/rsrc/enet_tradeoff.R +++ b/slides/regularization/rsrc/enet_tradeoff.R @@ -48,10 +48,10 @@ p1 <- ggplot(data = df_5, aes(x=as.numeric(betas), y = as.numeric(index), group= labs(title="sparse") + facet_grid(learner_id~.)+ xlab("value") + - ylab(expression('index of'~betas)) + + ylab(expression('index of'~theta)) + scale_y_continuous(breaks=1:10) -# linear model with non-sparse features +# linear model with dense features df_500 <- performance_df[performance_df['q']==500,] df_500 <- df_500 %>% select(learner_id, betas) @@ -70,9 +70,9 @@ p2 <- ggplot(data = df_500, aes(x=as.numeric(betas), y = as.numeric(index), grou geom_boxplot(width = 0.4, color = "gray50", alpha = 0.5) + coord_flip()+ facet_grid(learner_id~.) + - labs(title="non-sparse") + + labs(title="dense") + xlab("value") + - ylab(expression('index of'~betas)) + + ylab(expression('index of'~theta)) + scale_y_continuous(breaks=1:10) p <- grid.arrange(p1, p2, nrow=1) diff --git a/slides/regularization/slides-regu-enetlogreg.tex b/slides/regularization/slides-regu-enetlogreg.tex index c2418b09..0ecd7555 100644 --- a/slides/regularization/slides-regu-enetlogreg.tex +++ b/slides/regularization/slides-regu-enetlogreg.tex @@ -35,22 +35,20 @@ \begin{vbframe} {Simulated Example} \footnotesize -50 data sets with $n=100$ for setups: $y =\xv^T \thetab + \epsilon; \quad \epsilon \sim N(0,1); - \quad \xv \sim N(0, \Sigma)$: +20 simulation repetitions with $n_{train}=100$ and $n_{test}=10000$ (5-fold CV for tuning) for setups: $y =\xv^T \thetab + \epsilon; \quad \epsilon \sim N(0,0.1^2); + \quad \xv \sim N(0, \Sigma); \quad\Sigma_{k,l}=0.8^{|k-l|}$: \vspace{-0.3cm} \begin{columns} \begin{column}{0.5\textwidth} \begin{center} -{\footnotesize \textbf{Ridge} better for corr. features}: \\ -$\thetab=(\underbrace{2,\ldots,2}_{5},\underbrace{0,\ldots,0}_{5})$\\ -$ \Sigma_{k,l}=0.8^{|k-l|}$ +{\footnotesize \textbf{Lasso} better for sparse features:} \\ +$\thetab=(\underbrace{1,\ldots,1}_{5},\underbrace{0,\ldots,0}_{495})$\\ \end{center} \end{column} \begin{column}{0.5\textwidth} \begin{center} -{\footnotesize \textbf{Lasso} better for sparse without corr.:} \\ -$\thetab=(2, 2, 2,\underbrace{0,\ldots,0}_{7})$ \\ -$ \Sigma = I_p$ +{\footnotesize \textbf{Ridge} better for dense features:} \\ +$\thetab=(\underbrace{1,\ldots,1}_{500})$ \\ \end{center} \end{column} \end{columns} @@ -67,9 +65,9 @@ \footnotesize -LHS: ridge cannot perform variable selection compared to lasso/e-net. \\ +LHS: ridge estimates of noise features hover around $0$ while lasso/e-net produce $0$s. \\ +RHS: ridge cannot perform variable selection, unlike lasso/e-net. \\ Lasso more frequently ignores relevant features than e-net (longer tails in violin plot).\\ -RHS: ridge estimates of noise features hover around $0$ while lasso/e-net produce $0$s. %Since Elastic Net offers a compromise between Ridge and lasso, it is suitable for both data situations. \end{vbframe}
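A minimal R sketch for sanity-checking the numbers behind enet_lasso_ridge_r2.png: it assumes the enet_exp.RData produced by enet_exp.R above is in the working directory and stores the aggregated benchmark table `ba` (the same object loaded by enet_lasso_ridge_r2.R), and it only reports the mean R-squared per learner and sparsity setting.
load("enet_exp.RData")                                    # assumed to restore the aggregated table `ba`
res <- as.data.frame(ba)
res$q <- as.numeric(sub("q:(\\d+)", "\\1", res$task_id)) # 5 = sparse setup, 500 = dense setup
res$learner <- gsub("\\.tuned", "", res$learner_id)      # ridge / lasso / enet
# mean R-squared per learner and sparsity setting; should mirror the boxplots in the figure
print(aggregate(regr.rsq ~ learner + q, data = res, FUN = mean))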