diff --git a/slides/regularization/figure/avoid_overfitting_02.png b/slides/regularization/figure/avoid_overfitting_02.png index 464d7414..31ba2946 100644 Binary files a/slides/regularization/figure/avoid_overfitting_02.png and b/slides/regularization/figure/avoid_overfitting_02.png differ diff --git a/slides/regularization/figure/bias_var_decomp.png b/slides/regularization/figure/bias_var_decomp.png new file mode 100755 index 00000000..d2c8abb3 Binary files /dev/null and b/slides/regularization/figure/bias_var_decomp.png differ diff --git a/slides/regularization/figure/classifi_nn_err_decay.png b/slides/regularization/figure/classifi_nn_err_decay.png new file mode 100755 index 00000000..a9c30a07 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_err_decay.png differ diff --git a/slides/regularization/figure/classifi_nn_err_size.png b/slides/regularization/figure/classifi_nn_err_size.png new file mode 100755 index 00000000..c2531c4e Binary files /dev/null and b/slides/regularization/figure/classifi_nn_err_size.png differ diff --git a/slides/regularization/figure/classifi_nn_size_1.png b/slides/regularization/figure/classifi_nn_size_1.png new file mode 100755 index 00000000..a809933c Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_1.png differ diff --git a/slides/regularization/figure/classifi_nn_size_2.png b/slides/regularization/figure/classifi_nn_size_2.png new file mode 100755 index 00000000..97a1e468 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_2.png differ diff --git a/slides/regularization/figure/classifi_nn_size_3.png b/slides/regularization/figure/classifi_nn_size_3.png new file mode 100755 index 00000000..f7fdae90 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_3.png differ diff --git a/slides/regularization/figure/classifi_nn_size_4.png b/slides/regularization/figure/classifi_nn_size_4.png new file mode 100755 index 00000000..79738067 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_4.png differ diff --git a/slides/regularization/figure/classifi_nn_size_5.png b/slides/regularization/figure/classifi_nn_size_5.png new file mode 100755 index 00000000..26436ad3 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_5.png differ diff --git a/slides/regularization/figure/classifi_nn_size_6.png b/slides/regularization/figure/classifi_nn_size_6.png new file mode 100755 index 00000000..81c0fd6e Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_6.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_1.png b/slides/regularization/figure/classifi_nn_w_size_1.png new file mode 100755 index 00000000..0f04c780 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_w_size_1.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_2.png b/slides/regularization/figure/classifi_nn_w_size_2.png new file mode 100755 index 00000000..04767ab6 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_w_size_2.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_3.png b/slides/regularization/figure/classifi_nn_w_size_3.png new file mode 100755 index 00000000..762c137b Binary files /dev/null and b/slides/regularization/figure/classifi_nn_w_size_3.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_4.png b/slides/regularization/figure/classifi_nn_w_size_4.png new file mode 100755 index 00000000..f575ef51 Binary files /dev/null and 
b/slides/regularization/figure/classifi_nn_w_size_4.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_5.png b/slides/regularization/figure/classifi_nn_w_size_5.png new file mode 100755 index 00000000..5a00920f Binary files /dev/null and b/slides/regularization/figure/classifi_nn_w_size_5.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_6.png b/slides/regularization/figure/classifi_nn_w_size_6.png new file mode 100755 index 00000000..512ed4a1 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_w_size_6.png differ diff --git a/slides/regularization/figure/early_stopping.png b/slides/regularization/figure/early_stopping.png index ddbb7cad..336be724 100644 Binary files a/slides/regularization/figure/early_stopping.png and b/slides/regularization/figure/early_stopping.png differ diff --git a/slides/regularization/figure/eval_ofit_1a.pdf b/slides/regularization/figure/eval_ofit_1a.pdf deleted file mode 100644 index 7dfa288c..00000000 Binary files a/slides/regularization/figure/eval_ofit_1a.pdf and /dev/null differ diff --git a/slides/regularization/figure/eval_ofit_1o.pdf b/slides/regularization/figure/eval_ofit_1o.pdf deleted file mode 100644 index 03080f7a..00000000 Binary files a/slides/regularization/figure/eval_ofit_1o.pdf and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-1.png b/slides/regularization/figure/fig-regu-nonlin-1.png deleted file mode 100644 index f1962bff..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-1.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-2.png b/slides/regularization/figure/fig-regu-nonlin-2.png deleted file mode 100644 index 9da89241..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-2.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-3.png b/slides/regularization/figure/fig-regu-nonlin-3.png deleted file mode 100644 index 92008738..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-3.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-4.png b/slides/regularization/figure/fig-regu-nonlin-4.png deleted file mode 100644 index d9b015fa..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-4.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-1.png b/slides/regularization/figure/fig-regu-nonlin-size-1.png deleted file mode 100644 index f972a45b..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-1.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-2.png b/slides/regularization/figure/fig-regu-nonlin-size-2.png deleted file mode 100644 index 90086b05..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-2.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-3.png b/slides/regularization/figure/fig-regu-nonlin-size-3.png deleted file mode 100644 index 32145988..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-3.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-4.png b/slides/regularization/figure/fig-regu-nonlin-size-4.png deleted file mode 100644 index 5409ec5e..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-4.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-5.png b/slides/regularization/figure/fig-regu-nonlin-size-5.png deleted 
file mode 100644 index e8f05c53..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-5.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-6.png b/slides/regularization/figure/fig-regu-nonlin-size-6.png deleted file mode 100644 index ecb0d3de..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-6.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-srm-1.png b/slides/regularization/figure/fig-regu-nonlin-srm-1.png deleted file mode 100644 index b3a2ea07..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-srm-1.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-srm-2.png b/slides/regularization/figure/fig-regu-nonlin-srm-2.png deleted file mode 100644 index d4bed213..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-srm-2.png and /dev/null differ diff --git a/slides/regularization/figure/graddes_vs_weightdecay.png b/slides/regularization/figure/graddes_vs_weightdecay.png index ecb6fd09..54af2d3e 100644 Binary files a/slides/regularization/figure/graddes_vs_weightdecay.png and b/slides/regularization/figure/graddes_vs_weightdecay.png differ diff --git a/slides/regularization/figure/l2_reg_hess_01_plot.png b/slides/regularization/figure/l2_reg_hess_01.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/l2_reg_hess_01_plot.png rename to slides/regularization/figure/l2_reg_hess_01.png diff --git a/slides/regularization/figure/l2_reg_hess_02_plot.png b/slides/regularization/figure/l2_reg_hess_02.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/l2_reg_hess_02_plot.png rename to slides/regularization/figure/l2_reg_hess_02.png diff --git a/slides/regularization/figure/l2_reg_hess_03_plot.png b/slides/regularization/figure/l2_reg_hess_03.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/l2_reg_hess_03_plot.png rename to slides/regularization/figure/l2_reg_hess_03.png diff --git a/slides/regularization/figure/l2_reg_hess_04_plot.png b/slides/regularization/figure/l2_reg_hess_04.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/l2_reg_hess_04_plot.png rename to slides/regularization/figure/l2_reg_hess_04.png diff --git a/slides/regularization/figure/lasso_contour_cases.png b/slides/regularization/figure/lasso_contour_cases.png new file mode 100755 index 00000000..0d9c0652 Binary files /dev/null and b/slides/regularization/figure/lasso_contour_cases.png differ diff --git a/slides/regularization/figure/lasso_contours.png b/slides/regularization/figure/lasso_contours.png deleted file mode 100644 index 3944622c..00000000 Binary files a/slides/regularization/figure/lasso_contours.png and /dev/null differ diff --git a/slides/regularization/figure/lasso_outside.png b/slides/regularization/figure/lasso_outside.png deleted file mode 100644 index cf61e81e..00000000 Binary files a/slides/regularization/figure/lasso_outside.png and /dev/null differ diff --git a/slides/regularization/figure/lin_model_regu_01.png b/slides/regularization/figure/lin_model_regu_01.png new file mode 100755 index 00000000..85d573bf Binary files /dev/null and b/slides/regularization/figure/lin_model_regu_01.png differ diff --git a/slides/regularization/figure/lin_model_regu_02.png b/slides/regularization/figure/lin_model_regu_02.png new file mode 100755 index 
00000000..0d245094 Binary files /dev/null and b/slides/regularization/figure/lin_model_regu_02.png differ diff --git a/slides/regularization/figure/lin_reg_l1.png b/slides/regularization/figure/lin_reg_l1.png deleted file mode 100644 index 02fc6d2d..00000000 Binary files a/slides/regularization/figure/lin_reg_l1.png and /dev/null differ diff --git a/slides/regularization/figure/lin_reg_l2.png b/slides/regularization/figure/lin_reg_l2.png deleted file mode 100644 index 4c0b29f5..00000000 Binary files a/slides/regularization/figure/lin_reg_l2.png and /dev/null differ diff --git a/slides/regularization/figure/model_eval_01.png b/slides/regularization/figure/model_eval_01.png new file mode 100755 index 00000000..28b42327 Binary files /dev/null and b/slides/regularization/figure/model_eval_01.png differ diff --git a/slides/regularization/figure/model_eval_02.png b/slides/regularization/figure/model_eval_02.png new file mode 100755 index 00000000..7bb3ff6c Binary files /dev/null and b/slides/regularization/figure/model_eval_02.png differ diff --git a/slides/regularization/figure/model_eval_03.png b/slides/regularization/figure/model_eval_03.png new file mode 100755 index 00000000..4b3f85bd Binary files /dev/null and b/slides/regularization/figure/model_eval_03.png differ diff --git a/slides/regularization/figure/multicollinearity_example.png b/slides/regularization/figure/multicollinearity_example.png new file mode 100755 index 00000000..03c9e467 Binary files /dev/null and b/slides/regularization/figure/multicollinearity_example.png differ diff --git a/slides/regularization/figure/ozone_mse_boxplot.png b/slides/regularization/figure/ozone_mse_boxplot.png index 3206091b..66d384d6 100644 Binary files a/slides/regularization/figure/ozone_mse_boxplot.png and b/slides/regularization/figure/ozone_mse_boxplot.png differ diff --git a/slides/regularization/figure/poly_ridge_01.png b/slides/regularization/figure/poly_ridge_01.png new file mode 100755 index 00000000..79304456 Binary files /dev/null and b/slides/regularization/figure/poly_ridge_01.png differ diff --git a/slides/regularization/figure/poly_ridge_02.png b/slides/regularization/figure/poly_ridge_02.png new file mode 100755 index 00000000..375f83b3 Binary files /dev/null and b/slides/regularization/figure/poly_ridge_02.png differ diff --git a/slides/regularization/figure/poly_ridge_1.png b/slides/regularization/figure/poly_ridge_1.png deleted file mode 100644 index 2ce5e37c..00000000 Binary files a/slides/regularization/figure/poly_ridge_1.png and /dev/null differ diff --git a/slides/regularization/figure/poly_ridge_2.png b/slides/regularization/figure/poly_ridge_2.png deleted file mode 100644 index 0f6d0827..00000000 Binary files a/slides/regularization/figure/poly_ridge_2.png and /dev/null differ diff --git a/slides/regularization/figure/reg_contours_01.png b/slides/regularization/figure/reg_contours_01.png new file mode 100755 index 00000000..6ccf5164 Binary files /dev/null and b/slides/regularization/figure/reg_contours_01.png differ diff --git a/slides/regularization/figure/reg_contours_02.png b/slides/regularization/figure/reg_contours_02.png new file mode 100755 index 00000000..500d3ba6 Binary files /dev/null and b/slides/regularization/figure/reg_contours_02.png differ diff --git a/slides/regularization/figure/reg_surfaces.png b/slides/regularization/figure/reg_surfaces.png deleted file mode 100644 index f7cd9bf3..00000000 Binary files a/slides/regularization/figure/reg_surfaces.png and /dev/null differ diff --git 
a/slides/regularization/figure/reg_surfaces_l1_l2.png b/slides/regularization/figure/reg_surfaces_l1_l2.png deleted file mode 100644 index 45e9f1ac..00000000 Binary files a/slides/regularization/figure/reg_surfaces_l1_l2.png and /dev/null differ diff --git a/slides/regularization/figure/reg_surfaces_l1_lam0.png b/slides/regularization/figure/reg_surfaces_l1_lam0.png new file mode 100755 index 00000000..2420f092 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l1_lam0.png differ diff --git a/slides/regularization/figure/reg_surfaces_l1_lam1.png b/slides/regularization/figure/reg_surfaces_l1_lam1.png new file mode 100755 index 00000000..73593179 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l1_lam1.png differ diff --git a/slides/regularization/figure/reg_surfaces_l1_lam10.png b/slides/regularization/figure/reg_surfaces_l1_lam10.png new file mode 100755 index 00000000..0cb729c0 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l1_lam10.png differ diff --git a/slides/regularization/figure/reg_surfaces_l2_lam0.png b/slides/regularization/figure/reg_surfaces_l2_lam0.png new file mode 100755 index 00000000..38bcf395 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l2_lam0.png differ diff --git a/slides/regularization/figure/reg_surfaces_l2_lam1.png b/slides/regularization/figure/reg_surfaces_l2_lam1.png new file mode 100755 index 00000000..14d2eee6 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l2_lam1.png differ diff --git a/slides/regularization/figure/reg_surfaces_l2_lam10.png b/slides/regularization/figure/reg_surfaces_l2_lam10.png new file mode 100755 index 00000000..70231061 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l2_lam10.png differ diff --git a/slides/regularization/figure/regu_example_multicollinearity.png b/slides/regularization/figure/regu_example_multicollinearity.png deleted file mode 100644 index 7f837c33..00000000 Binary files a/slides/regularization/figure/regu_example_multicollinearity.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_constraints.png b/slides/regularization/figure/ridge_constraints.png deleted file mode 100644 index c5374862..00000000 Binary files a/slides/regularization/figure/ridge_constraints.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_contours.png b/slides/regularization/figure/ridge_contours.png deleted file mode 100644 index fc0441c2..00000000 Binary files a/slides/regularization/figure/ridge_contours.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_hat.png b/slides/regularization/figure/ridge_hat.png deleted file mode 100644 index 096c3c1a..00000000 Binary files a/slides/regularization/figure/ridge_hat.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_inside.png b/slides/regularization/figure/ridge_inside.png deleted file mode 100644 index f298baa8..00000000 Binary files a/slides/regularization/figure/ridge_inside.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_outside.png b/slides/regularization/figure/ridge_outside.png deleted file mode 100644 index 9b84c425..00000000 Binary files a/slides/regularization/figure/ridge_outside.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_perspectives_01.png b/slides/regularization/figure/ridge_perspectives_01.png new file mode 100755 index 00000000..2e670bc5 Binary files /dev/null and 
b/slides/regularization/figure/ridge_perspectives_01.png differ diff --git a/slides/regularization/figure/ridge_perspectives_02.png b/slides/regularization/figure/ridge_perspectives_02.png new file mode 100755 index 00000000..59fd7ce4 Binary files /dev/null and b/slides/regularization/figure/ridge_perspectives_02.png differ diff --git a/slides/regularization/figure/ridge_perspectives_03.png b/slides/regularization/figure/ridge_perspectives_03.png new file mode 100755 index 00000000..87dfda49 Binary files /dev/null and b/slides/regularization/figure/ridge_perspectives_03.png differ diff --git a/slides/regularization/figure/ridge_perspectives_04.png b/slides/regularization/figure/ridge_perspectives_04.png new file mode 100755 index 00000000..252e9a27 Binary files /dev/null and b/slides/regularization/figure/ridge_perspectives_04.png differ diff --git a/slides/regularization/figure/ridge_vs_sgd_path.png b/slides/regularization/figure/ridge_vs_sgd_path.png new file mode 100755 index 00000000..d1bca0a6 Binary files /dev/null and b/slides/regularization/figure/ridge_vs_sgd_path.png differ diff --git a/slides/regularization/figure/shrinkage_01.png b/slides/regularization/figure/shrinkage_01.png new file mode 100755 index 00000000..970c18a1 Binary files /dev/null and b/slides/regularization/figure/shrinkage_01.png differ diff --git a/slides/regularization/figure/shrinkage_02.png b/slides/regularization/figure/shrinkage_02.png new file mode 100755 index 00000000..2b5d0d08 Binary files /dev/null and b/slides/regularization/figure/shrinkage_02.png differ diff --git a/slides/regularization/figure/shrinkage_1.png b/slides/regularization/figure/shrinkage_1.png deleted file mode 100644 index 0157b7e3..00000000 Binary files a/slides/regularization/figure/shrinkage_1.png and /dev/null differ diff --git a/slides/regularization/figure/shrinkage_2.png b/slides/regularization/figure/shrinkage_2.png deleted file mode 100644 index 6b28982a..00000000 Binary files a/slides/regularization/figure/shrinkage_2.png and /dev/null differ diff --git a/slides/regularization/figure/soft_thresholding.png b/slides/regularization/figure/soft_thresholding.png new file mode 100755 index 00000000..9bb06127 Binary files /dev/null and b/slides/regularization/figure/soft_thresholding.png differ diff --git a/slides/regularization/figure/solution_paths_01.png b/slides/regularization/figure/solution_paths_01.png new file mode 100755 index 00000000..af753baf Binary files /dev/null and b/slides/regularization/figure/solution_paths_01.png differ diff --git a/slides/regularization/figure/solution_paths_02.png b/slides/regularization/figure/solution_paths_02.png new file mode 100755 index 00000000..ae783dc0 Binary files /dev/null and b/slides/regularization/figure/solution_paths_02.png differ diff --git a/slides/regularization/figure/weightdecay_lambda_plot_01.png b/slides/regularization/figure/weightdecay_lambda_01.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/weightdecay_lambda_plot_01.png rename to slides/regularization/figure/weightdecay_lambda_01.png diff --git a/slides/regularization/figure/weightdecay_lambda_plot_02.png b/slides/regularization/figure/weightdecay_lambda_02.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/weightdecay_lambda_plot_02.png rename to slides/regularization/figure/weightdecay_lambda_02.png diff --git a/slides/regularization/figure/weightdecay_lambda_plot.png 
b/slides/regularization/figure/weightdecay_lambda_plot.png deleted file mode 100644 index 8efe8b7b..00000000 Binary files a/slides/regularization/figure/weightdecay_lambda_plot.png and /dev/null differ diff --git a/slides/regularization/figure_man/bayes-plot-posterior.png b/slides/regularization/figure_man/bayes-plot-posterior.png deleted file mode 100644 index b7011648..00000000 Binary files a/slides/regularization/figure_man/bayes-plot-posterior.png and /dev/null differ diff --git a/slides/regularization/figure_man/bias-variance-ridge.png b/slides/regularization/figure_man/bias-variance-ridge.png deleted file mode 100644 index 1af66601..00000000 Binary files a/slides/regularization/figure_man/bias-variance-ridge.png and /dev/null differ diff --git a/slides/regularization/figure_man/lasso_contours_cases.png b/slides/regularization/figure_man/lasso_contours_cases.png deleted file mode 100644 index 2f68583f..00000000 Binary files a/slides/regularization/figure_man/lasso_contours_cases.png and /dev/null differ diff --git a/slides/regularization/figure_man/other-pen-MCP.png b/slides/regularization/figure_man/other-pen-MCP.png deleted file mode 100644 index cb5eef09..00000000 Binary files a/slides/regularization/figure_man/other-pen-MCP.png and /dev/null differ diff --git a/slides/regularization/figure_man/other-pen-SCAD.png b/slides/regularization/figure_man/other-pen-SCAD.png deleted file mode 100644 index 7dc5225c..00000000 Binary files a/slides/regularization/figure_man/other-pen-SCAD.png and /dev/null differ diff --git a/slides/regularization/figure_man/other-pen-lasso.png b/slides/regularization/figure_man/other-pen-lasso.png deleted file mode 100644 index 88e05f77..00000000 Binary files a/slides/regularization/figure_man/other-pen-lasso.png and /dev/null differ diff --git a/slides/regularization/figure_man/ridge-vs-sgd-path.png b/slides/regularization/figure_man/ridge-vs-sgd-path.png deleted file mode 100644 index 9c732b08..00000000 Binary files a/slides/regularization/figure_man/ridge-vs-sgd-path.png and /dev/null differ diff --git a/slides/regularization/figure_man/ridge_hat.png b/slides/regularization/figure_man/ridge_hat.png deleted file mode 100644 index ebe29bca..00000000 Binary files a/slides/regularization/figure_man/ridge_hat.png and /dev/null differ diff --git a/slides/regularization/figure_man/soft-thresholding.pdf b/slides/regularization/figure_man/soft-thresholding.pdf deleted file mode 100644 index 12208bc0..00000000 Binary files a/slides/regularization/figure_man/soft-thresholding.pdf and /dev/null differ diff --git a/slides/regularization/figure_man/solution-path-ridge-lasso.png b/slides/regularization/figure_man/solution-path-ridge-lasso.png deleted file mode 100644 index 74fc339a..00000000 Binary files a/slides/regularization/figure_man/solution-path-ridge-lasso.png and /dev/null differ diff --git a/slides/regularization/figure_man/solution-path-ridge-only.png b/slides/regularization/figure_man/solution-path-ridge-only.png deleted file mode 100644 index 5f8fda17..00000000 Binary files a/slides/regularization/figure_man/solution-path-ridge-only.png and /dev/null differ diff --git a/slides/regularization/figure_man/solution_path.png b/slides/regularization/figure_man/solution_path.png deleted file mode 100644 index a72944be..00000000 Binary files a/slides/regularization/figure_man/solution_path.png and /dev/null differ diff --git a/slides/regularization/figure_man/solution_path_l2.png b/slides/regularization/figure_man/solution_path_l2.png deleted file mode 100644 index 
71cccb93..00000000 Binary files a/slides/regularization/figure_man/solution_path_l2.png and /dev/null differ diff --git a/slides/regularization/figure_man/solution_paths_l1_l2.png b/slides/regularization/figure_man/solution_paths_l1_l2.png deleted file mode 100644 index b163dee8..00000000 Binary files a/slides/regularization/figure_man/solution_paths_l1_l2.png and /dev/null differ diff --git a/slides/regularization/figure_man/wt_decay_hat.png b/slides/regularization/figure_man/wt_decay_hat.png deleted file mode 100644 index 97c5bbc1..00000000 Binary files a/slides/regularization/figure_man/wt_decay_hat.png and /dev/null differ diff --git a/slides/regularization/rsrc/avoid_overfitting.R b/slides/regularization/rsrc/avoid_overfitting.R new file mode 100755 index 00000000..2043285b --- /dev/null +++ b/slides/regularization/rsrc/avoid_overfitting.R @@ -0,0 +1,40 @@ +# ------------------------------------------------------------------------------ +# intro + +# FIG: how MSE for training and test data change with +# different feature numbers, and with different data sizes. + +# DATA: from data_ozone_example.RData +# ------------------------------------------------------------------------------ + +library(ggplot2) +library(data.table) + +theme_set(theme_minimal()) + +# DATA ------------------------------------------------------------------------- + +load("data_ozone_example.RData") + +dfp <- setDT(df_incdata)[, .(mean.mse = median(value)), by = c("nobs", "variable")] + +# PLOTS ------------------------------------------------------------------------ + +# data size +p1 <- ggplot(data = dfp, aes(x = nobs, y = mean.mse, colour = variable)) + + geom_line(lwd = 1.2) + ylim(c(0, 100)) + labs(colour = " ") + + scale_colour_discrete(labels = c("Train error", "Test error")) + + xlab("Size of data set") + ylab("MSE") + + scale_color_brewer(palette="Dark2") + +# feature number +p2 <- ggplot(data = df_incfeatures, aes(x = type, y = mean.mse, colour = variable)) + + geom_line(lwd = 1.2) + labs(colour = " ") + + scale_colour_discrete(labels = c("Train error", "Test error")) + + xlab("Number of features") + ylab("MSE") + + ylim(c(0, 150)) + + scale_x_continuous(breaks = 0:12) + + scale_color_brewer(palette="Dark2") + +ggsave("../figure/avoid_overfitting_01.png", plot=p1, width=5, height=2.5) +ggsave("../figure/avoid_overfitting_02.png", plot=p2, width=5, height=2.5) diff --git a/slides/regularization/rsrc/bias-var-decomp-ridge.py b/slides/regularization/rsrc/bias-var-decomp-ridge.py deleted file mode 100644 index 9c8d7a6f..00000000 --- a/slides/regularization/rsrc/bias-var-decomp-ridge.py +++ /dev/null @@ -1,94 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from sklearn.preprocessing import PolynomialFeatures -from sklearn.linear_model import Ridge -from sklearn.metrics import mean_squared_error - - -# Set the random seed for reproducibility -np.random.seed(0) - -# Define the true function and the number of datasets -true_function = lambda x: np.sin(x) -n_datasets = 100 # Number of datasets for training -n_samples = 100 -n_test_samples = 10000 -n_order = 8 -lambdas = np.exp(np.linspace(-6, 7, 25)) - -# Generate polynomial features -poly = PolynomialFeatures(degree=n_order, include_bias=False) - -# Initialize arrays to store the bias, variance, and error -bias_squared = np.zeros_like(lambdas) -variance = np.zeros_like(lambdas) -test_error = np.zeros_like(lambdas) - -# Generate shared x values for all datasets -x_shared = np.random.uniform(0, 1, n_samples).reshape(-1, 1) -x_shared_poly = 
poly.fit_transform(x_shared) - -# Generate test data -x_test = np.random.uniform(0, 1, n_test_samples).reshape(-1, 1) -y_test = true_function(x_test).reshape(-1, 1) + np.random.randn(n_test_samples,1) -x_test_poly = poly.transform(x_test) - -# Loop over the lambda values -for i, lambda_val in enumerate(lambdas): - # Initialize arrays to store predictions for each model - predictions = np.zeros((n_datasets, n_samples)) - - # Train and predict with n_datasets models - for j in range(n_datasets): - # Generate new y values for each dataset - epsilon = np.random.randn(n_samples, 1) - y = true_function(x_shared) + epsilon - - # Fit Ridge regression model - model = Ridge(alpha=lambda_val, fit_intercept=True) - model.fit(x_shared_poly, y) - predictions[j, :] = model.predict(x_shared_poly).flatten() - - # Calculate the average prediction for each x - average_prediction = np.mean(predictions, axis=0) - - # Compute itegrated bias^2 and variance using MC - bias_squared[i] = np.mean((average_prediction - true_function(x_shared).flatten()) ** 2) - variance[i] = np.mean(np.var(predictions, axis=0)) - -# Train a final model on a new dataset and compute test error for each lambda -for i, lambda_val in enumerate(lambdas): - # Generate new data for the final model - x_train_final = np.random.uniform(0, 1, n_samples).reshape(-1, 1) - y_train_final = true_function(x_train_final) + np.random.randn(n_samples, 1) - x_train_final_poly = poly.transform(x_train_final) - - # Fit the final model - model_final = Ridge(alpha=lambda_val, fit_intercept=True) - model_final.fit(x_train_final_poly, y_train_final) - - # Predict on the test set and compute the error - y_test_pred_final = model_final.predict(x_test_poly).flatten() - # The test error - test_error[i] = mean_squared_error(y_test, y_test_pred_final) - -# Plotting the results with two y-axes -fig, ax1 = plt.subplots(figsize=(12, 6)) - -# Plot bias^2 and variance on the primary y-axis -ax1.plot(np.log(lambdas), bias_squared, label='(bias)^2', color='red') -ax1.plot(np.log(lambdas), variance, label='variance', color='blue') -ax1.plot(np.log(lambdas), bias_squared + variance, label='(bias)^2 + variance', color='green') - -ax1.set_xlabel('ln(λ)', fontsize=16) -ax1.set_ylabel('(bias)^2, variance', fontsize=16) -ax1.legend(loc='upper left') - -# Create secondary y-axis for test error -ax2 = ax1.twinx() -ax2.plot(np.log(lambdas), test_error, label='test error', color='magenta', linestyle='--', alpha=.6) -ax2.set_ylabel('Test error on single dataset', fontsize=16) -ax2.legend(loc='upper right') - -plt.title('Bias-Variance Tradeoff with L2 Regularization', fontsize=20) -plt.show() diff --git a/slides/regularization/rsrc/bias_var_decomp.R b/slides/regularization/rsrc/bias_var_decomp.R new file mode 100755 index 00000000..5a6369da --- /dev/null +++ b/slides/regularization/rsrc/bias_var_decomp.R @@ -0,0 +1,78 @@ +# ------------------------------------------------------------------------------ +# l2 nonlin + +# FIG: decompose MSE to bias_square and variance for ridge regression. +# plot lines to show how each part varies +# with ln(lambda) (natural logarithm of regularization constant). 
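The Monte-Carlo estimator used in bias_var_decomp.R below refits the model on many simulated datasets at shared x values and then averages; a minimal sketch of just the decomposition step, assuming a hypothetical matrix preds (one row of predictions per simulated dataset, columns indexed by the shared x values) and the noise-free targets f_true:

avg_pred <- colMeans(preds)              # pointwise average prediction over datasets
bias_sq  <- mean((avg_pred - f_true)^2)  # integrated squared bias
var_hat  <- mean(apply(preds, 2, var))   # integrated variance over datasets

Their sum estimates the reducible error that the script plots against ln(lambda).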
+ + # DATA: y = sin(x(100*1 ~Uniform)) + eps (100*1 ~Normal) + # X = (x^1,...,x^8) (100*8 design matrix) + # ------------------------------------------------------------------------------ + +library(ggplot2) +library(dplyr) +library(tidyr) +library(glmnet) + +set.seed(0) + +# DATA ------------------------------------------------------------------------- + +true_function <- function(x) sin(x) +n_datasets <- 100 +n_samples <- 100 +n_test_samples <- 10000 +n_order <- 8 +lambdas <- exp(seq(-6, 7, length.out = 25)) + +# Generate polynomial features +poly_features <- function(x, degree) { + model.matrix(~ poly(x, degree, raw = TRUE) - 1) +} + +# Initialize arrays to store the bias, variance, and error +bias_square <- rep(0, length(lambdas)) +variance <- rep(0, length(lambdas)) +test_error <- rep(0, length(lambdas)) + +# Generate shared x values for all datasets +x_shared <- runif(n_samples) +x_shared_poly <- poly_features(x_shared, n_order) + +# Generate test data +x_test <- runif(n_test_samples) +y_test <- true_function(x_test) + rnorm(n_test_samples) +x_test_poly <- poly_features(x_test, n_order) + +for (i in 1:length(lambdas)) { + predictions <- matrix(0, nrow = n_datasets, ncol = n_samples) + + for (j in 1:n_datasets) { + epsilon <- rnorm(n_samples) + y <- true_function(x_shared) + epsilon + + model <- glmnet(x_shared_poly, y, alpha = 0, lambda = lambdas[i]) + predictions[j, ] <- predict(model, newx = x_shared_poly) + } + + average_prediction <- apply(predictions, 2, mean) + + bias_square[i] <- mean((average_prediction - true_function(x_shared))^2) + variance[i] <- mean(apply(predictions, 2, var)) +} + + +data <- data.frame(log_lambdas = log(lambdas), + bias_square = bias_square, + variance = variance, + MSE = bias_square + variance) %>% + pivot_longer(cols = c(bias_square, variance, MSE), names_to = "component", values_to = "value") + +p <- ggplot(data, aes(x = log_lambdas, y = value, color = component, linetype = component)) + + geom_line(size = 1) + + scale_color_manual(values = c("red", "green", "blue")) + + scale_linetype_manual(values = c("solid", "solid", "solid")) + + labs(x = expression("ln("~λ~")"), y = "value", title = "Bias-Variance Tradeoff with L2 Regularization") + + theme_minimal() + +ggsave("bias_var_decomp.png", p, width = 12, height = 6) diff --git a/slides/regularization/rsrc/make_fig_regu_nonlin_plots.R b/slides/regularization/rsrc/classifi_nn.R old mode 100644 new mode 100755 similarity index 54% rename from slides/regularization/rsrc/make_fig_regu_nonlin_plots.R rename to slides/regularization/rsrc/classifi_nn.R index 9b7118f8..e0769717 --- a/slides/regularization/rsrc/make_fig_regu_nonlin_plots.R +++ b/slides/regularization/rsrc/classifi_nn.R @@ -1,31 +1,45 @@ -################################################################################ -####### Non-linear regularization: Neural net ################################## -################################################################################ +# ------------------------------------------------------------------------------ +# nonlin + +# FIG: +# (1) classification predictions, weight histograms, and weight values +# for different lambdas (decay parameter) of a nn. +# (2) classification predictions for different sizes of the hidden layer of a nn. +# (3) how classification error changes with different lambdas, +# and with different sizes of the hidden layer of a nn. + +# DATA: "spirals" from mlr3.
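Stripped of all plotting, the training step that classifi_nn.R repeats for each lambda is small enough to run on its own; a minimal sketch using the same mlr3 calls as the script (the size and decay values are illustrative only):

library(mlr3)
library(mlr3learners)
task <- tgen("spirals", sd = 0.1)$generate(n = 100)      # same task as in the script
learner <- lrn("classif.nnet", size = 10, decay = 0.01)  # decay is the weight-decay lambda
learner$train(task)
head(learner$model$wts)  # the weights that plot_weights()/plot_histogram() visualize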
+# ------------------------------------------------------------------------------ library(mlr3) library(mlr3learners) library(mlr3viz) +library(mlr3misc) library(ggplot2) library(gridExtra) library(grid) -#------------------------------------------------------------------------------- + options(digits = 3, width = 65, str = strOptions(strict.width = "cut", vec.len = 3)) -#------------------------------------------------------------------------------- -# functions for plotting the weights +set.seed(1234) + +# PLOT FUNCTIONS --------------------------------------------------------------- + +# value for each weight plot_weights <- function (weights) { - weight_data <- data.frame(value = weights, weights = seq_along(weights)) + weight_data <- data.frame(value = weights, weight_index = seq_along(weights)) - ggplot(weight_data, aes(x=weights, y = value)) + + ggplot(weight_data, aes(x=weight_index, y = value)) + geom_bar(stat ="identity", color="black", fill="white") + ylim(c(-75, 75)) + ggtitle("Weights") } +# histogram of weights plot_histogram <- function (weights) { weight_data <- data.frame(value = weights) @@ -33,14 +47,10 @@ plot_histogram <- function (weights) { geom_histogram (bins= 15, color="black", fill="white") + ggtitle("Histogram of weights") + xlab ("value of weights") + - xlim(c(-100, 100)) #+ - #ylim(c(0,40)) + xlim(c(-100, 100)) } - - -# function for plotting the prediction - +# classification model visualization plot_prediction <- function (learner, task) { plot_learner_prediction(learner, task) + scale_fill_viridis_d(end = .9) + @@ -48,29 +58,21 @@ plot_prediction <- function (learner, task) { ggtitle("Prediction") } -#------------------------------------------------------------------------------- -#spirals dataset +# DATA ------------------------------------------------------------------------- + spirals_generator <- tgen("spirals", sd = 0.1) -# get spirals data spirals_task <- spirals_generator$generate(n=100) +# PLOT PREDICTION & WEIGHTS ---------------------------------------------------- -################################################################################ -############ Different decay parameters ### #################################### -################################################################################ -# decay parameter / lambda +### Different decay parameters decay_list <- list(0, 0.001, 0.005, 0.01, 0.05, 0.1) -# size of single hidden layer size <- 10 - -# plot for all decay paramters the predition & a plot of the weights for(i in seq_along(decay_list)){ - set.seed(1234) learner <- lrn("classif.nnet", size = size, decay = decay_list[[i]]) - learner$train(spirals_task) weights <- learner$model$wts weight_plot <- plot_weights(weights = weights) @@ -78,25 +80,20 @@ for(i in seq_along(decay_list)){ prediction_plot <- plot_prediction(learner, spirals_task) - grid <- grid.arrange(prediction_plot,historgram_plot, weight_plot, ncol = 3, + grid <- grid.arrange(prediction_plot, historgram_plot, weight_plot, ncol = 3, top = textGrob(bquote(lambda==.(decay_list[[i]])), gp = gpar(fontsize = 14))) - ggsave(filename = paste0("../figure/fig-regu-nonlin-", i ,".png"), + ggsave(filename = paste0("../figure/classifi_nn_w_size_", i ,".png"), plot = grid, width = 8, height = 3) } -################################################################################ -############ Different size of hidden layer #################################### -################################################################################ - - +### Different size of hidden layer 
size_list <- list(1, 2, 3, 5, 10, 100) decay <- 0.001 for(i in seq_along(size_list)){ - set.seed(1234) learner <- lrn("classif.nnet", size = size_list[[i]], decay = decay ) learner$train(spirals_task) @@ -110,54 +107,46 @@ for(i in seq_along(size_list)){ top = textGrob(bquote(size~of~hidden~layer==.(size_list[[i]])), gp = gpar(fontsize = 14))) - ggsave(filename = paste0("../figure/fig-regu-nonlin-size-", i ,".png"), + ggsave(filename = paste0("../figure/classifi_nn_size_", i ,".png"), plot = grid, width = 3, height = 3) } -#------------------------------------------------------------------------------- +# PLOT CLASSIFICATION ERROR ---------------------------------------------------- +### Different decay parameters folds <- 10; reps <- 5; size <- 10 decay_list <- seq(0, 0.02, length.out = 20) - -# this might run for 5 min rdesc <- rsmp("repeated_cv", folds = folds, repeats = reps) lrns <- lapply(decay_list, function(d) lrn("classif.nnet", size = size, decay = d)) gg <- benchmark_grid(tasks = spirals_task, resamplings = rdesc, learners = lrns) br <- benchmark(gg) -a <- br$aggregate(measures = msr("classif.ce"), params = TRUE) -a <- mlr3misc::unnest(a, "params") +a1 <- br$aggregate(measures = msr("classif.ce"), params = TRUE) +a1 <- unnest(a1, "params") -a$log_decay <- log(a$decay + 1) #make U-shape more obivious -p <- ggplot(data = a, aes(x = log_decay, y = classif.ce)) + +a1$log_decay <- log(a1$decay + 1) # make U-shape more obvious +p1 <- ggplot(data = a1, aes(x = log_decay, y = classif.ce)) + geom_line() + - xlab("log(lambda+1)") + ylab("classif err") + - xlim(0, 0.01) + ylim(0.13, 0.27) -#print(p) - -ggsave(filename = paste0("../figure/fig-regu-nonlin-srm-1.png"), - plot = p, width = 6, height = 3) - -#------------------------------------------------------------------------------- + xlab(expression("log("~lambda~"+ 1 )")) + ylab("classif err") + + xlim(0, 0.01) + ylim(0.1, 0.25) +ggsave(filename = paste0("../figure/classifi_nn_err_decay.png"), + plot = p1, width = 6, height = 3) +### Different size of hidden layer folds <- 10; reps <- 5; by <- 1 decay <- 0.001 size_list <- seq(1, 30, by = by) - -# this might run for 5 min rdesc <- rsmp("repeated_cv", folds = folds, repeats = reps) lrns <- lapply(size_list, function(s) lrn("classif.nnet", size = s, decay = decay)) gg <- benchmark_grid(tasks = spirals_task, resamplings = rdesc, learners = lrns) br <- benchmark(gg) -a <- br$aggregate(measures = msr("classif.ce"), params = TRUE) -a <- mlr3misc::unnest(a, "params") -p <- ggplot(data = a, aes(x = size, y = classif.ce)) + +a2 <- br$aggregate(measures = msr("classif.ce"), params = TRUE) +a2 <- unnest(a2, "params") +p2 <- ggplot(data = a2, aes(x = size, y = classif.ce)) + geom_line() + xlab("size hidden layer") + ylab("classif err") -#print(p) - -ggsave(filename = paste0("../figure/fig-regu-nonlin-srm-2.png"), - plot = p, width = 6, height = 3) +ggsave(filename = paste0("../figure/classifi_nn_err_size.png"), + plot = p2, width = 6, height = 3) diff --git a/slides/regularization/rsrc/utils.R b/slides/regularization/rsrc/data_func_utils.R old mode 100644 new mode 100755 similarity index 64% rename from slides/regularization/rsrc/utils.R rename to slides/regularization/rsrc/data_func_utils.R index 601554b6..26901f1c --- a/slides/regularization/rsrc/utils.R +++ b/slides/regularization/rsrc/data_func_utils.R @@ -1,11 +1,27 @@ -library(mlr) -library(mlbench) +# ------------------------------------------------------------------------------ +# geom l1, geom l2, wd vs l2 + +# DATA: simulate linear
regression data for ridge and lasso subchapters, +# and define functions for contour plots of empirical risk. +# y = X(100*2 ~Unif)·beta_true(0.5,3) + noise(100*1 ~Normal) + +# FUNC: empirical risk of linear regression model +# hessian matrix for empirical risk +# risk function with l2 regularization +# gradient of empirical risk +# gradient of l2 regularized risk +# gradient descent to get optimal beta +# weight decay to get optimal beta +# contour plots for empirical risk +# ------------------------------------------------------------------------------ + library(ggplot2) -library(BBmisc) -library(reshape) library(viridis) set.seed(123) + +# DATA ------------------------------------------------------------------------- + num_obs <- 100 num_features <- 2 @@ -16,64 +32,76 @@ beta_true <- c(0.5, 3) y <- X %*% beta_true + rnorm(num_obs, sd = err_std) +# FUNCTION --------------------------------------------------------------------- + +# empirical risk R_emp <- function(beta, features = X, target = y){ return(sum((features %*% beta - target)^2)) } +# hessian matrix R_emp_hessian <- function(features = X){ return(2 * t(features)%*%(features)) } +# risk function with l2 regularization R_reg_l2 <- function(beta, lambda = 0.1, features = X, target = y){ return(R_emp(beta, features, target) + (0.5*lambda * sum(beta^2))) } -plot_r_emp <- function(r_emp, x1, x2, bins=NULL, breaks=NULL){ - eval_grid <- expand.grid(x1,x2) - eval_grid$r_emp <- apply(eval_grid, 1, r_emp) - - ggplot(eval_grid) + - geom_raster(aes(x=Var1, y=Var2, fill=r_emp)) + - geom_contour(aes(x=Var1, y=Var2, z=r_emp), colour="white", bins=bins, breaks=breaks) + - xlab(expression(theta[1])) + - ylab(expression(theta[2])) + - scale_fill_viridis(end = 0.9) -} - +# gradient of empirical risk R_emp_grad <- function(beta, features = X, target = y){ return(2 * t(features)%*%(features %*% beta - target)) } +# gradient of l2 regularized risk R_reg_l2_grad <- function(beta, lambda, features = X, target = y){ return((2 * t(features)%*%(features %*% beta - target) + lambda*beta)) } +# gradient descent to get optimal beta gradient_descent <- function(beta_start, step_size, grad_fun, num_steps){ betas <- matrix(0, ncol=length(beta_start), nrow=num_steps) betas[1, ] <- beta_start for(i in seq(2,num_steps)){ betas[i, ] <- betas[i-1, ] - step_size * grad_fun(betas[i-1,]) } - + betas <- as.data.frame(betas) return(betas) } +# weight decay to get optimal beta weight_decay <- function(beta_start, lambda, step_size, unreg_grad_fun, num_steps){ betas_wd <- matrix(NA, ncol=length(beta_start), nrow=(num_steps)*3) betas_wd[1, ] <- beta_start - + betas_gd <- matrix(NA, ncol=length(beta_start), nrow=(num_steps-1)*3) - + for(i in seq(1, 3 * (num_steps-1), 3)){ betas_wd[i+1, ] <- betas_wd[i, ]*(1-step_size*lambda) betas_gd[i, ] <- betas_wd[i+1, ] betas_gd[i+1, ] <- betas_gd[i, ] - step_size * unreg_grad_fun(betas_wd[i,]) betas_wd[i+3, ] <- betas_gd[i+1, ] } - + return(list(betas_wd = as.data.frame(betas_wd), betas_gd = as.data.frame(betas_gd))) -} \ No newline at end of file +} + +# PLOT FUNCTION ---------------------------------------------------------------- + +# empirical risk contour plots +plot_r_emp <- function(r_emp, x1, x2, bins=NULL, breaks=NULL){ + eval_grid <- expand.grid(x1,x2) + eval_grid$r_emp <- apply(eval_grid, 1, r_emp) + + ggplot(eval_grid) + + geom_raster(aes(x=Var1, y=Var2, fill=r_emp)) + + geom_contour(aes(x=Var1, y=Var2, z=r_emp), colour="white", bins=bins, breaks=breaks) + + xlab(expression(theta[1])) + + ylab(expression(theta[2])) + + 
scale_fill_viridis(end = 0.9) +} diff --git a/slides/regularization/rsrc/ozone_example.RData b/slides/regularization/rsrc/data_ozone_example.RData similarity index 100% rename from slides/regularization/rsrc/ozone_example.RData rename to slides/regularization/rsrc/data_ozone_example.RData diff --git a/slides/regularization/rsrc/data_regu_example_1.R b/slides/regularization/rsrc/data_regu_example_1.R new file mode 100755 index 00000000..d3faf433 --- /dev/null +++ b/slides/regularization/rsrc/data_regu_example_1.R @@ -0,0 +1,74 @@ +# ------------------------------------------------------------------------------ +# l1 vs l2 + +# RDATA: +# (1): generate coefficient paths for regression with +# different regularization constants under l1 and l2 regularization. +# (2): generate coefficients and MSE for cross-validation with 50 +# regularization constants (2^-10 to 2^20) +# under l1 and l2 regularization. +# DATA: boston_housing +# ------------------------------------------------------------------------------ + +library(BBmisc) +library(data.table) +library(mlr3) +library(dplyr) +library(mlr3learners) +library(mlr3tuning) + +set.seed(123) + +# DATA ------------------------------------------------------------------------- + +task = tsk("boston_housing") +feat_drop = c("chas", "nox", "rm", "lat", "lon", "town", "tract") +task$select(setdiff(task$feature_names, feat_drop)) +featnames = task$feature_names + +compute_coef_paths = function(task, lambda_name, lambda_seq) { + alpha = ifelse(lambda_name=='lambda1', 1, 0) + path = list() + for (i in seq_along(lambda_seq)) { + lamval <- lambda_seq[i] + learner = lrn("regr.glmnet", alpha = alpha, lambda=lamval) + learner$train(task) + cc <- t(as.matrix(coef(learner$model))) + names <- colnames(cc) + cc <- as.numeric(cc) + names(cc) <- names + cc <- as.list(cc) + cc$lambda <- lamval + path[[i]] <- cc + } + path <- rbindlist(path, fill = TRUE) + path[is.na(path)] <- 0 + + # Perform cross validation + learner = lrn("regr.glmnet", alpha = alpha, lambda=to_tune(lambda_seq)) + + # Construct tuning instance + instance = ti( + task = task, + learner = learner, + resampling = rsmp("cv", folds = 3), + measures = msr("regr.mse"), + terminator = trm("evals", n_evals = length(lambda_seq)) + ) + + tuner <- tnr("grid_search", resolution = length(lambda_seq)) + tuner$optimize(instance) + cv_lam <- as.data.frame(instance$archive$data)[,1:2] + colnames(cv_lam) <- c("lambda", "mse") + cv_lam$lambda <- as.numeric(as.character(cv_lam$lambda)) + cv_lam <- cv_lam %>% arrange(lambda) + + list(path = path, cv_lam = cv_lam) +} + +lambda_seq = 2^seq(-10, 20, length.out = 50) +path_l1 = compute_coef_paths(task, "lambda1", lambda_seq) +path_l2 = compute_coef_paths(task, "lambda2", lambda_seq) + +save2("data_regu_example_1.RData", path_l1 = path_l1, path_l2 = path_l2, featnames = featnames, lambda_seq = lambda_seq) + diff --git a/slides/regularization/rsrc/data_regu_example_1.RData b/slides/regularization/rsrc/data_regu_example_1.RData new file mode 100755 index 00000000..07723ec4 Binary files /dev/null and b/slides/regularization/rsrc/data_regu_example_1.RData differ diff --git a/slides/regularization/rsrc/data_regu_example_2.R b/slides/regularization/rsrc/data_regu_example_2.R new file mode 100755 index 00000000..314bb383 --- /dev/null +++ b/slides/regularization/rsrc/data_regu_example_2.R @@ -0,0 +1,96 @@ +# ------------------------------------------------------------------------------ +# l1 vs l2 + +# RDATA: +# (1): generate coefficients for regression with two regularization
constants +# (lambda 0.01, 100) under l1 and l2 regularization. +# (2): generate coefficients and MSE for cross-validation with 50 +# regularization constants (9.536743e-07 to 2) +# under l1 and l2 regularization. +# DATA: +# Xi ~ Normal(0, 1), Cov(xi, xj) = 0.7^|i-j| +# y = 10*x1 + 10*x2 + 5*x3 + 5*x4 + x5 + ... + x14 + eps(100*1 ~Normal) +# ------------------------------------------------------------------------------ + +library(mlr3) +library(dplyr) +library(mlr3learners) +library(mlr3tuning) +library(BBmisc) +library(ggplot2) +library(gridExtra) +library(MASS) + +set.seed(19873) + +# DATA ------------------------------------------------------------------------- + +n <- 100 # Number of observations +p <- 50 # Number of predictors included in model +CovMatrix <- outer(1:p, 1:p, function(x,y) {.7^abs(x-y)}) +x <- mvrnorm(n, rep(0,p), CovMatrix) +y <- 10 * apply(x[, 1:2], 1, sum) + + 5 * apply(x[, 3:4], 1, sum) + + apply(x[, 5:14], 1, sum) + + rnorm(n) + +dd = as.data.frame(x) +dd$y = y +task <- TaskRegr$new(id = "mytask", backend = dd, target = "y") + +# order coefficients +extract_numeric <- function(x) { + as.numeric(gsub("[^0-9]", "", x)) +} + +get_pen_coefs = function(task, alpha, lam) { + learner = lrn("regr.glmnet", alpha = alpha, lambda=lam) + learner$train(task) + cc <- as.matrix(coef(learner$model))[,1] + names <- names(cc) + cc <- as.numeric(cc) + cc_nonin <- cc[2:length(cc)] # reorder non-intercept cc + names(cc) <- names + names_nonin <- extract_numeric(names[2:length(names)]) + names <- c(names[1], paste0("V", as.character(sort(names_nonin)))) + cc <- cc[names] + names(cc) <- names + return(abs(cc)) +} + +compute_cv = function(task, alpha, lambda_seq) { + learner = lrn("regr.glmnet", alpha = alpha, lambda=to_tune(lambda_seq)) + + # Construct tuning instance + instance = ti( + task = task, + learner = learner, + resampling = rsmp("cv", folds = 3), + measures = msr("regr.mse"), + terminator = trm("evals", n_evals = length(lambda_seq)) + ) + + tuner <- tnr("grid_search", resolution = length(lambda_seq)) + tuner$optimize(instance) + cv_lam <- as.data.frame(instance$archive$data)[,1:2] + colnames(cv_lam) <- c("lambda", "mse") + cv_lam$lambda <- as.numeric(as.character(cv_lam$lambda)) + cv_lam <- cv_lam %>% arrange(lambda) + + return(cv_lam) +} + +lams = c(0.01, 100) +cc_l2_1 = get_pen_coefs(task, alpha = 0, lam = lams[1]) +cc_l2_2 = get_pen_coefs(task, alpha = 0, lam = lams[2]) +cc_l1_1 = get_pen_coefs(task, alpha = 1, lam = lams[1]) +cc_l1_2 = get_pen_coefs(task, alpha = 1, lam = lams[2]) + + +lambda_seq = 2^seq(-20, 1, length.out = 50) +cv_l1 = compute_cv(task, alpha = 1, lambda_seq) +cv_l2 = compute_cv(task, alpha = 0, lambda_seq) + +save2("data_regu_example_2.RData", lams, lambda_seq, + cc_l2_1, cc_l2_2, cc_l1_1, cc_l1_2, + cv_l1, cv_l2) \ No newline at end of file diff --git a/slides/regularization/rsrc/data_regu_example_2.RData b/slides/regularization/rsrc/data_regu_example_2.RData new file mode 100755 index 00000000..f994d673 Binary files /dev/null and b/slides/regularization/rsrc/data_regu_example_2.RData differ diff --git a/slides/regularization/rsrc/early_stopping.R b/slides/regularization/rsrc/early_stopping.R new file mode 100755 index 00000000..ba4aaf6e --- /dev/null +++ b/slides/regularization/rsrc/early_stopping.R @@ -0,0 +1,146 @@ +# ------------------------------------------------------------------------------ +# intro + +# FIG: show how early stopping influences training and test results. +# LEFT: how MSE changes with iterations. 
+# RIGHT: two fitted curves (early stopping & overfit). + +# DATA: Ozone from package-mlbench +# ------------------------------------------------------------------------------ + +library(mlbench) +library(reshape2) +library(ggplot2) +library(gridExtra) + +theme_set(theme_minimal()) + +set.seed(6) + +# DATA ------------------------------------------------------------------------- + +data(Ozone) + +# gradient of empirical risk +R_emp_grad <- function(beta, + features = X, + target = y) { + return(2 * t(features) %*% (features %*% beta - target)) +} + +gradient_descent <- + function(beta_start, + step_size, + grad_fun, + num_steps, + features, + target) { + betas <- matrix(0, ncol = length(beta_start), nrow = num_steps) + betas[1,] <- beta_start + for (i in seq(2, num_steps)) { + betas[i,] <- + betas[i - 1,] - step_size * grad_fun(betas[i - 1, ], features, + target) + } + + betas <- as.data.frame(betas) + return(betas) + } + +# generate polynomials +poly <- function(x, degree) { + sapply(0:degree, function(i) + x ^ i) +} + +o_data <- Ozone[, c(4, 8)] +o_data$V8 <- o_data$V8 / 100 +o_data <- o_data[complete.cases(o_data), ] + +id_train <- sample(1:nrow(o_data), 20) +o_data$type <- "test" +o_data[id_train,]$type <- "train" +o_data$type <- as.factor(o_data$type) + +train_data <- as.matrix(o_data[id_train, 1:2]) +test_data <- as.matrix(o_data[-id_train, 1:2]) + +degree <- 15 + +x_train <- poly(train_data[, 2], degree) +y_train <- o_data[id_train , 1] +x_test <- poly(test_data[, 2], degree) +y_test <- o_data[-id_train , 1] + +num_steps <- 1000000 +res <- gradient_descent(rep(0, ncol(x_train)), 0.02, #0.003, + R_emp_grad, num_steps, x_train, y_train) + +errs <- matrix(0, nrow = 2000, ncol = 2) +it1 <- 1:1000 +for (i in it1) { + errs[i, 1] <- + sum((x_train %*% t(res[i, ]) - y_train) ^ 2) / nrow(x_train) + errs[i, 2] <- + sum((x_test %*% t(res[i, ]) - y_test) ^ 2) / nrow(x_test) +} +it2 <- seq(1000, num_steps, length.out = 1000) +for (i in it2) { + errs[1000 + which(it2 == i), 1] <- + sum((x_train %*% t(res[i, ]) - y_train) ^ 2) / nrow(x_train) + errs[1000 + which(it2 == i), 2] <- + sum((x_test %*% t(res[i, ]) - y_test) ^ 2) / nrow(x_test) +} + +df <- as.data.frame(errs) +colnames(df) <- c("train", "test") +df$id <- c(it1, it2) + +min_te <- which.min(errs[, 2]) + +learning_df <- melt(df, id.vars = "id") + +# PLOT ------------------------------------------------------------------------- + +# MSE +p1 <- ggplot(learning_df, aes(x = id, y = value)) + + geom_line(aes(colour = variable), size=1.2) + + geom_vline(xintercept = min_te, colour="gray", + alpha= 0.8, size = 1.5) + + geom_vline(xintercept = num_steps, colour="gray", + linetype = "dashed", alpha= 0.8, size = 1.5) + + scale_x_log10() + + ylab("MSE") + + xlab("Iterations") + + scale_fill_brewer(palette="Dark2") + + annotate("text", x=min_te-70, y=175, label="stopped early", + color='black', size=3) + + annotate("text", x=num_steps-4*1e5, y=175, label="overfitted", + color='black', size=3) + + theme(legend.position = "bottom") + + guides(color = guide_legend(title = NULL)) + +# ozone level +pl_data <- seq(min(o_data[, 2]), max(o_data[, 2]), length.out = 100) +pl_data <- poly(pl_data, degree) + +y_overfit <- (pl_data) %*% t(res[num_steps, ])[,1] +y_best <- (pl_data) %*% t(res[min_te, ])[,1] + +fitting_df <- data.frame(overfit = y_overfit, best = y_best, x = pl_data[, 2] * 100) +fitting_df <- melt(fitting_df, id.vars = "x") + +p2 <- ggplot(o_data, aes(x=V8*100, y=V4)) + + geom_point(aes(colour=type)) + + geom_line(data=fitting_df, 
aes(linetype=rev(variable), x=x, y=value), alpha = 0.7, + show.legend=FALSE, color="gray", size=1.5) + + ylab("Ozone level") + + xlab("Temperature (degrees F)") + + scale_fill_brewer(palette="Dark2") + + theme(legend.position = "bottom") + + guides(color = guide_legend(title = NULL)) + +p = grid.arrange(p1, p2, ncol = 2) + +ggsave("../figure/early_stopping.png", plot=p, width=9, height=6) + diff --git a/slides/regularization/rsrc/equivariance-ols-ridge.R b/slides/regularization/rsrc/equivariance-ols-ridge.R deleted file mode 100644 index 13f8b658..00000000 --- a/slides/regularization/rsrc/equivariance-ols-ridge.R +++ /dev/null @@ -1,38 +0,0 @@ -library(MASS) - -# Data -set.seed(123) -n <- 100 -p <- 5 -X <- matrix(rnorm(n * p), n, p) -beta_true <- c(1, 2, 3, 4, 5) -epsilon <- rnorm(n) -Y <- X %*% beta_true + epsilon - -# OLS Solution -beta_ols <- solve(t(X) %*% X) %*% t(X) %*% Y - -# Ridge Solution -lambda <- 10 -beta_ridge <- solve(t(X) %*% X + lambda * diag(p)) %*% t(X) %*% Y - -# Rescale and repeat -X_rescaled <- X -X_rescaled[,5] <- 100 * X_rescaled[,5] -beta_ols_rescaled <- solve(t(X_rescaled) %*% X_rescaled) %*% t(X_rescaled) %*% Y -beta_ridge_rescaled <- solve(t(X_rescaled) %*% X_rescaled + lambda * diag(p)) %*% t(X_rescaled) %*% Y - -# Results -results <- rbind(t(beta_ols), t(beta_ols_rescaled), t(beta_ridge), t(beta_ridge_rescaled)) -colnames(results) <- paste("Coefficient", 1:p) - -# MSE -loss_ols <- mean((Y - X %*% beta_ols)^2) -loss_ols_rescaled <- mean((Y - X_rescaled %*% beta_ols_rescaled)^2) -loss_ridge <- mean((Y - X %*% beta_ridge)^2) # + lambda * sum(beta_ridge^2) -loss_ridge_rescaled <- mean((Y - X_rescaled %*% beta_ridge_rescaled)^2) #+ lambda * sum(beta_ridge_rescaled^2) - -losses <- c(loss_ols, loss_ols_rescaled, loss_ridge, loss_ridge_rescaled) -results <- cbind(results, MSE = losses) -rownames(results) <- c("OLS", "OLS Rescaled", "Ridge", "Ridge Rescaled") -print(results) diff --git a/slides/regularization/rsrc/make_graddes_vs_weightdecay_plot.R b/slides/regularization/rsrc/graddes_vs_weightdecay.R old mode 100644 new mode 100755 similarity index 71% rename from slides/regularization/rsrc/make_graddes_vs_weightdecay_plot.R rename to slides/regularization/rsrc/graddes_vs_weightdecay.R index 1dddbcc2..fc78e14c --- a/slides/regularization/rsrc/make_graddes_vs_weightdecay_plot.R +++ b/slides/regularization/rsrc/graddes_vs_weightdecay.R @@ -1,8 +1,16 @@ # ------------------------------------------------------------------------------ -# FIG: GRADIENT DESCENT VS WEIGHT DECAY PLOT +# wd vs l2 + +# FIG: draw the path of the optimal point for each iteration using +# gradient descent and using weight decay. 
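The two paths can be compared step by step because, for this quadratic risk, one weight-decay update equals one gradient step on the L2-regularized risk: (1 - alpha*lambda)*b - alpha*grad(b) = b - alpha*(grad(b) + lambda*b). A minimal sketch of that identity, reusing R_emp_grad() from data_func_utils.R (wd_step is a hypothetical helper; alpha is the step size, lambda the regularization constant):

wd_step <- function(b, alpha, lambda) {
  b_shrunk <- (1 - alpha * lambda) * b  # decay: shrink the weights first
  b_shrunk - alpha * R_emp_grad(b)      # then an unregularized gradient step
}
# algebraically identical to b - alpha * (R_emp_grad(b) + lambda * b),
# i.e. one step of gradient descent on the L2-regularized risk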
-source("utils.R") +source("data_func_utils.R") +library(gridExtra) + +# DATA ------------------------------------------------------------------------- x1 <- seq(0,1.5,length.out = 100) x2 <- seq(0,3.5,length.out = 100) @@ -14,14 +22,15 @@ num_steps <- 100 gd_betas <- gradient_descent(beta_start, step_size, grad, num_steps) -# R_emp plot +# PLOT ------------------------------------------------------------------------- + +# GD remp_plot <- plot_r_emp(R_emp, x1, x2) + geom_path(data = gd_betas, aes(x=V1, y=V2), colour = "red", size=1.1) + geom_point(data = gd_betas, aes(x=V1, y=V2), colour = "white") + theme(legend.position="none") -# R_reg plot - +# WD lambda <- 10 num_steps <- 100 gd_l2_betas <- gradient_descent(beta_start, step_size, @@ -35,7 +44,8 @@ remp_l2_plot <- plot_r_emp(R_emp, x1, x2) + geom_point(data = gd_l2_betas, aes(x=V1, y=V2), colour = "white") + theme(legend.position="none") -#p <- grid.arrange(remp_plot, remp_l2_plot, ncol=2) +p <- grid.arrange(remp_plot, remp_l2_plot, ncol=2) +ggsave("../figure/graddes_vs_weightdecay.png", plot = p, width = 5.2, height = 3.1, dpi="retina") ggsave("../figure/graddes_vs_weightdecay_01.png", plot = remp_plot, width = 2.6, height = 3.1, dpi="retina") ggsave("../figure/graddes_vs_weightdecay_02.png", plot = remp_l2_plot, width = 2.6, height = 3.1, dpi="retina") diff --git a/slides/regularization/rsrc/make_l1_reg_hess_plots.R b/slides/regularization/rsrc/l1_reg_hess.R old mode 100644 new mode 100755 similarity index 85% rename from slides/regularization/rsrc/make_l1_reg_hess_plots.R rename to slides/regularization/rsrc/l1_reg_hess.R index d0d05a2e..1b464d05 --- a/slides/regularization/rsrc/make_l1_reg_hess_plots.R +++ b/slides/regularization/rsrc/l1_reg_hess.R @@ -1,10 +1,18 @@ # ------------------------------------------------------------------------------ -# FIG: L2 REGULARIZATION HESSIAN PLOTS +# geom l1 + +# FIG: theta_hat (OLS) and theta_lasso (Lasso Regression) points on a contour +# plot, showing how the l1 penalty pulls the optimal coefficients +# towards zero along each axis and overall. + +# DATA: principal components of linear model data from data_func_utils.R # ------------------------------------------------------------------------------ -source("utils.R") +source("data_func_utils.R") library(gridExtra) +# DATA ------------------------------------------------------------------------- +
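# Editor's aside (toy numbers, not the script's data): for a diagonal Hessian H,
# the l1-regularized optimum is the soft-thresholded OLS solution -- exactly the
# theta_l1_reg formula used below. Quick check:
theta_hat_toy <- c(1.5, -0.4)
H_diag <- c(2, 2)
lambda_toy <- 1
sign(theta_hat_toy) * pmax(abs(theta_hat_toy) - lambda_toy / H_diag, 0)
# [1] 1 0  -- the large coefficient shrinks, the small one is zeroed out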
prc <- prcomp(X, scale. = FALSE) X_dc <- prc$x X_dc[,1] <- X_dc[, 1]/2 @@ -26,6 +34,9 @@ colnames(theta_hat) <- NULL lambda <- 10 theta_l1_reg <- sign(theta_hat) * pmax(abs(theta_hat) - lambda / diag(hessian),0) +# PLOT ------------------------------------------------------------------------- + +# plot contour lines and theta_hat init_plot_l1 <- plot_r_emp(function(beta) R_emp(beta, features = X_dc, target = y_new), x1, x2) + theme(legend.position = "none") + @@ -37,22 +48,18 @@ init_plot_l1 <- plot_r_emp(function(beta) R_emp(beta, features = X_dc, target = geom_vline(xintercept = 0, colour="lightblue", linetype = "dashed", alpha= 0.8, size = 1.1) + geom_point(data=as.data.frame(theta_hat), aes(x=theta_hat[1], y=theta_hat[2]), color="red", size=2) + - #geom_line(data=rbind(rep(0, num_features), as.data.frame(theta_hat)), - # aes(x=V1, y=V2), colour="red", size=1.1, arrow=arrow(ends="first", length=unit(0.09, "npc"))) + geom_vline(xintercept = -lambda/hessian[1,1], colour="yellow", linetype = "dashed", alpha= 0.8, size = 1.1) + annotate("label", x = -2, y = -2.5, label = "frac(-lambda, H[\"1,1\"])", parse = TRUE, color = 'black', size = 4, fill = "yellow") +# plot with arrows and points theta_hat_1 <- theta_hat theta_hat_1[,1] <- 0 - +# effect along theta1 plot_l1_theta1 <- init_plot_l1 + - # geom_polygon(data = data.frame(x = c(theta_hat[,1], theta_hat[,1], 0, 0), - # y = c(-Inf, Inf, Inf, -Inf)), - # aes(x,y), fill="white", alpha=0.5) + geom_point(data=as.data.frame(theta_hat_1), aes(x=theta_hat_1[1], y=theta_hat_1[2]), color="green", size=2) + geom_segment(data=cbind(start=as.data.frame(theta_hat), end=as.data.frame(theta_hat_1)), aes(x=start.V1, y=start.V2, @@ -64,7 +71,7 @@ p1 <- grid.arrange(init_plot_l1, plot_l1_theta1, ncol=2) ################################################################### - +# effect along theta2 theta_hat_2 <- theta_hat theta_hat_2[,2] <- theta_l1_reg[2] @@ -72,9 +79,6 @@ theta_hat_2[,2] <- theta_l1_reg[2] plot_l1_theta2 <- init_plot_l1 + geom_hline(yintercept=lambda/hessian[2,2], colour="yellow", linetype="dashed", alpha=0.8, size=1.1) + - # geom_polygon(data = data.frame(x = c(-Inf, Inf, Inf, -Inf), - # y = c(theta_hat[,2], theta_hat[,2], 0, 0)), - # aes(x,y), fill="white", alpha=0.5) + geom_point(data=as.data.frame(theta_hat_2), aes(x=theta_hat_2[1], y=theta_hat_2[2]), color="green", size=2) + geom_segment(data=cbind(start=as.data.frame(theta_hat), end=as.data.frame(theta_hat_2)), aes(x=start.V1, y=start.V2, @@ -84,12 +88,10 @@ plot_l1_theta2 <- init_plot_l1 + annotate("label", x=-3, y=2, label="frac(lambda, H[\"2,2\"])", parse=TRUE, color='black', size=4, fill="yellow") +# effect along both axes plot_l1_theta2_dash <- init_plot_l1 + geom_hline(yintercept=lambda/hessian[2,2], colour="yellow", linetype="dashed", alpha=0.8, size=1.1) + - # geom_polygon(data = data.frame(x = c(-Inf, Inf, Inf, -Inf), - # y = c(theta_hat[,2], theta_hat[,2], 0, 0)), - # aes(x,y), fill="white", alpha=0.5) + geom_point(data=as.data.frame(theta_hat_1), aes(x=theta_hat_1[1], y=theta_hat_1[2]), color="green", size=2) + geom_segment(data=cbind(start=as.data.frame(theta_hat), end=as.data.frame(theta_hat_1)), aes(x=start.V1, y=start.V2, @@ -105,6 +107,7 @@ plot_l1_theta2_dash <- init_plot_l1 + annotate("label", x=-3, y=2, label="frac(lambda, H[\"2,2\"])", parse=TRUE, color='black', size=4, fill="yellow") +# combining the shifts along both axes yields theta_lasso plot_l1_theta_lasso <- plot_l1_theta2_dash + geom_point(data=as.data.frame(theta_l1_reg), aes(x=theta_l1_reg[1], y=theta_l1_reg[2]), color="orange", size=2) +
geom_segment(data=cbind(start=as.data.frame(theta_hat), end=as.data.frame(theta_l1_reg)), diff --git a/slides/regularization/rsrc/make_l2_reg_hess_plots.R b/slides/regularization/rsrc/l2_reg_hess.R old mode 100644 new mode 100755 similarity index 87% rename from slides/regularization/rsrc/make_l2_reg_hess_plots.R rename to slides/regularization/rsrc/l2_reg_hess.R index 5f8b24b6..b77990d8 --- a/slides/regularization/rsrc/make_l2_reg_hess_plots.R +++ b/slides/regularization/rsrc/l2_reg_hess.R @@ -1,10 +1,18 @@ # ------------------------------------------------------------------------------ -# FIG: L2 REGULARIZATION HESSIAN PLOTS +# geom l2 + +# FIG: theta_hat (OLS) and theta_ridge (Ridge Regression) points on a contour +# plot, showing how the l2 penalty shrinks the optimal value +# along each principal axis and overall. + +# DATA: linear model data from data_func_utils.R # ------------------------------------------------------------------------------ -source("utils.R") +source("data_func_utils.R") library(gridExtra) +# DATA ------------------------------------------------------------------------- + lambda <- 90 beta_start <- c(0, 0) step_size <- 0.005 @@ -38,14 +46,16 @@ theta_min_ridge_data <- as.data.frame(t(Q %*% theta_min_skew)) x1 <- seq(-2,2,length.out = 100) x2 <- seq(-1,5,length.out = 100) -#record contour level +# PLOT ------------------------------------------------------------------------- + +# record contour levels p_con <- plot_r_emp(R_emp, x1, x2, bins=25) ct_data <- ggplot_build(p_con)$data[[2]] ct_levels <- unique(ct_data$level) -#preserve half to make plots look better +# keep only half of them so the plots stay readable (fewer contour lines) ct_levels <- ct_levels[-seq(3, length(ct_levels), by = 2)] -# R_emp +# plot contour lines and theta_hat init_cond_plot <- plot_r_emp(R_emp, x1, x2, breaks=ct_levels) + annotate("label", x = 0.75, y = 3, label = "hat(theta)", parse = TRUE, color = 'black', size = 3, fill = "red") + @@ -55,6 +65,7 @@ init_cond_plot <- plot_r_emp(R_emp, x1, x2, breaks=ct_levels) + geom_line(data=rbind(rep(0, num_features), theta_min), aes(x=V1, y=V2), colour="red", size=1, arrow = arrow(length = unit(0.06, "npc"))) +# effect along the two principal axes rot_plot <- plot_r_emp(R_emp, x1, x2, breaks=ct_levels) + theme(legend.position="none") + coord_fixed() + geom_abline(slope = Q[2,1]/Q[1,1], colour="darkgrey", size=1.2) + @@ -71,23 +82,9 @@ rot_plot <- plot_r_emp(R_emp, x1, x2, breaks=ct_levels) + colour = "green", arrow.fill = "green") rs <- sapply(1:2, function(i) S[i,i] / (S[i,i] + lambda)) - theta_hat <- theta_proj1_data*rs[1] + theta_proj2_data*rs[2]
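# Editor's aside (toy numbers): rs implements the spectral view of ridge.
# Writing X^T X = Q S Q^T, ridge rescales theta_hat along the i-th principal
# axis by S[i,i] / (S[i,i] + lambda), so flat directions (small S[i,i]) shrink most.
S_toy <- c(4, 1); lambda_toy <- 1; theta_toy <- c(2, 2)  # axes aligned with coordinates
theta_toy * S_toy / (S_toy + lambda_toy)
# [1] 1.6 1.0  -- the low-curvature direction is shrunk much harder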
-geom_l2_plot <- plot_r_emp(R_emp, x1, x2, breaks=ct_levels) + - theme(legend.position="none") + coord_fixed() + - geom_hline(yintercept = 0, colour="darkgrey", size=1.2) + - geom_vline(xintercept = 0, colour="darkgrey", size=1.2) + - geom_point(aes(x=beta_true[1], y=beta_true[2], color="red", size=3)) + - geom_point(aes(x=theta_hat[1], y=theta_hat[2], color="yellow", size=3)) - -geom_l2_plot <- geom_l2_plot + - annotate("label", x = 1.3, y = 1.5, label = "hat(theta)[Ridge]", - parse = TRUE, color = 'black', size = 3, fill = "yellow") + - annotate("label", x = 0.75, y = 3, label = "hat(theta)", - parse = TRUE, color = 'black', size = 3, fill = "red") - -##############shang +# theta_ridge decomposition along the principal axes scale_rot_plot <- rot_plot + geom_segment(data=cbind(start=as.data.frame(t(c(0,0))), end= theta_proj1_data*rs[1] ), size=0.9, @@ -109,6 +106,7 @@ scale_rot_plot <- scale_rot_plot + xend = end.V1, yend = end.V2), colour = "yellow") +# theta_hat and theta_ridge scale_plot <- init_cond_plot + annotate("label", x = 0.8, y = 1.5, label = "hat(theta)[Ridge]", parse = TRUE, color = 'black', size = 3, fill = "yellow") + @@ -126,9 +124,9 @@ p2 <- grid.arrange(rot_plot, init_cond_plot, ncol=2) p3 <- grid.arrange(scale_rot_plot, scale_plot, ncol=2) -### contour plot for l2 +### contour plot with l2 constraints -# Generate data points for plotting circles(ridge) +# Generate data points for plotting l2 constraints (circles) radius <- sqrt(theta_hat[1]^2 + theta_hat[2]^2)[[1]] #radius for intersection point cir_list <- list() seq_data <- seq(0, 2*pi, length.out=100) #points for one circle @@ -141,6 +139,7 @@ for(mul in c(radius/8, radius/3, radius/1.5, radius)){ #adjust radius eval_grid <- expand.grid(x1,x2) eval_grid$r_emp <- apply(eval_grid, 1, R_emp) +# ellipse contours p_elli <- ggplot() + geom_raster(data=eval_grid, aes(x=Var1, y=Var2, fill=r_emp)) + geom_contour(data=eval_grid, aes(x=Var1, y=Var2, z=r_emp), @@ -150,6 +149,7 @@ p_elli <- ggplot() + ylab(expression(theta[2])) + scale_fill_viridis(end = 0.9) +# ellipse and circle contours p_ridge <- p_elli + geom_path(data=cir_list[[1]], aes(x, y), color="white", linetype="dashed") + geom_path(data=cir_list[[2]], aes(x, y), color="white", linetype="dashed") + @@ -160,6 +160,7 @@ p_ridge <- p_elli + beta_true <- data.frame(x=beta_true[1], y=beta_true[2]) theta_hat <- data.frame(x=theta_hat[1][[1]], y=theta_hat[2][[1]]) +# add points p_poi <- p_ridge + geom_point(data=beta_true, aes(x=x, y=y), color="red", size=3) + geom_point(data=theta_hat, aes(x=x, y=y), color="yellow", size=3) + @@ -170,6 +171,7 @@ p_poi <- p_ridge + geom_hline(yintercept=0, colour="darkgrey", size=1.2) + geom_vline(xintercept=0, colour="darkgrey", size=1.2) + +# add decomposition arrows p4 <- p_poi + geom_segment(data=cbind(start=as.data.frame(t(c(0,0))), end=beta_true ), size=0.9, arrow=arrow(length = unit(0.06, "npc")), @@ -206,7 +208,7 @@ p4 <- p_poi + -ggsave("../figure/l2_reg_hess_01_plot.png", plot = p1, width = 5.5, height = 3.5, dpi="retina") -ggsave("../figure/l2_reg_hess_02_plot.png", plot = p2, width = 5.5, height = 3.5, dpi="retina") -ggsave("../figure/l2_reg_hess_03_plot.png", plot = p3, width = 5.5, height = 3.5, dpi="retina") -ggsave("../figure/l2_reg_hess_04_plot.png", plot = p4, width = 3, height = 5, dpi="retina") +ggsave("../figure/l2_reg_hess_01.png", plot = p1, width = 5.5, height = 3.5, dpi="retina") +ggsave("../figure/l2_reg_hess_02.png", plot = p2, width = 5.5, height = 3.5, dpi="retina") +ggsave("../figure/l2_reg_hess_03.png", plot = p3, width = 5.5, height = 3.5, dpi="retina") +ggsave("../figure/l2_reg_hess_04.png", plot = p4, width = 3, height = 5, dpi="retina") diff --git a/slides/regularization/rsrc/lasso_contour_cases.R b/slides/regularization/rsrc/lasso_contour_cases.R new file mode 100755 index 00000000..b448c3be --- /dev/null +++ b/slides/regularization/rsrc/lasso_contour_cases.R @@ -0,0 +1,81 @@ +# ------------------------------------------------------------------------------ +# l1 + +# FIG: lasso contour plots under three parameter settings: +# (1) the smaller parameter theta_1 is removed +# (2) small lambda that does not lead to sparsity +# (3) large lambda that leads to sparsity +# ------------------------------------------------------------------------------
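# Editor's sketch (toy data; assumes the glmnet package is installed) of the
# contrast the three panels illustrate: at comparable penalty strength, lasso
# can set a weak coefficient exactly to zero, while ridge only shrinks it.
library(glmnet)
set.seed(1)
X_toy <- matrix(rnorm(200), 100, 2)
y_toy <- X_toy %*% c(1, 0.1) + rnorm(100)
coef(glmnet(X_toy, y_toy, alpha = 1, lambda = 0.3))  # lasso: weak coef typically exactly 0
coef(glmnet(X_toy, y_toy, alpha = 0, lambda = 0.3))  # ridge: both shrunk, neither exactly 0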
+ +library(ggplot2) +library(gridExtra) + +# ------------------------------------------------------------------------------ + +# Function to create contour plots +create_contour_plot <- function(theta_hat, theta_lasso, l1_edge, outermost_point, annotation_positions, subtitle) { + theta1 <- seq(-4, 4, length.out = 300) + theta2 <- seq(-2, 5, length.out = 300) + grid <- expand.grid(Theta1 = theta1, Theta2 = theta2) + + target_direction <- c(1, 4) / sqrt(sum(c(1, 4)^2)) + angle <- atan2(target_direction[2], target_direction[1]) - pi / 18 + rot_matrix <- matrix(c(cos(angle), -sin(angle), sin(angle), cos(angle)), nrow = 2, byrow = TRUE) + + scale <- c(1, 2) + Z <- as.matrix(grid) - matrix(theta_hat, nrow = nrow(grid), ncol = 2, byrow = TRUE) + Z <- Z %*% rot_matrix + Z <- Z %*% diag(scale) + Z <- Z %*% t(rot_matrix) + L <- (Z[, 1])^2 + (Z[, 2])^2 + grid$L <- L + + outermost_level <- sum((outermost_point - theta_hat)^2) + + # Create the ggplot object + p <- ggplot() + + geom_contour(data = grid, aes(x = Theta1, y = Theta2, z = L), colour = "red", breaks = seq(min(L), outermost_level, length.out = 5)) + + geom_polygon(data = data.frame(x = c(l1_edge, 0, -l1_edge, 0), y = c(0, l1_edge, 0, -l1_edge)), aes(x, y), fill = "cyan", alpha = 0.3) + + labs(x = expression(theta[1]), y = expression(theta[2]), title = subtitle) + + theme_bw() + + coord_fixed() + + p <- p + + geom_point(data=as.data.frame(theta_hat), aes(x=theta_hat[1], y=theta_hat[2]), colour="black") + + annotate("label", x=annotation_positions[2, 1], y=annotation_positions[2, 2], label="hat(theta)", parse=TRUE, size=5) + + geom_segment(data=cbind(start=as.data.frame(matrix(annotation_positions[2,], nrow = 1, byrow = TRUE)), end=as.data.frame(matrix(theta_hat, nrow = 1, byrow = TRUE))), + aes(x=start.V1, y=start.V2, + xend=end.V1, yend=end.V2), colour="black", + size=0.9, arrow = arrow(ends="last", type="closed", length=unit(0.04, "npc")), + arrow.fill="black") + + p <- p + + geom_point(data=as.data.frame(theta_lasso), aes(x=theta_lasso[1], y = theta_lasso[2]), colour="black") + + annotate("label", x=annotation_positions[1, 1], y=annotation_positions[1, 2], label="hat(theta)[\"Lasso\"]", parse=TRUE, size=5) + + geom_segment(data=cbind(start=as.data.frame(matrix(annotation_positions[1,], nrow = 1, byrow = TRUE)), end=as.data.frame(matrix(theta_lasso, nrow = 1, byrow = TRUE))), + aes(x=start.V1, y=start.V2, + xend=end.V1, yend=end.V2), colour="black", + size=0.9, arrow = arrow(ends="last", type="closed", length=unit(0.04, "npc")), + arrow.fill="black") + + xlim(-3, 3) + + ylim(-2, 5) + return(p) +} + +# Create individual plots +plot1 <- create_contour_plot(theta_hat = c(0.5, 3), theta_lasso = c(0, 1), l1_edge = 1, + outermost_point = c(0, 1), annotation_positions = matrix(c(-2, 1.1, 2.5, 2), nrow = 2, byrow = TRUE), + subtitle = expression(paste("smaller param.
", theta[1], " is removed"))) + +plot2 <- create_contour_plot(theta_hat = c(1, 1), theta_lasso = c(0.5, 0.5), l1_edge = 1, + outermost_point = c(0.5, 0.5), annotation_positions = matrix(c(-0.5, 2.5, 2, 3), nrow = 2, byrow = TRUE), + subtitle = "small λ: no sparsity") + +plot3 <- create_contour_plot(theta_hat = c(1, 1), theta_lasso = c(0.5, 0), l1_edge = 0.5, + outermost_point = c(0.5, 0), annotation_positions = matrix(c(-0.5, 2.5, 2, 3), nrow = 2, byrow = TRUE), + subtitle = "larger λ: sparsity") + +# Arrange the plots in a grid +p <- grid.arrange(plot1, plot2, plot3, nrow = 1) + +ggsave("../figure/lasso_contour_cases.png", plot = p, height = 6, width = 18) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_linear_model_reg.R b/slides/regularization/rsrc/lin_model_regu.R old mode 100644 new mode 100755 similarity index 75% rename from slides/regularization/rsrc/make_linear_model_reg.R rename to slides/regularization/rsrc/lin_model_regu.R index dfa20732..6c9e4916 --- a/slides/regularization/rsrc/make_linear_model_reg.R +++ b/slides/regularization/rsrc/lin_model_regu.R @@ -1,20 +1,29 @@ -# Load necessary libraries +# ------------------------------------------------------------------------------ +# l1, l2 + +# FIG: optimal points with different regularization constants (lambda) +# on contour plot for linear regression with +# l1 and l2 regularization. + +# DATA: y = X(100*2 ~Normal)·beta_true(3,-2) + noise(100*1 ~Normal) +# ------------------------------------------------------------------------------ + library(ggplot2) library(MASS) library(glmnet) library(gridExtra) -# Example dataset set.seed(123) + +# DATA ------------------------------------------------------------------------- + n <- 100 X <- matrix(rnorm(2 * n), n, 2) beta_true <- c(3, -2) y <- X %*% beta_true + rnorm(n) -# Train unregularized linear model lm_unreg <- lm(y ~ X - 1) # '-1' to remove the intercept -# Train L2 regularized models with different lambdas lambdas_l2 <- c(0.1, 1, 2.5, 5, 10, 20, 100) models <- lapply(lambdas_l2, function(lambda) { return(glmnet::glmnet(X, y, alpha = 0, lambda = lambda, standardize = FALSE, intercept = FALSE)) @@ -24,24 +33,20 @@ coefs <- sapply(models, function(model, lambda) { coef(model, s = lambda)[-1, 1] # Exclude the intercept }, lambdas_l2) -# Transpose to make each column represent a model +# Transpose so each column represents a model coefs_l2 <- t(coefs) -# Create a data frame from the matrix coefs_df_l2 <- as.data.frame(coefs_l2) names(coefs_df_l2) <- c("X1", "X2") -# Prepare data for contour plot grid_range <- seq(-5, 5, length.out = 100) grid_data <- expand.grid(X1 = grid_range, X2 = grid_range) grid_data$loss <- apply(grid_data, 1, function(vec) { sum((y - X %*% vec)^2) / (2 * n) }) -# Adjusted lambda values lambdas_l1 <- c(0.01, 0.5, 1, 1.5, 2, 2.5, 10) -# Train L1 regularized models with the adjusted lambdas models_l1 <- lapply(lambdas_l1, function(lambda) { return(glmnet::glmnet(X, y, alpha = 1, lambda = lambda, standardize = FALSE, intercept = FALSE)) }) @@ -49,33 +54,31 @@ models_l1 <- lapply(lambdas_l1, function(lambda) { # Extract coefficients for L1 regularized models coefs_l1 <- sapply(models_l1, function(model, lambda) { coef(model, s = lambda)[-1, 1] -}, lambdas) +}, lambdas_l1) -# Transpose to make each column represent a model +# Transpose so each column represents a model coefs_l1 <- t(coefs_l1) -# Create a data frame for L1 coefficients coefs_df_l1 <- as.data.frame(coefs_l1) names(coefs_df_l1) <- c("X1", "X2") -# Add lambda values to the L1 
coefficients data frame coefs_df_l1$lambda <- factor(lambdas_l1) -# Manually defined red colors -red_colors <- c("#ffcccc", # lightest red +red_colors <- c("#ffcccc", "#ff9999", "#ff6666", "#ff3333", - "#ff0000", # medium red + "#ff0000", "#cc0000", - "#800000") # darkest red + "#800000") -# Ensure the number of colors matches the number of lambda values if(length(red_colors) != length(lambdas_l1)) { stop("The number of manually defined colors does not match the number of lambda values.") } -# Plot for L1 Regularization with manually defined red colors +# PLOT ------------------------------------------------------------------------- + +# L1 Regularization p_l1 <- ggplot(grid_data, aes(x = X1, y = X2)) + geom_contour_filled(aes(z = loss), breaks = seq(min(grid_data$loss), max(grid_data$loss), length.out = 15)) + geom_point(data = coefs_df_l1, aes(x = X1, y = X2, color = lambda), size = 4) + @@ -92,7 +95,7 @@ p_l1 <- ggplot(grid_data, aes(x = X1, y = X2)) + coefs_df_l2$lambda <- factor(lambdas_l2) -# Plot for L2 Regularization with manually defined red colors +# L2 Regularization p_l2 <- ggplot(grid_data, aes(x = X1, y = X2)) + geom_contour_filled(aes(z = loss), breaks = seq(min(grid_data$loss), max(grid_data$loss), length.out = 15)) + geom_point(data = coefs_df_l2, aes(x = X1, y = X2, color = lambda), size = 4) + @@ -107,9 +110,5 @@ p_l2 <- ggplot(grid_data, aes(x = X1, y = X2)) + axis.line = element_blank()) + guides(fill = "none") - -# Save the L2 plot -ggsave("../figure/lin_reg_l2.png", plot = p_l2, width = 8, height = 5) - -# Save the L1 plot -ggsave("../figure/lin_reg_l1.png", plot = p_l1, width = 8, height = 5) \ No newline at end of file +ggsave("../figure/lin_model_regu_01.png", plot = p_l1, width = 8, height = 5) #L1 +ggsave("../figure/lin_model_regu_02.png", plot = p_l2, width = 8, height = 5) #L2 diff --git a/slides/regularization/rsrc/make-solution-path-ridge-lasso.py b/slides/regularization/rsrc/make-solution-path-ridge-lasso.py deleted file mode 100644 index 9d28f079..00000000 --- a/slides/regularization/rsrc/make-solution-path-ridge-lasso.py +++ /dev/null @@ -1,165 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Dec 6 12:36:47 2023 - -@author: chris -""" - -import numpy as np -from matplotlib import pyplot as plt -from sklearn import linear_model - -# Cost function definitions -def cost_l2(x, y): - return x**2 + y**2 - -def cost_l1(x, y): - return np.abs(x) + np.abs(y) - -def costfunction(X, y, theta): - m = np.size(y) - h = X @ theta - return float((1./(2*m)) * (h - y).T @ (h - y)) - -def closed_form_reg_solution(X, y, lamda=10): - m, n = X.shape - I = np.eye((n)) - return (np.linalg.inv(X.T @ X + lamda * I) @ X.T @ y)[:, 0] - -# Dataset creation and normalization -x = np.linspace(0, 1, 40) -noise = 1 * np.random.uniform(size=40) -y = np.sin(x * 1.5 * np.pi) -y_noise = (y + noise).reshape(-1, 1) - np.mean(y + noise) -X = np.vstack((x, x**2)).T -X = X / np.linalg.norm(X, axis=0) - -# Setup of meshgrid of theta values -xx, yy = np.meshgrid(np.linspace(-2, 17, 100), np.linspace(-17, 3, 100)) - -# Computing the cost function for each theta combination -zz_l2 = np.array([cost_l2(xi, yi) for xi, yi in zip(np.ravel(xx), np.ravel(yy))]) # L2 function -zz_l1 = np.array([cost_l1(xi, yi) for xi, yi in zip(np.ravel(xx), np.ravel(yy))]) # L1 function -zz_ls = np.array([costfunction(X, y_noise, np.array([t0, t1]).reshape(-1, 1)) - for t0, t1 in zip(np.ravel(xx), np.ravel(yy))]) # Least square cost function - -# Reshaping the cost values -Z_l2 = zz_l2.reshape(xx.shape) -Z_l1 = 
zz_l1.reshape(xx.shape) -Z_ls = zz_ls.reshape(xx.shape) - -# Calculating the regularization paths -lambda_range_l2 = np.logspace(0, 4, num=100) / 1000 -theta_0_list_reg_l2, theta_1_list_reg_l2 = zip(*[closed_form_reg_solution(X, y_noise, l) for l in lambda_range_l2]) - -lambda_range_l1 = np.logspace(0, 2, num=100) / 1000 -theta_0_list_reg_l1, theta_1_list_reg_l1 = zip(*[linear_model.Lasso(alpha=l, fit_intercept=False).fit(X, y_noise).coef_ - for l in lambda_range_l1]) - -# Plotting the contours and paths with updated aesthetics -fig = plt.figure(figsize=(16, 7)) - -# L2 regularization plot -ax = fig.add_subplot(1, 2, 1) -ax.contour(xx, yy, Z_l2, levels=[.5, 1.5, 3, 6, 9, 15, 30, 60, 100, 150, 250], colors='cyan') -ax.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax.set_xlabel(r'$\theta_1$', fontsize=18) -ax.set_ylabel(r'$\theta_2$', fontsize=18) -ax.set_title('L2 regularization solution path', fontsize=20) -ax.plot(theta_0_list_reg_l2, theta_1_list_reg_l2, linestyle='none', marker='o', color='red', alpha=.2) - -# L1 regularization plot -ax = fig.add_subplot(1, 2, 2) -ax.contour(xx, yy, Z_l1, levels=[.5, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14], colors='cyan') -ax.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax.set_xlabel(r'$\theta_1$', fontsize=18) -ax.set_ylabel(r'$\theta_2$', fontsize=18) -ax.set_title('L1 regularization solution path', fontsize=20) -ax.plot(theta_0_list_reg_l1, theta_1_list_reg_l1, linestyle='none', marker='o', color='red', alpha=.2) - -plt.show() - -# L2 regularization plot only -fig_l2 = plt.figure(figsize=(8, 7)) -ax_l2 = fig_l2.add_subplot(1, 1, 1) - -ax_l2.contour(xx, yy, Z_l2, levels=[.5, 1.5, 3, 6, 9, 15, 30, 60, 100, 150, 250], colors='cyan') -ax_l2.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax_l2.set_xlabel(r'$\theta_1$', fontsize=16) -ax_l2.set_ylabel(r'$\theta_2$', fontsize=16) -ax_l2.set_title('L2 regularization solution path', fontsize=17) -ax_l2.plot(theta_0_list_reg_l2, theta_1_list_reg_l2, linestyle='none', marker='o', color='red', alpha=.2) - -plt.show() - -# Define the L2 regularization contour levels -l2_contour_levels = [.5, 1.5, 3, 6, 9, 15, 30, 60, 100, 150, 250] - -# Determine which points are inside or outside the L2 regularization contours -inside_points = [] -outside_points = [] - -for theta_0, theta_1 in zip(theta_0_list_reg_l2, theta_1_list_reg_l2): - cost = cost_l2(theta_0, theta_1) - if any(cost < level for level in l2_contour_levels): - inside_points.append((theta_0, theta_1)) - else: - outside_points.append((theta_0, theta_1)) - -# Separate the points into x and y coordinates for plotting -inside_x, inside_y = zip(*inside_points) -outside_x, outside_y = zip(*outside_points) - -# Plot 1: Points inside the L2 regularization contours -fig_inside, ax_inside = plt.subplots(figsize=(8, 7)) -ax_inside.contour(xx, yy, Z_l2, levels=l2_contour_levels, colors='cyan') -ax_inside.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax_inside.scatter(inside_x, inside_y, color='green', marker='o', alpha=.5) # Points inside -ax_inside.set_xlabel(r'$\theta_1$', fontsize=16) -ax_inside.set_ylabel(r'$\theta_2$', fontsize=16) -ax_inside.set_title('L2 regularization solution path', fontsize=17) - -# Plot 2: Points outside the L2 regularization contours -fig_outside, ax_outside = plt.subplots(figsize=(8, 7)) -ax_outside.contour(xx, yy, Z_l2, levels=l2_contour_levels, colors='cyan') -ax_outside.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], 
cmap='coolwarm') -ax_outside.scatter(outside_x, outside_y, color='blue', marker='o', alpha=.5) # Points outside -ax_outside.set_xlabel(r'$\theta_1$', fontsize=16) -ax_outside.set_ylabel(r'$\theta_2$', fontsize=16) -ax_outside.set_title('Solutions outside of L2 regularization', fontsize=17) - -plt.show() - -# L2 regularization contour levels -l2_levels = [.5, 1.5, 3, 6, 9, 15, 30, 60, 100, 150, 250] - -# L1 regularization contour levels -l1_levels = [.5, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14] - -# Determine points inside the contours for L2 -inside_l2 = [(t0, t1) for t0, t1 in zip(theta_0_list_reg_l2, theta_1_list_reg_l2) if cost_l2(t0, t1) < max(l2_levels)] - -# Determine points inside the contours for L1 -inside_l1 = [(t0, t1) for t0, t1 in zip(theta_0_list_reg_l1, theta_1_list_reg_l1) if cost_l1(t0, t1) < max(l1_levels)] - -fig = plt.figure(figsize=(16, 7)) - -# L2 Regularization Plot -ax1 = fig.add_subplot(1, 2, 1) -ax1.contour(xx, yy, Z_l2, levels=l2_levels, colors='cyan') -ax1.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax1.scatter(*zip(*inside_l2), color='green', marker='o', alpha=.5) # Points inside L2 -ax1.set_xlabel(r'$\theta_1$', fontsize=18) -ax1.set_ylabel(r'$\theta_2$', fontsize=18) -ax1.set_title('L2 regularization solution path', fontsize=20) - -# L1 Regularization Plot -ax2 = fig.add_subplot(1, 2, 2) -ax2.contour(xx, yy, Z_l1, levels=l1_levels, colors='cyan') -ax2.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax2.scatter(*zip(*inside_l1), color='green', marker='o', alpha=.5) # Points inside L1 -ax2.set_xlabel(r'$\theta_1$', fontsize=18) -ax2.set_ylabel(r'$\theta_2$', fontsize=18) -ax2.set_title('L1 regularization solution path', fontsize=20) - -plt.show() \ No newline at end of file diff --git a/slides/regularization/rsrc/make_avoid_overfitting_01_plot.R b/slides/regularization/rsrc/make_avoid_overfitting_01_plot.R deleted file mode 100644 index 6edaefc3..00000000 --- a/slides/regularization/rsrc/make_avoid_overfitting_01_plot.R +++ /dev/null @@ -1,30 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: AVOID OVERFITTING 01 -# ------------------------------------------------------------------------------ - -library(knitr) -library(mlr) -library(mlbench) -library(ggplot2) -library(tidyr) -library(colorspace) -library(gridExtra) -library(BBmisc) -library(reshape) -library(data.table) - -# DATA ------------------------------------------------------------------------- - -load("ozone_example.RData") - -dfp <- setDT(df_incdata)[, .(mean.mse = median(value)), by = c("nobs", "variable")] - -# PLOTS ------------------------------------------------------------------------ - -p <- ggplot(data = dfp, aes(x = nobs, y = mean.mse, colour = variable)) + - geom_line(lwd = 1.2) + ylim(c(0, 100)) + labs(colour = " ") + - scale_colour_discrete(labels = c("Train error", "Test error")) + - xlab("Size of data set") + ylab("MSE") + - scale_color_brewer(palette="Dark2") - -ggsave("../figure/avoid_overfitting_01.png", plot=p, width=5, height=2.5) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_avoid_overfitting_02_plot.R b/slides/regularization/rsrc/make_avoid_overfitting_02_plot.R deleted file mode 100644 index c6436113..00000000 --- a/slides/regularization/rsrc/make_avoid_overfitting_02_plot.R +++ /dev/null @@ -1,28 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: AVOID OVERFITTING 02 -# 
------------------------------------------------------------------------------ - -library(knitr) -library(mlr) -library(mlbench) -library(ggplot2) -library(tidyr) -library(colorspace) -library(gridExtra) -library(BBmisc) -library(reshape) - -# DATA ------------------------------------------------------------------------- - -load("ozone_example.RData") - -# PLOTS ------------------------------------------------------------------------ -p <- ggplot(data = df_incfeatures, aes(x = type, y = mean.mse, colour = variable)) + - geom_line(lwd = 1.2) + labs(colour = " ") + - scale_colour_discrete(labels = c("Train error", "Test error")) + - xlab("Number of features") + ylab("Mean Squared Error") + - ylim(c(0, 150)) + - scale_x_continuous(breaks = 0:12) + - scale_color_brewer(palette="Dark2") - -ggsave("../figure/avoid_overfitting_02.png", plot=p, width=5, height=2.5) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_early_stopping_plot.R b/slides/regularization/rsrc/make_early_stopping_plot.R deleted file mode 100644 index bcd13662..00000000 --- a/slides/regularization/rsrc/make_early_stopping_plot.R +++ /dev/null @@ -1,52 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: EARLY STOPPING -# ------------------------------------------------------------------------------ - -library(knitr) -library(mlr) -library(mlbench) -library(ggplot2) -library(tidyr) -library(colorspace) -library(gridExtra) -library(BBmisc) -library(reshape) - -# DATA ------------------------------------------------------------------------- -load("early_stopping1.RData") - -o_data$type <- factor(o_data$type, levels=c("train", "test")) - -# PLOTS ------------------------------------------------------------------------ -p1 <- ggplot(o_learn, aes(x = id, y = value)) + - geom_line(aes(colour = variable), lwd = 1.2) + - geom_vline(xintercept = best_it, linetype = "solid", lwd = 2, - colour="darkgrey") + - geom_vline(xintercept = max_it, lwd = 2, colour="darkgrey", - linetype = "dashed") + - annotate("label", x = 30, y = 180, label = "stopped early") + - annotate("label", x = 4e5, y = 180, label = "overfitted") + - scale_x_log10() + - xlab("Iterations") + - ylab("Mean Squared Error") + - labs(colour = " ") + - theme(legend.position="bottom") + - scale_color_brewer(palette="Dark2") - -p2 <- ggplot(o_data, aes(x=V8*100, y=V4)) + - geom_point(data=o_data, aes(colour=type, alpha=type)) + - scale_alpha_manual(values = c(1, 0.2), guide = "none") + - geom_line(data=o_fit, aes(linetype=variable, x=x, y=value), alpha = 1, - lwd = 2, colour="darkgrey") + - scale_linetype_manual(values = c("dashed", "solid")) + - xlab("Temperature (degrees F)") + - ylab("Ozone level") + - theme(legend.position="bottom") + - guides(linetype = FALSE) + - # scale_alpha(guide = "none") + - labs(colour = " ") + - scale_color_brewer(palette="Dark2") - -p <- grid.arrange(p1, p2, ncol=2) - -ggsave("../figure/early_stopping.png", plot=p, width=9, height=6) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_lasso-contours-sparsity.py b/slides/regularization/rsrc/make_lasso-contours-sparsity.py deleted file mode 100644 index 1c8916c4..00000000 --- a/slides/regularization/rsrc/make_lasso-contours-sparsity.py +++ /dev/null @@ -1,75 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Dec 1 03:40:27 2023 - -@author: chris -""" - -import matplotlib.pyplot as plt -import numpy as np - -# Function to create contour plots -def create_contour_plot(ax, theta_hat, theta_lasso, l1_edge, 
outermost_point, annotation_positions, subtitle): - theta1 = np.linspace(-4, 4, 300) - theta2 = np.linspace(-2, 5, 300) - Theta1, Theta2 = np.meshgrid(theta1, theta2) - - target_direction = np.array([1, 4]) / np.linalg.norm([1, 4]) - angle = np.arctan2(target_direction[1], target_direction[0]) - np.pi / 18 - rot_matrix = np.array([[np.cos(angle), -np.sin(angle)], - [np.sin(angle), np.cos(angle)]]) - - scale = np.array([1, 2]) - Z = np.vstack((Theta1.ravel() - theta_hat[0], Theta2.ravel() - theta_hat[1])).T @ rot_matrix - Z = Z * scale - Z = Z @ rot_matrix.T - L = (Z[:, 0])**2 + (Z[:, 1])**2 - L = L.reshape(Theta1.shape) - - outermost_level = (outermost_point[0] - theta_hat[0])**2 + (outermost_point[1] - theta_hat[1])**2 - - # Plot the contours - ax.contour(Theta1, Theta2, L, levels=np.linspace(np.min(L), outermost_level, 5), colors='red') - - # L1 regularization path with adjusted darker blue color - diamond = plt.Polygon([[l1_edge,0], [0,l1_edge], [-l1_edge,0], [0,-l1_edge]], closed=True, color='cyan', alpha=0.3) # Medium Blue - ax.add_patch(diamond) - - # Plot theta_hat and theta_lasso - ax.plot(*theta_hat, 'ko') - ax.plot(*theta_lasso, 'ko') - - # Annotations with adjusted sizes - ax.annotate(r'$\hat{\theta}_{Lasso}$', xy=theta_lasso, xytext=annotation_positions[0], - arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=3), ha='right', va='bottom', fontsize=35) - ax.annotate(r'$\hat{\theta}$', xy=theta_hat, xytext=annotation_positions[1], - arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=3), ha='left', va='bottom', fontsize=35) - - # Axes settings - ax.set_xlabel(r'$\theta_1$', fontsize=30) - ax.set_ylabel(r'$\theta_2$', fontsize=30) - ax.tick_params(axis='both', which='major', labelsize=25) - ax.axis('equal') - ax.set_xlim([-4, 4]) - ax.set_ylim([-2, 5]) - - # Add subtitle - ax.set_title(subtitle, fontsize=30) - -# Initialize a figure with three subplots -fig, axs = plt.subplots(1, 3, figsize=(24, 8)) - -# First plot -create_contour_plot(axs[0], theta_hat=[0.5, 3], theta_lasso=[0, 1], l1_edge=1, - outermost_point=[0, 1], annotation_positions=[(-2, 1.1), (2.5, 2)], subtitle=r'$\text{smaller param. }\theta_{1}\text{ is removed}$') - -# Second plot with subtitle "small λ" -create_contour_plot(axs[1], theta_hat=[1, 1], theta_lasso=[0.5, 0.5], l1_edge=1, - outermost_point=[0.5, 0.5], annotation_positions=[(-0.5, 2.5), (2, 3)], subtitle='small λ: no sparsity') - -# Third plot with subtitle "large λ" -create_contour_plot(axs[2], theta_hat=[1, 1], theta_lasso=[0.5, 0], l1_edge=0.5, - outermost_point=[0.5, 0], annotation_positions=[(-0.5, 2.5), (2, 3)], subtitle='larger λ: sparsity') - -plt.tight_layout() -plt.show() diff --git a/slides/regularization/rsrc/make_plot_ridge_hat.py b/slides/regularization/rsrc/make_plot_ridge_hat.py deleted file mode 100644 index 250c86ae..00000000 --- a/slides/regularization/rsrc/make_plot_ridge_hat.py +++ /dev/null @@ -1,84 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.patches import Circle - -# Define the grid for plotting -x = np.linspace(-3.0, 3.0, 400) -y = np.linspace(-3.0, 3.0, 400) -X, Y = np.meshgrid(x, y) - -# Define the center of the objective function -objective_center = np.array([1.5, 1.5]) # Adjust as needed - -# Elliptical objective function with rotation -def rotated_elliptical_objective(X, Y, center, a, b, angle_deg): - """ Rotated elliptical objective function. 
""" - angle_rad = np.radians(angle_deg) - X_rot = np.cos(angle_rad) * (X - center[0]) - np.sin(angle_rad) * (Y - center[1]) - Y_rot = np.sin(angle_rad) * (X - center[0]) + np.cos(angle_rad) * (Y - center[1]) - return (X_rot**2 / a**2) + (Y_rot**2 / b**2) - -# Define elliptical parameters -a, b = 1.5, 0.75 # Semi-major and semi-minor axes lengths -rotation_angle = -30 # Rotation angle in degrees - -# Calculate rotated elliptical objective function values -Z_rotated_elliptical = rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) - -# Define the constraint circle for ridge regression (L2) -constraint_radius = 1.0 # Example radius - -# Create contour levels -contour_levels = [0.1, 0.3, 0.6] # Example contour levels - -# Create a 2x1 grid of plots -fig, axs = plt.subplots(figsize=(8, 8), dpi=100) - -def draw_plot(ax, constraint_radius, contour_levels): - # Plot contour lines around the objective center - CS = ax.contour(X, Y, Z_rotated_elliptical, levels=contour_levels, colors='red', linewidths=0.5) - - # Plot the constraint circle - colors = ['cornflowerblue', 'blue', 'navy'] - rads = [1, 1.5, 3] - for i in range(3): - circle = Circle((0, 0), constraint_radius/rads[i], color=colors[i], alpha=0.3, linestyle='--') - ax.add_artist(circle) - - # Plot the minimum point - ax.plot(objective_center[0], objective_center[1], 'o', color='red', markersize=6) - ax.text(objective_center[0]+0.05, objective_center[1]+0.05, r'$\hat{\theta}$', fontsize=12, color='black') - - # Set the same scale for both axes and set limits - ax.set_aspect('equal', 'box') - ax.set_xlim(-1.2, 2.7) - ax.set_ylim(-1.2, 2.5) - ax.axis('off') - - # Define the legend elements - #legend_elements = [ - # plt.Line2D([0], [0], marker='o', color='black', markersize=6, label=r'$\hat{\theta}$', linestyle='None') - #] - - last_contour = CS.allsegs[2][0] # Use the second contour for intersection - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=6) - ax.text(intersection_point[0]+0.05, intersection_point[1]+0.05, r'$\hat{\theta}_{ridge}$', fontsize=12, color='black') - #legend_elements.append(plt.Line2D([0], [0], color='green', marker='o', linestyle='None', markersize=6, label=r'$\hat{\theta}_{ridge}$')) - - # Add the legend - #ax.legend(handles=legend_elements, loc='upper left', fontsize='large', frameon=True, handletextpad=0.4, borderpad=0.1, labelspacing=0.1) - - # Add arrows indicating the axes - ax.arrow(-1.2, 0, 3.6, 0, head_width=0.1, head_length=0.2, fc='black', ec='black') - ax.text(2.3, -0.1, r'$\theta_1$', fontsize=12, color='black') - ax.arrow(0, -1.2, 0, 3.4, head_width=0.1, head_length=0.2, fc='black', ec='black') - ax.text(-0.13, 2.1, r'$\theta_2$', fontsize=12, color='black') - -# Draw plots -draw_plot(axs, constraint_radius, contour_levels) - -plt.tight_layout() -plt.show() \ No newline at end of file diff --git a/slides/regularization/rsrc/make_poly_ridge_1_plot.R b/slides/regularization/rsrc/make_poly_ridge_1_plot.R deleted file mode 100644 index bac7d48e..00000000 --- a/slides/regularization/rsrc/make_poly_ridge_1_plot.R +++ /dev/null @@ -1,36 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: POLYNOMIAL RIDGE 1 -# ------------------------------------------------------------------------------ - -library(ggplot2) -library(viridis) - 
-theme_set(theme_minimal()) - -# DATA ------------------------------------------------------------------------- - -source("ridge_polynomial_reg.R") - -set.seed(314259) -f <- function (x) { - return (5 + 2 * x + 10 * x^2 - 2 * x^3) -} - -x <- runif(40, -2, 5) -y <- f(x) + rnorm(length(x), 0, 10) - -x.true <- seq(-2, 5, length.out = 400) -y.true <- f(x.true) -df <- data.frame(x = x.true, y = y.true) - -lambda.vec <- 0 - -# PLOTS ------------------------------------------------------------------------ - -p <- plotRidge(x, y, lambda.vec, baseTrafo, degree = 10) + - geom_line(data = df, aes(x = x, y = y), color = "red", size = 1) + - xlab("x") + ylab("f(x)") + - theme(plot.title = element_text(size = 15)) + - scale_color_viridis(end = 0.9, discrete = TRUE) - -ggsave("../figure/poly_ridge_1.png", plot = p, width = 6, height = 2) diff --git a/slides/regularization/rsrc/make_poly_ridge_2_plot.R b/slides/regularization/rsrc/make_poly_ridge_2_plot.R deleted file mode 100644 index 4d5dc6a8..00000000 --- a/slides/regularization/rsrc/make_poly_ridge_2_plot.R +++ /dev/null @@ -1,38 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: POLYNOMIAL RIDGE 2 -# ------------------------------------------------------------------------------ - -library(ggplot2) -library(viridis) - -theme_set(theme_minimal()) - -# DATA ------------------------------------------------------------------------- - -source("ridge_polynomial_reg.R") - -f <- function (x) { - return (5 + 2 * x + 10 * x^2 - 2 * x^3) -} - -set.seed(314259) -x <- runif(40, -2, 5) -y <- f(x) + rnorm(length(x), 0, 10) - -x.true <- seq(-2, 5, length.out = 400) -y.true <- f(x.true) -df <- data.frame(x = x.true, y = y.true) - -lambda.vec <- c(0, 10, 100) - - -# PLOTS ------------------------------------------------------------------------ - -p <- plotRidge(x, y, lambda.vec, baseTrafo, degree = 10) + - geom_line(data = df, aes(x = x, y = y), color = "red", size = 1) + - xlab("x") + ylab("f(x)") + - labs(color=expression(lambda)) + - theme(plot.title = element_text(size = 15)) + - scale_color_viridis(end = 0.9, discrete = TRUE) - -ggsave("../figure/poly_ridge_2.png", plot = p, width = 7.5, height = 3) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_poly_ridge_table_latex.R b/slides/regularization/rsrc/make_poly_ridge_table_latex.R deleted file mode 100644 index 4d353ba1..00000000 --- a/slides/regularization/rsrc/make_poly_ridge_table_latex.R +++ /dev/null @@ -1,18 +0,0 @@ -# ------------------------------------------------------------------------------ -# TAB: POLYNOMIAL RIDGE -# ------------------------------------------------------------------------------ - -library(xtable) - -betas <- getPolyData(x, y, lambda.vec, baseTrafo, degree = 10)$betas - -betas <- cbind(as.numeric(rownames(betas)), betas) - -colnames(betas) <- c("$\\lambda$" , sapply(1:(ncol(betas)-1), - function(i) return (paste0("$\\beta_{", - as.character(i-1), - "}$")))) - -print(xtable(signif(betas, 2), digits = 2, align = "rr|lllllllllll"), - row.names = FALSE, sanitize.colnames.function = function(x) x, include.rownames = FALSE, - hline.after = 0, latex.environments = "tiny") diff --git a/slides/regularization/rsrc/make_reg_surfaces.py b/slides/regularization/rsrc/make_reg_surfaces.py deleted file mode 100644 index 349c5c18..00000000 --- a/slides/regularization/rsrc/make_reg_surfaces.py +++ /dev/null @@ -1,68 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D -from 
scipy.optimize import minimize - -# Data Generation -n = 500 -np.random.seed(0) -x1 = np.random.uniform(-1, 1, n) -x2 = np.random.uniform(-1, 1, n) -epsilon = np.random.normal(0, 0.1, n) -y = -0.5 * x1 + 3 * x2 + epsilon - -# Regularization Norm Functions -def l1_norm(beta1, beta2): - return np.abs(beta1) + np.abs(beta2) - -def l2_norm_squared(beta1, beta2): - return beta1**2 + beta2**2 - -# Updated Regularized Least Squares Objective Function with 1/n factor -def updated_objective(beta, x1, x2, y, lam, regularization): - beta1, beta2 = beta - residuals = y - beta1 * x1 - beta2 * x2 - error_term = np.sum(residuals**2) / n - if regularization == 'l1': - penalty = l1_norm(beta1, beta2) - elif regularization == 'l2': - penalty = l2_norm_squared(beta1, beta2) - return error_term + lam * penalty - -# Compute the Minima for each plot -minima = {} -regularizations = ['l1', 'l2'] -lambdas = [0, 1, 10] -for reg in regularizations: - for lam in lambdas: - result = minimize(updated_objective, [0, 0], args=(x1, x2, y, lam, reg), method='L-BFGS-B') - minima[(reg, lam)] = result.x - -# Parameter Space for Beta1 and Beta2 -beta1_range = np.linspace(-10, 10, 100) -beta2_range = np.linspace(-10, 10, 100) -beta1_grid, beta2_grid = np.meshgrid(beta1_range, beta2_range) - -# Plotting -fig, axes = plt.subplots(2, 3, subplot_kw={"projection": "3d"}, figsize=(18, 12)) -for i, reg in enumerate(regularizations): - for j, lam in enumerate(lambdas): - objective_values = np.array([updated_objective([b1, b2], x1, x2, y, lam, reg) - for b1, b2 in zip(np.ravel(beta1_grid), np.ravel(beta2_grid))]) - objective_values = objective_values.reshape(beta1_grid.shape) - - ax = axes[i, j] - ax.plot_surface(beta1_grid, beta2_grid, objective_values, cmap='viridis') - ax.set_title(f'Regularization: {reg.upper()}, Lambda: {lam}', fontsize=20) # Increased font size - ax.set_xlabel('Theta 1', fontsize=14) # Increased font size - ax.set_ylabel('Theta 2', fontsize=14) # Increased font size - ax.set_zlabel('Emp. 
risk', fontsize=14) # Increased font size - - # Add the minima as a red dot - min_beta1, min_beta2 = minima[(reg, lam)] - min_val = updated_objective([min_beta1, min_beta2], x1, x2, y, lam, reg) - ax.scatter(min_beta1, min_beta2, min_val, color='red', s=50) - -plt.tight_layout() -plt.subplots_adjust(wspace=0.1, hspace=0.1) # Adjust spacing between the plots if needed -plt.savefig('..figure/reg_surfaces.png', bbox_inches='tight', pad_inches=0, facecolor='white') \ No newline at end of file diff --git a/slides/regularization/rsrc/make_regu_example_multicollinearity_plot.R b/slides/regularization/rsrc/make_regu_example_multicollinearity_plot.R deleted file mode 100644 index dd412b4a..00000000 --- a/slides/regularization/rsrc/make_regu_example_multicollinearity_plot.R +++ /dev/null @@ -1,125 +0,0 @@ -################################################################################ -# EXAMPLE: LASSO VS RIDGE WITH MULTICOLLINEARITY -################################################################################ - -# PREREQ ----------------------------------------------------------------------- - -library(dplyr) -library(ggrepel) -library(MASS) -library(mlr) -library(BBmisc) -library(data.table) -library(gridExtra) -library(grid) -options(scipen = 10000) - -# FICTIONAL DATA --------------------------------------------------------------- - -set.seed(20200611) - -# Create 4 normally distributed, uncorrelated RV - -Sigma <- diag(rep(2, 4)) - -design_matrix <- data.frame(mvrnorm(100, mu = rep(0, 4), Sigma = Sigma, - empirical = TRUE)) - -# Add X5 - almost perfectly correlated to X4 - -colnames(design_matrix) <- c("X1", "X2", "X3", "X4") -design_matrix <- design_matrix %>% - mutate(X5 = X4 + rnorm(nrow(design_matrix), 0, 0.3)) - -# Create target variable - -design_matrix <- design_matrix %>% mutate(y = 0.2 * X1 + 0.2 * X2 + 0.2 * X3 - + 0.2 * X4 + 0.2 * X5 + - rnorm(nrow(design_matrix), 0, 1)) - -# REGRESSION TASK -------------------------------------------------------------- - -task_mc <- makeRegrTask("fictional", design_matrix, "y") -featnames_mc <- getTaskFeatureNames(task_mc) - -# COEFFICENT PATHS ------------------------------------------------------------- - -compute_coef_paths <- function(task, lambda_name, lambda_seq) { - - lrn <- makeLearner("regr.penalized", trace = FALSE, lambda1 = 0, lambda2 = 0) - path <- list() - - # Compute coefficients for each model (on entire data) - - for (i in seq_along(lambda_seq)) { - - lamval <- lambda_seq[[i]] - pv <- namedList(lambda_name, lamval) - lrn2 <- setHyperPars(lrn, par.vals = pv) - m1 <- train(lrn2, task) - mm1 <- getLearnerModel(m1) - cc <- coefficients(mm1) - cc <- as.list(cc) - cc$lambda <- lamval - path[[i]] <- cc - - } - - path <- rbindlist(path, fill = TRUE) - path[is.na(path)] <- 0 - - # Perform cross validation - - ps <- makeParamSet( - makeDiscreteParam(id = lambda_name, values = lambda_seq) - ) - ctrl <- makeTuneControlGrid() - tr <- tuneParams(lrn, task, cv3, par.set = ps, control = ctrl, show.info = - FALSE) - cv_lam <- as.data.frame(tr$opt.path)[, c(lambda_name, "mse.test.mean")] - colnames(cv_lam) <- c("lambda", "mse") - cv_lam$lambda <- as.numeric(as.character(cv_lam$lambda)) - list(path = path, cv_lam = cv_lam) - -} - -# PLOT PATHS ------------------------------------------------------------------- - -plot_coef_paths_mc <- function(obj, featnames, xlab) { - ggd <- data.table::melt(obj$path, id.var = "lambda", measure = featnames, - variable.name = "featname", value.name = "coefval") - ggd$label <- ifelse(ggd$lambda == 
min(lambda_seq_mc), - as.character(ggd$featname), NA) - ggd$mse <- rep(obj$cv_lam[, "mse"], 5) - pl <- ggplot(data = ggd, mapping = aes(x = lambda, y = coefval, - group = featname, col = featname)) - pl <- pl + geom_line() - pl <- pl + geom_label_repel(aes(label = label), na.rm = TRUE) - pl <- pl + scale_x_log10() - pl <- pl + xlab(xlab) - pl <- pl + theme_bw() - pl <- pl + scale_color_manual(values = c(rep("black", 3), "#7FFF32", "#067B7F"), - guide = FALSE) - pl <- pl + geom_line(mapping = aes(x = ggd$lambda, y = ggd$mse * 0.5), - col = "black", linetype = "longdash") - pl <- pl + geom_text(x = max(log(ggd$lambda, 10)), - y = 0.5 * (max(ggd$mse)) - 0.01, vjust = 1, hjust = 1, - label = "MSE", col = "black") - pl <- pl + scale_y_continuous(sec.axis = sec_axis(~. * 2, name = "MSE")) - pl <- pl + geom_hline(aes(yintercept = 0), col = "black", linetype = "dotted") - -} - -#Visualize shrinkage in presence of multicollinearity -library(ggplot2) -lambda_seq_mc <- 2^seq(-10, 20, length.out = 50) - -path_l1_mc <- compute_coef_paths(task_mc, "lambda1", lambda_seq_mc) -path_l2_mc <- compute_coef_paths(task_mc, "lambda2", lambda_seq_mc) - -p_l1 <- plot_coef_paths_mc(path_l1_mc, featnames_mc, "Lasso / lambda") -p_l2 <- plot_coef_paths_mc(path_l2_mc, featnames_mc, "Ridge / lambda") - -p <- grid.arrange(p_l1, p_l2, nrow = 1) -ggsave("../figure/regu_example_multicollinearity.png", plot=p, width= 8, height =3) - diff --git a/slides/regularization/rsrc/make_ridge_vs_sgd_path.py b/slides/regularization/rsrc/make_ridge_vs_sgd_path.py deleted file mode 100644 index 449fb04c..00000000 --- a/slides/regularization/rsrc/make_ridge_vs_sgd_path.py +++ /dev/null @@ -1,75 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from sklearn.utils import shuffle - -# Set the random seed for reproducibility -np.random.seed(6) - -# Function to generate data -def generate_data(n, p): - X = np.random.normal(0, 1, (n, p)) - true_coef = np.linspace(-1, 1, p) - noise = np.random.normal(0, 1, n) - y = X.dot(true_coef) + noise - return X, y, true_coef - -# Function to compute the ridge coefficients analytically -def compute_ridge_path(X, y, alphas): - coefs = [np.zeros(X.shape[1])] # Start with a row of zeros - n, p = X.shape - for alpha in alphas: - ridge_coefs = np.linalg.inv(X.T @ X + alpha * np.identity(p)) @ X.T @ y - coefs.append(ridge_coefs) - return np.array(coefs) - -# Function to compute the optimization trajectory for SGD -def compute_sgd_trajectory(X, y, batch_size, learning_rate, n_iter): - w = np.zeros(X.shape[1]) - coefs = [w.copy()] # Start with a row of zeros - for i in range(n_iter): - X_shuffled, y_shuffled = shuffle(X, y) - for j in range(0, n, batch_size): - X_batch = X_shuffled[j:j+batch_size] - y_batch = y_shuffled[j:j+batch_size] - gradient = -2 * X_batch.T @ (y_batch - X_batch @ w) / batch_size - w -= learning_rate * gradient - coefs.append(w.copy()) - return np.array(coefs) - -# Parameters -n = 100 -p = 10 -batch_size = 4 -learning_rate = 0.01 -n_iter = 50 -t_values = np.arange(0.001, n_iter + 1) # Include 0 in t_values for the zero coefficients -alphas = 1/(learning_rate * t_values[0:]) # Exclude 0 to avoid division by zero - -# Generate data -X, y, true_coef = generate_data(n, p) - -# Compute the regularization path for ridge regression -ridge_coefs = compute_ridge_path(X, y, alphas) - -# Compute the optimization trajectory for SGD -sgd_coefs = compute_sgd_trajectory(X, y, batch_size, learning_rate, n_iter) - -# Plotting -fig, axs = plt.subplots(1, 2, figsize=(14, 5)) -# Regularization path 
for ridge regression -# Skip the first element (0) in t_values for plotting to match dimensions with ridge_coefs -axs[0].plot(1/alphas, ridge_coefs[1:]) -axs[0].set_xlabel('1/(lr * lambda)', fontsize=18) -axs[0].set_ylabel('Parameters', fontsize=18) -axs[0].set_title('Ridge Regression Path', fontsize=22) - -# Optimization trajectory for SGD -# Use t_values for x-axis to include the initial zero coefficients -axs[1].plot(t_values, sgd_coefs) -axs[1].set_xlabel('iteration', fontsize=18) -axs[1].set_ylabel('Parameters', fontsize=18) -axs[1].set_title('SGD Trajectory', fontsize=22) - -plt.tight_layout() -plt.show() - diff --git a/slides/regularization/rsrc/make_shrinkage_1_plot.R b/slides/regularization/rsrc/make_shrinkage_1_plot.R deleted file mode 100644 index f888ea21..00000000 --- a/slides/regularization/rsrc/make_shrinkage_1_plot.R +++ /dev/null @@ -1,56 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: SHRINNKAGE 1 -# ------------------------------------------------------------------------------ - - -library(knitr) -library(mlr) -library(mlbench) -library(ggplot2) -library(tidyr) -library(colorspace) -library(BBmisc) -library(penalized) -library(reshape) -library(gridExtra) -library(ggrepel) -library(data.table) -library(viridis) - -# DATA ------------------------------------------------------------------------- - -load("regu_example_1.RData") - -# PLOTS ------------------------------------------------------------------------ - -plot_coef_paths <- function(path, featnames, xlab) { - ggd <- data.table::melt(path, id.vars = "lambda", measure = featnames, variable.name = "featname", value.name = "coefval") - ggd$label <- ifelse(ggd$lambda == min(lambda_seq), as.character(ggd$featname), NA) - pl <- ggplot(data = ggd, aes(x = lambda, y = coefval, group = featname, col = featname)) + - guides(color = "none") + - geom_line() + - geom_label_repel(aes(label = label), na.rm = TRUE, max.overlaps = Inf) + - scale_color_discrete(guide = FALSE) + - scale_x_log10() + - xlab(xlab) + - theme_bw() + - scale_color_viridis(end = 0.9, discrete = TRUE) - - -} - -plot_cv_path <- function(cv_lam, xlab) { - pl <- ggplot(data = cv_lam, aes(x = lambda, y = mse)) + - geom_line() + - scale_x_log10() + - xlab(xlab) + - theme_minimal() -} - -pl1 <- plot_coef_paths(path_l1$path, featnames, "Lasso / lambda") -pl2 <- plot_coef_paths(path_l2$path, featnames, "Ridge / lambda") -pl3 <- plot_cv_path(path_l1$cv_lam, "Lasso / lambda") + ylim(25, 90) -pl4 <- plot_cv_path(path_l2$cv_lam, "Ridge / lambda") + ylim(20, 90) - -p <- grid.arrange(pl1, pl2, pl3, pl4, nrow = 2) -ggsave("../figure/shrinkage_1.png", plot = p, width = 8, height = 4) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_shrinkage_2_plot.R b/slides/regularization/rsrc/make_shrinkage_2_plot.R deleted file mode 100644 index 35666f04..00000000 --- a/slides/regularization/rsrc/make_shrinkage_2_plot.R +++ /dev/null @@ -1,48 +0,0 @@ -library(knitr) -library(mlr) -library(mlbench) -library(ggplot2) -library(tidyr) -library(colorspace) -library(BBmisc) -library(data.table) -library(penalized) -library(reshape) -library(gridExtra) -library(viridis) -########################################################### - -load("regu_example_2.RData") -d_l1 <- rbind( - data.frame(lam = paste("L1-", lams[1]), coefval = cc_l1_1), - data.frame(lam = paste("L1-", lams[2]), coefval = cc_l1_2) -) -d_l1$lam <- as.factor(d_l1$lam) -d_l2 <- rbind( - data.frame(lam = paste("L2-", lams[1]), coefval = cc_l2_1), - 
data.frame(lam = paste("L2-", lams[2]), coefval = cc_l2_2) -) -d_l2$lam <- as.factor(d_l2$lam) -plot_coef_hist <- function(d) { - pl <- ggplot(d, aes(x = coefval, fill = lam)) + - scale_fill_viridis(end = 0.9, discrete = TRUE) + - geom_histogram(alpha = 0.9, position = "dodge") + - theme_gray(base_size = 14) - return(pl) -} -plot_cv_path <- function(cv_lam, xlab) { - pl <- ggplot(data = cv_lam, aes(x = lambda, y = mse)) - pl <- pl + geom_line() - pl <- pl + scale_x_log10() - pl <- pl + ylim(1, 10) - pl <- pl + xlab(xlab) + theme_gray(base_size = 14) -} - -pl1 <- plot_coef_hist(d_l1) + guides(fill=guide_legend(title="lambda")) -pl2 <- plot_coef_hist(d_l2)+ guides(fill=guide_legend(title="lambda")) + - ylim(0, 50) -pl3 <- plot_cv_path(cv_l1, "lambda") -pl4 <- plot_cv_path(cv_l2, "lambda") - -p <- grid.arrange(pl1, pl2, pl3, pl4, nrow = 2) -ggsave("../figure/shrinkage_2.png", plot = p, width = 8, height = 5) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_wd_l2_geom_plot.R b/slides/regularization/rsrc/make_wd_l2_geom_plot.R deleted file mode 100644 index 7094f54f..00000000 --- a/slides/regularization/rsrc/make_wd_l2_geom_plot.R +++ /dev/null @@ -1,58 +0,0 @@ -library(ggplot2) - -# Define parameters for the ellipse -center <- c(1.5, 1.5) -axis_len <- c(1.5, 0.75) # Lengths of the axes for the ellipse -rotation <- pi/3 - - -seq_data <- seq(0, 2*pi, length.out=100) #points for one circle -# Generate data points for plotting ellipses -ellipse_x <- cos(rotation)*cos(seq_data)*axis_len[1]-sin(rotation)*sin(seq_data)*axis_len[2] -ellipse_y <- sin(rotation)*cos(seq_data)*axis_len[1]+cos(rotation)*sin(seq_data)*axis_len[2] - -elli_list <- list() -i <- 1 -for(mul in c(0.24, 0.43, 0.62, 0.78)){ #adjust radius - elli_list[[i]] <- data.frame(x=center[1]+ellipse_x*mul, y=center[2]+ellipse_y*mul) - i <- i + 1 -} - -# Generate data points for plotting circles(ridge) -cir_list <- list() -i <- 1 -for(mul in c(0.15, 0.4, 0.67, 1)){ #adjust radius - cir_list[[i]] <- data.frame(x=cos(seq_data)*mul, y=sin(seq_data)*mul) - i <- i + 1 -} - -# Create the plot of ellipses -p_elli <- ggplot() + - geom_path(data=elli_list[[1]], aes(x, y), color="black") + - geom_path(data=elli_list[[2]], aes(x, y), color="black") + - geom_path(data=elli_list[[3]], aes(x, y), color="black") + - geom_path(data=elli_list[[4]], aes(x, y), color="black") + - geom_point(aes(x=center[1], y=center[2]), color="black", size=3) + - annotate("label", x=1.6, y=1.3, label="hat(theta)", - parse=TRUE, color='black', size=3) - -# Create whole plot -p_ridge_geom <- p_elli + - geom_path(data=cir_list[[1]], aes(x, y), color="black", linetype="dashed") + - geom_path(data=cir_list[[2]], aes(x, y), color="black", linetype="dashed") + - geom_path(data=cir_list[[3]], aes(x, y), color="black", linetype="dashed") + - geom_path(data=cir_list[[4]], aes(x, y), color="black", linetype="dashed") + - geom_point(aes(x=0.83, y=sqrt(1-0.83^2)), color="black", size=3) + #intersection point - annotate("label", x=1, y=0.2, label="hat(theta)[\"Ridge\"]", - parse=TRUE, color='black', size=3) + - xlim(-1.5, 3) + - ylim(-1.5, 3) + - coord_equal() + - theme_light() + - labs(title = "", - x = expression(theta_1), - y = expression(theta_2)) - -ggsave(filename = paste0("../figure/wd-l2-geom.png"), - plot=p_ridge_geom, width=12, height=7.5) - diff --git a/slides/regularization/rsrc/fig-eval_ofit_1.R b/slides/regularization/rsrc/model_eval.R old mode 100644 new mode 100755 similarity index 57% rename from slides/regularization/rsrc/fig-eval_ofit_1.R rename to 
slides/regularization/rsrc/model_eval.R index 42392c24..5289a4af --- a/slides/regularization/rsrc/fig-eval_ofit_1.R +++ b/slides/regularization/rsrc/model_eval.R @@ -1,3 +1,11 @@ +# ------------------------------------------------------------------------------ +# intro + +# FIG: binary classification visualized under an appropriately fitted, +# an overfitted and an underfitted model. + +# DATA: 100000 samples with 2 features, drawn from two Gaussian classes. +# ------------------------------------------------------------------------------ library(mlr3misc) library(mvtnorm) @@ -6,8 +14,12 @@ library(mlr3learners) library(mlr3viz) library(ggplot2) library(gridExtra) +library(e1071) set.seed(600000) + +# DATA ------------------------------------------------------------------------- + n = 100000 mu1 = c(0, 3) @@ -28,10 +40,10 @@ trainsize = 200 trainset = 1:trainsize testset = (trainsize+1):n -l1 = lrn("classif.qda", predict_type = "prob") -l2 = lrn("classif.log_reg", predict_type = "prob") -l3 = lrn("classif.svm", type = "C-classification", predict_type = "prob", - kernel = "radial", gamma = 99, cost = 1) +l1 = lrn("classif.qda", predict_type = "prob") # appropriate +l2 = lrn("classif.svm", type = "C-classification", predict_type = "prob", + kernel = "radial", gamma = 99, cost = 1) # overfit +l3 = lrn("classif.log_reg", predict_type = "prob") # underfit l1$train(task) r1 = range(dd[trainset,]$V1) @@ -43,26 +55,25 @@ pred_true = as.data.table(l1$predict_newdata(d_grid)) d_grid$prob = pred_true$prob.1 true_decb = d_grid[d_grid$prob > 0.47 & d_grid$prob < 0.53,] +# PLOT ------------------------------------------------------------------------- -make_plot = function(ll, file_postfix) { +make_plot = function(ll) { ll$train(task, row_ids = trainset) pred_train = ll$predict(task, row_ids = trainset) trainerr = pred_train$score(msr("classif.ce")) pred_test = ll$predict(task, row_ids = testset) testerr = pred_test$score(msr("classif.ce")) - fname = sprintf("../figure/eval_ofit_1%s.pdf", file_postfix) task_train = task$filter(rows = trainset) pl = plot_learner_prediction(ll, task) + guides(shape = FALSE, alpha = FALSE) pl = pl + ggtitle(sprintf("TrainErr=%.2f; TestErr=%.2f", trainerr, testerr)) pl = pl + geom_point(data = true_decb, alpha=0.5, size=0.2) - ggsave(plot = pl, filename = fname, width = 8, height = 6) return(pl) } -p1 = make_plot(l1, file_postfix = "a") -p2 = make_plot(l2, file_postfix = "u") -p3 = make_plot(l3, file_postfix = "o") - -#grid.arrange(p1, p2, p3) -#print(p2) +p1 = make_plot(l1) # appropriate +p2 = make_plot(l2) # overfit +p3 = make_plot(l3) # underfit +ggsave("../figure/model_eval_01.png", plot = p1, width = 8, height = 6) +ggsave("../figure/model_eval_02.png", plot = p2, width = 8, height = 6) +ggsave("../figure/model_eval_03.png", plot = p3, width = 8, height = 6) diff --git a/slides/regularization/rsrc/multicollinearity_example.R b/slides/regularization/rsrc/multicollinearity_example.R new file mode 100755 index 00000000..aa111be8 --- /dev/null +++ b/slides/regularization/rsrc/multicollinearity_example.R @@ -0,0 +1,130 @@ +# ------------------------------------------------------------------------------ +# l1 vs l2 + +# FIG: draw how coefficient values and MSE of linear regression change with +# different regularization constants (lambda).
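# Standalone sketch of the effect plotted below (illustrative, not part of
# the script): with X5 almost identical to X4, ridge tends to spread weight
# across the correlated pair, while lasso tends to keep one of the two and
# zero out the other. Assumes glmnet and MASS are installed; the lambda
# value is an arbitrary example, and cv.glmnet() could replace the manual
# grid search used further down.
library(MASS)
library(glmnet)
set.seed(1)
X <- mvrnorm(100, mu = rep(0, 4), Sigma = diag(rep(2, 4)))
X <- cbind(X, X[, 4] + rnorm(100, 0, 0.3))   # X5 nearly collinear with X4
y <- as.numeric(X %*% rep(0.2, 5) + rnorm(100))
coef(glmnet(X, y, alpha = 0, lambda = 0.1))  # ridge: V4, V5 shrunk, similar
coef(glmnet(X, y, alpha = 1, lambda = 0.1))  # lasso: often one of V4/V5 is 0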
+# left: Under L1 regularization +# right: Under L2 regularization +# DATA: +# xi ~ Normal(0, 2) i=1,2,3,4 uncorrelated, x5 = x4 + Normal(0, 0.3) +# y = 0.2*x1 + 0.2*x2 + 0.2*x3 + 0.2*x4 + 0.2*x5 + eps(100*1 ~Normal) +# ------------------------------------------------------------------------------ + +library(dplyr) +library(ggrepel) +library(BBmisc) +library(MASS) +library(mlr3) +library(mlr3learners) +library(mlr3tuning) +library(ggplot2) +library(data.table) +library(gridExtra) +options(scipen = 10000) + +set.seed(20200611) +# FICTIONAL DATA --------------------------------------------------------------- + +# Create 4 normally distributed, uncorrelated RVs +Sigma <- diag(rep(2, 4)) +design_matrix <- data.frame(mvrnorm(100, mu = rep(0, 4), Sigma = Sigma, + empirical = TRUE)) + +# Add X5 - almost perfectly correlated with X4 +colnames(design_matrix) <- c("X1", "X2", "X3", "X4") +design_matrix <- design_matrix %>% + mutate(X5 = X4 + rnorm(nrow(design_matrix), 0, 0.3)) + +# Create target variable +design_matrix <- design_matrix %>% mutate(y = 0.2 * X1 + 0.2 * X2 + 0.2 * X3 + + 0.2 * X4 + 0.2 * X5 + + rnorm(nrow(design_matrix), 0, 1)) + +# REGRESSION TASK -------------------------------------------------------------- + +task_mc <- TaskRegr$new(id = "fictional", backend = design_matrix, target = "y") +featnames_mc <- task_mc$feature_names + +# COEFFICIENT PATHS ------------------------------------------------------------ + +compute_coef_paths <- function(task, lambda_name, lambda_seq) { + alpha = ifelse(lambda_name=='lambda1', 1, 0) + path <- list() + # Compute coefficients for each model (on entire data) + for (i in seq_along(lambda_seq)) { + lamval <- lambda_seq[i] + learner = lrn("regr.glmnet", alpha = alpha, lambda=lamval) + learner$train(task) + cc <- t(as.matrix(coef(learner$model))) + names <- colnames(cc) + cc <- as.numeric(cc) + names(cc) <- names + cc <- as.list(cc) + cc$lambda <- lamval + path[[i]] <- cc + } + + path <- rbindlist(path, fill = TRUE) + path[is.na(path)] <- 0 + + # Perform cross validation + learner = lrn("regr.glmnet", alpha = alpha, lambda=to_tune(lambda_seq)) + + # Construct tuning instance + instance = ti( + task = task, + learner = learner, + resampling = rsmp("cv", folds = 3), + measures = msr("regr.mse"), + terminator = trm("evals", n_evals = length(lambda_seq)) + ) + + tuner <- tnr("grid_search", resolution = length(lambda_seq)) + tuner$optimize(instance) + cv_lam <- as.data.frame(instance$archive$data)[,1:2] + colnames(cv_lam) <- c("lambda", "mse") + cv_lam$lambda <- as.numeric(as.character(cv_lam$lambda)) + cv_lam <- cv_lam %>% arrange(lambda) + + list(path = path, cv_lam = cv_lam) +} + +# PLOT PATHS --------------------------------------------------------------------- + +plot_coef_paths_mc <- function(obj, featnames, title, xlab) { + ggd <- melt(obj$path, id.var = "lambda", measure = featnames, + variable.name = "featname", value.name = "coefval") + ggd$label <- ifelse(ggd$lambda == min(lambda_seq_mc), + as.character(ggd$featname), NA) + ggd$mse <- rep(obj$cv_lam[, "mse"], 5) + pl <- ggplot(data = ggd, mapping = aes(x = lambda, y = coefval, + group = featname, col = featname)) + + geom_line() + + geom_label_repel(aes(label = label), na.rm = TRUE) + + scale_x_log10() + + ggtitle(title) + + xlab(xlab) + + theme_bw() + + scale_color_manual(values = c(rep("black", 3), "#7FFF32", "#067B7F"), + guide = FALSE) + + geom_line(mapping = aes(x = ggd$lambda, y = ggd$mse * 0.5), + col = "black", linetype = "longdash") + + geom_text(x = max(log(ggd$lambda, 10)), + y = 0.5 *
(max(ggd$mse)) - 0.01, vjust = 1, hjust = 1, + label = "MSE", col = "black") + + scale_y_continuous(sec.axis = sec_axis(~. * 2, name = "MSE")) + + geom_hline(aes(yintercept = 0), col = "black", linetype = "dotted") + +} + +#Visualize shrinkage in presence of multicollinearity +lambda_seq_mc <- 2^seq(-10, 20, length.out = 50) + +path_l1_mc <- compute_coef_paths(task_mc, "lambda1", lambda_seq_mc) +path_l2_mc <- compute_coef_paths(task_mc, "lambda2", lambda_seq_mc) + +p_l1 <- plot_coef_paths_mc(path_l1_mc, featnames_mc, "Lasso", expression(lambda)) +p_l2 <- plot_coef_paths_mc(path_l2_mc, featnames_mc, "Ridge", expression(lambda)) + +p <- grid.arrange(p_l1, p_l2, nrow = 1) +ggsave("../figure/multicollinearity_example.png", plot=p, width= 8, height =3) diff --git a/slides/regularization/rsrc/make_nn_plots.R b/slides/regularization/rsrc/nn_size.R old mode 100644 new mode 100755 similarity index 72% rename from slides/regularization/rsrc/make_nn_plots.R rename to slides/regularization/rsrc/nn_size.R index 967b3fa9..b55aef79 --- a/slides/regularization/rsrc/make_nn_plots.R +++ b/slides/regularization/rsrc/nn_size.R @@ -1,13 +1,20 @@ +# ------------------------------------------------------------------------------ +# nonlin + +# FIG: plot schematic diagrams of one-hidden-layer neural network +# with different sizes (1,2,3,5,10,100) (input size: 2, output size: 1). +# ------------------------------------------------------------------------------ + library(RSNNS) library(nnet) library(clusterGeneration) -#import the function from Github library(devtools) source_url('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r') -seed.val<-2 -set.seed(seed.val) - +set.seed(2) + +# DATA ------------------------------------------------------------------------- + num.vars<-2 num.obs<-1000 @@ -25,15 +32,15 @@ resp<-data.frame(y1) names(resp)<-c('Y1') dat.in<-data.frame(resp,rand.vars) +# plot ------------------------------------------------------------------------- + nn_plot <- function(size) { - # Your existing code to generate the model might go here - mod1 <- nnet(rand.vars, resp, data=dat.in, size=size, linout=T) # Example + mod1 <- nnet(rand.vars, resp, data=dat.in, size=size, linout=T) save_dir <- "../figure" filename <- file.path(save_dir, sprintf("nn_size_%d.png", size)) png(filename, width = 3000, height = 2800, res = 500) par(mar = c(1, 1, 1, 1)) - # Your plot code plot.nnet(mod1, nid=FALSE, rel.rsc=3, @@ -47,11 +54,9 @@ nn_plot <- function(size) { neg.col='black', max.sp=TRUE) - # Close the device, saving the file dev.off() } -# Your existing loop vec <- c(1, 2, 3, 5, 10, 100) for (i in vec) { nn_plot(i) diff --git a/slides/regularization/rsrc/ozone_mse_boxplot.R b/slides/regularization/rsrc/ozone_mse_boxplot.R new file mode 100755 index 00000000..6aa2a699 --- /dev/null +++ b/slides/regularization/rsrc/ozone_mse_boxplot.R @@ -0,0 +1,30 @@ +# ------------------------------------------------------------------------------ +# intro + +# FIG: boxplot of MSE for training and test results. 
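# The script assumes df_incdata (loaded below) is in long format with columns
# nobs / variable / value, where variable distinguishes train and test MSE.
# A stand-in with that shape (df_incdata_demo is a hypothetical example, not
# the real ozone data) would be:
df_incdata_demo <- data.frame(
  nobs     = 50,
  variable = rep(c("train", "test"), each = 25),
  value    = c(rexp(25, 1 / 50), rexp(25, 1 / 120))
)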
+ +# DATA: from data_ozone_example.RData +# ------------------------------------------------------------------------------ + +library(ggplot2) + +theme_set(theme_minimal()) + +# DATA ------------------------------------------------------------------------- + +load("data_ozone_example.RData") +dfp <- df_incdata[df_incdata$nobs == 50, ] + +# PLOTS ------------------------------------------------------------------------ + +p <- ggplot(data = dfp, aes(x = 0, y = value, fill = variable)) + + geom_boxplot() + labs(fill = " ") + + scale_fill_brewer(palette="Dark2", labels = c("Train error", "Test error")) + + xlab(" ") + ylab("MSE") + + ylim(c(0, 400)) + + theme(axis.title.x=element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank()) + +ggsave("../figure/ozone_mse_boxplot.png", plot=p, width=4, height=2) diff --git a/slides/regularization/rsrc/poly_ridge.R b/slides/regularization/rsrc/poly_ridge.R new file mode 100755 index 00000000..d0bd76be --- /dev/null +++ b/slides/regularization/rsrc/poly_ridge.R @@ -0,0 +1,132 @@ +# ------------------------------------------------------------------------------ +# l2 + +# FIG: +# (1) true and fitted polynomials by OLS regression (degree = 10, overfitted). +# (2) true and fitted polynomials with different regularization +# constants (lambda) by ridge regression (larger lambda mitigates overfitting). + +# DATA: y = 5 + 2x + 10x^2 - 2*x^3 (x 40*1 ~Unif) + noise (40*1 ~Normal) +# ------------------------------------------------------------------------------ + +library(ggplot2) +library(viridis) + +set.seed(314259) + +theme_set(theme_minimal()) + +# DATA ------------------------------------------------------------------------- + +f <- function (x) { + return (5 + 2 * x + 10 * x^2 - 2 * x^3) +} + +x <- runif(40, -2, 5) +y <- f(x) + rnorm(length(x), 0, 10) + +x.true <- seq(-2, 5, length.out = 400) +y.true <- f(x.true) +df <- data.frame(x = x.true, y = y.true) + +lambda <- 0 + +lambda.vec <- c(0, 10, 100) + +# FUNC ------------------------------------------------------------------------- + +# calculate ridge coefficients +betaRidge <- function (X, y, lambda) +{ + return (solve(t(X) %*% X + lambda * diag(ncol(X))) %*% (t(X) %*% y)) +} + +# generate polynomials +baseTrafo <- function (x, degree) +{ + out <- cbind(1, x) + for (i in seq_len(degree)[-1]) { + out <- cbind(out, x^i) + } + return (out) +} + +# generate df with polynomial features, true polynomial values, coefficients +getPolyData <- function(x, y, lambda.vec, base.trafo, ...) +{ + X <- base.trafo(x, ...) + + x.pred <- seq(min(x), max(x), length.out = 500) + X.pred <- base.trafo(x.pred, ...) + + df.truth <- data.frame(feature = x, truth = y) + + df.betas <- matrix(NA, nrow=length(lambda.vec), ncol=ncol(X)) + row.names(df.betas) <- lambda.vec + + for(i in 1:length(lambda.vec)){ + df.betas[i,] <- betaRidge(X, y, lambda.vec[i]) + } + + df.polys <- lapply(1:length(lambda.vec), function (i) { + return (data.frame( + feature = x.pred, + pred = X.pred %*% df.betas[i,], + lambda = row.names(df.betas)[i] + )) + }) + return (list(polys = df.polys, + truth = df.truth, + betas = df.betas)) +} + +# plot true and fitted polynomials +plotRidge <- function (x, y, lambda.vec, base.trafo, ...) +{ + requireNamespace("ggplot2") + + res <- getPolyData(x, y, lambda.vec, base.trafo, ...)
+ df.polys <- res$polys + df.truth <- res$truth + + plot.df <- df.polys[[1]] + for (i in seq_along(df.polys)[-1]) { + plot.df <- rbind(plot.df, df.polys[[i]]) + } + plot.df$lambda <- as.factor(plot.df$lambda) + + gg <- ggplot() + if (length(lambda.vec) == 1) { + gg <- gg + geom_line(data = plot.df, aes(x = feature, y = pred, color = lambda), show.legend = FALSE) + } else { + gg <- gg + geom_line(data = plot.df, aes(x = feature, y = pred, color = lambda)) + } + + return ( + gg + + geom_point(data = df.truth, mapping = aes(x = feature, y = truth)) + ) +} + +# PLOTS ------------------------------------------------------------------------ + +p1 <- plotRidge(x, y, lambda, baseTrafo, degree = 10) + + geom_line(data = df, aes(x = x, y = y), color = "red", size = 1) + + xlab("x") + ylab("f(x)") + + theme(plot.title = element_text(size = 15)) + + scale_color_viridis(end = 0.9, discrete = TRUE) + +# multiple lines +p2 <- plotRidge(x, y, lambda.vec, baseTrafo, degree = 10) + + geom_line(data = df, aes(x = x, y = y), color = "red", size = 1) + + xlab("x") + ylab("f(x)") + + labs(color=expression(lambda)) + + theme(plot.title = element_text(size = 15)) + + scale_color_viridis(end = 0.9, discrete = TRUE) + +ggsave("../figure/poly_ridge_01.png", plot = p1, width = 6, height = 2) +ggsave("../figure/poly_ridge_02.png", plot = p2, width = 7.5, height = 3) diff --git a/slides/regularization/rsrc/make_reg_contours.R b/slides/regularization/rsrc/reg_contours.R old mode 100644 new mode 100755 similarity index 74% rename from slides/regularization/rsrc/make_reg_contours.R rename to slides/regularization/rsrc/reg_contours.R index ef649c34..70d9318d --- a/slides/regularization/rsrc/make_reg_contours.R +++ b/slides/regularization/rsrc/reg_contours.R @@ -1,34 +1,43 @@ -# Load necessary libraries +# ------------------------------------------------------------------------------ +# l1, l2 + +# FIG: contour plots for l1, l2 regularized linear model and corresponding +# optimal points with different regularization constants (lambda). 
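# In the notation of this script (intercept beta0 unpenalized, n = 100), the
# contours below are level sets of the penalized empirical risk
#   R_reg(beta) = (1 / (2 * n)) * sum_i (y_i - beta0 - x_i' beta)^2
#                 + lambda * P(beta),
# with P(beta) = |beta1| + |beta2| for L1 and beta1^2 + beta2^2 for L2.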
+ +# DATA: y = X (100*2 ~Normal)·beta_true(3,-2) + noise (100*1 ~Normal) +# ------------------------------------------------------------------------------ + library(ggplot2) library(MASS) library(glmnet) library(gridExtra) -# Example dataset set.seed(123) + +# DATA ------------------------------------------------------------------------- + n <- 100 X <- matrix(rnorm(2 * n), n, 2) beta_true <- c(3, -2) y <- X %*% beta_true + rnorm(n) -# Prepare data for contour plot grid_range <- seq(-5, 5, length.out = 100) grid_data <- expand.grid(X1 = grid_range, X2 = grid_range) grid_data$loss <- apply(grid_data, 1, function(vec) { sum((y - X %*% vec)^2) / (2 * n) }) +# DATA L2 ---------------------------------------------------------------------- lambdas_l2 <- c(0, 10, 100, 500) -# Ridge regression implementation + +# coefficients for ridge regression ridge_regression <- function(X, y, lambda) { n <- nrow(X) d <- ncol(X) - # Adding a column of ones for the intercept term - X_ext <- cbind(1, X) # Ensure X_ext has n rows and d+1 columns + X_ext <- cbind(1, X) # n rows, d+1 columns (intercept first) - # Ridge regression closed-form solution I <- diag(d + 1) I[1, 1] <- 0 # No regularization on the intercept @@ -36,82 +45,75 @@ ridge_regression <- function(X, y, lambda) { return(beta) } -# OLS regression implementation +# coefficients for ols ols_regression <- function(X, y) { n <- nrow(X) d <- ncol(X) - # Adding a column of ones for the intercept term X_ext <- cbind(1, X) - # OLS closed-form solution beta <- solve(t(X_ext) %*% X_ext) %*% t(X_ext) %*% y return(beta) } -# Calculate coefficients using ridge_regression for each lambda +# coefficients from ridge_regression for each lambda +# (intercept excluded below) coefs_manual <- sapply(lambdas_l2, function(lambda) { beta <- ridge_regression(X, y, lambda) - return(beta[2:3, 1]) # Extract coefficients excluding the intercept + return(beta[2:3, 1]) # excluding intercept }) coefs_manual <- t(coefs_manual) coefs_df_manual <- as.data.frame(coefs_manual) names(coefs_df_manual) <- c("X1", "X2") -# Function to create contour plots for regularized loss +# PLOT L2 ---------------------------------------------------------------------- + +# contour plots with optimal points create_reg_contour_plot <- function(coefs, title, lambda, alpha, X, y, grid_range, true_minimizer = c(3, -2.5)) { n <- nrow(X) d <- ncol(X) - # Make sure coefs is a numeric vector coefs <- as.numeric(coefs) - # Define the loss function for OLS loss_ols <- function(beta, X, y) { - X_ext <- cbind(1, X) # Include intercept term + X_ext <- cbind(1, X) return(sum((y - X_ext %*% beta)^2) / (2 * n)) } - # Define the regularized loss function for Ridge regularized_loss_ridge <- function(beta, X, y, lambda) { ridge_term <- ifelse(alpha == 0, lambda * sum(beta[-1]^2), 0) - X_ext <- cbind(1, X) # Include intercept term + X_ext <- cbind(1, X) return(sum((y - X_ext %*% beta)^2) / (2 * n) + ridge_term) } - - # Define the regularized loss function for LASSO + regularized_loss_lasso <- function(beta, X, y, lambda) { lasso_term <- ifelse(alpha == 1, lambda * sum(abs(beta[-1])), 0) X_ext <- cbind(1, X) # Include intercept term return(sum((y - X_ext %*% beta)^2) / (2 * n) + lasso_term) } - # Prepare grid data for contour plot + # data for contour plot grid_data <- expand.grid(X1 = grid_range, X2 = grid_range) X_ext <- cbind(1, grid_data) if(lambda == 0 && alpha == 0) { - # Use OLS loss function beta_center_ols <- c(1, coefs) grid_data$reg_loss <- apply(X_ext, 1, function(vec) { loss_ols(vec, X, y) # Directly use
vec as beta values }) } else if(alpha == 0) { - # Use Ridge loss function beta_center <- c(1, coefs) grid_data$reg_loss <- apply(X_ext, 1, function(vec) { regularized_loss_ridge(vec - beta_center, X, y, lambda) }) } else { - # Use LASSO loss function beta_center <- c(1, coefs) grid_data$reg_loss <- apply(X_ext, 1, function(vec) { regularized_loss_lasso(vec, X, y, lambda) }) } - # Create the contour plot plot <- ggplot(grid_data, aes(x = X1, y = X2)) + geom_contour_filled(aes(z = reg_loss), breaks = pretty(range(grid_data$reg_loss), n = 15)) + geom_point(aes(x = coefs[1], y = coefs[2]), color = "red", size = 2) + @@ -128,8 +130,7 @@ create_reg_contour_plot <- function(coefs, title, lambda, alpha, X, y, grid_rang } -# Calculate coefficients using OLS regression for lambda = 0 -coefs_ols <- ols_regression(X, y)[2:3, 1] # Extract coefficients excluding the intercept +coefs_ols <- ols_regression(X, y)[2:3, 1] # Create plots for each lambda plots_l2 <- list() @@ -137,7 +138,6 @@ for (i in 1:length(lambdas_l2)) { lambda_value <- lambdas_l2[i] title_expression <- bquote("L2 Regularization:" ~ lambda == .(lambda_value)) - # Use OLS coefficients for lambda = 0 if (lambda_value == 0) { coefs_to_use <- coefs_ols } else { @@ -147,13 +147,12 @@ for (i in 1:length(lambdas_l2)) { plots_l2[[i]] <- create_reg_contour_plot(coefs_to_use, title_expression, lambda_value, 0, X, y, grid_range) } -# Display the grid of contour plots ridge_contours <- grid.arrange(grobs = plots_l2, ncol = 2, nrow = 2) -ggsave("../figure/ridge_contours.png", plot = ridge_contours, width =9, height = 6) +ggsave("../figure/reg_contours_02.png", plot = ridge_contours, width = 9, height = 6) +# DATA L1 ---------------------------------------------------------------------- -# Assuming lambdas_l1 contains your lambda values for LASSO lambdas_l1 <- c(0, 1, 2, 10) # glmnet requires a matrix for X and a vector for y @@ -166,14 +165,14 @@ lasso_models <- lapply(lambdas_l1, function(lambda) { }) coefs_l1 <- sapply(lasso_models, function(model) { - coef(model)[2:3,1] # Extracting only the relevant coefficients + coef(model)[2:3,1] }) -# Transpose and convert to data frame coefs_l1 <- t(coefs_l1) coefs_df_l1 <- as.data.frame(coefs_l1) names(coefs_df_l1) <- c("X1", "X2") +# PLOT L1 ---------------------------------------------------------------------- plots_l1 <- list() for (i in 1:length(lambdas_l1)) { @@ -191,10 +190,6 @@ for (i in 1:length(lambdas_l1)) { plots_l1[[i]] <- create_reg_contour_plot(coefs_to_use, title_expression, lambda_value, 1, X, y, grid_range) } -# Display the grid of LASSO contour plots lasso_contours <- grid.arrange(grobs = plots_l1, ncol = 2, nrow = 2) - -ggsave("../figure/lasso_contours.png", plot = lasso_contours, width =9, height = 6) - - \ No newline at end of file +ggsave("../figure/reg_contours_01.png", plot = lasso_contours, width =9, height = 6) diff --git a/slides/regularization/rsrc/reg_perspectives.py b/slides/regularization/rsrc/reg_perspectives.py deleted file mode 100644 index aafd0c34..00000000 --- a/slides/regularization/rsrc/reg_perspectives.py +++ /dev/null @@ -1,317 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.patches import Circle - -# Define the grid for plotting -x = np.linspace(-3.0, 3.0, 400) -y = np.linspace(-3.0, 3.0, 400) -X, Y = np.meshgrid(x, y) - -# Define the center of the objective function -objective_center = np.array([1.5, 1.5]) # Adjust as needed - -# Elliptical objective function with rotation -def rotated_elliptical_objective(X, Y, center, a, b, 
angle_deg): - """ Rotated elliptical objective function. """ - angle_rad = np.radians(angle_deg) - X_rot = np.cos(angle_rad) * (X - center[0]) - np.sin(angle_rad) * (Y - center[1]) - Y_rot = np.sin(angle_rad) * (X - center[0]) + np.cos(angle_rad) * (Y - center[1]) - return (X_rot**2 / a**2) + (Y_rot**2 / b**2) - -# Define elliptical parameters -a, b = 1.5, 0.75 # Semi-major and semi-minor axes lengths -rotation_angle = -30 # Rotation angle in degrees - -# Calculate rotated elliptical objective function values -Z_rotated_elliptical = rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) - -# Define the constraint circle for ridge regression (L2) -constraint_radius = 1.0 # Example radius - -def draw_plot(ax, contour_levels, last_plot=False): - # Plot contour lines around the objective center if any contour levels are provided - if contour_levels: - CS = ax.contour(X, Y, Z_rotated_elliptical, levels=contour_levels, colors='red', linewidths=0.5) - - # Plot the constraint circle - circle = Circle((0, 0), constraint_radius, color='blue', alpha=0.3, linestyle='--') - ax.add_artist(circle) - - # Plot the minimum point in all plots - ax.plot(objective_center[0], objective_center[1], 'o', color='black', markersize=4) - - # Set the same scale for both axes and set limits - ax.set_aspect('equal', 'box') - ax.set_xlim(-3, 3) - ax.set_ylim(-3, 3) - ax.axhline(0, color='black', linewidth=0.5) - ax.axvline(0, color='black', linewidth=0.5) - - # Define the legend elements - legend_elements = [plt.Line2D([0], [0], color='black', marker='o', linestyle='None', markersize=4, label=r'$\hat{\theta}$')] - - # Add the intersection point for the last plot - if last_plot: - # Calculate the intersection point - last_contour = CS.allsegs[-1][0] - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=4) - legend_elements.append(plt.Line2D([0], [0], color='green', marker='o', linestyle='None', markersize=4, label=r'$\hat{\theta}_{ridge}$')) - - # Add the legend to the top-left of the plot - ax.legend(handles=legend_elements, loc='upper left', fontsize='small', frameon=True, handletextpad=0.2, borderpad=0.1, labelspacing=0.1) - -# Rest of your plotting code remains the same - - -# Create contour levels -first_contour_level = 0.1 # Start with a small contour level -max_contour_level = (constraint_radius**2) * 0.6 # Largest contour level touching the circle - -# For each subsequent plot, we add one more contour level, increasing the value -contour_levels_for_plots = [ - [], # No contour for the first plot - [first_contour_level], # One small contour for the second plot - [first_contour_level, first_contour_level * 3], # Two contours for the third plot - [first_contour_level, first_contour_level * 3, max_contour_level] # Three contours for the last plot -] - -# Create a 2x2 grid of plots -fig, axs = plt.subplots(2, 2, figsize=(6, 6), dpi=120) - -# Plot for each subplot in the 2x2 grid -for i, ax in enumerate(axs.flatten()): - last_plot = i == len(axs.flatten()) - 1 # Check if it's the last plot - draw_plot(ax, contour_levels_for_plots[i], last_plot) - -# Adjust layout to prevent overlapping -plt.tight_layout() -plt.show() - - -def create_diamond(ax, constraint_radius): - """Create and add a diamond shape for L1 regularization.""" - diamond = plt.Polygon([[-constraint_radius, 0], [0, 
constraint_radius], [constraint_radius, 0], [0, -constraint_radius]], - closed=True, color='blue', alpha=0.3, linestyle='--') - ax.add_patch(diamond) - - -# Elliptical objective function with rotation -def rotated_elliptical_objective(X, Y, center, a, b, angle_deg): - """ Rotated elliptical objective function. """ - angle_rad = np.radians(angle_deg) - X_rot = np.cos(angle_rad) * (X - center[0]) - np.sin(angle_rad) * (Y - center[1]) - Y_rot = np.sin(angle_rad) * (X - center[0]) + np.cos(angle_rad) * (Y - center[1]) - return (X_rot**2 / a**2) + (Y_rot**2 / b**2) - -# Define elliptical parameters -a, b = 1.5, 0.75 # Semi-major and semi-minor axes lengths -rotation_angle = -10 # Rotation angle in degrees - -# Calculate rotated elliptical objective function values -Z_rotated_elliptical = rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) - -# Define the constraint circle for ridge regression (L2) -constraint_radius = 1.0 # Example radius - -def draw_plot(ax, contour_levels, last_plot=False): - # Plot contour lines around the objective center if any contour levels are provided - if contour_levels: - CS = ax.contour(X, Y, Z_rotated_elliptical, levels=contour_levels, colors='red', linewidths=0.5) - - # Plot the diamond shape for L1 regularization - create_diamond(ax, constraint_radius) - - # Plot the minimum point in all plots - ax.plot(objective_center[0], objective_center[1], 'o', color='black', markersize=4) - - # Set the same scale for both axes and set limits - ax.set_aspect('equal', 'box') - ax.set_xlim(-3, 3) - ax.set_ylim(-3, 3) - ax.axhline(0, color='black', linewidth=0.5) - ax.axvline(0, color='black', linewidth=0.5) - - # Define the legend elements - legend_elements = [plt.Line2D([0], [0], color='black', marker='o', linestyle='None', markersize=4, label=r'$\hat{\theta}$')] - - # Add the intersection point for the last plot - # Add the intersection point for the last plot - if last_plot: - # Calculate the intersection point - last_contour = CS.allsegs[-1][0] - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=4) - legend_elements.append(plt.Line2D([0], [0], color='green', marker='o', linestyle='None', markersize=4, label=r'$\hat{\theta}_{lasso}$')) - - # Add the legend to the top-left of the plot - ax.legend(handles=legend_elements, loc='upper left', fontsize='small', frameon=True, handletextpad=0.2, borderpad=0.1, labelspacing=0.1) - -# Create contour levels -first_contour_level = 0.1 # Start with a small contour level -max_contour_level = (constraint_radius**2) * 1.17 # Largest contour level touching the circle - -# For each subsequent plot, we add one more contour level, increasing the value -contour_levels_for_plots = [ - [], # No contour for the first plot - [first_contour_level], # One small contour for the second plot - [first_contour_level, first_contour_level * 4], # Two contours for the third plot - [first_contour_level, first_contour_level * 4, max_contour_level] # Three contours for the last plot -] - -# Create a 2x2 grid of plots -fig, axs = plt.subplots(2, 2, figsize=(6, 6), dpi=120) - -# Plot for each subplot in the 2x2 grid -for i, ax in enumerate(axs.flatten()): - last_plot = i == len(axs.flatten()) - 1 # Check if it's the last plot - draw_plot(ax, contour_levels_for_plots[i], last_plot) - -# Adjust layout to prevent overlapping 
-plt.tight_layout() -plt.show() - - - # Define the center of the objective function and elliptical parameters -objective_center = np.array([1.5, 1.5]) -a, b = 1.5, 0.75 -rotation_angle = -30 - -# Elliptical objective function -def rotated_elliptical_objective(X, Y, center, a, b, angle_deg): - angle_rad = np.radians(angle_deg) - X_rot = np.cos(angle_rad) * (X - center[0]) - np.sin(angle_rad) * (Y - center[1]) - Y_rot = np.sin(angle_rad) * (X - center[0]) + np.cos(angle_rad) * (Y - center[1]) - return (X_rot**2 / a**2) + (Y_rot**2 / b**2) - -Z_rotated_elliptical = rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) - -# Define the constraint circle for ridge regression (L2) -constraint_radius = 1.0 - -# Define contour levels in increasing order -contour_levels = [(constraint_radius**2) * 0.6, (constraint_radius**2) * 1.2, (constraint_radius**2) * 2.4] - -def draw_plot(ax, plot_index, last_plot=False): - # Plot all contours - CS = ax.contour(X, Y, Z_rotated_elliptical, levels=contour_levels, colors='red', linewidths=0.5) - - # Control visibility of contours based on plot index - for i, contour in enumerate(CS.collections): - contour.set_visible(i >= plot_index) - - # Plot the constraint circle - circle = Circle((0, 0), constraint_radius, color='blue', alpha=0.3, linestyle='--') - ax.add_artist(circle) - - # Plot the minimum point and set limits - min_point_handle, = ax.plot(objective_center[0], objective_center[1], 'o', color='black', markersize=4) - ax.set_aspect('equal', 'box') - ax.set_xlim(-3, 3) - ax.set_ylim(-3, 3) - - # Draw coordinate axes - ax.axhline(0, color='black', linewidth=0.5) - ax.axvline(0, color='black', linewidth=0.5) - - # Add legend for the minimum point - if last_plot: - # Calculate intersection point for the last plot using the last contour segment (smallest contour) - last_contour = CS.allsegs[-3][0] # Use the last contour segment - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ridge_point_handle, = ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=4) - ax.legend([min_point_handle, ridge_point_handle], [r'$\hat{\theta}$', r'$\hat{\theta}_{ridge}$'], loc='upper left', fontsize='small', frameon=True) - else: - ax.legend([min_point_handle], [r'$\hat{\theta}$'], loc='upper left', fontsize='small', frameon=True) - - -# Create a 2x2 grid of plots -fig, axs = plt.subplots(2, 2, figsize=(6, 6), dpi=120) - -for i, ax in enumerate(axs.flatten()): - last_plot = i == len(axs.flatten()) - 1 - draw_plot(ax, 3 - i, last_plot) # Reverse the order of plots - -plt.tight_layout() -plt.show() - - -# Elliptical objective function with rotation -def rotated_elliptical_objective(X, Y, center, a, b, angle_deg): - """ Rotated elliptical objective function. 
""" - angle_rad = np.radians(angle_deg) - X_rot = np.cos(angle_rad) * (X - center[0]) - np.sin(angle_rad) * (Y - center[1]) - Y_rot = np.sin(angle_rad) * (X - center[0]) + np.cos(angle_rad) * (Y - center[1]) - return (X_rot**2 / a**2) + (Y_rot**2 / b**2) - -# Define elliptical parameters -a, b = 1.5, 0.75 # Semi-major and semi-minor axes lengths -rotation_angle = -30 # Rotation angle in degrees - -# Calculate rotated elliptical objective function values -Z_rotated_elliptical = rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) - -# Define the constraint circle for ridge regression (L2) -constraint_radius = 1.0 # Example radius -constraint_radius_large = 1.33 # Larger radius for comparison - -# Create contour levels -contour_levels = [0.1, 0.3, 0.6] # Example contour levels - -# Create a 2x1 grid of plots -fig, axs = plt.subplots(1, 2, figsize=(12, 6), dpi=100) - -def draw_plot(ax, constraint_radius, contour_levels): - # Plot contour lines around the objective center - CS = ax.contour(X, Y, Z_rotated_elliptical, levels=contour_levels, colors='red', linewidths=0.5) - - # Plot the constraint circle - circle = Circle((0, 0), constraint_radius, color='blue', alpha=0.3, linestyle='--') - ax.add_artist(circle) - - # Plot the minimum point - ax.plot(objective_center[0], objective_center[1], 'o', color='black', markersize=6) - - # Set the same scale for both axes and set limits - ax.set_aspect('equal', 'box') - ax.set_xlim(-3, 3) - ax.set_ylim(-3, 3) - ax.axhline(0, color='black', linewidth=0.5) - ax.axvline(0, color='black', linewidth=0.5) - - # Define the legend elements - legend_elements = [ - plt.Line2D([0], [0], marker='o', color='black', markersize=6, label=r'$\hat{\theta}$', linestyle='None') - ] - - # Calculate and plot the intersection point for the second contour and larger circle if needed - if constraint_radius == constraint_radius_large: - last_contour = CS.allsegs[1][0] # Use the second contour for intersection - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=6) - legend_elements.append(plt.Line2D([0], [0], color='green', marker='o', linestyle='None', markersize=6, label=r'$\hat{\theta}_{ridge}$')) - else: - last_contour = CS.allsegs[2][0] # Use the second contour for intersection - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=6) - legend_elements.append(plt.Line2D([0], [0], color='green', marker='o', linestyle='None', markersize=6, label=r'$\hat{\theta}_{ridge}$')) - - # Add the legend - ax.legend(handles=legend_elements, loc='upper left', fontsize='large', frameon=True, handletextpad=0.4, borderpad=0.1, labelspacing=0.1) - -# Draw plots -draw_plot(axs[0], constraint_radius, contour_levels) -draw_plot(axs[1], constraint_radius_large, contour_levels) - -plt.tight_layout() -plt.show() \ No newline at end of file diff --git a/slides/regularization/rsrc/reg_surfaces.R b/slides/regularization/rsrc/reg_surfaces.R new file mode 100755 index 00000000..3c073091 --- /dev/null +++ b/slides/regularization/rsrc/reg_surfaces.R @@ -0,0 +1,92 @@ +# ------------------------------------------------------------------------------ +# l1, l2 + +# FIG: 
plot 3D regression surfaces with two coefficients +# under different regularization constants (lambda 0, 1, 10) +# using l1 and l2 regularization. + +# DATA: y(500*1) = -0.5 * x1(~Unif(-1,1)) + +# 3 * x2(~Unif(-1,1)) + epsilon(~Norm(0,0.1)). + +# ENV: to use vistool, build a virtual env in Miniconda 3: +# conda create -n r-reticulate python=3.12 +# conda activate r-reticulate +# conda install -c plotly plotly-orca python-kaleido +# conda deactivate +# ------------------------------------------------------------------------------ + +library(vistool) +library(plotly) +set.seed(0) + +#library(plotly) +#use_condaenv("r-reticulate", required = TRUE) +#py_config() + +# DATA ------------------------------------------------------------------------- + +n <- 500 +x1 <- runif(n, -1, 1) +x2 <- runif(n, -1, 1) +epsilon <- rnorm(n, 0, 0.1) +y <- -0.5 * x1 + 3 * x2 + epsilon + +# Regularization Norm Functions +l1_norm <- function(beta1, beta2) { + return(abs(beta1) + abs(beta2)) +} + +l2_norm_squared <- function(beta1, beta2) { + return(beta1^2 + beta2^2) +} + +# Regularized least-squares objective (with 1/n factor) +updated_objective <- function(x, x1, x2, y, lam, regularization) { + # x: the coefficient vector; vistool's Objective expects the argument name x + residuals <- y - x[1] * x1 - x[2] * x2 + error_term <- sum(residuals^2) / n + if (regularization == 'l1') { + penalty <- l1_norm(x[1], x[2]) + } else if (regularization == 'l2') { + penalty <- l2_norm_squared(x[1], x[2]) + } + return(error_term + lam * penalty) +} + +# PLOT ------------------------------------------------------------------------- + +regularizations <- c('l1', 'l2') +lambdas <- c(0, 1, 10) + +for (reg in regularizations) { + for (lam in lambdas) { + obj_lm = Objective$new(id = "reg surfaces", fun = updated_objective, xdim = 2, + x1 = x1, x2 = x2, y = y, + lam = lam, regularization = reg, minimize = TRUE) + viz_lm = as_visualizer(obj_lm, x1_limits = c(-10, 10), x2_limits = c(-10, 10)) + result <- optim(c(0, 0), updated_objective, x1 = x1, x2 = x2, y = y, lam = lam, + regularization = reg, method = 'L-BFGS-B') + plot_obj <- viz_lm$plot() + plot_obj <- plot_obj %>% + layout( + title = paste("Regularization:",reg,"λ:", as.character(lam)), + scene = list( + xaxis = list(title = "β1"), + yaxis = list(title = "β2"), + zaxis = list(title = "Objective") + ) + ) %>% + add_trace( + type = "scatter3d", + mode = "markers", + x = result$par[1], # beta1 + y = result$par[2], # beta2 + z = result$value, # objective value + marker = list(color = 'red', size = 3), + name = "Minimum Point" + ) + savename = paste0("../figure/reg_surfaces_", reg, "_lam", as.character(lam),".png") + save_image(plot_obj, savename, engine = "kaleido", width = 600, height = 500) + } +} + diff --git a/slides/regularization/rsrc/regu_example_1.R b/slides/regularization/rsrc/regu_example_1.R deleted file mode 100644 index 96a4ecc0..00000000 --- a/slides/regularization/rsrc/regu_example_1.R +++ /dev/null @@ -1,43 +0,0 @@ -library(mlr) -library(BBmisc) -library(data.table) - -set.seed(123) - -task = bh.task -task = dropFeatures(task, c("chas", "nox", "rm")) -featnames = getTaskFeatureNames(task) - -compute_coef_paths = function(task, lambda_name, lambda_seq) { - lrn = makeLearner("regr.penalized", trace = FALSE, lambda1 = 0, lambda2 = 0) - path = list() - for (i in seq_along(lambda_seq)) { - lamval = lambda_seq[[i]] - pv = namedList(lambda_name, lamval) - lrn2 = setHyperPars(lrn, par.vals = pv) - m1 = train(lrn2, task) - mm1 = getLearnerModel(m1) - cc =
coefficients(mm1) - cc = as.list(cc) - cc$lambda = lamval - path[[i]] = cc - } - path = rbindlist(path, fill = TRUE) - path[is.na(path)] = 0 - ps = makeParamSet( - makeDiscreteParam(id = lambda_name, values = lambda_seq) - ) - ctrl = makeTuneControlGrid() - tr = tuneParams(lrn, task, cv3, par.set = ps, control = ctrl, show.info = FALSE) - cv_lam = as.data.frame(tr$opt.path)[, c(lambda_name, "mse.test.mean")] - colnames(cv_lam) = c("lambda", "mse") - cv_lam$lambda = as.numeric(as.character(cv_lam$lambda)) - list(path = path, cv_lam = cv_lam) -} - -lambda_seq = 2^seq(-10, 20, length.out = 50) -path_l1 = compute_coef_paths(task, "lambda1", lambda_seq) -path_l2 = compute_coef_paths(task, "lambda2", lambda_seq) - -save2("regu_example_1.RData", path_l1 = path_l1, path_l2 = path_l2, featnames = featnames, lambda_seq = lambda_seq) - diff --git a/slides/regularization/rsrc/regu_example_1.RData b/slides/regularization/rsrc/regu_example_1.RData deleted file mode 100644 index db2c210b..00000000 Binary files a/slides/regularization/rsrc/regu_example_1.RData and /dev/null differ diff --git a/slides/regularization/rsrc/regu_example_2.R b/slides/regularization/rsrc/regu_example_2.R deleted file mode 100644 index 1ce02f71..00000000 --- a/slides/regularization/rsrc/regu_example_2.R +++ /dev/null @@ -1,59 +0,0 @@ -library(mlr) -library(pensim) -library(ggplot2) -library(gridExtra) -library(MASS) - -set.seed(19873) -n <- 100 # Number of observations -p <- 50 # Number of predictors included in model -CovMatrix <- outer(1:p, 1:p, function(x,y) {.7^abs(x-y)}) -x <- mvrnorm(n, rep(0,p), CovMatrix) -y <- 10 * apply(x[, 1:2], 1, sum) + - 5 * apply(x[, 3:4], 1, sum) + - apply(x[, 5:14], 1, sum) + - rnorm(n) - - -dd = as.data.frame(x) -dd$y = y -task = makeRegrTask(data = dd, target = "y") - - -get_pen_coefs = function(task, alpha, lam) { - featnames = getTaskFeatureNames(task) - lrn = makeLearner("regr.glmnet", alpha = alpha, lambda = lam) - m = train(lrn, task) - mm = getLearnerModel(m) - cc1 = as.matrix(coef(mm))[,1] - return(abs(cc1)) -} - -compute_cv = function(task, alpha, lambda_seq) { - lrn = makeLearner("regr.glmnet", alpha = alpha) - ps = makeParamSet( - makeDiscreteParam("lambda", values = lambda_seq) - ) - ctrl = makeTuneControlGrid() - tr = tuneParams(lrn, task, cv3, par.set = ps, control = ctrl, show.info = FALSE) - cv_lam = as.data.frame(tr$opt.path)[, c("lambda", "mse.test.mean")] - colnames(cv_lam) = c("lambda", "mse") - cv_lam$lambda = as.numeric(as.character(cv_lam$lambda)) - cv_lam -} - -lams = c(0.01, 100) -cc_l2_1 = get_pen_coefs(task, alpha = 0, lam = lams[1]) -cc_l2_2 = get_pen_coefs(task, alpha = 0, lam = lams[2]) -cc_l1_1 = get_pen_coefs(task, alpha = 1, lam = lams[1]) -cc_l1_2 = get_pen_coefs(task, alpha = 1, lam = lams[2]) - - -lambda_seq = 2^seq(-20, 1, length.out = 50) -cv_l1 = compute_cv(task, alpha = 1, lambda_seq) -cv_l2 = compute_cv(task, alpha = 0, lambda_seq) - -save2("regu_example_2.RData", lams, lambda_seq, - cc_l2_1, cc_l2_2, cc_l1_1, cc_l1_2, - cv_l1, cv_l2) - diff --git a/slides/regularization/rsrc/regu_example_2.RData b/slides/regularization/rsrc/regu_example_2.RData deleted file mode 100644 index 39155db3..00000000 Binary files a/slides/regularization/rsrc/regu_example_2.RData and /dev/null differ diff --git a/slides/regularization/rsrc/ridge_perspectives.R b/slides/regularization/rsrc/ridge_perspectives.R new file mode 100755 index 00000000..97c321a7 --- /dev/null +++ b/slides/regularization/rsrc/ridge_perspectives.R @@ -0,0 +1,273 @@ +# 
------------------------------------------------------------------------------ +# l2, nonlin + +# FIG: schematic diagrams of ridge regularization +# (1) increase objective function until the constraints are met +# (2) optimize the objective function till optimum under constraints +# (3) different strength of ridge constraint +# (4) single schematic diagram +# ------------------------------------------------------------------------------ + +library(ggplot2) +library(grid) +library(dplyr) +library(gridExtra) +library(pracma) + +# DATA ------------------------------------------------------------------------- + +# Define the grid for plotting +x <- seq(-3.0, 3.0, length.out = 400) +y <- seq(-3.0, 3.0, length.out = 400) +X <- outer(rep(1, length(x)), y) +Y <- outer(x, rep(1, length(y))) + +# Define elliptical parameters +a <- 1.5 +b <- 0.75 +rotation_angle <- -30 +constraint_radius <- 1.0 +objective_center <- c(1.5, 1.5) + +# Rotated elliptical objective function +rotated_elliptical_objective <- function(X, Y, center, a, b, angle_deg) { + angle_rad <- deg2rad(angle_deg) + X_rot <- cos(angle_rad) * (X - center[1]) - sin(angle_rad) * (Y - center[2]) + Y_rot <- sin(angle_rad) * (X - center[1]) + cos(angle_rad) * (Y - center[2]) + (X_rot^2 / a^2) + (Y_rot^2 / b^2) +} + +Z_rotated_elliptical <- rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) + +# Create data frame for ggplot +data <- data.frame( + x = as.vector(X), + y = as.vector(Y), + z = as.vector(Z_rotated_elliptical) +) + +# Function to create plots +create_plot <- function(data, levels, objective_center, constraint_radius, type, last_plot = FALSE){ + #type: outside / inside + p <- ggplot() + if(length(levels)!=0){ + p <- p + + geom_contour(data = data, aes(x = x, y = y, z = z), color = 'red', breaks = levels) + if (last_plot){ + plot_build <- ggplot_build(p) + plot_data <- plot_build$data[[1]] + level_value <- ifelse(type=="outside", max(levels), min(levels)) + filtered_data <- plot_data[plot_data$level == level_value, c("x","y")] + distances <- sqrt(filtered_data$x^2 + filtered_data$y^2) + min_idx <- which.min(abs(distances - constraint_radius)) + intersection_point <- filtered_data[min_idx,] + p <- p + + geom_point(aes(x = intersection_point[[1]], y = intersection_point[[2]]), color = 'green', size = 2) + + annotate("label", x = intersection_point[[1]] - 0.5, y = intersection_point[[2]] + 0.6, label = expression(hat(theta)[ridge]), color = "green", size = 3) + } + } + + # Create data for the circle + theta <- seq(0, 2 * pi, length.out = 100) + center <- c(0, 0) + circle_data <- data.frame( + x = center[1] + constraint_radius * cos(theta), + y = center[2] + constraint_radius * sin(theta) + ) + + # Plot the circle with dashed lines and blue color + p <- p + + geom_path(data = circle_data, aes(x = x, y = y), color = 'blue', linetype = 'dashed', size = 0.5, alpha = 0.3) + + geom_polygon(data = circle_data, aes(x = x, y = y), fill = 'blue', alpha = 0.3) # Fill the circle with blue color and alpha 0.3 + + p <- p + + geom_point(aes(x = objective_center[1], y = objective_center[2]), color = "black", size = 2) + + annotate("label", x = objective_center[1], y = objective_center[2]+0.8, label = expression(hat(theta)), color = "black", size = 3) + + geom_hline(yintercept = 0, color = 'black', size = 0.5) + + geom_vline(xintercept = 0, color = 'black', size = 0.5) + + theme_linedraw() + + theme( + panel.grid = element_blank(), + axis.title = element_blank(), + plot.title = element_blank() + ) + + coord_fixed(xlim = c(-3, 3), 
ylim = c(-3, 3), expand = FALSE) + return(p) +} + +# PLOT 1: outside -------------------------------------------------------------- +# increase objective function until the constraints are met + +contour_levels <- list( + c(), + c(0.1), + c(0.1, 0.3), + c(0.1, 0.3, 0.6) +) + +plots_out <- lapply(1:4, function(i) { + create_plot(data, contour_levels[[i]], objective_center, constraint_radius, type="outside", last_plot = (i == 4)) +}) + +p_outside <- grid.arrange(grobs = plots_out, nrow = 2, ncol = 2) + +# PLOT 2: inside --------------------------------------------------------------- +# optimize the objective function till optimum under constraints + +contour_levels <- list( + c(), + c(2.4), + c(1.2, 2.4), + c(0.6, 1.2, 2.4) +) + +# Generate plots +plots_in <- lapply(1:4, function(i) { + create_plot(data, contour_levels[[i]], objective_center, constraint_radius, type="inside", last_plot = (i == 4)) +}) + +# Arrange plots in a 2x2 grid +p_inside <- grid.arrange(grobs = plots_in, nrow = 2, ncol = 2) + +# PLOT 3: constraints ---------------------------------------------------------- +# different strength of ridge constraint + +contour_levels <- c(0.1, 0.3, 0.6) + +# p1 +p1 <- ggplot() + + geom_contour(data = data, aes(x = x, y = y, z = z), color = 'red', breaks = contour_levels) + +plot_build_1 <- ggplot_build(p1) +plot_data_1 <- plot_build_1$data[[1]] +filtered_data_1 <- plot_data_1[plot_data_1$level == 0.6, c("x","y")] +distances_1 <- sqrt(filtered_data_1$x^2 + filtered_data_1$y^2) +min_idx_1 <- which.min(abs(distances_1 - constraint_radius)) +intersection_point_1 <- filtered_data_1[min_idx_1,] + +p1 <- p1 + + geom_point(aes(x = intersection_point_1[[1]], y = intersection_point_1[[2]]), color = 'green', size = 2) + + annotate("label", x = intersection_point_1[[1]] - 0.5, y = intersection_point_1[[2]] + 0.6, label = expression(hat(theta)[ridge]), color = "green", size = 3) + +theta <- seq(0, 2 * pi, length.out = 100) +center <- c(0, 0) +circle_data_1 <- data.frame( + x = center[1] + cos(theta), + y = center[2] + sin(theta) +) + +p1 <- p1 + + geom_path(data = circle_data_1, aes(x = x, y = y), color = 'blue', linetype = 'dashed', size = 0.5, alpha = 0.3) + + geom_polygon(data = circle_data_1, aes(x = x, y = y), fill = 'blue', alpha = 0.3) + + geom_point(aes(x = objective_center[1], y = objective_center[2]), color = "black", size = 2) + + annotate("label", x = objective_center[1], y = objective_center[2]+0.8, label = expression(hat(theta)), color = "black", size = 3) + + geom_hline(yintercept = 0, color = 'black', size = 0.5) + + geom_vline(xintercept = 0, color = 'black', size = 0.5) + + theme_linedraw() + + theme( + panel.grid = element_blank(), + axis.title = element_blank(), + plot.title = element_blank() + ) + + coord_fixed(xlim = c(-3, 3), ylim = c(-3, 3), expand = FALSE) + +# p2 +constraint_radius <- 1.33 +level_value <- 0.3 + +p2 <- ggplot() + + geom_contour(data = data, aes(x = x, y = y, z = z), color = 'red', breaks = contour_levels) + +plot_build <- ggplot_build(p2) +plot_data <- plot_build$data[[1]] +filtered_data <- plot_data[plot_data$level == level_value, c("x","y")] +distances <- sqrt(filtered_data$x^2 + filtered_data$y^2) +min_idx <- which.min(abs(distances - constraint_radius)) +intersection_point <- filtered_data[min_idx,] + +p2 <- p2 + + geom_point(aes(x = intersection_point[[1]], y = intersection_point[[2]]), color = 'green', size = 2) + + annotate("label", x = intersection_point[[1]] - 0.5, y = intersection_point[[2]] + 0.6, label = expression(hat(theta)[ridge]), color = 
"green", size = 3) + +theta <- seq(0, 2 * pi, length.out = 100) +center <- c(0, 0) +circle_data <- data.frame( + x = center[1] + constraint_radius * cos(theta), + y = center[2] + constraint_radius * sin(theta) +) + + +p2 <- p2 + + geom_path(data = circle_data, aes(x = x, y = y), color = 'blue', linetype = 'dashed', size = 0.5, alpha = 0.3) + + geom_polygon(data = circle_data, aes(x = x, y = y), fill = 'blue', alpha = 0.3) + + geom_point(aes(x = objective_center[1], y = objective_center[2]), color = "black", size = 2) + + annotate("label", x = objective_center[1], y = objective_center[2]+0.8, label = expression(hat(theta)), color = "black", size = 3) + + geom_hline(yintercept = 0, color = 'black', size = 0.5) + + geom_vline(xintercept = 0, color = 'black', size = 0.5) + + theme_linedraw() + + theme( + panel.grid = element_blank(), + axis.title = element_blank(), + plot.title = element_blank() + ) + + coord_fixed(xlim = c(-3, 3), ylim = c(-3, 3), expand = FALSE) + +# Arrange plots in a 1x2 grid +p_cons <- grid.arrange(p1, p2, nrow = 1, ncol = 2) + +# PLOT 4: single schematic plot ------------------------------------------------ + +constraint_radius <- 1 +contour_levels <- c(0.1, 0.3, 0.6) +level_value <- 0.6 + +p <- ggplot() + + geom_contour(data = data, aes(x = x, y = y, z = z), color = 'red', breaks = contour_levels) + +plot_build <- ggplot_build(p) +plot_data <- plot_build$data[[1]] +filtered_data <- plot_data[plot_data$level == level_value, c("x","y")] +distances <- sqrt(filtered_data$x^2 + filtered_data$y^2) +min_idx <- which.min(abs(distances - constraint_radius)) +intersection_point <- filtered_data[min_idx,] + +p <- p + + geom_point(aes(x = intersection_point[[1]], y = intersection_point[[2]]), color = 'green', size = 2) + + annotate("label", x = intersection_point[[1]] - 0.4, y = intersection_point[[2]] + 0.4, label = expression(hat(theta)[ridge]), color = "green", size = 3) +theta <- seq(0, 2 * pi, length.out = 100) +center <- c(0, 0) +circle_datas <- data.frame( + x1 = center[1] + constraint_radius * cos(theta), + y1 = center[2] + constraint_radius * sin(theta), + x2 = center[1] + (constraint_radius / 1.5)* cos(theta), + y2 = center[2] + (constraint_radius / 1.5) * sin(theta), + x3 = center[1] + (constraint_radius / 3) * cos(theta), + y3 = center[2] + (constraint_radius / 3) * sin(theta) +) + +p <- p + + geom_polygon(data = circle_datas, aes(x = x1, y = y1), fill = 'blue', alpha = 0.3) + + geom_polygon(data = circle_datas, aes(x = x2, y = y2), fill = 'blue', alpha = 0.5) + + geom_polygon(data = circle_datas, aes(x = x3, y = y3), fill = 'blue', alpha = 0.7) + + geom_point(aes(x = objective_center[1], y = objective_center[2]), color = "black", size = 2) + + annotate("label", x = objective_center[1], y = objective_center[2]+0.8, label = expression(hat(theta)), color = "black", size = 3) + + geom_hline(yintercept = 0, color = 'black', size = 0.5) + + geom_vline(xintercept = 0, color = 'black', size = 0.5) + + geom_segment(aes(x = 0, y = -1.5, xend = 0, yend = 3), color = 'black', + arrow = arrow(length = unit(0.2, "cm"), ends = "last", type = "closed")) + # y-axis with arrow + annotate("text", x = -0.2, y = 2.8, label = expression(theta[2]), color = "black", size = 3) + + geom_segment(aes(x = -1.5, y = 0, xend = 3, yend = 0), color = 'black', + arrow = arrow(length = unit(0.2, "cm"), ends = "last", type = "closed")) + # x-axis with arrow + annotate("text", x = 2.8, y = -0.2, label = expression(theta[1]), color = "black", size = 3) + + theme_void() + + theme( + panel.grid = 
element_blank(), + axis.title = element_blank(), + plot.title = element_blank() + ) + + coord_fixed(xlim = c(-1.5, 3), ylim = c(-1.5, 3), expand = FALSE) + +ggsave(filename = "../figure/ridge_perspectives_01.png", plot = p_outside, width = 6, height = 6) +ggsave(filename = "../figure/ridge_perspectives_02.png", plot = p_inside, width = 6, height = 6) +ggsave(filename = "../figure/ridge_perspectives_03.png", plot = p_cons, width = 6, height = 3) +ggsave(filename = "../figure/ridge_perspectives_04.png", plot = p, width = 3, height = 3) diff --git a/slides/regularization/rsrc/ridge_polynomial_reg.R b/slides/regularization/rsrc/ridge_polynomial_reg.R deleted file mode 100644 index 8ce4659b..00000000 --- a/slides/regularization/rsrc/ridge_polynomial_reg.R +++ /dev/null @@ -1,74 +0,0 @@ -betaRidge <- function (X, y, lambda) -{ - return (solve(t(X) %*% X + lambda * diag(ncol(X))) %*% (t(X) %*% y)) -} - -baseTrafo <- function (x, degree) -{ - out <- cbind(1, x) - for (i in seq_len(degree)[-1]) { - out <- cbind(out, x^i) - } - # poly() is not suitable here - return (out) -} - -getPolyData <- function(x, y, lambda.vec, base.trafo, ...) -{ - X <- base.trafo(x, ...) - - x.pred <- seq(min(x), max(x), length.out = 500) - X.pred <- base.trafo(x.pred, ...) - - df.truth <- data.frame(feature = x, truth = y) - - # browser() - - df.betas <- matrix(NA, nrow=length(lambda.vec), ncol=ncol(X)) - row.names(df.betas) <- lambda.vec - - for(i in 1:length(lambda.vec)){ - df.betas[i,] <- betaRidge(X, y, lambda.vec[i]) - } - - df.polys <- lapply(1:length(lambda.vec), function (i) { - return (data.frame( - feature = x.pred, - pred = X.pred %*% df.betas[i,], - lambda = row.names(df.betas)[i] - )) - }) - return (list(polys = df.polys, - truth = df.truth, - betas = df.betas)) -} - -plotRidge <- function (x, y, lambda.vec, base.trafo, ...) -{ - requireNamespace("ggplot2") - - # browser() - - res <- getPolyData(x, y, lambda.vec, base.trafo, ...) - df.polys <- res$polys - df.truth <- res$truth - - plot.df <- df.polys[[1]] - for (i in seq_along(df.polys)[-1]) { - plot.df <- rbind(plot.df, df.polys[[i]]) - } - plot.df$lambda <- as.factor(plot.df$lambda) - - gg <- ggplot2::ggplot() - if (length(lambda.vec) == 1) { - gg <- gg + ggplot2::geom_line(data = plot.df, aes(x = feature, y = pred, color = lambda), show.legend = FALSE) - } else { - gg <- gg + ggplot2::geom_line(data = plot.df, aes(x = feature, y = pred, color = lambda)) - } - - return ( - gg + - ggplot2::geom_point(data = df.truth, mapping = aes(x = feature, y = truth)) - ) -} - diff --git a/slides/regularization/rsrc/ridge_vs_sgd_path.R b/slides/regularization/rsrc/ridge_vs_sgd_path.R new file mode 100755 index 00000000..c759001c --- /dev/null +++ b/slides/regularization/rsrc/ridge_vs_sgd_path.R @@ -0,0 +1,105 @@ +# ------------------------------------------------------------------------------ +# early stopping + +# FIG: +# LEFT: how coefficients of a linear model change +# with the regularization constant (lambda) for ridge regression. +# RIGHT: how coefficients of a linear model change with iterations for SGD. + +# DATA: linear regression model data generated by +# y = X(100*10 ~Normal)·true_coef(10*1) + noise(100*1 ~Normal).
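# Why the two panels line up: for a quadratic objective, t steps of
# (stochastic) gradient descent started at 0 with learning rate lr act
# approximately like ridge regression with
#   lambda ~ 1 / (lr * t),
# the classical early-stopping / L2 correspondence (cf. Goodfellow et al.,
# Deep Learning, ch. 7.8). This is the mapping used below via
# alphas <- 1 / (learning_rate * t_values).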
+# ------------------------------------------------------------------------------ + +library(ggplot2) +library(dplyr) +library(tidyr) +library(gridExtra) + +set.seed(6) + +# DATA ------------------------------------------------------------------------- + +# generate data for design matrix, response variable, +# and true coefficients for a linear model +# with n samples, p features and no intercept. +generate_data <- function(n, p) { + X <- matrix(rnorm(n * p), nrow = n, ncol = p) + true_coef <- seq(-1, 1, length.out = p) + noise <- rnorm(n) + y <- X %*% true_coef + noise + return(list(X = X, y = y, true_coef = true_coef)) +} + +# compute the ridge coefficients analytically +compute_ridge_path <- function(X, y, alphas) { + coefs <- matrix(0, nrow = 1, ncol = ncol(X)) + for (i in 1:length(alphas)) { + ridge_coefs <- solve(t(X) %*% X + alphas[i] * diag(ncol(X))) %*% t(X) %*% y + coefs <- rbind(coefs, as.vector(ridge_coefs)) + } + return(coefs) +} + +# compute the optimization trajectory for SGD +compute_sgd_trajectory <- function(X, y, batch_size, learning_rate, n_iter) { + w <- rep(0, ncol(X)) + coefs <- matrix(0, nrow = 1, ncol = ncol(X)) + for (i in 1:n_iter) { + indices <- sample(1:nrow(X), replace = FALSE) + for (j in seq(1, nrow(X), batch_size)) { + indices_batch <- indices[j:min(j + batch_size - 1, nrow(X))] + X_batch <- X[indices_batch, ] + y_batch <- y[indices_batch] + gradient <- -2 * t(X_batch) %*% (y_batch - X_batch %*% w) / batch_size + w <- w - learning_rate * gradient + } + coefs <- rbind(coefs, as.vector(w)) + } + return(coefs) +} + +n <- 100 +p <- 10 +batch_size <- 4 +learning_rate <- 0.01 +n_iter <- 50 +t_values <- seq(0.001, n_iter + 1, by = 1) # Include 0 in t_values for the zero coefficients +alphas <- 1 / (learning_rate * t_values[1:length(t_values)]) # Exclude 0 to avoid division by zero + +data <- generate_data(n, p) +X <- data$X +y <- data$y +true_coef <- data$true_coef + +ridge_coefs <- compute_ridge_path(X, y, alphas) + +sgd_coefs <- compute_sgd_trajectory(X, y, batch_size, learning_rate, n_iter) + +# PLOT ------------------------------------------------------------------------- + +# Ridge path +inv_alphas <- 1/alphas +df_ridge <- data.frame(inv_alphas, ridge_coefs[-1,]) + +df_ridge_long <- df_ridge %>% + pivot_longer(cols = starts_with("X"), names_to = "line", values_to = "value") + +p1 <- ggplot(df_ridge_long, aes(x = inv_alphas, y = value, color = line)) + + geom_line(show.legend = FALSE) + + labs(title = "Ridge Regression Path", x = expression("1 / ( lr *"~lambda~")"), y = "Parameters") + + theme_minimal() + +# SGD path +df_SGD <- data.frame(t_values, sgd_coefs) + +df_SGD_long <- df_SGD %>% + pivot_longer(cols = starts_with("X"), names_to = "line", values_to = "value") + +p2 <- ggplot(df_SGD_long, aes(x = t_values, y = value, color = line)) + + geom_line(show.legend = FALSE) + + labs(title = "SGD Trajectory", x = "Iterations", y = "Parameters") + + theme_minimal() + +p = grid.arrange(p1, p2, ncol = 2) + +ggsave("../figure/ridge_vs_sgd_path.png", plot=p, width=12, height=4.5) diff --git a/slides/regularization/rsrc/shrinkage.R b/slides/regularization/rsrc/shrinkage.R new file mode 100755 index 00000000..930e549f --- /dev/null +++ b/slides/regularization/rsrc/shrinkage.R @@ -0,0 +1,98 @@ +# ------------------------------------------------------------------------------ +# l1 vs l2 + +# FIG: +# (1): how coefficient values and MSE changes with regularization constant +# (lambda) for linear regression with l1 and l2 regularization. 
+# (2): histograms of coefficient values under two regularization constants +# (lambda 0.01, 100) to show how they affect shrinkage +# for linear regression with l1 and l2 regularization. +# DATA: +# (1): data from data_regu_example_1.RData +# (2): data from data_regu_example_2.RData +# ------------------------------------------------------------------------------ + +library(ggplot2) +library(gridExtra) +library(ggrepel) +library(data.table) +library(viridis) + +# DATA ------------------------------------------------------------------------- + +load("data_regu_example_1.RData") +load("data_regu_example_2.RData") + +d_l1 <- rbind( + data.frame(lam = paste(lams[1]), coefval = cc_l1_1), + data.frame(lam = paste(lams[2]), coefval = cc_l1_2) +) +d_l1$lam <- as.factor(d_l1$lam) +d_l2 <- rbind( + data.frame(lam = paste(lams[1]), coefval = cc_l2_1), + data.frame(lam = paste(lams[2]), coefval = cc_l2_2) +) +d_l2$lam <- as.factor(d_l2$lam) + +# PLOTS ------------------------------------------------------------------------- + +### (1) +plot_coef_paths <- function(path, featnames, title, xlab) { + ggd <- melt(path, id.vars = "lambda", measure = featnames, variable.name = "featname", value.name = "coefval") + ggd$label <- ifelse(ggd$lambda == min(lambda_seq), as.character(ggd$featname), NA) + pl <- ggplot(data = ggd, aes(x = lambda, y = coefval, group = featname, col = featname)) + + guides(color = "none") + + geom_line() + + geom_label_repel(aes(label = label), na.rm = TRUE, max.overlaps = Inf) + + scale_x_log10() + + ggtitle(title) + + xlab(xlab) + + theme_bw() + + scale_color_viridis(end = 0.9, discrete = TRUE) +} + +plot_cv_path <- function(cv_lam, title, xlab, ylab) { + pl <- ggplot(data = cv_lam, aes(x = lambda, y = mse)) + + geom_line() + + scale_x_log10() + + ggtitle(title) + + xlab(xlab) + + ylab(ylab) +} + +p1l1 <- plot_coef_paths(path_l1$path, featnames, "Lasso", expression(lambda)) +p1l2 <- plot_coef_paths(path_l2$path, featnames, "Ridge", expression(lambda)) +p1l3 <- plot_cv_path(path_l1$cv_lam, "Lasso", expression(lambda), 'MSE') + + theme_minimal() + ylim(25, 90) +p1l4 <- plot_cv_path(path_l2$cv_lam, "Ridge", expression(lambda), 'MSE') + + theme_minimal() + ylim(20, 90) + +p1 <- grid.arrange(p1l1, p1l2, p1l3, p1l4, nrow = 2) +ggsave("../figure/shrinkage_01.png", plot = p1, width = 8, height = 4) + + + +### (2) +# histograms of coefficient values of data d +plot_coef_hist <- function(d, title) { + pl <- ggplot(d, aes(x = coefval, fill = lam)) + + scale_fill_viridis(end = 0.9, discrete = TRUE) + + geom_histogram(alpha = 0.9, position = "dodge") + + theme_gray(base_size = 14) + + ggtitle(title) + return(pl) +} + +# coefficient histograms (d_l1, d_l2) plus CV MSE curves (cv_l1, cv_l2) + +p2l1 <- plot_coef_hist(d_l1, "Lasso") + guides(fill=guide_legend(title=expression(lambda))) +p2l2 <- plot_coef_hist(d_l2, "Ridge") + guides(fill=guide_legend(title=expression(lambda))) + + ylim(0, 50) +p2l3 <- plot_cv_path(cv_l1, "Lasso", expression(lambda), 'MSE') + + theme_gray(base_size = 14) + ylim(1, 10) +p2l4 <- plot_cv_path(cv_l2, "Ridge", expression(lambda), 'MSE') + + theme_gray(base_size = 14) + ylim(1, 10) + +p2 <- grid.arrange(p2l1, p2l2, p2l3, p2l4, nrow = 2) +ggsave("../figure/shrinkage_02.png", plot = p2, width = 8, height = 5) diff --git a/slides/regularization/rsrc/soft-thresholding.R b/slides/regularization/rsrc/soft_thresholding.R old mode 100644 new mode 100755 similarity index 63% rename from slides/regularization/rsrc/soft-thresholding.R rename to
slides/regularization/rsrc/soft_thresholding.R index 5b96abba..b1de8c12 --- a/slides/regularization/rsrc/soft-thresholding.R +++ b/slides/regularization/rsrc/soft_thresholding.R @@ -1,52 +1,55 @@ -library(ggplot2) - -# Define the soft thresholding function -soft_threshold <- function(rho, lamda) { - if (rho < -lamda) { - return (rho + lamda) - } else if (rho > lamda) { - return (rho - lamda) - } else { - return (0) - } -} - -# Lambda value -lamda <- 3 - -# Generate sequence of rho values (similar to x1 in Python) -x1 <- seq(-10, 10, by = 0.1) - -# Apply the soft thresholding function to each value in x1 -y_st <- sapply(x1, function(rho) soft_threshold(rho, lamda)) - -# Compute the ridge estimate for each value in x1 -y_ridge <- x1 / (1 + lamda) - -# Create a data frame for plotting -data <- data.frame(rho = x1, theta = y_st, OLS = x1, Ridge = y_ridge) - -# Plot using ggplot2 -p <- ggplot(data, aes(x = rho)) + - geom_line(aes(y = theta), color = 'blue', linetype = "solid", size=1.2) + - geom_line(aes(y = OLS), color = 'grey', linetype = "dashed", size=1.2) + - geom_line(aes(y = Ridge), color = 'red', linetype = "solid", size=1.2) + - labs(x = expression(theta[j]), y = expression(theta[j]), title = 'Lasso vs Ridge solution in terms of OLS (orthonormal design, lambda=3)') + - theme_minimal() + - theme( - plot.title = element_text(hjust = 0.5, size = 20), - axis.title = element_text(size = 18), - axis.text = element_text(size = 18), - axis.ticks = element_line(size = 1) - ) + - scale_color_manual(values = c('blue', 'grey', 'red')) + - geom_hline(yintercept = 0, linetype="solid", color = "black") + - geom_vline(xintercept = 0, linetype="solid", color = "black") + - guides(color = guide_legend(title = NULL)) + - theme(legend.position = "bottom") + - annotate("text", x = -9, y = -4, label = expression(S(theta[j], lambda)), parse = TRUE, size=8, color="blue") + - annotate("text", x = 7, y = 9, label = "OLS", parse = TRUE, size=8, color ="grey") + - annotate("text", x = 7, y = 0.5, label = "Ridge", color = "red", parse = TRUE, size=8) # Label for Ridge - -# Display the plot -print(p) +# ------------------------------------------------------------------------------ +# l1 + +# FIG: draw lasso and ridge solution paths in terms of OLS. 
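+# (In the orthonormal case, lasso soft-thresholds the OLS estimate, +# theta = sign(theta_OLS) * max(|theta_OLS| - lambda, 0), while ridge rescales +# it to theta_OLS / (1 + lambda); this is exactly what the code below computes.)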
+# ------------------------------------------------------------------------------ + +library(ggplot2) + +# DATA ------------------------------------------------------------------------- + +soft_threshold <- function(rho, lambda) { + if (rho < -lambda) { + return (rho + lambda) + } else if (rho > lambda) { + return (rho - lambda) + } else { + return (0) + } +} + +lambda <- 3 + +x1 <- seq(-10, 10, by = 0.1) + +y_st <- sapply(x1, function(rho) soft_threshold(rho, lambda)) + +# ridge estimate +y_ridge <- x1 / (1 + lambda) + +# PLOT ------------------------------------------------------------------------- + +data <- data.frame(rho = x1, theta = y_st, OLS = x1, Ridge = y_ridge) + +p <- ggplot(data, aes(x = rho)) + + geom_line(aes(y = theta), color = 'blue', linetype = "solid", size=1.2) + + geom_line(aes(y = OLS), color = 'grey', linetype = "dashed", size=1.2) + + geom_line(aes(y = Ridge), color = 'red', linetype = "solid", size=1.2) + + labs(x = expression(theta[OLS]), y = expression(theta[pen]), title = 'Lasso vs Ridge solution in terms of OLS (orthonormal design, lambda=3)') + + theme_minimal() + + theme( + plot.title = element_text(hjust = 0.5, size = 20), + axis.title = element_text(size = 18), + axis.text = element_text(size = 18), + axis.ticks = element_line(size = 1) + ) + + scale_color_manual(values = c('blue', 'grey', 'red')) + + geom_hline(yintercept = 0, linetype="solid", color = "black") + + geom_vline(xintercept = 0, linetype="solid", color = "black") + + guides(color = guide_legend(title = NULL)) + + theme(legend.position = "bottom") + + annotate("text", x = -9, y = -4, label = "Lasso", parse = TRUE, size=8, color="blue") + + annotate("text", x = 7, y = 9, label = "OLS", parse = TRUE, size=8, color = "grey") + + annotate("text", x = 7, y = 0.5, label = "Ridge", color = "red", parse = TRUE, size=8) + +ggsave("../figure/soft_thresholding.png", plot = p, width = 10, height = 5) diff --git a/slides/regularization/rsrc/solution_path.R b/slides/regularization/rsrc/solution_path.R new file mode 100755 index 00000000..76cbfb5d --- /dev/null +++ b/slides/regularization/rsrc/solution_path.R @@ -0,0 +1,118 @@ +# ------------------------------------------------------------------------------ +# l1, l2 + +# FIG: solution path under l1 and l2 regularization.
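+# (Cyan contours show the penalty, red contours the least-squares risk; the +# plotted points trace the regularized minimizers as lambda varies.)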
+# DATA: +# x = seq(0, 1, length.out = 40) +# noise ~ Unif(0, 1) +# y = sin(x * 1.5 * pi) +# y_noise = (y + noise) - mean(y + noise) +# ------------------------------------------------------------------------------ + +library(ggplot2) +library(dplyr) +library(tidyr) +library(Matrix) +library(glmnet) +library(pracma) +library(gridExtra) +set.seed(0) + +# DATA ------------------------------------------------------------------------- + +# Cost function definitions +cost_l2 <- function(x, y) { + return(x^2 + y^2) +} + +cost_l1 <- function(x, y) { + return(abs(x) + abs(y)) +} + +costfunction <- function(X, y, theta) { + m <- length(y) + h <- X %*% theta + return((1 / (2 * m)) * t(h - y) %*% (h - y)) +} + +closed_form_reg_solution <- function(X, y, lambda = 10) { + m <- nrow(X) + n <- ncol(X) + I <- diag(n) + return(solve(t(X) %*% X + lambda * I) %*% t(X) %*% y) +} + +# Dataset creation and normalization +x <- seq(0, 1, length.out = 40) +noise <- runif(40, 0, 1) +y <- sin(x * 1.5 * pi) +y_noise <- (y + noise) - mean(y + noise) +X <- cbind(x, x^2) +X <- sweep(X, 2, sqrt(colSums(X^2)), FUN = "/") + +# Setup of meshgrid of theta values +theta1 <- seq(-2, 17, length.out = 100) +theta2 <- seq(-17, 3, length.out = 100) +grid <- expand.grid(theta1 = theta1, theta2 = theta2) + +# Computing the cost function for each theta combination +grid <- grid %>% + mutate(Z_l2 = cost_l2(theta1, theta2), + Z_l1 = cost_l1(theta1, theta2), + Z_ls = apply(grid, 1, function(row) costfunction(X, y_noise, matrix(c(row[1], row[2]), nrow = 2)))) + +# Calculating the regularization paths +lambda_range_l2 <- 10^seq(0, 4, length.out = 100) / 1000 +lambda_range_l1 <- 10^seq(0, 2, length.out = 100) / 1000 + +theta_l2 <- sapply(lambda_range_l2, function(l) closed_form_reg_solution(X, y_noise, l)) +theta_l1 <- sapply(lambda_range_l1, function(l) coef(glmnet(X, y_noise, alpha=1, lambda=l, standardize=FALSE, intercept=FALSE))[2:3]) + +theta_l2_df <- data.frame(t(theta_l2)) +theta_l1_df <- data.frame(t(theta_l1)) + + +# L2 plot + +l2_contour_levels <- c(.5, 1.5, 3, 6, 9, 15, 30, 60, 100, 150, 250) + +p2 <- ggplot() + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_l2), color = 'cyan', breaks = l2_contour_levels) + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_ls), color = 'red', breaks = c(.01, .06, .09, .11, .15)) + + geom_point(data = theta_l2_df, aes(x = X1, y = X2), color = 'red', alpha = 0.2) + + labs(title = 'L2 regularization solution path', x = expression(theta[1]), y = expression(theta[2])) + + theme_minimal() + + coord_fixed() + +# L1 & L2 plot + +# Plot L2 Regularization +inside_l2 <- theta_l2_df %>% + filter(cost_l2(X1, X2) < max(l2_contour_levels)) + +p_l2 <- ggplot() + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_l2), color = 'cyan', breaks = l2_contour_levels) + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_ls), color = 'red', breaks = c(.01, .06, .09, .11, .15)) + + geom_point(data = inside_l2, aes(x = X1, y = X2), color = 'green', alpha = 0.5) + + labs(title = 'L2 regularization solution path', x = expression(theta[1]), y = expression(theta[2])) + + theme_minimal() + + coord_fixed() + +# Plot L1 Regularization +l1_contour_levels = c(.5, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14) + +inside_l1 <- theta_l1_df %>% + filter(cost_l1(X1, X2) < max(l1_contour_levels)) + +p_l1 <- ggplot() + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_l1), color = 'cyan', breaks = l1_contour_levels) + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_ls), color 
= 'red', breaks = c(.01, .06, .09, .11, .15)) + + geom_point(data = inside_l1, aes(x = X1, y = X2), color = 'green', alpha = 0.5) + + labs(title = 'L1 regularization solution path', x = expression(theta[1]), y = expression(theta[2])) + + theme_minimal() + + coord_fixed() + +p <- grid.arrange(p_l2, p_l1, ncol = 2) + +ggsave(filename = "../figure/solution_paths_02.png", plot = p2, width = 5, height = 5) +ggsave(filename = "../figure/solution_paths_01.png", plot = p, width = 10, height = 5) diff --git a/slides/regularization/rsrc/table_equivariance.R b/slides/regularization/rsrc/table_equivariance.R new file mode 100755 index 00000000..2b8eafd7 --- /dev/null +++ b/slides/regularization/rsrc/table_equivariance.R @@ -0,0 +1,89 @@ +# ------------------------------------------------------------------------------ +# l1 vs l2 + +# TABLE: coefficients and MSE of OLS and ridge for X and rescaled X +# DATA: Y = X(100*5 ~Normal) * beta_true + epsilon(100*1 ~Normal) +# ------------------------------------------------------------------------------ + +library(MASS) +library(xtable) +library(dplyr) + +set.seed(123) + +# DATA ------------------------------------------------------------------------- + +n <- 100 +p <- 5 +X <- matrix(rnorm(n * p), n, p) +beta_true <- c(1, 2, 3, 4, 5) +epsilon <- rnorm(n) +Y <- X %*% beta_true + epsilon + +# OLS Solution +beta_ols <- solve(t(X) %*% X) %*% t(X) %*% Y + +# Ridge Solution +lambda <- 10 +beta_ridge <- solve(t(X) %*% X + lambda * diag(p)) %*% t(X) %*% Y + +# Rescale and repeat +X_rescaled <- X +X_rescaled[,5] <- 100 * X_rescaled[,5] +beta_ols_rescaled <- solve(t(X_rescaled) %*% X_rescaled) %*% t(X_rescaled) %*% Y +beta_ridge_rescaled <- solve(t(X_rescaled) %*% X_rescaled + lambda * diag(p)) %*% t(X_rescaled) %*% Y + +# Results +results <- rbind(t(beta_ols), t(beta_ols_rescaled), t(beta_ridge), t(beta_ridge_rescaled)) +colnames(results) <- paste("Coefficient", 1:p) + +# MSE +loss_ols <- mean((Y - X %*% beta_ols)^2) +loss_ols_rescaled <- mean((Y - X_rescaled %*% beta_ols_rescaled)^2) +loss_ridge <- mean((Y - X %*% beta_ridge)^2) # + lambda * sum(beta_ridge^2) +loss_ridge_rescaled <- mean((Y - X_rescaled %*% beta_ridge_rescaled)^2) #+ lambda * sum(beta_ridge_rescaled^2) + +losses <- c(loss_ols, loss_ols_rescaled, loss_ridge, loss_ridge_rescaled) +results <- cbind(results, MSE = losses) +rownames(results) <- c("OLS", "OLS Rescaled", "Ridge", "Ridge Rescaled") +print(results) + +# TABLE ------------------------------------------------------------------------ +results <- round(results, 3) +# Function to bold specific column values +bold_coefficient5 <- function(x) { + x[, "Coefficient 5"] <- paste0("\\textbf{", formatC(x[, "Coefficient 5"], format = "f", digits = 3), "}") + x +} + +table_ols <- bold_coefficient5(results[1:2,]) +table_ridge <- bold_coefficient5(results[3:4,]) + +table_ols <- xtable(table_ols) +align(table_ols) <- "|c|cccccc|" +table_ridge <- xtable(table_ridge) +align(table_ridge) <- "|c|cccccc|" + +add.to.row <- list(pos = list(-1, nrow(table_ols)), + command = c("\\hline\n\\textbf{Method} & \\( \\hat{\\theta}_1 \\) & \\( \\hat{\\theta}_2 \\) & \\( \\hat{\\theta}_3 \\) & \\( \\hat{\\theta}_4 \\) & \\( \\hat{\\theta}_5 \\) & MSE \\\\ \\hline\n", + "\\hline\n")) + +print(table_ols, file = "table_equivariance_ols.tex", include.rownames = TRUE, + include.colnames = FALSE, + sanitize.text.function = identity, + tabular.environment = "tabular", + floating = FALSE, + add.to.row = add.to.row, + hline.after = NULL, + booktabs = FALSE, + comment = FALSE) + 
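+# quick sanity check (safe to drop): OLS is scale-equivariant, so multiplying +# x5 by 100 must divide its coefficient by exactly 100; ridge offers no such +# guarantee, as the two tables show. +stopifnot(isTRUE(all.equal(beta_ols[5] / 100, beta_ols_rescaled[5]))) +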
+print(table_ridge, file = "table_equivariance_ridge.tex", include.rownames = TRUE, + include.colnames = FALSE, + sanitize.text.function = identity, + tabular.environment = "tabular", + floating = FALSE, + add.to.row = add.to.row, + hline.after = NULL, + booktabs = FALSE, + comment = FALSE) diff --git a/slides/regularization/rsrc/table_equivariance_ols.tex b/slides/regularization/rsrc/table_equivariance_ols.tex new file mode 100755 index 00000000..80d4bdbb --- /dev/null +++ b/slides/regularization/rsrc/table_equivariance_ols.tex @@ -0,0 +1,7 @@ +\begin{tabular}{|c|cccccc|} + \hline +\textbf{Method} & \( \hat{\theta}_1 \) & \( \hat{\theta}_2 \) & \( \hat{\theta}_3 \) & \( \hat{\theta}_4 \) & \( \hat{\theta}_5 \) & MSE \\ \hline + OLS & 0.984 & 2.147 & 3.006 & 3.918 & \textbf{5.205} & 0.812 \\ + OLS Rescaled & 0.984 & 2.147 & 3.006 & 3.918 & \textbf{0.052} & 0.812 \\ + \hline +\end{tabular} diff --git a/slides/regularization/rsrc/table_equivariance_ridge.tex b/slides/regularization/rsrc/table_equivariance_ridge.tex new file mode 100755 index 00000000..d22cca73 --- /dev/null +++ b/slides/regularization/rsrc/table_equivariance_ridge.tex @@ -0,0 +1,7 @@ +\begin{tabular}{|c|cccccc|} + \hline +\textbf{Method} & \( \hat{\theta}_1 \) & \( \hat{\theta}_2 \) & \( \hat{\theta}_3 \) & \( \hat{\theta}_4 \) & \( \hat{\theta}_5 \) & MSE \\ \hline + Ridge & 0.709 & 1.874 & 2.661 & 3.558 & \textbf{4.636} & 1.366 \\ + Ridge Rescaled & 0.802 & 1.943 & 2.675 & 3.569 & \textbf{0.051} & 1.08 \\ + \hline +\end{tabular} diff --git a/slides/regularization/rsrc/make_weightdecay_lambda_plot.R b/slides/regularization/rsrc/weightdecay_lambda.R old mode 100644 new mode 100755 similarity index 62% rename from slides/regularization/rsrc/make_weightdecay_lambda_plot.R rename to slides/regularization/rsrc/weightdecay_lambda.R index 28b150fc..b9af3b4e --- a/slides/regularization/rsrc/make_weightdecay_lambda_plot.R +++ b/slides/regularization/rsrc/weightdecay_lambda.R @@ -1,6 +1,17 @@ -source("utils.R") +# ------------------------------------------------------------------------------ +# wd vs l2 + +# FIG: draw the path of the parameter iterates when optimizing with weight decay. +# use different decay parameters (lambda) to show how strong the pull toward the origin is.
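+# (produces weightdecay_lambda_01.png for the small lambda and +# weightdecay_lambda_02.png for the large one)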
+ +# DATA: linear model data from data_func_utils.R +# ------------------------------------------------------------------------------ + +source("data_func_utils.R") library(gridExtra) +# DATA ------------------------------------------------------------------------- + x1 <- seq(0,1.5,length.out = 100) x2 <- seq(0,3.5,length.out = 100) lambda <- 5 @@ -14,6 +25,8 @@ gd_l2_betas <- gradient_descent(beta_start, step_size, ret <- weight_decay(beta_start, lambda, step_size, R_emp_grad, num_steps) +# PLOT ------------------------------------------------------------------------- + remp_l2_plot_1 <- plot_r_emp(R_emp, x1, x2) + geom_path(data = ret$betas_gd, aes(x=V1, y=V2), colour = "red", size=1.1) + geom_path(data = ret$betas_wd, aes(x=V1, y=V2), colour = "yellow", size=1.1) + @@ -34,5 +47,5 @@ remp_l2_plot_2 <- plot_r_emp(R_emp, x1, x2) + #p <- grid.arrange(remp_l2_plot_1 , remp_l2_plot_2 , ncol=2) -ggsave("../figure/weightdecay_lambda_plot_01.png", plot = remp_l2_plot_1, width = 2.6, height = 3.1, dpi="retina") -ggsave("../figure/weightdecay_lambda_plot_02.png", plot = remp_l2_plot_2, width = 2.6, height = 3.1, dpi="retina") +ggsave("../figure/weightdecay_lambda_01.png", plot = remp_l2_plot_1, width = 2.6, height = 3.1, dpi="retina") +ggsave("../figure/weightdecay_lambda_02.png", plot = remp_l2_plot_2, width = 2.6, height = 3.1, dpi="retina") diff --git a/slides/regularization/slides-regu-early-stopping.tex b/slides/regularization/slides-regu-early-stopping.tex index 87a6bac2..c2f647de 100644 --- a/slides/regularization/slides-regu-early-stopping.tex +++ b/slides/regularization/slides-regu-early-stopping.tex @@ -106,7 +106,7 @@ \begin{figure} \centering %\scalebox{0.75} - {\includegraphics{figure_man/ridge-vs-sgd-path.png}} + {\includegraphics{figure/ridge_vs_sgd_path.png}} %\scriptsize{\\Ali et al. (2020)\\} \end{figure} diff --git a/slides/regularization/slides-regu-geom-l2.tex b/slides/regularization/slides-regu-geom-l2.tex index c83460ad..0997c083 100644 --- a/slides/regularization/slides-regu-geom-l2.tex +++ b/slides/regularization/slides-regu-geom-l2.tex @@ -8,7 +8,7 @@ \begin{document} \titlemeta{Regularization }{Geometry of L2 Regularization} -{figure/l2_reg_hess_03_plot.png} { +{figure/l2_reg_hess_03.png} { \item Approximate transformation of unregularized minimizer to regularized \item Principal components of Hessian influence where parameters are decayed } @@ -87,7 +87,7 @@ % \end{footnotesize} \begin{figure} -\includegraphics[width=0.9\textwidth]{figure/l2_reg_hess_01_plot.png}\\ +\includegraphics[width=0.9\textwidth]{figure/l2_reg_hess_01.png}\\ \end{figure} % % \begin{footnotesize} @@ -95,7 +95,7 @@ % % \end{footnotesize} % \begin{figure} -% \includegraphics[width=0.9\textwidth]{figure/l2_reg_hess_02_plot.png}\\ +% \includegraphics[width=0.9\textwidth]{figure/l2_reg_hess_02.png}\\ % \end{figure} @@ -106,7 +106,7 @@ % \end{footnotesize} \begin{figure} -\includegraphics[width=0.85\textwidth]{figure/l2_reg_hess_03_plot.png}\\ +\includegraphics[width=0.85\textwidth]{figure/l2_reg_hess_03.png}\\ \end{figure} @@ -144,7 +144,7 @@ \begin{figure} \centering - \scalebox{0.8}{\includegraphics{figure/l2_reg_hess_04_plot.png}} + \scalebox{0.8}{\includegraphics{figure/l2_reg_hess_04.png}} %\caption{\tiny The solid ellipses represent the contours of the unregularized objective and the dashed circles represent the contours of the $L2$ penalty. 
At $\hat{\thetab}_{\text{ridge}}$, the competing objectives reach an equilibrium.} \end{figure} diff --git a/slides/regularization/slides-regu-intro.tex b/slides/regularization/slides-regu-intro.tex index aae6ca1e..f30328e2 100644 --- a/slides/regularization/slides-regu-intro.tex +++ b/slides/regularization/slides-regu-intro.tex @@ -45,12 +45,12 @@ \begin{column}{0.5\textwidth} \raggedright Overfitted model\\ - \includegraphics[width=0.85\textwidth]{figure/eval_ofit_1o} + \includegraphics[width=0.85\textwidth]{figure/model_eval_02.png} \end{column} \begin{column}{0.5\textwidth} \raggedright Appropriate model\\ - \includegraphics[width=0.85\textwidth]{figure/eval_ofit_1a} + \includegraphics[width=0.85\textwidth]{figure/model_eval_01.png} \end{column} \end{columns} diff --git a/slides/regularization/slides-regu-l1.tex b/slides/regularization/slides-regu-l1.tex index 7c9cd8be..0af32f9f 100644 --- a/slides/regularization/slides-regu-l1.tex +++ b/slides/regularization/slides-regu-l1.tex @@ -8,7 +8,7 @@ \begin{document} -\titlemeta{Regularization}{Lasso Regression}{figure/lin_reg_l1.png}{ +\titlemeta{Regularization}{Lasso Regression}{figure/lin_model_regu_01.png}{ \item Lasso regression / $L1$ penalty \item Know that lasso selects features \item Support recovery @@ -40,24 +40,24 @@ \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=0.99\textwidth]{figure/lin_reg_l1.png} +\includegraphics[width=0.99\textwidth]{figure/lin_model_regu_01.png} \end{figure} \end{column} \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=0.99\textwidth]{figure/lin_reg_l2.png} +\includegraphics[width=0.99\textwidth]{figure/lin_model_regu_02.png} \end{figure} \end{column} \end{columns} %\begin{figure} -%\includegraphics[width=0.8\textwidth]{figure/lin_reg_l1.png} +%\includegraphics[width=0.8\textwidth]{figure/lin_model_regu_01.png} %\end{figure} %\begin{figure} -%\includegraphics[width=0.8\textwidth]{figure/lin_reg_l2.png} +%\includegraphics[width=0.8\textwidth]{figure/lin_model_regu_02.png} %\end{figure} \lz @@ -70,7 +70,7 @@ Contours of regularized objective for different $\lambda$ values. \begin{figure} -\includegraphics[width=0.85\textwidth]{figure/lasso_contours.png} +\includegraphics[width=0.85\textwidth]{figure/reg_contours_01.png} \end{figure} Green = true minimizer of the unreg. objective and red = lasso solution. @@ -80,9 +80,23 @@ Regularized empirical risk $\riskr(\theta_1,\theta_2)$ using squared loss for $\lambda \uparrow$. $L1$ penalty makes non-smooth kinks at coordinate axes more pronounced, while $L2$ penalty warps $\riskr$ toward a ``basin'' (elliptic paraboloid).
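+% Columns show $\lambda \in \{0, 1, 10\}$; within each column, $L1$ on top and $L2$ below.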
\begin{figure} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam0.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam0.png}} + \end{minipage} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam1.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam1.png}} + \end{minipage} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam10.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam10.png}} + \end{minipage} \end{figure} +%\begin{figure} +%\includegraphics[width=0.8\textwidth]{figure/reg_surfaces_l1_l2.png}\\ +%\end{figure} \framebreak We can also rewrite this as a constrained optimization problem. The penalty results in a diamond-shaped constraint region. @@ -95,7 +109,7 @@ \vspace{-0.1cm} \begin{figure}%\includegraphics[width=0.3\textwidth]{figure_man/lasso_hat.png}\\ \includegraphics[width=0.95\textwidth] -{figure_man/lasso_contours_cases.png}\\ +{figure/lasso_contour_cases.png}\\ \end{figure} \end{vbframe} @@ -112,7 +126,7 @@ %Soft threshold ensures exact zeros, while $L2$ penalty shrinks uniformly. \vspace{-0.16cm} \begin{figure} -\includegraphics[width=0.5\textwidth]{figure_man/soft-thresholding.pdf}\\ +\includegraphics[width=0.5\textwidth]{figure/soft_thresholding.png}\\ \end{figure} \end{vbframe} @@ -124,7 +138,7 @@ \end{itemize} \lz \begin{figure} -\includegraphics[width=0.9\textwidth]{figure_man/solution_paths_l1_l2.png}\\ +\includegraphics[width=0.9\textwidth]{figure/solution_paths_01.png}\\ \end{figure} \end{vbframe} diff --git a/slides/regularization/slides-regu-l1vsl2.tex b/slides/regularization/slides-regu-l1vsl2.tex index 7c956053..e409b5d8 100644 --- a/slides/regularization/slides-regu-l1vsl2.tex +++ b/slides/regularization/slides-regu-l1vsl2.tex @@ -61,7 +61,7 @@ %Coefficient histograms for different $\lambda$ values for ridge and lasso for simulated data along with the cross-validated MSE.
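+% Lasso piles histogram mass at exactly zero for large $\lambda$, whereas ridge only shrinks coefficients toward zero.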
\begin{figure} -\includegraphics[width=0.6\textwidth]{figure/shrinkage_2.png}\\ +\includegraphics[width=0.6\textwidth]{figure/shrinkage_02.png}\\ \end{figure} \end{vbframe} @@ -91,12 +91,7 @@ \vspace{-0.4cm} \begin{table}[h] \centering -\begin{tabular}{|c|c c c c c c|} -\hline -\textbf{Method} & \( \hat{\theta}_1 \) & \( \hat{\theta}_2 \) & \( \hat{\theta}_3 \) & \( \hat{\theta}_4 \) & \( \hat{\theta}_5 \) & MSE \\ \hline -OLS & 0.983 & 2.147 & 3.005 & 3.917 & \textbf{5.204} & 0.812 \\ %\hline -OLS rescaled & 0.983 & 2.147 & 3.005 & 3.917 & \textbf{0.052} & 0.812 \\ \hline -\end{tabular} +\input{rsrc/table_equivariance_ols.tex} %\caption{Equivariant OLS estimates under rescaling of $x_5$} \end{table} \vspace{-0.1cm} @@ -108,12 +103,7 @@ \vspace{-0.4cm} \begin{table}[h] \centering -\begin{tabular}{|c|c c c c c c|} -\hline -\textbf{Method} & \( \hat{\theta}_1 \) & \( \hat{\theta}_2 \) & \( \hat{\theta}_3 \) & \( \hat{\theta}_4 \) & \( \hat{\theta}_5 \) & MSE \\ \hline -ridge & 0.709 & 1.873 & 2.661 & 3.557 & \textbf{4.636} & 1.366 \\ %\hline -ridge rescaled & 0.802 & 1.942 & 2.675 & 3.569 & \textbf{0.051} & 1.079 \\ \hline -\end{tabular} +\input{rsrc/table_equivariance_ridge.tex} %\caption{ridge estimates for $\lambda=10$ under rescaling of $x_5$} \end{table} } @@ -171,7 +161,7 @@ $x_1$-$x_4$ are independent, but $x_4$ and $x_5$ are strongly correlated. \begin{center} -\includegraphics[width=0.6\textwidth]{figure/regu_example_multicollinearity.png} +\includegraphics[width=0.6\textwidth]{figure/multicollinearity_example.png} \end{center} diff --git a/slides/regularization/slides-regu-l2-nonlin.tex b/slides/regularization/slides-regu-l2-nonlin.tex index c475ec33..da2065f6 100644 --- a/slides/regularization/slides-regu-l2-nonlin.tex +++ b/slides/regularization/slides-regu-l2-nonlin.tex @@ -15,7 +15,7 @@ }{% Lecture title Intuition for L2 Regularization in Non-Linear Models }{% Relative path to title page image: Can be empty but must not start with slides/ - figure_man/bias-variance-ridge.png + figure/bias_var_decomp.png }{ \item Understand how regularization and parameter shrinkage can be beneficial to non-linear models } diff --git a/slides/regularization/slides-regu-l2.tex b/slides/regularization/slides-regu-l2.tex index 06c3618f..985b66f9 100644 --- a/slides/regularization/slides-regu-l2.tex +++ b/slides/regularization/slides-regu-l2.tex @@ -15,7 +15,7 @@ }{% Lecture title Ridge Regression }{% Relative path to title page image: Can be empty but must not start with slides/ - figure/ridge_outside.png + figure/ridge_perspectives_01.png }{ \item Regularized linear model \item Ridge regression / $L2$ penalty @@ -44,7 +44,7 @@ %Assume the data generating process $y=3x_{1} -2x_{2} +\epsilon $, where $\displaystyle \epsilon \sim N( 0,1)$. The true minimizer is given by $\theta ^{*} =( 3,-2)^{T}$. %\begin{figure} -%\includegraphics[width=0.8\textwidth]{figure/lin_reg_l2.png} +%\includegraphics[width=0.8\textwidth]{figure/lin_model_regu_02.png} %\end{figure} %With increasing regularization, $\theta_{\textit{reg}}$ is pulled back to the origin. @@ -74,7 +74,7 @@ Let $y=3x_{1} -2x_{2} +\epsilon $, $ \epsilon \sim N( 0,1)$. The true minimizer is $\theta ^{*} =( 3,-2)^{T}$, with $ \thetah_{\text{ridge}} = \argmin_{\thetab} \|\yv - \Xmat \thetab\|^2 + \lambda \|\thetab\|^2 $. 
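+% For reference, the closed form is $\thetah_{\text{ridge}} = (\Xmat^T \Xmat + \lambda I)^{-1} \Xmat^T \yv$ (this is what rsrc/table_equivariance.R computes).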
\begin{figure} -\includegraphics[width=0.8\textwidth]{figure/lin_reg_l2.png} +\includegraphics[width=0.8\textwidth]{figure/lin_model_regu_02.png} \end{figure} \vspace{-0.2cm} {\small With increasing regularization, $\hat{\theta}_{\textit{ridge}}$ is pulled back to the origin\\ (contour lines show unregularized objective).} @@ -84,7 +84,7 @@ $ \thetah_{\text{ridge}} = \argmin_{\thetab} \|\yv - \Xmat \thetab\|^2 + \lambda \|\thetab\|^2 $. \begin{figure} -\includegraphics[width=0.8\textwidth]{figure/ridge_contours.png} +\includegraphics[width=0.8\textwidth]{figure/reg_contours_02.png} \end{figure} \vspace{-0.2cm} Green = true coefs of the DGP and red = ridge solution. @@ -103,7 +103,7 @@ \vspace{-1.0cm} \begin{figure} -\includegraphics[width=0.6\textwidth]{figure/ridge_constraints.png} +\includegraphics[width=0.6\textwidth]{figure/ridge_perspectives_03.png} \end{figure} \begin{footnotesize} @@ -116,7 +116,7 @@ \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=\textwidth]{figure/ridge_inside.png} +\includegraphics[width=\textwidth]{figure/ridge_perspectives_02.png} \end{figure} \end{column} @@ -140,7 +140,7 @@ \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=\textwidth]{figure/ridge_outside.png} +\includegraphics[width=\textwidth]{figure/ridge_perspectives_01.png} \end{figure} \end{column} @@ -162,7 +162,7 @@ \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=\textwidth]{figure_man/solution-path-ridge-only.png} +\includegraphics[width=\textwidth]{figure/solution_paths_02.png} \end{figure} \end{column} @@ -194,7 +194,7 @@ Using model complexity $d = 10$ overfits: \begin{center} -\includegraphics[width = 10cm ]{figure/poly_ridge_1.png} \\ +\includegraphics[width = 10cm ]{figure/poly_ridge_01.png} \\ \end{center} \framebreak @@ -204,7 +204,7 @@ \vfill \begin{center} -\includegraphics[width = 11cm ]{figure/poly_ridge_2.png} \\ +\includegraphics[width = 11cm ]{figure/poly_ridge_02.png} \\ \end{center} @@ -241,7 +241,7 @@ Let $y=3x_{1} -2x_{2} +\epsilon $, $ \epsilon \sim N( 0,1)$. The true minimizer is $\theta ^{*} =( 3,-2)^{T}$. Consider $\lambda $ values of 0.01, 0.5, 1, 1.5, 2, 2.5, 10. \begin{figure} -\includegraphics[width=0.7\textwidth]{figure/lin_reg_l1.png} +\includegraphics[width=0.7\textwidth]{figure/lin_model_regu_01.png} \end{figure} With increasing regularization, $\theta_{\textit{ridge}}$ is pulled back to the origin. Contours = unreg. objective, dots = reg. solution for increasing $\lambda$. @@ -252,7 +252,7 @@ Contours of regularized objective for different $\lambda$ values. \begin{figure} -\includegraphics[width=0.9\textwidth]{figure/lasso_contours.png} +\includegraphics[width=0.9\textwidth]{figure/reg_contours_01.png} \end{figure} \framebreak @@ -289,18 +289,33 @@ \end{itemize} \lz \begin{figure} -\includegraphics[width=0.9\textwidth]{figure_man/solution_paths_l1_l2.png}\\ +\includegraphics[width=0.9\textwidth]{figure/solution_paths_01.png}\\ \end{figure} \end{vbframe} \begin{vbframe}{Effect of $L1$/$L2$ on Loss Surface} Regularized empirical risk $\riskr(\theta_1,\theta_2)$ using squared loss for $\lambda \uparrow$. $L1$ penalty makes non-smooth kinks at coordinate axes more pronounced, while $L2$ penalty warps $\riskr$ toward a ``basin'' (elliptic paraboloid).
- + \begin{figure} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam0.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam0.png}} + \end{minipage} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam1.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam1.png}} + \end{minipage} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam10.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam10.png}} + \end{minipage} \end{figure} +%\begin{figure} +%\includegraphics[width=0.8\textwidth]{figure/reg_surfaces_l1_l2.png}\\ +%\end{figure} + \end{vbframe} \end{comment} diff --git a/slides/regularization/slides-regu-nonlin.tex b/slides/regularization/slides-regu-nonlin.tex index d80bc442..b359ce23 100644 --- a/slides/regularization/slides-regu-nonlin.tex +++ b/slides/regularization/slides-regu-nonlin.tex @@ -15,7 +15,7 @@ }{% Lecture title Non-Linear Models and Structural Risk Minimization }{% Relative path to title page image: Can be empty but must not start with slides/ - figure/fig-regu-nonlin-2.png + figure/classifi_nn_w_size_2.png }{ \item Regularization even more important in non-linear models \item Norm penalties applied similarly @@ -88,13 +88,13 @@ \vspace{-0.8cm} %\vfill -\only<1>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-1.png}\end{center}} -\only<2>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-2.png}\end{center}} -\only<3>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-3.png}\end{center}} -\only<4>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-4.png}\end{center}} +\only<1>{\begin{center}\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_1.png}\end{center}} +\only<2>{\begin{center}\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_2.png}\end{center}} +\only<3>{\begin{center}\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_3.png}\end{center}} +\only<4>{\begin{center}\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_4.png}\end{center}} -%\only<5>{\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-5.png}} -%\only<6>{\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-6.png}} +%\only<5>{\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_5.png}} +%\only<6>{\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_6.png}} $\lambda$ affects smoothness of decision boundary and magnitude of weights @@ -106,7 +106,7 @@ Same settings as before, but each $\lambda$ is evaluated with 5x10 REP-CV -\begin{center}\includegraphics[width=1\textwidth]{figure/fig-regu-nonlin-srm-1.png}\end{center} +\begin{center}\includegraphics[width=1\textwidth]{figure/classifi_nn_err_decay.png}\end{center} Typical U-shape with sweet spot between overfitting and underfitting \end{frame} @@ -163,7 +163,7 @@ \only<1>{ \begin{center} \begin{minipage}{0.5\textwidth} -\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-1.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_1.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_1.png} @@ -173,7 +173,7 @@ \only<2>{ \begin{center} \begin{minipage}{0.5\textwidth}
-\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-2.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_2.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_2.png} @@ -183,7 +183,7 @@ \only<3>{ \begin{center} \begin{minipage}{0.5\textwidth} -\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-3.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_3.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_3.png} @@ -193,7 +193,7 @@ \only<4>{ \begin{center} \begin{minipage}{0.5\textwidth} -\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-4.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_4.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_5.png} @@ -203,7 +203,7 @@ \only<5>{ \begin{center} \begin{minipage}{0.5\textwidth} -\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-5.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_5.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_10.png} @@ -214,7 +214,7 @@ \only<6>{ \begin{center} \begin{minipage}{0.5\textwidth} -\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-6.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_6.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_100.png} @@ -230,7 +230,7 @@ \begin{frame} {Structural Risk Minimization} Again, complexity vs CV score. -\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-srm-2.png}\end{center} +\begin{center}\includegraphics[width=\textwidth]{figure/classifi_nn_err_size.png}\end{center} Minimal model with good generalization seems to be size = 10 @@ -251,7 +251,7 @@ \end{column} \begin{column}{0.5\textwidth} \begin{figure} -\includegraphics[width=0.6\textwidth]{figure/ridge_hat.png} +\includegraphics[width=0.6\textwidth]{figure/ridge_perspectives_04.png} \end{figure} \end{column} \end{columns} diff --git a/slides/regularization/slides-regu-wd-vs-l2.tex b/slides/regularization/slides-regu-wd-vs-l2.tex index 58f62513..4d40045e 100644 --- a/slides/regularization/slides-regu-wd-vs-l2.tex +++ b/slides/regularization/slides-regu-wd-vs-l2.tex @@ -55,8 +55,8 @@ How strongly we are pulled back (for fixed $\alpha$) depends on $\lambda$: \begin{figure} - \subfloat[Small $\lambda$]{\includegraphics[width=0.4\textwidth]{figure/weightdecay_lambda_plot_01.png}} - \subfloat[Large $\lambda$]{\includegraphics[width=0.4\textwidth]{figure/weightdecay_lambda_plot_02.png}}\\ + \subfloat[Small $\lambda$]{\includegraphics[width=0.4\textwidth]{figure/weightdecay_lambda_01.png}} + \subfloat[Large $\lambda$]{\includegraphics[width=0.4\textwidth]{figure/weightdecay_lambda_02.png}}\\ \end{figure} \end{vbframe}