diff --git a/slides/regularization/figure/avoid_overfitting_02.png b/slides/regularization/figure/avoid_overfitting_02.png index 464d7414..31ba2946 100644 Binary files a/slides/regularization/figure/avoid_overfitting_02.png and b/slides/regularization/figure/avoid_overfitting_02.png differ diff --git a/slides/regularization/figure/bias_var_decomp.png b/slides/regularization/figure/bias_var_decomp.png new file mode 100755 index 00000000..d2c8abb3 Binary files /dev/null and b/slides/regularization/figure/bias_var_decomp.png differ diff --git a/slides/regularization/figure/classifi_nn_err_decay.png b/slides/regularization/figure/classifi_nn_err_decay.png new file mode 100755 index 00000000..a9c30a07 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_err_decay.png differ diff --git a/slides/regularization/figure/classifi_nn_err_size.png b/slides/regularization/figure/classifi_nn_err_size.png new file mode 100755 index 00000000..c2531c4e Binary files /dev/null and b/slides/regularization/figure/classifi_nn_err_size.png differ diff --git a/slides/regularization/figure/classifi_nn_size_1.png b/slides/regularization/figure/classifi_nn_size_1.png new file mode 100755 index 00000000..a809933c Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_1.png differ diff --git a/slides/regularization/figure/classifi_nn_size_2.png b/slides/regularization/figure/classifi_nn_size_2.png new file mode 100755 index 00000000..97a1e468 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_2.png differ diff --git a/slides/regularization/figure/classifi_nn_size_3.png b/slides/regularization/figure/classifi_nn_size_3.png new file mode 100755 index 00000000..f7fdae90 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_3.png differ diff --git a/slides/regularization/figure/classifi_nn_size_4.png b/slides/regularization/figure/classifi_nn_size_4.png new file mode 100755 index 00000000..79738067 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_4.png differ diff --git a/slides/regularization/figure/classifi_nn_size_5.png b/slides/regularization/figure/classifi_nn_size_5.png new file mode 100755 index 00000000..26436ad3 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_5.png differ diff --git a/slides/regularization/figure/classifi_nn_size_6.png b/slides/regularization/figure/classifi_nn_size_6.png new file mode 100755 index 00000000..81c0fd6e Binary files /dev/null and b/slides/regularization/figure/classifi_nn_size_6.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_1.png b/slides/regularization/figure/classifi_nn_w_size_1.png new file mode 100755 index 00000000..0f04c780 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_w_size_1.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_2.png b/slides/regularization/figure/classifi_nn_w_size_2.png new file mode 100755 index 00000000..04767ab6 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_w_size_2.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_3.png b/slides/regularization/figure/classifi_nn_w_size_3.png new file mode 100755 index 00000000..762c137b Binary files /dev/null and b/slides/regularization/figure/classifi_nn_w_size_3.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_4.png b/slides/regularization/figure/classifi_nn_w_size_4.png new file mode 100755 index 00000000..f575ef51 Binary files /dev/null and 
b/slides/regularization/figure/classifi_nn_w_size_4.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_5.png b/slides/regularization/figure/classifi_nn_w_size_5.png new file mode 100755 index 00000000..5a00920f Binary files /dev/null and b/slides/regularization/figure/classifi_nn_w_size_5.png differ diff --git a/slides/regularization/figure/classifi_nn_w_size_6.png b/slides/regularization/figure/classifi_nn_w_size_6.png new file mode 100755 index 00000000..512ed4a1 Binary files /dev/null and b/slides/regularization/figure/classifi_nn_w_size_6.png differ diff --git a/slides/regularization/figure/early_stopping.png b/slides/regularization/figure/early_stopping.png index ddbb7cad..336be724 100644 Binary files a/slides/regularization/figure/early_stopping.png and b/slides/regularization/figure/early_stopping.png differ diff --git a/slides/regularization/figure/eval_ofit_1a.pdf b/slides/regularization/figure/eval_ofit_1a.pdf deleted file mode 100644 index 7dfa288c..00000000 Binary files a/slides/regularization/figure/eval_ofit_1a.pdf and /dev/null differ diff --git a/slides/regularization/figure/eval_ofit_1o.pdf b/slides/regularization/figure/eval_ofit_1o.pdf deleted file mode 100644 index 03080f7a..00000000 Binary files a/slides/regularization/figure/eval_ofit_1o.pdf and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-1.png b/slides/regularization/figure/fig-regu-nonlin-1.png deleted file mode 100644 index f1962bff..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-1.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-2.png b/slides/regularization/figure/fig-regu-nonlin-2.png deleted file mode 100644 index 9da89241..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-2.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-3.png b/slides/regularization/figure/fig-regu-nonlin-3.png deleted file mode 100644 index 92008738..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-3.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-4.png b/slides/regularization/figure/fig-regu-nonlin-4.png deleted file mode 100644 index d9b015fa..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-4.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-1.png b/slides/regularization/figure/fig-regu-nonlin-size-1.png deleted file mode 100644 index f972a45b..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-1.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-2.png b/slides/regularization/figure/fig-regu-nonlin-size-2.png deleted file mode 100644 index 90086b05..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-2.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-3.png b/slides/regularization/figure/fig-regu-nonlin-size-3.png deleted file mode 100644 index 32145988..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-3.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-4.png b/slides/regularization/figure/fig-regu-nonlin-size-4.png deleted file mode 100644 index 5409ec5e..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-4.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-5.png b/slides/regularization/figure/fig-regu-nonlin-size-5.png deleted 
file mode 100644 index e8f05c53..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-5.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-size-6.png b/slides/regularization/figure/fig-regu-nonlin-size-6.png deleted file mode 100644 index ecb0d3de..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-size-6.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-srm-1.png b/slides/regularization/figure/fig-regu-nonlin-srm-1.png deleted file mode 100644 index b3a2ea07..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-srm-1.png and /dev/null differ diff --git a/slides/regularization/figure/fig-regu-nonlin-srm-2.png b/slides/regularization/figure/fig-regu-nonlin-srm-2.png deleted file mode 100644 index d4bed213..00000000 Binary files a/slides/regularization/figure/fig-regu-nonlin-srm-2.png and /dev/null differ diff --git a/slides/regularization/figure/graddes_vs_weightdecay.png b/slides/regularization/figure/graddes_vs_weightdecay.png index ecb6fd09..54af2d3e 100644 Binary files a/slides/regularization/figure/graddes_vs_weightdecay.png and b/slides/regularization/figure/graddes_vs_weightdecay.png differ diff --git a/slides/regularization/figure/l2_reg_hess_01_plot.png b/slides/regularization/figure/l2_reg_hess_01.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/l2_reg_hess_01_plot.png rename to slides/regularization/figure/l2_reg_hess_01.png diff --git a/slides/regularization/figure/l2_reg_hess_02_plot.png b/slides/regularization/figure/l2_reg_hess_02.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/l2_reg_hess_02_plot.png rename to slides/regularization/figure/l2_reg_hess_02.png diff --git a/slides/regularization/figure/l2_reg_hess_03_plot.png b/slides/regularization/figure/l2_reg_hess_03.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/l2_reg_hess_03_plot.png rename to slides/regularization/figure/l2_reg_hess_03.png diff --git a/slides/regularization/figure/l2_reg_hess_04_plot.png b/slides/regularization/figure/l2_reg_hess_04.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/l2_reg_hess_04_plot.png rename to slides/regularization/figure/l2_reg_hess_04.png diff --git a/slides/regularization/figure/lasso_contour_cases.png b/slides/regularization/figure/lasso_contour_cases.png new file mode 100755 index 00000000..0d9c0652 Binary files /dev/null and b/slides/regularization/figure/lasso_contour_cases.png differ diff --git a/slides/regularization/figure/lasso_contours.png b/slides/regularization/figure/lasso_contours.png deleted file mode 100644 index 3944622c..00000000 Binary files a/slides/regularization/figure/lasso_contours.png and /dev/null differ diff --git a/slides/regularization/figure/lasso_outside.png b/slides/regularization/figure/lasso_outside.png deleted file mode 100644 index cf61e81e..00000000 Binary files a/slides/regularization/figure/lasso_outside.png and /dev/null differ diff --git a/slides/regularization/figure/lin_model_regu_01.png b/slides/regularization/figure/lin_model_regu_01.png new file mode 100755 index 00000000..85d573bf Binary files /dev/null and b/slides/regularization/figure/lin_model_regu_01.png differ diff --git a/slides/regularization/figure/lin_model_regu_02.png b/slides/regularization/figure/lin_model_regu_02.png new file mode 100755 index 
00000000..0d245094 Binary files /dev/null and b/slides/regularization/figure/lin_model_regu_02.png differ diff --git a/slides/regularization/figure/lin_reg_l1.png b/slides/regularization/figure/lin_reg_l1.png deleted file mode 100644 index 02fc6d2d..00000000 Binary files a/slides/regularization/figure/lin_reg_l1.png and /dev/null differ diff --git a/slides/regularization/figure/lin_reg_l2.png b/slides/regularization/figure/lin_reg_l2.png deleted file mode 100644 index 4c0b29f5..00000000 Binary files a/slides/regularization/figure/lin_reg_l2.png and /dev/null differ diff --git a/slides/regularization/figure/model_eval_01.png b/slides/regularization/figure/model_eval_01.png new file mode 100755 index 00000000..28b42327 Binary files /dev/null and b/slides/regularization/figure/model_eval_01.png differ diff --git a/slides/regularization/figure/model_eval_02.png b/slides/regularization/figure/model_eval_02.png new file mode 100755 index 00000000..7bb3ff6c Binary files /dev/null and b/slides/regularization/figure/model_eval_02.png differ diff --git a/slides/regularization/figure/model_eval_03.png b/slides/regularization/figure/model_eval_03.png new file mode 100755 index 00000000..4b3f85bd Binary files /dev/null and b/slides/regularization/figure/model_eval_03.png differ diff --git a/slides/regularization/figure/multicollinearity_example.png b/slides/regularization/figure/multicollinearity_example.png new file mode 100755 index 00000000..03c9e467 Binary files /dev/null and b/slides/regularization/figure/multicollinearity_example.png differ diff --git a/slides/regularization/figure/ozone_mse_boxplot.png b/slides/regularization/figure/ozone_mse_boxplot.png index 3206091b..66d384d6 100644 Binary files a/slides/regularization/figure/ozone_mse_boxplot.png and b/slides/regularization/figure/ozone_mse_boxplot.png differ diff --git a/slides/regularization/figure/poly_ridge_01.png b/slides/regularization/figure/poly_ridge_01.png new file mode 100755 index 00000000..79304456 Binary files /dev/null and b/slides/regularization/figure/poly_ridge_01.png differ diff --git a/slides/regularization/figure/poly_ridge_02.png b/slides/regularization/figure/poly_ridge_02.png new file mode 100755 index 00000000..375f83b3 Binary files /dev/null and b/slides/regularization/figure/poly_ridge_02.png differ diff --git a/slides/regularization/figure/poly_ridge_1.png b/slides/regularization/figure/poly_ridge_1.png deleted file mode 100644 index 2ce5e37c..00000000 Binary files a/slides/regularization/figure/poly_ridge_1.png and /dev/null differ diff --git a/slides/regularization/figure/poly_ridge_2.png b/slides/regularization/figure/poly_ridge_2.png deleted file mode 100644 index 0f6d0827..00000000 Binary files a/slides/regularization/figure/poly_ridge_2.png and /dev/null differ diff --git a/slides/regularization/figure/reg_contours_01.png b/slides/regularization/figure/reg_contours_01.png new file mode 100755 index 00000000..6ccf5164 Binary files /dev/null and b/slides/regularization/figure/reg_contours_01.png differ diff --git a/slides/regularization/figure/reg_contours_02.png b/slides/regularization/figure/reg_contours_02.png new file mode 100755 index 00000000..500d3ba6 Binary files /dev/null and b/slides/regularization/figure/reg_contours_02.png differ diff --git a/slides/regularization/figure/reg_surfaces.png b/slides/regularization/figure/reg_surfaces.png deleted file mode 100644 index f7cd9bf3..00000000 Binary files a/slides/regularization/figure/reg_surfaces.png and /dev/null differ diff --git 
a/slides/regularization/figure/reg_surfaces_l1_l2.png b/slides/regularization/figure/reg_surfaces_l1_l2.png deleted file mode 100644 index 45e9f1ac..00000000 Binary files a/slides/regularization/figure/reg_surfaces_l1_l2.png and /dev/null differ diff --git a/slides/regularization/figure/reg_surfaces_l1_lam0.png b/slides/regularization/figure/reg_surfaces_l1_lam0.png new file mode 100755 index 00000000..2420f092 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l1_lam0.png differ diff --git a/slides/regularization/figure/reg_surfaces_l1_lam1.png b/slides/regularization/figure/reg_surfaces_l1_lam1.png new file mode 100755 index 00000000..73593179 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l1_lam1.png differ diff --git a/slides/regularization/figure/reg_surfaces_l1_lam10.png b/slides/regularization/figure/reg_surfaces_l1_lam10.png new file mode 100755 index 00000000..0cb729c0 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l1_lam10.png differ diff --git a/slides/regularization/figure/reg_surfaces_l2_lam0.png b/slides/regularization/figure/reg_surfaces_l2_lam0.png new file mode 100755 index 00000000..38bcf395 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l2_lam0.png differ diff --git a/slides/regularization/figure/reg_surfaces_l2_lam1.png b/slides/regularization/figure/reg_surfaces_l2_lam1.png new file mode 100755 index 00000000..14d2eee6 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l2_lam1.png differ diff --git a/slides/regularization/figure/reg_surfaces_l2_lam10.png b/slides/regularization/figure/reg_surfaces_l2_lam10.png new file mode 100755 index 00000000..70231061 Binary files /dev/null and b/slides/regularization/figure/reg_surfaces_l2_lam10.png differ diff --git a/slides/regularization/figure/regu_example_multicollinearity.png b/slides/regularization/figure/regu_example_multicollinearity.png deleted file mode 100644 index 7f837c33..00000000 Binary files a/slides/regularization/figure/regu_example_multicollinearity.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_constraints.png b/slides/regularization/figure/ridge_constraints.png deleted file mode 100644 index c5374862..00000000 Binary files a/slides/regularization/figure/ridge_constraints.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_contours.png b/slides/regularization/figure/ridge_contours.png deleted file mode 100644 index fc0441c2..00000000 Binary files a/slides/regularization/figure/ridge_contours.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_hat.png b/slides/regularization/figure/ridge_hat.png deleted file mode 100644 index 096c3c1a..00000000 Binary files a/slides/regularization/figure/ridge_hat.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_inside.png b/slides/regularization/figure/ridge_inside.png deleted file mode 100644 index f298baa8..00000000 Binary files a/slides/regularization/figure/ridge_inside.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_outside.png b/slides/regularization/figure/ridge_outside.png deleted file mode 100644 index 9b84c425..00000000 Binary files a/slides/regularization/figure/ridge_outside.png and /dev/null differ diff --git a/slides/regularization/figure/ridge_perspectives_01.png b/slides/regularization/figure/ridge_perspectives_01.png new file mode 100755 index 00000000..2e670bc5 Binary files /dev/null and 
b/slides/regularization/figure/ridge_perspectives_01.png differ diff --git a/slides/regularization/figure/ridge_perspectives_02.png b/slides/regularization/figure/ridge_perspectives_02.png new file mode 100755 index 00000000..59fd7ce4 Binary files /dev/null and b/slides/regularization/figure/ridge_perspectives_02.png differ diff --git a/slides/regularization/figure/ridge_perspectives_03.png b/slides/regularization/figure/ridge_perspectives_03.png new file mode 100755 index 00000000..87dfda49 Binary files /dev/null and b/slides/regularization/figure/ridge_perspectives_03.png differ diff --git a/slides/regularization/figure/ridge_perspectives_04.png b/slides/regularization/figure/ridge_perspectives_04.png new file mode 100755 index 00000000..252e9a27 Binary files /dev/null and b/slides/regularization/figure/ridge_perspectives_04.png differ diff --git a/slides/regularization/figure/ridge_vs_sgd_path.png b/slides/regularization/figure/ridge_vs_sgd_path.png new file mode 100755 index 00000000..d1bca0a6 Binary files /dev/null and b/slides/regularization/figure/ridge_vs_sgd_path.png differ diff --git a/slides/regularization/figure/shrinkage_01.png b/slides/regularization/figure/shrinkage_01.png new file mode 100755 index 00000000..970c18a1 Binary files /dev/null and b/slides/regularization/figure/shrinkage_01.png differ diff --git a/slides/regularization/figure/shrinkage_02.png b/slides/regularization/figure/shrinkage_02.png new file mode 100755 index 00000000..2b5d0d08 Binary files /dev/null and b/slides/regularization/figure/shrinkage_02.png differ diff --git a/slides/regularization/figure/shrinkage_1.png b/slides/regularization/figure/shrinkage_1.png deleted file mode 100644 index 0157b7e3..00000000 Binary files a/slides/regularization/figure/shrinkage_1.png and /dev/null differ diff --git a/slides/regularization/figure/shrinkage_2.png b/slides/regularization/figure/shrinkage_2.png deleted file mode 100644 index 6b28982a..00000000 Binary files a/slides/regularization/figure/shrinkage_2.png and /dev/null differ diff --git a/slides/regularization/figure/soft_thresholding.png b/slides/regularization/figure/soft_thresholding.png new file mode 100755 index 00000000..9bb06127 Binary files /dev/null and b/slides/regularization/figure/soft_thresholding.png differ diff --git a/slides/regularization/figure/solution_paths_01.png b/slides/regularization/figure/solution_paths_01.png new file mode 100755 index 00000000..af753baf Binary files /dev/null and b/slides/regularization/figure/solution_paths_01.png differ diff --git a/slides/regularization/figure/solution_paths_02.png b/slides/regularization/figure/solution_paths_02.png new file mode 100755 index 00000000..ae783dc0 Binary files /dev/null and b/slides/regularization/figure/solution_paths_02.png differ diff --git a/slides/regularization/figure/weightdecay_lambda_plot_01.png b/slides/regularization/figure/weightdecay_lambda_01.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/weightdecay_lambda_plot_01.png rename to slides/regularization/figure/weightdecay_lambda_01.png diff --git a/slides/regularization/figure/weightdecay_lambda_plot_02.png b/slides/regularization/figure/weightdecay_lambda_02.png old mode 100644 new mode 100755 similarity index 100% rename from slides/regularization/figure/weightdecay_lambda_plot_02.png rename to slides/regularization/figure/weightdecay_lambda_02.png diff --git a/slides/regularization/figure/weightdecay_lambda_plot.png 
b/slides/regularization/figure/weightdecay_lambda_plot.png deleted file mode 100644 index 8efe8b7b..00000000 Binary files a/slides/regularization/figure/weightdecay_lambda_plot.png and /dev/null differ diff --git a/slides/regularization/figure_man/bayes-plot-posterior.png b/slides/regularization/figure_man/bayes-plot-posterior.png deleted file mode 100644 index b7011648..00000000 Binary files a/slides/regularization/figure_man/bayes-plot-posterior.png and /dev/null differ diff --git a/slides/regularization/figure_man/bias-variance-ridge.png b/slides/regularization/figure_man/bias-variance-ridge.png deleted file mode 100644 index 1af66601..00000000 Binary files a/slides/regularization/figure_man/bias-variance-ridge.png and /dev/null differ diff --git a/slides/regularization/figure_man/lasso_contours_cases.png b/slides/regularization/figure_man/lasso_contours_cases.png deleted file mode 100644 index 2f68583f..00000000 Binary files a/slides/regularization/figure_man/lasso_contours_cases.png and /dev/null differ diff --git a/slides/regularization/figure_man/other-pen-MCP.png b/slides/regularization/figure_man/other-pen-MCP.png deleted file mode 100644 index cb5eef09..00000000 Binary files a/slides/regularization/figure_man/other-pen-MCP.png and /dev/null differ diff --git a/slides/regularization/figure_man/other-pen-SCAD.png b/slides/regularization/figure_man/other-pen-SCAD.png deleted file mode 100644 index 7dc5225c..00000000 Binary files a/slides/regularization/figure_man/other-pen-SCAD.png and /dev/null differ diff --git a/slides/regularization/figure_man/other-pen-lasso.png b/slides/regularization/figure_man/other-pen-lasso.png deleted file mode 100644 index 88e05f77..00000000 Binary files a/slides/regularization/figure_man/other-pen-lasso.png and /dev/null differ diff --git a/slides/regularization/figure_man/ridge-vs-sgd-path.png b/slides/regularization/figure_man/ridge-vs-sgd-path.png deleted file mode 100644 index 9c732b08..00000000 Binary files a/slides/regularization/figure_man/ridge-vs-sgd-path.png and /dev/null differ diff --git a/slides/regularization/figure_man/ridge_hat.png b/slides/regularization/figure_man/ridge_hat.png deleted file mode 100644 index ebe29bca..00000000 Binary files a/slides/regularization/figure_man/ridge_hat.png and /dev/null differ diff --git a/slides/regularization/figure_man/soft-thresholding.pdf b/slides/regularization/figure_man/soft-thresholding.pdf deleted file mode 100644 index 12208bc0..00000000 Binary files a/slides/regularization/figure_man/soft-thresholding.pdf and /dev/null differ diff --git a/slides/regularization/figure_man/solution-path-ridge-lasso.png b/slides/regularization/figure_man/solution-path-ridge-lasso.png deleted file mode 100644 index 74fc339a..00000000 Binary files a/slides/regularization/figure_man/solution-path-ridge-lasso.png and /dev/null differ diff --git a/slides/regularization/figure_man/solution-path-ridge-only.png b/slides/regularization/figure_man/solution-path-ridge-only.png deleted file mode 100644 index 5f8fda17..00000000 Binary files a/slides/regularization/figure_man/solution-path-ridge-only.png and /dev/null differ diff --git a/slides/regularization/figure_man/solution_path.png b/slides/regularization/figure_man/solution_path.png deleted file mode 100644 index a72944be..00000000 Binary files a/slides/regularization/figure_man/solution_path.png and /dev/null differ diff --git a/slides/regularization/figure_man/solution_path_l2.png b/slides/regularization/figure_man/solution_path_l2.png deleted file mode 100644 index 
71cccb93..00000000 Binary files a/slides/regularization/figure_man/solution_path_l2.png and /dev/null differ diff --git a/slides/regularization/figure_man/solution_paths_l1_l2.png b/slides/regularization/figure_man/solution_paths_l1_l2.png deleted file mode 100644 index b163dee8..00000000 Binary files a/slides/regularization/figure_man/solution_paths_l1_l2.png and /dev/null differ diff --git a/slides/regularization/figure_man/wt_decay_hat.png b/slides/regularization/figure_man/wt_decay_hat.png deleted file mode 100644 index 97c5bbc1..00000000 Binary files a/slides/regularization/figure_man/wt_decay_hat.png and /dev/null differ diff --git a/slides/regularization/rsrc/avoid_overfitting.R b/slides/regularization/rsrc/avoid_overfitting.R new file mode 100755 index 00000000..2043285b --- /dev/null +++ b/slides/regularization/rsrc/avoid_overfitting.R @@ -0,0 +1,40 @@ +# ------------------------------------------------------------------------------ +# intro + +# FIG: how MSE for training and test data change with +# different feature numbers, and with different data sizes. + +# DATA: from data_ozone_example.RData +# ------------------------------------------------------------------------------ + +library(ggplot2) +library(data.table) + +theme_set(theme_minimal()) + +# DATA ------------------------------------------------------------------------- + +load("data_ozone_example.RData") + +dfp <- setDT(df_incdata)[, .(mean.mse = median(value)), by = c("nobs", "variable")] + +# PLOTS ------------------------------------------------------------------------ + +# data size +p1 <- ggplot(data = dfp, aes(x = nobs, y = mean.mse, colour = variable)) + + geom_line(lwd = 1.2) + ylim(c(0, 100)) + labs(colour = " ") + + scale_colour_discrete(labels = c("Train error", "Test error")) + + xlab("Size of data set") + ylab("MSE") + + scale_color_brewer(palette="Dark2") + +# feature number +p2 <- ggplot(data = df_incfeatures, aes(x = type, y = mean.mse, colour = variable)) + + geom_line(lwd = 1.2) + labs(colour = " ") + + scale_colour_discrete(labels = c("Train error", "Test error")) + + xlab("Number of features") + ylab("MSE") + + ylim(c(0, 150)) + + scale_x_continuous(breaks = 0:12) + + scale_color_brewer(palette="Dark2") + +ggsave("../figure/avoid_overfitting_01.png", plot=p1, width=5, height=2.5) +ggsave("../figure/avoid_overfitting_02.png", plot=p2, width=5, height=2.5) diff --git a/slides/regularization/rsrc/bias-var-decomp-ridge.py b/slides/regularization/rsrc/bias-var-decomp-ridge.py deleted file mode 100644 index 9c8d7a6f..00000000 --- a/slides/regularization/rsrc/bias-var-decomp-ridge.py +++ /dev/null @@ -1,94 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from sklearn.preprocessing import PolynomialFeatures -from sklearn.linear_model import Ridge -from sklearn.metrics import mean_squared_error - - -# Set the random seed for reproducibility -np.random.seed(0) - -# Define the true function and the number of datasets -true_function = lambda x: np.sin(x) -n_datasets = 100 # Number of datasets for training -n_samples = 100 -n_test_samples = 10000 -n_order = 8 -lambdas = np.exp(np.linspace(-6, 7, 25)) - -# Generate polynomial features -poly = PolynomialFeatures(degree=n_order, include_bias=False) - -# Initialize arrays to store the bias, variance, and error -bias_squared = np.zeros_like(lambdas) -variance = np.zeros_like(lambdas) -test_error = np.zeros_like(lambdas) - -# Generate shared x values for all datasets -x_shared = np.random.uniform(0, 1, n_samples).reshape(-1, 1) -x_shared_poly = 
poly.fit_transform(x_shared) - -# Generate test data -x_test = np.random.uniform(0, 1, n_test_samples).reshape(-1, 1) -y_test = true_function(x_test).reshape(-1, 1) + np.random.randn(n_test_samples,1) -x_test_poly = poly.transform(x_test) - -# Loop over the lambda values -for i, lambda_val in enumerate(lambdas): - # Initialize arrays to store predictions for each model - predictions = np.zeros((n_datasets, n_samples)) - - # Train and predict with n_datasets models - for j in range(n_datasets): - # Generate new y values for each dataset - epsilon = np.random.randn(n_samples, 1) - y = true_function(x_shared) + epsilon - - # Fit Ridge regression model - model = Ridge(alpha=lambda_val, fit_intercept=True) - model.fit(x_shared_poly, y) - predictions[j, :] = model.predict(x_shared_poly).flatten() - - # Calculate the average prediction for each x - average_prediction = np.mean(predictions, axis=0) - - # Compute itegrated bias^2 and variance using MC - bias_squared[i] = np.mean((average_prediction - true_function(x_shared).flatten()) ** 2) - variance[i] = np.mean(np.var(predictions, axis=0)) - -# Train a final model on a new dataset and compute test error for each lambda -for i, lambda_val in enumerate(lambdas): - # Generate new data for the final model - x_train_final = np.random.uniform(0, 1, n_samples).reshape(-1, 1) - y_train_final = true_function(x_train_final) + np.random.randn(n_samples, 1) - x_train_final_poly = poly.transform(x_train_final) - - # Fit the final model - model_final = Ridge(alpha=lambda_val, fit_intercept=True) - model_final.fit(x_train_final_poly, y_train_final) - - # Predict on the test set and compute the error - y_test_pred_final = model_final.predict(x_test_poly).flatten() - # The test error - test_error[i] = mean_squared_error(y_test, y_test_pred_final) - -# Plotting the results with two y-axes -fig, ax1 = plt.subplots(figsize=(12, 6)) - -# Plot bias^2 and variance on the primary y-axis -ax1.plot(np.log(lambdas), bias_squared, label='(bias)^2', color='red') -ax1.plot(np.log(lambdas), variance, label='variance', color='blue') -ax1.plot(np.log(lambdas), bias_squared + variance, label='(bias)^2 + variance', color='green') - -ax1.set_xlabel('ln(λ)', fontsize=16) -ax1.set_ylabel('(bias)^2, variance', fontsize=16) -ax1.legend(loc='upper left') - -# Create secondary y-axis for test error -ax2 = ax1.twinx() -ax2.plot(np.log(lambdas), test_error, label='test error', color='magenta', linestyle='--', alpha=.6) -ax2.set_ylabel('Test error on single dataset', fontsize=16) -ax2.legend(loc='upper right') - -plt.title('Bias-Variance Tradeoff with L2 Regularization', fontsize=20) -plt.show() diff --git a/slides/regularization/rsrc/bias_var_decomp.R b/slides/regularization/rsrc/bias_var_decomp.R new file mode 100755 index 00000000..5a6369da --- /dev/null +++ b/slides/regularization/rsrc/bias_var_decomp.R @@ -0,0 +1,78 @@ +# ------------------------------------------------------------------------------ +# l2 nonlin + +# FIG: decompose MSE to bias_square and variance for ridge regression. +# plot lines to show how each part varies +# with ln(lambda) (natural logarithm of regularization constant). 
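The Monte-Carlo estimator used in bias_var_decomp.R below refits the model on many simulated datasets at shared x values and then averages; a minimal sketch of just the decomposition step, assuming a hypothetical matrix preds (one row of predictions per simulated dataset, columns indexed by the shared x values) and the noise-free targets f_true:

avg_pred <- colMeans(preds)              # pointwise average prediction over datasets
bias_sq  <- mean((avg_pred - f_true)^2)  # integrated squared bias
var_hat  <- mean(apply(preds, 2, var))   # integrated variance over datasets

Their sum estimates the reducible error that the script plots against ln(lambda).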
+ + # DATA: y = sin(x(100*1 ~Uniform)) + eps (100*1 ~Normal) + # X = (x^1,...,x^8) (100*8 design matrix) + # ------------------------------------------------------------------------------ + +library(ggplot2) +library(dplyr) +library(tidyr) +library(glmnet) + +set.seed(0) + +# DATA ------------------------------------------------------------------------- + +true_function <- function(x) sin(x) +n_datasets <- 100 +n_samples <- 100 +n_test_samples <- 10000 +n_order <- 8 +lambdas <- exp(seq(-6, 7, length.out = 25)) + +# Generate polynomial features +poly_features <- function(x, degree) { + model.matrix(~ poly(x, degree, raw = TRUE) - 1) +} + +# Initialize arrays to store the bias, variance, and error +bias_square <- rep(0, length(lambdas)) +variance <- rep(0, length(lambdas)) +test_error <- rep(0, length(lambdas)) + +# Generate shared x values for all datasets +x_shared <- runif(n_samples) +x_shared_poly <- poly_features(x_shared, n_order) + +# Generate test data +x_test <- runif(n_test_samples) +y_test <- true_function(x_test) + rnorm(n_test_samples) +x_test_poly <- poly_features(x_test, n_order) + +for (i in 1:length(lambdas)) { + predictions <- matrix(0, nrow = n_datasets, ncol = n_samples) + + for (j in 1:n_datasets) { + epsilon <- rnorm(n_samples) + y <- true_function(x_shared) + epsilon + + model <- glmnet(x_shared_poly, y, alpha = 0, lambda = lambdas[i]) + predictions[j, ] <- predict(model, newx = x_shared_poly) + } + + average_prediction <- apply(predictions, 2, mean) + + bias_square[i] <- mean((average_prediction - true_function(x_shared))^2) + variance[i] <- mean(apply(predictions, 2, var)) +} + + +data <- data.frame(log_lambdas = log(lambdas), + bias_square = bias_square, + variance = variance, + MSE = bias_square + variance) %>% + pivot_longer(cols = c(bias_square, variance, MSE), names_to = "component", values_to = "value") + +p <- ggplot(data, aes(x = log_lambdas, y = value, color = component, linetype = component)) + + geom_line(size = 1) + + scale_color_manual(values = c("red", "green", "blue")) + + scale_linetype_manual(values = c("solid", "solid", "solid")) + + labs(x = expression("ln("~λ~")"), y = "value", title = "Bias-Variance Tradeoff with L2 Regularization") + + theme_minimal() + +ggsave("bias_var_decomp.png", p, width = 12, height = 6) diff --git a/slides/regularization/rsrc/make_fig_regu_nonlin_plots.R b/slides/regularization/rsrc/classifi_nn.R old mode 100644 new mode 100755 similarity index 54% rename from slides/regularization/rsrc/make_fig_regu_nonlin_plots.R rename to slides/regularization/rsrc/classifi_nn.R index 9b7118f8..e0769717 --- a/slides/regularization/rsrc/make_fig_regu_nonlin_plots.R +++ b/slides/regularization/rsrc/classifi_nn.R @@ -1,31 +1,45 @@ -################################################################################ -####### Non-linear regularization: Neural net ################################## -################################################################################ +# ------------------------------------------------------------------------------ +# nonlin + +# FIG: +# (1) classification predictions, weight histograms, and weight values +# for different lambdas (decay parameter) of a nn. +# (2) classification predictions for different sizes of the hidden layer of a nn. +# (3) how classification error changes with different lambdas, +# and with different sizes of the hidden layer of a nn. + +# DATA: "spirals" from mlr3.
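Stripped of all plotting, the training step that classifi_nn.R repeats for each lambda is small enough to run on its own; a minimal sketch using the same mlr3 calls as the script (the size and decay values are illustrative only):

library(mlr3)
library(mlr3learners)
task <- tgen("spirals", sd = 0.1)$generate(n = 100)      # same task as in the script
learner <- lrn("classif.nnet", size = 10, decay = 0.01)  # decay is the weight-decay lambda
learner$train(task)
head(learner$model$wts)  # the weights that plot_weights()/plot_histogram() visualize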
+# ------------------------------------------------------------------------------ library(mlr3) library(mlr3learners) library(mlr3viz) +library(mlr3misc) library(ggplot2) library(gridExtra) library(grid) -#------------------------------------------------------------------------------- + options(digits = 3, width = 65, str = strOptions(strict.width = "cut", vec.len = 3)) -#------------------------------------------------------------------------------- -# functions for plotting the weights +set.seed(1234) + +# PLOT FUNCTIONS --------------------------------------------------------------- + +# value for each weight plot_weights <- function (weights) { - weight_data <- data.frame(value = weights, weights = seq_along(weights)) + weight_data <- data.frame(value = weights, weight_index = seq_along(weights)) - ggplot(weight_data, aes(x=weights, y = value)) + + ggplot(weight_data, aes(x=weight_index, y = value)) + geom_bar(stat ="identity", color="black", fill="white") + ylim(c(-75, 75)) + ggtitle("Weights") } +# histogram of weights plot_histogram <- function (weights) { weight_data <- data.frame(value = weights) @@ -33,14 +47,10 @@ plot_histogram <- function (weights) { geom_histogram (bins= 15, color="black", fill="white") + ggtitle("Histogram of weights") + xlab ("value of weights") + - xlim(c(-100, 100)) #+ - #ylim(c(0,40)) + xlim(c(-100, 100)) } - - -# function for plotting the prediction - +# classification model visualization plot_prediction <- function (learner, task) { plot_learner_prediction(learner, task) + scale_fill_viridis_d(end = .9) + @@ -48,29 +58,21 @@ plot_prediction <- function (learner, task) { ggtitle("Prediction") } -#------------------------------------------------------------------------------- -#spirals dataset +# DATA ------------------------------------------------------------------------- + spirals_generator <- tgen("spirals", sd = 0.1) -# get spirals data spirals_task <- spirals_generator$generate(n=100) +# PLOT PREDICTION & WEIGHTS ---------------------------------------------------- -################################################################################ -############ Different decay parameters ### #################################### -################################################################################ -# decay parameter / lambda +### Different decay parameters decay_list <- list(0, 0.001, 0.005, 0.01, 0.05, 0.1) -# size of single hidden layer size <- 10 - -# plot for all decay paramters the predition & a plot of the weights for(i in seq_along(decay_list)){ - set.seed(1234) learner <- lrn("classif.nnet", size = size, decay = decay_list[[i]]) - learner$train(spirals_task) weights <- learner$model$wts weight_plot <- plot_weights(weights = weights) @@ -78,25 +80,20 @@ for(i in seq_along(decay_list)){ prediction_plot <- plot_prediction(learner, spirals_task) - grid <- grid.arrange(prediction_plot,historgram_plot, weight_plot, ncol = 3, + grid <- grid.arrange(prediction_plot, historgram_plot, weight_plot, ncol = 3, top = textGrob(bquote(lambda==.(decay_list[[i]])), gp = gpar(fontsize = 14))) - ggsave(filename = paste0("../figure/fig-regu-nonlin-", i ,".png"), + ggsave(filename = paste0("../figure/classifi_nn_w_size_", i ,".png"), plot = grid, width = 8, height = 3) } -################################################################################ -############ Different size of hidden layer #################################### -################################################################################ - - +### Different size of hidden layer 
size_list <- list(1, 2, 3, 5, 10, 100) decay <- 0.001 for(i in seq_along(size_list)){ - set.seed(1234) learner <- lrn("classif.nnet", size = size_list[[i]], decay = decay ) learner$train(spirals_task) @@ -110,54 +107,46 @@ for(i in seq_along(size_list)){ top = textGrob(bquote(size~of~hidden~layer==.(size_list[[i]])), gp = gpar(fontsize = 14))) - ggsave(filename = paste0("../figure/fig-regu-nonlin-size-", i ,".png"), + ggsave(filename = paste0("../figure/classifi_nn_size_", i ,".png"), plot = grid, width = 3, height = 3) } -#------------------------------------------------------------------------------- +# PLOT CLASSIFICATION ERROR ---------------------------------------------------- +### Different decay parameters folds <- 10; reps <- 5; size <- 10 decay_list <- seq(0, 0.02, length.out = 20) - -# this might run for 5 min rdesc <- rsmp("repeated_cv", folds = folds, repeats = reps) lrns <- lapply(decay_list, function(d) lrn("classif.nnet", size = size, decay = d)) gg <- benchmark_grid(tasks = spirals_task, resamplings = rdesc, learners = lrns) br <- benchmark(gg) -a <- br$aggregate(measures = msr("classif.ce"), params = TRUE) -a <- mlr3misc::unnest(a, "params") +a1 <- br$aggregate(measures = msr("classif.ce"), params = TRUE) +a1 <- unnest(a1, "params") -a$log_decay <- log(a$decay + 1) #make U-shape more obivious -p <- ggplot(data = a, aes(x = log_decay, y = classif.ce)) + +a1$log_decay <- log(a1$decay + 1) # make U-shape more obvious +p1 <- ggplot(data = a1, aes(x = log_decay, y = classif.ce)) + geom_line() + - xlab("log(lambda+1)") + ylab("classif err") + - xlim(0, 0.01) + ylim(0.13, 0.27) -#print(p) - -ggsave(filename = paste0("../figure/fig-regu-nonlin-srm-1.png"), - plot = p, width = 6, height = 3) - -#------------------------------------------------------------------------------- + xlab(expression("log("~lambda~"+ 1 )")) + ylab("classif err") + + xlim(0, 0.01) + ylim(0.1, 0.25) +ggsave(filename = paste0("../figure/classifi_nn_err_decay.png"), + plot = p1, width = 6, height = 3) +### Different size of hidden layer folds <- 10; reps <- 5; by <- 1 decay <- 0.001 size_list <- seq(1, 30, by = by) - -# this might run for 5 min rdesc <- rsmp("repeated_cv", folds = folds, repeats = reps) lrns <- lapply(size_list, function(s) lrn("classif.nnet", size = s, decay = decay)) gg <- benchmark_grid(tasks = spirals_task, resamplings = rdesc, learners = lrns) br <- benchmark(gg) -a <- br$aggregate(measures = msr("classif.ce"), params = TRUE) -a <- mlr3misc::unnest(a, "params") -p <- ggplot(data = a, aes(x = size, y = classif.ce)) + +a2 <- br$aggregate(measures = msr("classif.ce"), params = TRUE) +a2 <- unnest(a2, "params") +p2 <- ggplot(data = a2, aes(x = size, y = classif.ce)) + geom_line() + xlab("size hidden layer") + ylab("classif err") -#print(p) - -ggsave(filename = paste0("../figure/fig-regu-nonlin-srm-2.png"), - plot = p, width = 6, height = 3) +ggsave(filename = paste0("../figure/classifi_nn_err_size.png"), + plot = p2, width = 6, height = 3) diff --git a/slides/regularization/rsrc/utils.R b/slides/regularization/rsrc/data_func_utils.R old mode 100644 new mode 100755 similarity index 64% rename from slides/regularization/rsrc/utils.R rename to slides/regularization/rsrc/data_func_utils.R index 601554b6..26901f1c --- a/slides/regularization/rsrc/utils.R +++ b/slides/regularization/rsrc/data_func_utils.R @@ -1,11 +1,27 @@ -library(mlr) -library(mlbench) +# ------------------------------------------------------------------------------ +# geom l1, geom l2, wd vs l2 + +# DATA: simulate linear
regression data for ridge and lasso subchapters, +# and define functions for contour plots of empirical risk. +# y = X(100*2 ~Unif)·beta_true(0.5,3) + noise(100*1 ~Normal) + +# FUNC: empirical risk of linear regression model +# hessian matrix for empirical risk +# risk function with l2 regularization +# gradient of empirical risk +# gradient of l2 regularized risk +# gradient descent to get optimal beta +# weight decay to get optimal beta +# contour plots for empirical risk +# ------------------------------------------------------------------------------ + library(ggplot2) -library(BBmisc) -library(reshape) library(viridis) set.seed(123) + +# DATA ------------------------------------------------------------------------- + num_obs <- 100 num_features <- 2 @@ -16,64 +32,76 @@ beta_true <- c(0.5, 3) y <- X %*% beta_true + rnorm(num_obs, sd = err_std) +# FUNCTION --------------------------------------------------------------------- + +# empirical risk R_emp <- function(beta, features = X, target = y){ return(sum((features %*% beta - target)^2)) } +# hessian matrix R_emp_hessian <- function(features = X){ return(2 * t(features)%*%(features)) } +# risk function with l2 regularization R_reg_l2 <- function(beta, lambda = 0.1, features = X, target = y){ return(R_emp(beta, features, target) + (0.5*lambda * sum(beta^2))) } -plot_r_emp <- function(r_emp, x1, x2, bins=NULL, breaks=NULL){ - eval_grid <- expand.grid(x1,x2) - eval_grid$r_emp <- apply(eval_grid, 1, r_emp) - - ggplot(eval_grid) + - geom_raster(aes(x=Var1, y=Var2, fill=r_emp)) + - geom_contour(aes(x=Var1, y=Var2, z=r_emp), colour="white", bins=bins, breaks=breaks) + - xlab(expression(theta[1])) + - ylab(expression(theta[2])) + - scale_fill_viridis(end = 0.9) -} - +# gradient of empirical risk R_emp_grad <- function(beta, features = X, target = y){ return(2 * t(features)%*%(features %*% beta - target)) } +# gradient of l2 regularized risk R_reg_l2_grad <- function(beta, lambda, features = X, target = y){ return((2 * t(features)%*%(features %*% beta - target) + lambda*beta)) } +# gradient descent to get optimal beta gradient_descent <- function(beta_start, step_size, grad_fun, num_steps){ betas <- matrix(0, ncol=length(beta_start), nrow=num_steps) betas[1, ] <- beta_start for(i in seq(2,num_steps)){ betas[i, ] <- betas[i-1, ] - step_size * grad_fun(betas[i-1,]) } - + betas <- as.data.frame(betas) return(betas) } +# weight decay to get optimal beta weight_decay <- function(beta_start, lambda, step_size, unreg_grad_fun, num_steps){ betas_wd <- matrix(NA, ncol=length(beta_start), nrow=(num_steps)*3) betas_wd[1, ] <- beta_start - + betas_gd <- matrix(NA, ncol=length(beta_start), nrow=(num_steps-1)*3) - + for(i in seq(1, 3 * (num_steps-1), 3)){ betas_wd[i+1, ] <- betas_wd[i, ]*(1-step_size*lambda) betas_gd[i, ] <- betas_wd[i+1, ] betas_gd[i+1, ] <- betas_gd[i, ] - step_size * unreg_grad_fun(betas_wd[i,]) betas_wd[i+3, ] <- betas_gd[i+1, ] } - + return(list(betas_wd = as.data.frame(betas_wd), betas_gd = as.data.frame(betas_gd))) -} \ No newline at end of file +} + +# PLOT FUNCTION ---------------------------------------------------------------- + +# empirical risk contour plots +plot_r_emp <- function(r_emp, x1, x2, bins=NULL, breaks=NULL){ + eval_grid <- expand.grid(x1,x2) + eval_grid$r_emp <- apply(eval_grid, 1, r_emp) + + ggplot(eval_grid) + + geom_raster(aes(x=Var1, y=Var2, fill=r_emp)) + + geom_contour(aes(x=Var1, y=Var2, z=r_emp), colour="white", bins=bins, breaks=breaks) + + xlab(expression(theta[1])) + + ylab(expression(theta[2])) + + 
scale_fill_viridis(end = 0.9) +} diff --git a/slides/regularization/rsrc/ozone_example.RData b/slides/regularization/rsrc/data_ozone_example.RData similarity index 100% rename from slides/regularization/rsrc/ozone_example.RData rename to slides/regularization/rsrc/data_ozone_example.RData diff --git a/slides/regularization/rsrc/data_regu_example_1.R b/slides/regularization/rsrc/data_regu_example_1.R new file mode 100755 index 00000000..d3faf433 --- /dev/null +++ b/slides/regularization/rsrc/data_regu_example_1.R @@ -0,0 +1,74 @@ +# ------------------------------------------------------------------------------ +# l1 vs l2 + +# RDATA: +# (1): generate coefficient paths for regression with +# different regularization constants under l1 and l2 regularization. +# (2): generate coefficients and MSE for cross-validation with 50 +# regularization constants (2^-10 to 2^20) +# under l1 and l2 regularization. +# DATA: boston_housing +# ------------------------------------------------------------------------------ + +library(BBmisc) +library(data.table) +library(mlr3) +library(dplyr) +library(mlr3learners) +library(mlr3tuning) + +set.seed(123) + +# DATA ------------------------------------------------------------------------- + +task = tsk("boston_housing") +feat_drop = c("chas", "nox", "rm", "lat", "lon", "town", "tract") +task$select(setdiff(task$feature_names, feat_drop)) +featnames = task$feature_names + +compute_coef_paths = function(task, lambda_name, lambda_seq) { + alpha = ifelse(lambda_name=='lambda1', 1, 0) + path = list() + for (i in seq_along(lambda_seq)) { + lamval <- lambda_seq[i] + learner = lrn("regr.glmnet", alpha = alpha, lambda=lamval) + learner$train(task) + cc <- t(as.matrix(coef(learner$model))) + names <- colnames(cc) + cc <- as.numeric(cc) + names(cc) <- names + cc <- as.list(cc) + cc$lambda <- lamval + path[[i]] <- cc + } + path <- rbindlist(path, fill = TRUE) + path[is.na(path)] <- 0 + + # Perform cross validation + learner = lrn("regr.glmnet", alpha = alpha, lambda=to_tune(lambda_seq)) + + # Construct tuning instance + instance = ti( + task = task, + learner = learner, + resampling = rsmp("cv", folds = 3), + measures = msr("regr.mse"), + terminator = trm("evals", n_evals = length(lambda_seq)) + ) + + tuner <- tnr("grid_search", resolution = length(lambda_seq)) + tuner$optimize(instance) + cv_lam <- as.data.frame(instance$archive$data)[,1:2] + colnames(cv_lam) <- c("lambda", "mse") + cv_lam$lambda <- as.numeric(as.character(cv_lam$lambda)) + cv_lam <- cv_lam %>% arrange(lambda) + + list(path = path, cv_lam = cv_lam) +} + +lambda_seq = 2^seq(-10, 20, length.out = 50) +path_l1 = compute_coef_paths(task, "lambda1", lambda_seq) +path_l2 = compute_coef_paths(task, "lambda2", lambda_seq) + +save2("data_regu_example_1.RData", path_l1 = path_l1, path_l2 = path_l2, featnames = featnames, lambda_seq = lambda_seq) + diff --git a/slides/regularization/rsrc/data_regu_example_1.RData b/slides/regularization/rsrc/data_regu_example_1.RData new file mode 100755 index 00000000..07723ec4 Binary files /dev/null and b/slides/regularization/rsrc/data_regu_example_1.RData differ diff --git a/slides/regularization/rsrc/data_regu_example_2.R b/slides/regularization/rsrc/data_regu_example_2.R new file mode 100755 index 00000000..314bb383 --- /dev/null +++ b/slides/regularization/rsrc/data_regu_example_2.R @@ -0,0 +1,96 @@ +# ------------------------------------------------------------------------------ +# l1 vs l2 + +# RDATA: +# (1): generate coefficients for regression with two regularization
constants +# (lambda 0.01, 100) under l1 and l2 regularization. +# (2): generate coefficients and MSE for cross-validation with 50 +# regularization constants (9.536743e-07 to 2) +# under l1 and l2 regularization. +# DATA: +# Xi ~ Normal(0, 1), Cov(xi, xj) = 0.7^|i-j| +# y = 10*x1 + 10*x2 + 5*x3 + 5*x4 + x5 + ... + x14 + eps(100*1 ~Normal) +# ------------------------------------------------------------------------------ + +library(mlr3) +library(dplyr) +library(mlr3learners) +library(mlr3tuning) +library(BBmisc) +library(ggplot2) +library(gridExtra) +library(MASS) + +set.seed(19873) + +# DATA ------------------------------------------------------------------------- + +n <- 100 # Number of observations +p <- 50 # Number of predictors included in model +CovMatrix <- outer(1:p, 1:p, function(x,y) {.7^abs(x-y)}) +x <- mvrnorm(n, rep(0,p), CovMatrix) +y <- 10 * apply(x[, 1:2], 1, sum) + + 5 * apply(x[, 3:4], 1, sum) + + apply(x[, 5:14], 1, sum) + + rnorm(n) + +dd = as.data.frame(x) +dd$y = y +task <- TaskRegr$new(id = "mytask", backend = dd, target = "y") + +# order coefficients +extract_numeric <- function(x) { + as.numeric(gsub("[^0-9]", "", x)) +} + +get_pen_coefs = function(task, alpha, lam) { + learner = lrn("regr.glmnet", alpha = alpha, lambda=lam) + learner$train(task) + cc <- as.matrix(coef(learner$model))[,1] + names <- names(cc) + cc <- as.numeric(cc) + cc_nonin <- cc[2:length(cc)] # reorder non-intercept cc + names(cc) <- names + names_nonin <- extract_numeric(names[2:length(names)]) + names <- c(names[1], paste0("V", as.character(sort(names_nonin)))) + cc <- cc[names] + names(cc) <- names + return(abs(cc)) +} + +compute_cv = function(task, alpha, lambda_seq) { + learner = lrn("regr.glmnet", alpha = alpha, lambda=to_tune(lambda_seq)) + + # Construct tuning instance + instance = ti( + task = task, + learner = learner, + resampling = rsmp("cv", folds = 3), + measures = msr("regr.mse"), + terminator = trm("evals", n_evals = length(lambda_seq)) + ) + + tuner <- tnr("grid_search", resolution = length(lambda_seq)) + tuner$optimize(instance) + cv_lam <- as.data.frame(instance$archive$data)[,1:2] + colnames(cv_lam) <- c("lambda", "mse") + cv_lam$lambda <- as.numeric(as.character(cv_lam$lambda)) + cv_lam <- cv_lam %>% arrange(lambda) + + return(cv_lam) +} + +lams = c(0.01, 100) +cc_l2_1 = get_pen_coefs(task, alpha = 0, lam = lams[1]) +cc_l2_2 = get_pen_coefs(task, alpha = 0, lam = lams[2]) +cc_l1_1 = get_pen_coefs(task, alpha = 1, lam = lams[1]) +cc_l1_2 = get_pen_coefs(task, alpha = 1, lam = lams[2]) + + +lambda_seq = 2^seq(-20, 1, length.out = 50) +cv_l1 = compute_cv(task, alpha = 1, lambda_seq) +cv_l2 = compute_cv(task, alpha = 0, lambda_seq) + +save2("data_regu_example_2.RData", lams, lambda_seq, + cc_l2_1, cc_l2_2, cc_l1_1, cc_l1_2, + cv_l1, cv_l2) \ No newline at end of file diff --git a/slides/regularization/rsrc/data_regu_example_2.RData b/slides/regularization/rsrc/data_regu_example_2.RData new file mode 100755 index 00000000..f994d673 Binary files /dev/null and b/slides/regularization/rsrc/data_regu_example_2.RData differ diff --git a/slides/regularization/rsrc/early_stopping.R b/slides/regularization/rsrc/early_stopping.R new file mode 100755 index 00000000..ba4aaf6e --- /dev/null +++ b/slides/regularization/rsrc/early_stopping.R @@ -0,0 +1,146 @@ +# ------------------------------------------------------------------------------ +# intro + +# FIG: show how early stopping influences training and test results. +# LEFT: how MSE changes with iterations. 
+# RIGHT: two fitted curves (early stopping & overfit). + +# DATA: Ozone from package-mlbench +# ------------------------------------------------------------------------------ + +library(mlbench) +library(reshape2) +library(ggplot2) +library(gridExtra) + +theme_set(theme_minimal()) + +set.seed(6) + +# DATA ------------------------------------------------------------------------- + +data(Ozone) + +# gradient of empirical risk +R_emp_grad <- function(beta, + features = X, + target = y) { + return(2 * t(features) %*% (features %*% beta - target)) +} + +gradient_descent <- + function(beta_start, + step_size, + grad_fun, + num_steps, + features, + target) { + betas <- matrix(0, ncol = length(beta_start), nrow = num_steps) + betas[1,] <- beta_start + for (i in seq(2, num_steps)) { + betas[i,] <- + betas[i - 1,] - step_size * grad_fun(betas[i - 1, ], features, + target) + } + + betas <- as.data.frame(betas) + return(betas) + } + +# generate polynomials +poly <- function(x, degree) { + sapply(0:degree, function(i) + x ^ i) +} + +o_data <- Ozone[, c(4, 8)] +o_data$V8 <- o_data$V8 / 100 +o_data <- o_data[complete.cases(o_data), ] + +id_train <- sample(1:nrow(o_data), 20) +o_data$type <- "test" +o_data[id_train,]$type <- "train" +o_data$type <- as.factor(o_data$type) + +train_data <- as.matrix(o_data[id_train, 1:2]) +test_data <- as.matrix(o_data[-id_train, 1:2]) + +degree <- 15 + +x_train <- poly(train_data[, 2], degree) +y_train <- o_data[id_train , 1] +x_test <- poly(test_data[, 2], degree) +y_test <- o_data[-id_train , 1] + +num_steps <- 1000000 +res <- gradient_descent(rep(0, ncol(x_train)), 0.02, #0.003, + R_emp_grad, num_steps, x_train, y_train) + +errs <- matrix(0, nrow = 2000, ncol = 2) +it1 <- 1:1000 +for (i in it1) { + errs[i, 1] <- + sum((x_train %*% t(res[i, ]) - y_train) ^ 2) / nrow(x_train) + errs[i, 2] <- + sum((x_test %*% t(res[i, ]) - y_test) ^ 2) / nrow(x_test) +} +it2 <- seq(1000, num_steps, length.out = 1000) +for (i in it2) { + errs[1000 + which(it2 == i), 1] <- + sum((x_train %*% t(res[i, ]) - y_train) ^ 2) / nrow(x_train) + errs[1000 + which(it2 == i), 2] <- + sum((x_test %*% t(res[i, ]) - y_test) ^ 2) / nrow(x_test) +} + +df <- as.data.frame(errs) +colnames(df) <- c("train", "test") +df$id <- c(it1, it2) + +min_te <- which.min(errs[, 2]) + +learning_df <- melt(df, id.vars = "id") + +# PLOT ------------------------------------------------------------------------- + +# MSE +p1 <- ggplot(learning_df, aes(x = id, y = value)) + + geom_line(aes(colour = variable), size=1.2) + + geom_vline(xintercept = min_te, colour="gray", + alpha= 0.8, size = 1.5) + + geom_vline(xintercept = num_steps, colour="gray", + linetype = "dashed", alpha= 0.8, size = 1.5) + + scale_x_log10() + + ylab("MSE") + + xlab("Iterations") + + scale_fill_brewer(palette="Dark2") + + annotate("text", x=min_te-70, y=175, label="stopped early", + color='black', size=3) + + annotate("text", x=num_steps-4*1e5, y=175, label="overfitted", + color='black', size=3) + + theme(legend.position = "bottom") + + guides(color = guide_legend(title = NULL)) + +# ozone level +pl_data <- seq(min(o_data[, 2]), max(o_data[, 2]), length.out = 100) +pl_data <- poly(pl_data, degree) + +y_overfit <- (pl_data) %*% t(res[num_steps, ])[,1] +y_best <- (pl_data) %*% t(res[min_te, ])[,1] + +fitting_df <- data.frame(overfit = y_overfit, best = y_best, x = pl_data[, 2] * 100) +fitting_df <- melt(fitting_df, id.vars = "x") + +p2 <- ggplot(o_data, aes(x=V8*100, y=V4)) + + geom_point(aes(colour=type)) + + geom_line(data=fitting_df, 
aes(linetype=rev(variable), x=x, y=value), alpha = 0.7, + show.legend=FALSE, color="gray", size=1.5) + + ylab("Ozone level") + + xlab("Temperature (degrees F)") + + scale_fill_brewer(palette="Dark2") + + theme(legend.position = "bottom") + + guides(color = guide_legend(title = NULL)) + +p = grid.arrange(p1, p2, ncol = 2) + +ggsave("../figure/early_stopping.png", plot=p, width=9, height=6) + diff --git a/slides/regularization/rsrc/equivariance-ols-ridge.R b/slides/regularization/rsrc/equivariance-ols-ridge.R deleted file mode 100644 index 13f8b658..00000000 --- a/slides/regularization/rsrc/equivariance-ols-ridge.R +++ /dev/null @@ -1,38 +0,0 @@ -library(MASS) - -# Data -set.seed(123) -n <- 100 -p <- 5 -X <- matrix(rnorm(n * p), n, p) -beta_true <- c(1, 2, 3, 4, 5) -epsilon <- rnorm(n) -Y <- X %*% beta_true + epsilon - -# OLS Solution -beta_ols <- solve(t(X) %*% X) %*% t(X) %*% Y - -# Ridge Solution -lambda <- 10 -beta_ridge <- solve(t(X) %*% X + lambda * diag(p)) %*% t(X) %*% Y - -# Rescale and repeat -X_rescaled <- X -X_rescaled[,5] <- 100 * X_rescaled[,5] -beta_ols_rescaled <- solve(t(X_rescaled) %*% X_rescaled) %*% t(X_rescaled) %*% Y -beta_ridge_rescaled <- solve(t(X_rescaled) %*% X_rescaled + lambda * diag(p)) %*% t(X_rescaled) %*% Y - -# Results -results <- rbind(t(beta_ols), t(beta_ols_rescaled), t(beta_ridge), t(beta_ridge_rescaled)) -colnames(results) <- paste("Coefficient", 1:p) - -# MSE -loss_ols <- mean((Y - X %*% beta_ols)^2) -loss_ols_rescaled <- mean((Y - X_rescaled %*% beta_ols_rescaled)^2) -loss_ridge <- mean((Y - X %*% beta_ridge)^2) # + lambda * sum(beta_ridge^2) -loss_ridge_rescaled <- mean((Y - X_rescaled %*% beta_ridge_rescaled)^2) #+ lambda * sum(beta_ridge_rescaled^2) - -losses <- c(loss_ols, loss_ols_rescaled, loss_ridge, loss_ridge_rescaled) -results <- cbind(results, MSE = losses) -rownames(results) <- c("OLS", "OLS Rescaled", "Ridge", "Ridge Rescaled") -print(results) diff --git a/slides/regularization/rsrc/make_graddes_vs_weightdecay_plot.R b/slides/regularization/rsrc/graddes_vs_weightdecay.R old mode 100644 new mode 100755 similarity index 71% rename from slides/regularization/rsrc/make_graddes_vs_weightdecay_plot.R rename to slides/regularization/rsrc/graddes_vs_weightdecay.R index 1dddbcc2..fc78e14c --- a/slides/regularization/rsrc/make_graddes_vs_weightdecay_plot.R +++ b/slides/regularization/rsrc/graddes_vs_weightdecay.R @@ -1,8 +1,16 @@ # ------------------------------------------------------------------------------ -# FIG: GRADIENT DESCENT VS WEIGHT DECAY PLOT +# wd vs l2 + +# FIG: draw the path of the optimal point for each iteration using +# gradient descent and using weight decay. 
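The two paths can be compared step by step because, for this quadratic risk, one weight-decay update equals one gradient step on the L2-regularized risk: (1 - alpha*lambda)*b - alpha*grad(b) = b - alpha*(grad(b) + lambda*b). A minimal sketch of that identity, reusing R_emp_grad() from data_func_utils.R (wd_step is a hypothetical helper; alpha is the step size, lambda the regularization constant):

wd_step <- function(b, alpha, lambda) {
  b_shrunk <- (1 - alpha * lambda) * b  # decay: shrink the weights first
  b_shrunk - alpha * R_emp_grad(b)      # then an unregularized gradient step
}
# algebraically identical to b - alpha * (R_emp_grad(b) + lambda * b),
# i.e. one step of gradient descent on the L2-regularized risk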
-source("utils.R") +source("data_func_utils.R") +library(gridExtra) + +# DATA ------------------------------------------------------------------------- x1 <- seq(0,1.5,length.out = 100) x2 <- seq(0,3.5,length.out = 100) @@ -14,14 +22,15 @@ num_steps <- 100 gd_betas <- gradient_descent(beta_start, step_size, grad, num_steps) -# R_emp plot +# PLOT ------------------------------------------------------------------------- + +# GD remp_plot <- plot_r_emp(R_emp, x1, x2) + geom_path(data = gd_betas, aes(x=V1, y=V2), colour = "red", size=1.1) + geom_point(data = gd_betas, aes(x=V1, y=V2), colour = "white") + theme(legend.position="none") -# R_reg plot - +# WD lambda <- 10 num_steps <- 100 gd_l2_betas <- gradient_descent(beta_start, step_size, @@ -35,7 +44,8 @@ remp_l2_plot <- plot_r_emp(R_emp, x1, x2) + geom_point(data = gd_l2_betas, aes(x=V1, y=V2), colour = "white") + theme(legend.position="none") -#p <- grid.arrange(remp_plot, remp_l2_plot, ncol=2) +p <- grid.arrange(remp_plot, remp_l2_plot, ncol=2) +ggsave("../figure/graddes_vs_weightdecay.png", plot = p, width = 5.2, height = 3.1, dpi="retina") ggsave("../figure/graddes_vs_weightdecay_01.png", plot = remp_plot, width = 2.6, height = 3.1, dpi="retina") ggsave("../figure/graddes_vs_weightdecay_02.png", plot = remp_l2_plot, width = 2.6, height = 3.1, dpi="retina") diff --git a/slides/regularization/rsrc/make_l1_reg_hess_plots.R b/slides/regularization/rsrc/l1_reg_hess.R old mode 100644 new mode 100755 similarity index 85% rename from slides/regularization/rsrc/make_l1_reg_hess_plots.R rename to slides/regularization/rsrc/l1_reg_hess.R index d0d05a2e..1b464d05 --- a/slides/regularization/rsrc/make_l1_reg_hess_plots.R +++ b/slides/regularization/rsrc/l1_reg_hess.R @@ -1,10 +1,18 @@ # ------------------------------------------------------------------------------ -# FIG: L2 REGULARIZATION HESSIAN PLOTS +# geom l1 + +# FIG: theta_hat (OLS) and theta_lasso (Lasso Regression) points on a contour +# plot, showing how the l1 penalty pulls the optimal coefficients +# towards zero along each axis and overall. + +# DATA: principal components of linear model data from data_func_utils.R # ------------------------------------------------------------------------------ -source("utils.R") +source("data_func_utils.R") library(gridExtra) +# DATA ------------------------------------------------------------------------- +
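# Editor's aside (toy numbers, not the script's data): for a diagonal Hessian H,
# the l1-regularized optimum is the soft-thresholded OLS solution -- exactly the
# theta_l1_reg formula used below. Quick check:
theta_hat_toy <- c(1.5, -0.4)
H_diag <- c(2, 2)
lambda_toy <- 1
sign(theta_hat_toy) * pmax(abs(theta_hat_toy) - lambda_toy / H_diag, 0)
# [1] 1 0  -- the large coefficient shrinks, the small one is zeroed out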
prc <- prcomp(X, scale. = FALSE) X_dc <- prc$x X_dc[,1] <- X_dc[, 1]/2 @@ -26,6 +34,9 @@ colnames(theta_hat) <- NULL lambda <- 10 theta_l1_reg <- sign(theta_hat) * pmax(abs(theta_hat) - lambda / diag(hessian),0) +# PLOT ------------------------------------------------------------------------- + +# plot contour lines and theta_hat init_plot_l1 <- plot_r_emp(function(beta) R_emp(beta, features = X_dc, target = y_new), x1, x2) + theme(legend.position = "none") + @@ -37,22 +48,18 @@ init_plot_l1 <- plot_r_emp(function(beta) R_emp(beta, features = X_dc, target = geom_vline(xintercept = 0, colour="lightblue", linetype = "dashed", alpha= 0.8, size = 1.1) + geom_point(data=as.data.frame(theta_hat), aes(x=theta_hat[1], y=theta_hat[2]), color="red", size=2) + - #geom_line(data=rbind(rep(0, num_features), as.data.frame(theta_hat)), - # aes(x=V1, y=V2), colour="red", size=1.1, arrow=arrow(ends="first", length=unit(0.09, "npc"))) + geom_vline(xintercept = -lambda/hessian[1,1], colour="yellow", linetype = "dashed", alpha= 0.8, size = 1.1) + annotate("label", x = -2, y = -2.5, label = "frac(-lambda, H[\"1,1\"])", parse = TRUE, color = 'black', size = 4, fill = "yellow") +# plot with arrows and points theta_hat_1 <- theta_hat theta_hat_1[,1] <- 0 - +# effect along theta1 plot_l1_theta1 <- init_plot_l1 + - # geom_polygon(data = data.frame(x = c(theta_hat[,1], theta_hat[,1], 0, 0), - # y = c(-Inf, Inf, Inf, -Inf)), - # aes(x,y), fill="white", alpha=0.5) + geom_point(data=as.data.frame(theta_hat_1), aes(x=theta_hat_1[1], y=theta_hat_1[2]), color="green", size=2) + geom_segment(data=cbind(start=as.data.frame(theta_hat), end=as.data.frame(theta_hat_1)), aes(x=start.V1, y=start.V2, @@ -64,7 +71,7 @@ p1 <- grid.arrange(init_plot_l1, plot_l1_theta1, ncol=2) ################################################################### - +# effect along theta2 theta_hat_2 <- theta_hat theta_hat_2[,2] <- theta_l1_reg[2] @@ -72,9 +79,6 @@ theta_hat_2[,2] <- theta_l1_reg[2] plot_l1_theta2 <- init_plot_l1 + geom_hline(yintercept=lambda/hessian[2,2], colour="yellow", linetype="dashed", alpha=0.8, size=1.1) + - # geom_polygon(data = data.frame(x = c(-Inf, Inf, Inf, -Inf), - # y = c(theta_hat[,2], theta_hat[,2], 0, 0)), - # aes(x,y), fill="white", alpha=0.5) + geom_point(data=as.data.frame(theta_hat_2), aes(x=theta_hat_2[1], y=theta_hat_2[2]), color="green", size=2) + geom_segment(data=cbind(start=as.data.frame(theta_hat), end=as.data.frame(theta_hat_2)), aes(x=start.V1, y=start.V2, @@ -84,12 +88,10 @@ plot_l1_theta2 <- init_plot_l1 + annotate("label", x=-3, y=2, label="frac(lambda, H[\"2,2\"])", parse=TRUE, color='black', size=4, fill="yellow") +# effect along both axes plot_l1_theta2_dash <- init_plot_l1 + geom_hline(yintercept=lambda/hessian[2,2], colour="yellow", linetype="dashed", alpha=0.8, size=1.1) + - # geom_polygon(data = data.frame(x = c(-Inf, Inf, Inf, -Inf), - # y = c(theta_hat[,2], theta_hat[,2], 0, 0)), - # aes(x,y), fill="white", alpha=0.5) + geom_point(data=as.data.frame(theta_hat_1), aes(x=theta_hat_1[1], y=theta_hat_1[2]), color="green", size=2) + geom_segment(data=cbind(start=as.data.frame(theta_hat), end=as.data.frame(theta_hat_1)), aes(x=start.V1, y=start.V2, @@ -105,6 +107,7 @@ plot_l1_theta2_dash <- init_plot_l1 + annotate("label", x=-3, y=2, label="frac(lambda, H[\"2,2\"])", parse=TRUE, color='black', size=4, fill="yellow") +# combining the shifts along both axes yields theta_lasso plot_l1_theta_lasso <- plot_l1_theta2_dash + geom_point(data=as.data.frame(theta_l1_reg), aes(x=theta_l1_reg[1], y=theta_l1_reg[2]), color="orange", size=2) +
geom_segment(data=cbind(start=as.data.frame(theta_hat), end=as.data.frame(theta_l1_reg)), diff --git a/slides/regularization/rsrc/make_l2_reg_hess_plots.R b/slides/regularization/rsrc/l2_reg_hess.R old mode 100644 new mode 100755 similarity index 87% rename from slides/regularization/rsrc/make_l2_reg_hess_plots.R rename to slides/regularization/rsrc/l2_reg_hess.R index 5f8b24b6..b77990d8 --- a/slides/regularization/rsrc/make_l2_reg_hess_plots.R +++ b/slides/regularization/rsrc/l2_reg_hess.R @@ -1,10 +1,18 @@ # ------------------------------------------------------------------------------ -# FIG: L2 REGULARIZATION HESSIAN PLOTS +# geom l2 + +# FIG: theta_hat (OLS) and theta_ridge (Ridge Regression) points on a contour +# plot, showing how the l2 penalty shrinks the optimal value +# along each principal axis and overall. + +# DATA: linear model data from data_func_utils.R # ------------------------------------------------------------------------------ -source("utils.R") +source("data_func_utils.R") library(gridExtra) +# DATA ------------------------------------------------------------------------- + lambda <- 90 beta_start <- c(0, 0) step_size <- 0.005 @@ -38,14 +46,16 @@ theta_min_ridge_data <- as.data.frame(t(Q %*% theta_min_skew)) x1 <- seq(-2,2,length.out = 100) x2 <- seq(-1,5,length.out = 100) -#record contour level +# PLOT ------------------------------------------------------------------------- + +# record contour levels p_con <- plot_r_emp(R_emp, x1, x2, bins=25) ct_data <- ggplot_build(p_con)$data[[2]] ct_levels <- unique(ct_data$level) -#preserve half to make plots look better +# keep only half of them so the plots stay readable (fewer contour lines) ct_levels <- ct_levels[-seq(3, length(ct_levels), by = 2)] -# R_emp +# plot contour lines and theta_hat init_cond_plot <- plot_r_emp(R_emp, x1, x2, breaks=ct_levels) + annotate("label", x = 0.75, y = 3, label = "hat(theta)", parse = TRUE, color = 'black', size = 3, fill = "red") + @@ -55,6 +65,7 @@ init_cond_plot <- plot_r_emp(R_emp, x1, x2, breaks=ct_levels) + geom_line(data=rbind(rep(0, num_features), theta_min), aes(x=V1, y=V2), colour="red", size=1, arrow = arrow(length = unit(0.06, "npc"))) +# effect along the two principal axes rot_plot <- plot_r_emp(R_emp, x1, x2, breaks=ct_levels) + theme(legend.position="none") + coord_fixed() + geom_abline(slope = Q[2,1]/Q[1,1], colour="darkgrey", size=1.2) + @@ -71,23 +82,9 @@ rot_plot <- plot_r_emp(R_emp, x1, x2, breaks=ct_levels) + colour = "green", arrow.fill = "green") rs <- sapply(1:2, function(i) S[i,i] / (S[i,i] + lambda)) - theta_hat <- theta_proj1_data*rs[1] + theta_proj2_data*rs[2]
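# Editor's aside (toy numbers): rs implements the spectral view of ridge.
# Writing X^T X = Q S Q^T, ridge rescales theta_hat along the i-th principal
# axis by S[i,i] / (S[i,i] + lambda), so flat directions (small S[i,i]) shrink most.
S_toy <- c(4, 1); lambda_toy <- 1; theta_toy <- c(2, 2)  # axes aligned with coordinates
theta_toy * S_toy / (S_toy + lambda_toy)
# [1] 1.6 1.0  -- the low-curvature direction is shrunk much harder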
-geom_l2_plot <- plot_r_emp(R_emp, x1, x2, breaks=ct_levels) + - theme(legend.position="none") + coord_fixed() + - geom_hline(yintercept = 0, colour="darkgrey", size=1.2) + - geom_vline(xintercept = 0, colour="darkgrey", size=1.2) + - geom_point(aes(x=beta_true[1], y=beta_true[2], color="red", size=3)) + - geom_point(aes(x=theta_hat[1], y=theta_hat[2], color="yellow", size=3)) - -geom_l2_plot <- geom_l2_plot + - annotate("label", x = 1.3, y = 1.5, label = "hat(theta)[Ridge]", - parse = TRUE, color = 'black', size = 3, fill = "yellow") + - annotate("label", x = 0.75, y = 3, label = "hat(theta)", - parse = TRUE, color = 'black', size = 3, fill = "red") - -##############shang +# theta_ridge decomposition along the principal axes scale_rot_plot <- rot_plot + geom_segment(data=cbind(start=as.data.frame(t(c(0,0))), end= theta_proj1_data*rs[1] ), size=0.9, @@ -109,6 +106,7 @@ scale_rot_plot <- scale_rot_plot + xend = end.V1, yend = end.V2), colour = "yellow") +# theta_hat and theta_ridge scale_plot <- init_cond_plot + annotate("label", x = 0.8, y = 1.5, label = "hat(theta)[Ridge]", parse = TRUE, color = 'black', size = 3, fill = "yellow") + @@ -126,9 +124,9 @@ p2 <- grid.arrange(rot_plot, init_cond_plot, ncol=2) p3 <- grid.arrange(scale_rot_plot, scale_plot, ncol=2) -### contour plot for l2 +### contour plot with l2 constraints -# Generate data points for plotting circles(ridge) +# Generate data points for plotting l2 constraints (circles) radius <- sqrt(theta_hat[1]^2 + theta_hat[2]^2)[[1]] #radius for intersection point cir_list <- list() seq_data <- seq(0, 2*pi, length.out=100) #points for one circle @@ -141,6 +139,7 @@ for(mul in c(radius/8, radius/3, radius/1.5, radius)){ #adjust radius eval_grid <- expand.grid(x1,x2) eval_grid$r_emp <- apply(eval_grid, 1, R_emp) +# ellipse contours p_elli <- ggplot() + geom_raster(data=eval_grid, aes(x=Var1, y=Var2, fill=r_emp)) + geom_contour(data=eval_grid, aes(x=Var1, y=Var2, z=r_emp), @@ -150,6 +149,7 @@ p_elli <- ggplot() + ylab(expression(theta[2])) + scale_fill_viridis(end = 0.9) +# ellipse and circle contours p_ridge <- p_elli + geom_path(data=cir_list[[1]], aes(x, y), color="white", linetype="dashed") + geom_path(data=cir_list[[2]], aes(x, y), color="white", linetype="dashed") + @@ -160,6 +160,7 @@ p_ridge <- p_elli + beta_true <- data.frame(x=beta_true[1], y=beta_true[2]) theta_hat <- data.frame(x=theta_hat[1][[1]], y=theta_hat[2][[1]]) +# add points p_poi <- p_ridge + geom_point(data=beta_true, aes(x=x, y=y), color="red", size=3) + geom_point(data=theta_hat, aes(x=x, y=y), color="yellow", size=3) + @@ -170,6 +171,7 @@ p_poi <- p_ridge + geom_hline(yintercept=0, colour="darkgrey", size=1.2) + geom_vline(xintercept=0, colour="darkgrey", size=1.2) + +# add decomposition arrows p4 <- p_poi + geom_segment(data=cbind(start=as.data.frame(t(c(0,0))), end=beta_true ), size=0.9, arrow=arrow(length = unit(0.06, "npc")), @@ -206,7 +208,7 @@ p4 <- p_poi + -ggsave("../figure/l2_reg_hess_01_plot.png", plot = p1, width = 5.5, height = 3.5, dpi="retina") -ggsave("../figure/l2_reg_hess_02_plot.png", plot = p2, width = 5.5, height = 3.5, dpi="retina") -ggsave("../figure/l2_reg_hess_03_plot.png", plot = p3, width = 5.5, height = 3.5, dpi="retina") -ggsave("../figure/l2_reg_hess_04_plot.png", plot = p4, width = 3, height = 5, dpi="retina") +ggsave("../figure/l2_reg_hess_01.png", plot = p1, width = 5.5, height = 3.5, dpi="retina") +ggsave("../figure/l2_reg_hess_02.png", plot = p2, width = 5.5, height = 3.5, dpi="retina") +ggsave("../figure/l2_reg_hess_03.png", plot = p3, width = 5.5, height = 3.5, dpi="retina") +ggsave("../figure/l2_reg_hess_04.png", plot = p4, width = 3, height = 5, dpi="retina") diff --git a/slides/regularization/rsrc/lasso_contour_cases.R b/slides/regularization/rsrc/lasso_contour_cases.R new file mode 100755 index 00000000..b448c3be --- /dev/null +++ b/slides/regularization/rsrc/lasso_contour_cases.R @@ -0,0 +1,81 @@ +# ------------------------------------------------------------------------------ +# l1 + +# FIG: lasso contour plots under three parameter settings: +# (1) the smaller parameter theta_1 is removed +# (2) small lambda that does not lead to sparsity +# (3) large lambda that leads to sparsity +# ------------------------------------------------------------------------------
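# Editor's sketch (toy data; assumes the glmnet package is installed) of the
# contrast the three panels illustrate: at comparable penalty strength, lasso
# can set a weak coefficient exactly to zero, while ridge only shrinks it.
library(glmnet)
set.seed(1)
X_toy <- matrix(rnorm(200), 100, 2)
y_toy <- X_toy %*% c(1, 0.1) + rnorm(100)
coef(glmnet(X_toy, y_toy, alpha = 1, lambda = 0.3))  # lasso: weak coef typically exactly 0
coef(glmnet(X_toy, y_toy, alpha = 0, lambda = 0.3))  # ridge: both shrunk, neither exactly 0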
+ +library(ggplot2) +library(gridExtra) + +# ------------------------------------------------------------------------------ + +# Function to create contour plots +create_contour_plot <- function(theta_hat, theta_lasso, l1_edge, outermost_point, annotation_positions, subtitle) { + theta1 <- seq(-4, 4, length.out = 300) + theta2 <- seq(-2, 5, length.out = 300) + grid <- expand.grid(Theta1 = theta1, Theta2 = theta2) + + target_direction <- c(1, 4) / sqrt(sum(c(1, 4)^2)) + angle <- atan2(target_direction[2], target_direction[1]) - pi / 18 + rot_matrix <- matrix(c(cos(angle), -sin(angle), sin(angle), cos(angle)), nrow = 2, byrow = TRUE) + + scale <- c(1, 2) + Z <- as.matrix(grid) - matrix(theta_hat, nrow = nrow(grid), ncol = 2, byrow = TRUE) + Z <- Z %*% rot_matrix + Z <- Z %*% diag(scale) + Z <- Z %*% t(rot_matrix) + L <- (Z[, 1])^2 + (Z[, 2])^2 + grid$L <- L + + outermost_level <- sum((outermost_point - theta_hat)^2) + + # Create the ggplot object + p <- ggplot() + + geom_contour(data = grid, aes(x = Theta1, y = Theta2, z = L), colour = "red", breaks = seq(min(L), outermost_level, length.out = 5)) + + geom_polygon(data = data.frame(x = c(l1_edge, 0, -l1_edge, 0), y = c(0, l1_edge, 0, -l1_edge)), aes(x, y), fill = "cyan", alpha = 0.3) + + labs(x = expression(theta[1]), y = expression(theta[2]), title = subtitle) + + theme_bw() + + coord_fixed() + + p <- p + + geom_point(data=as.data.frame(theta_hat), aes(x=theta_hat[1], y=theta_hat[2]), colour="black") + + annotate("label", x=annotation_positions[2, 1], y=annotation_positions[2, 2], label="hat(theta)", parse=TRUE, size=5) + + geom_segment(data=cbind(start=as.data.frame(matrix(annotation_positions[2,], nrow = 1, byrow = TRUE)), end=as.data.frame(matrix(theta_hat, nrow = 1, byrow = TRUE))), + aes(x=start.V1, y=start.V2, + xend=end.V1, yend=end.V2), colour="black", + size=0.9, arrow = arrow(ends="last", type="closed", length=unit(0.04, "npc")), + arrow.fill="black") + + p <- p + + geom_point(data=as.data.frame(theta_lasso), aes(x=theta_lasso[1], y = theta_lasso[2]), colour="black") + + annotate("label", x=annotation_positions[1, 1], y=annotation_positions[1, 2], label="hat(theta)[\"Lasso\"]", parse=TRUE, size=5) + + geom_segment(data=cbind(start=as.data.frame(matrix(annotation_positions[1,], nrow = 1, byrow = TRUE)), end=as.data.frame(matrix(theta_lasso, nrow = 1, byrow = TRUE))), + aes(x=start.V1, y=start.V2, + xend=end.V1, yend=end.V2), colour="black", + size=0.9, arrow = arrow(ends="last", type="closed", length=unit(0.04, "npc")), + arrow.fill="black") + + xlim(-3, 3) + + ylim(-2, 5) + return(p) +} + +# Create individual plots +plot1 <- create_contour_plot(theta_hat = c(0.5, 3), theta_lasso = c(0, 1), l1_edge = 1, + outermost_point = c(0, 1), annotation_positions = matrix(c(-2, 1.1, 2.5, 2), nrow = 2, byrow = TRUE), + subtitle = expression(paste("smaller param.
", theta[1], " is removed"))) + +plot2 <- create_contour_plot(theta_hat = c(1, 1), theta_lasso = c(0.5, 0.5), l1_edge = 1, + outermost_point = c(0.5, 0.5), annotation_positions = matrix(c(-0.5, 2.5, 2, 3), nrow = 2, byrow = TRUE), + subtitle = "small λ: no sparsity") + +plot3 <- create_contour_plot(theta_hat = c(1, 1), theta_lasso = c(0.5, 0), l1_edge = 0.5, + outermost_point = c(0.5, 0), annotation_positions = matrix(c(-0.5, 2.5, 2, 3), nrow = 2, byrow = TRUE), + subtitle = "larger λ: sparsity") + +# Arrange the plots in a grid +p <- grid.arrange(plot1, plot2, plot3, nrow = 1) + +ggsave("../figure/lasso_contour_cases.png", plot = p, height = 6, width = 18) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_linear_model_reg.R b/slides/regularization/rsrc/lin_model_regu.R old mode 100644 new mode 100755 similarity index 75% rename from slides/regularization/rsrc/make_linear_model_reg.R rename to slides/regularization/rsrc/lin_model_regu.R index dfa20732..6c9e4916 --- a/slides/regularization/rsrc/make_linear_model_reg.R +++ b/slides/regularization/rsrc/lin_model_regu.R @@ -1,20 +1,29 @@ -# Load necessary libraries +# ------------------------------------------------------------------------------ +# l1, l2 + +# FIG: optimal points with different regularization constants (lambda) +# on contour plot for linear regression with +# l1 and l2 regularization. + +# DATA: y = X(100*2 ~Normal)·beta_true(3,-2) + noise(100*1 ~Normal) +# ------------------------------------------------------------------------------ + library(ggplot2) library(MASS) library(glmnet) library(gridExtra) -# Example dataset set.seed(123) + +# DATA ------------------------------------------------------------------------- + n <- 100 X <- matrix(rnorm(2 * n), n, 2) beta_true <- c(3, -2) y <- X %*% beta_true + rnorm(n) -# Train unregularized linear model lm_unreg <- lm(y ~ X - 1) # '-1' to remove the intercept -# Train L2 regularized models with different lambdas lambdas_l2 <- c(0.1, 1, 2.5, 5, 10, 20, 100) models <- lapply(lambdas_l2, function(lambda) { return(glmnet::glmnet(X, y, alpha = 0, lambda = lambda, standardize = FALSE, intercept = FALSE)) @@ -24,24 +33,20 @@ coefs <- sapply(models, function(model, lambda) { coef(model, s = lambda)[-1, 1] # Exclude the intercept }, lambdas_l2) -# Transpose to make each column represent a model +# Transpose so each column represents a model coefs_l2 <- t(coefs) -# Create a data frame from the matrix coefs_df_l2 <- as.data.frame(coefs_l2) names(coefs_df_l2) <- c("X1", "X2") -# Prepare data for contour plot grid_range <- seq(-5, 5, length.out = 100) grid_data <- expand.grid(X1 = grid_range, X2 = grid_range) grid_data$loss <- apply(grid_data, 1, function(vec) { sum((y - X %*% vec)^2) / (2 * n) }) -# Adjusted lambda values lambdas_l1 <- c(0.01, 0.5, 1, 1.5, 2, 2.5, 10) -# Train L1 regularized models with the adjusted lambdas models_l1 <- lapply(lambdas_l1, function(lambda) { return(glmnet::glmnet(X, y, alpha = 1, lambda = lambda, standardize = FALSE, intercept = FALSE)) }) @@ -49,33 +54,31 @@ models_l1 <- lapply(lambdas_l1, function(lambda) { # Extract coefficients for L1 regularized models coefs_l1 <- sapply(models_l1, function(model, lambda) { coef(model, s = lambda)[-1, 1] -}, lambdas) +}, lambdas_l1) -# Transpose to make each column represent a model +# Transpose so each column represents a model coefs_l1 <- t(coefs_l1) -# Create a data frame for L1 coefficients coefs_df_l1 <- as.data.frame(coefs_l1) names(coefs_df_l1) <- c("X1", "X2") -# Add lambda values to the L1 
coefficients data frame coefs_df_l1$lambda <- factor(lambdas_l1) -# Manually defined red colors -red_colors <- c("#ffcccc", # lightest red +red_colors <- c("#ffcccc", "#ff9999", "#ff6666", "#ff3333", - "#ff0000", # medium red + "#ff0000", "#cc0000", - "#800000") # darkest red + "#800000") -# Ensure the number of colors matches the number of lambda values if(length(red_colors) != length(lambdas_l1)) { stop("The number of manually defined colors does not match the number of lambda values.") } -# Plot for L1 Regularization with manually defined red colors +# PLOT ------------------------------------------------------------------------- + +# L1 Regularization p_l1 <- ggplot(grid_data, aes(x = X1, y = X2)) + geom_contour_filled(aes(z = loss), breaks = seq(min(grid_data$loss), max(grid_data$loss), length.out = 15)) + geom_point(data = coefs_df_l1, aes(x = X1, y = X2, color = lambda), size = 4) + @@ -92,7 +95,7 @@ p_l1 <- ggplot(grid_data, aes(x = X1, y = X2)) + coefs_df_l2$lambda <- factor(lambdas_l2) -# Plot for L2 Regularization with manually defined red colors +# L2 Regularization p_l2 <- ggplot(grid_data, aes(x = X1, y = X2)) + geom_contour_filled(aes(z = loss), breaks = seq(min(grid_data$loss), max(grid_data$loss), length.out = 15)) + geom_point(data = coefs_df_l2, aes(x = X1, y = X2, color = lambda), size = 4) + @@ -107,9 +110,5 @@ p_l2 <- ggplot(grid_data, aes(x = X1, y = X2)) + axis.line = element_blank()) + guides(fill = "none") - -# Save the L2 plot -ggsave("../figure/lin_reg_l2.png", plot = p_l2, width = 8, height = 5) - -# Save the L1 plot -ggsave("../figure/lin_reg_l1.png", plot = p_l1, width = 8, height = 5) \ No newline at end of file +ggsave("../figure/lin_model_regu_01.png", plot = p_l1, width = 8, height = 5) #L1 +ggsave("../figure/lin_model_regu_02.png", plot = p_l2, width = 8, height = 5) #L2 diff --git a/slides/regularization/rsrc/make-solution-path-ridge-lasso.py b/slides/regularization/rsrc/make-solution-path-ridge-lasso.py deleted file mode 100644 index 9d28f079..00000000 --- a/slides/regularization/rsrc/make-solution-path-ridge-lasso.py +++ /dev/null @@ -1,165 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Dec 6 12:36:47 2023 - -@author: chris -""" - -import numpy as np -from matplotlib import pyplot as plt -from sklearn import linear_model - -# Cost function definitions -def cost_l2(x, y): - return x**2 + y**2 - -def cost_l1(x, y): - return np.abs(x) + np.abs(y) - -def costfunction(X, y, theta): - m = np.size(y) - h = X @ theta - return float((1./(2*m)) * (h - y).T @ (h - y)) - -def closed_form_reg_solution(X, y, lamda=10): - m, n = X.shape - I = np.eye((n)) - return (np.linalg.inv(X.T @ X + lamda * I) @ X.T @ y)[:, 0] - -# Dataset creation and normalization -x = np.linspace(0, 1, 40) -noise = 1 * np.random.uniform(size=40) -y = np.sin(x * 1.5 * np.pi) -y_noise = (y + noise).reshape(-1, 1) - np.mean(y + noise) -X = np.vstack((x, x**2)).T -X = X / np.linalg.norm(X, axis=0) - -# Setup of meshgrid of theta values -xx, yy = np.meshgrid(np.linspace(-2, 17, 100), np.linspace(-17, 3, 100)) - -# Computing the cost function for each theta combination -zz_l2 = np.array([cost_l2(xi, yi) for xi, yi in zip(np.ravel(xx), np.ravel(yy))]) # L2 function -zz_l1 = np.array([cost_l1(xi, yi) for xi, yi in zip(np.ravel(xx), np.ravel(yy))]) # L1 function -zz_ls = np.array([costfunction(X, y_noise, np.array([t0, t1]).reshape(-1, 1)) - for t0, t1 in zip(np.ravel(xx), np.ravel(yy))]) # Least square cost function - -# Reshaping the cost values -Z_l2 = zz_l2.reshape(xx.shape) -Z_l1 = 
zz_l1.reshape(xx.shape) -Z_ls = zz_ls.reshape(xx.shape) - -# Calculating the regularization paths -lambda_range_l2 = np.logspace(0, 4, num=100) / 1000 -theta_0_list_reg_l2, theta_1_list_reg_l2 = zip(*[closed_form_reg_solution(X, y_noise, l) for l in lambda_range_l2]) - -lambda_range_l1 = np.logspace(0, 2, num=100) / 1000 -theta_0_list_reg_l1, theta_1_list_reg_l1 = zip(*[linear_model.Lasso(alpha=l, fit_intercept=False).fit(X, y_noise).coef_ - for l in lambda_range_l1]) - -# Plotting the contours and paths with updated aesthetics -fig = plt.figure(figsize=(16, 7)) - -# L2 regularization plot -ax = fig.add_subplot(1, 2, 1) -ax.contour(xx, yy, Z_l2, levels=[.5, 1.5, 3, 6, 9, 15, 30, 60, 100, 150, 250], colors='cyan') -ax.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax.set_xlabel(r'$\theta_1$', fontsize=18) -ax.set_ylabel(r'$\theta_2$', fontsize=18) -ax.set_title('L2 regularization solution path', fontsize=20) -ax.plot(theta_0_list_reg_l2, theta_1_list_reg_l2, linestyle='none', marker='o', color='red', alpha=.2) - -# L1 regularization plot -ax = fig.add_subplot(1, 2, 2) -ax.contour(xx, yy, Z_l1, levels=[.5, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14], colors='cyan') -ax.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax.set_xlabel(r'$\theta_1$', fontsize=18) -ax.set_ylabel(r'$\theta_2$', fontsize=18) -ax.set_title('L1 regularization solution path', fontsize=20) -ax.plot(theta_0_list_reg_l1, theta_1_list_reg_l1, linestyle='none', marker='o', color='red', alpha=.2) - -plt.show() - -# L2 regularization plot only -fig_l2 = plt.figure(figsize=(8, 7)) -ax_l2 = fig_l2.add_subplot(1, 1, 1) - -ax_l2.contour(xx, yy, Z_l2, levels=[.5, 1.5, 3, 6, 9, 15, 30, 60, 100, 150, 250], colors='cyan') -ax_l2.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax_l2.set_xlabel(r'$\theta_1$', fontsize=16) -ax_l2.set_ylabel(r'$\theta_2$', fontsize=16) -ax_l2.set_title('L2 regularization solution path', fontsize=17) -ax_l2.plot(theta_0_list_reg_l2, theta_1_list_reg_l2, linestyle='none', marker='o', color='red', alpha=.2) - -plt.show() - -# Define the L2 regularization contour levels -l2_contour_levels = [.5, 1.5, 3, 6, 9, 15, 30, 60, 100, 150, 250] - -# Determine which points are inside or outside the L2 regularization contours -inside_points = [] -outside_points = [] - -for theta_0, theta_1 in zip(theta_0_list_reg_l2, theta_1_list_reg_l2): - cost = cost_l2(theta_0, theta_1) - if any(cost < level for level in l2_contour_levels): - inside_points.append((theta_0, theta_1)) - else: - outside_points.append((theta_0, theta_1)) - -# Separate the points into x and y coordinates for plotting -inside_x, inside_y = zip(*inside_points) -outside_x, outside_y = zip(*outside_points) - -# Plot 1: Points inside the L2 regularization contours -fig_inside, ax_inside = plt.subplots(figsize=(8, 7)) -ax_inside.contour(xx, yy, Z_l2, levels=l2_contour_levels, colors='cyan') -ax_inside.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax_inside.scatter(inside_x, inside_y, color='green', marker='o', alpha=.5) # Points inside -ax_inside.set_xlabel(r'$\theta_1$', fontsize=16) -ax_inside.set_ylabel(r'$\theta_2$', fontsize=16) -ax_inside.set_title('L2 regularization solution path', fontsize=17) - -# Plot 2: Points outside the L2 regularization contours -fig_outside, ax_outside = plt.subplots(figsize=(8, 7)) -ax_outside.contour(xx, yy, Z_l2, levels=l2_contour_levels, colors='cyan') -ax_outside.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], 
cmap='coolwarm') -ax_outside.scatter(outside_x, outside_y, color='blue', marker='o', alpha=.5) # Points outside -ax_outside.set_xlabel(r'$\theta_1$', fontsize=16) -ax_outside.set_ylabel(r'$\theta_2$', fontsize=16) -ax_outside.set_title('Solutions outside of L2 regularization', fontsize=17) - -plt.show() - -# L2 regularization contour levels -l2_levels = [.5, 1.5, 3, 6, 9, 15, 30, 60, 100, 150, 250] - -# L1 regularization contour levels -l1_levels = [.5, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14] - -# Determine points inside the contours for L2 -inside_l2 = [(t0, t1) for t0, t1 in zip(theta_0_list_reg_l2, theta_1_list_reg_l2) if cost_l2(t0, t1) < max(l2_levels)] - -# Determine points inside the contours for L1 -inside_l1 = [(t0, t1) for t0, t1 in zip(theta_0_list_reg_l1, theta_1_list_reg_l1) if cost_l1(t0, t1) < max(l1_levels)] - -fig = plt.figure(figsize=(16, 7)) - -# L2 Regularization Plot -ax1 = fig.add_subplot(1, 2, 1) -ax1.contour(xx, yy, Z_l2, levels=l2_levels, colors='cyan') -ax1.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax1.scatter(*zip(*inside_l2), color='green', marker='o', alpha=.5) # Points inside L2 -ax1.set_xlabel(r'$\theta_1$', fontsize=18) -ax1.set_ylabel(r'$\theta_2$', fontsize=18) -ax1.set_title('L2 regularization solution path', fontsize=20) - -# L1 Regularization Plot -ax2 = fig.add_subplot(1, 2, 2) -ax2.contour(xx, yy, Z_l1, levels=l1_levels, colors='cyan') -ax2.contour(xx, yy, Z_ls, levels=[.01, .06, .09, .11, .15], cmap='coolwarm') -ax2.scatter(*zip(*inside_l1), color='green', marker='o', alpha=.5) # Points inside L1 -ax2.set_xlabel(r'$\theta_1$', fontsize=18) -ax2.set_ylabel(r'$\theta_2$', fontsize=18) -ax2.set_title('L1 regularization solution path', fontsize=20) - -plt.show() \ No newline at end of file diff --git a/slides/regularization/rsrc/make_avoid_overfitting_01_plot.R b/slides/regularization/rsrc/make_avoid_overfitting_01_plot.R deleted file mode 100644 index 6edaefc3..00000000 --- a/slides/regularization/rsrc/make_avoid_overfitting_01_plot.R +++ /dev/null @@ -1,30 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: AVOID OVERFITTING 01 -# ------------------------------------------------------------------------------ - -library(knitr) -library(mlr) -library(mlbench) -library(ggplot2) -library(tidyr) -library(colorspace) -library(gridExtra) -library(BBmisc) -library(reshape) -library(data.table) - -# DATA ------------------------------------------------------------------------- - -load("ozone_example.RData") - -dfp <- setDT(df_incdata)[, .(mean.mse = median(value)), by = c("nobs", "variable")] - -# PLOTS ------------------------------------------------------------------------ - -p <- ggplot(data = dfp, aes(x = nobs, y = mean.mse, colour = variable)) + - geom_line(lwd = 1.2) + ylim(c(0, 100)) + labs(colour = " ") + - scale_colour_discrete(labels = c("Train error", "Test error")) + - xlab("Size of data set") + ylab("MSE") + - scale_color_brewer(palette="Dark2") - -ggsave("../figure/avoid_overfitting_01.png", plot=p, width=5, height=2.5) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_avoid_overfitting_02_plot.R b/slides/regularization/rsrc/make_avoid_overfitting_02_plot.R deleted file mode 100644 index c6436113..00000000 --- a/slides/regularization/rsrc/make_avoid_overfitting_02_plot.R +++ /dev/null @@ -1,28 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: AVOID OVERFITTING 02 -# 
------------------------------------------------------------------------------ - -library(knitr) -library(mlr) -library(mlbench) -library(ggplot2) -library(tidyr) -library(colorspace) -library(gridExtra) -library(BBmisc) -library(reshape) - -# DATA ------------------------------------------------------------------------- - -load("ozone_example.RData") - -# PLOTS ------------------------------------------------------------------------ -p <- ggplot(data = df_incfeatures, aes(x = type, y = mean.mse, colour = variable)) + - geom_line(lwd = 1.2) + labs(colour = " ") + - scale_colour_discrete(labels = c("Train error", "Test error")) + - xlab("Number of features") + ylab("Mean Squared Error") + - ylim(c(0, 150)) + - scale_x_continuous(breaks = 0:12) + - scale_color_brewer(palette="Dark2") - -ggsave("../figure/avoid_overfitting_02.png", plot=p, width=5, height=2.5) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_early_stopping_plot.R b/slides/regularization/rsrc/make_early_stopping_plot.R deleted file mode 100644 index bcd13662..00000000 --- a/slides/regularization/rsrc/make_early_stopping_plot.R +++ /dev/null @@ -1,52 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: EARLY STOPPING -# ------------------------------------------------------------------------------ - -library(knitr) -library(mlr) -library(mlbench) -library(ggplot2) -library(tidyr) -library(colorspace) -library(gridExtra) -library(BBmisc) -library(reshape) - -# DATA ------------------------------------------------------------------------- -load("early_stopping1.RData") - -o_data$type <- factor(o_data$type, levels=c("train", "test")) - -# PLOTS ------------------------------------------------------------------------ -p1 <- ggplot(o_learn, aes(x = id, y = value)) + - geom_line(aes(colour = variable), lwd = 1.2) + - geom_vline(xintercept = best_it, linetype = "solid", lwd = 2, - colour="darkgrey") + - geom_vline(xintercept = max_it, lwd = 2, colour="darkgrey", - linetype = "dashed") + - annotate("label", x = 30, y = 180, label = "stopped early") + - annotate("label", x = 4e5, y = 180, label = "overfitted") + - scale_x_log10() + - xlab("Iterations") + - ylab("Mean Squared Error") + - labs(colour = " ") + - theme(legend.position="bottom") + - scale_color_brewer(palette="Dark2") - -p2 <- ggplot(o_data, aes(x=V8*100, y=V4)) + - geom_point(data=o_data, aes(colour=type, alpha=type)) + - scale_alpha_manual(values = c(1, 0.2), guide = "none") + - geom_line(data=o_fit, aes(linetype=variable, x=x, y=value), alpha = 1, - lwd = 2, colour="darkgrey") + - scale_linetype_manual(values = c("dashed", "solid")) + - xlab("Temperature (degrees F)") + - ylab("Ozone level") + - theme(legend.position="bottom") + - guides(linetype = FALSE) + - # scale_alpha(guide = "none") + - labs(colour = " ") + - scale_color_brewer(palette="Dark2") - -p <- grid.arrange(p1, p2, ncol=2) - -ggsave("../figure/early_stopping.png", plot=p, width=9, height=6) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_lasso-contours-sparsity.py b/slides/regularization/rsrc/make_lasso-contours-sparsity.py deleted file mode 100644 index 1c8916c4..00000000 --- a/slides/regularization/rsrc/make_lasso-contours-sparsity.py +++ /dev/null @@ -1,75 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Dec 1 03:40:27 2023 - -@author: chris -""" - -import matplotlib.pyplot as plt -import numpy as np - -# Function to create contour plots -def create_contour_plot(ax, theta_hat, theta_lasso, l1_edge, 
outermost_point, annotation_positions, subtitle): - theta1 = np.linspace(-4, 4, 300) - theta2 = np.linspace(-2, 5, 300) - Theta1, Theta2 = np.meshgrid(theta1, theta2) - - target_direction = np.array([1, 4]) / np.linalg.norm([1, 4]) - angle = np.arctan2(target_direction[1], target_direction[0]) - np.pi / 18 - rot_matrix = np.array([[np.cos(angle), -np.sin(angle)], - [np.sin(angle), np.cos(angle)]]) - - scale = np.array([1, 2]) - Z = np.vstack((Theta1.ravel() - theta_hat[0], Theta2.ravel() - theta_hat[1])).T @ rot_matrix - Z = Z * scale - Z = Z @ rot_matrix.T - L = (Z[:, 0])**2 + (Z[:, 1])**2 - L = L.reshape(Theta1.shape) - - outermost_level = (outermost_point[0] - theta_hat[0])**2 + (outermost_point[1] - theta_hat[1])**2 - - # Plot the contours - ax.contour(Theta1, Theta2, L, levels=np.linspace(np.min(L), outermost_level, 5), colors='red') - - # L1 regularization path with adjusted darker blue color - diamond = plt.Polygon([[l1_edge,0], [0,l1_edge], [-l1_edge,0], [0,-l1_edge]], closed=True, color='cyan', alpha=0.3) # Medium Blue - ax.add_patch(diamond) - - # Plot theta_hat and theta_lasso - ax.plot(*theta_hat, 'ko') - ax.plot(*theta_lasso, 'ko') - - # Annotations with adjusted sizes - ax.annotate(r'$\hat{\theta}_{Lasso}$', xy=theta_lasso, xytext=annotation_positions[0], - arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=3), ha='right', va='bottom', fontsize=35) - ax.annotate(r'$\hat{\theta}$', xy=theta_hat, xytext=annotation_positions[1], - arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=3), ha='left', va='bottom', fontsize=35) - - # Axes settings - ax.set_xlabel(r'$\theta_1$', fontsize=30) - ax.set_ylabel(r'$\theta_2$', fontsize=30) - ax.tick_params(axis='both', which='major', labelsize=25) - ax.axis('equal') - ax.set_xlim([-4, 4]) - ax.set_ylim([-2, 5]) - - # Add subtitle - ax.set_title(subtitle, fontsize=30) - -# Initialize a figure with three subplots -fig, axs = plt.subplots(1, 3, figsize=(24, 8)) - -# First plot -create_contour_plot(axs[0], theta_hat=[0.5, 3], theta_lasso=[0, 1], l1_edge=1, - outermost_point=[0, 1], annotation_positions=[(-2, 1.1), (2.5, 2)], subtitle=r'$\text{smaller param. }\theta_{1}\text{ is removed}$') - -# Second plot with subtitle "small λ" -create_contour_plot(axs[1], theta_hat=[1, 1], theta_lasso=[0.5, 0.5], l1_edge=1, - outermost_point=[0.5, 0.5], annotation_positions=[(-0.5, 2.5), (2, 3)], subtitle='small λ: no sparsity') - -# Third plot with subtitle "large λ" -create_contour_plot(axs[2], theta_hat=[1, 1], theta_lasso=[0.5, 0], l1_edge=0.5, - outermost_point=[0.5, 0], annotation_positions=[(-0.5, 2.5), (2, 3)], subtitle='larger λ: sparsity') - -plt.tight_layout() -plt.show() diff --git a/slides/regularization/rsrc/make_plot_ridge_hat.py b/slides/regularization/rsrc/make_plot_ridge_hat.py deleted file mode 100644 index 250c86ae..00000000 --- a/slides/regularization/rsrc/make_plot_ridge_hat.py +++ /dev/null @@ -1,84 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.patches import Circle - -# Define the grid for plotting -x = np.linspace(-3.0, 3.0, 400) -y = np.linspace(-3.0, 3.0, 400) -X, Y = np.meshgrid(x, y) - -# Define the center of the objective function -objective_center = np.array([1.5, 1.5]) # Adjust as needed - -# Elliptical objective function with rotation -def rotated_elliptical_objective(X, Y, center, a, b, angle_deg): - """ Rotated elliptical objective function. 
""" - angle_rad = np.radians(angle_deg) - X_rot = np.cos(angle_rad) * (X - center[0]) - np.sin(angle_rad) * (Y - center[1]) - Y_rot = np.sin(angle_rad) * (X - center[0]) + np.cos(angle_rad) * (Y - center[1]) - return (X_rot**2 / a**2) + (Y_rot**2 / b**2) - -# Define elliptical parameters -a, b = 1.5, 0.75 # Semi-major and semi-minor axes lengths -rotation_angle = -30 # Rotation angle in degrees - -# Calculate rotated elliptical objective function values -Z_rotated_elliptical = rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) - -# Define the constraint circle for ridge regression (L2) -constraint_radius = 1.0 # Example radius - -# Create contour levels -contour_levels = [0.1, 0.3, 0.6] # Example contour levels - -# Create a 2x1 grid of plots -fig, axs = plt.subplots(figsize=(8, 8), dpi=100) - -def draw_plot(ax, constraint_radius, contour_levels): - # Plot contour lines around the objective center - CS = ax.contour(X, Y, Z_rotated_elliptical, levels=contour_levels, colors='red', linewidths=0.5) - - # Plot the constraint circle - colors = ['cornflowerblue', 'blue', 'navy'] - rads = [1, 1.5, 3] - for i in range(3): - circle = Circle((0, 0), constraint_radius/rads[i], color=colors[i], alpha=0.3, linestyle='--') - ax.add_artist(circle) - - # Plot the minimum point - ax.plot(objective_center[0], objective_center[1], 'o', color='red', markersize=6) - ax.text(objective_center[0]+0.05, objective_center[1]+0.05, r'$\hat{\theta}$', fontsize=12, color='black') - - # Set the same scale for both axes and set limits - ax.set_aspect('equal', 'box') - ax.set_xlim(-1.2, 2.7) - ax.set_ylim(-1.2, 2.5) - ax.axis('off') - - # Define the legend elements - #legend_elements = [ - # plt.Line2D([0], [0], marker='o', color='black', markersize=6, label=r'$\hat{\theta}$', linestyle='None') - #] - - last_contour = CS.allsegs[2][0] # Use the second contour for intersection - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=6) - ax.text(intersection_point[0]+0.05, intersection_point[1]+0.05, r'$\hat{\theta}_{ridge}$', fontsize=12, color='black') - #legend_elements.append(plt.Line2D([0], [0], color='green', marker='o', linestyle='None', markersize=6, label=r'$\hat{\theta}_{ridge}$')) - - # Add the legend - #ax.legend(handles=legend_elements, loc='upper left', fontsize='large', frameon=True, handletextpad=0.4, borderpad=0.1, labelspacing=0.1) - - # Add arrows indicating the axes - ax.arrow(-1.2, 0, 3.6, 0, head_width=0.1, head_length=0.2, fc='black', ec='black') - ax.text(2.3, -0.1, r'$\theta_1$', fontsize=12, color='black') - ax.arrow(0, -1.2, 0, 3.4, head_width=0.1, head_length=0.2, fc='black', ec='black') - ax.text(-0.13, 2.1, r'$\theta_2$', fontsize=12, color='black') - -# Draw plots -draw_plot(axs, constraint_radius, contour_levels) - -plt.tight_layout() -plt.show() \ No newline at end of file diff --git a/slides/regularization/rsrc/make_poly_ridge_1_plot.R b/slides/regularization/rsrc/make_poly_ridge_1_plot.R deleted file mode 100644 index bac7d48e..00000000 --- a/slides/regularization/rsrc/make_poly_ridge_1_plot.R +++ /dev/null @@ -1,36 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: POLYNOMIAL RIDGE 1 -# ------------------------------------------------------------------------------ - -library(ggplot2) -library(viridis) - 
-theme_set(theme_minimal()) - -# DATA ------------------------------------------------------------------------- - -source("ridge_polynomial_reg.R") - -set.seed(314259) -f <- function (x) { - return (5 + 2 * x + 10 * x^2 - 2 * x^3) -} - -x <- runif(40, -2, 5) -y <- f(x) + rnorm(length(x), 0, 10) - -x.true <- seq(-2, 5, length.out = 400) -y.true <- f(x.true) -df <- data.frame(x = x.true, y = y.true) - -lambda.vec <- 0 - -# PLOTS ------------------------------------------------------------------------ - -p <- plotRidge(x, y, lambda.vec, baseTrafo, degree = 10) + - geom_line(data = df, aes(x = x, y = y), color = "red", size = 1) + - xlab("x") + ylab("f(x)") + - theme(plot.title = element_text(size = 15)) + - scale_color_viridis(end = 0.9, discrete = TRUE) - -ggsave("../figure/poly_ridge_1.png", plot = p, width = 6, height = 2) diff --git a/slides/regularization/rsrc/make_poly_ridge_2_plot.R b/slides/regularization/rsrc/make_poly_ridge_2_plot.R deleted file mode 100644 index 4d5dc6a8..00000000 --- a/slides/regularization/rsrc/make_poly_ridge_2_plot.R +++ /dev/null @@ -1,38 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: POLYNOMIAL RIDGE 2 -# ------------------------------------------------------------------------------ - -library(ggplot2) -library(viridis) - -theme_set(theme_minimal()) - -# DATA ------------------------------------------------------------------------- - -source("ridge_polynomial_reg.R") - -f <- function (x) { - return (5 + 2 * x + 10 * x^2 - 2 * x^3) -} - -set.seed(314259) -x <- runif(40, -2, 5) -y <- f(x) + rnorm(length(x), 0, 10) - -x.true <- seq(-2, 5, length.out = 400) -y.true <- f(x.true) -df <- data.frame(x = x.true, y = y.true) - -lambda.vec <- c(0, 10, 100) - - -# PLOTS ------------------------------------------------------------------------ - -p <- plotRidge(x, y, lambda.vec, baseTrafo, degree = 10) + - geom_line(data = df, aes(x = x, y = y), color = "red", size = 1) + - xlab("x") + ylab("f(x)") + - labs(color=expression(lambda)) + - theme(plot.title = element_text(size = 15)) + - scale_color_viridis(end = 0.9, discrete = TRUE) - -ggsave("../figure/poly_ridge_2.png", plot = p, width = 7.5, height = 3) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_poly_ridge_table_latex.R b/slides/regularization/rsrc/make_poly_ridge_table_latex.R deleted file mode 100644 index 4d353ba1..00000000 --- a/slides/regularization/rsrc/make_poly_ridge_table_latex.R +++ /dev/null @@ -1,18 +0,0 @@ -# ------------------------------------------------------------------------------ -# TAB: POLYNOMIAL RIDGE -# ------------------------------------------------------------------------------ - -library(xtable) - -betas <- getPolyData(x, y, lambda.vec, baseTrafo, degree = 10)$betas - -betas <- cbind(as.numeric(rownames(betas)), betas) - -colnames(betas) <- c("$\\lambda$" , sapply(1:(ncol(betas)-1), - function(i) return (paste0("$\\beta_{", - as.character(i-1), - "}$")))) - -print(xtable(signif(betas, 2), digits = 2, align = "rr|lllllllllll"), - row.names = FALSE, sanitize.colnames.function = function(x) x, include.rownames = FALSE, - hline.after = 0, latex.environments = "tiny") diff --git a/slides/regularization/rsrc/make_reg_surfaces.py b/slides/regularization/rsrc/make_reg_surfaces.py deleted file mode 100644 index 349c5c18..00000000 --- a/slides/regularization/rsrc/make_reg_surfaces.py +++ /dev/null @@ -1,68 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D -from 
scipy.optimize import minimize - -# Data Generation -n = 500 -np.random.seed(0) -x1 = np.random.uniform(-1, 1, n) -x2 = np.random.uniform(-1, 1, n) -epsilon = np.random.normal(0, 0.1, n) -y = -0.5 * x1 + 3 * x2 + epsilon - -# Regularization Norm Functions -def l1_norm(beta1, beta2): - return np.abs(beta1) + np.abs(beta2) - -def l2_norm_squared(beta1, beta2): - return beta1**2 + beta2**2 - -# Updated Regularized Least Squares Objective Function with 1/n factor -def updated_objective(beta, x1, x2, y, lam, regularization): - beta1, beta2 = beta - residuals = y - beta1 * x1 - beta2 * x2 - error_term = np.sum(residuals**2) / n - if regularization == 'l1': - penalty = l1_norm(beta1, beta2) - elif regularization == 'l2': - penalty = l2_norm_squared(beta1, beta2) - return error_term + lam * penalty - -# Compute the Minima for each plot -minima = {} -regularizations = ['l1', 'l2'] -lambdas = [0, 1, 10] -for reg in regularizations: - for lam in lambdas: - result = minimize(updated_objective, [0, 0], args=(x1, x2, y, lam, reg), method='L-BFGS-B') - minima[(reg, lam)] = result.x - -# Parameter Space for Beta1 and Beta2 -beta1_range = np.linspace(-10, 10, 100) -beta2_range = np.linspace(-10, 10, 100) -beta1_grid, beta2_grid = np.meshgrid(beta1_range, beta2_range) - -# Plotting -fig, axes = plt.subplots(2, 3, subplot_kw={"projection": "3d"}, figsize=(18, 12)) -for i, reg in enumerate(regularizations): - for j, lam in enumerate(lambdas): - objective_values = np.array([updated_objective([b1, b2], x1, x2, y, lam, reg) - for b1, b2 in zip(np.ravel(beta1_grid), np.ravel(beta2_grid))]) - objective_values = objective_values.reshape(beta1_grid.shape) - - ax = axes[i, j] - ax.plot_surface(beta1_grid, beta2_grid, objective_values, cmap='viridis') - ax.set_title(f'Regularization: {reg.upper()}, Lambda: {lam}', fontsize=20) # Increased font size - ax.set_xlabel('Theta 1', fontsize=14) # Increased font size - ax.set_ylabel('Theta 2', fontsize=14) # Increased font size - ax.set_zlabel('Emp. 
risk', fontsize=14) # Increased font size - - # Add the minima as a red dot - min_beta1, min_beta2 = minima[(reg, lam)] - min_val = updated_objective([min_beta1, min_beta2], x1, x2, y, lam, reg) - ax.scatter(min_beta1, min_beta2, min_val, color='red', s=50) - -plt.tight_layout() -plt.subplots_adjust(wspace=0.1, hspace=0.1) # Adjust spacing between the plots if needed -plt.savefig('..figure/reg_surfaces.png', bbox_inches='tight', pad_inches=0, facecolor='white') \ No newline at end of file diff --git a/slides/regularization/rsrc/make_regu_example_multicollinearity_plot.R b/slides/regularization/rsrc/make_regu_example_multicollinearity_plot.R deleted file mode 100644 index dd412b4a..00000000 --- a/slides/regularization/rsrc/make_regu_example_multicollinearity_plot.R +++ /dev/null @@ -1,125 +0,0 @@ -################################################################################ -# EXAMPLE: LASSO VS RIDGE WITH MULTICOLLINEARITY -################################################################################ - -# PREREQ ----------------------------------------------------------------------- - -library(dplyr) -library(ggrepel) -library(MASS) -library(mlr) -library(BBmisc) -library(data.table) -library(gridExtra) -library(grid) -options(scipen = 10000) - -# FICTIONAL DATA --------------------------------------------------------------- - -set.seed(20200611) - -# Create 4 normally distributed, uncorrelated RV - -Sigma <- diag(rep(2, 4)) - -design_matrix <- data.frame(mvrnorm(100, mu = rep(0, 4), Sigma = Sigma, - empirical = TRUE)) - -# Add X5 - almost perfectly correlated to X4 - -colnames(design_matrix) <- c("X1", "X2", "X3", "X4") -design_matrix <- design_matrix %>% - mutate(X5 = X4 + rnorm(nrow(design_matrix), 0, 0.3)) - -# Create target variable - -design_matrix <- design_matrix %>% mutate(y = 0.2 * X1 + 0.2 * X2 + 0.2 * X3 - + 0.2 * X4 + 0.2 * X5 + - rnorm(nrow(design_matrix), 0, 1)) - -# REGRESSION TASK -------------------------------------------------------------- - -task_mc <- makeRegrTask("fictional", design_matrix, "y") -featnames_mc <- getTaskFeatureNames(task_mc) - -# COEFFICENT PATHS ------------------------------------------------------------- - -compute_coef_paths <- function(task, lambda_name, lambda_seq) { - - lrn <- makeLearner("regr.penalized", trace = FALSE, lambda1 = 0, lambda2 = 0) - path <- list() - - # Compute coefficients for each model (on entire data) - - for (i in seq_along(lambda_seq)) { - - lamval <- lambda_seq[[i]] - pv <- namedList(lambda_name, lamval) - lrn2 <- setHyperPars(lrn, par.vals = pv) - m1 <- train(lrn2, task) - mm1 <- getLearnerModel(m1) - cc <- coefficients(mm1) - cc <- as.list(cc) - cc$lambda <- lamval - path[[i]] <- cc - - } - - path <- rbindlist(path, fill = TRUE) - path[is.na(path)] <- 0 - - # Perform cross validation - - ps <- makeParamSet( - makeDiscreteParam(id = lambda_name, values = lambda_seq) - ) - ctrl <- makeTuneControlGrid() - tr <- tuneParams(lrn, task, cv3, par.set = ps, control = ctrl, show.info = - FALSE) - cv_lam <- as.data.frame(tr$opt.path)[, c(lambda_name, "mse.test.mean")] - colnames(cv_lam) <- c("lambda", "mse") - cv_lam$lambda <- as.numeric(as.character(cv_lam$lambda)) - list(path = path, cv_lam = cv_lam) - -} - -# PLOT PATHS ------------------------------------------------------------------- - -plot_coef_paths_mc <- function(obj, featnames, xlab) { - ggd <- data.table::melt(obj$path, id.var = "lambda", measure = featnames, - variable.name = "featname", value.name = "coefval") - ggd$label <- ifelse(ggd$lambda == 
min(lambda_seq_mc), - as.character(ggd$featname), NA) - ggd$mse <- rep(obj$cv_lam[, "mse"], 5) - pl <- ggplot(data = ggd, mapping = aes(x = lambda, y = coefval, - group = featname, col = featname)) - pl <- pl + geom_line() - pl <- pl + geom_label_repel(aes(label = label), na.rm = TRUE) - pl <- pl + scale_x_log10() - pl <- pl + xlab(xlab) - pl <- pl + theme_bw() - pl <- pl + scale_color_manual(values = c(rep("black", 3), "#7FFF32", "#067B7F"), - guide = FALSE) - pl <- pl + geom_line(mapping = aes(x = ggd$lambda, y = ggd$mse * 0.5), - col = "black", linetype = "longdash") - pl <- pl + geom_text(x = max(log(ggd$lambda, 10)), - y = 0.5 * (max(ggd$mse)) - 0.01, vjust = 1, hjust = 1, - label = "MSE", col = "black") - pl <- pl + scale_y_continuous(sec.axis = sec_axis(~. * 2, name = "MSE")) - pl <- pl + geom_hline(aes(yintercept = 0), col = "black", linetype = "dotted") - -} - -#Visualize shrinkage in presence of multicollinearity -library(ggplot2) -lambda_seq_mc <- 2^seq(-10, 20, length.out = 50) - -path_l1_mc <- compute_coef_paths(task_mc, "lambda1", lambda_seq_mc) -path_l2_mc <- compute_coef_paths(task_mc, "lambda2", lambda_seq_mc) - -p_l1 <- plot_coef_paths_mc(path_l1_mc, featnames_mc, "Lasso / lambda") -p_l2 <- plot_coef_paths_mc(path_l2_mc, featnames_mc, "Ridge / lambda") - -p <- grid.arrange(p_l1, p_l2, nrow = 1) -ggsave("../figure/regu_example_multicollinearity.png", plot=p, width= 8, height =3) - diff --git a/slides/regularization/rsrc/make_ridge_vs_sgd_path.py b/slides/regularization/rsrc/make_ridge_vs_sgd_path.py deleted file mode 100644 index 449fb04c..00000000 --- a/slides/regularization/rsrc/make_ridge_vs_sgd_path.py +++ /dev/null @@ -1,75 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from sklearn.utils import shuffle - -# Set the random seed for reproducibility -np.random.seed(6) - -# Function to generate data -def generate_data(n, p): - X = np.random.normal(0, 1, (n, p)) - true_coef = np.linspace(-1, 1, p) - noise = np.random.normal(0, 1, n) - y = X.dot(true_coef) + noise - return X, y, true_coef - -# Function to compute the ridge coefficients analytically -def compute_ridge_path(X, y, alphas): - coefs = [np.zeros(X.shape[1])] # Start with a row of zeros - n, p = X.shape - for alpha in alphas: - ridge_coefs = np.linalg.inv(X.T @ X + alpha * np.identity(p)) @ X.T @ y - coefs.append(ridge_coefs) - return np.array(coefs) - -# Function to compute the optimization trajectory for SGD -def compute_sgd_trajectory(X, y, batch_size, learning_rate, n_iter): - w = np.zeros(X.shape[1]) - coefs = [w.copy()] # Start with a row of zeros - for i in range(n_iter): - X_shuffled, y_shuffled = shuffle(X, y) - for j in range(0, n, batch_size): - X_batch = X_shuffled[j:j+batch_size] - y_batch = y_shuffled[j:j+batch_size] - gradient = -2 * X_batch.T @ (y_batch - X_batch @ w) / batch_size - w -= learning_rate * gradient - coefs.append(w.copy()) - return np.array(coefs) - -# Parameters -n = 100 -p = 10 -batch_size = 4 -learning_rate = 0.01 -n_iter = 50 -t_values = np.arange(0.001, n_iter + 1) # Include 0 in t_values for the zero coefficients -alphas = 1/(learning_rate * t_values[0:]) # Exclude 0 to avoid division by zero - -# Generate data -X, y, true_coef = generate_data(n, p) - -# Compute the regularization path for ridge regression -ridge_coefs = compute_ridge_path(X, y, alphas) - -# Compute the optimization trajectory for SGD -sgd_coefs = compute_sgd_trajectory(X, y, batch_size, learning_rate, n_iter) - -# Plotting -fig, axs = plt.subplots(1, 2, figsize=(14, 5)) -# Regularization path 
for ridge regression -# Skip the first element (0) in t_values for plotting to match dimensions with ridge_coefs -axs[0].plot(1/alphas, ridge_coefs[1:]) -axs[0].set_xlabel('1/(lr * lambda)', fontsize=18) -axs[0].set_ylabel('Parameters', fontsize=18) -axs[0].set_title('Ridge Regression Path', fontsize=22) - -# Optimization trajectory for SGD -# Use t_values for x-axis to include the initial zero coefficients -axs[1].plot(t_values, sgd_coefs) -axs[1].set_xlabel('iteration', fontsize=18) -axs[1].set_ylabel('Parameters', fontsize=18) -axs[1].set_title('SGD Trajectory', fontsize=22) - -plt.tight_layout() -plt.show() - diff --git a/slides/regularization/rsrc/make_shrinkage_1_plot.R b/slides/regularization/rsrc/make_shrinkage_1_plot.R deleted file mode 100644 index f888ea21..00000000 --- a/slides/regularization/rsrc/make_shrinkage_1_plot.R +++ /dev/null @@ -1,56 +0,0 @@ -# ------------------------------------------------------------------------------ -# FIG: SHRINNKAGE 1 -# ------------------------------------------------------------------------------ - - -library(knitr) -library(mlr) -library(mlbench) -library(ggplot2) -library(tidyr) -library(colorspace) -library(BBmisc) -library(penalized) -library(reshape) -library(gridExtra) -library(ggrepel) -library(data.table) -library(viridis) - -# DATA ------------------------------------------------------------------------- - -load("regu_example_1.RData") - -# PLOTS ------------------------------------------------------------------------ - -plot_coef_paths <- function(path, featnames, xlab) { - ggd <- data.table::melt(path, id.vars = "lambda", measure = featnames, variable.name = "featname", value.name = "coefval") - ggd$label <- ifelse(ggd$lambda == min(lambda_seq), as.character(ggd$featname), NA) - pl <- ggplot(data = ggd, aes(x = lambda, y = coefval, group = featname, col = featname)) + - guides(color = "none") + - geom_line() + - geom_label_repel(aes(label = label), na.rm = TRUE, max.overlaps = Inf) + - scale_color_discrete(guide = FALSE) + - scale_x_log10() + - xlab(xlab) + - theme_bw() + - scale_color_viridis(end = 0.9, discrete = TRUE) - - -} - -plot_cv_path <- function(cv_lam, xlab) { - pl <- ggplot(data = cv_lam, aes(x = lambda, y = mse)) + - geom_line() + - scale_x_log10() + - xlab(xlab) + - theme_minimal() -} - -pl1 <- plot_coef_paths(path_l1$path, featnames, "Lasso / lambda") -pl2 <- plot_coef_paths(path_l2$path, featnames, "Ridge / lambda") -pl3 <- plot_cv_path(path_l1$cv_lam, "Lasso / lambda") + ylim(25, 90) -pl4 <- plot_cv_path(path_l2$cv_lam, "Ridge / lambda") + ylim(20, 90) - -p <- grid.arrange(pl1, pl2, pl3, pl4, nrow = 2) -ggsave("../figure/shrinkage_1.png", plot = p, width = 8, height = 4) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_shrinkage_2_plot.R b/slides/regularization/rsrc/make_shrinkage_2_plot.R deleted file mode 100644 index 35666f04..00000000 --- a/slides/regularization/rsrc/make_shrinkage_2_plot.R +++ /dev/null @@ -1,48 +0,0 @@ -library(knitr) -library(mlr) -library(mlbench) -library(ggplot2) -library(tidyr) -library(colorspace) -library(BBmisc) -library(data.table) -library(penalized) -library(reshape) -library(gridExtra) -library(viridis) -########################################################### - -load("regu_example_2.RData") -d_l1 <- rbind( - data.frame(lam = paste("L1-", lams[1]), coefval = cc_l1_1), - data.frame(lam = paste("L1-", lams[2]), coefval = cc_l1_2) -) -d_l1$lam <- as.factor(d_l1$lam) -d_l2 <- rbind( - data.frame(lam = paste("L2-", lams[1]), coefval = cc_l2_1), - 
data.frame(lam = paste("L2-", lams[2]), coefval = cc_l2_2) -) -d_l2$lam <- as.factor(d_l2$lam) -plot_coef_hist <- function(d) { - pl <- ggplot(d, aes(x = coefval, fill = lam)) + - scale_fill_viridis(end = 0.9, discrete = TRUE) + - geom_histogram(alpha = 0.9, position = "dodge") + - theme_gray(base_size = 14) - return(pl) -} -plot_cv_path <- function(cv_lam, xlab) { - pl <- ggplot(data = cv_lam, aes(x = lambda, y = mse)) - pl <- pl + geom_line() - pl <- pl + scale_x_log10() - pl <- pl + ylim(1, 10) - pl <- pl + xlab(xlab) + theme_gray(base_size = 14) -} - -pl1 <- plot_coef_hist(d_l1) + guides(fill=guide_legend(title="lambda")) -pl2 <- plot_coef_hist(d_l2)+ guides(fill=guide_legend(title="lambda")) + - ylim(0, 50) -pl3 <- plot_cv_path(cv_l1, "lambda") -pl4 <- plot_cv_path(cv_l2, "lambda") - -p <- grid.arrange(pl1, pl2, pl3, pl4, nrow = 2) -ggsave("../figure/shrinkage_2.png", plot = p, width = 8, height = 5) \ No newline at end of file diff --git a/slides/regularization/rsrc/make_wd_l2_geom_plot.R b/slides/regularization/rsrc/make_wd_l2_geom_plot.R deleted file mode 100644 index 7094f54f..00000000 --- a/slides/regularization/rsrc/make_wd_l2_geom_plot.R +++ /dev/null @@ -1,58 +0,0 @@ -library(ggplot2) - -# Define parameters for the ellipse -center <- c(1.5, 1.5) -axis_len <- c(1.5, 0.75) # Lengths of the axes for the ellipse -rotation <- pi/3 - - -seq_data <- seq(0, 2*pi, length.out=100) #points for one circle -# Generate data points for plotting ellipses -ellipse_x <- cos(rotation)*cos(seq_data)*axis_len[1]-sin(rotation)*sin(seq_data)*axis_len[2] -ellipse_y <- sin(rotation)*cos(seq_data)*axis_len[1]+cos(rotation)*sin(seq_data)*axis_len[2] - -elli_list <- list() -i <- 1 -for(mul in c(0.24, 0.43, 0.62, 0.78)){ #adjust radius - elli_list[[i]] <- data.frame(x=center[1]+ellipse_x*mul, y=center[2]+ellipse_y*mul) - i <- i + 1 -} - -# Generate data points for plotting circles(ridge) -cir_list <- list() -i <- 1 -for(mul in c(0.15, 0.4, 0.67, 1)){ #adjust radius - cir_list[[i]] <- data.frame(x=cos(seq_data)*mul, y=sin(seq_data)*mul) - i <- i + 1 -} - -# Create the plot of ellipses -p_elli <- ggplot() + - geom_path(data=elli_list[[1]], aes(x, y), color="black") + - geom_path(data=elli_list[[2]], aes(x, y), color="black") + - geom_path(data=elli_list[[3]], aes(x, y), color="black") + - geom_path(data=elli_list[[4]], aes(x, y), color="black") + - geom_point(aes(x=center[1], y=center[2]), color="black", size=3) + - annotate("label", x=1.6, y=1.3, label="hat(theta)", - parse=TRUE, color='black', size=3) - -# Create whole plot -p_ridge_geom <- p_elli + - geom_path(data=cir_list[[1]], aes(x, y), color="black", linetype="dashed") + - geom_path(data=cir_list[[2]], aes(x, y), color="black", linetype="dashed") + - geom_path(data=cir_list[[3]], aes(x, y), color="black", linetype="dashed") + - geom_path(data=cir_list[[4]], aes(x, y), color="black", linetype="dashed") + - geom_point(aes(x=0.83, y=sqrt(1-0.83^2)), color="black", size=3) + #intersection point - annotate("label", x=1, y=0.2, label="hat(theta)[\"Ridge\"]", - parse=TRUE, color='black', size=3) + - xlim(-1.5, 3) + - ylim(-1.5, 3) + - coord_equal() + - theme_light() + - labs(title = "", - x = expression(theta_1), - y = expression(theta_2)) - -ggsave(filename = paste0("../figure/wd-l2-geom.png"), - plot=p_ridge_geom, width=12, height=7.5) - diff --git a/slides/regularization/rsrc/fig-eval_ofit_1.R b/slides/regularization/rsrc/model_eval.R old mode 100644 new mode 100755 similarity index 57% rename from slides/regularization/rsrc/fig-eval_ofit_1.R rename to 
slides/regularization/rsrc/model_eval.R index 42392c24..5289a4af --- a/slides/regularization/rsrc/fig-eval_ofit_1.R +++ b/slides/regularization/rsrc/model_eval.R @@ -1,3 +1,11 @@ +# ------------------------------------------------------------------------------ +# intro + +# FIG: binary classification visualized under an appropriately fitted, +# an overfitted and an underfitted model. + +# DATA: 100000 samples with 2 features, drawn from two Gaussian classes. +# ------------------------------------------------------------------------------ library(mlr3misc) library(mvtnorm) @@ -6,8 +14,12 @@ library(mlr3learners) library(mlr3viz) library(ggplot2) library(gridExtra) +library(e1071) set.seed(600000) + +# DATA ------------------------------------------------------------------------- + n = 100000 mu1 = c(0, 3) @@ -28,10 +40,10 @@ trainsize = 200 trainset = 1:trainsize testset = (trainsize+1):n -l1 = lrn("classif.qda", predict_type = "prob") -l2 = lrn("classif.log_reg", predict_type = "prob") -l3 = lrn("classif.svm", type = "C-classification", predict_type = "prob", - kernel = "radial", gamma = 99, cost = 1) +l1 = lrn("classif.qda", predict_type = "prob") # appropriate +l2 = lrn("classif.svm", type = "C-classification", predict_type = "prob", + kernel = "radial", gamma = 99, cost = 1) # overfit +l3 = lrn("classif.log_reg", predict_type = "prob") # underfit l1$train(task) r1 = range(dd[trainset,]$V1) @@ -43,26 +55,25 @@ pred_true = as.data.table(l1$predict_newdata(d_grid)) d_grid$prob = pred_true$prob.1 true_decb = d_grid[d_grid$prob > 0.47 & d_grid$prob < 0.53,] +# PLOT ------------------------------------------------------------------------- -make_plot = function(ll, file_postfix) { +make_plot = function(ll) { ll$train(task, row_ids = trainset) pred_train = ll$predict(task, row_ids = trainset) trainerr = pred_train$score(msr("classif.ce")) pred_test = ll$predict(task, row_ids = testset) testerr = pred_test$score(msr("classif.ce")) - fname = sprintf("../figure/eval_ofit_1%s.pdf", file_postfix) task_train = task$filter(rows = trainset) pl = plot_learner_prediction(ll, task) + guides(shape = FALSE, alpha = FALSE) pl = pl + ggtitle(sprintf("TrainErr=%.2f; TestErr=%.2f", trainerr, testerr)) pl = pl + geom_point(data = true_decb, alpha=0.5, size=0.2) - ggsave(plot = pl, filename = fname, width = 8, height = 6) return(pl) } -p1 = make_plot(l1, file_postfix = "a") -p2 = make_plot(l2, file_postfix = "u") -p3 = make_plot(l3, file_postfix = "o") - -#grid.arrange(p1, p2, p3) -#print(p2) +p1 = make_plot(l1) # appropriate +p2 = make_plot(l2) # overfit +p3 = make_plot(l3) # underfit +ggsave("../figure/model_eval_01.png", plot = p1, width = 8, height = 6) +ggsave("../figure/model_eval_02.png", plot = p2, width = 8, height = 6) +ggsave("../figure/model_eval_03.png", plot = p3, width = 8, height = 6) diff --git a/slides/regularization/rsrc/multicollinearity_example.R b/slides/regularization/rsrc/multicollinearity_example.R new file mode 100755 index 00000000..aa111be8 --- /dev/null +++ b/slides/regularization/rsrc/multicollinearity_example.R @@ -0,0 +1,130 @@ +# ------------------------------------------------------------------------------ +# l1 vs l2 + +# FIG: draw how coefficient values and MSE of linear regression change with +# different regularization constants (lambda).
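# Standalone sketch of the effect plotted below (illustrative, not part of
# the script): with X5 almost identical to X4, ridge tends to spread weight
# across the correlated pair, while lasso tends to keep one of the two and
# zero out the other. Assumes glmnet and MASS are installed; the lambda
# value is an arbitrary example, and cv.glmnet() could replace the manual
# grid search used further down.
library(MASS)
library(glmnet)
set.seed(1)
X <- mvrnorm(100, mu = rep(0, 4), Sigma = diag(rep(2, 4)))
X <- cbind(X, X[, 4] + rnorm(100, 0, 0.3))   # X5 nearly collinear with X4
y <- as.numeric(X %*% rep(0.2, 5) + rnorm(100))
coef(glmnet(X, y, alpha = 0, lambda = 0.1))  # ridge: V4, V5 shrunk, similar
coef(glmnet(X, y, alpha = 1, lambda = 0.1))  # lasso: often one of V4/V5 is 0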
+# left: Under L1 regularization +# right: Under L2 regularization +# DATA: +# xi ~ Normal(0, 2) i=1,2,3,4 uncorrelated, x5 = x4 + Normal(0, 0.3) +# y = 0.2*x1 + 0.2*x2 + 0.2*x3 + 0.2*x4 + 0.2*x5 + eps(100*1 ~Normal) +# ------------------------------------------------------------------------------ + +library(dplyr) +library(ggrepel) +library(BBmisc) +library(MASS) +library(mlr3) +library(mlr3learners) +library(mlr3tuning) +library(ggplot2) +library(data.table) +library(gridExtra) +options(scipen = 10000) + +set.seed(20200611) +# FICTIONAL DATA --------------------------------------------------------------- + +# Create 4 normally distributed, uncorrelated RVs +Sigma <- diag(rep(2, 4)) +design_matrix <- data.frame(mvrnorm(100, mu = rep(0, 4), Sigma = Sigma, + empirical = TRUE)) + +# Add X5 - almost perfectly correlated with X4 +colnames(design_matrix) <- c("X1", "X2", "X3", "X4") +design_matrix <- design_matrix %>% + mutate(X5 = X4 + rnorm(nrow(design_matrix), 0, 0.3)) + +# Create target variable +design_matrix <- design_matrix %>% mutate(y = 0.2 * X1 + 0.2 * X2 + 0.2 * X3 + + 0.2 * X4 + 0.2 * X5 + + rnorm(nrow(design_matrix), 0, 1)) + +# REGRESSION TASK -------------------------------------------------------------- + +task_mc <- TaskRegr$new(id = "fictional", backend = design_matrix, target = "y") +featnames_mc <- task_mc$feature_names + +# COEFFICIENT PATHS ------------------------------------------------------------ + +compute_coef_paths <- function(task, lambda_name, lambda_seq) { + alpha = ifelse(lambda_name=='lambda1', 1, 0) + path <- list() + # Compute coefficients for each model (on entire data) + for (i in seq_along(lambda_seq)) { + lamval <- lambda_seq[i] + learner = lrn("regr.glmnet", alpha = alpha, lambda=lamval) + learner$train(task) + cc <- t(as.matrix(coef(learner$model))) + names <- colnames(cc) + cc <- as.numeric(cc) + names(cc) <- names + cc <- as.list(cc) + cc$lambda <- lamval + path[[i]] <- cc + } + + path <- rbindlist(path, fill = TRUE) + path[is.na(path)] <- 0 + + # Perform cross validation + learner = lrn("regr.glmnet", alpha = alpha, lambda=to_tune(lambda_seq)) + + # Construct tuning instance + instance = ti( + task = task, + learner = learner, + resampling = rsmp("cv", folds = 3), + measures = msr("regr.mse"), + terminator = trm("evals", n_evals = length(lambda_seq)) + ) + + tuner <- tnr("grid_search", resolution = length(lambda_seq)) + tuner$optimize(instance) + cv_lam <- as.data.frame(instance$archive$data)[,1:2] + colnames(cv_lam) <- c("lambda", "mse") + cv_lam$lambda <- as.numeric(as.character(cv_lam$lambda)) + cv_lam <- cv_lam %>% arrange(lambda) + + list(path = path, cv_lam = cv_lam) +} + +# PLOT PATHS --------------------------------------------------------------------- + +plot_coef_paths_mc <- function(obj, featnames, title, xlab) { + ggd <- melt(obj$path, id.var = "lambda", measure = featnames, + variable.name = "featname", value.name = "coefval") + ggd$label <- ifelse(ggd$lambda == min(lambda_seq_mc), + as.character(ggd$featname), NA) + ggd$mse <- rep(obj$cv_lam[, "mse"], 5) + pl <- ggplot(data = ggd, mapping = aes(x = lambda, y = coefval, + group = featname, col = featname)) + + geom_line() + + geom_label_repel(aes(label = label), na.rm = TRUE) + + scale_x_log10() + + ggtitle(title) + + xlab(xlab) + + theme_bw() + + scale_color_manual(values = c(rep("black", 3), "#7FFF32", "#067B7F"), + guide = FALSE) + + geom_line(mapping = aes(x = ggd$lambda, y = ggd$mse * 0.5), + col = "black", linetype = "longdash") + + geom_text(x = max(log(ggd$lambda, 10)), + y = 0.5 *
(max(ggd$mse)) - 0.01, vjust = 1, hjust = 1, + label = "MSE", col = "black") + + scale_y_continuous(sec.axis = sec_axis(~. * 2, name = "MSE")) + + geom_hline(aes(yintercept = 0), col = "black", linetype = "dotted") + +} + +#Visualize shrinkage in presence of multicollinearity +lambda_seq_mc <- 2^seq(-10, 20, length.out = 50) + +path_l1_mc <- compute_coef_paths(task_mc, "lambda1", lambda_seq_mc) +path_l2_mc <- compute_coef_paths(task_mc, "lambda2", lambda_seq_mc) + +p_l1 <- plot_coef_paths_mc(path_l1_mc, featnames_mc, "Lasso", expression(lambda)) +p_l2 <- plot_coef_paths_mc(path_l2_mc, featnames_mc, "Ridge", expression(lambda)) + +p <- grid.arrange(p_l1, p_l2, nrow = 1) +ggsave("../figure/multicollinearity_example.png", plot=p, width= 8, height =3) diff --git a/slides/regularization/rsrc/make_nn_plots.R b/slides/regularization/rsrc/nn_size.R old mode 100644 new mode 100755 similarity index 72% rename from slides/regularization/rsrc/make_nn_plots.R rename to slides/regularization/rsrc/nn_size.R index 967b3fa9..b55aef79 --- a/slides/regularization/rsrc/make_nn_plots.R +++ b/slides/regularization/rsrc/nn_size.R @@ -1,13 +1,20 @@ +# ------------------------------------------------------------------------------ +# nonlin + +# FIG: plot schematic diagrams of one-hidden-layer neural network +# with different sizes (1,2,3,5,10,100) (input size: 2, output size: 1). +# ------------------------------------------------------------------------------ + library(RSNNS) library(nnet) library(clusterGeneration) -#import the function from Github library(devtools) source_url('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r') -seed.val<-2 -set.seed(seed.val) - +set.seed(2) + +# DATA ------------------------------------------------------------------------- + num.vars<-2 num.obs<-1000 @@ -25,15 +32,15 @@ resp<-data.frame(y1) names(resp)<-c('Y1') dat.in<-data.frame(resp,rand.vars) +# plot ------------------------------------------------------------------------- + nn_plot <- function(size) { - # Your existing code to generate the model might go here - mod1 <- nnet(rand.vars, resp, data=dat.in, size=size, linout=T) # Example + mod1 <- nnet(rand.vars, resp, data=dat.in, size=size, linout=T) save_dir <- "../figure" filename <- file.path(save_dir, sprintf("nn_size_%d.png", size)) png(filename, width = 3000, height = 2800, res = 500) par(mar = c(1, 1, 1, 1)) - # Your plot code plot.nnet(mod1, nid=FALSE, rel.rsc=3, @@ -47,11 +54,9 @@ nn_plot <- function(size) { neg.col='black', max.sp=TRUE) - # Close the device, saving the file dev.off() } -# Your existing loop vec <- c(1, 2, 3, 5, 10, 100) for (i in vec) { nn_plot(i) diff --git a/slides/regularization/rsrc/ozone_mse_boxplot.R b/slides/regularization/rsrc/ozone_mse_boxplot.R new file mode 100755 index 00000000..6aa2a699 --- /dev/null +++ b/slides/regularization/rsrc/ozone_mse_boxplot.R @@ -0,0 +1,30 @@ +# ------------------------------------------------------------------------------ +# intro + +# FIG: boxplot of MSE for training and test results. 
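# The script assumes df_incdata (loaded below) is in long format with columns
# nobs / variable / value, where variable distinguishes train and test MSE.
# A stand-in with that shape (df_incdata_demo is a hypothetical example, not
# the real ozone data) would be:
df_incdata_demo <- data.frame(
  nobs     = 50,
  variable = rep(c("train", "test"), each = 25),
  value    = c(rexp(25, 1 / 50), rexp(25, 1 / 120))
)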
+ +# DATA: from data_ozone_example.RData +# ------------------------------------------------------------------------------ + +library(ggplot2) + +theme_set(theme_minimal()) + +# DATA ------------------------------------------------------------------------- + +load("data_ozone_example.RData") +dfp <- df_incdata[df_incdata$nobs == 50, ] + +# PLOTS ------------------------------------------------------------------------ + +p <- ggplot(data = dfp, aes(x = 0, y = value, fill = variable)) + + geom_boxplot() + labs(fill = " ") + + scale_fill_brewer(palette="Dark2", labels = c("Train error", "Test error")) + + xlab(" ") + ylab("MSE") + + ylim(c(0, 400)) + + theme(axis.title.x=element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank()) + +ggsave("../figure/ozone_mse_boxplot.png", plot=p, width=4, height=2) diff --git a/slides/regularization/rsrc/poly_ridge.R b/slides/regularization/rsrc/poly_ridge.R new file mode 100755 index 00000000..d0bd76be --- /dev/null +++ b/slides/regularization/rsrc/poly_ridge.R @@ -0,0 +1,132 @@ +# ------------------------------------------------------------------------------ +# l2 + +# FIG: +# (1) true and fitted polynomials by OLS regression (degree = 10, overfitted). +# (2) true and fitted polynomials with different regularization +# constants (lambda) by ridge regression (larger lambda mitigates overfitting). + +# DATA: y = 5 + 2x + 10x^2 - 2*x^3 (x 40*1 ~Unif) + noise (40*1 ~Normal) +# ------------------------------------------------------------------------------ + +library(ggplot2) +library(viridis) + +set.seed(314259) + +theme_set(theme_minimal()) + +# DATA ------------------------------------------------------------------------- + +f <- function (x) { + return (5 + 2 * x + 10 * x^2 - 2 * x^3) +} + +x <- runif(40, -2, 5) +y <- f(x) + rnorm(length(x), 0, 10) + +x.true <- seq(-2, 5, length.out = 400) +y.true <- f(x.true) +df <- data.frame(x = x.true, y = y.true) + +lambda <- 0 + +lambda.vec <- c(0, 10, 100) + +# FUNC ------------------------------------------------------------------------- + +# calculate ridge coefficients +betaRidge <- function (X, y, lambda) +{ + return (solve(t(X) %*% X + lambda * diag(ncol(X))) %*% (t(X) %*% y)) +} + +# generate polynomials +baseTrafo <- function (x, degree) +{ + out <- cbind(1, x) + for (i in seq_len(degree)[-1]) { + out <- cbind(out, x^i) + } + return (out) +} + +# generate df with polynomial features, true polynomial values, coefficients +getPolyData <- function(x, y, lambda.vec, base.trafo, ...) +{ + X <- base.trafo(x, ...) + + x.pred <- seq(min(x), max(x), length.out = 500) + X.pred <- base.trafo(x.pred, ...) + + df.truth <- data.frame(feature = x, truth = y) + + df.betas <- matrix(NA, nrow=length(lambda.vec), ncol=ncol(X)) + row.names(df.betas) <- lambda.vec + + for(i in 1:length(lambda.vec)){ + df.betas[i,] <- betaRidge(X, y, lambda.vec[i]) + } + + df.polys <- lapply(1:length(lambda.vec), function (i) { + return (data.frame( + feature = x.pred, + pred = X.pred %*% df.betas[i,], + lambda = row.names(df.betas)[i] + )) + }) + return (list(polys = df.polys, + truth = df.truth, + betas = df.betas)) +} + +# plot true and fitted polynomials +plotRidge <- function (x, y, lambda.vec, base.trafo, ...) +{ + requireNamespace("ggplot2") + + res <- getPolyData(x, y, lambda.vec, base.trafo, ...)
+ df.polys <- res$polys + df.truth <- res$truth + + plot.df <- df.polys[[1]] + for (i in seq_along(df.polys)[-1]) { + plot.df <- rbind(plot.df, df.polys[[i]]) + } + plot.df$lambda <- as.factor(plot.df$lambda) + + gg <- ggplot() + if (length(lambda.vec) == 1) { + gg <- gg + geom_line(data = plot.df, aes(x = feature, y = pred, color = lambda), show.legend = FALSE) + } else { + gg <- gg + geom_line(data = plot.df, aes(x = feature, y = pred, color = lambda)) + } + + return ( + gg + + geom_point(data = df.truth, mapping = aes(x = feature, y = truth)) + ) +} + +# PLOTS ------------------------------------------------------------------------ + +p1 <- plotRidge(x, y, lambda, baseTrafo, degree = 10) + + geom_line(data = df, aes(x = x, y = y), color = "red", size = 1) + + xlab("x") + ylab("f(x)") + + theme(plot.title = element_text(size = 15)) + + scale_color_viridis(end = 0.9, discrete = TRUE) + +# multiple lines +p2 <- plotRidge(x, y, lambda.vec, baseTrafo, degree = 10) + + geom_line(data = df, aes(x = x, y = y), color = "red", size = 1) + + xlab("x") + ylab("f(x)") + + labs(color=expression(lambda)) + + theme(plot.title = element_text(size = 15)) + + scale_color_viridis(end = 0.9, discrete = TRUE) + +ggsave("../figure/poly_ridge_01.png", plot = p1, width = 6, height = 2) +ggsave("../figure/poly_ridge_02.png", plot = p2, width = 7.5, height = 3) diff --git a/slides/regularization/rsrc/make_reg_contours.R b/slides/regularization/rsrc/reg_contours.R old mode 100644 new mode 100755 similarity index 74% rename from slides/regularization/rsrc/make_reg_contours.R rename to slides/regularization/rsrc/reg_contours.R index ef649c34..70d9318d --- a/slides/regularization/rsrc/make_reg_contours.R +++ b/slides/regularization/rsrc/reg_contours.R @@ -1,34 +1,43 @@ -# Load necessary libraries +# ------------------------------------------------------------------------------ +# l1, l2 + +# FIG: contour plots for l1, l2 regularized linear model and corresponding +# optimal points with different regularization constants (lambda). 
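# In the notation of this script (intercept beta0 unpenalized, n = 100), the
# contours below are level sets of the penalized empirical risk
#   R_reg(beta) = (1 / (2 * n)) * sum_i (y_i - beta0 - x_i' beta)^2
#                 + lambda * P(beta),
# with P(beta) = |beta1| + |beta2| for L1 and beta1^2 + beta2^2 for L2.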
+ +# DATA: y = X (100*2 ~Normal)·beta_true(3,-2) + noise (100*1 ~Normal) +# ------------------------------------------------------------------------------ + library(ggplot2) library(MASS) library(glmnet) library(gridExtra) -# Example dataset set.seed(123) + +# DATA ------------------------------------------------------------------------- + n <- 100 X <- matrix(rnorm(2 * n), n, 2) beta_true <- c(3, -2) y <- X %*% beta_true + rnorm(n) -# Prepare data for contour plot grid_range <- seq(-5, 5, length.out = 100) grid_data <- expand.grid(X1 = grid_range, X2 = grid_range) grid_data$loss <- apply(grid_data, 1, function(vec) { sum((y - X %*% vec)^2) / (2 * n) }) +# DATA L2 ---------------------------------------------------------------------- lambdas_l2 <- c(0, 10, 100, 500) -# Ridge regression implementation + +# coefficients for ridge regression ridge_regression <- function(X, y, lambda) { n <- nrow(X) d <- ncol(X) - # Adding a column of ones for the intercept term - X_ext <- cbind(1, X) # Ensure X_ext has n rows and d+1 columns + X_ext <- cbind(1, X) # n rows, d+1 columns (intercept first) - # Ridge regression closed-form solution I <- diag(d + 1) I[1, 1] <- 0 # No regularization on the intercept @@ -36,82 +45,75 @@ ridge_regression <- function(X, y, lambda) { return(beta) } -# OLS regression implementation +# coefficients for ols ols_regression <- function(X, y) { n <- nrow(X) d <- ncol(X) - # Adding a column of ones for the intercept term X_ext <- cbind(1, X) - # OLS closed-form solution beta <- solve(t(X_ext) %*% X_ext) %*% t(X_ext) %*% y return(beta) } -# Calculate coefficients using ridge_regression for each lambda +# coefficients from ridge_regression for each lambda +# (intercept excluded below) coefs_manual <- sapply(lambdas_l2, function(lambda) { beta <- ridge_regression(X, y, lambda) - return(beta[2:3, 1]) # Extract coefficients excluding the intercept + return(beta[2:3, 1]) # excluding intercept }) coefs_manual <- t(coefs_manual) coefs_df_manual <- as.data.frame(coefs_manual) names(coefs_df_manual) <- c("X1", "X2") -# Function to create contour plots for regularized loss +# PLOT L2 ---------------------------------------------------------------------- + +# contour plots with optimal points create_reg_contour_plot <- function(coefs, title, lambda, alpha, X, y, grid_range, true_minimizer = c(3, -2.5)) { n <- nrow(X) d <- ncol(X) - # Make sure coefs is a numeric vector coefs <- as.numeric(coefs) - # Define the loss function for OLS loss_ols <- function(beta, X, y) { - X_ext <- cbind(1, X) # Include intercept term + X_ext <- cbind(1, X) return(sum((y - X_ext %*% beta)^2) / (2 * n)) } - # Define the regularized loss function for Ridge regularized_loss_ridge <- function(beta, X, y, lambda) { ridge_term <- ifelse(alpha == 0, lambda * sum(beta[-1]^2), 0) - X_ext <- cbind(1, X) # Include intercept term + X_ext <- cbind(1, X) return(sum((y - X_ext %*% beta)^2) / (2 * n) + ridge_term) } - - # Define the regularized loss function for LASSO + regularized_loss_lasso <- function(beta, X, y, lambda) { lasso_term <- ifelse(alpha == 1, lambda * sum(abs(beta[-1])), 0) X_ext <- cbind(1, X) # Include intercept term return(sum((y - X_ext %*% beta)^2) / (2 * n) + lasso_term) } - # Prepare grid data for contour plot + # data for contour plot grid_data <- expand.grid(X1 = grid_range, X2 = grid_range) X_ext <- cbind(1, grid_data) if(lambda == 0 && alpha == 0) { - # Use OLS loss function beta_center_ols <- c(1, coefs) grid_data$reg_loss <- apply(X_ext, 1, function(vec) { loss_ols(vec, X, y) # Directly use
vec as beta values }) } else if(alpha == 0) { - # Use Ridge loss function beta_center <- c(1, coefs) grid_data$reg_loss <- apply(X_ext, 1, function(vec) { regularized_loss_ridge(vec - beta_center, X, y, lambda) }) } else { - # Use LASSO loss function beta_center <- c(1, coefs) grid_data$reg_loss <- apply(X_ext, 1, function(vec) { regularized_loss_lasso(vec, X, y, lambda) }) } - # Create the contour plot plot <- ggplot(grid_data, aes(x = X1, y = X2)) + geom_contour_filled(aes(z = reg_loss), breaks = pretty(range(grid_data$reg_loss), n = 15)) + geom_point(aes(x = coefs[1], y = coefs[2]), color = "red", size = 2) + @@ -128,8 +130,7 @@ create_reg_contour_plot <- function(coefs, title, lambda, alpha, X, y, grid_rang } -# Calculate coefficients using OLS regression for lambda = 0 -coefs_ols <- ols_regression(X, y)[2:3, 1] # Extract coefficients excluding the intercept +coefs_ols <- ols_regression(X, y)[2:3, 1] # Create plots for each lambda plots_l2 <- list() @@ -137,7 +138,6 @@ for (i in 1:length(lambdas_l2)) { lambda_value <- lambdas_l2[i] title_expression <- bquote("L2 Regularization:" ~ lambda == .(lambda_value)) - # Use OLS coefficients for lambda = 0 if (lambda_value == 0) { coefs_to_use <- coefs_ols } else { @@ -147,13 +147,12 @@ for (i in 1:length(lambdas_l2)) { plots_l2[[i]] <- create_reg_contour_plot(coefs_to_use, title_expression, lambda_value, 0, X, y, grid_range) } -# Display the grid of contour plots ridge_contours <- grid.arrange(grobs = plots_l2, ncol = 2, nrow = 2) -ggsave("../figure/ridge_contours.png", plot = ridge_contours, width =9, height = 6) +ggsave("../figure/reg_contours_02.png", plot = ridge_contours, width = 9, height = 6) +# DATA L1 ---------------------------------------------------------------------- -# Assuming lambdas_l1 contains your lambda values for LASSO lambdas_l1 <- c(0, 1, 2, 10) # glmnet requires a matrix for X and a vector for y @@ -166,14 +165,14 @@ lasso_models <- lapply(lambdas_l1, function(lambda) { }) coefs_l1 <- sapply(lasso_models, function(model) { - coef(model)[2:3,1] # Extracting only the relevant coefficients + coef(model)[2:3,1] }) -# Transpose and convert to data frame coefs_l1 <- t(coefs_l1) coefs_df_l1 <- as.data.frame(coefs_l1) names(coefs_df_l1) <- c("X1", "X2") +# PLOT L1 ---------------------------------------------------------------------- plots_l1 <- list() for (i in 1:length(lambdas_l1)) { @@ -191,10 +190,6 @@ for (i in 1:length(lambdas_l1)) { plots_l1[[i]] <- create_reg_contour_plot(coefs_to_use, title_expression, lambda_value, 1, X, y, grid_range) } -# Display the grid of LASSO contour plots lasso_contours <- grid.arrange(grobs = plots_l1, ncol = 2, nrow = 2) - -ggsave("../figure/lasso_contours.png", plot = lasso_contours, width =9, height = 6) - - \ No newline at end of file +ggsave("../figure/reg_contours_01.png", plot = lasso_contours, width =9, height = 6) diff --git a/slides/regularization/rsrc/reg_perspectives.py b/slides/regularization/rsrc/reg_perspectives.py deleted file mode 100644 index aafd0c34..00000000 --- a/slides/regularization/rsrc/reg_perspectives.py +++ /dev/null @@ -1,317 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.patches import Circle - -# Define the grid for plotting -x = np.linspace(-3.0, 3.0, 400) -y = np.linspace(-3.0, 3.0, 400) -X, Y = np.meshgrid(x, y) - -# Define the center of the objective function -objective_center = np.array([1.5, 1.5]) # Adjust as needed - -# Elliptical objective function with rotation -def rotated_elliptical_objective(X, Y, center, a, b, 
angle_deg): - """ Rotated elliptical objective function. """ - angle_rad = np.radians(angle_deg) - X_rot = np.cos(angle_rad) * (X - center[0]) - np.sin(angle_rad) * (Y - center[1]) - Y_rot = np.sin(angle_rad) * (X - center[0]) + np.cos(angle_rad) * (Y - center[1]) - return (X_rot**2 / a**2) + (Y_rot**2 / b**2) - -# Define elliptical parameters -a, b = 1.5, 0.75 # Semi-major and semi-minor axes lengths -rotation_angle = -30 # Rotation angle in degrees - -# Calculate rotated elliptical objective function values -Z_rotated_elliptical = rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) - -# Define the constraint circle for ridge regression (L2) -constraint_radius = 1.0 # Example radius - -def draw_plot(ax, contour_levels, last_plot=False): - # Plot contour lines around the objective center if any contour levels are provided - if contour_levels: - CS = ax.contour(X, Y, Z_rotated_elliptical, levels=contour_levels, colors='red', linewidths=0.5) - - # Plot the constraint circle - circle = Circle((0, 0), constraint_radius, color='blue', alpha=0.3, linestyle='--') - ax.add_artist(circle) - - # Plot the minimum point in all plots - ax.plot(objective_center[0], objective_center[1], 'o', color='black', markersize=4) - - # Set the same scale for both axes and set limits - ax.set_aspect('equal', 'box') - ax.set_xlim(-3, 3) - ax.set_ylim(-3, 3) - ax.axhline(0, color='black', linewidth=0.5) - ax.axvline(0, color='black', linewidth=0.5) - - # Define the legend elements - legend_elements = [plt.Line2D([0], [0], color='black', marker='o', linestyle='None', markersize=4, label=r'$\hat{\theta}$')] - - # Add the intersection point for the last plot - if last_plot: - # Calculate the intersection point - last_contour = CS.allsegs[-1][0] - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=4) - legend_elements.append(plt.Line2D([0], [0], color='green', marker='o', linestyle='None', markersize=4, label=r'$\hat{\theta}_{ridge}$')) - - # Add the legend to the top-left of the plot - ax.legend(handles=legend_elements, loc='upper left', fontsize='small', frameon=True, handletextpad=0.2, borderpad=0.1, labelspacing=0.1) - -# Rest of your plotting code remains the same - - -# Create contour levels -first_contour_level = 0.1 # Start with a small contour level -max_contour_level = (constraint_radius**2) * 0.6 # Largest contour level touching the circle - -# For each subsequent plot, we add one more contour level, increasing the value -contour_levels_for_plots = [ - [], # No contour for the first plot - [first_contour_level], # One small contour for the second plot - [first_contour_level, first_contour_level * 3], # Two contours for the third plot - [first_contour_level, first_contour_level * 3, max_contour_level] # Three contours for the last plot -] - -# Create a 2x2 grid of plots -fig, axs = plt.subplots(2, 2, figsize=(6, 6), dpi=120) - -# Plot for each subplot in the 2x2 grid -for i, ax in enumerate(axs.flatten()): - last_plot = i == len(axs.flatten()) - 1 # Check if it's the last plot - draw_plot(ax, contour_levels_for_plots[i], last_plot) - -# Adjust layout to prevent overlapping -plt.tight_layout() -plt.show() - - -def create_diamond(ax, constraint_radius): - """Create and add a diamond shape for L1 regularization.""" - diamond = plt.Polygon([[-constraint_radius, 0], [0, 
constraint_radius], [constraint_radius, 0], [0, -constraint_radius]], - closed=True, color='blue', alpha=0.3, linestyle='--') - ax.add_patch(diamond) - - -# Elliptical objective function with rotation -def rotated_elliptical_objective(X, Y, center, a, b, angle_deg): - """ Rotated elliptical objective function. """ - angle_rad = np.radians(angle_deg) - X_rot = np.cos(angle_rad) * (X - center[0]) - np.sin(angle_rad) * (Y - center[1]) - Y_rot = np.sin(angle_rad) * (X - center[0]) + np.cos(angle_rad) * (Y - center[1]) - return (X_rot**2 / a**2) + (Y_rot**2 / b**2) - -# Define elliptical parameters -a, b = 1.5, 0.75 # Semi-major and semi-minor axes lengths -rotation_angle = -10 # Rotation angle in degrees - -# Calculate rotated elliptical objective function values -Z_rotated_elliptical = rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) - -# Define the constraint circle for ridge regression (L2) -constraint_radius = 1.0 # Example radius - -def draw_plot(ax, contour_levels, last_plot=False): - # Plot contour lines around the objective center if any contour levels are provided - if contour_levels: - CS = ax.contour(X, Y, Z_rotated_elliptical, levels=contour_levels, colors='red', linewidths=0.5) - - # Plot the diamond shape for L1 regularization - create_diamond(ax, constraint_radius) - - # Plot the minimum point in all plots - ax.plot(objective_center[0], objective_center[1], 'o', color='black', markersize=4) - - # Set the same scale for both axes and set limits - ax.set_aspect('equal', 'box') - ax.set_xlim(-3, 3) - ax.set_ylim(-3, 3) - ax.axhline(0, color='black', linewidth=0.5) - ax.axvline(0, color='black', linewidth=0.5) - - # Define the legend elements - legend_elements = [plt.Line2D([0], [0], color='black', marker='o', linestyle='None', markersize=4, label=r'$\hat{\theta}$')] - - # Add the intersection point for the last plot - # Add the intersection point for the last plot - if last_plot: - # Calculate the intersection point - last_contour = CS.allsegs[-1][0] - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=4) - legend_elements.append(plt.Line2D([0], [0], color='green', marker='o', linestyle='None', markersize=4, label=r'$\hat{\theta}_{lasso}$')) - - # Add the legend to the top-left of the plot - ax.legend(handles=legend_elements, loc='upper left', fontsize='small', frameon=True, handletextpad=0.2, borderpad=0.1, labelspacing=0.1) - -# Create contour levels -first_contour_level = 0.1 # Start with a small contour level -max_contour_level = (constraint_radius**2) * 1.17 # Largest contour level touching the circle - -# For each subsequent plot, we add one more contour level, increasing the value -contour_levels_for_plots = [ - [], # No contour for the first plot - [first_contour_level], # One small contour for the second plot - [first_contour_level, first_contour_level * 4], # Two contours for the third plot - [first_contour_level, first_contour_level * 4, max_contour_level] # Three contours for the last plot -] - -# Create a 2x2 grid of plots -fig, axs = plt.subplots(2, 2, figsize=(6, 6), dpi=120) - -# Plot for each subplot in the 2x2 grid -for i, ax in enumerate(axs.flatten()): - last_plot = i == len(axs.flatten()) - 1 # Check if it's the last plot - draw_plot(ax, contour_levels_for_plots[i], last_plot) - -# Adjust layout to prevent overlapping 
-plt.tight_layout() -plt.show() - - - # Define the center of the objective function and elliptical parameters -objective_center = np.array([1.5, 1.5]) -a, b = 1.5, 0.75 -rotation_angle = -30 - -# Elliptical objective function -def rotated_elliptical_objective(X, Y, center, a, b, angle_deg): - angle_rad = np.radians(angle_deg) - X_rot = np.cos(angle_rad) * (X - center[0]) - np.sin(angle_rad) * (Y - center[1]) - Y_rot = np.sin(angle_rad) * (X - center[0]) + np.cos(angle_rad) * (Y - center[1]) - return (X_rot**2 / a**2) + (Y_rot**2 / b**2) - -Z_rotated_elliptical = rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) - -# Define the constraint circle for ridge regression (L2) -constraint_radius = 1.0 - -# Define contour levels in increasing order -contour_levels = [(constraint_radius**2) * 0.6, (constraint_radius**2) * 1.2, (constraint_radius**2) * 2.4] - -def draw_plot(ax, plot_index, last_plot=False): - # Plot all contours - CS = ax.contour(X, Y, Z_rotated_elliptical, levels=contour_levels, colors='red', linewidths=0.5) - - # Control visibility of contours based on plot index - for i, contour in enumerate(CS.collections): - contour.set_visible(i >= plot_index) - - # Plot the constraint circle - circle = Circle((0, 0), constraint_radius, color='blue', alpha=0.3, linestyle='--') - ax.add_artist(circle) - - # Plot the minimum point and set limits - min_point_handle, = ax.plot(objective_center[0], objective_center[1], 'o', color='black', markersize=4) - ax.set_aspect('equal', 'box') - ax.set_xlim(-3, 3) - ax.set_ylim(-3, 3) - - # Draw coordinate axes - ax.axhline(0, color='black', linewidth=0.5) - ax.axvline(0, color='black', linewidth=0.5) - - # Add legend for the minimum point - if last_plot: - # Calculate intersection point for the last plot using the last contour segment (smallest contour) - last_contour = CS.allsegs[-3][0] # Use the last contour segment - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ridge_point_handle, = ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=4) - ax.legend([min_point_handle, ridge_point_handle], [r'$\hat{\theta}$', r'$\hat{\theta}_{ridge}$'], loc='upper left', fontsize='small', frameon=True) - else: - ax.legend([min_point_handle], [r'$\hat{\theta}$'], loc='upper left', fontsize='small', frameon=True) - - -# Create a 2x2 grid of plots -fig, axs = plt.subplots(2, 2, figsize=(6, 6), dpi=120) - -for i, ax in enumerate(axs.flatten()): - last_plot = i == len(axs.flatten()) - 1 - draw_plot(ax, 3 - i, last_plot) # Reverse the order of plots - -plt.tight_layout() -plt.show() - - -# Elliptical objective function with rotation -def rotated_elliptical_objective(X, Y, center, a, b, angle_deg): - """ Rotated elliptical objective function. 
""" - angle_rad = np.radians(angle_deg) - X_rot = np.cos(angle_rad) * (X - center[0]) - np.sin(angle_rad) * (Y - center[1]) - Y_rot = np.sin(angle_rad) * (X - center[0]) + np.cos(angle_rad) * (Y - center[1]) - return (X_rot**2 / a**2) + (Y_rot**2 / b**2) - -# Define elliptical parameters -a, b = 1.5, 0.75 # Semi-major and semi-minor axes lengths -rotation_angle = -30 # Rotation angle in degrees - -# Calculate rotated elliptical objective function values -Z_rotated_elliptical = rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) - -# Define the constraint circle for ridge regression (L2) -constraint_radius = 1.0 # Example radius -constraint_radius_large = 1.33 # Larger radius for comparison - -# Create contour levels -contour_levels = [0.1, 0.3, 0.6] # Example contour levels - -# Create a 2x1 grid of plots -fig, axs = plt.subplots(1, 2, figsize=(12, 6), dpi=100) - -def draw_plot(ax, constraint_radius, contour_levels): - # Plot contour lines around the objective center - CS = ax.contour(X, Y, Z_rotated_elliptical, levels=contour_levels, colors='red', linewidths=0.5) - - # Plot the constraint circle - circle = Circle((0, 0), constraint_radius, color='blue', alpha=0.3, linestyle='--') - ax.add_artist(circle) - - # Plot the minimum point - ax.plot(objective_center[0], objective_center[1], 'o', color='black', markersize=6) - - # Set the same scale for both axes and set limits - ax.set_aspect('equal', 'box') - ax.set_xlim(-3, 3) - ax.set_ylim(-3, 3) - ax.axhline(0, color='black', linewidth=0.5) - ax.axvline(0, color='black', linewidth=0.5) - - # Define the legend elements - legend_elements = [ - plt.Line2D([0], [0], marker='o', color='black', markersize=6, label=r'$\hat{\theta}$', linestyle='None') - ] - - # Calculate and plot the intersection point for the second contour and larger circle if needed - if constraint_radius == constraint_radius_large: - last_contour = CS.allsegs[1][0] # Use the second contour for intersection - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=6) - legend_elements.append(plt.Line2D([0], [0], color='green', marker='o', linestyle='None', markersize=6, label=r'$\hat{\theta}_{ridge}$')) - else: - last_contour = CS.allsegs[2][0] # Use the second contour for intersection - distances = np.sqrt((last_contour[:, 0])**2 + (last_contour[:, 1])**2) - min_idx = np.argmin(np.abs(distances - constraint_radius)) - intersection_point = last_contour[min_idx] - ax.plot(intersection_point[0], intersection_point[1], 'o', color='green', markersize=6) - legend_elements.append(plt.Line2D([0], [0], color='green', marker='o', linestyle='None', markersize=6, label=r'$\hat{\theta}_{ridge}$')) - - # Add the legend - ax.legend(handles=legend_elements, loc='upper left', fontsize='large', frameon=True, handletextpad=0.4, borderpad=0.1, labelspacing=0.1) - -# Draw plots -draw_plot(axs[0], constraint_radius, contour_levels) -draw_plot(axs[1], constraint_radius_large, contour_levels) - -plt.tight_layout() -plt.show() \ No newline at end of file diff --git a/slides/regularization/rsrc/reg_surfaces.R b/slides/regularization/rsrc/reg_surfaces.R new file mode 100755 index 00000000..3c073091 --- /dev/null +++ b/slides/regularization/rsrc/reg_surfaces.R @@ -0,0 +1,92 @@ +# ------------------------------------------------------------------------------ +# l1, l2 + +# FIG: 
plot 3D regression surfaces with two coefficients +# under different regularization constants (lambda 0, 1, 10) +# using l1 and l2 regularization. + +# DATA: y(500*1) = -0.5 * x1(~Unif(-1,1)) + +# 3 * x2(~Unif(-1,1)) + epsilon(~Norm(0,0.1)). + +# ENV: to use vistool, build a virtual env in Miniconda 3: +# conda create -n r-reticulate python=3.12 +# conda activate r-reticulate +# conda install -c plotly plotly-orca python-kaleido +# conda deactivate +# ------------------------------------------------------------------------------ + +library(vistool) +library(plotly) +set.seed(0) + +#library(plotly) +#use_condaenv("r-reticulate", required = TRUE) +#py_config() + +# DATA ------------------------------------------------------------------------- + +n <- 500 +x1 <- runif(n, -1, 1) +x2 <- runif(n, -1, 1) +epsilon <- rnorm(n, 0, 0.1) +y <- -0.5 * x1 + 3 * x2 + epsilon + +# Regularization Norm Functions +l1_norm <- function(beta1, beta2) { + return(abs(beta1) + abs(beta2)) +} + +l2_norm_squared <- function(beta1, beta2) { + return(beta1^2 + beta2^2) +} + +# Regularized least-squares objective (with 1/n factor) +updated_objective <- function(x, x1, x2, y, lam, regularization) { + # x: the coefficient vector; vistool's Objective expects the argument name x + residuals <- y - x[1] * x1 - x[2] * x2 + error_term <- sum(residuals^2) / n + if (regularization == 'l1') { + penalty <- l1_norm(x[1], x[2]) + } else if (regularization == 'l2') { + penalty <- l2_norm_squared(x[1], x[2]) + } + return(error_term + lam * penalty) +} + +# PLOT ------------------------------------------------------------------------- + +regularizations <- c('l1', 'l2') +lambdas <- c(0, 1, 10) + +for (reg in regularizations) { + for (lam in lambdas) { + obj_lm = Objective$new(id = "reg surfaces", fun = updated_objective, xdim = 2, + x1 = x1, x2 = x2, y = y, + lam = lam, regularization = reg, minimize = TRUE) + viz_lm = as_visualizer(obj_lm, x1_limits = c(-10, 10), x2_limits = c(-10, 10)) + result <- optim(c(0, 0), updated_objective, x1 = x1, x2 = x2, y = y, lam = lam, + regularization = reg, method = 'L-BFGS-B') + plot_obj <- viz_lm$plot() + plot_obj <- plot_obj %>% + layout( + title = paste("Regularization:",reg,"λ:", as.character(lam)), + scene = list( + xaxis = list(title = "β1"), + yaxis = list(title = "β2"), + zaxis = list(title = "Objective") + ) + ) %>% + add_trace( + type = "scatter3d", + mode = "markers", + x = result$par[1], # beta1 + y = result$par[2], # beta2 + z = result$value, # objective value + marker = list(color = 'red', size = 3), + name = "Minimum Point" + ) + savename = paste0("../figure/reg_surfaces_", reg, "_lam", as.character(lam),".png") + save_image(plot_obj, savename, engine = "kaleido", width = 600, height = 500) + } +} + diff --git a/slides/regularization/rsrc/regu_example_1.R b/slides/regularization/rsrc/regu_example_1.R deleted file mode 100644 index 96a4ecc0..00000000 --- a/slides/regularization/rsrc/regu_example_1.R +++ /dev/null @@ -1,43 +0,0 @@ -library(mlr) -library(BBmisc) -library(data.table) - -set.seed(123) - -task = bh.task -task = dropFeatures(task, c("chas", "nox", "rm")) -featnames = getTaskFeatureNames(task) - -compute_coef_paths = function(task, lambda_name, lambda_seq) { - lrn = makeLearner("regr.penalized", trace = FALSE, lambda1 = 0, lambda2 = 0) - path = list() - for (i in seq_along(lambda_seq)) { - lamval = lambda_seq[[i]] - pv = namedList(lambda_name, lamval) - lrn2 = setHyperPars(lrn, par.vals = pv) - m1 = train(lrn2, task) - mm1 = getLearnerModel(m1) - cc =
coefficients(mm1) - cc = as.list(cc) - cc$lambda = lamval - path[[i]] = cc - } - path = rbindlist(path, fill = TRUE) - path[is.na(path)] = 0 - ps = makeParamSet( - makeDiscreteParam(id = lambda_name, values = lambda_seq) - ) - ctrl = makeTuneControlGrid() - tr = tuneParams(lrn, task, cv3, par.set = ps, control = ctrl, show.info = FALSE) - cv_lam = as.data.frame(tr$opt.path)[, c(lambda_name, "mse.test.mean")] - colnames(cv_lam) = c("lambda", "mse") - cv_lam$lambda = as.numeric(as.character(cv_lam$lambda)) - list(path = path, cv_lam = cv_lam) -} - -lambda_seq = 2^seq(-10, 20, length.out = 50) -path_l1 = compute_coef_paths(task, "lambda1", lambda_seq) -path_l2 = compute_coef_paths(task, "lambda2", lambda_seq) - -save2("regu_example_1.RData", path_l1 = path_l1, path_l2 = path_l2, featnames = featnames, lambda_seq = lambda_seq) - diff --git a/slides/regularization/rsrc/regu_example_1.RData b/slides/regularization/rsrc/regu_example_1.RData deleted file mode 100644 index db2c210b..00000000 Binary files a/slides/regularization/rsrc/regu_example_1.RData and /dev/null differ diff --git a/slides/regularization/rsrc/regu_example_2.R b/slides/regularization/rsrc/regu_example_2.R deleted file mode 100644 index 1ce02f71..00000000 --- a/slides/regularization/rsrc/regu_example_2.R +++ /dev/null @@ -1,59 +0,0 @@ -library(mlr) -library(pensim) -library(ggplot2) -library(gridExtra) -library(MASS) - -set.seed(19873) -n <- 100 # Number of observations -p <- 50 # Number of predictors included in model -CovMatrix <- outer(1:p, 1:p, function(x,y) {.7^abs(x-y)}) -x <- mvrnorm(n, rep(0,p), CovMatrix) -y <- 10 * apply(x[, 1:2], 1, sum) + - 5 * apply(x[, 3:4], 1, sum) + - apply(x[, 5:14], 1, sum) + - rnorm(n) - - -dd = as.data.frame(x) -dd$y = y -task = makeRegrTask(data = dd, target = "y") - - -get_pen_coefs = function(task, alpha, lam) { - featnames = getTaskFeatureNames(task) - lrn = makeLearner("regr.glmnet", alpha = alpha, lambda = lam) - m = train(lrn, task) - mm = getLearnerModel(m) - cc1 = as.matrix(coef(mm))[,1] - return(abs(cc1)) -} - -compute_cv = function(task, alpha, lambda_seq) { - lrn = makeLearner("regr.glmnet", alpha = alpha) - ps = makeParamSet( - makeDiscreteParam("lambda", values = lambda_seq) - ) - ctrl = makeTuneControlGrid() - tr = tuneParams(lrn, task, cv3, par.set = ps, control = ctrl, show.info = FALSE) - cv_lam = as.data.frame(tr$opt.path)[, c("lambda", "mse.test.mean")] - colnames(cv_lam) = c("lambda", "mse") - cv_lam$lambda = as.numeric(as.character(cv_lam$lambda)) - cv_lam -} - -lams = c(0.01, 100) -cc_l2_1 = get_pen_coefs(task, alpha = 0, lam = lams[1]) -cc_l2_2 = get_pen_coefs(task, alpha = 0, lam = lams[2]) -cc_l1_1 = get_pen_coefs(task, alpha = 1, lam = lams[1]) -cc_l1_2 = get_pen_coefs(task, alpha = 1, lam = lams[2]) - - -lambda_seq = 2^seq(-20, 1, length.out = 50) -cv_l1 = compute_cv(task, alpha = 1, lambda_seq) -cv_l2 = compute_cv(task, alpha = 0, lambda_seq) - -save2("regu_example_2.RData", lams, lambda_seq, - cc_l2_1, cc_l2_2, cc_l1_1, cc_l1_2, - cv_l1, cv_l2) - diff --git a/slides/regularization/rsrc/regu_example_2.RData b/slides/regularization/rsrc/regu_example_2.RData deleted file mode 100644 index 39155db3..00000000 Binary files a/slides/regularization/rsrc/regu_example_2.RData and /dev/null differ diff --git a/slides/regularization/rsrc/ridge_perspectives.R b/slides/regularization/rsrc/ridge_perspectives.R new file mode 100755 index 00000000..97c321a7 --- /dev/null +++ b/slides/regularization/rsrc/ridge_perspectives.R @@ -0,0 +1,273 @@ +# 
------------------------------------------------------------------------------ +# l2, nonlin + +# FIG: schematic diagrams of ridge regularization +# (1) increase objective function until the constraints are met +# (2) optimize the objective function till optimum under constraints +# (3) different strength of ridge constraint +# (4) single schematic diagram +# ------------------------------------------------------------------------------ + +library(ggplot2) +library(grid) +library(dplyr) +library(gridExtra) +library(pracma) + +# DATA ------------------------------------------------------------------------- + +# Define the grid for plotting +x <- seq(-3.0, 3.0, length.out = 400) +y <- seq(-3.0, 3.0, length.out = 400) +X <- outer(rep(1, length(x)), y) +Y <- outer(x, rep(1, length(y))) + +# Define elliptical parameters +a <- 1.5 +b <- 0.75 +rotation_angle <- -30 +constraint_radius <- 1.0 +objective_center <- c(1.5, 1.5) + +# Rotated elliptical objective function +rotated_elliptical_objective <- function(X, Y, center, a, b, angle_deg) { + angle_rad <- deg2rad(angle_deg) + X_rot <- cos(angle_rad) * (X - center[1]) - sin(angle_rad) * (Y - center[2]) + Y_rot <- sin(angle_rad) * (X - center[1]) + cos(angle_rad) * (Y - center[2]) + (X_rot^2 / a^2) + (Y_rot^2 / b^2) +} + +Z_rotated_elliptical <- rotated_elliptical_objective(X, Y, objective_center, a, b, rotation_angle) + +# Create data frame for ggplot +data <- data.frame( + x = as.vector(X), + y = as.vector(Y), + z = as.vector(Z_rotated_elliptical) +) + +# Function to create plots +create_plot <- function(data, levels, objective_center, constraint_radius, type, last_plot = FALSE){ + #type: outside / inside + p <- ggplot() + if(length(levels)!=0){ + p <- p + + geom_contour(data = data, aes(x = x, y = y, z = z), color = 'red', breaks = levels) + if (last_plot){ + plot_build <- ggplot_build(p) + plot_data <- plot_build$data[[1]] + level_value <- ifelse(type=="outside", max(levels), min(levels)) + filtered_data <- plot_data[plot_data$level == level_value, c("x","y")] + distances <- sqrt(filtered_data$x^2 + filtered_data$y^2) + min_idx <- which.min(abs(distances - constraint_radius)) + intersection_point <- filtered_data[min_idx,] + p <- p + + geom_point(aes(x = intersection_point[[1]], y = intersection_point[[2]]), color = 'green', size = 2) + + annotate("label", x = intersection_point[[1]] - 0.5, y = intersection_point[[2]] + 0.6, label = expression(hat(theta)[ridge]), color = "green", size = 3) + } + } + + # Create data for the circle + theta <- seq(0, 2 * pi, length.out = 100) + center <- c(0, 0) + circle_data <- data.frame( + x = center[1] + constraint_radius * cos(theta), + y = center[2] + constraint_radius * sin(theta) + ) + + # Plot the circle with dashed lines and blue color + p <- p + + geom_path(data = circle_data, aes(x = x, y = y), color = 'blue', linetype = 'dashed', size = 0.5, alpha = 0.3) + + geom_polygon(data = circle_data, aes(x = x, y = y), fill = 'blue', alpha = 0.3) # Fill the circle with blue color and alpha 0.3 + + p <- p + + geom_point(aes(x = objective_center[1], y = objective_center[2]), color = "black", size = 2) + + annotate("label", x = objective_center[1], y = objective_center[2]+0.8, label = expression(hat(theta)), color = "black", size = 3) + + geom_hline(yintercept = 0, color = 'black', size = 0.5) + + geom_vline(xintercept = 0, color = 'black', size = 0.5) + + theme_linedraw() + + theme( + panel.grid = element_blank(), + axis.title = element_blank(), + plot.title = element_blank() + ) + + coord_fixed(xlim = c(-3, 3), 
ylim = c(-3, 3), expand = FALSE) + return(p) +} + +# PLOT 1: outside -------------------------------------------------------------- +# increase objective function until the constraints are met + +contour_levels <- list( + c(), + c(0.1), + c(0.1, 0.3), + c(0.1, 0.3, 0.6) +) + +plots_out <- lapply(1:4, function(i) { + create_plot(data, contour_levels[[i]], objective_center, constraint_radius, type="outside", last_plot = (i == 4)) +}) + +p_outside <- grid.arrange(grobs = plots_out, nrow = 2, ncol = 2) + +# PLOT 2: inside --------------------------------------------------------------- +# optimize the objective function till optimum under constraints + +contour_levels <- list( + c(), + c(2.4), + c(1.2, 2.4), + c(0.6, 1.2, 2.4) +) + +# Generate plots +plots_in <- lapply(1:4, function(i) { + create_plot(data, contour_levels[[i]], objective_center, constraint_radius, type="inside", last_plot = (i == 4)) +}) + +# Arrange plots in a 2x2 grid +p_inside <- grid.arrange(grobs = plots_in, nrow = 2, ncol = 2) + +# PLOT 3: constraints ---------------------------------------------------------- +# different strength of ridge constraint + +contour_levels <- c(0.1, 0.3, 0.6) + +# p1 +p1 <- ggplot() + + geom_contour(data = data, aes(x = x, y = y, z = z), color = 'red', breaks = contour_levels) + +plot_build_1 <- ggplot_build(p1) +plot_data_1 <- plot_build_1$data[[1]] +filtered_data_1 <- plot_data_1[plot_data_1$level == 0.6, c("x","y")] +distances_1 <- sqrt(filtered_data_1$x^2 + filtered_data_1$y^2) +min_idx_1 <- which.min(abs(distances_1 - constraint_radius)) +intersection_point_1 <- filtered_data_1[min_idx_1,] + +p1 <- p1 + + geom_point(aes(x = intersection_point_1[[1]], y = intersection_point_1[[2]]), color = 'green', size = 2) + + annotate("label", x = intersection_point_1[[1]] - 0.5, y = intersection_point_1[[2]] + 0.6, label = expression(hat(theta)[ridge]), color = "green", size = 3) + +theta <- seq(0, 2 * pi, length.out = 100) +center <- c(0, 0) +circle_data_1 <- data.frame( + x = center[1] + cos(theta), + y = center[2] + sin(theta) +) + +p1 <- p1 + + geom_path(data = circle_data_1, aes(x = x, y = y), color = 'blue', linetype = 'dashed', size = 0.5, alpha = 0.3) + + geom_polygon(data = circle_data_1, aes(x = x, y = y), fill = 'blue', alpha = 0.3) + + geom_point(aes(x = objective_center[1], y = objective_center[2]), color = "black", size = 2) + + annotate("label", x = objective_center[1], y = objective_center[2]+0.8, label = expression(hat(theta)), color = "black", size = 3) + + geom_hline(yintercept = 0, color = 'black', size = 0.5) + + geom_vline(xintercept = 0, color = 'black', size = 0.5) + + theme_linedraw() + + theme( + panel.grid = element_blank(), + axis.title = element_blank(), + plot.title = element_blank() + ) + + coord_fixed(xlim = c(-3, 3), ylim = c(-3, 3), expand = FALSE) + +# p2 +constraint_radius <- 1.33 +level_value <- 0.3 + +p2 <- ggplot() + + geom_contour(data = data, aes(x = x, y = y, z = z), color = 'red', breaks = contour_levels) + +plot_build <- ggplot_build(p2) +plot_data <- plot_build$data[[1]] +filtered_data <- plot_data[plot_data$level == level_value, c("x","y")] +distances <- sqrt(filtered_data$x^2 + filtered_data$y^2) +min_idx <- which.min(abs(distances - constraint_radius)) +intersection_point <- filtered_data[min_idx,] + +p2 <- p2 + + geom_point(aes(x = intersection_point[[1]], y = intersection_point[[2]]), color = 'green', size = 2) + + annotate("label", x = intersection_point[[1]] - 0.5, y = intersection_point[[2]] + 0.6, label = expression(hat(theta)[ridge]), color = 
"green", size = 3) + +theta <- seq(0, 2 * pi, length.out = 100) +center <- c(0, 0) +circle_data <- data.frame( + x = center[1] + constraint_radius * cos(theta), + y = center[2] + constraint_radius * sin(theta) +) + + +p2 <- p2 + + geom_path(data = circle_data, aes(x = x, y = y), color = 'blue', linetype = 'dashed', size = 0.5, alpha = 0.3) + + geom_polygon(data = circle_data, aes(x = x, y = y), fill = 'blue', alpha = 0.3) + + geom_point(aes(x = objective_center[1], y = objective_center[2]), color = "black", size = 2) + + annotate("label", x = objective_center[1], y = objective_center[2]+0.8, label = expression(hat(theta)), color = "black", size = 3) + + geom_hline(yintercept = 0, color = 'black', size = 0.5) + + geom_vline(xintercept = 0, color = 'black', size = 0.5) + + theme_linedraw() + + theme( + panel.grid = element_blank(), + axis.title = element_blank(), + plot.title = element_blank() + ) + + coord_fixed(xlim = c(-3, 3), ylim = c(-3, 3), expand = FALSE) + +# Arrange plots in a 1x2 grid +p_cons <- grid.arrange(p1, p2, nrow = 1, ncol = 2) + +# PLOT 4: single schematic plot ------------------------------------------------ + +constraint_radius <- 1 +contour_levels <- c(0.1, 0.3, 0.6) +level_value <- 0.6 + +p <- ggplot() + + geom_contour(data = data, aes(x = x, y = y, z = z), color = 'red', breaks = contour_levels) + +plot_build <- ggplot_build(p) +plot_data <- plot_build$data[[1]] +filtered_data <- plot_data[plot_data$level == level_value, c("x","y")] +distances <- sqrt(filtered_data$x^2 + filtered_data$y^2) +min_idx <- which.min(abs(distances - constraint_radius)) +intersection_point <- filtered_data[min_idx,] + +p <- p + + geom_point(aes(x = intersection_point[[1]], y = intersection_point[[2]]), color = 'green', size = 2) + + annotate("label", x = intersection_point[[1]] - 0.4, y = intersection_point[[2]] + 0.4, label = expression(hat(theta)[ridge]), color = "green", size = 3) +theta <- seq(0, 2 * pi, length.out = 100) +center <- c(0, 0) +circle_datas <- data.frame( + x1 = center[1] + constraint_radius * cos(theta), + y1 = center[2] + constraint_radius * sin(theta), + x2 = center[1] + (constraint_radius / 1.5)* cos(theta), + y2 = center[2] + (constraint_radius / 1.5) * sin(theta), + x3 = center[1] + (constraint_radius / 3) * cos(theta), + y3 = center[2] + (constraint_radius / 3) * sin(theta) +) + +p <- p + + geom_polygon(data = circle_datas, aes(x = x1, y = y1), fill = 'blue', alpha = 0.3) + + geom_polygon(data = circle_datas, aes(x = x2, y = y2), fill = 'blue', alpha = 0.5) + + geom_polygon(data = circle_datas, aes(x = x3, y = y3), fill = 'blue', alpha = 0.7) + + geom_point(aes(x = objective_center[1], y = objective_center[2]), color = "black", size = 2) + + annotate("label", x = objective_center[1], y = objective_center[2]+0.8, label = expression(hat(theta)), color = "black", size = 3) + + geom_hline(yintercept = 0, color = 'black', size = 0.5) + + geom_vline(xintercept = 0, color = 'black', size = 0.5) + + geom_segment(aes(x = 0, y = -1.5, xend = 0, yend = 3), color = 'black', + arrow = arrow(length = unit(0.2, "cm"), ends = "last", type = "closed")) + # y-axis with arrow + annotate("text", x = -0.2, y = 2.8, label = expression(theta[2]), color = "black", size = 3) + + geom_segment(aes(x = -1.5, y = 0, xend = 3, yend = 0), color = 'black', + arrow = arrow(length = unit(0.2, "cm"), ends = "last", type = "closed")) + # x-axis with arrow + annotate("text", x = 2.8, y = -0.2, label = expression(theta[1]), color = "black", size = 3) + + theme_void() + + theme( + panel.grid = 
element_blank(), + axis.title = element_blank(), + plot.title = element_blank() + ) + + coord_fixed(xlim = c(-1.5, 3), ylim = c(-1.5, 3), expand = FALSE) + +ggsave(filename = "../figure/ridge_perspectives_01.png", plot = p_outside, width = 6, height = 6) +ggsave(filename = "../figure/ridge_perspectives_02.png", plot = p_inside, width = 6, height = 6) +ggsave(filename = "../figure/ridge_perspectives_03.png", plot = p_cons, width = 6, height = 3) +ggsave(filename = "../figure/ridge_perspectives_04.png", plot = p, width = 3, height = 3) diff --git a/slides/regularization/rsrc/ridge_polynomial_reg.R b/slides/regularization/rsrc/ridge_polynomial_reg.R deleted file mode 100644 index 8ce4659b..00000000 --- a/slides/regularization/rsrc/ridge_polynomial_reg.R +++ /dev/null @@ -1,74 +0,0 @@ -betaRidge <- function (X, y, lambda) -{ - return (solve(t(X) %*% X + lambda * diag(ncol(X))) %*% (t(X) %*% y)) -} - -baseTrafo <- function (x, degree) -{ - out <- cbind(1, x) - for (i in seq_len(degree)[-1]) { - out <- cbind(out, x^i) - } - # poly() is not suitable here - return (out) -} - -getPolyData <- function(x, y, lambda.vec, base.trafo, ...) -{ - X <- base.trafo(x, ...) - - x.pred <- seq(min(x), max(x), length.out = 500) - X.pred <- base.trafo(x.pred, ...) - - df.truth <- data.frame(feature = x, truth = y) - - # browser() - - df.betas <- matrix(NA, nrow=length(lambda.vec), ncol=ncol(X)) - row.names(df.betas) <- lambda.vec - - for(i in 1:length(lambda.vec)){ - df.betas[i,] <- betaRidge(X, y, lambda.vec[i]) - } - - df.polys <- lapply(1:length(lambda.vec), function (i) { - return (data.frame( - feature = x.pred, - pred = X.pred %*% df.betas[i,], - lambda = row.names(df.betas)[i] - )) - }) - return (list(polys = df.polys, - truth = df.truth, - betas = df.betas)) -} - -plotRidge <- function (x, y, lambda.vec, base.trafo, ...) -{ - requireNamespace("ggplot2") - - # browser() - - res <- getPolyData(x, y, lambda.vec, base.trafo, ...) - df.polys <- res$polys - df.truth <- res$truth - - plot.df <- df.polys[[1]] - for (i in seq_along(df.polys)[-1]) { - plot.df <- rbind(plot.df, df.polys[[i]]) - } - plot.df$lambda <- as.factor(plot.df$lambda) - - gg <- ggplot2::ggplot() - if (length(lambda.vec) == 1) { - gg <- gg + ggplot2::geom_line(data = plot.df, aes(x = feature, y = pred, color = lambda), show.legend = FALSE) - } else { - gg <- gg + ggplot2::geom_line(data = plot.df, aes(x = feature, y = pred, color = lambda)) - } - - return ( - gg + - ggplot2::geom_point(data = df.truth, mapping = aes(x = feature, y = truth)) - ) -} - diff --git a/slides/regularization/rsrc/ridge_vs_sgd_path.R b/slides/regularization/rsrc/ridge_vs_sgd_path.R new file mode 100755 index 00000000..c759001c --- /dev/null +++ b/slides/regularization/rsrc/ridge_vs_sgd_path.R @@ -0,0 +1,105 @@ +# ------------------------------------------------------------------------------ +# early stopping + +# FIG: +# LEFT: how coefficients of a linear model change +# with the regularization constant (lambda) for ridge regression. +# RIGHT: how coefficients of a linear model change with iterations for SGD. + +# DATA: linear regression model data generated by +# y = X(100*10 ~Normal)·true_coef(10*1) + noise(100*1 ~Normal).
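# Why the two panels line up: for a quadratic objective, t steps of
# (stochastic) gradient descent started at 0 with learning rate lr act
# approximately like ridge regression with
#   lambda ~ 1 / (lr * t),
# the classical early-stopping / L2 correspondence (cf. Goodfellow et al.,
# Deep Learning, ch. 7.8). This is the mapping used below via
# alphas <- 1 / (learning_rate * t_values).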
+# ------------------------------------------------------------------------------ + +library(ggplot2) +library(dplyr) +library(tidyr) +library(gridExtra) + +set.seed(6) + +# DATA ------------------------------------------------------------------------- + +# generate data for design matrix, response variable, +# and true coefficients for a linear model +# with n samples, p features and no intercept. +generate_data <- function(n, p) { + X <- matrix(rnorm(n * p), nrow = n, ncol = p) + true_coef <- seq(-1, 1, length.out = p) + noise <- rnorm(n) + y <- X %*% true_coef + noise + return(list(X = X, y = y, true_coef = true_coef)) +} + +# compute the ridge coefficients analytically +compute_ridge_path <- function(X, y, alphas) { + coefs <- matrix(0, nrow = 1, ncol = ncol(X)) + for (i in 1:length(alphas)) { + ridge_coefs <- solve(t(X) %*% X + alphas[i] * diag(ncol(X))) %*% t(X) %*% y + coefs <- rbind(coefs, as.vector(ridge_coefs)) + } + return(coefs) +} + +# compute the optimization trajectory for SGD +compute_sgd_trajectory <- function(X, y, batch_size, learning_rate, n_iter) { + w <- rep(0, ncol(X)) + coefs <- matrix(0, nrow = 1, ncol = ncol(X)) + for (i in 1:n_iter) { + indices <- sample(1:nrow(X), replace = FALSE) + for (j in seq(1, nrow(X), batch_size)) { + indices_batch <- indices[j:min(j + batch_size - 1, nrow(X))] + X_batch <- X[indices_batch, ] + y_batch <- y[indices_batch] + gradient <- -2 * t(X_batch) %*% (y_batch - X_batch %*% w) / batch_size + w <- w - learning_rate * gradient + } + coefs <- rbind(coefs, as.vector(w)) + } + return(coefs) +} + +n <- 100 +p <- 10 +batch_size <- 4 +learning_rate <- 0.01 +n_iter <- 50 +t_values <- seq(0.001, n_iter + 1, by = 1) # Include 0 in t_values for the zero coefficients +alphas <- 1 / (learning_rate * t_values[1:length(t_values)]) # Exclude 0 to avoid division by zero + +data <- generate_data(n, p) +X <- data$X +y <- data$y +true_coef <- data$true_coef + +ridge_coefs <- compute_ridge_path(X, y, alphas) + +sgd_coefs <- compute_sgd_trajectory(X, y, batch_size, learning_rate, n_iter) + +# PLOT ------------------------------------------------------------------------- + +# Ridge path +inv_alphas <- 1/alphas +df_ridge <- data.frame(inv_alphas, ridge_coefs[-1,]) + +df_ridge_long <- df_ridge %>% + pivot_longer(cols = starts_with("X"), names_to = "line", values_to = "value") + +p1 <- ggplot(df_ridge_long, aes(x = inv_alphas, y = value, color = line)) + + geom_line(show.legend = FALSE) + + labs(title = "Ridge Regression Path", x = expression("1 / ( lr *"~lambda~")"), y = "Parameters") + + theme_minimal() + +# SGD path +df_SGD <- data.frame(t_values, sgd_coefs) + +df_SGD_long <- df_SGD %>% + pivot_longer(cols = starts_with("X"), names_to = "line", values_to = "value") + +p2 <- ggplot(df_SGD_long, aes(x = t_values, y = value, color = line)) + + geom_line(show.legend = FALSE) + + labs(title = "SGD Trajectory", x = "Iterations", y = "Parameters") + + theme_minimal() + +p = grid.arrange(p1, p2, ncol = 2) + +ggsave("../figure/ridge_vs_sgd_path.png", plot=p, width=12, height=4.5) diff --git a/slides/regularization/rsrc/shrinkage.R b/slides/regularization/rsrc/shrinkage.R new file mode 100755 index 00000000..930e549f --- /dev/null +++ b/slides/regularization/rsrc/shrinkage.R @@ -0,0 +1,98 @@ +# ------------------------------------------------------------------------------ +# l1 vs l2 + +# FIG: +# (1): how coefficient values and MSE changes with regularization constant +# (lambda) for linear regression with l1 and l2 regularization. 
+# (2): histograms of coefficient values under two regularization constants +# (lambda 0.01, 100) to show how they affect shrinkage +# for linear regression with l1 and l2 regularization. +# DATA: +# (1): data from data_regu_example_1.RData +# (2): data from data_regu_example_2.RData +# ------------------------------------------------------------------------------ + +library(ggplot2) +library(gridExtra) +library(ggrepel) +library(data.table) +library(viridis) + +# DATA ------------------------------------------------------------------------- + +load("data_regu_example_1.RData") +load("data_regu_example_2.RData") + +d_l1 <- rbind( + data.frame(lam = paste(lams[1]), coefval = cc_l1_1), + data.frame(lam = paste(lams[2]), coefval = cc_l1_2) +) +d_l1$lam <- as.factor(d_l1$lam) +d_l2 <- rbind( + data.frame(lam = paste(lams[1]), coefval = cc_l2_1), + data.frame(lam = paste(lams[2]), coefval = cc_l2_2) +) +d_l2$lam <- as.factor(d_l2$lam) + +# PLOTS ------------------------------------------------------------------------- + +### (1) +plot_coef_paths <- function(path, featnames, title, xlab) { + ggd <- melt(path, id.vars = "lambda", measure = featnames, variable.name = "featname", value.name = "coefval") + ggd$label <- ifelse(ggd$lambda == min(lambda_seq), as.character(ggd$featname), NA) + pl <- ggplot(data = ggd, aes(x = lambda, y = coefval, group = featname, col = featname)) + + guides(color = "none") + + geom_line() + + geom_label_repel(aes(label = label), na.rm = TRUE, max.overlaps = Inf) + + scale_x_log10() + + ggtitle(title) + + xlab(xlab) + + theme_bw() + + scale_color_viridis(end = 0.9, discrete = TRUE) +} + +plot_cv_path <- function(cv_lam, title, xlab, ylab) { + pl <- ggplot(data = cv_lam, aes(x = lambda, y = mse)) + + geom_line() + + scale_x_log10() + + ggtitle(title) + + xlab(xlab) + + ylab(ylab) +} + +p1l1 <- plot_coef_paths(path_l1$path, featnames, "Lasso", expression(lambda)) +p1l2 <- plot_coef_paths(path_l2$path, featnames, "Ridge", expression(lambda)) +p1l3 <- plot_cv_path(path_l1$cv_lam, "Lasso", expression(lambda), 'MSE') + + theme_minimal() + ylim(25, 90) +p1l4 <- plot_cv_path(path_l2$cv_lam, "Ridge", expression(lambda), 'MSE') + + theme_minimal() + ylim(20, 90) + +p1 <- grid.arrange(p1l1, p1l2, p1l3, p1l4, nrow = 2) +ggsave("../figure/shrinkage_01.png", plot = p1, width = 8, height = 4) + + + +### (2) +# histograms of coefficient values of data d +plot_coef_hist <- function(d, title) { + pl <- ggplot(d, aes(x = coefval, fill = lam)) + + scale_fill_viridis(end = 0.9, discrete = TRUE) + + geom_histogram(alpha = 0.9, position = "dodge") + + theme_gray(base_size = 14) + + ggtitle(title) + return(pl) +} + +# coefficient histograms (d_l1, d_l2) plus CV MSE curves (cv_l1, cv_l2) + +p2l1 <- plot_coef_hist(d_l1, "Lasso") + guides(fill=guide_legend(title=expression(lambda))) +p2l2 <- plot_coef_hist(d_l2, "Ridge") + guides(fill=guide_legend(title=expression(lambda))) + + ylim(0, 50) +p2l3 <- plot_cv_path(cv_l1, "Lasso", expression(lambda), 'MSE') + + theme_gray(base_size = 14) + ylim(1, 10) +p2l4 <- plot_cv_path(cv_l2, "Ridge", expression(lambda), 'MSE') + + theme_gray(base_size = 14) + ylim(1, 10) + +p2 <- grid.arrange(p2l1, p2l2, p2l3, p2l4, nrow = 2) +ggsave("../figure/shrinkage_02.png", plot = p2, width = 8, height = 5) diff --git a/slides/regularization/rsrc/soft-thresholding.R b/slides/regularization/rsrc/soft_thresholding.R old mode 100644 new mode 100755 similarity index 63% rename from slides/regularization/rsrc/soft-thresholding.R rename to
slides/regularization/rsrc/soft_thresholding.R index 5b96abba..b1de8c12 --- a/slides/regularization/rsrc/soft-thresholding.R +++ b/slides/regularization/rsrc/soft_thresholding.R @@ -1,52 +1,55 @@ -library(ggplot2) - -# Define the soft thresholding function -soft_threshold <- function(rho, lamda) { - if (rho < -lamda) { - return (rho + lamda) - } else if (rho > lamda) { - return (rho - lamda) - } else { - return (0) - } -} - -# Lambda value -lamda <- 3 - -# Generate sequence of rho values (similar to x1 in Python) -x1 <- seq(-10, 10, by = 0.1) - -# Apply the soft thresholding function to each value in x1 -y_st <- sapply(x1, function(rho) soft_threshold(rho, lamda)) - -# Compute the ridge estimate for each value in x1 -y_ridge <- x1 / (1 + lamda) - -# Create a data frame for plotting -data <- data.frame(rho = x1, theta = y_st, OLS = x1, Ridge = y_ridge) - -# Plot using ggplot2 -p <- ggplot(data, aes(x = rho)) + - geom_line(aes(y = theta), color = 'blue', linetype = "solid", size=1.2) + - geom_line(aes(y = OLS), color = 'grey', linetype = "dashed", size=1.2) + - geom_line(aes(y = Ridge), color = 'red', linetype = "solid", size=1.2) + - labs(x = expression(theta[j]), y = expression(theta[j]), title = 'Lasso vs Ridge solution in terms of OLS (orthonormal design, lambda=3)') + - theme_minimal() + - theme( - plot.title = element_text(hjust = 0.5, size = 20), - axis.title = element_text(size = 18), - axis.text = element_text(size = 18), - axis.ticks = element_line(size = 1) - ) + - scale_color_manual(values = c('blue', 'grey', 'red')) + - geom_hline(yintercept = 0, linetype="solid", color = "black") + - geom_vline(xintercept = 0, linetype="solid", color = "black") + - guides(color = guide_legend(title = NULL)) + - theme(legend.position = "bottom") + - annotate("text", x = -9, y = -4, label = expression(S(theta[j], lambda)), parse = TRUE, size=8, color="blue") + - annotate("text", x = 7, y = 9, label = "OLS", parse = TRUE, size=8, color ="grey") + - annotate("text", x = 7, y = 0.5, label = "Ridge", color = "red", parse = TRUE, size=8) # Label for Ridge - -# Display the plot -print(p) +# ------------------------------------------------------------------------------ +# l1 + +# FIG: draw lasso and ridge solution paths in terms of OLS. 
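+# (In the orthonormal case, lasso soft-thresholds the OLS estimate, +# theta = sign(theta_OLS) * max(|theta_OLS| - lambda, 0), while ridge rescales +# it to theta_OLS / (1 + lambda); this is exactly what the code below computes.)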
+# ------------------------------------------------------------------------------ + +library(ggplot2) + +# DATA ------------------------------------------------------------------------- + +soft_threshold <- function(rho, lambda) { + if (rho < -lambda) { + return (rho + lambda) + } else if (rho > lambda) { + return (rho - lambda) + } else { + return (0) + } +} + +lambda <- 3 + +x1 <- seq(-10, 10, by = 0.1) + +y_st <- sapply(x1, function(rho) soft_threshold(rho, lambda)) + +# ridge estimate +y_ridge <- x1 / (1 + lambda) + +# PLOT ------------------------------------------------------------------------- + +data <- data.frame(rho = x1, theta = y_st, OLS = x1, Ridge = y_ridge) + +p <- ggplot(data, aes(x = rho)) + + geom_line(aes(y = theta), color = 'blue', linetype = "solid", size=1.2) + + geom_line(aes(y = OLS), color = 'grey', linetype = "dashed", size=1.2) + + geom_line(aes(y = Ridge), color = 'red', linetype = "solid", size=1.2) + + labs(x = expression(theta[OLS]), y = expression(theta[pen]), title = 'Lasso vs Ridge solution in terms of OLS (orthonormal design, lambda=3)') + + theme_minimal() + + theme( + plot.title = element_text(hjust = 0.5, size = 20), + axis.title = element_text(size = 18), + axis.text = element_text(size = 18), + axis.ticks = element_line(size = 1) + ) + + scale_color_manual(values = c('blue', 'grey', 'red')) + + geom_hline(yintercept = 0, linetype="solid", color = "black") + + geom_vline(xintercept = 0, linetype="solid", color = "black") + + guides(color = guide_legend(title = NULL)) + + theme(legend.position = "bottom") + + annotate("text", x = -9, y = -4, label = "Lasso", parse = TRUE, size=8, color="blue") + + annotate("text", x = 7, y = 9, label = "OLS", parse = TRUE, size=8, color = "grey") + + annotate("text", x = 7, y = 0.5, label = "Ridge", color = "red", parse = TRUE, size=8) + +ggsave("../figure/soft_thresholding.png", plot = p, width = 10, height = 5) diff --git a/slides/regularization/rsrc/solution_path.R b/slides/regularization/rsrc/solution_path.R new file mode 100755 index 00000000..76cbfb5d --- /dev/null +++ b/slides/regularization/rsrc/solution_path.R @@ -0,0 +1,118 @@ +# ------------------------------------------------------------------------------ +# l1, l2 + +# FIG: solution path under l1 and l2 regularization.
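+# (Cyan contours show the penalty, red contours the least-squares risk; the +# plotted points trace the regularized minimizers as lambda varies.)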
+# DATA: +# x = seq(0, 1, length.out = 40) +# noise ~ Unif(0, 1) +# y = sin(x * 1.5 * pi) +# y_noise = (y + noise) - mean(y + noise) +# ------------------------------------------------------------------------------ + +library(ggplot2) +library(dplyr) +library(tidyr) +library(Matrix) +library(glmnet) +library(pracma) +library(gridExtra) +set.seed(0) + +# DATA ------------------------------------------------------------------------- + +# Cost function definitions +cost_l2 <- function(x, y) { + return(x^2 + y^2) +} + +cost_l1 <- function(x, y) { + return(abs(x) + abs(y)) +} + +costfunction <- function(X, y, theta) { + m <- length(y) + h <- X %*% theta + return((1 / (2 * m)) * t(h - y) %*% (h - y)) +} + +closed_form_reg_solution <- function(X, y, lambda = 10) { + m <- nrow(X) + n <- ncol(X) + I <- diag(n) + return(solve(t(X) %*% X + lambda * I) %*% t(X) %*% y) +} + +# Dataset creation and normalization +x <- seq(0, 1, length.out = 40) +noise <- runif(40, 0, 1) +y <- sin(x * 1.5 * pi) +y_noise <- (y + noise) - mean(y + noise) +X <- cbind(x, x^2) +X <- sweep(X, 2, sqrt(colSums(X^2)), FUN = "/") + +# Setup of meshgrid of theta values +theta1 <- seq(-2, 17, length.out = 100) +theta2 <- seq(-17, 3, length.out = 100) +grid <- expand.grid(theta1 = theta1, theta2 = theta2) + +# Computing the cost function for each theta combination +grid <- grid %>% + mutate(Z_l2 = cost_l2(theta1, theta2), + Z_l1 = cost_l1(theta1, theta2), + Z_ls = apply(grid, 1, function(row) costfunction(X, y_noise, matrix(c(row[1], row[2]), nrow = 2)))) + +# Calculating the regularization paths +lambda_range_l2 <- 10^seq(0, 4, length.out = 100) / 1000 +lambda_range_l1 <- 10^seq(0, 2, length.out = 100) / 1000 + +theta_l2 <- sapply(lambda_range_l2, function(l) closed_form_reg_solution(X, y_noise, l)) +theta_l1 <- sapply(lambda_range_l1, function(l) coef(glmnet(X, y_noise, alpha=1, lambda=l, standardize=FALSE, intercept=FALSE))[2:3]) + +theta_l2_df <- data.frame(t(theta_l2)) +theta_l1_df <- data.frame(t(theta_l1)) + + +# L2 plot + +l2_contour_levels <- c(.5, 1.5, 3, 6, 9, 15, 30, 60, 100, 150, 250) + +p2 <- ggplot() + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_l2), color = 'cyan', breaks = l2_contour_levels) + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_ls), color = 'red', breaks = c(.01, .06, .09, .11, .15)) + + geom_point(data = theta_l2_df, aes(x = X1, y = X2), color = 'red', alpha = 0.2) + + labs(title = 'L2 regularization solution path', x = expression(theta[1]), y = expression(theta[2])) + + theme_minimal() + + coord_fixed() + +# L1 & L2 plot + +# Plot L2 Regularization +inside_l2 <- theta_l2_df %>% + filter(cost_l2(X1, X2) < max(l2_contour_levels)) + +p_l2 <- ggplot() + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_l2), color = 'cyan', breaks = l2_contour_levels) + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_ls), color = 'red', breaks = c(.01, .06, .09, .11, .15)) + + geom_point(data = inside_l2, aes(x = X1, y = X2), color = 'green', alpha = 0.5) + + labs(title = 'L2 regularization solution path', x = expression(theta[1]), y = expression(theta[2])) + + theme_minimal() + + coord_fixed() + +# Plot L1 Regularization +l1_contour_levels = c(.5, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14) + +inside_l1 <- theta_l1_df %>% + filter(cost_l1(X1, X2) < max(l1_contour_levels)) + +p_l1 <- ggplot() + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_l1), color = 'cyan', breaks = l1_contour_levels) + + geom_contour(data = grid, aes(x = theta1, y = theta2, z = Z_ls), color 
= 'red', breaks = c(.01, .06, .09, .11, .15)) + + geom_point(data = inside_l1, aes(x = X1, y = X2), color = 'green', alpha = 0.5) + + labs(title = 'L1 regularization solution path', x = expression(theta[1]), y = expression(theta[2])) + + theme_minimal() + + coord_fixed() + +p <- grid.arrange(p_l2, p_l1, ncol = 2) + +ggsave(filename = "../figure/solution_paths_02.png", plot = p2, width = 5, height = 5) +ggsave(filename = "../figure/solution_paths_01.png", plot = p, width = 10, height = 5) diff --git a/slides/regularization/rsrc/table_equivariance.R b/slides/regularization/rsrc/table_equivariance.R new file mode 100755 index 00000000..2b8eafd7 --- /dev/null +++ b/slides/regularization/rsrc/table_equivariance.R @@ -0,0 +1,89 @@ +# ------------------------------------------------------------------------------ +# l1 vs l2 + +# TABLE: coefficients and MSE of OLS and ridge for X and rescaled X +# DATA: Y = X(100*5 ~Normal) * beta_true + epsilon(100*1 ~Normal) +# ------------------------------------------------------------------------------ + +library(MASS) +library(xtable) +library(dplyr) + +set.seed(123) + +# DATA ------------------------------------------------------------------------- + +n <- 100 +p <- 5 +X <- matrix(rnorm(n * p), n, p) +beta_true <- c(1, 2, 3, 4, 5) +epsilon <- rnorm(n) +Y <- X %*% beta_true + epsilon + +# OLS Solution +beta_ols <- solve(t(X) %*% X) %*% t(X) %*% Y + +# Ridge Solution +lambda <- 10 +beta_ridge <- solve(t(X) %*% X + lambda * diag(p)) %*% t(X) %*% Y + +# Rescale and repeat +X_rescaled <- X +X_rescaled[,5] <- 100 * X_rescaled[,5] +beta_ols_rescaled <- solve(t(X_rescaled) %*% X_rescaled) %*% t(X_rescaled) %*% Y +beta_ridge_rescaled <- solve(t(X_rescaled) %*% X_rescaled + lambda * diag(p)) %*% t(X_rescaled) %*% Y + +# Results +results <- rbind(t(beta_ols), t(beta_ols_rescaled), t(beta_ridge), t(beta_ridge_rescaled)) +colnames(results) <- paste("Coefficient", 1:p) + +# MSE +loss_ols <- mean((Y - X %*% beta_ols)^2) +loss_ols_rescaled <- mean((Y - X_rescaled %*% beta_ols_rescaled)^2) +loss_ridge <- mean((Y - X %*% beta_ridge)^2) # + lambda * sum(beta_ridge^2) +loss_ridge_rescaled <- mean((Y - X_rescaled %*% beta_ridge_rescaled)^2) #+ lambda * sum(beta_ridge_rescaled^2) + +losses <- c(loss_ols, loss_ols_rescaled, loss_ridge, loss_ridge_rescaled) +results <- cbind(results, MSE = losses) +rownames(results) <- c("OLS", "OLS Rescaled", "Ridge", "Ridge Rescaled") +print(results) + +# TABLE ------------------------------------------------------------------------ +results <- round(results, 3) +# Function to bold specific column values +bold_coefficient5 <- function(x) { + x[, "Coefficient 5"] <- paste0("\\textbf{", formatC(x[, "Coefficient 5"], format = "f", digits = 3), "}") + x +} + +table_ols <- bold_coefficient5(results[1:2,]) +table_ridge <- bold_coefficient5(results[3:4,]) + +table_ols <- xtable(table_ols) +align(table_ols) <- "|c|cccccc|" +table_ridge <- xtable(table_ridge) +align(table_ridge) <- "|c|cccccc|" + +add.to.row <- list(pos = list(-1, nrow(table_ols)), + command = c("\\hline\n\\textbf{Method} & \\( \\hat{\\theta}_1 \\) & \\( \\hat{\\theta}_2 \\) & \\( \\hat{\\theta}_3 \\) & \\( \\hat{\\theta}_4 \\) & \\( \\hat{\\theta}_5 \\) & MSE \\\\ \\hline\n", + "\\hline\n")) + +print(table_ols, file = "table_equivariance_ols.tex", include.rownames = TRUE, + include.colnames = FALSE, + sanitize.text.function = identity, + tabular.environment = "tabular", + floating = FALSE, + add.to.row = add.to.row, + hline.after = NULL, + booktabs = FALSE, + comment = FALSE) + 
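+# quick sanity check (safe to drop): OLS is scale-equivariant, so multiplying +# x5 by 100 must divide its coefficient by exactly 100; ridge offers no such +# guarantee, as the two tables show. +stopifnot(isTRUE(all.equal(beta_ols[5] / 100, beta_ols_rescaled[5]))) +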
+print(table_ridge, file = "table_equivariance_ridge.tex", include.rownames = TRUE, + include.colnames = FALSE, + sanitize.text.function = identity, + tabular.environment = "tabular", + floating = FALSE, + add.to.row = add.to.row, + hline.after = NULL, + booktabs = FALSE, + comment = FALSE) diff --git a/slides/regularization/rsrc/table_equivariance_ols.tex b/slides/regularization/rsrc/table_equivariance_ols.tex new file mode 100755 index 00000000..80d4bdbb --- /dev/null +++ b/slides/regularization/rsrc/table_equivariance_ols.tex @@ -0,0 +1,7 @@ +\begin{tabular}{|c|cccccc|} + \hline +\textbf{Method} & \( \hat{\theta}_1 \) & \( \hat{\theta}_2 \) & \( \hat{\theta}_3 \) & \( \hat{\theta}_4 \) & \( \hat{\theta}_5 \) & MSE \\ \hline + OLS & 0.984 & 2.147 & 3.006 & 3.918 & \textbf{5.205} & 0.812 \\ + OLS Rescaled & 0.984 & 2.147 & 3.006 & 3.918 & \textbf{0.052} & 0.812 \\ + \hline +\end{tabular} diff --git a/slides/regularization/rsrc/table_equivariance_ridge.tex b/slides/regularization/rsrc/table_equivariance_ridge.tex new file mode 100755 index 00000000..d22cca73 --- /dev/null +++ b/slides/regularization/rsrc/table_equivariance_ridge.tex @@ -0,0 +1,7 @@ +\begin{tabular}{|c|cccccc|} + \hline +\textbf{Method} & \( \hat{\theta}_1 \) & \( \hat{\theta}_2 \) & \( \hat{\theta}_3 \) & \( \hat{\theta}_4 \) & \( \hat{\theta}_5 \) & MSE \\ \hline + Ridge & 0.709 & 1.874 & 2.661 & 3.558 & \textbf{4.636} & 1.366 \\ + Ridge Rescaled & 0.802 & 1.943 & 2.675 & 3.569 & \textbf{0.051} & 1.08 \\ + \hline +\end{tabular} diff --git a/slides/regularization/rsrc/make_weightdecay_lambda_plot.R b/slides/regularization/rsrc/weightdecay_lambda.R old mode 100644 new mode 100755 similarity index 62% rename from slides/regularization/rsrc/make_weightdecay_lambda_plot.R rename to slides/regularization/rsrc/weightdecay_lambda.R index 28b150fc..b9af3b4e --- a/slides/regularization/rsrc/make_weightdecay_lambda_plot.R +++ b/slides/regularization/rsrc/weightdecay_lambda.R @@ -1,6 +1,17 @@ -source("utils.R") +# ------------------------------------------------------------------------------ +# wd vs l2 + +# FIG: draw the path of the parameter iterates when optimizing with weight decay. +# use different decay parameters (lambda) to show how strong the pull toward the origin is.
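+# (produces weightdecay_lambda_01.png for the small lambda and +# weightdecay_lambda_02.png for the large one)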
+ +# DATA: linear model data from data_func_utils.R +# ------------------------------------------------------------------------------ + +source("data_func_utils.R") library(gridExtra) +# DATA ------------------------------------------------------------------------- + x1 <- seq(0,1.5,length.out = 100) x2 <- seq(0,3.5,length.out = 100) lambda <- 5 @@ -14,6 +25,8 @@ gd_l2_betas <- gradient_descent(beta_start, step_size, ret <- weight_decay(beta_start, lambda, step_size, R_emp_grad, num_steps) +# PLOT ------------------------------------------------------------------------- + remp_l2_plot_1 <- plot_r_emp(R_emp, x1, x2) + geom_path(data = ret$betas_gd, aes(x=V1, y=V2), colour = "red", size=1.1) + geom_path(data = ret$betas_wd, aes(x=V1, y=V2), colour = "yellow", size=1.1) + @@ -34,5 +47,5 @@ remp_l2_plot_2 <- plot_r_emp(R_emp, x1, x2) + #p <- grid.arrange(remp_l2_plot_1 , remp_l2_plot_2 , ncol=2) -ggsave("../figure/weightdecay_lambda_plot_01.png", plot = remp_l2_plot_1, width = 2.6, height = 3.1, dpi="retina") -ggsave("../figure/weightdecay_lambda_plot_02.png", plot = remp_l2_plot_2, width = 2.6, height = 3.1, dpi="retina") +ggsave("../figure/weightdecay_lambda_01.png", plot = remp_l2_plot_1, width = 2.6, height = 3.1, dpi="retina") +ggsave("../figure/weightdecay_lambda_02.png", plot = remp_l2_plot_2, width = 2.6, height = 3.1, dpi="retina") diff --git a/slides/regularization/slides-regu-early-stopping.tex b/slides/regularization/slides-regu-early-stopping.tex index 87a6bac2..c2f647de 100644 --- a/slides/regularization/slides-regu-early-stopping.tex +++ b/slides/regularization/slides-regu-early-stopping.tex @@ -106,7 +106,7 @@ \begin{figure} \centering %\scalebox{0.75} - {\includegraphics{figure_man/ridge-vs-sgd-path.png}} + {\includegraphics{figure/ridge_vs_sgd_path.png}} %\scriptsize{\\Ali et al. (2020)\\} \end{figure} diff --git a/slides/regularization/slides-regu-geom-l2.tex b/slides/regularization/slides-regu-geom-l2.tex index c83460ad..0997c083 100644 --- a/slides/regularization/slides-regu-geom-l2.tex +++ b/slides/regularization/slides-regu-geom-l2.tex @@ -8,7 +8,7 @@ \begin{document} \titlemeta{Regularization }{Geometry of L2 Regularization} -{figure/l2_reg_hess_03_plot.png} { +{figure/l2_reg_hess_03.png} { \item Approximate transformation of unregularized minimizer to regularized \item Principal components of Hessian influence where parameters are decayed } @@ -87,7 +87,7 @@ % \end{footnotesize} \begin{figure} -\includegraphics[width=0.9\textwidth]{figure/l2_reg_hess_01_plot.png}\\ +\includegraphics[width=0.9\textwidth]{figure/l2_reg_hess_01.png}\\ \end{figure} % % \begin{footnotesize} @@ -95,7 +95,7 @@ % % \end{footnotesize} % \begin{figure} -% \includegraphics[width=0.9\textwidth]{figure/l2_reg_hess_02_plot.png}\\ +% \includegraphics[width=0.9\textwidth]{figure/l2_reg_hess_02.png}\\ % \end{figure} @@ -106,7 +106,7 @@ % \end{footnotesize} \begin{figure} -\includegraphics[width=0.85\textwidth]{figure/l2_reg_hess_03_plot.png}\\ +\includegraphics[width=0.85\textwidth]{figure/l2_reg_hess_03.png}\\ \end{figure} @@ -144,7 +144,7 @@ \begin{figure} \centering - \scalebox{0.8}{\includegraphics{figure/l2_reg_hess_04_plot.png}} + \scalebox{0.8}{\includegraphics{figure/l2_reg_hess_04.png}} %\caption{\tiny The solid ellipses represent the contours of the unregularized objective and the dashed circles represent the contours of the $L2$ penalty. 
At $\hat{\thetab}_{\text{ridge}}$, the competing objectives reach an equilibrium.} \end{figure} diff --git a/slides/regularization/slides-regu-intro.tex b/slides/regularization/slides-regu-intro.tex index aae6ca1e..f30328e2 100644 --- a/slides/regularization/slides-regu-intro.tex +++ b/slides/regularization/slides-regu-intro.tex @@ -45,12 +45,12 @@ \begin{column}{0.5\textwidth} \raggedright Overfitted model\\ - \includegraphics[width=0.85\textwidth]{figure/eval_ofit_1o} + \includegraphics[width=0.85\textwidth]{figure/model_eval_02.png} \end{column} \begin{column}{0.5\textwidth} \raggedright Appropriate model\\ - \includegraphics[width=0.85\textwidth]{figure/eval_ofit_1a} + \includegraphics[width=0.85\textwidth]{figure/model_eval_01.png} \end{column} \end{columns} diff --git a/slides/regularization/slides-regu-l1.tex b/slides/regularization/slides-regu-l1.tex index 7c9cd8be..0af32f9f 100644 --- a/slides/regularization/slides-regu-l1.tex +++ b/slides/regularization/slides-regu-l1.tex @@ -8,7 +8,7 @@ \begin{document} -\titlemeta{Regularization}{Lasso Regression}{figure/lin_reg_l1.png}{ +\titlemeta{Regularization}{Lasso Regression}{figure/lin_model_regu_01.png}{ \item Lasso regression / $L1$ penalty \item Know that lasso selects features \item Support recovery @@ -40,24 +40,24 @@ \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=0.99\textwidth]{figure/lin_reg_l1.png} +\includegraphics[width=0.99\textwidth]{figure/lin_model_regu_01.png} \end{figure} \end{column} \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=0.99\textwidth]{figure/lin_reg_l2.png} +\includegraphics[width=0.99\textwidth]{figure/lin_model_regu_02.png} \end{figure} \end{column} \end{columns} %\begin{figure} -%\includegraphics[width=0.8\textwidth]{figure/lin_reg_l1.png} +%\includegraphics[width=0.8\textwidth]{figure/lin_model_regu_01.png} %\end{figure} %\begin{figure} -%\includegraphics[width=0.8\textwidth]{figure/lin_reg_l2.png} +%\includegraphics[width=0.8\textwidth]{figure/lin_model_regu_02.png} %\end{figure} \lz @@ -70,7 +70,7 @@ Contours of regularized objective for different $\lambda$ values. \begin{figure} -\includegraphics[width=0.85\textwidth]{figure/lasso_contours.png} +\includegraphics[width=0.85\textwidth]{figure/reg_contours_01.png} \end{figure} Green = true minimizer of the unreg. objective and red = lasso solution. @@ -80,9 +80,23 @@ Regularized empirical risk $\riskr(\theta_1,\theta_2)$ using squared loss for $\lambda \uparrow$. $L1$ penalty makes non-smooth kinks at coordinate axes more pronounced, while $L2$ penalty warps $\riskr$ toward a ``basin'' (elliptic paraboloid).
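+% Columns show $\lambda \in \{0, 1, 10\}$; within each column, $L1$ on top and $L2$ below.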
\begin{figure} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam0.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam0.png}} + \end{minipage} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam1.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam1.png}} + \end{minipage} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam10.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam10.png}} + \end{minipage} \end{figure} +%\begin{figure} +%\includegraphics[width=0.8\textwidth]{figure/reg_surfaces_l1_l2.png}\\ +%\end{figure} \framebreak We can also rewrite this as a constrained optimization problem. The penalty results in a diamond-shaped constraint region. @@ -95,7 +109,7 @@ \vspace{-0.1cm} \begin{figure}%\includegraphics[width=0.3\textwidth]{figure_man/lasso_hat.png}\\ \includegraphics[width=0.95\textwidth] -{figure_man/lasso_contours_cases.png}\\ +{figure/lasso_contour_cases.png}\\ \end{figure} \end{vbframe} @@ -112,7 +126,7 @@ %Soft threshold ensures exact zeros, while $L2$ penalty shrinks uniformly. \vspace{-0.16cm} \begin{figure} -\includegraphics[width=0.5\textwidth]{figure_man/soft-thresholding.pdf}\\ +\includegraphics[width=0.5\textwidth]{figure/soft_thresholding.png}\\ \end{figure} \end{vbframe} @@ -124,7 +138,7 @@ \end{itemize} \lz \begin{figure} -\includegraphics[width=0.9\textwidth]{figure_man/solution_paths_l1_l2.png}\\ +\includegraphics[width=0.9\textwidth]{figure/solution_paths_01.png}\\ \end{figure} \end{vbframe} diff --git a/slides/regularization/slides-regu-l1vsl2.tex b/slides/regularization/slides-regu-l1vsl2.tex index 7c956053..e409b5d8 100644 --- a/slides/regularization/slides-regu-l1vsl2.tex +++ b/slides/regularization/slides-regu-l1vsl2.tex @@ -61,7 +61,7 @@ %Coefficient histograms for different $\lambda$ values for ridge and lasso for simulated data along with the cross-validated MSE.
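+% Lasso piles histogram mass at exactly zero for large $\lambda$, whereas ridge only shrinks coefficients toward zero.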
\begin{figure} -\includegraphics[width=0.6\textwidth]{figure/shrinkage_2.png}\\ +\includegraphics[width=0.6\textwidth]{figure/shrinkage_02.png}\\ \end{figure} \end{vbframe} @@ -91,12 +91,7 @@ \vspace{-0.4cm} \begin{table}[h] \centering -\begin{tabular}{|c|c c c c c c|} -\hline -\textbf{Method} & \( \hat{\theta}_1 \) & \( \hat{\theta}_2 \) & \( \hat{\theta}_3 \) & \( \hat{\theta}_4 \) & \( \hat{\theta}_5 \) & MSE \\ \hline -OLS & 0.983 & 2.147 & 3.005 & 3.917 & \textbf{5.204} & 0.812 \\ %\hline -OLS rescaled & 0.983 & 2.147 & 3.005 & 3.917 & \textbf{0.052} & 0.812 \\ \hline -\end{tabular} +\input{rsrc/table_equivariance_ols.tex} %\caption{Equivariant OLS estimates under rescaling of $x_5$} \end{table} \vspace{-0.1cm} @@ -108,12 +103,7 @@ \vspace{-0.4cm} \begin{table}[h] \centering -\begin{tabular}{|c|c c c c c c|} -\hline -\textbf{Method} & \( \hat{\theta}_1 \) & \( \hat{\theta}_2 \) & \( \hat{\theta}_3 \) & \( \hat{\theta}_4 \) & \( \hat{\theta}_5 \) & MSE \\ \hline -ridge & 0.709 & 1.873 & 2.661 & 3.557 & \textbf{4.636} & 1.366 \\ %\hline -ridge rescaled & 0.802 & 1.942 & 2.675 & 3.569 & \textbf{0.051} & 1.079 \\ \hline -\end{tabular} +\input{rsrc/table_equivariance_ridge.tex} %\caption{ridge estimates for $\lambda=10$ under rescaling of $x_5$} \end{table} } @@ -171,7 +161,7 @@ $x_1$-$x_4$ are independent, but $x_4$ and $x_5$ are strongly correlated. \begin{center} -\includegraphics[width=0.6\textwidth]{figure/regu_example_multicollinearity.png} +\includegraphics[width=0.6\textwidth]{figure/multicollinearity_example.png} \end{center} diff --git a/slides/regularization/slides-regu-l2-nonlin.tex b/slides/regularization/slides-regu-l2-nonlin.tex index c475ec33..da2065f6 100644 --- a/slides/regularization/slides-regu-l2-nonlin.tex +++ b/slides/regularization/slides-regu-l2-nonlin.tex @@ -15,7 +15,7 @@ }{% Lecture title Intuition for L2 Regularization in Non-Linear Models }{% Relative path to title page image: Can be empty but must not start with slides/ - figure_man/bias-variance-ridge.png + figure/bias_var_decomp.png }{ \item Understand how regularization and parameter shrinkage can be beneficial to non-linear models } diff --git a/slides/regularization/slides-regu-l2.tex b/slides/regularization/slides-regu-l2.tex index 06c3618f..985b66f9 100644 --- a/slides/regularization/slides-regu-l2.tex +++ b/slides/regularization/slides-regu-l2.tex @@ -15,7 +15,7 @@ }{% Lecture title Ridge Regression }{% Relative path to title page image: Can be empty but must not start with slides/ - figure/ridge_outside.png + figure/ridge_perspectives_01.png }{ \item Regularized linear model \item Ridge regression / $L2$ penalty @@ -44,7 +44,7 @@ %Assume the data generating process $y=3x_{1} -2x_{2} +\epsilon $, where $\displaystyle \epsilon \sim N( 0,1)$. The true minimizer is given by $\theta ^{*} =( 3,-2)^{T}$. %\begin{figure} -%\includegraphics[width=0.8\textwidth]{figure/lin_reg_l2.png} +%\includegraphics[width=0.8\textwidth]{figure/lin_model_regu_02.png} %\end{figure} %With increasing regularization, $\theta_{\textit{reg}}$ is pulled back to the origin. @@ -74,7 +74,7 @@ Let $y=3x_{1} -2x_{2} +\epsilon $, $ \epsilon \sim N( 0,1)$. The true minimizer is $\theta ^{*} =( 3,-2)^{T}$, with $ \thetah_{\text{ridge}} = \argmin_{\thetab} \|\yv - \Xmat \thetab\|^2 + \lambda \|\thetab\|^2 $. 
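+% For reference, the closed form is $\thetah_{\text{ridge}} = (\Xmat^T \Xmat + \lambda I)^{-1} \Xmat^T \yv$ (this is what rsrc/table_equivariance.R computes).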
\begin{figure} -\includegraphics[width=0.8\textwidth]{figure/lin_reg_l2.png} +\includegraphics[width=0.8\textwidth]{figure/lin_model_regu_02.png} \end{figure} \vspace{-0.2cm} {\small With increasing regularization, $\hat{\theta}_{\textit{ridge}}$ is pulled back to the origin\\ (contour lines show unregularized objective).} @@ -84,7 +84,7 @@ $ \thetah_{\text{ridge}} = \argmin_{\thetab} \|\yv - \Xmat \thetab\|^2 + \lambda \|\thetab\|^2 $. \begin{figure} -\includegraphics[width=0.8\textwidth]{figure/ridge_contours.png} +\includegraphics[width=0.8\textwidth]{figure/reg_contours_02.png} \end{figure} \vspace{-0.2cm} Green = true coefs of the DGP and red = ridge solution. @@ -103,7 +103,7 @@ \vspace{-1.0cm} \begin{figure} -\includegraphics[width=0.6\textwidth]{figure/ridge_constraints.png} +\includegraphics[width=0.6\textwidth]{figure/ridge_perspectives_03.png} \end{figure} \begin{footnotesize} @@ -116,7 +116,7 @@ \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=\textwidth]{figure/ridge_inside.png} +\includegraphics[width=\textwidth]{figure/ridge_perspectives_02.png} \end{figure} \end{column} @@ -140,7 +140,7 @@ \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=\textwidth]{figure/ridge_outside.png} +\includegraphics[width=\textwidth]{figure/ridge_perspectives_01.png} \end{figure} \end{column} @@ -162,7 +162,7 @@ \begin{column}{0.5\textwidth} \lz \begin{figure} -\includegraphics[width=\textwidth]{figure_man/solution-path-ridge-only.png} +\includegraphics[width=\textwidth]{figure/solution_paths_02.png} \end{figure} \end{column} @@ -194,7 +194,7 @@ Using model complexity $d = 10$ overfits: \begin{center} -\includegraphics[width = 10cm ]{figure/poly_ridge_1.png} \\ +\includegraphics[width = 10cm ]{figure/poly_ridge_01.png} \\ \end{center} \framebreak @@ -204,7 +204,7 @@ \vfill \begin{center} -\includegraphics[width = 11cm ]{figure/poly_ridge_2.png} \\ +\includegraphics[width = 11cm ]{figure/poly_ridge_02.png} \\ \end{center} @@ -241,7 +241,7 @@ Let $y=3x_{1} -2x_{2} +\epsilon $, $ \epsilon \sim N( 0,1)$. The true minimizer is $\theta ^{*} =( 3,-2)^{T}$. Consider $\lambda $ values of 0.01, 0.5, 1, 1.5, 2, 2.5, 10. \begin{figure} -\includegraphics[width=0.7\textwidth]{figure/lin_reg_l1.png} +\includegraphics[width=0.7\textwidth]{figure/lin_model_regu_01.png} \end{figure} With increasing regularization, $\theta_{\textit{ridge}}$ is pulled back to the origin. Contours = unreg. objective, dots = reg. solution for increasing $\lambda$. @@ -252,7 +252,7 @@ Contours of regularized objective for different $\lambda$ values. \begin{figure} -\includegraphics[width=0.9\textwidth]{figure/lasso_contours.png} +\includegraphics[width=0.9\textwidth]{figure/reg_contours_01.png} \end{figure} \framebreak @@ -289,18 +289,33 @@ \end{itemize} \lz \begin{figure} -\includegraphics[width=0.9\textwidth]{figure_man/solution_paths_l1_l2.png}\\ +\includegraphics[width=0.9\textwidth]{figure/solution_paths_01.png}\\ \end{figure} \end{vbframe} \begin{vbframe}{Effect of $L1$/$L2$ on Loss Surface} Regularized empirical risk $\riskr(\theta_1,\theta_2)$ using squared loss for $\lambda \uparrow$. $L1$ penalty makes non-smooth kinks at coordinate axes more pronounced, while $L2$ penalty warps $\riskr$ toward a ``basin'' (elliptic paraboloid).
- + \begin{figure} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam0.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam0.png}} + \end{minipage} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam1.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam1.png}} + \end{minipage} + \begin{minipage}{0.32\linewidth} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l1_lam10.png}} + \centerline{\includegraphics[width=\textwidth]{figure/reg_surfaces_l2_lam10.png}} + \end{minipage} \end{figure} +%\begin{figure} +%\includegraphics[width=0.8\textwidth]{figure/reg_surfaces_l1_l2.png}\\ +%\end{figure} + \end{vbframe} \end{comment} diff --git a/slides/regularization/slides-regu-nonlin.tex b/slides/regularization/slides-regu-nonlin.tex index d80bc442..b359ce23 100644 --- a/slides/regularization/slides-regu-nonlin.tex +++ b/slides/regularization/slides-regu-nonlin.tex @@ -15,7 +15,7 @@ }{% Lecture title Non-Linear Models and Structural Risk Minimization }{% Relative path to title page image: Can be empty but must not start with slides/ - figure/fig-regu-nonlin-2.png + figure/classifi_nn_w_size_2.png }{ \item Regularization even more important in non-linear models \item Norm penalties applied similarly @@ -88,13 +88,13 @@ \vspace{-0.8cm} %\vfill -\only<1>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-1.png}\end{center}} -\only<2>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-2.png}\end{center}} -\only<3>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-3.png}\end{center}} -\only<4>{\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-4.png}\end{center}} +\only<1>{\begin{center}\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_1.png}\end{center}} +\only<2>{\begin{center}\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_2.png}\end{center}} +\only<3>{\begin{center}\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_3.png}\end{center}} +\only<4>{\begin{center}\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_4.png}\end{center}} -%\only<5>{\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-5.png}} -%\only<6>{\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-6.png}} +%\only<5>{\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_5.png}} +%\only<6>{\includegraphics[width=\textwidth]{figure/classifi_nn_w_size_6.png}} $\lambda$ affects smoothness of decision boundary and magnitude of weights @@ -106,7 +106,7 @@ Same settings as before, but each $\lambda$ is evaluated with 5x10 REP-CV -\begin{center}\includegraphics[width=1\textwidth]{figure/fig-regu-nonlin-srm-1.png}\end{center} +\begin{center}\includegraphics[width=1\textwidth]{figure/classifi_nn_err_decay.png}\end{center} Typical U-shape with sweet spot between overfitting and underfitting \end{frame} @@ -163,7 +163,7 @@ \only<1>{ \begin{center} \begin{minipage}{0.5\textwidth} -\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-1.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_1.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_1.png} @@ -173,7 +173,7 @@ \only<2>{ \begin{center} \begin{minipage}{0.5\textwidth}
-\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-2.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_2.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_2.png} @@ -183,7 +183,7 @@ \only<3>{ \begin{center} \begin{minipage}{0.5\textwidth} -\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-3.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_3.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_3.png} @@ -193,7 +193,7 @@ \only<4>{ \begin{center} \begin{minipage}{0.5\textwidth} -\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-4.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_4.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_5.png} @@ -203,7 +203,7 @@ \only<5>{ \begin{center} \begin{minipage}{0.5\textwidth} -\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-5.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_5.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_10.png} @@ -214,7 +214,7 @@ \only<6>{ \begin{center} \begin{minipage}{0.5\textwidth} -\includegraphics[width=\linewidth]{figure/fig-regu-nonlin-size-6.png} +\includegraphics[width=\linewidth]{figure/classifi_nn_size_6.png} \end{minipage}% \begin{minipage}{0.5\textwidth} \includegraphics[width=\linewidth]{figure/nn_size_100.png} @@ -230,7 +230,7 @@ \begin{frame} {Structural Risk Minimization} Again, complexity vs CV score. -\begin{center}\includegraphics[width=\textwidth]{figure/fig-regu-nonlin-srm-2.png}\end{center} +\begin{center}\includegraphics[width=\textwidth]{figure/classifi_nn_err_size.png}\end{center} Minimal model with good generalization seems to be size = 10 @@ -251,7 +251,7 @@ \end{column} \begin{column}{0.5\textwidth} \begin{figure} -\includegraphics[width=0.6\textwidth]{figure/ridge_hat.png} +\includegraphics[width=0.6\textwidth]{figure/ridge_perspectives_04.png} \end{figure} \end{column} \end{columns} diff --git a/slides/regularization/slides-regu-wd-vs-l2.tex b/slides/regularization/slides-regu-wd-vs-l2.tex index 58f62513..4d40045e 100644 --- a/slides/regularization/slides-regu-wd-vs-l2.tex +++ b/slides/regularization/slides-regu-wd-vs-l2.tex @@ -55,8 +55,8 @@ How strongly we are pulled back (for fixed $\alpha$) depends on $\lambda$: \begin{figure} - \subfloat[Small $\lambda$]{\includegraphics[width=0.4\textwidth]{figure/weightdecay_lambda_plot_01.png}} - \subfloat[Large $\lambda$]{\includegraphics[width=0.4\textwidth]{figure/weightdecay_lambda_plot_02.png}}\\ + \subfloat[Small $\lambda$]{\includegraphics[width=0.4\textwidth]{figure/weightdecay_lambda_01.png}} + \subfloat[Large $\lambda$]{\includegraphics[width=0.4\textwidth]{figure/weightdecay_lambda_02.png}}\\ \end{figure} \end{vbframe}