diff --git a/slides/advriskmin/figure_man/cart_tree_i2ml.png b/slides/advriskmin/figure_man/cart_tree_i2ml.png
new file mode 100644
index 00000000..e2dd719e
Binary files /dev/null and b/slides/advriskmin/figure_man/cart_tree_i2ml.png differ
diff --git a/slides/advriskmin/references.bib b/slides/advriskmin/references.bib
index 0d6ada62..6ad18b07 100644
--- a/slides/advriskmin/references.bib
+++ b/slides/advriskmin/references.bib
@@ -1,3 +1,12 @@
+@misc{BISCHL22I2ML,
+  author = {Bischl, Bernd and Bothmann, Ludwig and Scheipl, Fabian and Pielok, Tobias and Wimmer, Lisa and Li, Yawei and Kolb, Chris and Schalk, Daniel and Seibold, Heidi and Molnar, Christoph and Richter, Jakob},
+  title = {{Introduction to Machine Learning (I2ML)}},
+  howpublished = "\url{https://slds-lmu.github.io/i2ml/}",
+  year = {2022},
+  url = {https://slds-lmu.github.io/i2ml/chapters/06_cart/},
+  note = "[Online; accessed 2024-10-08]"
+}
+
 @article{BROWN2024BIAS,
   title={Bias/Variance is not the same as Approximation/Estimation},
   author={Brown, Gavin and Ali, Riccardo},
diff --git a/slides/advriskmin/slides-advriskmin-classification-brier.tex b/slides/advriskmin/slides-advriskmin-classification-brier.tex
index e88556ec..9f1dfdfd 100644
--- a/slides/advriskmin/slides-advriskmin-classification-brier.tex
+++ b/slides/advriskmin/slides-advriskmin-classification-brier.tex
@@ -95,7 +95,7 @@
 \pixbayes = \eta(\xv) = \P(y~|~\xv = \xv),
 \end{eqnarray*}
 
-which means that the Brier score will reach its minimum if the prediction equals the \enquote{true} probability of the outcome.
+which means that the Brier score attains its minimum if the prediction equals the \enquote{true} probability of the outcome.
 
 \lz
 
@@ -115,7 +115,7 @@
 
 \begin{eqnarray*}
 && \argmin_c \quad L(1, c) \eta(\xv) + L(0, c) (1 - \eta(\xv)) \\
-&=& \argmin_c \quad (c - 1)^2 \eta(\xv) + c^2 (1 - \eta(\xv))\\
+&=& \argmin_c \quad (c - 1)^2 \eta(\xv) + c^2 (1 - \eta(\xv)) \quad |{+\eta(\xv)^2-\eta(\xv)^2}\\
 &=& \argmin_c \quad (c^2 -2c\eta(\xv) + \eta(\xv)^2)- \eta(\xv)^2 + \eta(\xv) \\
 &=& \argmin_c \quad (c - \eta(\xv))^2.
 \end{eqnarray*}
diff --git a/slides/advriskmin/slides-advriskmin-classification-furtherlosses.tex b/slides/advriskmin/slides-advriskmin-classification-furtherlosses.tex
index 96ec64fa..8673e8f7 100644
--- a/slides/advriskmin/slides-advriskmin-classification-furtherlosses.tex
+++ b/slides/advriskmin/slides-advriskmin-classification-furtherlosses.tex
@@ -98,12 +98,12 @@
 
 \begin{vbframe}{Classification Losses: Exponential Loss}
 
-Another possible choice for a (binary) loss function that is a smooth
+Another smooth
 approximation to the 0-1-loss is the \textbf{exponential loss}:
 \begin{itemize}
 \item $\Lyf = \exp(-yf)$, used in AdaBoost.
 \item Convex, differentiable (thus easier to optimize than 0-1-loss).
-\item The loss increases exponentially for wrong predictions with high confidence; if the prediction is right with a small confidence only, there, loss is still positive.
+\item The loss increases exponentially for wrong predictions made with high confidence; if the prediction is correct but only with low confidence, the loss is still positive.
 \item No closed-form analytic solution to (empirical) risk minimization.
 \end{itemize}
 
@@ -121,10 +121,10 @@
 \item Let $y \in \setmp$ with $\nn$ negative and $\np$ positive samples. %$y_i, i = 1, \ldots, n_{-1} + n_1$.
 \item The AUC can then be defined as
 $$AUC = \frac{1}{\np} \frac{1}{\nn} \sum_{i: \yi = 1} \sum_{j: \yi[j] = -1} [f^{(i)} > f^{(j)}]$$
-\item This is not differentiable w.r.t $f$ due to $[f^{(i)} > f^{(j)}]$.
-\item But the indicator function can be approximated by the distribution function of the triangular distribution on $[-1, 1]$ with mean $0$.
-\item However, direct optimization of the AUC is numerically more difficult, and might not work as well as using
-  a common loss and tuning for AUC in practice.
+\item This is not differentiable w.r.t. $f$ due to the indicator $[f^{(i)} > f^{(j)}]$.
+\item The indicator function can be approximated by the distribution function of the triangular distribution on $[-1, 1]$ with mean $0$.
+\item However, direct optimization of the AUC is numerically difficult and might not work as well as using
+a common loss and tuning for AUC in practice.
 \end{itemize}
 
 \end{vbframe}
diff --git a/slides/advriskmin/slides-advriskmin-tree-splitting.tex b/slides/advriskmin/slides-advriskmin-tree-splitting.tex
index 736d24ab..4cb7c5c0 100644
--- a/slides/advriskmin/slides-advriskmin-tree-splitting.tex
+++ b/slides/advriskmin/slides-advriskmin-tree-splitting.tex
@@ -22,20 +22,22 @@
 }{% Lecture title
   Loss functions and tree splitting
 }{% Relative path to title page image: Can be empty but must not start with slides/
-  figure/plot_brier.png
+  figure_man/cart_tree_i2ml.png
 }{
-  \item Know how tree splitting is 'nothing new' related to loss functions
+  \item Know how tree splitting is 'nothing new' and how it relates to loss functions
   \item Brier score minimization corresponds to gini splitting
   \item Bernoulli loss minimization corresponds to entropy splitting
 }
 
 \begin{vbframe}{Bernoulli loss min = Entropy splitting}
 
-When fitting a tree we minimize the risk within each node $\Np$ by risk minimization and predict the optimal constant. Another approach that is common in literature is to minimize the average node impurity $\text{Imp}(\Np)$.
+For an introduction to trees and splitting criteria, we refer to our \textbf{I2ML} lecture (Chapter 6, \citelink{BISCHL22I2ML}).\\
+\vspace{0.2cm}
+When fitting a tree, we minimize the risk within each node $\Np$ and predict the optimal constant. Another common approach is to minimize the average node impurity $\text{Imp}(\Np)$.
 
 \vspace*{0.2cm}
 
-\textbf{Claim:} Entropy splitting $\text{Imp}(\Np) = -\sum_{k = 1}^g \pikN \log \pikN$ is equivalent to minimize risk measured by the Bernoulli loss.
+\textbf{Claim:} Entropy splitting $\text{Imp}(\Np) = -\textstyle\sum_{k = 1}^g \pikN \log \pikN$ is equivalent to minimizing the risk measured by the Bernoulli loss.
 
 \begin{footnotesize}
 Note that $\pikN := \frac{1}{n_{\Np}} \sum\limits_{(\xv,y) \in \Np} [y = k]$.
@@ -43,18 +45,12 @@
 
 \vspace*{0.2cm}
 
-\textbf{Proof: } To prove this we show that the risk related to a subset of observations $\Np \subseteq \D$ fulfills
-
-\vspace*{- 0.2cm}
-
-
-$$
- \risk(\Np) = n_\Np \text{Imp}(\Np),
-$$
+\textbf{Proof: } To prove this, we show that the risk related to a subset of observations $\Np \subseteq \D$ fulfills $\risk(\Np) = n_\Np \text{Imp}(\Np),
+$
 
- where
- %$I$ is the entropy criterion $\text{Imp}(\Np)$ and
- $\risk(\Np)$ is calculated w.r.t. the (multiclass) Bernoulli loss
+where
+%$I$ is the entropy criterion $\text{Imp}(\Np)$ and
+$\risk(\Np)$ is calculated w.r.t. the (multiclass) Bernoulli loss
 
 $$
 L(y, \pix) = -\sum_{k = 1}^g [y = k] \log \left(\pi_k(\xv)\right).
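
Aside: the claim in the tree-splitting slides, that the node risk under the (multiclass) Bernoulli loss with the constant prediction pi_k^(N) equals n_N times the entropy impurity, is easy to check numerically. The following Python sketch is illustrative only; the node labels and the use of NumPy are assumptions for the example, not part of the slides or the patch.

import numpy as np

# Illustrative check: for a node N with class proportions pi_k, the summed
# multiclass Bernoulli loss of the constant prediction pi(x) = (pi_1, ..., pi_g)
# equals n_N times the entropy impurity Imp(N) = -sum_k pi_k * log(pi_k).
y = np.array([0, 0, 0, 1, 1, 2])        # made-up labels of the observations in node N
n_node = len(y)
classes, counts = np.unique(y, return_counts=True)
pi = counts / n_node                     # pi_k^(N): relative class frequencies in the node

# node risk R(N): sum over observations of the Bernoulli loss -log(pi_k) of the observed class
risk = -np.sum(np.log(pi[np.searchsorted(classes, y)]))

# entropy impurity of the node
impurity = -np.sum(pi * np.log(pi))

print(risk, n_node * impurity)           # both approx. 6.068, i.e. R(N) = n_N * Imp(N)

Replacing the log loss by the Brier loss and the entropy by the Gini impurity gives the analogous check for the slide's other claim, that Brier score minimization corresponds to Gini splitting.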