advriskmin classification updates
ludwigbothmann committed Oct 8, 2024
1 parent dab1919 commit e5f69b3
Showing 5 changed files with 28 additions and 23 deletions.
Binary file added slides/advriskmin/figure_man/cart_tree_i2ml.png
9 changes: 9 additions & 0 deletions slides/advriskmin/references.bib
@@ -1,3 +1,12 @@
@misc{BISCHL22I2ML,
  author = {Bischl, Bernd and Bothmann, Ludwig and Scheipl, Fabian and Pielok, Tobias and Wimmer, Lisa and Li, Yawei and Kolb, Chris and Schalk, Daniel and Seibold, Heidi and Molnar, Christoph and Richter, Jakob},
  title = {{Introduction to Machine Learning (I2ML)}},
  howpublished = {\url{https://slds-lmu.github.io/i2ml/}},
  year = {2022},
  url = {https://slds-lmu.github.io/i2ml/chapters/06_cart/},
  note = {[Online; accessed 2024-10-08]}
}

@article{BROWN2024BIAS,
title={Bias/Variance is not the same as Approximation/Estimation},
author={Brown, Gavin and Ali, Riccardo},
4 changes: 2 additions & 2 deletions slides/advriskmin/slides-advriskmin-classification-brier.tex
@@ -95,7 +95,7 @@
\pixbayes = \eta(\xv) = \P(y~|~\xv = \xv),
\end{eqnarray*}

which means that the Brier score will reach its minimum if the prediction equals the \enquote{true} probability of the outcome.
which means that the Brier score attains its minimum if the prediction equals the \enquote{true} probability of the outcome.

\lz

@@ -115,7 +115,7 @@

\begin{eqnarray*}
&& \argmin_c \quad L(1, c) \eta(\xv) + L(0, c) (1 - \eta(\xv)) \\
&=& \argmin_c \quad (c - 1)^2 \eta(\xv) + c^2 (1 - \eta(\xv))\\
&=& \argmin_c \quad (c - 1)^2 \eta(\xv) + c^2 (1 - \eta(\xv)) \quad |{+\eta(\xv)^2-\eta(\xv)^2}\\
&=& \argmin_c \quad (c^2 -2c\eta(\xv) + \eta(\xv)^2)- \eta(\xv)^2 + \eta(\xv) \\
&=& \argmin_c \quad (c - \eta(\xv))^2.
\end{eqnarray*}
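To double-check the last step numerically, here is a minimal sketch (plain NumPy; the fixed value $\eta(\xv) = 0.3$ and all variable names are assumptions for illustration) that evaluates the pointwise risk $L(1, c)\,\eta(\xv) + L(0, c)\,(1 - \eta(\xv))$ over a grid of constants $c$:

```python
import numpy as np

def pointwise_brier_risk(c, eta):
    # L(1, c) * eta + L(0, c) * (1 - eta) with the Brier loss L(y, c) = (c - y)^2
    return (c - 1.0) ** 2 * eta + c ** 2 * (1.0 - eta)

eta = 0.3                               # assumed "true" probability eta(x)
grid = np.linspace(0.0, 1.0, 1001)      # candidate constant predictions c
risks = pointwise_brier_risk(grid, eta)

print(grid[np.argmin(risks)])           # ~0.3: the minimizer coincides with eta(x)
```

The grid minimizer coincides with $\eta(\xv)$, matching the closed-form result $\argmin_c (c - \eta(\xv))^2$.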
@@ -98,12 +98,12 @@

\begin{vbframe}{Classification Losses: Exponential Loss}

Another possible choice for a (binary) loss function that is a smooth
Another smooth
approximation to the 0-1-loss is the \textbf{exponential loss}:
\begin{itemize}
\item $\Lyf = \exp(-yf)$, used in AdaBoost.
\item Convex, differentiable (thus easier to optimize than 0-1-loss).
\item The loss increases exponentially for wrong predictions with high confidence; if the prediction is right with a small confidence only, there, loss is still positive.
\item Loss increases exponentially for confidently wrong predictions; if the prediction is correct but made with low confidence, the loss is still positive.
\item No closed-form analytic solution to (empirical) risk minimization.
\end{itemize}
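To illustrate the asymmetry described in the bullets above, a short sketch (plain NumPy; the label and scores are chosen arbitrarily) tabulates $\exp(-yf)$ for a few margins:

```python
import numpy as np

def exp_loss(y, f):
    # exponential loss exp(-y * f) for labels y in {-1, +1} and score f
    return np.exp(-y * f)

y = 1  # assume the true label is +1
for f in [-3.0, -1.0, -0.1, 0.1, 1.0, 3.0]:
    print(f"f = {f:+.1f}  loss = {exp_loss(y, f):7.3f}")
# confidently wrong (f = -3.0):          loss ~ 20.09, grows exponentially
# correct with low confidence (f = 0.1): loss ~ 0.905, still positive
```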

@@ -121,10 +121,10 @@
\item Let $y \in \setmp$ with $\nn$ negative and $\np$ positive samples. %$y_i, i = 1, \ldots, n_{-1} + n_1$.
\item The AUC can then be defined as
$$AUC = \frac{1}{\np} \frac{1}{\nn} \sum_{i: \yi = 1} \sum_{j: \yi[j] = -1} [f^{(i)} > f^{(j)}]$$
\item This is not differentiable w.r.t $f$ due to $[f^{(i)} > f^{(j)}]$.
\item But the indicator function can be approximated by the distribution function of the triangular distribution on $[-1, 1]$ with mean $0$.
\item However, direct optimization of the AUC is numerically more difficult, and might not work as well as using
a common loss and tuning for AUC in practice.
\item This is not differentiable w.r.t.\ $f$ due to the indicator $[f^{(i)} > f^{(j)}]$.
\item The indicator function can be approximated by the distribution function of the triangular distribution on $[-1, 1]$ with mean $0$.
\item However, direct optimization of the AUC is numerically difficult and might not work as well as using
a common loss and tuning for AUC in practice.

\end{itemize}
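One way to read the approximation idea above (a sketch under assumed score distributions, plain NumPy): replace the indicator $[f^{(i)} > f^{(j)}]$ by the CDF of the triangular distribution on $[-1, 1]$ with mean $0$, applied to the score difference, which yields a differentiable surrogate of the AUC:

```python
import numpy as np

def tri_cdf(t):
    # CDF of the symmetric triangular distribution on [-1, 1] (mean 0);
    # a piecewise-quadratic, differentiable surrogate for the indicator [t > 0]
    t = np.clip(t, -1.0, 1.0)
    return np.where(t <= 0.0, (t + 1.0) ** 2 / 2.0, 1.0 - (1.0 - t) ** 2 / 2.0)

def auc_exact(f_pos, f_neg):
    # AUC = fraction of (positive, negative) pairs ranked correctly
    return np.mean(f_pos[:, None] > f_neg[None, :])

def auc_smooth(f_pos, f_neg):
    # same pairwise average, with the indicator replaced by the triangular CDF
    return np.mean(tri_cdf(f_pos[:, None] - f_neg[None, :]))

rng = np.random.default_rng(0)
f_pos = rng.normal(0.5, 0.5, size=200)   # hypothetical scores of positive samples
f_neg = rng.normal(-0.5, 0.5, size=300)  # hypothetical scores of negative samples
print(auc_exact(f_pos, f_neg), auc_smooth(f_pos, f_neg))  # close; only the surrogate is differentiable
```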
\end{vbframe}
26 changes: 11 additions & 15 deletions slides/advriskmin/slides-advriskmin-tree-splitting.tex
@@ -22,39 +22,35 @@
}{% Lecture title
Loss functions and tree splitting
}{% Relative path to title page image: Can be empty but must not start with slides/
figure/plot_brier.png
figure_man/cart_tree_i2ml.png
}{
\item Know how tree splitting is 'nothing new' related to loss functions
\item Know how tree splitting is `nothing new' and relates to loss functions
\item Brier score minimization corresponds to Gini splitting
\item Bernoulli loss minimization corresponds to entropy splitting
}

\begin{vbframe}{Bernoulli loss min = Entropy splitting}

When fitting a tree we minimize the risk within each node $\Np$ by risk minimization and predict the optimal constant. Another approach that is common in literature is to minimize the average node impurity $\text{Imp}(\Np)$.
For an introduction to trees and splitting criteria, we refer to our \textbf{I2ML} lecture (Chapter 6, \citelink{BISCHL22I2ML}).\\
\vspace{0.2cm}
When fitting a tree, we perform risk minimization within each node $\Np$ and predict the optimal constant. Another common approach is to minimize the average node impurity $\text{Imp}(\Np)$.

\vspace*{0.2cm}

\textbf{Claim:} Entropy splitting $\text{Imp}(\Np) = -\sum_{k = 1}^g \pikN \log \pikN$ is equivalent to minimize risk measured by the Bernoulli loss.
\textbf{Claim:} Entropy splitting with $\text{Imp}(\Np) = -\textstyle\sum_{k = 1}^g \pikN \log \pikN$ is equivalent to minimizing the risk measured by the Bernoulli loss.

\begin{footnotesize}
Note that $\pikN := \frac{1}{n_{\Np}} \sum\limits_{(\xv,y) \in \Np} [y = k]$.
\end{footnotesize}

\vspace*{0.2cm}

\textbf{Proof: } To prove this we show that the risk related to a subset of observations $\Np \subseteq \D$ fulfills

\vspace*{- 0.2cm}


$$
\risk(\Np) = n_\Np \text{Imp}(\Np),
$$
\textbf{Proof: } We show that the risk over a subset of observations $\Np \subseteq \D$ fulfills $\risk(\Np) = n_\Np \text{Imp}(\Np),$

where
%$I$ is the entropy criterion $\text{Imp}(\Np)$ and
$\risk(\Np)$ is calculated w.r.t. the (multiclass) Bernoulli loss

$$
L(y, \pix) = -\sum_{k = 1}^g [y = k] \log \left(\pi_k(\xv)\right).
$$
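To see the claimed identity $\risk(\Np) = n_\Np \text{Imp}(\Np)$ numerically before working through the proof, a minimal sketch (plain NumPy; the node is randomly generated with $g = 3$ classes, and all names are illustrative):

```python
import numpy as np

rng = np.random.default_rng(1)
y = rng.integers(0, 3, size=500)               # labels of the observations falling into node N
pi_hat = np.bincount(y, minlength=3) / y.size  # optimal constant prediction: class proportions pi_k^N

# risk of the node under the (multiclass) Bernoulli loss with constant prediction pi_hat
risk = -np.sum(np.log(pi_hat[y]))

# n_N times the entropy impurity (restricted to non-empty classes, where 0 * log 0 := 0)
p = pi_hat[pi_hat > 0]
impurity = -np.sum(p * np.log(p))

print(np.isclose(risk, y.size * impurity))     # True: R(N) = n_N * Imp(N)
```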
