Commit: updates in advriskmin

ludwigbothmann committed Sep 23, 2024
1 parent 94382b6 commit 85fa599

Showing 4 changed files with 193 additions and 76 deletions.
3 changes: 3 additions & 0 deletions slides/advriskmin/chapter-order.tex
@@ -47,6 +47,9 @@ \subsection{Proper Scoring Rules}
\subsection{Brier Score}
\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-classification-brier.pdf}

\subsection{Tree Splitting and Loss Functions}
\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-tree-splitting.pdf}

\subsection{Advanced Classification Losses}
\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-classification-furtherlosses.pdf}

106 changes: 31 additions & 75 deletions slides/advriskmin/slides-advriskmin-classification-bernoulli.tex
@@ -28,7 +28,7 @@
\item Know the Bernoulli loss and related losses (log-loss, logistic loss, Binomial loss)
\item Derive the risk minimizer
\item Derive the optimal constant model
\item Understand the connection between log-loss and entropy splitting
%\item Understand the connection between log-loss and entropy splitting
}

\begin{vbframe}{Bernoulli Loss}
@@ -39,6 +39,15 @@
L(y, \fx) &=& - y \cdot \fx + \log(1 + \exp(\fx)) \quad \text{for } y \in \setzo
\end{eqnarray*}

%Note that the expression $L(y, \fx) = \log(1 + \exp(-y \cdot \fx))$ for $y \in \setmp$ can be derived from the typical log-loss for $y \in \setzo$ %:
%\[
%L(y, \fx) = - y \cdot \fx + \log(1 + \exp(\fx))
%\]
%by recognizing that the labels can be transformed from $y \in \setzo$ to $y' \in \setmp$ using the mapping $y' = 2y - 1$. Substituting $y'$ into the typical formulation gives:
%\[
%L(y', \fx) = \log(1 + \exp(-y' \cdot \fx))
%\]

\begin{itemize}
% \item Two equivalent formulations: labels $y \in \setmp$ or $y \in \setzo$
\item Two equivalent formulations for different label encodings
@@ -107,7 +116,7 @@
\riskf &=& \E_x \left[L(1, \pix) \cdot \eta(\xv) + L(0, \pix) \cdot (1 - \eta(\xv)) \right],
\end{eqnarray*}

with $\eta(\xv) = \P(y = 1 ~|~ \xv)$ (see chapter on the 0-1-loss for more details).
with $\eta(\xv) = \P(y = 1 ~|~ \xv)$ (see section on the 0-1-loss for more details).

For a fixed $\xv$ we compute the point-wise optimal value $c$ by setting the derivative to $0$:

@@ -165,112 +174,59 @@
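As a quick numerical side note (a sketch in Python, not part of the slide source; $\eta$ is an arbitrary value): the point-wise expected loss $\eta \cdot L(1, \pix) + (1 - \eta) \cdot L(0, \pix)$ under the Bernoulli loss on probabilities is minimized at $\pix = \eta(\xv)$, which a grid search reproduces.

import numpy as np

# Sketch: point-wise expected Bernoulli (log) loss on probabilities,
# eta * L(1, pi) + (1 - eta) * L(0, pi), for an arbitrary eta.
eta = 0.7
pi = np.linspace(1e-4, 1 - 1e-4, 10001)
expected_loss = eta * (-np.log(pi)) + (1 - eta) * (-np.log(1 - pi))

print(pi[np.argmin(expected_loss)])  # approx. 0.7, i.e. the minimizer is pi = eta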

\begin{vbframe}{Bernoulli: Optimal Constant Model}

The optimal constant probability model $\pix = \theta$ w.r.t. the Bernoulli loss for labels from $\Yspace = \setzo$ is:
{\small The optimal constant probability model $\pix = \theta$ w.r.t. the Bernoulli loss for labels from $\Yspace = \setzo$ is:

\begin{eqnarray*}
\thetah = \argmin_{\theta} \risket = \frac{1}{n} \sumin \yi
\end{eqnarray*}

Again, this is the fraction of class-1 observations in the observed data.
We can simply prove this again by setting the derivative of the risk to 0 and solving for $\theta$.

\framebreak

The optimal constant score model $\fx = \theta$ w.r.t. the Bernoulli loss labels from $\Yspace = \setmp$ or $\Yspace = \setzo$ is:
We can simply prove this again by setting the derivative of the risk to 0 and solving for $\theta$. The optimal constant score model $\fx = \theta$ w.r.t. the Bernoulli loss for labels from $\Yspace = \setmp$ or $\Yspace = \setzo$ is:

\begin{eqnarray*}
\thetah = \argmin_{\theta} \risket = \log \frac{\np}{\nn} = \log \frac{\np / n}{\nn /n}
\end{eqnarray*}

where $\nn$ and $\np$ are the numbers of negative and positive observations, respectively.

\lz

This again shows a tight (and unsurprising) connection of this loss to log-odds.

\lz

Proving this is also a (quite simple) exercise.
}

\end{vbframe}
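Both constant-model results can be verified numerically; a minimal sketch in Python (not part of the slide source, with an arbitrary toy label vector):

import numpy as np

# Arbitrary toy labels y in {0, 1}.
y = np.array([1, 0, 1, 1, 0, 1, 0, 1])

# Empirical risk of a constant probability model pi(x) = theta
# under the Bernoulli loss for y in {0, 1}.
def risk_prob(theta):
    return np.mean(-y * np.log(theta) - (1 - y) * np.log(1 - theta))

# Empirical risk of a constant score model f(x) = theta.
def risk_score(theta):
    return np.mean(-y * theta + np.log(1 + np.exp(theta)))

prob_grid = np.linspace(1e-3, 1 - 1e-3, 9999)
score_grid = np.linspace(-5, 5, 100001)

theta_prob = prob_grid[np.argmin([risk_prob(t) for t in prob_grid])]
theta_score = score_grid[np.argmin([risk_score(t) for t in score_grid])]

n_pos, n_neg = y.sum(), len(y) - y.sum()
print(theta_prob, y.mean())                 # both approx. 5/8, the class-1 fraction
print(theta_score, np.log(n_pos / n_neg))   # both approx. log(5/3), the log-odds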

\begin{vbframe}{Bernoulli-Loss: Naming Convention}

We have seen three loss functions that are closely related. In the literature, there are different names for the losses:

\begin{eqnarray*}
L(y, \fx) &=& \log(1+\exp(-y\fx)) \quad \text{for } y \in \setmp \\
L(y, \fx) &=& - y \cdot \fx + \log(1 + \exp(\fx)) \quad \text{for } y \in \setzo
\end{eqnarray*}

are referred to as Bernoulli, Binomial or logistic loss.

$$
L(y, \pix) = - y \log \left(\pix\right) - (1 - y) \log \left(1 - \pix\right) \quad \text{for } y \in \setzo
$$

is referred to as cross-entropy or log-loss.

\lz

We usually refer to all of them as \textbf{Bernoulli loss}, and instead make clear whether they are defined on labels $y \in \setzo$ or $y \in \setmp$ and on scores $\fx$ or probabilities $\pix$.

\end{vbframe}


%\begin{eqnarray*}
% L(y, \fx) &=& \log(1+\exp(-y\fx)) \quad \text{for } y \in \setmp \\
% L(y, \fx) &=& - y \cdot \fx + \log(1 + \exp(\fx)) \quad \text{for } y \in \setzo
%\end{eqnarray*}

\begin{vbframe}{Bernoulli loss min = Entropy splitting}
\begin{eqnarray*} L(y, \fx) &=& \log(1+\exp(-y\fx)) \quad \text{for } y \in \setmp \\
\ L(y, \fx) &=& - y \cdot \fx + \log(1 + \exp(\fx)) \quad \text{for } y \in \setzo \\
L(y, \pix) &=& - y \log \left(\pix\right) - (1 - y) \log \left(1 - \pix\right) \quad \text{for } y \in \setzo \\
L(y, \pix) &=& - \frac{1 + y}{2} \log\left(\pix\right) - \frac{1 - y}{2} \log\left(1 - \pix\right) \quad \text{for } y \in \setmp \end{eqnarray*}

When fitting a tree, we perform risk minimization within each node $\Np$ and predict the optimal constant. Another approach, common in the literature, is to minimize the average node impurity $\text{Imp}(\Np)$.

\vspace*{0.2cm}

\textbf{Claim:} Entropy splitting $\text{Imp}(\Np) = -\sum_{k = 1}^g \pikN \log \pikN$ is equivalent to minimizing the risk measured by the Bernoulli loss.

\begin{footnotesize}
Note that $\pikN := \frac{1}{n_{\Np}} \sum\limits_{(\xv,y) \in \Np} [y = k]$.
\end{footnotesize}
\lz

\vspace*{0.2cm}
are all referred to as Bernoulli, Binomial, logistic, log-loss, or cross-entropy (showing their equivalence is a simple exercise).

\textbf{Proof: } We show that the risk related to a subset of observations $\Np \subseteq \D$ fulfills
% $$
% L(y, \pix) = - y \log \left(\pix\right) - (1 - y) \log \left(1 - \pix\right) \quad \text{for } y \in \setzo
% $$

\vspace*{- 0.2cm}
%is referred to as cross-entropy or log-loss.

%\lz

$$
\risk(\Np) = n_\Np \text{Imp}(\Np),
$$

where
%$I$ is the entropy criterion $\text{Imp}(\Np)$ and
$\risk(\Np)$ is calculated w.r.t. the (multiclass) Bernoulli loss
%We usually refer to all of them as \textbf{Bernoulli loss}, and rather make clear whether they are defined on labels $y \in \setzo$ or $y \in \setmp$ and on scores $\fx$ or probabilities $\pix$.

$$
L(y, \pix) = -\sum_{k = 1}^g [y = k] \log \left(\pi_k(\xv)\right).
$$

\framebreak
\begin{footnotesize}
\begin{eqnarray*}
\risk(\Np) &=& \sum_{\xy \in \Np} \left(- \sum_{k = 1}^g [y = k] \log \pi_k(\xv) \right) \\
&\overset{(*)}{=}& -\sum_{k = 1}^g \sum_{\xy \in \Np} [y = k]\log \pikN \\
&=& -\sum_{k = 1}^g \log \pikN \underbrace{\sum_{\xy \in \Np} [y = k]}_{n_{\Np}\cdot \pikN } \\
&=& -n_{\Np} \sum_{k = 1}^g \pikN \log \pikN = n_\Np \text{Imp}(\Np),
\end{eqnarray*}

where in $^{(*)}$ the optimal constant per node $\pikN = \frac{1}{n_{\Np}} \sum\limits_{(\xv,y) \in \Np} [y = k]$ was plugged in.
\end{footnotesize}
% \framebreak

% \textbf{Conclusion}:
% \begin{itemize}
% \item Stumps/trees with entropy splitting use the same loss function as logistic regression (binary) / softmax regression (multiclass).
% \item While logistic regression is based on the hypothesis space of \textbf{linear functions}, stumps/trees use \textbf{step functions} as hypothesis spaces.
\end{vbframe}

% \end{itemize}

\end{vbframe}
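The equivalence of the label/score/probability formulations (the "simple exercise" mentioned above) can also be checked numerically; a small Python sketch with an arbitrary score value (not part of the slide source):

import numpy as np

f = 1.7                                    # arbitrary score
sigmoid = lambda z: 1 / (1 + np.exp(-z))   # maps scores to probabilities

for y01 in (0, 1):
    ypm = 2 * y01 - 1                      # relabel {0, 1} -> {-1, +1}
    loss_pm = np.log(1 + np.exp(-ypm * f))              # margin form, y in {-1, +1}
    loss_01 = -y01 * f + np.log(1 + np.exp(f))          # score form, y in {0, 1}
    loss_ce = (-y01 * np.log(sigmoid(f))
               - (1 - y01) * np.log(1 - sigmoid(f)))    # cross-entropy on pi = sigmoid(f)
    print(np.allclose([loss_pm, loss_01], loss_ce))     # True for both labels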



4 changes: 3 additions & 1 deletion slides/advriskmin/slides-advriskmin-classification-brier.tex
@@ -77,7 +77,9 @@
L\left(y, \pix\right) &=& (\pix - y)^2
\end{eqnarray*}

\vspace{0.2cm}
As the Brier score is a proper scoring rule, it can be used for calibration. Note that it is not convex on probabilities anymore.

%\vspace{0.2cm}
\begin{center}
\includegraphics[width = 0.8\textwidth]{figure/plot_brier.png}
\end{center}
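A minimal numerical sketch of the proper-scoring-rule property (not part of the slide source; $\eta$ is arbitrary): the expected Brier loss $\eta (\pix - 1)^2 + (1 - \eta) \pix^2$ is minimized exactly at $\pix = \eta$.

import numpy as np

# Expected Brier loss for an arbitrary true probability eta,
# as a function of the predicted probability pi.
eta = 0.3
pi = np.linspace(0, 1, 10001)
expected_loss = eta * (pi - 1) ** 2 + (1 - eta) * pi ** 2

print(pi[np.argmin(expected_loss)])   # approx. 0.3: predicting the true probability is optimal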
156 changes: 156 additions & 0 deletions slides/advriskmin/slides-advriskmin-tree-splitting.tex
@@ -0,0 +1,156 @@
\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer}
%<<setup-child, include = FALSE>>=
%library(knitr)
%library(qrmix)
%library(mlr)
%library(quantreg)
%library(reshape2)
%set_parent("../style/preamble.Rnw")
%@

\input{../../style/preamble}
\input{../../latex-math/basic-math}
\input{../../latex-math/basic-ml}
\input{../../latex-math/ml-trees} % For the comparison of Brier and Gini index

\title{Introduction to Machine Learning}

\begin{document}

\titlemeta{% Chunk title (example: CART, Forests, Boosting, ...), can be empty
Advanced Risk Minimization
}{% Lecture title
Loss functions and tree splitting
}{% Relative path to title page image: Can be empty but must not start with slides/
figure/plot_brier.png
}{
\item Understand how tree splitting is 'nothing new', but relates to known loss functions
\item Know that Brier score minimization corresponds to Gini splitting
\item Know that Bernoulli loss minimization corresponds to entropy splitting
}

\begin{vbframe}{Bernoulli loss min = Entropy splitting}

When fitting a tree, we perform risk minimization within each node $\Np$ and predict the optimal constant. Another approach, common in the literature, is to minimize the average node impurity $\text{Imp}(\Np)$.

\vspace*{0.2cm}

\textbf{Claim:} Entropy splitting $\text{Imp}(\Np) = -\sum_{k = 1}^g \pikN \log \pikN$ is equivalent to minimizing the risk measured by the Bernoulli loss.

\begin{footnotesize}
Note that $\pikN := \frac{1}{n_{\Np}} \sum\limits_{(\xv,y) \in \Np} [y = k]$.
\end{footnotesize}

\vspace*{0.2cm}

\textbf{Proof: } We show that the risk related to a subset of observations $\Np \subseteq \D$ fulfills

\vspace*{- 0.2cm}


$$
\risk(\Np) = n_\Np \text{Imp}(\Np),
$$

where
%$I$ is the entropy criterion $\text{Imp}(\Np)$ and
$\risk(\Np)$ is calculated w.r.t. the (multiclass) Bernoulli loss

$$
L(y, \pix) = -\sum_{k = 1}^g [y = k] \log \left(\pi_k(\xv)\right).
$$

\framebreak
\begin{footnotesize}
\begin{eqnarray*}
\risk(\Np) &=& \sum_{\xy \in \Np} \left(- \sum_{k = 1}^g [y = k] \log \pi_k(\xv) \right) \\
&\overset{(*)}{=}& -\sum_{k = 1}^g \sum_{\xy \in \Np} [y = k]\log \pikN \\
&=& -\sum_{k = 1}^g \log \pikN \underbrace{\sum_{\xy \in \Np} [y = k]}_{n_{\Np}\cdot \pikN } \\
&=& -n_{\Np} \sum_{k = 1}^g \pikN \log \pikN = n_\Np \text{Imp}(\Np),
\end{eqnarray*}

where in $^{(*)}$ the optimal constant per node $\pikN = \frac{1}{n_{\Np}} \sum\limits_{(\xv,y) \in \Np} [y = k]$ was plugged in.
\end{footnotesize}
% \framebreak

% \textbf{Conclusion}:
% \begin{itemize}
% \item Stumps/trees with entropy splitting use the same loss function as logistic regression (binary) / softmax regression (multiclass).
% \item While logistic regression is based on the hypothesis space of \textbf{linear functions}, stumps/trees use \textbf{step functions} as hypothesis spaces.

% \end{itemize}

\end{vbframe}
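The identity $\risk(\Np) = n_\Np \text{Imp}(\Np)$ can also be checked numerically; a small Python sketch with an arbitrary toy node (not part of the slide source):

import numpy as np

# Arbitrary toy node with g = 3 classes.
y_node = np.array([0, 0, 1, 2, 1, 0, 2, 2, 2])
g, n_N = 3, len(y_node)

# Node-optimal constant prediction: class proportions pi_k(N).
pi = np.array([(y_node == k).mean() for k in range(g)])

# Risk w.r.t. the multiclass Bernoulli loss with this constant prediction.
risk = np.sum(-np.log(pi[y_node]))

# Entropy impurity of the node.
imp = -np.sum(pi * np.log(pi))

print(np.isclose(risk, n_N * imp))   # True: R(N) = n_N * Imp(N)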




\begin{vbframe}{Brier score minimization = Gini splitting}

When fitting a tree, we perform risk minimization within each node $\Np$ and predict the optimal constant. Another approach, common in the literature, is to minimize the average node impurity $\text{Imp}(\Np)$.

\vspace*{0.2cm}

\textbf{Claim:} Gini splitting $\text{Imp}(\Np) = \sum_{k=1}^g \pikN \left(1-\pikN \right)$ is equivalent to minimizing the risk measured by the Brier score.

\begin{footnotesize}
Note that $\pikN := \frac{1}{n_{\Np}} \sum\limits_{(\xv,y) \in \Np} [y = k]$.
\end{footnotesize}

\vspace*{0.2cm}

\begin{footnotesize}

\textbf{Proof: } We show that the risk related to a subset of observations $\Np \subseteq \D$ fulfills


$$
\risk(\Np) = n_\Np \text{Imp}(\Np),
$$

where $\text{Imp}$ is the Gini impurity and $\risk(\Np)$ is calculated w.r.t. the (multiclass) Brier score


$$
L(y, \pix) = \sum_{k = 1}^g \left([y = k] - \pi_k(\xv)\right)^2.
$$

\framebreak

\vspace*{-0.5cm}
\begin{eqnarray*}
\risk(\Np) &=& \sum_{\xy \in \Np} \sum_{k = 1}^g \left([y = k] - \pi_k(\xv)\right)^2
= \sum_{k = 1}^g \sum_{\xy \in \Np} \left([y = k] - \frac{n_{\Np,k}}{n_{\Np }}\right)^2,
\end{eqnarray*}

by plugging in the optimal constant prediction w.r.t. the Brier score ($n_{\Np,k}$ is defined as the number of class $k$ observations in node $\Np$):
$$\hat \pi_k(\xv)= \pikN = \frac{1}{n_{\Np}} \sum\limits_{(\xv,y) \in \Np} [y = k] = \frac{n_{\Np,k}}{n_{\Np }}. $$

We split the inner sum and further simplify the expression:

\begin{eqnarray*}
&=& \sum_{k = 1}^{g} \left(\sum_{\xy \in \Np: ~ y = k} \left(1 - \frac{n_{\Np,k}}{n_{\Np }}\right)^2 + \sum_{\xy \in \Np: ~ y \ne k} \left(0 - \frac{n_{\Np,k}}{n_{\Np }}\right)^2\right) \\
&=& \sum_{k = 1}^g n_{\Np,k}\left(1 - \frac{n_{\Np,k}}{n_{\Np }}\right)^2 + (n_{\Np } - n_{\Np,k})\left(\frac{n_{\Np,k}}{n_{\Np }}\right)^2,
\end{eqnarray*}

since for $n_{\Np,k}$ observations the condition $y = k$ is met, and for the remaining $(n_\Np - n_{\Np,k})$ observations it is not.


We further simplify the expression to

% \begin{footnotesize}
\begin{eqnarray*}
\risk(\Np) &=& \sum_{k = 1}^g n_{\Np,k}\left(\frac{n_{\Np } - n_{\Np,k}}{n_{\Np }}\right)^2 + (n_{\Np } - n_{\Np,k})\left(\frac{n_{\Np,k}}{n_{\Np }}\right)^2 \\
&=& \sum_{k = 1}^g \frac{n_{\Np,k}}{n_{\Np }} \frac{n_{\Np } - n_{\Np,k}}{n_{\Np }} \left(n_{\Np } - n_{\Np,k } + n_{\Np,k}\right) \\
&=& n_{\Np } \sum_{k = 1}^g \pikN \cdot \left(1 - \pikN \right) = n_\Np \text{Imp}(\Np).
\end{eqnarray*}
% \end{footnotesize}

\end{footnotesize}

\end{vbframe}
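The analogous numerical sanity check for the Gini/Brier case (again a Python sketch with an arbitrary toy node, not part of the slide source):

import numpy as np

# Arbitrary toy node with g = 3 classes.
y_node = np.array([0, 0, 1, 2, 1, 0, 2, 2, 2])
g, n_N = 3, len(y_node)
pi = np.array([(y_node == k).mean() for k in range(g)])

# Risk w.r.t. the multiclass Brier score with the constant prediction pi.
onehot = np.eye(g)[y_node]               # [y = k] indicators, one row per observation
risk = np.sum((onehot - pi) ** 2)

# Gini impurity of the node.
imp = np.sum(pi * (1 - pi))

print(np.isclose(risk, n_N * imp))       # True: R(N) = n_N * Imp(N)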


\endlecture

\end{document}
