diff --git a/slides-pdf/lecture_sl.pdf b/slides-pdf/lecture_sl.pdf new file mode 100644 index 00000000..d4cb9c92 Binary files /dev/null and b/slides-pdf/lecture_sl.pdf differ diff --git a/slides-pdf/slides-advriskmin-logreg-deepdive.pdf b/slides-pdf/slides-advriskmin-logreg-deepdive.pdf index 1c4f956a..104cfef6 100644 Binary files a/slides-pdf/slides-advriskmin-logreg-deepdive.pdf and b/slides-pdf/slides-advriskmin-logreg-deepdive.pdf differ diff --git a/slides-pdf/slides-mc-binary-reduction.pdf b/slides-pdf/slides-mc-binary-reduction.pdf index 945d20e1..fa05fac7 100644 Binary files a/slides-pdf/slides-mc-binary-reduction.pdf and b/slides-pdf/slides-mc-binary-reduction.pdf differ diff --git a/slides-pdf/slides-mc-codebooks.pdf b/slides-pdf/slides-mc-codebooks.pdf index 174b3c16..56a2273b 100644 Binary files a/slides-pdf/slides-mc-codebooks.pdf and b/slides-pdf/slides-mc-codebooks.pdf differ diff --git a/slides-pdf/slides-mc-softmax-regression.pdf b/slides-pdf/slides-mc-softmax-regression.pdf index 38603677..94871389 100644 Binary files a/slides-pdf/slides-mc-softmax-regression.pdf and b/slides-pdf/slides-mc-softmax-regression.pdf differ diff --git a/slides-pdf/slides-regu-enetlogreg.pdf b/slides-pdf/slides-regu-enetlogreg.pdf index 3471ca77..f30d2d92 100644 Binary files a/slides-pdf/slides-regu-enetlogreg.pdf and b/slides-pdf/slides-regu-enetlogreg.pdf differ diff --git a/slides-pdf/slides-regu-into.pdf b/slides-pdf/slides-regu-into.pdf new file mode 100644 index 00000000..b815808c Binary files /dev/null and b/slides-pdf/slides-regu-into.pdf differ diff --git a/slides-pdf/slides-regu-l1.pdf b/slides-pdf/slides-regu-l1.pdf index 71aaf72c..f767da60 100644 Binary files a/slides-pdf/slides-regu-l1.pdf and b/slides-pdf/slides-regu-l1.pdf differ diff --git a/slides-pdf/slides-regu-l1vsl2.pdf b/slides-pdf/slides-regu-l1vsl2.pdf index ac3fed51..095c4cd9 100644 Binary files a/slides-pdf/slides-regu-l1vsl2.pdf and b/slides-pdf/slides-regu-l1vsl2.pdf differ diff --git 
a/slides-pdf/slides-regu-l2.pdf b/slides-pdf/slides-regu-l2.pdf index bf1b50ff..eb0f2074 100644 Binary files a/slides-pdf/slides-regu-l2.pdf and b/slides-pdf/slides-regu-l2.pdf differ diff --git a/slides-pdf/slides-regu-nonlin.pdf b/slides-pdf/slides-regu-nonlin.pdf index f27908bc..d8add4b8 100644 Binary files a/slides-pdf/slides-regu-nonlin.pdf and b/slides-pdf/slides-regu-nonlin.pdf differ diff --git a/slides-pdf/slides-regu-others.pdf b/slides-pdf/slides-regu-others.pdf index 3c78a991..eee13bcd 100644 Binary files a/slides-pdf/slides-regu-others.pdf and b/slides-pdf/slides-regu-others.pdf differ diff --git a/slides-pdf/slides_sl.pdf b/slides-pdf/slides_sl_270123.pdf similarity index 100% rename from slides-pdf/slides_sl.pdf rename to slides-pdf/slides_sl_270123.pdf diff --git a/slides/advriskmin/chapter-order-slides-all.tex b/slides/advriskmin/chapter-order-slides-all.tex new file mode 100644 index 00000000..9bc47bd7 --- /dev/null +++ b/slides/advriskmin/chapter-order-slides-all.tex @@ -0,0 +1,72 @@ +%% needs to be filled!! 
+% slides-advriskmin-risk-minimizer +% slides-advriskmin-pseudo-residuals +% slides-advriskmin-regression-l2 +% slides-advriskmin-regression-l1 +% slides-advriskmin-regression-further-losses +% slides-advriskmin-classification-01 +% slides-advriskmin-classification-bernoulli +% slides-advriskmin-logreg-deepdive +% slides-advriskmin-classification-brier +% slides-advriskmin-classification-furtherlosses +% slides-advriskmin-classification-deepdive +% slides-advriskmin-max-likelihood-l2 +% slides-advriskmin-max-likelihood-other +% slides-advriskmin-losses-properties +% slides-advriskmin-bias-variance-decomposition + +%- slides-evaluation-intro + +\subsection{Risk Minimizers} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-risk-minimizer.pdf} + +\subsection{Pseudo-Residuals} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-pseudo-residuals.pdf} + +\subsection{L2- and L1-loss} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-regression-l2-l1.pdf} + +\subsection{L1 Risk Minimizer (Deep-Dive)} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-regression-l1-deepdive.pdf} + +\subsection{Advanced Regression Losses} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-regression-further-losses.pdf} + +\subsection{0-1-loss} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-classification-01.pdf} + +\subsection{Bernoulli Loss} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-classification-bernoulli.pdf} + +\subsection{Logistic Regression (Deep-Dive)} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-logreg-deepdive.pdf} + +%\subsection{Proper Scoring Rules} +%\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-proper-scoring-rules.pdf} + +\subsection{Brier Score} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-classification-brier.pdf} + +%\subsection{Tree Splitting and Loss Functions} +%\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-tree-splitting.pdf} + +\subsection{Advanced Classification 
Losses} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-classification-furtherlosses.pdf} + +\subsection{Optimal constant model for the empirical log loss risk} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-classification-deepdive.pdf} + +\subsection{Maximum Likelihood Estimation vs. Empirical Risk Minimization I} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-max-likelihood-l2.pdf} + +\subsection{Maximum Likelihood Estimation vs. Empirical Risk Minimization II} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-max-likelihood-other.pdf} + +\subsection{Loss Properties} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-losses-properties.pdf} + +\subsection{Bias Variance Decomposition} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-bias-variance-decomposition.pdf} + +\subsection{Bias Variance Decomposition (Deep-Dive)} +\includepdf[pages=-]{../../slides-pdf/slides-advriskmin-bias-variance-decomposition-deepdive.pdf} diff --git a/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex b/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex index a21f0180..96872c71 100644 --- a/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex +++ b/slides/advriskmin/slides-advriskmin-logreg-deepdive.tex @@ -62,24 +62,25 @@ \vspace*{-0.5cm} +{\small \begin{align*} - \frac{\partial}{\partial\thetab}\riske & = & + \frac{\partial}{\partial\thetab}\riske & = -\sumin \frac{\partial}{\partial\pixit }\yi\log(\pixit)\frac{\partial\pixit}{\partial \thetab} + \\ - && \frac{\partial}{\partial\pixit} (1-\yi)\log(1-\pixit)\frac{\partial\pixit}{\partial \thetab}\\ - & = & + & \qquad \frac{\partial}{\partial\pixit} (1-\yi)\log(1-\pixit)\frac{\partial\pixit}{\partial \thetab}\\ + & = -\sumin \frac{\yi}{\pixit}\frac{\partial\pixit}{\partial \thetab} - \frac{1-\yi}{1-\pixit}\frac{\partial\pixit}{\partial \thetab}\\ - &=& + &= -\sumin \left(\frac{\yi}{\pixit} - \frac{1-\yi}{1-\pixit}\right)\frac{\partial s(\fxit)}{\partial 
\fxit}\frac{\partial \fxit}{\partial\thetab}\\ - &=& + &= -\sum^n_{i=1} \left(\yi(1-\pixit) - (1-\yi)\pixit \right)\left(\xi\right)^\top.\\ \end{align*} - +} \framebreak \begin{align*} \quad &=& - \sumin \left(\pixit - \yi\right)\left(\xi\right)^\top.\\ + \sumin \left(\pixit - \yi\right)\left(\xi\right)^\top\\ \quad &=& \left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)^\top\mathbf{X}\\ \end{align*} @@ -96,7 +97,7 @@ \vspace*{1cm} -$\Rightarrow$ The gradient $\nabla_{\thetab}\riske = \left(\frac{\partial}{\partial\thetab}\riske\right)^\top = \mathbf{X}^\top\left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)$ +$\implies$ The gradient $\nabla_{\thetab}\riske = \left(\frac{\partial}{\partial\thetab}\riske\right)^\top = \mathbf{X}^\top\left(\pi(\mathbf{X}\vert\;\thetab) - \mathbf{y}\right)$ \vspace*{1cm} @@ -109,17 +110,19 @@ We find the Hessian via differentiation, s.t., +{\small \begin{align*} - \nabla^2_{\thetab}\riske = \frac{\partial^2}{\partial{\thetab^\top}\partial\thetab}\riske & = & + \nabla^2_{\thetab}\riske = \frac{\partial^2}{\partial{\thetab^\top}\partial\thetab}\riske & = \frac{\partial}{\partial{\thetab^\top}} \sumin \left(\pixit - \yi\right)\left(\xi\right)^\top\\ - & = & + & = \sum^n_{i=1}\xi \left(\pixit\left(1-\pixit\right)\right)\left(\xi\right)^\top\\ - & = & + & = \mathbf{X}^\top \mathbf{D} \mathbf{X}\\ \end{align*} where $\mathbf{D} \in \mathbb{R}^{n\times n}$ is a diagonal matrix with diagonal $$\left(\pixit[1](1-\pixit[1], \dots, \pixit[n](1-\pixit[n]\right).$$ +} Can now be used in Newton-Raphson and other 2nd order optimizers. 
diff --git a/slides/all/slides_sl.tex b/slides/all/slides_sl.tex index 8681b122..a6ccfaab 100644 --- a/slides/all/slides_sl.tex +++ b/slides/all/slides_sl.tex @@ -75,7 +75,7 @@ % Include tuning lecture slides \section{Advanced Risk Minimization} -\input{../advriskmin/chapter-order.tex} +\input{../advriskmin/chapter-order-slides-all.tex} \section{Multiclass Classification} \input{../multiclass/chapter-order.tex} diff --git a/slides/regularization/slides-regu-l1vsl2.tex b/slides/regularization/slides-regu-l1vsl2.tex index 30820d6a..c0f3df5f 100644 --- a/slides/regularization/slides-regu-l1vsl2.tex +++ b/slides/regularization/slides-regu-l1vsl2.tex @@ -185,7 +185,7 @@ \item $L1$ penalty is not \textit{strictly} convex. Hence, no unique solution exists if $x_4=x_5$, and sum of coefficients can be arbitrarily allocated to both features while remaining minimizers (no grouping property!):\\ For any solution $\thetah_{4,lasso},\thetah_{5,lasso}$, equivalent minimizers are given by \vspace{-0.1cm} - {\small $$\Tilde{\theta}_{4,lasso}=s\cdot(\thetah_{4,lasso}+\thetah_{5,lasso}) \,\,\text{and}\,\,\Tilde{\theta}_{5,lasso}=(1-s)\cdot(\thetah_{4,lasso}+\thetah_{5,lasso})\,\forall s\in[0,1]$$} + {\footnotesize $$\Tilde{\theta}_{4,lasso}=s\cdot(\thetah_{4,lasso}+\thetah_{5,lasso}) \,\,\text{and}\,\,\Tilde{\theta}_{5,lasso}=(1-s)\cdot(\thetah_{4,lasso}+\thetah_{5,lasso})\,\forall s\in[0,1]$$} \end{itemize} \framebreak