diff --git a/slides/information-theory/slides-info-kl-ment.tex b/slides/information-theory/slides-info-kl-ment.tex
index d4d7b409..a67295a4 100644
--- a/slides/information-theory/slides-info-kl-ment.tex
+++ b/slides/information-theory/slides-info-kl-ment.tex
@@ -31,13 +31,12 @@
 In the following, we derive an alternative measure, namely the KL divergence (relative entropy), that fixes these shortcomings by taking an inductive inference viewpoint. \citebutton{Caticha, 2003}{https://arxiv.org/pdf/physics/0311093.pdf}
 \end{vbframe}
 \begin{vbframe}{Inductive inference}
-Let's construct a "new" entropy measure $S(p)$ just by desired properties.\\
+We construct a ``new'' entropy measure $S(p)$ purely from desired properties.\\
 \lz
 Let $\mathcal{X}$ be a measurable space with $\sigma$-algebra $\mathcal{F}$ and measure $\mu$ that can be continuous or discrete. \\
 We start with a prior distribution $q$ over $\mathcal{X}$ dominated by $\mu$ and a constraint of the form
 $$\int_D a(\xv) dq(\xv) = c \in \R$$
-with $D \in \mathcal{F}.$ Note that the constraint function $a(\xv)$ is analogous to moment condition functions $g(\cdot)$ in the discrete case.
+with $D \in \mathcal{F}.$ The constraint function $a(\xv)$ is analogous to the moment condition functions $g(\cdot)$ in the discrete case.
 We want to update the prior distribution $q$ to a posterior distribution $p$ that fulfills the constraint and is maximal w.r.t. $S(p).$ \\
-\lz
 For this maximization to make sense, $S$ must be transitive, i.e., $$S(p_1) < S(p_2), S(p_2) < S(p_3) \Rightarrow S(p_1) < S(p_3).$$
 \end{vbframe}
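A hedged aside on where this construction leads (not part of the slides): assuming the measure $S$ singled out by these properties is the negative relative entropy $S(p) = -\int \log\frac{dp}{dq}\, dp$, and taking $D = \mathcal{X}$ for simplicity, maximizing $S(p)$ subject to the moment constraint on the posterior and to normalization gives, via Lagrange multipliers, an exponential tilt of the prior:

% Hedged sketch, not from the slides; assumes S(p) = -KL(p, q) and D = \mathcal{X}.
\[
  p^*(\xv) \;\propto\; q(\xv)\,\exp\big(\lambda\, a(\xv)\big),
  \qquad \lambda \ \text{chosen so that} \ \int a(\xv)\, dp^*(\xv) = c.
\]

This matches the exponential-family form obtained from moment conditions $g(\cdot)$ in the discrete maximum-entropy setting.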
diff --git a/slides/information-theory/slides-info-kl-ml.tex b/slides/information-theory/slides-info-kl-ml.tex
index 7421d76b..f578f63b 100644
--- a/slides/information-theory/slides-info-kl-ml.tex
+++ b/slides/information-theory/slides-info-kl-ml.tex
@@ -72,7 +72,8 @@
 \begin{vbframe}{KL divergence}
 
-Divergences can be used to measure the similarity of distributions. \lz For distributions $p, q$ they are defined such that
+Divergences can be used to measure the dissimilarity of distributions.\\
+\lz For distributions $p, q$ they are defined such that
 \begin{enumerate}
   \item $D(p, q) \geq 0,$
   \item $D(p, q) = 0$ iff $p = q.$
 \end{enumerate}
diff --git a/slides/information-theory/slides-info-kl.tex b/slides/information-theory/slides-info-kl.tex
index 3ac2f675..707dd8ef 100644
--- a/slides/information-theory/slides-info-kl.tex
+++ b/slides/information-theory/slides-info-kl.tex
@@ -179,7 +179,8 @@
 Because KL quantifies the difference between distributions, it can be used as a loss function between distributions. %to find a good fit for the observed data.
 \\
 \lz
-In our example, we can identify an optimal $\sigma$ which minimizes the KL.
+In our example, we investigated the KL between $p=N(0, 1)$ and $q=LP(0, \sigma).$ Now we identify the optimal $\sigma$ that minimizes the KL.
+
 \begin{figure}
 \includegraphics[width = 6cm ]{figure/kl_norm_lp_sigma.png}
 \end{figure}
@@ -187,7 +188,8 @@
 % \begin{figure}
 % \centering
-% \scalebox{0.9}{\includegraphics{figure_man/binom1.png}}
+% \scalebox{0.9}
+% {\includegraphics{figure_man/binom1.png}}
 % \tiny{\\ Credit: Will Kurt}
 % \caption{ \footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable $X$ which takes values between 0 and 10.
 \textit{Right}: The KL divergence between the observed data and Binom(10,p) is minimized when $p \approx 0.57$.}}
 % \end{figure}
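As a hedged companion to the hunk above (an illustrative sketch, not code from the repository): assuming $LP(0, \sigma)$ denotes a Laplace distribution with location $0$ and scale $\sigma$, the KL-minimizing $\sigma$ discussed on the slide can be checked numerically on a grid.

# Illustrative sketch (not part of the repo): numerically find the sigma minimizing
# KL(p || q) for p = N(0, 1) and q = Laplace(0, sigma); "LP(0, sigma) = Laplace with
# scale sigma" is an assumption about the slide's notation.
import numpy as np
from scipy.stats import norm, laplace

x = np.linspace(-12.0, 12.0, 40001)   # dense grid; N(0, 1) has negligible mass outside
dx = x[1] - x[0]
p = norm.pdf(x)                       # p = N(0, 1)

def kl_norm_laplace(sigma):
    q = laplace.pdf(x, loc=0.0, scale=sigma)
    return np.sum(p * (np.log(p) - np.log(q))) * dx   # Riemann-sum approximation of the integral

sigmas = np.linspace(0.3, 2.0, 1000)
kl_values = np.array([kl_norm_laplace(s) for s in sigmas])
best = sigmas[np.argmin(kl_values)]
print(f"KL-minimizing sigma: {best:.3f}")  # close to sqrt(2/pi) ~ 0.80 under these assumptions

The same grid-based recipe applies to any pair of densities for which no closed form is at hand.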
diff --git a/slides/regularization/chapter-order.tex b/slides/regularization/chapter-order.tex
index 8747ed72..ca1d4f63 100644
--- a/slides/regularization/chapter-order.tex
+++ b/slides/regularization/chapter-order.tex
@@ -38,6 +38,9 @@
 \subsection{Geometric Analysis of L2-Regularization and Weight Decay}
 \subsection{Geometric Analysis of L1-regularization}
 \includepdf[pages=-]{../slides-pdf/slides-regu-geom-l1.pdf}
 
+\subsection{Soft-thresholding and L1 regularization deep-dive}
+\includepdf[pages=-]{../slides-pdf/slides-regu-lasso-deepdive.pdf}
+
 \subsection{Early Stopping}
 \includepdf[pages=-]{../slides-pdf/slides-regu-early-stopping.pdf}
diff --git a/slides/regularization/figure_man/bias-variance-ridge.png b/slides/regularization/figure_man/bias-variance-ridge.png
index 1b67caaf..1af66601 100644
Binary files a/slides/regularization/figure_man/bias-variance-ridge.png and b/slides/regularization/figure_man/bias-variance-ridge.png differ
diff --git a/slides/regularization/figure_man/data-augmentation-cat.png b/slides/regularization/figure_man/data-augmentation-cat.png
new file mode 100644
index 00000000..ec17ce92
Binary files /dev/null and b/slides/regularization/figure_man/data-augmentation-cat.png differ
diff --git a/slides/regularization/rsrc/data-augmentation-images-cat.py b/slides/regularization/rsrc/data-augmentation-images-cat.py
new file mode 100644
index 00000000..a65458bf
--- /dev/null
+++ b/slides/regularization/rsrc/data-augmentation-images-cat.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Dec 12 17:55:57 2023
+
+@author: chris
+"""
+
+from keras.datasets import cifar10
+import matplotlib.pyplot as plt
+import numpy as np
+from tensorflow.keras.preprocessing.image import ImageDataGenerator
+
+# Load CIFAR-10 dataset
+(x_train, y_train), (_, _) = cifar10.load_data()
+
+# Select a random cat image
+# In CIFAR-10, the label for cats is 3
+cat_indices = np.where(y_train == 3)[0]
+random_index = np.random.choice(cat_indices)
+cat_image = x_train[random_index]
+
+# Data augmentation techniques
+datagen = ImageDataGenerator(
+    rotation_range=20,
+    width_shift_range=0.2,
+    height_shift_range=0.2,
+    shear_range=0.2,
+    zoom_range=0.2,
+    horizontal_flip=True,
+    fill_mode='nearest'
+)
+
+# Prepare the image for augmentation (add a batch dimension)
+cat_image = cat_image.reshape((1,) + cat_image.shape)
+
+# Apply the augmentation and plot the original next to augmented samples
+fig, axs = plt.subplots(1, 5, figsize=(15, 3))
+axs[0].imshow(cat_image[0])
+axs[0].axis('off')
+axs[0].set_title("Original", fontsize=20)
+
+# Generate 4 augmented images
+i = 1
+for batch in datagen.flow(cat_image, batch_size=1):
+    axs[i].imshow(batch[0].astype('uint8'))
+    axs[i].axis('off')
+    axs[i].set_title(f"Augmented {i}", fontsize=20)
+    i += 1
+    if i > 4:
+        break
+
+plt.tight_layout()
+plt.show()
diff --git a/slides/regularization/slides-regu-intro.tex b/slides/regularization/slides-regu-intro.tex
index 1a43b639..86860621 100644
--- a/slides/regularization/slides-regu-intro.tex
+++ b/slides/regularization/slides-regu-intro.tex
@@ -31,6 +31,22 @@
 \item \textbf{Structured regularization} methods incorporate structural prior knowledge over groups of parameters or subnetworks (e.g., the group lasso \citebutton{Yuan and Lin, 2005}{https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-9868.2005.00532.x})
 \end{itemize}
+\end{vbframe}
+
+\begin{vbframe}{Invariance/Symmetry}
+Prior knowledge often requires that model predictions be invariant under certain input transformations.\\
+In image classification, the label ``cat'' should hold regardless of the position or size of the relevant object (translation/scale invariance).
+\begin{enumerate}\setlength\itemsep{1.02em}
+  \item \textbf{Pre-processing}: By computing features that are invariant under the transformations, downstream models will respect the invariances too
+  %\item \textbf{Explicit regularization}: Penalty for changes in model output under transformed inputs is added to loss
+  \item \textbf{Data augmentation}: Extend training data by replicating inputs under invariant transformations (e.g., flipping/rotating images)
+  \begin{figure}
+    \includegraphics[width=0.75\textwidth]{figure_man/data-augmentation-cat.png}\\
+  \end{figure}
+  \item \textbf{Network architecture}: Build the invariance property directly into the network structure, e.g., CNNs \citebutton{Geometric DL, Bronstein et al., 2021}{https://arxiv.org/pdf/2104.13478.pdf}
+\end{enumerate}
+
+
 \end{vbframe}
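On the Pre-processing item of the new Invariance/Symmetry frame, a minimal hedged sketch (illustrative only, not repository code): a global intensity histogram is one example of a feature that is exactly invariant to translating the object within the image, so a downstream model fit on it inherits the invariance automatically.

# Minimal sketch (not part of the repo): a global intensity histogram as a
# translation-invariant pre-processing feature -- shifting the object inside the
# image leaves the feature vector unchanged.
import numpy as np

def intensity_histogram(img, bins=16):
    # Normalized grayscale histogram: ignores *where* pixels are, only *which* values occur
    hist, _ = np.histogram(img, bins=bins, range=(0.0, 255.0))
    return hist / hist.sum()

img = np.zeros((32, 32))
img[8:16, 8:16] = 255.0                            # a bright square standing in for the "cat"
shifted = np.roll(img, shift=(5, 7), axis=(0, 1))  # translate the object within the image

print(np.allclose(intensity_histogram(img), intensity_histogram(shifted)))  # True

Data augmentation and invariant architectures, the other two items on the frame, target the same property from the model side rather than the feature side.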