
Commit

Merge overleaf-2023-12-12-1715 into main
ludwigbothmann authored Dec 12, 2023
2 parents f41d325 + 59800c8 commit cce13dd
Showing 8 changed files with 80 additions and 6 deletions.
5 changes: 2 additions & 3 deletions slides/information-theory/slides-info-kl-ment.tex
@@ -31,13 +31,12 @@
In the following, we derive an alternative measure, namely the KL divergence (relative entropy), that fixes these shortcomings by taking an inductive inference viewpoint. \citebutton{Caticha, 2003}{https://arxiv.org/pdf/physics/0311093.pdf}
\end{vbframe}
\begin{vbframe}{Inductive inference}
-Let's construct a "new" entropy measure $S(p)$ just by desired properties.\\
+We construct a "new" entropy measure $S(p)$ just by desired properties.\\
\lz
Let $\mathcal{X}$ be a measurable space with $\sigma$-algebra $\mathcal{F}$ and measure $\mu$ that can be continuous or discrete. \\
We start with a prior distribution $q$ over $\mathcal{X}$ dominated by $\mu$ and a constraint of the form $$\int_D a(\xv) dq(\xv) = c \in \R$$
-with $D \in \mathcal{F}.$ Note that the constraint function $a(\xv)$ is analogous to moment condition functions $g(\cdot)$ in the discrete case.
+with $D \in \mathcal{F}.$ The constraint function $a(\xv)$ is analogous to moment condition functions $g(\cdot)$ in the discrete case.
We want to update the prior distribution $q$ to a posterior distribution $p$ that fulfills the constraint and maximizes $S(p).$ \\
\lz
For this maximization to make sense, $S$ must be transitive, i.e.,
$$S(p_1) < S(p_2), S(p_2) < S(p_3) \Rightarrow S(p_1) < S(p_3).$$
\end{vbframe}
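An illustrative instance of such a constraint (an editor's sketch, not part of the commit): taking $D = \mathcal{X}$ and $a(\xv) = \xv$ fixes the posterior mean,
$$\int_{\mathcal{X}} \xv \, dp(\xv) = c,$$
so the update selects, among all distributions with mean $c$, the one that is maximal w.r.t. $S(p)$ relative to the prior $q$.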
3 changes: 2 additions & 1 deletion slides/information-theory/slides-info-kl-ml.tex
@@ -72,7 +72,8 @@

\begin{vbframe}{KL divergence}

-Divergences can be used to measure the similarity of distributions. \lz For distributions $p, q$ they are defined such that
+Divergences can be used to measure the similarity of distributions.\\
+\lz For distributions $p, q$ they are defined such that
\begin{enumerate}
\item $D(p, q) \geq 0,$
\item $D(p, q) = 0$ iff $p = q.$
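For reference (a standard fact, not shown in this hunk): the KL divergence $D_{KL}(p \,\|\, q) = \int p(\xv) \log \frac{p(\xv)}{q(\xv)} \, d\mu(\xv)$ satisfies both properties; non-negativity follows from Jensen's inequality, with equality iff $p = q$. It is not symmetric, however, so it is a divergence rather than a metric.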
6 changes: 4 additions & 2 deletions slides/information-theory/slides-info-kl.tex
@@ -179,15 +179,17 @@
Because KL quantifies the difference between distributions, it can be used as a loss function between distributions. %to find a good fit for the observed data. \\
\lz

-In our example, we can identify an optimal $\sigma$ which minimizes the KL.
+In our example, we investigated the KL between $p=N(0, 1)$ and $q=LP(0, \sigma).$ Now, we identify an optimal $\sigma$ which minimizes the KL.

\begin{figure}
\includegraphics[width = 6cm ]{figure/kl_norm_lp_sigma.png}
\end{figure}
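A quick numerical cross-check of this slide (an editor's sketch in the style of the repo's rsrc scripts, not part of the commit; it assumes $LP(0, \sigma)$ denotes a Laplace distribution with location 0 and scale $\sigma$):

import numpy as np
from scipy import integrate, optimize, stats

def kl_normal_laplace(sigma):
    # KL(p || q) = E_p[log p(X) - log q(X)], computed by numerical quadrature
    p, q = stats.norm(0, 1), stats.laplace(0, sigma)
    integrand = lambda x: p.pdf(x) * (p.logpdf(x) - q.logpdf(x))
    return integrate.quad(integrand, -np.inf, np.inf)[0]

res = optimize.minimize_scalar(kl_normal_laplace, bounds=(0.1, 3.0), method="bounded")
# Prints approx. 0.798 = sqrt(2/pi), which is E|X| for X ~ N(0, 1),
# the closed-form minimizer of log(2*sigma) + E|X|/sigma - H(p)
print(res.x)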


% \begin{figure}
% \centering
-% \scalebox{0.9}{\includegraphics{figure_man/binom1.png}}
+% \scalebox{0.9}
+% {\includegraphics{figure_man/binom1.png}}
% \tiny{\\ Credit: Will Kurt}
% \caption{ \footnotesize{\textit{Left}: Histogram of observed frequencies of a random variable $X$ which takes values between 0 and 10. \textit{Right}: The KL divergence between the observed data and Binom(10,p) is minimized when $p \approx 0.57$.}}
% \end{figure}
3 changes: 3 additions & 0 deletions slides/regularization/chapter-order.tex
@@ -38,6 +38,9 @@ \subsection{Geometric Analysis of L2-Regularization and Weight Decay}
\subsection{Geometric Analysis of L1-regularization}
\includepdf[pages=-]{../slides-pdf/slides-regu-geom-l1.pdf}

\subsection{Soft-thresholding and L1 regularization deep-dive}
\includepdf[pages=-]{../slides-pdf/slides-regu-lasso-deepdive.pdf}
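The deep-dive itself is included only as a compiled PDF; for orientation, a minimal sketch of the soft-thresholding operator the title refers to (an editor's addition, not from the commit):

import numpy as np

def soft_threshold(z, lam):
    # S(z, lam) = sign(z) * max(|z| - lam, 0): the closed-form minimizer of
    # 0.5 * (b - z)^2 + lam * |b| over b, the building block of lasso coordinate descent
    return np.sign(z) * np.maximum(np.abs(z) - lam, 0.0)

print(soft_threshold(np.array([-2.0, -0.3, 0.5, 1.5]), 1.0))  # approx. [-1. -0. 0. 0.5]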

\subsection{Early Stopping}
\includepdf[pages=-]{../slides-pdf/slides-regu-early-stopping.pdf}

Binary file modified slides/regularization/figure_man/bias-variance-ridge.png
53 changes: 53 additions & 0 deletions slides/regularization/rsrc/data-augmentation-images-cat.py
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 12 17:55:57 2023
@author: chris
"""

import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Load CIFAR-10 dataset
(x_train, y_train), (_, _) = cifar10.load_data()

# Select a random cat image; in CIFAR-10, the label for cats is 3
cat_indices = np.where(y_train == 3)[0]
random_index = np.random.choice(cat_indices)
cat_image = x_train[random_index]

# Data augmentation techniques
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Add a leading batch dimension, as expected by datagen.flow
cat_image = cat_image.reshape((1,) + cat_image.shape)

# Plot the original image in the first panel
fig, axs = plt.subplots(1, 5, figsize=(15, 3))
axs[0].imshow(cat_image[0])
axs[0].axis('off')
axs[0].set_title("Original", fontsize=20)

# Generate and plot 4 augmented versions of the same image
i = 1
for batch in datagen.flow(cat_image, batch_size=1):
    axs[i].imshow(batch[0].astype('uint8'))
    axs[i].axis('off')
    axs[i].set_title(f"Augmented {i}", fontsize=20)
    i += 1
    if i > 4:
        break

plt.tight_layout()
plt.show()
16 changes: 16 additions & 0 deletions slides/regularization/slides-regu-intro.tex
@@ -31,6 +31,22 @@
\item \textbf{Structured regularization} methods incorporate structural prior knowledge over groups of parameters or subnetworks (e.g., the group lasso \citebutton{Yuan and Lin, 2005}{https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-9868.2005.00532.x})
\end{itemize}

\end{vbframe}

\begin{vbframe}{Invariance/Symmetry}
Prior knowledge often requires that model predictions be invariant under certain input transformations.\\
In image classification, the label ``cat'' should hold regardless of the position or size of the relevant object (translation/scale invariance). Common ways to achieve this:
\begin{enumerate}\setlength\itemsep{1.02em}
\item \textbf{Pre-processing}: Compute features that are invariant under the transformations; downstream models then inherit the invariance (see the sketch after this frame)
%\item \textbf{Explicit regularization}: Penalty for changes in model output under transformed inputs is added to loss
\item \textbf{Data augmentation}: Extend training data by replicating inputs under invariance-preserving transformations (e.g., flipping/rotating images)
\begin{figure}
\includegraphics[width=0.75\textwidth]{figure_man/data-augmentation-cat.png}\\
\end{figure}
\item \textbf{Network architecture}: Build the invariance property directly into the network structure, e.g., CNNs \citebutton{Geometric DL, Bronstein et al., 2021}{https://arxiv.org/pdf/2104.13478.pdf}
\end{enumerate}


\end{vbframe}
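A minimal sketch of the pre-processing route from item 1 (an editor's addition, not part of the commit): the magnitude of the 2D DFT is invariant to circular translations, so features built from it are translation-invariant by construction.

import numpy as np

def translation_invariant_features(img):
    # |FFT(img)| is unchanged by circular shifts: shifting only rotates the phase
    return np.abs(np.fft.fft2(img))

img = np.random.rand(32, 32)
shifted = np.roll(img, shift=(5, 7), axis=(0, 1))
print(np.allclose(translation_invariant_features(img),
                  translation_invariant_features(shifted)))  # True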


