Merge pull request #173 from slds-lmu/kl_tobi_1

add plots to kl_ml slides and R files
slds-lmu · Dec 20, 2023 · d593e69 · d593e69
2 parents 0a87b99 + 03ebc19
commit d593e69
Show file tree

Hide file tree

Showing 10 changed files with 293 additions and 5 deletions.
diff --git a/slides/information-theory/figure/ftrue.pdf b/slides/information-theory/figure/ftrue.pdf
diff --git a/slides/information-theory/figure/gaussian_mixture_scatter.png b/slides/information-theory/figure/gaussian_mixture_scatter.png
diff --git a/slides/information-theory/figure/gaussian_mixture_with_marginals.png b/slides/information-theory/figure/gaussian_mixture_with_marginals.png
diff --git a/slides/information-theory/figure/kl_fitting_plot.png b/slides/information-theory/figure/kl_fitting_plot.png
diff --git a/slides/information-theory/figure/normal_distributions.png b/slides/information-theory/figure/normal_distributions.png
diff --git a/slides/information-theory/rsrc/make_ftrue_plot.R b/slides/information-theory/rsrc/make_ftrue_plot.R
@@ -0,0 +1,42 @@
+
+library(ggplot2)
+library(ggpubr)
+
+set.seed(123)
+
+# Simulated data: 50 background points with x ~ U(-2.5, 2.5) plus 10
+# highlighted replicates at x = 0 (type == TRUE); y = 2 * x + N(0, 1) noise.
+df <- data.frame(x = runif(50, -2.5, 2.5), type = FALSE)
+df <- rbind(df, data.frame(x = rep(0, 10), type = TRUE))
+df$y <- 2 * df$x + rnorm(60)
+
+lm_fit <- lm(y ~ x, data = df)
+
+# Fitted line y = a + b * x and the residual standard deviation of the fit.
+# `[[` drops the coefficient names so a and b are plain scalars.
+k <- 5
+sigma_hat <- sigma(lm_fit)
+ab <- coef(lm_fit)
+a <- ab[["(Intercept)"]]
+b <- ab[["x"]]
+
+# Normal density with sd = sigma_hat, rescaled to peak height 1 and evaluated
+# on a grid spanning k standard deviations on either side of zero. It is drawn
+# sideways: the density height runs along the x axis, the grid along the y axis.
+resid_grid <- seq(-k * sigma_hat, k * sigma_hat, length.out = 50)
+dens <- dnorm(resid_grid, 0, sigma_hat) / dnorm(0, 0, sigma_hat)
+
+x0 <- 0
+y0 <- a + b * x0
+path1 <- data.frame(x = dens + x0, y = resid_grid + y0)
+segment1 <- data.frame(x = x0, y = y0 - k * sigma_hat, xend = x0, yend = y0 + k * sigma_hat)
+focus <- df[df$type, ]
+
+p <- ggplot() +
+  geom_point(data = df[!df$type, ], aes(x = x, y = y), alpha = 0.2, size = 3)
+
+# The highlighted points, the sideways density and its dashed axis are drawn
+# three times: in place and shifted by (+2, +4) and (-2, -4), i.e. along the
+# slope-2 line used to simulate the data. The data is shifted eagerly (not
+# inside aes()) so every layer keeps its own offset.
+shifts <- data.frame(dx = c(0, 2, -2), dy = c(0, 4, -4))
+for (i in seq_len(nrow(shifts))) {
+  dx <- shifts$dx[i]
+  dy <- shifts$dy[i]
+  p <- p +
+    geom_point(data = transform(focus, x = x + dx, y = y + dy),
+               aes(x = x, y = y), size = 3) +
+    geom_path(data = transform(path1, x = x + dx, y = y + dy),
+              aes(x = x, y = y), color = "orange") +
+    geom_segment(data = transform(segment1, x = x + dx, y = y + dy,
+                                  xend = xend + dx, yend = yend + dy),
+                 aes(x = x, y = y, xend = xend, yend = yend), lty = 2)
+}
+
+p <- p + geom_abline(slope = b, intercept = a)
+
+p
+
+# Pass the plot explicitly instead of relying on ggplot2's last_plot().
+ggsave("../figure/ftrue.pdf", plot = p, width = 5, height = 3)
diff --git a/slides/information-theory/rsrc/make_gaussian_mixture_plots.R b/slides/information-theory/rsrc/make_gaussian_mixture_plots.R
@@ -0,0 +1,114 @@
+library(ggplot2)
+library(MASS)     
+library(gridExtra) 
+
+
+# Build one "joint density with marginals" figure for an equally weighted
+# mixture of bivariate Gaussians.
+#
+# means:      list of length-2 mean vectors, one per mixture component.
+# covariance: 2 x 2 covariance matrix shared by all components.
+# n:          number of samples drawn per component.
+# seed:       RNG seed set before sampling, so each figure is reproducible.
+#
+# Returns the grob produced by arrangeGrob(): the filled 2D density estimate
+# bottom-left, the x marginal on top, the y marginal on the right.
+make_mixture_panel <- function(means, covariance, n = 1000, seed = 0) {
+  set.seed(seed)
+  draws <- lapply(means, function(mu) mvrnorm(n = n, mu = mu, Sigma = covariance))
+  samples <- do.call(rbind, draws)
+  df <- data.frame(x = samples[, 1], y = samples[, 2])
+
+  # Main panel: filled contours of the 2D kernel density estimate, no axes.
+  p_main <- ggplot(df, aes(x = x, y = y)) +
+    geom_density_2d_filled() +
+    theme_minimal() +
+    theme(axis.title = element_blank(),
+          axis.text = element_blank(),
+          axis.ticks = element_blank(),
+          panel.grid = element_blank()) +
+    theme(legend.position = "none")
+
+  # Top marginal: density of the first coordinate.
+  p_top <- ggplot(df, aes(x = x)) +
+    geom_density(aes(y = after_stat(density)), fill = "blue", alpha = 0.5) +
+    theme_void() +
+    theme(plot.margin = margin(t = 0, r = 30, b = 0, l = 0),
+          axis.title.x = element_text(size = 20)) +
+    labs(x = "x1")
+
+  # Right marginal: density of the second coordinate, rotated by coord_flip().
+  p_right <- ggplot(df, aes(x = y)) +
+    geom_density(aes(y = after_stat(density)), fill = "blue", alpha = 0.5) +
+    theme_void() +
+    theme(plot.margin = margin(t = 0, r = 30, b = 0, l = 0),
+          axis.title.y = element_text(size = 20)) +
+    labs(x = "x2") +
+    coord_flip()
+
+  # Blank filler for the top-right cell of the 2 x 2 layout.
+  empty <- ggplot() + geom_blank(aes(x = 0, y = 0)) + theme_void()
+
+  arrangeGrob(p_top, empty, p_main, p_right,
+              ncol = 2, nrow = 2,
+              widths = c(4, 0.5),
+              heights = c(2, 7))
+}
+
+covariance <- matrix(c(1, 0, 0, 1), nrow = 2)
+
+# Left figure: two components on the diagonal.
+p1 <- make_mixture_panel(list(c(0, 0), c(5, 5)), covariance)
+
+# Right figure: four components on the corners of a square.
+p2 <- make_mixture_panel(list(c(0, 0), c(10, 10), c(10, 0), c(0, 10)), covariance)
+
+# Display the plot. The grid functions are namespace-qualified because this
+# script attaches gridExtra but never grid itself.
+grid::grid.newpage()
+grid::grid.draw(p2)
+
+plot <- grid.arrange(p1, p2, ncol = 2)
+
+ggsave(filename = "../figure/gaussian_mixture_with_marginals.png", plot = plot, width = 24, height = 8, dpi = 300)
+
diff --git a/slides/information-theory/rsrc/make_gaussian_mixture_scatter.R b/slides/information-theory/rsrc/make_gaussian_mixture_scatter.R
@@ -0,0 +1,61 @@
+library(ggplot2)
+library(MASS)     
+library(gridExtra) 
+
+
+# Target density: equally weighted mixture of two unit-covariance Gaussians.
+mean1 <- c(6, 0)
+mean2 <- c(0, 10)
+covariance <- matrix(c(1, 0, 0, 1), nrow = 2)
+
+# Generate samples (the order of the draws is kept fixed so the seed
+# reproduces the same figure).
+set.seed(0)
+samples1 <- mvrnorm(n = 1000, mu = mean1, Sigma = covariance)
+samples2 <- mvrnorm(n = 1000, mu = mean2, Sigma = covariance)
+samples <- rbind(samples1, samples2)
+
+# Candidate sample clouds overlaid on the target:
+#   q1     - unit-covariance Gaussian centred at (0, 0)
+#   q2     - correlated Gaussian centred at (2.5, 5)
+#   q3, q4 - unit-covariance Gaussians centred on the two target means
+q1 <- mvrnorm(n = 100, mu = c(0, 0), Sigma = covariance)
+q2 <- mvrnorm(n = 100, mu = c(2.5, 5), Sigma = matrix(c(1.5, 0.5, 0.5, 1.5), nrow = 2))
+q3 <- mvrnorm(n = 100, mu = c(6, 0), Sigma = covariance)
+q4 <- mvrnorm(n = 100, mu = c(0, 10), Sigma = covariance)
+
+# Convert a two-column sample matrix into a data frame with columns x and y.
+as_xy_df <- function(m) {
+  data.frame(x = m[, 1], y = m[, 2])
+}
+
+df <- as_xy_df(samples)
+q1_df <- as_xy_df(q1)
+q2_df <- as_xy_df(q2)
+q3_df <- as_xy_df(q3)
+q4_df <- as_xy_df(q4)
+
+# Draw the filled density estimate of `target` with one red point layer per
+# data frame in `clouds` on top.
+#
+# target: data frame with columns x and y (samples of the target density).
+# clouds: list of data frames with columns x and y.
+# title:  plot title; "" leaves the title empty.
+make_panel <- function(target, clouds, title = "") {
+  point_layers <- lapply(clouds, function(d) {
+    geom_point(data = d, aes(x = x, y = y), color = "red", size = 0.5)
+  })
+  ggplot(target, aes(x = x, y = y)) +
+    geom_density_2d_filled() +
+    point_layers +
+    theme_minimal() +
+    labs(x = expression(theta[1]), y = expression(theta[2]), title = title) +
+    theme(legend.position = "none",
+          plot.title = element_text(color = "red", size = 15, hjust = 0.5, face = "bold"))
+}
+
+# The first panel previously referenced an undefined `additional_df`; the
+# cloud built for it is q1_df.
+p1 <- make_panel(df, list(q1_df))
+p2 <- make_panel(df, list(q2_df), title = "Sampling from q(x)")
+p3 <- make_panel(df, list(q3_df, q4_df))
+
+plot <- grid.arrange(p1, p2, p3, ncol = 3)
+
+ggsave("../figure/gaussian_mixture_scatter.png", plot = plot, width = 11, height = 3)
diff --git a/slides/information-theory/rsrc/make_kl_fitting_plot.R b/slides/information-theory/rsrc/make_kl_fitting_plot.R
@@ -0,0 +1,58 @@
+library(ggplot2)
+library(MASS)     
+library(gridExtra)
+
+########## CREATE NORMAL DISTRIBUTIONS
+
+
+# Seed for the random draws made further down in this script; the curves in
+# this first figure are deterministic.
+set.seed(123)
+
+# Densities of N(0, 1) and N(3, 1) on a common grid.
+x <- seq(-4, 7, length.out = 1000)
+data <- data.frame(
+  x = x,
+  NormalDensity1 = dnorm(x, 0, 1),
+  NormalDensity2 = dnorm(x, 3, 1)
+)
+
+# NOTE(review): `size` for line width is deprecated in favour of `linewidth`
+# from ggplot2 3.4.0 on; kept as `size` so older installations still render
+# the same figure.
+p <- ggplot(data, aes(x = x)) +
+  geom_line(aes(y = NormalDensity1), color = "blue", size = 1) +
+  geom_line(aes(y = NormalDensity2), color = "red", size = 1) +
+  labs(x = "x",
+       y = "Density") +
+  # annotate() draws the "?" once; geom_text() with constant aesthetics would
+  # draw it once per row of `data`, i.e. 1000 overlapping copies.
+  annotate("text", x = 1.5, y = 0.34, label = "?", color = "black", size = 15)
+
+ggsave("../figure/normal_distributions.png", plot = p, width = 7, height = 3)
+
+
+
+# Target p(x): equally weighted mixture of N(0, 1) and N(5, 1), represented
+# by 1000 draws from each component.
+samples1 <- rnorm(1000, 0, 1)
+samples2 <- rnorm(1000, 5, 1)
+mixture_samples <- c(samples1, samples2)
+
+# Kernel density estimate of the mixture (default "nrd0" bandwidth rule).
+density_data <- density(mixture_samples, bw = "nrd0")
+density_df <- data.frame(value = density_data$x, density = density_data$y)
+
+# Left panel: mixture density (blue) against a N(0, 1) curve (red).
+p1 <- ggplot(density_df, aes(x = value, y = density)) +
+  geom_line(color = "blue") +
+  labs(title = "Reverse KL",
+       x = "x",
+       y = "Density") +
+  stat_function(fun = dnorm, args = list(mean = 0, sd = 1), color = "red")
+
+# Right panel: mixture density against a N(2.5, 3) curve. Colours are mapped
+# through aes() so that this panel carries the p(x) / q(x) legend.
+p2 <- ggplot(density_df, aes(x = value, y = density)) +
+  geom_line(aes(color = "p(x)")) +
+  labs(title = "Forward KL",
+       x = "x",
+       y = "Density") +
+  stat_function(fun = dnorm, args = list(mean = 2.5, sd = 3), aes(color = "q(x)")) +
+  scale_color_manual(name = "", values = c("p(x)" = "blue", "q(x)" = "red")) +
+  theme(legend.position = "right")
+
+# The right panel is given extra width to make room for its legend.
+plot <- grid.arrange(p1, p2, ncol = 2, widths = c(1, 1.25))
+ggsave("../figure/kl_fitting_plot.png", plot = plot, width = 8, height = 3)
diff --git a/slides/information-theory/slides-info-kl-ml.tex b/slides/information-theory/slides-info-kl-ml.tex
@@ -22,7 +22,11 @@
 \begin{vbframe} {Measuring Distribution Similarity in ML}
 \begin{itemize}
     \item Information theory provides tools (e.g., divergence measures) to quantify the similarity between probability distributions
-\includegraphics[width=0.4\linewidth]{figure_man/kl_ml_dist_sim.png}
+
+    \begin{center}
+    \includegraphics[width=0.7\linewidth]{figure/normal_distributions.png}
+    \end{center}
+
     \item The most prominent divergence measure is the KL divergence 
 \item In ML, measuring (and maximizing) the similarity between probability distributions is a ubiquitous concept, which will be shown in the following.
 \end{itemize}
@@ -31,7 +35,9 @@
     \item \textbf{Probabilistic model fitting}\\
 Assume our learner is probabilistic, i.e., we model $p(y| \mathbf{x})$ (for example, ridge regression, logistic regression, ...).
 
-\includegraphics[width=0.4\linewidth]{figure_man/kl_ml_prob_fit.png}
+\begin{center}
+\includegraphics[width=0.5\linewidth]{figure/ftrue.pdf}
+\end{center}
 
 We want to minimize the difference between $p(y \vert \mathbf{x})$ and the conditional data generating process $\mathbb{P}_{y\vert\mathbf{x}}$ based on the data stemming from $\mathbb{P}_{y, \mathbf{x}}.$
 
@@ -47,7 +53,9 @@
     \item \textbf{Feature selection}
 In feature selection, we want to choose features the target strongly depends on. 
 
-\includegraphics[width=0.6\linewidth]{figure_man/kl_ml_mi.png}
+\begin{center}
+\includegraphics[width=0.9\linewidth]{figure/gaussian_mixture_with_marginals.png}
+\end{center}
 
 We can measure dependency by measuring the similarity between $p(\mathbf{x}, y)$ and $p(\mathbf{x})\cdot p(y).$ \\
 We will later see that measuring this similarity with KL  leads to the concept of mutual information.
@@ -60,7 +68,9 @@
     \item \textbf{Variational inference (VI)}
 Our data can also induce probability distributions: By Bayes' theorem it holds that the posterior density $$p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}) = \frac{p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})}{\int p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})d\bm{\theta}}.$$ However, computing this density analytically is usually intractable.
 
-\includegraphics[width=0.99\linewidth]{figure_man/kl_ml_vi.png}
+\begin{center}
+\includegraphics[width=0.99\linewidth]{figure/gaussian_mixture_scatter.png}
+\end{center}
 
 In VI, we want to fit a density $q_{\bm{\phi}}$ with parameters $\bm{\phi}$ to 
     $p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}).$
@@ -122,8 +132,11 @@
      $\Rightarrow$ We can estimate the gradient of the reverse KL without bias (even if we only have an unnormalized target distribution)
  \end{itemize}
  \framebreak
+
+\begin{center}
+\includegraphics[width=0.7\linewidth]{figure/kl_fitting_plot.png}
+\end{center}
 
-\includegraphics[width=0.6\linewidth]{figure_man/kl_ml_fkl_rkl.png} \\
 The asymmetry of the KL has the following implications
 \begin{itemize}
     \item The forward KL $D_{KL}(p\|q_{\bm{\phi}}) = \E_{\xv \sim p} \log\left(\frac{p(\xv)}{q_{\bm{\phi}}(\xv)}\right)$ is mass-covering since $p(\xv)\log\left(\frac{p(\xv)}{q_{\bm{\phi}}(\xv)}\right) \approx 0$ if $p(\xv) \approx 0$ (as long as the two distributions do not differ extremely)