diff --git a/slides/information-theory/figure/ftrue.pdf b/slides/information-theory/figure/ftrue.pdf
new file mode 100644
index 00000000..43d89c56
Binary files /dev/null and b/slides/information-theory/figure/ftrue.pdf differ
diff --git a/slides/information-theory/figure/gaussian_mixture_scatter.png b/slides/information-theory/figure/gaussian_mixture_scatter.png
new file mode 100644
index 00000000..8568314b
Binary files /dev/null and b/slides/information-theory/figure/gaussian_mixture_scatter.png differ
diff --git a/slides/information-theory/figure/gaussian_mixture_with_marginals.png b/slides/information-theory/figure/gaussian_mixture_with_marginals.png
new file mode 100644
index 00000000..de7459db
Binary files /dev/null and b/slides/information-theory/figure/gaussian_mixture_with_marginals.png differ
diff --git a/slides/information-theory/figure/kl_fitting_plot.png b/slides/information-theory/figure/kl_fitting_plot.png
new file mode 100644
index 00000000..6bece41d
Binary files /dev/null and b/slides/information-theory/figure/kl_fitting_plot.png differ
diff --git a/slides/information-theory/figure/normal_distributions.png b/slides/information-theory/figure/normal_distributions.png
new file mode 100644
index 00000000..10d50c93
Binary files /dev/null and b/slides/information-theory/figure/normal_distributions.png differ
diff --git a/slides/information-theory/rsrc/make_ftrue_plot.R b/slides/information-theory/rsrc/make_ftrue_plot.R
new file mode 100644
index 00000000..0274fa7e
--- /dev/null
+++ b/slides/information-theory/rsrc/make_ftrue_plot.R
@@ -0,0 +1,42 @@
+
+library(ggplot2)
+library(ggpubr)
+
+set.seed(123)
+
+df = data.frame(x = runif(50, -2.5, 2.5), type = FALSE)
+df = rbind(df, data.frame(x = rep(0, 10), type = TRUE))
+
+df$y = 2 * df$x + rnorm(60)
+
+lm_fit = lm(data = df, y ~ x)
+
+k <- 5
+sigma <- sigma(lm_fit)
+ab <- coef(lm_fit); a <- ab[1]; b <- ab[2]
+
+x <- seq(-k*sigma, k*sigma, length.out = 50)
+y <- dnorm(x, 0, sigma)/dnorm(0, 0, sigma) * 1
+
+x0 <- 0
+y0 <- a+b*x0
+path1 <- data.frame(x = y + x0, y = x + y0)
+segment1 <- data.frame(x = x0, y = y0 - k*sigma, xend = x0, yend = y0 + k*sigma)
+df[df$type, ]$y <- df[df$type, ]$y
+
+
+p = ggplot() + geom_point(data = df[!df$type, ], aes(x = x, y = y), alpha = 0.2, size = 3)
+p = p + geom_point(data = df[df$type, ], aes(x = x, y = y), size = 3)
+p = p + geom_path(data = path1, aes(x = x, y = y), color = "orange")
+p = p + geom_segment(aes(x=x,y=y,xend=xend,yend=yend), data = segment1, lty = 2)
+p = p + geom_point(data = df[df$type, ], aes(x = x + 2, y = y + 4), size = 3)
+p = p + geom_path(data = path1, aes(x = x + 2, y = y + 4), color = "orange")
+p = p + geom_segment(aes(x=x + 2,y=y + 4,xend=xend + 2,yend=yend + 4), data = segment1, lty = 2)
+p = p + geom_point(data = df[df$type, ], aes(x = x - 2, y = y - 4), size = 3)
+p = p + geom_path(data = path1, aes(x = x - 2, y = y - 4), color = "orange")
+p = p + geom_segment(aes(x=x - 2,y=y - 4,xend=xend - 2,yend=yend - 4), data = segment1, lty = 2)
+p = p + geom_abline(slope = b, intercept = a)
+
+p
+
+ggsave("../figure/ftrue.pdf", width = 5, height = 3)
\ No newline at end of file
diff --git a/slides/information-theory/rsrc/make_gaussian_mixture_plots.R b/slides/information-theory/rsrc/make_gaussian_mixture_plots.R
new file mode 100644
index 00000000..e5b17238
--- /dev/null
+++ b/slides/information-theory/rsrc/make_gaussian_mixture_plots.R
@@ -0,0 +1,114 @@
+library(ggplot2)
+library(MASS)
+library(gridExtra)
+
+
+# Define parameters for the Gaussian distributions
+mean1 <- c(0, 0)
+mean2 <- c(5, 5)
+covariance <- matrix(c(1, 0, 0, 1), nrow = 2)
+
+# Generate samples
+set.seed(0)
+samples1 <- mvrnorm(n = 1000, mu = mean1, Sigma = covariance)
+samples2 <- mvrnorm(n = 1000, mu = mean2, Sigma = covariance)
+samples <- rbind(samples1, samples2)
+
+# Create a data frame
+df <- data.frame(x = samples[,1], y = samples[,2])
+
+# Create the main contour plot
+p_main <- ggplot(df, aes(x = x, y = y)) +
+  geom_density_2d_filled() +
+  theme_minimal() +
+  theme(axis.title = element_blank(),
+        axis.text = element_blank(),
+        axis.ticks = element_blank(),
+        panel.grid = element_blank()) +
+  theme(legend.position = "none")
+
+# Create the top marginal plot (X-axis)
+p_top <- ggplot(df, aes(x = x)) +
+  geom_density(aes(y = after_stat(density)), fill = "blue", alpha = 0.5) +
+  theme_void() +
+  theme(plot.margin = margin(t = 0, r = 30, b = 0, l = 0),
+        axis.title.x = element_text(size = 20)) +
+  labs(x = "x1")
+
+# Create the right marginal plot (Y-axis)
+p_right <- ggplot(df, aes(x = y)) +
+  geom_density(aes(y = after_stat(density)), fill = "blue", alpha = 0.5) +
+  theme_void() +
+  theme(plot.margin = margin(t = 0, r = 30, b = 0, l = 0),
+        axis.title.y = element_text(size = 20)) +
+  labs(x = "x2") +
+  coord_flip()
+
+# Arrange the plots together
+empty <- ggplot() + geom_blank(aes(x = 0, y = 0)) + theme_void()
+p1 <- arrangeGrob(p_top, empty, p_main, p_right,
+                  ncol = 2, nrow = 2,
+                  widths = c(4, 0.5),
+                  heights = c(2, 7))
+
+
+# Define parameters for the Gaussian distributions
+mean1 <- c(0, 0)
+mean2 <- c(10, 10)
+mean3 <- c(10, 0)
+mean4 <- c(0, 10)
+
+# Generate samples
+set.seed(0)
+samples1 <- mvrnorm(n = 1000, mu = mean1, Sigma = covariance)
+samples2 <- mvrnorm(n = 1000, mu = mean2, Sigma = covariance)
+samples3 <- mvrnorm(n = 1000, mu = mean3, Sigma = covariance)
+samples4 <- mvrnorm(n = 1000, mu = mean4, Sigma = covariance)
+
+samples <- rbind(samples1, samples2, samples3, samples4)
+
+# Create a data frame
+df <- data.frame(x = samples[,1], y = samples[,2])
+
+# Create the main contour plot
+p_main <- ggplot(df, aes(x = x, y = y)) +
+  geom_density_2d_filled() +
+  theme_minimal() +
+  theme(axis.title = element_blank(),
+        axis.text = element_blank(),
+        axis.ticks = element_blank(),
+        panel.grid = element_blank()) +
+  theme(legend.position = "none")
+
+# Create the top marginal plot (X-axis)
+p_top <- ggplot(df, aes(x = x)) +
+  geom_density(aes(y = after_stat(density)), fill = "blue", alpha = 0.5) +
+  theme_void() +
+  theme(plot.margin = margin(t = 0, r = 30, b = 0, l = 0),
+        axis.title.x = element_text(size = 20)) +
+  labs(x = "x1")
+
+# Create the right marginal plot (Y-axis)
+p_right <- ggplot(df, aes(x = y)) +
+  geom_density(aes(y = after_stat(density)), fill = "blue", alpha = 0.5) +
+  theme_void() +
+  theme(plot.margin = margin(t = 0, r = 30, b = 0, l = 0),
+        axis.title.y = element_text(size = 20)) +
+  labs(x = "x2") +
+  coord_flip()
+
+# Arrange the plots together
+empty <- ggplot() + geom_blank(aes(x = 0, y = 0)) + theme_void()
+p2 <- arrangeGrob(p_top, empty, p_main, p_right,
+                  ncol = 2, nrow = 2,
+                  widths = c(4, 0.5),
+                  heights = c(2, 7))
+
+# Display the plot
+grid::grid.newpage()  # grid is only imported (not attached) by gridExtra, so qualify the calls
+grid::grid.draw(p2)
+
+plot = grid.arrange(p1, p2, ncol = 2)
+
+ggsave(file = "../figure/gaussian_mixture_with_marginals.png", plot = plot, width = 24, height = 8, dpi = 300)
+
diff --git a/slides/information-theory/rsrc/make_gaussian_mixture_scatter.R b/slides/information-theory/rsrc/make_gaussian_mixture_scatter.R
new file mode 100644
index 00000000..abff1f60
--- /dev/null
+++ b/slides/information-theory/rsrc/make_gaussian_mixture_scatter.R
@@ -0,0 +1,61 @@
+library(ggplot2)
+library(MASS)
+library(gridExtra)
+
+
+mean1 <- c(6, 0)
+mean2 <- c(0, 10)
+covariance <- matrix(c(1, 0, 0, 1), nrow = 2)
+
+# Generate samples
+set.seed(0)
+samples1 <- mvrnorm(n = 1000, mu = mean1, Sigma = covariance)
+samples2 <- mvrnorm(n = 1000, mu = mean2, Sigma = covariance)
+samples <- rbind(samples1, samples2)
+
+# Additional bivariate gaussian with mean c(0,0)
+q1 <- mvrnorm(n = 100, mu = c(0, 0), Sigma = covariance)
+q2 <- mvrnorm(n = 100, mu = c(2.5, 5), Sigma = matrix(c(1.5, 0.5, 0.5, 1.5), nrow = 2))
+q3 <- mvrnorm(n = 100, mu = c(6, 0), Sigma = covariance)
+q4 <- mvrnorm(n = 100, mu = c(0, 10), Sigma = covariance)
+
+# Create a data frame
+df <- data.frame(x = samples[,1], y = samples[,2])
+q1_df <- data.frame(x = q1[,1], y = q1[,2])
+q2_df <- data.frame(x = q2[,1], y = q2[,2])
+q3_df <- data.frame(x = q3[,1], y = q3[,2])
+q4_df <- data.frame(x = q4[,1], y = q4[,2])
+
+p1 <- ggplot(df, aes(x = x, y = y)) +
+  geom_density_2d_filled() +
+  geom_point(data = q1_df, aes(x = x, y = y), color = "red", size = 0.5) +
+  theme_minimal() +
+  labs(x = expression(theta[1]), y = expression(theta[2]), title = "") +
+  theme(legend.position = "none",
+        plot.title = element_text(color = "red", size = 15, hjust = 0.5, face = "bold"))
+
+
+p2 <- ggplot(df, aes(x = x, y = y)) +
+  geom_density_2d_filled() +
+  geom_point(data = q2_df, aes(x = x, y = y), color = "red", size = 0.5) +
+  theme_minimal() +
+  labs(x = expression(theta[1]), y = expression(theta[2]), title = "Sampling from q(x)") +
+  theme(legend.position = "none",
+        plot.title = element_text(color = "red", size = 15, hjust = 0.5, face = "bold"))
+
+# Update p3
+p3 <- ggplot(df, aes(x = x, y = y)) +
+  geom_density_2d_filled() +
+  geom_point(data = q3_df, aes(x = x, y = y), color = "red", size = 0.5) +
+  geom_point(data = q4_df, aes(x = x, y = y), color = "red", size = 0.5) +
+  theme_minimal() +
+  labs(x = expression(theta[1]), y = expression(theta[2]), title = "") +
+  theme(legend.position = "none",
+        plot.title = element_text(color = "red", size = 15, hjust = 0.5, face = "bold"))
+
+
+
+plot = grid.arrange(p1, p2, p3, ncol = 3)
+
+
+ggsave("../figure/gaussian_mixture_scatter.png", plot = plot, width = 11, height = 3)
diff --git a/slides/information-theory/rsrc/make_kl_fitting_plot.R b/slides/information-theory/rsrc/make_kl_fitting_plot.R
new file mode 100644
index 00000000..972d1082
--- /dev/null
+++ b/slides/information-theory/rsrc/make_kl_fitting_plot.R
@@ -0,0 +1,58 @@
+library(ggplot2)
+library(MASS)
+library(gridExtra)
+
+########## CREATE NORMAL DISTRIBUTIONS
+
+
+set.seed(123)
+
+x <- seq(-4, 7, length.out = 1000)
+normal_density1 <- dnorm(x, 0, 1)
+normal_density2 <- dnorm(x, 3, 1)
+data <- data.frame(x = x, NormalDensity1 = normal_density1, NormalDensity2 = normal_density2)
+p = ggplot(data, aes(x = x)) +
+  geom_line(aes(y = NormalDensity1), color = "blue", size = 1) +
+  geom_line(aes(y = NormalDensity2), color = "red", size = 1) +
+  labs(x = "x",
+       y = "Density") +
+  geom_text(aes(x = 1.5, y = 0.34, label = "?"), color = "black", size = 15)
+
+ggsave("../figure/normal_distributions.png", plot = p, width = 7, height = 3)
+
+
+
+# Generate samples from both distributions
+
+samples1 <- rnorm(1000, 0, 1)
+samples2 <- rnorm(1000, 5, 1)
+
+# Combine samples to form a Gaussian mixture
+mixture_samples <- c(samples1, samples2)
+
+data <- data.frame(value = mixture_samples)
+
+# Use density function to estimate the density of the mixture
+density_data <- density(mixture_samples, bw = "nrd0")
+density_df <- data.frame(value = density_data$x, density = density_data$y)
+
+# Plotting the density using ggplot2
+p1 = ggplot(density_df, aes(x = value, y = density)) +
+  geom_line(color = "blue") +
+  labs(title = "Reverse KL",
+       x = "x",
+       y = "Density") +
+  stat_function(fun = dnorm, args = list(mean = 0, sd = 1), color = "red")
+
+
+p2 <- ggplot(density_df, aes(x = value, y = density)) +
+  geom_line(aes(color = "p(x)")) +
+  labs(title = "Forward KL",
+       x = "x",
+       y = "Density") +
+  stat_function(fun = dnorm, args = list(mean = 2.5, sd = 3), aes(color = "q(x)")) +
+  scale_color_manual(name = "", values = c("p(x)" = "blue", "q(x)" = "red")) +
+  theme(legend.position = "right")
+
+plot = grid.arrange(p1, p2, ncol = 2, widths = c(1, 1.25))
+ggsave("../figure/kl_fitting_plot.png", plot = plot, width = 8, height = 3)
diff --git a/slides/information-theory/slides-info-kl-ml.tex b/slides/information-theory/slides-info-kl-ml.tex
index f578f63b..5585e87b 100644
--- a/slides/information-theory/slides-info-kl-ml.tex
+++ b/slides/information-theory/slides-info-kl-ml.tex
@@ -22,7 +22,11 @@
 \begin{vbframe} {Measuring Distribution Similarity in ML}
 \begin{itemize}
 \item Information theory provides tools (e.g., divergence measures) to quantify the similarity between probability distributions
-\includegraphics[width=0.4\linewidth]{figure_man/kl_ml_dist_sim.png}
+
+ \begin{center}
+ \includegraphics[width=0.7\linewidth]{figure/normal_distributions.png}
+ \end{center}
+
 \item The most prominent divergence measure is the KL divergence
 \item In ML, measuring (and maximizing) the similarity between probability distributions is a ubiquitous concept, which will be shown in the following.
 \end{itemize}
@@ -31,7 +35,9 @@
 \item \textbf{Probabilistic model fitting}\\
 Assume our learner is probabilistic, i.e., we model $p(y| \mathbf{x})$ (for example, ridge regression, logistic regression, ...).
-\includegraphics[width=0.4\linewidth]{figure_man/kl_ml_prob_fit.png}
+\begin{center}
+\includegraphics[width=0.5\linewidth]{figure/ftrue.pdf}
+\end{center}
 We want to minimize the difference between $p(y \vert \mathbf{x})$ and the conditional data generating process $\mathbb{P}_{y\vert\mathbf{x}}$ based on the data stemming from $\mathbb{P}_{y, \mathbf{x}}.$
@@ -47,7 +53,9 @@
 \item \textbf{Feature selection}
 In feature selection, we want to choose features the target strongly depends on.
-\includegraphics[width=0.6\linewidth]{figure_man/kl_ml_mi.png}
+\begin{center}
+\includegraphics[width=0.9\linewidth]{figure/gaussian_mixture_with_marginals.png}
+\end{center}
 We can measure dependency by measuring the similarity between $p(\mathbf{x}, y)$ and $p(\mathbf{x})\cdot p(y).$ \\
 We will later see that measuring this similarity with KL leads to the concept of mutual information.
@@ -60,7 +68,9 @@
 \item \textbf{Variational inference (VI)}
 Our data can also induce probability distributions: By Bayes' theorem it holds that the posterior density
 $$p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}) = \frac{p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})}{\int p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})d\bm{\theta}}.$$
 However, computing this density analytically is usually intractable.
-\includegraphics[width=0.99\linewidth]{figure_man/kl_ml_vi.png}
+\begin{center}
+\includegraphics[width=0.99\linewidth]{figure/gaussian_mixture_scatter.png}
+\end{center}
 In VI, we want to fit a density $q_{\bm{\phi}}$ with parameters $\bm{\phi}$ to $p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}).$
@@ -122,8 +132,11 @@
 $\Rightarrow$ We can estimate the gradient of the reverse KL without bias (even if we only have an unnormalized target distribution)
 \end{itemize}
 \framebreak
+
+\begin{center}
+\includegraphics[width=0.7\linewidth]{figure/kl_fitting_plot.png}
+\end{center}
-\includegraphics[width=0.6\linewidth]{figure_man/kl_ml_fkl_rkl.png} \\
 The asymmetry of the KL has the following implications:
 \begin{itemize}
 \item The forward KL $D_{KL}(p\|q_{\bm{\phi}}) = \E_{\xv \sim p} \log\left(\frac{p(\xv)}{q_{\bm{\phi}}(\xv)}\right)$ is mass-covering since $p(\xv)\log\left(\frac{p(\xv)}{q_{\bm{\phi}}(\xv)}\right) \approx 0$ if $p(\xv) \approx 0$ (as long as both distributions do not differ extremely)
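
Aside (not part of the committed files): the mass-covering vs. mode-seeking behaviour shown in kl_fitting_plot.png can also be checked numerically. Below is a minimal, illustrative R sketch; all object names (p_dens, q_dens, forward_kl, reverse_kl) are made up here, and the KL integrals are replaced by a simple Riemann approximation on a grid. It fits a single Gaussian q to the same 0.5*N(0,1) + 0.5*N(5,1) mixture used in make_kl_fitting_plot.R by minimizing the forward and the reverse KL with optim().

# Illustrative sketch only; names are hypothetical and not taken from the rsrc/ scripts.
p_dens <- function(x) 0.5 * dnorm(x, 0, 1) + 0.5 * dnorm(x, 5, 1)

grid_x <- seq(-6, 11, length.out = 2000)   # evaluation grid covering both modes
dx <- grid_x[2] - grid_x[1]

# q is a single Gaussian; par = c(mean, log(sd)) keeps the sd positive during optimization.
# The pmax floor avoids log(0) when the optimizer tries very small sds.
q_dens <- function(par) pmax(dnorm(grid_x, mean = par[1], sd = exp(par[2])), 1e-300)

# Forward KL D(p || q) = E_{x ~ p} log(p(x)/q(x)), approximated on the grid
forward_kl <- function(par) sum(p_dens(grid_x) * log(p_dens(grid_x) / q_dens(par))) * dx

# Reverse KL D(q || p) = E_{x ~ q} log(q(x)/p(x)), approximated on the grid
reverse_kl <- function(par) sum(q_dens(par) * log(q_dens(par) / p_dens(grid_x))) * dx

fit_fwd <- optim(c(0, 0), forward_kl)   # mass-covering: mean near 2.5, large sd (moment matching)
fit_rev <- optim(c(0, 0), reverse_kl)   # mode-seeking: stays on the mode at 0, sd close to 1

rbind(forward = c(mean = fit_fwd$par[1], sd = exp(fit_fwd$par[2])),
      reverse = c(mean = fit_rev$par[1], sd = exp(fit_rev$par[2])))

Under these assumptions, the forward-KL fit ends up covering both mixture components (roughly the mixture's mean and standard deviation), while the reverse-KL fit started at 0 locks onto the left component, matching the forward/reverse panels produced by make_kl_fitting_plot.R.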