Commit
Merge pull request #173 from slds-lmu/kl_tobi_1
add plots to kl_ml slides and R files
chriskolb authored Dec 20, 2023
2 parents 0a87b99 + 03ebc19 commit d593e69
Showing 10 changed files with 293 additions and 5 deletions.
Binary file added slides/information-theory/figure/ftrue.pdf
Binary file not shown.
Binary files added (previews not shown): slides/information-theory/figure/normal_distributions.png, gaussian_mixture_with_marginals.png, gaussian_mixture_scatter.png, kl_fitting_plot.png (the outputs of the four R scripts below).
42 changes: 42 additions & 0 deletions slides/information-theory/rsrc/make_ftrue_plot.R
@@ -0,0 +1,42 @@

library(ggplot2)

set.seed(123)

# 50 background points on [-2.5, 2.5] plus 10 highlighted points at x = 0
df = data.frame(x = runif(50, -2.5, 2.5), type = FALSE)
df = rbind(df, data.frame(x = rep(0, 10), type = TRUE))

# linear data-generating process with unit-variance Gaussian noise
df$y = 2 * df$x + rnorm(60)

lm_fit = lm(data = df, y ~ x)

k <- 5
sigma <- sigma(lm_fit)
ab <- coef(lm_fit); a <- ab[1]; b <- ab[2]

# normal density profile over +/- k*sigma, rescaled to peak height 1
x <- seq(-k*sigma, k*sigma, length.out = 50)
y <- dnorm(x, 0, sigma) / dnorm(0, 0, sigma)

# conditional density of y given x = x0, traced sideways along the fitted line,
# with a dashed segment marking +/- k*sigma around the fitted value
x0 <- 0
y0 <- a + b*x0
path1 <- data.frame(x = y + x0, y = x + y0)
segment1 <- data.frame(x = x0, y = y0 - k*sigma, xend = x0, yend = y0 + k*sigma)


# scatter plot, fitted regression line, and three conditional densities
# (at x = -2, 0, 2; shifting x by 2 shifts the fitted value by about 2b = 4)
p = ggplot() + geom_point(data = df[!df$type, ], aes(x = x, y = y), alpha = 0.2, size = 3)
p = p + geom_point(data = df[df$type, ], aes(x = x, y = y), size = 3)
p = p + geom_path(data = path1, aes(x = x, y = y), color = "orange")
p = p + geom_segment(aes(x = x, y = y, xend = xend, yend = yend), data = segment1, lty = 2)
p = p + geom_point(data = df[df$type, ], aes(x = x + 2, y = y + 4), size = 3)
p = p + geom_path(data = path1, aes(x = x + 2, y = y + 4), color = "orange")
p = p + geom_segment(aes(x = x + 2, y = y + 4, xend = xend + 2, yend = yend + 4), data = segment1, lty = 2)
p = p + geom_point(data = df[df$type, ], aes(x = x - 2, y = y - 4), size = 3)
p = p + geom_path(data = path1, aes(x = x - 2, y = y - 4), color = "orange")
p = p + geom_segment(aes(x = x - 2, y = y - 4, xend = xend - 2, yend = yend - 4), data = segment1, lty = 2)
p = p + geom_abline(slope = b, intercept = a)

p

ggsave("../figure/ftrue.pdf", plot = p, width = 5, height = 3)
114 changes: 114 additions & 0 deletions slides/information-theory/rsrc/make_gaussian_mixture_plots.R
@@ -0,0 +1,114 @@
library(ggplot2)
library(MASS)
library(grid)       # for grid.newpage()/grid.draw() below
library(gridExtra)


# Define parameters for the Gaussian distributions
mean1 <- c(0, 0)
mean2 <- c(5, 5)
covariance <- matrix(c(1, 0, 0, 1), nrow = 2)

# Generate samples
set.seed(0)
samples1 <- mvrnorm(n = 1000, mu = mean1, Sigma = covariance)
samples2 <- mvrnorm(n = 1000, mu = mean2, Sigma = covariance)
samples <- rbind(samples1, samples2)

# Create a data frame
df <- data.frame(x = samples[,1], y = samples[,2])

# Create the main contour plot
p_main <- ggplot(df, aes(x = x, y = y)) +
geom_density_2d_filled() +
theme_minimal() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank()) +
theme(legend.position = "none")

# Create the top marginal plot (X-axis)
p_top <- ggplot(df, aes(x = x)) +
geom_density(aes(y = after_stat(density)), fill = "blue", alpha = 0.5) +
theme_void() +
theme(plot.margin = margin(t = 0, r = 30, b = 0, l = 0),
axis.title.x = element_text(size = 20)) +
labs(x = "x1")

# Create the right marginal plot (Y-axis)
p_right <- ggplot(df, aes(x = y)) +
geom_density(aes(y = after_stat(density)), fill = "blue", alpha = 0.5) +
theme_void() +
theme(plot.margin = margin(t = 0, r = 30, b = 0, l = 0),
axis.title.y = element_text(size = 20)) +
labs(x = "x2") +
coord_flip()

# Arrange the plots together
empty <- ggplot() + geom_blank(aes(x = 0, y = 0)) + theme_void()
p1 <- arrangeGrob(p_top, empty, p_main, p_right,
ncol = 2, nrow = 2,
widths = c(4, 0.5),
heights = c(2, 7))


# Second panel: four well-separated Gaussian components (same recipe as above)
mean1 <- c(0, 0)
mean2 <- c(10, 10)
mean3 <- c(10, 0)
mean4 <- c(0, 10)

# Generate samples
set.seed(0)
samples1 <- mvrnorm(n = 1000, mu = mean1, Sigma = covariance)
samples2 <- mvrnorm(n = 1000, mu = mean2, Sigma = covariance)
samples3 <- mvrnorm(n = 1000, mu = mean3, Sigma = covariance)
samples4 <- mvrnorm(n = 1000, mu = mean4, Sigma = covariance)

samples <- rbind(samples1, samples2, samples3, samples4)

# Create a data frame
df <- data.frame(x = samples[,1], y = samples[,2])

# Create the main contour plot
p_main <- ggplot(df, aes(x = x, y = y)) +
geom_density_2d_filled() +
theme_minimal() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank()) +
theme(legend.position = "none")

# Create the top marginal plot (X-axis)
p_top <- ggplot(df, aes(x = x)) +
geom_density(aes(y = after_stat(density)), fill = "blue", alpha = 0.5) +
theme_void() +
theme(plot.margin = margin(t = 0, r = 30, b = 0, l = 0),
axis.title.x = element_text(size = 20)) +
labs(x = "x1")

# Create the right marginal plot (Y-axis)
p_right <- ggplot(df, aes(x = y)) +
geom_density(aes(y = after_stat(density)), fill = "blue", alpha = 0.5) +
theme_void() +
theme(plot.margin = margin(t = 0, r = 30, b = 0, l = 0),
axis.title.y = element_text(size = 20)) +
labs(x = "x2") +
coord_flip()

# Arrange the plots together
empty <- ggplot() + geom_blank(aes(x = 0, y = 0)) + theme_void()
p2 <- arrangeGrob(p_top, empty, p_main, p_right,
ncol = 2, nrow = 2,
widths = c(4, 0.5),
heights = c(2, 7))

# Display the plot
grid.newpage()
grid.draw(p2)

plot = grid.arrange(p1, p2, ncol = 2)

ggsave(file = "../figure/gaussian_mixture_with_marginals.png", plot = plot, width = 24, height = 8, dpi = 300)

61 changes: 61 additions & 0 deletions slides/information-theory/rsrc/make_gaussian_mixture_scatter.R
@@ -0,0 +1,61 @@
library(ggplot2)
library(MASS)
library(gridExtra)


mean1 <- c(6, 0)
mean2 <- c(0, 10)
covariance <- matrix(c(1, 0, 0, 1), nrow = 2)

# Generate samples
set.seed(0)
samples1 <- mvrnorm(n = 1000, mu = mean1, Sigma = covariance)
samples2 <- mvrnorm(n = 1000, mu = mean2, Sigma = covariance)
samples <- rbind(samples1, samples2)

# Samples from candidate approximating distributions q to overlay on the target
q1 <- mvrnorm(n = 100, mu = c(0, 0), Sigma = covariance)
q2 <- mvrnorm(n = 100, mu = c(2.5, 5), Sigma = matrix(c(1.5, 0.5, 0.5, 1.5), nrow = 2))
q3 <- mvrnorm(n = 100, mu = c(6, 0), Sigma = covariance)
q4 <- mvrnorm(n = 100, mu = c(0, 10), Sigma = covariance)

# Create a data frame
df <- data.frame(x = samples[,1], y = samples[,2])
q1_df <- data.frame(x = q1[,1], y = q1[,2])
q2_df <- data.frame(x = q2[,1], y = q2[,2])
q3_df <- data.frame(x = q3[,1], y = q3[,2])
q4_df <- data.frame(x = q4[,1], y = q4[,2])

p1 <- ggplot(df, aes(x = x, y = y)) +
geom_density_2d_filled() +
geom_point(data = q1_df, aes(x = x, y = y), color = "red", size = 0.5) +
theme_minimal() +
labs(x = expression(theta[1]), y = expression(theta[2]), title = "") +
theme(legend.position = "none",
plot.title = element_text(color = "red", size = 15, hjust = 0.5, face = "bold"))


p2 <- ggplot(df, aes(x = x, y = y)) +
geom_density_2d_filled() +
geom_point(data = q2_df, aes(x = x, y = y), color = "red", size = 0.5) +
theme_minimal() +
labs(x = expression(theta[1]), y = expression(theta[2]), title = "Sampling from q(x)") +
theme(legend.position = "none",
plot.title = element_text(color = "red", size = 15, hjust = 0.5, face = "bold"))

# p3: samples at the two modes of the target
p3 <- ggplot(df, aes(x = x, y = y)) +
geom_density_2d_filled() +
geom_point(data = q3_df, aes(x = x, y = y), color = "red", size = 0.5) +
geom_point(data = q4_df, aes(x = x, y = y), color = "red", size = 0.5) +
theme_minimal() +
labs(x = expression(theta[1]), y = expression(theta[2]), title = "") +
theme(legend.position = "none",
plot.title = element_text(color = "red", size = 15, hjust = 0.5, face = "bold"))



plot = grid.arrange(p1, p2, p3, ncol = 3)


ggsave("../figure/gaussian_mixture_scatter.png", plot = plot, width = 11, height = 3)
58 changes: 58 additions & 0 deletions slides/information-theory/rsrc/make_kl_fitting_plot.R
@@ -0,0 +1,58 @@
library(ggplot2)
library(gridExtra)

########## CREATE NORMAL DISTRIBUTIONS


set.seed(123)

x <- seq(-4, 7, length.out = 1000)
normal_density1 <- dnorm(x, 0, 1)
normal_density2 <- dnorm(x, 3, 1)
data <- data.frame(x = x, NormalDensity1 = normal_density1, NormalDensity2 = normal_density2)
p = ggplot(data, aes(x = x)) +
geom_line(aes(y = NormalDensity1), color = "blue", linewidth = 1) +
geom_line(aes(y = NormalDensity2), color = "red", linewidth = 1) +
labs(x = "x", y = "Density") +
annotate("text", x = 1.5, y = 0.34, label = "?", color = "black", size = 15)

ggsave("../figure/normal_distributions.png", plot = p, width = 7, height = 3)



# Generate samples from both distributions

samples1 <- rnorm(1000, 0, 1)
samples2 <- rnorm(1000, 5, 1)

# Combine samples to form a Gaussian mixture
mixture_samples <- c(samples1, samples2)

data <- data.frame(value = mixture_samples)

# Use density function to estimate the density of the mixture
density_data <- density(mixture_samples, bw = "nrd0")
density_df <- data.frame(value = density_data$x, density = density_data$y)

# Plotting the density using ggplot2
p1 = ggplot(density_df, aes(x = value, y = density)) +
geom_line(color = "blue") +
labs(title = "Reverse KL",
x = "x",
y = "Density") +
stat_function(fun = dnorm, args = list(mean = 0, sd = 1), color = "red")


p2 <- ggplot(density_df, aes(x = value, y = density)) +
geom_line(aes(color = "p(x)")) +
labs(title = "Forward KL",
x = "x",
y = "Density") +
stat_function(fun = dnorm, args = list(mean = 2.5, sd = 3), aes(color = "q(x)")) +
scale_color_manual(name = "", values = c("p(x)" = "blue", "q(x)" = "red")) +
theme(legend.position = "right")

plot = grid.arrange(p1, p2, ncol = 2, widths = c(1, 1.25))
ggsave("../figure/kl_fitting_plot.png", plot = plot, width = 8, height = 3)
23 changes: 18 additions & 5 deletions slides/information-theory/slides-info-kl-ml.tex
@@ -22,7 +22,11 @@
\begin{vbframe} {Measuring Distribution Similarity in ML}
\begin{itemize}
\item Information theory provides tools (e.g., divergence measures) to quantify the similarity between probability distributions
\includegraphics[width=0.4\linewidth]{figure_man/kl_ml_dist_sim.png}

\begin{center}
\includegraphics[width=0.7\linewidth]{figure/normal_distributions.png}
\end{center}

\item The most prominent divergence measure is the KL divergence (see the numeric sketch after this list)
\item In ML, measuring (and maximizing) the similarity between probability distributions is a ubiquitous concept, as the following examples show.
\end{itemize}
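As a numeric companion (an illustrative R sketch, not part of the committed files): for the two normals in the figure above, N(0, 1) and N(3, 1), the KL divergence has the Gaussian closed form log(s2/s1) + (s1^2 + (m1 - m2)^2)/(2 s2^2) - 1/2 and can be cross-checked by numerical integration.

# KL divergence between the two plotted normals, N(0, 1) and N(3, 1):
# Gaussian closed form vs. numerical integration.
kl_gauss <- function(m1, s1, m2, s2) {
  log(s2 / s1) + (s1^2 + (m1 - m2)^2) / (2 * s2^2) - 0.5
}
kl_closed <- kl_gauss(0, 1, 3, 1)  # = 4.5

integrand <- function(x) {
  dnorm(x, 0, 1) * (dnorm(x, 0, 1, log = TRUE) - dnorm(x, 3, 1, log = TRUE))
}
kl_numeric <- integrate(integrand, -Inf, Inf)$value
c(closed = kl_closed, numeric = kl_numeric)  # both approximately 4.5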
@@ -31,7 +35,9 @@
\item \textbf{Probabilistic model fitting}\\
Assume our learner is probabilistic, i.e., we model $p(y \vert \mathbf{x})$ (for example, ridge regression, logistic regression, ...).

\includegraphics[width=0.4\linewidth]{figure_man/kl_ml_prob_fit.png}
\begin{center}
\includegraphics[width=0.5\linewidth]{figure/ftrue.pdf}
\end{center}

We want to minimize the difference between $p(y \vert \mathbf{x})$ and the conditional data generating process $\mathbb{P}_{y\vert\mathbf{x}}$ based on the data stemming from $\mathbb{P}_{y, \mathbf{x}}.$
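A freestanding toy illustration (a hedged sketch, not part of the commit; the data-generating process mirrors make_ftrue_plot.R above): minimizing the Gaussian negative log-likelihood is the sample analogue of minimizing this difference, and it recovers the lm() fit.

# For y | x ~ N(a + b*x, sigma^2), minimizing the negative log-likelihood
# (the empirical counterpart of minimizing the KL to the conditional DGP)
# reproduces the least-squares coefficients.
set.seed(123)
x <- runif(60, -2.5, 2.5)
y <- 2 * x + rnorm(60)

nll <- function(par) {
  -sum(dnorm(y, mean = par[1] + par[2] * x, sd = exp(par[3]), log = TRUE))
}
fit <- optim(c(0, 0, 0), nll)
rbind(optim = fit$par[1:2], lm = unname(coef(lm(y ~ x))))  # intercept and slope agree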

@@ -47,7 +53,9 @@
\item \textbf{Feature selection}\\
In feature selection, we want to choose features the target strongly depends on.

\includegraphics[width=0.6\linewidth]{figure_man/kl_ml_mi.png}
\begin{center}
\includegraphics[width=0.9\linewidth]{figure/gaussian_mixture_with_marginals.png}
\end{center}

We can measure this dependency via the similarity between $p(\mathbf{x}, y)$ and $p(\mathbf{x})\cdot p(y).$ \\
We will later see that measuring this similarity with KL leads to the concept of mutual information.
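A rough plug-in illustration of this idea (a hedged sketch, not part of the commit): estimate the mutual information as the KL divergence between binned joint frequencies and the product of the binned marginals; it is clearly positive when the variables are dependent and near zero under independence.

# Crude histogram-based estimate of I(x; y) = KL(p(x, y) || p(x) p(y)).
set.seed(0)
x <- rnorm(10000)
y <- x + rnorm(10000)  # y depends on x, so the estimate should be well above 0

pxy <- table(cut(x, 20), cut(y, 20)) / length(x)  # joint bin frequencies
prod_xy <- outer(rowSums(pxy), colSums(pxy))      # product of the marginals

nz <- pxy > 0                                     # skip empty bins (0 * log 0 = 0)
sum(pxy[nz] * log(pxy[nz] / prod_xy[nz]))         # around 0.35 nats here; ~0 if independent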
@@ -60,7 +68,9 @@
\item \textbf{Variational inference (VI)}\\
Our data can also induce probability distributions: by Bayes' theorem, the posterior density is $$p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}) = \frac{p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})}{\int p(\mathbf{y}|\mathbf{X}, \bm{\theta})p(\bm{\theta})d\bm{\theta}}.$$ However, computing this density analytically is usually intractable.

\includegraphics[width=0.99\linewidth]{figure_man/kl_ml_vi.png}
\begin{center}
\includegraphics[width=0.99\linewidth]{figure/gaussian_mixture_scatter.png}
\end{center}

In VI, we want to fit a density $q_{\bm{\phi}}$ with parameters $\bm{\phi}$ to
$p(\bm{\theta}\vert \mathbf{X}, \mathbf{y}).$
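A minimal one-dimensional sketch of this idea (illustrative, with a hypothetical toy target; not part of the committed files): fit q_phi by minimizing a Monte Carlo estimate of the reverse KL. Only an unnormalized target density is needed, which also previews the point about unbiased reverse-KL gradient estimates below.

# Fit q_phi = N(m, s) to an unnormalized target by minimizing a Monte Carlo
# estimate of the reverse KL, E_q[log q - log p_unnorm]; the unknown log
# normalizing constant only shifts the objective, not its minimizer.
set.seed(1)
log_p_unnorm <- function(th) dnorm(th, 2, 0.5, log = TRUE) + log(7)  # toy target; log(7) plays the unknown constant

eps <- rnorm(2000)  # fixed noise (reparameterization), so optim sees a deterministic objective
reverse_kl <- function(phi) {
  m <- phi[1]; s <- exp(phi[2])
  th <- m + s * eps
  mean(dnorm(th, m, s, log = TRUE) - log_p_unnorm(th))
}
fit <- optim(c(0, 0), reverse_kl)
c(mean = fit$par[1], sd = exp(fit$par[2]))  # approximately (2, 0.5)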
@@ -122,8 +132,11 @@
$\Rightarrow$ We can estimate the gradient of the reverse KL without bias (even if we only have an unnormalized target distribution)
\end{itemize}
\framebreak

\begin{center}
\includegraphics[width=0.7\linewidth]{figure/kl_fitting_plot.png}
\end{center}

\includegraphics[width=0.6\linewidth]{figure_man/kl_ml_fkl_rkl.png} \\
The asymmetry of the KL has the following implications:
\begin{itemize}
\item The forward KL $D_{KL}(p\|q_{\bm{\phi}}) = \E_{\xv \sim p} \log\left(\frac{p(\xv)}{q_{\bm{\phi}}(\xv)}\right)$ is mass-covering since $p(\xv)\log\left(\frac{p(\xv)}{q_{\bm{\phi}}(\xv)}\right) \approx 0$ if $p(\xv) \approx 0$ (as long as the two distributions do not differ too extremely); see the numeric sketch below.
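These two behaviors can be checked numerically (an illustrative sketch using the mixture from make_kl_fitting_plot.R above; not part of the commit):

# Forward vs. reverse KL of single Gaussians against the bimodal mixture
# p(x) = 0.5 N(0, 1) + 0.5 N(5, 1) from make_kl_fitting_plot.R.
p <- function(x) 0.5 * dnorm(x, 0, 1) + 0.5 * dnorm(x, 5, 1)
q_mode <- function(x) dnorm(x, 0, 1)    # single-mode candidate
q_wide <- function(x) dnorm(x, 2.5, 3)  # broad, mass-covering candidate

xs <- seq(-10, 15, length.out = 40000)  # fine grid for a Riemann-sum KL
kl <- function(f, g) sum(f(xs) * (log(f(xs)) - log(g(xs)))) * (xs[2] - xs[1])

round(c(forward_mode = kl(p, q_mode), forward_wide = kl(p, q_wide),
        reverse_mode = kl(q_mode, p), reverse_wide = kl(q_wide, p)), 2)
# the forward KL is far smaller for the broad q (it must cover both modes);
# the reverse KL is smaller for the single-mode q (it avoids regions where p ~ 0)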
