diff --git a/slides/information-theory/figure/cross_entropy_plot_1.png b/slides/information-theory/figure/cross_entropy_plot_1.png
new file mode 100644
index 00000000..7f196cab
Binary files /dev/null and b/slides/information-theory/figure/cross_entropy_plot_1.png differ
diff --git a/slides/information-theory/figure/cross_entropy_plot_2.png b/slides/information-theory/figure/cross_entropy_plot_2.png
new file mode 100644
index 00000000..e8282bdf
Binary files /dev/null and b/slides/information-theory/figure/cross_entropy_plot_2.png differ
diff --git a/slides/information-theory/figure/kl_ce_comparison.png b/slides/information-theory/figure/kl_ce_comparison.png
new file mode 100644
index 00000000..dc162960
Binary files /dev/null and b/slides/information-theory/figure/kl_ce_comparison.png differ
diff --git a/slides/information-theory/rsrc/make_cross_entropy_plots.R b/slides/information-theory/rsrc/make_cross_entropy_plots.R
new file mode 100644
index 00000000..37698cb7
--- /dev/null
+++ b/slides/information-theory/rsrc/make_cross_entropy_plots.R
@@ -0,0 +1,158 @@
+library(ggplot2)
+library(gridExtra)
+library(extraDistr)
+
+### CREATE CROSS-ENTROPY DECOMPOSITION PLOTS FOR N(0,1) AND LP(0,3)
+
+set.seed(123)
+
+# Computes H(p), D_KL(p||q) and H(p||q) by numerical integration and plots the
+# densities, the integrands and the decomposition H(p||q) = H(p) + D_KL(p||q);
+# `first` states which of the two distributions plays the role of p.
+cross_ent <- function(x, p, q, first) {
+  ent_p <- -p * log(p)
+  kl <- p * log(p / q)
+  cross_ent1 <- ent_p + kl
+  cross_ent2 <- -p * log(q)
+
+  data <- data.frame(
+    x = x,
+    P = p,
+    Q = q,
+    EntP = ent_p,
+    KL = kl,
+    Cross_Ent1 = cross_ent1,
+    Cross_Ent2 = cross_ent2
+  )
+
+  if (first == "normal") {
+    kl_int <- function(x) {
+      p <- dnorm(x, 0, 1)
+      q <- dlaplace(x, 0, 3)
+      log_ratio <- log(p / q)
+      p * log_ratio
+    }
+    result <- integrate(kl_int, lower = -20, upper = 20)
+    kl <- round(result$value, 2)
+
+    ent_int <- function(x) {
+      p <- dnorm(x, 0, 1)
+      q <- dlaplace(x, 0, 3)
+      -p * log(p)
+    }
+    result <- integrate(ent_int, lower = -20, upper = 20)
+    entropy <- round(result$value, 2)
+
+    cross_entropy1 <- entropy + kl
+
+    cross_int <- function(x) {
+      p <- dnorm(x, 0, 1)
+      q <- dlaplace(x, 0, 3)
+      -p * log(q)
+    }
+    result <- integrate(cross_int, lower = -20, upper = 20)
+    cross_entropy2 <- round(result$value, 2)
+
+  } else if (first == "laplace") {
+    kl_int <- function(x) {
+      q <- dnorm(x, 0, 1)
+      p <- dlaplace(x, 0, 3)
+      log_ratio <- log(p / q)
+      p * log_ratio
+    }
+    result <- integrate(kl_int, lower = -20, upper = 20)
+    kl <- round(result$value, 2)
+
+    ent_int <- function(x) {
+      q <- dnorm(x, 0, 1)
+      p <- dlaplace(x, 0, 3)
+      -p * log(p)
+    }
+    result <- integrate(ent_int, lower = -20, upper = 20)
+    entropy <- round(result$value, 2)
+
+    cross_entropy1 <- entropy + kl
+
+    cross_int <- function(x) {
+      q <- dnorm(x, 0, 1)
+      p <- dlaplace(x, 0, 3)
+      -p * log(q)
+    }
+    result <- integrate(cross_int, lower = -20, upper = 20)
+    cross_entropy2 <- round(result$value, 2)
+  }
+
+  plot1 = ggplot(data, aes(x = x)) +
+    geom_line(aes(y = P),
+              color = "blue",
+              size = 1,
+              linetype = "solid") +
+    geom_line(aes(y = Q),
+              color = "red",
+              size = 1,
+              linetype = "solid") +
+    labs(title = "N(0,1) and LP(0,3) Densities", x = "x", y = "Density") +
+    scale_color_manual(values = c("blue"))
+
+  plot2 = ggplot(data, aes(x = x)) +
+    geom_line(
+      aes(y = EntP),
+      color = "blue",
+      size = 1,
+      linetype = "solid"
+    ) +
+    geom_line(
+      aes(y = KL),
+      color = "orange",
+      size = 1,
+      linetype = "solid"
+    ) +
+    geom_ribbon(aes(ymin = KL, ymax = EntP), alpha = 0.2) +
+    geom_ribbon(aes(ymin = 0, ymax = KL), alpha = 0.2) +
+    labs(
+      title = sprintf("H(p) = %g, D_KL(p||q) = %g", entropy, kl),
+      x = "x",
+      y = "Integrals"
+    ) +
+    scale_color_manual(values = c("blue"))
+
+  plot3 = ggplot(data, aes(x = x)) +
+    geom_line(
+      aes(y = Cross_Ent1),
+      color = "darkgreen",
+      size = 1,
+      linetype = "solid"
+    ) +
+    geom_ribbon(aes(ymin = 0, ymax = Cross_Ent1), alpha = 0.2) +
+    labs(
+      title = sprintf("H(p||q) = %g + %g = %g", entropy, kl, cross_entropy1),
+      x = "x",
+      y = "Cross-Entropy"
+    ) +
+    scale_color_manual(values = c("blue"))
+
+  plot4 = ggplot(data, aes(x = x)) +
+    geom_line(
+      aes(y = Cross_Ent2),
+      color = "darkgreen",
+      size = 1,
+      linetype = "solid"
+    ) +
+    geom_ribbon(aes(ymin = 0, ymax = Cross_Ent2), alpha = 0.2) +
+    labs(
+      title = sprintf("H(p||q) = -Int[p(x)*log(q(x))dx] = %g", cross_entropy2),
+      x = "x",
+      y = "Cross-Entropy"
+    ) +
+    scale_color_manual(values = c("blue"))
+
+  plot = grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)
+
+  return(plot)
+
+}
+
+x <- seq(-4, 4, length.out = 1000)
+plot1 = cross_ent(x, p = dnorm(x, 0, 1), q = dlaplace(x, 0, 3), first = "normal")
+plot2 = cross_ent(x, p = dlaplace(x, 0, 3), q = dnorm(x, 0, 1), first = "laplace")
+
+ggsave("../figure/cross_entropy_plot_1.png", plot = plot1, width = 8, height = 5)
+ggsave("../figure/cross_entropy_plot_2.png", plot = plot2, width = 8, height = 5)
+
diff --git a/slides/information-theory/rsrc/make_kl_ce_comparison.R b/slides/information-theory/rsrc/make_kl_ce_comparison.R
new file mode 100644
index 00000000..a2fea6be
--- /dev/null
+++ b/slides/information-theory/rsrc/make_kl_ce_comparison.R
@@ -0,0 +1,67 @@
+library(ggplot2)
+library(gridExtra)
+library(extraDistr)
+
+set.seed(123)
+
+### CREATE KL AND CROSS-ENTROPY PLOTS FOR N(0,1) VS LP(0, sigma), VARYING THE SCALE sigma
+
+x <- seq(-4, 4, length.out = 1000)
+sigmas <- seq(0.1, 10, length.out = 1000)
+kls = list()
+ces = list()
+
+# KL divergence D_KL(N(0,1) || LP(0, sigma)) for each scale value
+for (i in seq_along(sigmas)) {
+
+  integrand <- function(x) {
+    p <- dnorm(x, 0, 1)
+    q <- dlaplace(x, 0, sigmas[i])
+    log_ratio <- log(p / q)
+    p * log_ratio
+  }
+
+  result <- integrate(integrand, lower = -20, upper = 20)
+  kl <- result$value
+  kls[[i]] = kl
+}
+
+# Cross-entropy H(N(0,1) || LP(0, sigma)) for each scale value
+for (i in seq_along(sigmas)) {
+
+  integrand <- function(x) {
+    p <- dnorm(x, 0, 1)
+    q <- dlaplace(x, 0, sigmas[i])
+    -p * log(q)
+  }
+
+  result <- integrate(integrand, lower = -20, upper = 20)
+  ce <- result$value
+  ces[[i]] = ce
+}
+
+data <- data.frame(Sigma = sigmas, KL = unlist(kls), CE = unlist(ces))
+
+min_kl <- round(min(data$KL), 2)
+min_ce <- round(min(data$CE), 2)
+minimizer_kl <- round(data$Sigma[which.min(data$KL)], 2)
+minimizer_ce <- round(data$Sigma[which.min(data$CE)], 2)
+
+plot1 = ggplot(data, aes(x = Sigma)) +
+  geom_line(aes(y = KL), color = "orange", size = 1, linetype = "solid") +
+  labs(title = "KL Divergence depending on Sigma", x = "Sigma", y = "KL Divergence") +
+  scale_y_continuous(limits = c(0, 5)) +
+  geom_text(aes(x = 5, y = 3, label = paste("min D_KL(p||q) =", min_kl)), color = "black", size = 3) +
+  geom_text(aes(x = 5, y = 2, label = paste("Minimizer =", minimizer_kl)), color = "black", size = 3) +
+  scale_color_manual(values = c("orange"))
+
+
+plot2 = ggplot(data, aes(x = Sigma)) +
+  geom_line(aes(y = CE), color = "darkgreen", size = 1, linetype = "solid") +
+  labs(title = "Cross-Entropy depending on Sigma", x = "Sigma", y = "Cross-Entropy") +
+  geom_text(aes(x = 5, y = 1.5, label = paste("min H(p||q) =", min_ce)), color = "black", size = 3) +
+  geom_text(aes(x = 5, y = 0.5, label = paste("Minimizer =", minimizer_ce)), color = "black", size = 3) +
+  scale_y_continuous(limits = c(0, 5)) +
+  scale_color_manual(values = c("darkgreen"))
+
+plot = grid.arrange(plot1, plot2, ncol = 2)
+
+ggsave("../figure/kl_ce_comparison.png", plot = plot, width = 8, height = 3)
diff --git a/slides/information-theory/slides-info-cross-entropy-kld.tex b/slides/information-theory/slides-info-cross-entropy-kld.tex
index b3045d7c..3fd58045 100644
--- a/slides/information-theory/slides-info-cross-entropy-kld.tex
+++ b/slides/information-theory/slides-info-cross-entropy-kld.tex
@@ -61,6 +61,31 @@
 \item Can now become negative, as the $h(p)$ can be negative!
 \end{itemize}
 \end{vbframe}
+
+\begin{vbframe}{Cross-Entropy Example}
+
+Let $p(x) = N(0,1)$ and $q(x) = LP(0, 3)$. We can visualize the decomposition
+$$
+H(p \| q) = H(p) + D_{KL}(p \| q)
+$$
+\begin{center}
+  \includegraphics[width = 0.8\textwidth]{figure/cross_entropy_plot_1.png}
+\end{center}
+
+\end{vbframe}
+
+\begin{vbframe}{Cross-Entropy Example}
+
+Now let $p(x) = LP(0, 3)$ and $q(x) = N(0,1)$, i.e., with the roles of the two densities swapped. Again we visualize
+$$
+H(p \| q) = H(p) + D_{KL}(p \| q)
+$$
+
+\begin{center}
+  \includegraphics[width = 0.8\textwidth]{figure/cross_entropy_plot_2.png}
+\end{center}
+
+\end{vbframe}
 \begin{vbframe}{Proof: Maximum of Differential Entropy}
 
 \textbf{Claim}: For a given variance, the continuous distribution that maximizes differential entropy is the Gaussian.
diff --git a/slides/information-theory/slides-info-ml.tex b/slides/information-theory/slides-info-ml.tex
index 98d28956..08d3111e 100644
--- a/slides/information-theory/slides-info-ml.tex
+++ b/slides/information-theory/slides-info-ml.tex
@@ -47,6 +47,16 @@
 \end{itemize}
 \end{vbframe}
 
+\begin{vbframe}{KL vs Cross-Entropy Example}
+Let $p(x) = N(0,1)$ and $q_{\thetab}(x) = LP(0, \sigma)$, where the Laplace scale $\sigma$ plays the role of $\thetab$, and consider again
+$$ \argmin_{\thetab} D_{KL}(p \| q_{\thetab}) = \argmin_{\thetab} -\E_{X \sim p} \log q(x|\thetab) = \argmin_{\thetab} H(p \| q_{\thetab}) $$
+Both objectives differ only by the constant $H(p)$, so they are minimized by the same scale $\sigma$:
+\begin{center}
+  \includegraphics[width=1\textwidth]{figure/kl_ce_comparison.png}
+\end{center}
+
+\end{vbframe}
+
 \begin{vbframe}{Cross-Entropy vs. Log-Loss}
 
 \begin{itemize}
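
The quantities visualized on the new cross-entropy slides have closed forms for p = N(0,1) and q = LP(0, b), which makes the figures easy to sanity-check. The following sketch is not part of the patch and assumes the extraDistr package is available; it compares the closed-form values for b = 3 with the numerical integration used in make_cross_entropy_plots.R:

library(extraDistr)

b <- 3                                  # Laplace scale used on the new slides
h_p <- 0.5 * log(2 * pi * exp(1))       # differential entropy of N(0, 1), ~ 1.42
ce  <- log(2 * b) + sqrt(2 / pi) / b    # H(p || q) = -E_p[log q(X)] for q = LP(0, b)
kl  <- ce - h_p                         # D_KL(p || q) = H(p || q) - H(p)

# Same quantities via quadrature, as in make_cross_entropy_plots.R:
ce_num <- integrate(function(x) -dnorm(x) * log(dlaplace(x, 0, b)), -20, 20)$value
kl_num <- integrate(function(x) dnorm(x) * log(dnorm(x) / dlaplace(x, 0, b)), -20, 20)$value

round(c(ce, ce_num, kl, kl_num), 4)     # closed form and quadrature should agree

Closed form and quadrature should agree to several decimal places, with H(p || q) splitting into H(p) plus D_KL(p || q) exactly as on the slides.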
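
The same closed form explains the kl_ce_comparison figure: H(p || q_sigma) = log(2 * sigma) + sqrt(2 / pi) / sigma, and D_KL(p || q_sigma) differs from it only by the constant H(p), so both curves are minimized at sigma = sqrt(2 / pi), roughly 0.8. A minimal sketch (again not part of the patch, assuming extraDistr) that checks this with a continuous optimizer instead of the grid search in make_kl_ce_comparison.R:

library(extraDistr)

ce_of <- function(s) integrate(function(x) -dnorm(x) * log(dlaplace(x, 0, s)), -20, 20)$value
kl_of <- function(s) integrate(function(x) dnorm(x) * log(dnorm(x) / dlaplace(x, 0, s)), -20, 20)$value

opt_ce <- optimize(ce_of, interval = c(0.1, 10))   # minimize cross-entropy over the scale
opt_kl <- optimize(kl_of, interval = c(0.1, 10))   # minimize KL divergence over the scale

# Both minimizers coincide at sqrt(2/pi) (about 0.80); the two objective values
# differ by H(p) = 0.5 * log(2 * pi * e) at every scale, hence also at the minimum.
c(opt_ce$minimum, opt_kl$minimum, sqrt(2 / pi))
c(opt_ce$objective - opt_kl$objective, 0.5 * log(2 * pi * exp(1)))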