---
title: "Causal inference workshop by Richard McElreath"
author: "Jitao David Zhang"
date: "29/11/2021"
output: html_document
---
This document walks through the models introduced in the causal inference workshop given by Richard McElreath, to whom the credit should go. I have tweaked the models from time to time to make sure that I understand how they work. All errors and mistakes are my responsibility.
* [The workshop, Science Before Statistics: Causal Inference](https://www.youtube.com/watch?v=KNPYUVmY3NM&t=2084s)
* [The original R script posted on GitHub](https://github.com/rmcelreath/causal_salad_2021/blob/main/1_causal_salad.r)
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE,
fig.height = 4,
fig.width = 4,
fig.path = "figures/",
dev = c("png", "pdf"))
library(ribiosUtils)
library(ribiosIO)
library(rethinking)
library(dagitty)
library(ggplot2)
library(ggdag)
library(cmdstanr) ## installation instructions available at https://mc-stan.org/cmdstanr/articles/cmdstanr.html
library(ggpmisc)
library(ggthemes)
library(gridExtra)
ggplot2::theme_set(theme_minimal(base_size=14))
```
# The basics
## The simplest relation: two variables, one causing the other
```{r}
set.seed(1887)
N <- 50
x <- rnorm(N, mean=5)
y <- rnorm(N, mean=x*2)
xydata <- data.frame(x=x, y=y)
xylm <- lm(y~x, data=xydata)
my.formula <- y ~ x
yxPlot <- ggplot(xydata, aes(x=x, y=y)) + geom_point() +
geom_smooth(method = "lm", se=FALSE, color="darkgray", formula = my.formula) +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~")),
coef.digits=2, rr.digits=2,
parse = TRUE) +
ggtitle("True effect: 2.0")
print(yxPlot)
```
```{r}
yxPlainPlot <- ggplot(xydata, aes(x=x, y=y)) + geom_point()
print(yxPlainPlot)
```
```{r eval=FALSE}
## inspect the data interactively; not evaluated when knitting
View(xydata)
```
```{r fig.height=3, fig.width=5}
summary(xylm) ## the model fitted above: lm(y ~ x, data=xydata)
```
```{r xCy, fig.height=3, fig.width=3}
set.seed(1887)
model0 <- dagify(y ~ x,
exposure = "x",
outcome = "y")
model0 <- tidy_dagitty(model0,
layout="linear")
xCausesY <- ggplot(model0, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges_fan() +
geom_dag_text(size=12, vjust=0.3) +
theme_dag()
print(xCausesY)
```
```{r yCx, fig.height=3, fig.width=3}
set.seed(1887)
model0r <- dagify(x ~ y,
exposure = "y",
outcome = "x")
model0r <- tidy_dagitty(model0r,
layout="linear")
yCausesX <- ggplot(model0r, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges_fan() +
geom_dag_text(size=12, vjust=0.3) +
theme_dag()
print(yCausesX)
```
Yet another possibility, by Reichenbach's common cause principle, is that both X and Y are driven by a common cause U.
```{r latent}
set.seed(1887)
latmod <- dagify(x ~ u,
y ~ u,
latent = "u",
coords = list(x = c(x=0, u=1, y=2),
y= c(x=0, u=0, y=0))) %>%
tidy_dagitty(layout="auto")
latmodPlot <- ggplot(latmod %>%
dplyr::mutate(latent = ifelse(name %in% "u", TRUE, FALSE)),
aes(x = x, y = y, xend = xend, yend = yend, color=latent)) +
scale_color_manual(values=c("black", "lightgrey"), guide="none") +
geom_dag_point() +
geom_dag_edges_fan() +
geom_dag_text(size=12, vjust=0.3, color="white") +
theme_dag()
print(latmodPlot)
```
```{r latmodSim}
set.seed(1887)
N <- 50
u <- rnorm(N, mean=5)
x <- rnorm(N, mean=u)
y <- rnorm(N, mean=u*2)
xyudata <- data.frame(x=x, y=y, u=u)
xyulm <- lm(y~x, data=xyudata)
my.formula <- y ~ x
uyxPlot <- ggplot(xyudata, aes(x=x, y=y)) + geom_point() +
geom_smooth(method = "lm", se=FALSE, color="darkgray", formula = my.formula) +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~")),
coef.digits=2, rr.digits=2,
parse = TRUE) +
ggtitle("True effect: 0.0")
print(uyxPlot)
```
What happens if we swap the target variable and the predictor? We get an equally good fit.
```{r fig.width=9.75, fig.height=3.25}
xyPlot <- ggplot(xydata, aes(x=y, y=x)) + geom_point() +
geom_smooth(method = "lm", se=FALSE, color="darkgray", formula = my.formula) +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.x.rhs = "y",
eq.with.lhs = "italic(hat(x))~`=`~",
aes(label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~")),
coef.digits=2, rr.digits=2,
parse = TRUE) +
ggtitle("The reverse fit")
grid.arrange(grobs=list(yxPlot, xyPlot, uyxPlot), nrow=1, ncol=3)
```
Learning causality from correlation requires experiments. Below, the dashed line marks an intervention that sets x to low values: if x causes y, the predicted outcomes follow the regression line (red stars); if y causes x, the intervention leaves y unchanged (blue crosses).
```{r learningCausal}
set.seed(1887)
xknock <- rnorm(3, mean=1, sd=0.5)
xCauseYpred <- data.frame(x=xknock,
y=predict(lm(y~x, data=xydata), newdata=list(x=xknock)))
yCauseXpred <- data.frame(x=xknock, y=rnorm(3, mean=median(xydata$y)))
expLearnCausal <- ggplot(xydata, aes(x=x, y=y)) + geom_point() +
xlim(0, 8) + ylim(0, 16) +
geom_vline(xintercept=1, lty=2) +
geom_point(data=xCauseYpred, col="red", shape=8, cex=4) +
geom_point(data=yCauseXpred, col="navyblue", shape=4, cex=4)
print(expLearnCausal)
```
## A binary case
```{r}
set.seed(1887)
N <- 50
x <- rbern(N, prob=0.5)
y <- rnorm(N, mean=x*2)
xybid <- data.frame(x=factor(x), y=y)
ggplot(xybid, aes(x=x, y=y)) + geom_boxplot() + geom_point() +
geom_smooth(method = "lm", se=FALSE, color="darkgray", formula = my.formula) +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE) +
ggtitle("True effect: 2.0")
```
## A chain
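In a chain x → z → y, x influences y only through the mediator z. In the simulation below, z is a thresholded version of x and y depends only on z; conditioning on z therefore blocks the association between x and y.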
```{r, fig.height=3, fig.width=9}
set.seed(1888)
N <- 50
x <- rnorm(N, mean=5)
z <- ifelse(x<5, 0, 1)
y <- rnorm(N, mean=z*1.5)
xzydata <- data.frame(x=x, z=z, y=y)
xz.formula <- z ~ x
zy.formula <- y ~ z
xzPlot <- ggplot(xzydata, aes(x=x, y=z)) + geom_point() +
geom_smooth(method = "lm", se=FALSE, color="darkgray", formula = my.formula) +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(z))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE)
zyPlot <- ggplot(xzydata, aes(x=z, y=y)) + geom_point() +
geom_smooth(method = "lm", se=FALSE, color="darkgray", formula = my.formula) +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.x.rhs = "z",
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE)
xyWoZPlot <- ggplot(xzydata, aes(x=x, y=y)) + geom_point() +
geom_smooth(method = "lm", se=FALSE, color="darkgray", formula = my.formula) +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE)
grid.arrange(grobs=list(xzPlot, zyPlot, xyWoZPlot), nrow=1, ncol=3)
```
```{r chainCondition}
chainYvX <- lm(y~x, data=xzydata)
chainYvXcZ <- lm(y~x+z, data=xzydata)
plot(coeftab(chainYvX, chainYvXcZ), pars="x")
```
```{r xyPlot2, fig.height=3, fig.width=7}
xyPlot2 <- ggplot(xzydata, aes(x=x, y=y)) + geom_point() +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE) +
stat_quant_band(formula = y~x)
## +theme(axis.title = element_text(size=16))
xyCondZPlot2 <- ggplot(xzydata %>% dplyr::mutate(z=factor(z)), aes(x=x, y=y, col=z)) + geom_point() +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE) +
stat_quant_band(formula = y~x)
grid.arrange(grobs=list(xyPlot2, xyCondZPlot2), nrow=1, ncol=2)
```
```{r chainModel, fig.height=3, fig.width=3}
set.seed(1888)
chainModel <- dagify(z ~ x,
y ~ z,
exposure = "x",
outcome="y") %>%
tidy_dagitty(layout="linear")
ggplot(chainModel, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges_link() +
geom_dag_text(size=12, vjust=0.3) +
theme_dag()
```
## A fork
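In a fork x ← z → y, z is a common cause: x and y are marginally associated even though neither causes the other, and conditioning on z removes the association.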
```{r modelFork, fig.height=3, fig.width=3}
set.seed(1888)
modelFork <- dagify(y ~ z,
x ~ z)%>%
tidy_dagitty(seed=2, layout="tree")
ggplot(modelFork, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges_link() +
geom_dag_text(size=12, vjust=0.3) +
theme_dag()
```
```{r linearFork, fig.height=3, fig.width=3}
set.seed(1887)
lfmod <- dagify(x ~ z,
y ~ z,
coords = list(x = c(x=0, z=1, y=2),
y= c(x=0, z=0, y=0))) %>%
tidy_dagitty(layout="auto")
lfmodPlot <- ggplot(lfmod,
aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges_fan() +
geom_dag_text(size=12, vjust=0.3, color="white") +
theme_dag()
print(lfmodPlot)
```
```{r forkFit, fig.height=3, fig.width=7}
set.seed(1888)
N <- 50
z <- rbern(N, 0.5)
x <- rnorm(N, mean=z)
y <- rnorm(N, mean=z)
xzydata <- data.frame(x=x, z=factor(z), y=y)
xz.formula <- z ~ x
zy.formula <- y ~ z
forkXyPlot <- ggplot(xzydata, aes(x=x, y=y)) + geom_point() +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE) +
stat_quant_band(formula = y~x)
forkXyCondZPlot <- ggplot(xzydata, aes(x=x, y=y, col=z)) + geom_point() +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE) +
stat_quant_band(formula = y~x)
grid.arrange(grobs=list(forkXyPlot, forkXyCondZPlot), nrow=1, ncol=2)
```
```{r fig.height=3, fig.width=4.5}
xzPlot <- ggplot(xzydata, aes(x=z, y=x)) + geom_boxplot() + geom_point() +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(x))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE) +
ggtitle("True effect: 1.0")
yzPlot <- ggplot(xzydata, aes(x=z, y=y)) + geom_boxplot() + geom_point() +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE) +
ggtitle("True effect: 1.0")
grid.arrange(grobs=list(xzPlot, yzPlot), nrow=1, ncol=2)
```
```{r eval=FALSE}
## inspect the data interactively; not evaluated when knitting
View(xzydata)
```
## A collider
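In a collider x → z ← y, x and y are marginally independent, but conditioning on z induces a spurious association between them.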
```{r modelCollide, fig.height=3, fig.width=3}
set.seed(1888)
modelCollider <- dagify(z ~ x,
z ~ y) %>%
tidy_dagitty(layout="sugiyama")
ggplot(modelCollider, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges_link() +
geom_dag_text(size=12, vjust=0.3) +
theme_dag()
```
```{r linearCollider, fig.height=3, fig.width=3}
set.seed(1887)
lcmod <- dagify(z ~ x,
z ~ y,
coords = list(x = c(x=0, z=1, y=2),
y= c(x=0, z=0, y=0))) %>%
tidy_dagitty(layout="auto")
lcmodPlot <- ggplot(lcmod,
aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges_fan() +
geom_dag_text(size=12, vjust=0.3, color="white") +
theme_dag()
print(lcmodPlot)
```
```{r colliderFit, fig.height=3, fig.width=7}
set.seed(1888)
N <- 50
x <- rnorm(N)
y <- rnorm(N)
z <- ifelse(x+y>0, 1, -1)
xzydata <- data.frame(x=x, z=factor(z), y=y)
xz.formula <- z ~ x
zy.formula <- y ~ z
colXyPlot <- ggplot(xzydata, aes(x=x, y=y)) + geom_point() +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE) +
stat_quant_band(formula = y~x)
colXyCondZPlot <- ggplot(xzydata, aes(x=x, y=y, col=z)) + geom_point() +
geom_point(data=dplyr::filter(xzydata, z==-1), mapping=aes(x=x, y=y), col="mistyrose",
inherit.aes = FALSE) +
ggpmisc::stat_poly_eq(formula = my.formula,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., sep = "*plain(\",\")~")),
parse = TRUE) +
stat_quant_band(formula = y~x)
grid.arrange(grobs=list(colXyPlot, colXyCondZPlot), nrow=1, ncol=2)
```
```{r eval=FALSE}
## inspect the data interactively; not evaluated when knitting
View(xzydata)
```
```{r}
xyzPlot <- ggplot(xzydata, aes(x=x+y, y=z)) + geom_point()
print(xyzPlot)
```
## Descendant
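In this DAG, u is a collider of x and y, and z is a descendant of u: conditioning on z acts like partially conditioning on u, and can likewise open the path between x and y.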
```{r descend, fig.height=3, fig.width=3}
set.seed(1887)
ldmod <- dagify(u ~ x,
u ~ y,
z ~ u,
coords = list(x = c(x=0, u=1, z=1, y=2),
y= c(x=0, u=0, z=-1, y=0))) %>%
tidy_dagitty(layout="auto")
ldmodPlot <- ggplot(ldmod,
aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges_fan() +
ylim(-2,0.5)+
geom_dag_text(size=12, vjust=0.3, color="white") +
theme_dag()
print(ldmodPlot)
```
# A summary figure of causal inference
```{r, fig.height=8, fig.width=8}
library(cowplot)
removeTitle <- function(gg) {
return(gg + theme(plot.title=element_blank()))
}
ggdraw() +
draw_plot(yxPlainPlot, x=0.05, y=.75, width=.25, height=0.25) +
draw_plot(xCausesY, x=0.35, y=0.91, width=.4, height=0.1, scale=1) +
draw_plot(yCausesX, x=0.35, y=0.833, width=.4, height=0.1, scale=1) +
draw_plot(latmodPlot, x=0.35, y=0.75, width=.4, height=0.1, scale=1) +
draw_plot(removeTitle(yxPlot),
x=0, y=0.375, width=.375, height=0.375) +
draw_plot(removeTitle(xyPlot), x=0.375, y=0.375, width=.375, height=0.375) +
draw_plot(expLearnCausal, x=0.1875, y=0, width=.375, height=.375) +
draw_plot_label(label = c("A", "B", "C", "D", "E", "F", "G"), size = 15,
x = c(0, .33, 0.33, 0.33, 0, 0.375, 0.1875),
y = c(1, 1, 0.93, 0.85, 0.75, 0.75, 0.375))
```
# The two moms example
We consider the problem of estimating the causal effect of a mom's family size on her daughter's family size. The data we have are a large number of mom-daughter pairs, their family sizes, and their birth orders (which we assume, by prior knowledge or belief, to be a factor that affects family size).
```{r, fig.height=3, fig.width=3}
tm <- dagitty("dag {
B1 -> M ;
M -> D;
B2 -> D }")
plot(tm)
```
```{r twoMomsWoConfound}
set.seed(1887)
N <- 200 # number of pairs
B1 <- rbinom(N, size=1, prob=0.5) # 50% first borns
M <- rnorm(N, 2*B1)
B2 <- rbinom(N, size=1, prob=0.5)
D <- rnorm(N, 2*B2 + 0*M) # change the 0 to another number to turn on the causal influence of mom
summary(lm(D ~ M))
summary(lm(D ~ M + B1))
summary(lm(D ~ M + B2))
plot(coeftab(lm(D ~ M),
lm(D ~ M + B1),
lm(D ~ M + B2)),
pars="M")
```
Given the causal model, adding B1 to the model reduces the precision of the estimate of M, while adding B2 improves it: B1 explains variation in the predictor M, whereas B2 soaks up residual variation in D.
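To make the precision claim concrete, the small check below (my addition, not part of the original script) extracts the standard error of the `M` coefficient from each of the three models:
```{r momCoefSE}
## standard error of the coefficient of M in each model; smaller means more precise
sapply(list(base = lm(D ~ M),
            withB1 = lm(D ~ M + B1),
            withB2 = lm(D ~ M + B2)),
       function(fit) summary(fit)$coefficients["M", "Std. Error"])
```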
Now we add a confound U that influences both M and D.
```{r, fig.height=3, fig.width=3}
tmc <- dagitty("dag {
B1 -> M ;
M -> D;
B2 -> D;
U -> D;
U -> M}")
plot(tmc)
```
```{r twoMomWithConfound}
set.seed(1887)
N <- 200
U <- rnorm(N)
B1 <- rbinom(N, size=1, prob=0.5)
M <- rnorm(N, 2*B1 + U)
B2 <- rbinom(N, size=1, prob=0.5)
D <- rnorm(N, 2*B2 + U + 0*M )
# fit the regression models
summary(lm(D ~ M))
summary(lm(D ~ M + B1))
summary(lm(D ~ M + B2))
summary(lm(D ~ M + B1 + B2))
plot(coeftab(lm(D ~ M),
lm(D ~ M + B1),
lm(D ~ M + B2),
lm(D ~ M + B1 + B2)),
pars="M")
```
Now a model including B1 is seriously wrong.
We look at the Akaike Information Criterion (AIC) of the models, a commonly used estimator of prediction error and a quality-control tool for statistical models on a given data set. Including B1 or not makes little difference to the AIC, even though the model with B1 is causally misleading.
```{r aic}
AIC(lm(D ~ M))
AIC(lm(D ~ M + B1))
```
## The Bayesian inference method
Best-case scenario: the confound is observed. In reality this is rarely possible.
```{r}
precis(lm(D ~ M + B2 + U))
```
What can Bayesian inference bring us if the confounding factor is not observable? Below we build a Stan model specifying the DAG above, in particular the unmeasured confound U, as well as the priors. The model is then fit to derive the posterior distribution of the parameters. FBI = full Bayesian inference (replacing Richard's full-luxury Bayesian inference, flbi).
```{r stan}
dat <- list(N=N,M=M,D=D,B1=B1,B2=B2)
set.seed(1887)
fbi <- ulam(
alist(
# mom model
M ~ normal( mu , sigma ),
mu <- a1 + b*B1 + k*U[i],
# daughter model
D ~ normal( nu , tau ),
nu <- a2 + b*B2 + m*M + k*U[i],
# B1 and B2
B1 ~ bernoulli(p),
B2 ~ bernoulli(p),
# unmeasured confound
vector[N]:U ~ normal(0,1),
# priors
c(a1,a2,b,m) ~ normal( 0 , 0.5 ),
c(k,sigma,tau) ~ exponential( 1 ),
p ~ beta(2,2)
), data=dat , chains=4 , cores=4 , iter=2000 , cmdstan=TRUE )
precis(fbi)
```
Compare the results of the Bayesian inference with those of the linear regressions above.
```{r}
m <- M ## rename so the lm coefficient label matches the ulam parameter 'm'
plot( coeftab( lm( D ~ m ) , lm( D ~ m + B1 ) , lm( D ~ m + B2 ) , fbi ) , pars="m" )
```
A strength of a Bayesian model is that it is generative: we can draw samples from the fitted model. Here we compare the simulated confounding factor with its posterior estimate.
```{r}
post <- extract.samples(fbi)
Uest <- apply(post$U,2,mean)
{
blank()
plot(U, Uest, xlab="U (simulated)",ylab="U (estimated)", col=2 , lwd=2 )
abline(a=0,b=1,lty=2)
}
```
As an alternative to setting up an explicit term for the unobserved confounding factor, we can assume that the mom's and daughter's family sizes follow a multivariate normal distribution, which marginalizes U out.
```{r}
# version that marginalizes out the missing data
fbi_plus <- ulam(
alist(
c(M,D) ~ multi_normal( c(mu,nu) , Rho , Sigma ),
mu <- a1 + b*B1,
nu <- a2 + b*B2 + m*M,
c(a1,a2,b,m) ~ normal( 0 , 0.5 ),
Rho ~ lkj_corr( 2 ),
Sigma ~ exponential( 1 )
), data=dat , chains=4 , cores=4 , cmdstan=TRUE )
precis(fbi_plus,3)
```
Below is a more exotic example, in which B1 is no longer a valid instrumental variable (it now directly affects D), but we have two proxy measurements of U (V and W).
```{r}
set.seed(1887)
N <- 200 # number of pairs
U <- rnorm(N,0,1) # simulate confound
V <- rnorm(N,U,1)
W <- rnorm(N,U,1)
# birth order and family sizes
B1 <- rbinom(N,size=1,prob=0.5) # 50% first borns
M <- rnorm( N , 2*B1 + U )
B2 <- rbinom(N,size=1,prob=0.5)
D <- rnorm( N , 2*B2 + 0.5*B1 + U + 0*M )
# confounded regression
precis( lm( D ~ M + B1 + B2 + V + W ) )
# full-luxury bayesian inference
dat2 <- list(N=N,M=M,D=D,B1=B1,B2=B2,V=V,W=W)
fbi2 <- ulam(
alist(
M ~ normal( muM , sigmaM ),
muM <- a1 + b*B1 + k*U[i],
D ~ normal( muD , sigmaD ),
muD <- a2 + b*B2 + d*B1 + m*M + k*U[i],
W ~ normal( muW , sigmaW ),
muW <- a3 + w*U[i],
V ~ normal( muV , sigmaV ),
muV <- a4 + v*U[i],
vector[N]:U ~ normal(0,1),
c(a1,a2,a3,a4,b,d,m) ~ normal( 0 , 0.5 ),
c(k,w,v) ~ exponential( 1 ),
c(sigmaM,sigmaD,sigmaW,sigmaV) ~ exponential( 1 )
), data=dat2 , chains=4 , cores=4 , iter=2000 , cmdstan=TRUE )
precis(fbi2)
```
# The peer bias example
We next consider the problem of peer bias. The data are a large set of applications submitted by individuals, the categories of applicants that may be subject to discrimination (e.g. gender, racial groups), the fields of the applications, and the success or failure of each application. We are interested in whether there is discrimination by category.
Assume we have the following model, which was explained at 43:58, 1:16:30, and 2:37:46 of the [recorded talk *Science Before Statistics: Causal Inference*, available on YouTube](https://www.youtube.com/watch?v=KNPYUVmY3NM&t=9174s).
* `X`: social categories on which we cannot intervene, for instance gender and racial groups.
* `E`: fields of application that are affected by the social categories. For instance, if it is about study disciplines, physics and psychology may represent different fields. Individuals from different social categories (gender, for instance) may have different preferences for `E`.
* `Y`: outcome of the application, success or failure. It is potentially influenced by both `X` (which may represent discrimination) and `E` (which reflects the varying opportunities between fields).
There can be a confounder in the model, especially a latent variable `Q` that embodies the quality of the applicants, i.e. their suitability for the field. It is difficult or impossible to measure, yet it influences both the field the applicant chooses (`E`) and the outcome of the application (`Y`). Although `Q` itself is hard to measure, we may have proxy variables, for instance grades or reference letters, that are influenced by `Q`.
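The chunk below, which is my addition based on the verbal description above, sketches this DAG with dagitty; `R1` and `R2` stand for the proxy variables (e.g. grades and reference letters) used later.
```{r peerBiasDag, fig.height=3, fig.width=3}
pbDag <- dagitty("dag {
  X -> E ;
  X -> Y ;
  E -> Y ;
  Q -> E ;
  Q -> Y ;
  Q -> R1 ;
  Q -> R2 }")
plot(pbDag)
```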
```{r peerBiasSetup}
library(rethinking)
```
## A simulation study without discrimination
```{r peerBias}
# simulation in which there is *no* discrimination
# conditioning on E reveals the truth
set.seed(1887)
N <- 500 ## 500 individuals
X <- rbern(N,prob=0.5) ## two social categories, 0 and 1
pY <- c( 0.25 , 0.05 )
pE <- X*inv_logit(-2) + (1-X)*inv_logit(+1) ## the social category affects field of choice
E <- sapply( 1:N , function(n) sample( 1:2 , size=1 , prob=c(pE[n],1-pE[n]) ) )
```
To better see that X affects E, we check the contingency table below:
* For `X=0`, choosing `E=1` is about 2.2 times more likely than choosing `E=2`.
* For `X=1`, choosing `E=2` is about 7 times more likely than choosing `E=1`.
```{r peerBiasContingencyTable}
table(X, E)
```
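The row-wise proportions make these ratios explicit (a small addition, not in the original script):
```{r peerBiasProportions}
## probability of each field choice within each social category
prop.table(table(X, E), margin = 1)
```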
Here we assume that there is no discrimination: Y is determined only by E.
```{r peerY}
p <- pY[E]
Y <- rbern(N,prob=p)
```
However, a logistic regression using `X` as the only predictor returns a non-zero coefficient.
```{r peerYX}
precis( glm( Y ~ X , family=binomial ) )
```
Compare the model above with another logistic regression conditioning on both `X` and `E`: in the latter model, the coefficient of `X` overlaps with zero. We conclude from the comparison that the effect of `X` is likely mediated by `E`.
```{r peerXE}
precis( glm( Y ~ X + E , family=binomial ) )
```
```{r peerModelSideBySide}
mg0 <- glm( Y ~ X , family=binomial )
mg1 <- glm( Y ~ X + E , family=binomial )
plot( coeftab( mg0 , mg1 ), pars="X" )
```
## A simulation study with discrimination and latent variable
Now we simulate a situation in which there is both discrimination and a latent variable `Q`.
```{r discrimination}
set.seed(1887)
# simulation in which there really is discrimination
N <- 500
Q <- rnorm(N)
X <- rbern(N,prob=0.5)
pY <- c( 0.25 , 0.1 )
pE <- X*inv_logit(Q-2) + (1-X)*inv_logit(Q+1)
E <- sapply( 1:N , function(n) sample( 1:2 , size=1 , prob=c(pE[n],1-pE[n]) ) )
## Y is determined by E, Q, and X
p <- inv_logit( logit(pY[E]) + Q - X )
Y <- rbern(N,prob=p)
```
The horrifying observation is that if we now include both `X` and `E` in the model, we conclude that there is *no* discrimination, the opposite of our model specification.
```{r discrimEandX}
pbdX <- glm( Y ~ X , family=binomial )
pbdXE <- glm( Y ~ X + E , family=binomial )
precis( pbdX )
precis( pbdXE )
```
For a moment, we assume that Q can be observed. Adding it to the model improves the estimate of X, though it is not enough for the logistic regression to fully recover the effect of X (`-1`).
```{r}
pbdXEQ <- glm( Y ~ X + E + Q , family=binomial )
precis( pbdXEQ )
```
As said, `Q` is often not directly observable. But we can have descendants of Q, `R1` and `R2` in this case.
```{r qDesc}
R1 <- rnorm(N,0.5*Q)
R2 <- rnorm(N,0.5*Q)
pbdXER <- glm( Y ~ X + E + R1 + R2 , family=binomial )
precis( pbdXER )
```
We display the coefficients of `X` visually.
```{r}
plot( coeftab( pbdX , pbdXE, pbdXEQ, pbdXER) , pars="X" )
```
## The Bayesian model for the discrimination case
```{r bayesAgainstDisc}
dat <- list( Y=Y , E=E , XX=X , id=1:N , R1=R1 , R2=R2 )
pbdXERbayes <- ulam(
alist(
# Y model
Y ~ bernoulli(p),
logit(p) <- a[E] + X*XX + h*Q[id],
a[E] ~ normal(0,1),
X ~ normal(0,1),
h ~ half_normal(0,1),
# Q model
vector[id]:Q ~ normal(0,1),
R1 ~ normal(Q,1),
R2 ~ normal(Q,1)
) , data=dat , chains=4 , cores=4 , cmdstan=TRUE )
```
```{r}
precis(pbdXERbayes, 2 ,omit="Q")
```
```{r}
plot( coeftab( pbdX , pbdXE , pbdXER , pbdXERbayes ) , pars="X" )
```
```{r pbXERbayesPost}
post <- extract.samples(pbdXERbayes)
Qest <- apply(post$Q,2,mean)
{
plot(Q,Qest)
abline(a=0,b=1,lty=2)
}
```
A partial identification analysis: we assume that `Y` is affected by `E`, `X`, and `Q`, and we model `Q` as a latent Gaussian variable. We let `g` represent the coefficient of `X`, and we use an informative prior for `h`, i.e. the effect of `Q`.
```{r pbdXEBayes2}
dat2 <- list( Y=Y , E=E , X=X, id=1:N )
pbdXEbayes2 <- ulam(
alist(
# Y model
Y ~ bernoulli(p),
logit(p) <- a[E] + g*X + h*Q[id],
a[E] ~ normal(0,1),
g ~ normal(0,1),
h ~ uniform(0,2),
# Q model
vector[id]:Q ~ normal(0,1)
) , data=dat2 , chains=4 , cores=4, cmdstan = TRUE)
precis(pbdXEbayes2,2,omit="Q")
```
We draw samples from the posterior distribution again.
```{r}
post <- extract.samples(pbdXEbayes2)
{
plot( post$h , post$g , pch=16 , col=grau(0.2) , cex=2 , ylab="effect of X" , xlab="effect of Q" )
abline(h=0,lty=2)
}
quantile(post$h)
```
# Plots for d-separation
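Each plot below shows Y against X colored by Z, with the within-stratum regression lines (solid, one per value of Z) and the pooled regression (dotted): first a pipe, then a fork, then a collider.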
```{r dsepSetup}
a <- 0.7
cols <- c( col.alpha(1,a) , col.alpha(2,a) )
```
```{r dsepPipe}
N <- 1000
X <- rnorm(N)
Z <- rbern(N,inv_logit(X))
Y <- rnorm(N,(2*Z-1))
plot( X , Y , col=cols[Z+1] , pch=16 )
abline(lm(Y[Z==1]~X[Z==1]),col=2,lwd=3)
abline(lm(Y[Z==0]~X[Z==0]),col=1,lwd=3)
abline(lm(Y~X),lwd=3,lty=3)
```
```{r dsepFork}
N <- 1000
Z <- rbern(N)
X <- rnorm(N,2*Z-1)
Y <- rnorm(N,(2*Z-1))
plot( X , Y , col=cols[Z+1] , pch=16 )
abline(lm(Y[Z==1]~X[Z==1]),col=2,lwd=3)
abline(lm(Y[Z==0]~X[Z==0]),col=1,lwd=3)
abline(lm(Y~X),lwd=3,lty=3)
```
```{r dsepCollider}
N <- 1000
X <- rnorm(N)
Y <- rnorm(N)
Z <- rbern(N,inv_logit(2*X+2*Y-2))
plot( X , Y , col=cols[Z+1] , pch=16 )
abline(lm(Y[Z==1]~X[Z==1]),col=2,lwd=3)
abline(lm(Y[Z==0]~X[Z==0]),col=1,lwd=3)
abline(lm(Y~X),lwd=3,lty=3)
```