Skip to content

Commit

Permalink
Add examples from "Causal Inference and Discovery in Python" to tests (
Browse files Browse the repository at this point in the history
…#1165)

* Add test file

Signed-off-by: Rahul Shrestha <rahulshrestha0101@gmail.com>

added chapter 6

Fix

Signed-off-by: Rahul Shrestha <rahulshrestha0101@gmail.com>

Add chapter 8

Signed-off-by: Rahul Shrestha <rahulshrestha0101@gmail.com>

Fix formatting issue with linter

Add references

* Added reference message

Signed-off-by: rahulbshrestha <rahulshrestha0101@gmail.com>

* Added econml requirement

Signed-off-by: rahulbshrestha <rahulshrestha0101@gmail.com>

* removed econml from imports

Signed-off-by: Amit Sharma <amit_sharma@live.com>

---------

Signed-off-by: rahulbshrestha <rahulshrestha0101@gmail.com>
Signed-off-by: Amit Sharma <amit_sharma@live.com>
Co-authored-by: Amit Sharma <amit_sharma@live.com>
  • Loading branch information
rahulbshrestha and amit-sharma authored Aug 4, 2024
1 parent becbf7f commit daf5f94
Showing 1 changed file with 199 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
"""
Note: The tests below are taken from the book 'Causal Inference and Discovery in Python' (https://www.packtpub.com/product/causal-inference-and-discovery-in-python/9781804612989)
by Aleksander Molak (https://github.com/AlxndrMlk). Other than the assert statements, all of the code below is taken from the book.
"""

import numpy as np
import pandas as pd
from pytest import mark
from scipy import stats
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression
from tqdm import tqdm

from dowhy import CausalModel


class GPSMemorySCM:
def __init__(self, random_seed=None):
self.random_seed = random_seed
self.u_x = stats.truncnorm(0, np.infty, scale=5)
self.u_y = stats.norm(scale=2)
self.u_z = stats.norm(scale=2)
self.u = stats.truncnorm(0, np.infty, scale=4)

def sample(self, sample_size=100, treatment_value=None):
"""Samples from the SCM"""
if self.random_seed:
np.random.seed(self.random_seed)

u_x = self.u_x.rvs(sample_size)
u_y = self.u_y.rvs(sample_size)
u_z = self.u_z.rvs(sample_size)
u = self.u.rvs(sample_size)

if treatment_value:
gps = np.array([treatment_value] * sample_size)
else:
gps = u_x + 0.7 * u

hippocampus = -0.6 * gps + 0.25 * u_z
memory = 0.7 * hippocampus + 0.25 * u

return gps, hippocampus, memory

def intervene(self, treatment_value, sample_size=100):
"""Intervenes on the SCM"""
return self.sample(treatment_value=treatment_value, sample_size=sample_size)


@mark.usefixtures("fixed_seed")
class TestCausalInferenceDiscoveryBook(object):
def test_dowhy_chapter_8(self):
"""
This test was taken from Chapter 8 of the book, 'Causal Inference and Discovery in Python'
"""

# Construct the graph (the graph is constant for all iterations)
nodes = ["S", "Q", "X", "Y", "P"]
edges = ["SQ", "SY", "QX", "QY", "XP", "YP", "XY"]

# Generate the GML graph
gml_string = "graph [directed 1\n"

for node in nodes:
gml_string += f'\tnode [id "{node}" label "{node}"]\n'

for edge in edges:
gml_string += f'\tedge [source "{edge[0]}" target "{edge[1]}"]\n'

gml_string += "]"

# Define the true effect
TRUE_EFFECT = 0.7

# Define experiment params
sample_sizes = [30, 100, 1000, 10000]
noise_coefs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
n_samples = 20

# Record the results
results = []

# Run the experiment
for sample_size in tqdm(sample_sizes):
for noise_coef in noise_coefs:
for i in range(n_samples):
# Generate the data
S = np.random.random(sample_size)
Q = 0.2 * S + noise_coef * np.random.random(sample_size)
X = 0.14 * Q + noise_coef * np.random.random(sample_size)
Y = TRUE_EFFECT * X + 0.11 * Q + 0.32 * S + noise_coef * np.random.random(sample_size)
P = 0.43 * X + 0.21 * Y + noise_coef * np.random.random(sample_size)

# Encode as a pandas df
df = pd.DataFrame(np.vstack([S, Q, X, Y, P]).T, columns=["S", "Q", "X", "Y", "P"])

# Instantiate the CausalModel
model = CausalModel(data=df, treatment="X", outcome="Y", graph=gml_string)

# Get the estimand
estimand = model.identify_effect()

# Get estimate (DML)
estimate_dml = model.estimate_effect(
identified_estimand=estimand,
method_name="backdoor.econml.dml.DML",
method_params={
"init_params": {
"model_y": GradientBoostingRegressor(),
"model_t": GradientBoostingRegressor(),
"model_final": LassoCV(fit_intercept=False),
},
"fit_params": {},
},
)

# Get estimate (Linear Regression)
estimate_lr = model.estimate_effect(
identified_estimand=estimand, method_name="backdoor.linear_regression"
)

results.append(
{
"sample_size": sample_size,
"noise_coef": noise_coef,
"estimate_dml": estimate_dml.value,
"estimate_lr": estimate_lr.value,
"error_dml": estimate_dml.value - TRUE_EFFECT,
"error_lr": estimate_lr.value - TRUE_EFFECT,
}
)

def test_dowhy_chapter_7(self):
"""
This test was taken from Chapter 7 of the book, 'Causal Inference and Discovery in Python'
"""

# Instantiate the SCM
scm = GPSMemorySCM()
# Generate observational data
gps_obs, hippocampus_obs, memory_obs = scm.sample(1000)
# Encode as a pandas df
df = pd.DataFrame(np.vstack([gps_obs, hippocampus_obs, memory_obs]).T, columns=["X", "Z", "Y"])

# Create the graph describing the causal structure
gml_graph = """
graph [
directed 1
node [
id "X"
label "X"
]
node [
id "Z"
label "Z"
]
node [
id "Y"
label "Y"
]
node [
id "U"
label "U"
]
edge [
source "X"
target "Z"
]
edge [
source "Z"
target "Y"
]
edge [
source "U"
target "X"
]
edge [
source "U"
target "Y"
]
]
"""

# With graph
model = CausalModel(data=df, treatment="X", outcome="Y", graph=gml_graph)

# model.view_model()

estimand = model.identify_effect()

estimate = model.estimate_effect(identified_estimand=estimand, method_name="frontdoor.two_stage_regression")

refute_subset = model.refute_estimate(
estimand=estimand, estimate=estimate, method_name="data_subset_refuter", subset_fraction=0.4
)

assert refute_subset is not None

0 comments on commit daf5f94

Please sign in to comment.