Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
sdaza committed Dec 9, 2024
1 parent 950355f commit 185b11b
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 199 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ jobs:
run: |
poetry install
# - name: Linting
# run: |
# # Install flake8
# pip install flake8
# # Run flake8
# flake8 . --ignore=E501
- name: Linting
run: |
# Install flake8
pip install flake8
# Run flake8
flake8 . --ignore=E501,F401,F403,F405,W504
- name: Run tests
run: |
Expand Down
17 changes: 6 additions & 11 deletions experiment_utils/experiment_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,8 @@ def __check_input(self):
log_and_raise_error(self.logger, "Regression covariates should be a subset of covariates")

# check if all required columns are present
required_columns = (
self.experiment_identifier +
[self.treatment_col] +
self.outcomes +
self.covariates +
([self.instrument_col] if self.instrument_col is not None else [])
)
required_columns = (self.experiment_identifier + [self.treatment_col] + self.outcomes +
self.covariates + ([self.instrument_col] if self.instrument_col is not None else []))

missing_columns = set(required_columns) - set(self.data.columns)

Expand Down Expand Up @@ -199,7 +194,7 @@ def linear_regression(self, data: pd.DataFrame, outcome_variable: str) -> Dict:
"treated_units": data[self.treatment_col].sum(),
"control_units": data[self.treatment_col].count() - data[self.treatment_col].sum(),
"control_value": intercept,
"treatment_value": intercept+coefficient,
"treatment_value": intercept + coefficient,
"absolute_effect": coefficient,
"relative_effect": relative_effect,
"standard_error": standard_error,
Expand Down Expand Up @@ -243,7 +238,7 @@ def weighted_least_squares(self, data: pd.DataFrame, outcome_variable: str) -> D
"treated_units": data[self.treatment_col].sum().astype(int),
"control_units": (data[self.treatment_col].count() - data[self.treatment_col].sum()).astype(int),
"control_value": intercept,
"treatment_value": intercept+coefficient,
"treatment_value": intercept + coefficient,
"absolute_effect": coefficient,
"relative_effect": relative_effect,
"standard_error": standard_error,
Expand Down Expand Up @@ -286,7 +281,7 @@ def iv_regression(self, data: pd.DataFrame, outcome_variable: str) -> Dict:
"treated_units": data[self.treatment_col].sum().astype(int),
"control_units": (data[self.treatment_col].count() - data[self.treatment_col].sum()).astype(int),
"control_value": intercept,
"treatment_value": intercept+coefficient,
"treatment_value": intercept + coefficient,
"absolute_effect": coefficient,
"relative_effect": relative_effect,
"standard_error": standard_error,
Expand Down Expand Up @@ -633,7 +628,7 @@ def combine_effects(self, data: pd.DataFrame = None, grouping_cols: List = None)
'standard_error', 'pvalue']
if 'balance' in data.columns:
index_to_insert = len(grouping_cols)
result_columns.insert(index_to_insert+1, 'balance')
result_columns.insert(index_to_insert + 1, 'balance')
pooled_results['stat_significance'] = pooled_results['stat_significance'].astype(int)

self.logger.info('Combining effects using fixed-effects meta-analysis!')
Expand Down
80 changes: 38 additions & 42 deletions experiment_utils/power_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,20 +93,20 @@ def __run_experiment(self, baseline=[1.0], sample_size=[100], effect=[0.10],
log_and_raise_error(self.logger, 'Compliance rates should be same length as the number of self.variants or length 1!')
compliance = list(itertools.repeat(compliance[0], self.variants))

if len(standard_deviation) != self.variants+1:
if len(standard_deviation) != self.variants + 1:
if len(standard_deviation) > 1:
log_and_raise_error(self.logger, 'Standard deviations should be same length as the number of self.variants+1 or length 1!')
standard_deviation = list(itertools.repeat(standard_deviation[0], self.variants+1))
log_and_raise_error(self.logger, 'Standard deviations should be same length as the number of self.variants + 1 or length 1!')
standard_deviation = list(itertools.repeat(standard_deviation[0], self.variants + 1))

if len(sample_size) != self.variants+1:
if len(sample_size) != self.variants + 1:
if len(sample_size) > 1:
log_and_raise_error(self.logger, 'N should be same length as the number of self.variants+1 or length 1!')
sample_size = list(itertools.repeat(sample_size[0], self.variants+1))
log_and_raise_error(self.logger, 'N should be same length as the number of self.variants + 1 or length 1!')
sample_size = list(itertools.repeat(sample_size[0], self.variants + 1))

if len(baseline) != self.variants+1:
if len(baseline) != self.variants + 1:
if len(baseline) > 1:
log_and_raise_error(self.logger, 'Baseline values should be same length as the number of self.variants+1 or length 1!')
baseline = list(itertools.repeat(baseline[0], self.variants+1))
log_and_raise_error(self.logger, 'Baseline values should be same length as the number of self.variants + 1 or length 1!')
baseline = list(itertools.repeat(baseline[0], self.variants + 1))

re = list(range(self.variants))

Expand All @@ -123,11 +123,11 @@ def __run_experiment(self, baseline=[1.0], sample_size=[100], effect=[0.10],

for i in range(self.variants):
if self.relative_effect:
re[i] = baseline[i+1] * (1.00 + effect[i])
re[i] = baseline[i + 1] * (1.00 + effect[i])
else:
re[i] = baseline[i+1] + effect[i]
t_data_c = np.random.poisson(re[i], int(np.round(sample_size[i+1] * compliance[i])))
t_data_nc = np.random.poisson(baseline[i+1], int(np.round(sample_size[i+1] * (1 - compliance[i]))))
re[i] = baseline[i + 1] + effect[i]
t_data_c = np.random.poisson(re[i], int(np.round(sample_size[i + 1] * compliance[i])))
t_data_nc = np.random.poisson(baseline[i + 1], int(np.round(sample_size[i + 1] * (1 - compliance[i]))))
t_data = np.append(t_data_c, t_data_nc)
dd = np.append(dd, t_data)
vv = np.append(vv, list(itertools.repeat(i + 1, len(t_data))))
Expand All @@ -139,12 +139,12 @@ def __run_experiment(self, baseline=[1.0], sample_size=[100], effect=[0.10],

for i in range(self.variants):
if self.relative_effect:
re[i] = baseline[i+1] * (1.00 + effect[i])
re[i] = baseline[i + 1] * (1.00 + effect[i])
else:
re[i] = baseline[i+1] + effect[i]
re[i] = baseline[i + 1] + effect[i]

t_data_c = np.random.binomial(n=1, size=int(np.round(sample_size[i+1] * compliance[i])), p=re[i])
t_data_nc = np.random.binomial(n=1, size=int(np.round(sample_size[i+1] * (1 - compliance[i]))), p=baseline[i+1])
t_data_c = np.random.binomial(n=1, size=int(np.round(sample_size[i + 1] * compliance[i])), p=re[i])
t_data_nc = np.random.binomial(n=1, size=int(np.round(sample_size[i + 1] * (1 - compliance[i]))), p=baseline[i + 1])
t_data = np.append(t_data_c, t_data_nc)
dd = np.append(dd, t_data)
vv = np.append(vv, list(itertools.repeat(i + 1, len(t_data))))
Expand All @@ -156,19 +156,19 @@ def __run_experiment(self, baseline=[1.0], sample_size=[100], effect=[0.10],

for i in range(self.variants):
if self.relative_effect:
re[i] = baseline[i+1] * (1.00 + effect[i])
re[i] = baseline[i + 1] * (1.00 + effect[i])
else:
re[i] = baseline[i+1] + effect[i]
re[i] = baseline[i + 1] + effect[i]

t_data_c = np.random.normal(re[i], standard_deviation[i+1], int(np.round(sample_size[i+1] * compliance[i])))
t_data_nc = np.random.normal(baseline[i+1], standard_deviation[i+1], int(np.round(sample_size[i+1] * (1 - compliance[i]))))
t_data_c = np.random.normal(re[i], standard_deviation[i + 1], int(np.round(sample_size[i + 1] * compliance[i])))
t_data_nc = np.random.normal(baseline[i + 1], standard_deviation[i + 1], int(np.round(sample_size[i + 1] * (1 - compliance[i]))))

t_data = np.append(t_data_c, t_data_nc)
dd = np.append(dd, t_data)
vv = np.append(vv, list(itertools.repeat(i + 1, len(t_data))))

return dd, vv

def get_power(self, baseline=[1.0], effect=[0.10], sample_size=[1000], compliance=[1.0], standard_deviation=[1]):
'''
Estimate power using simulation.
Expand All @@ -192,9 +192,6 @@ def get_power(self, baseline=[1.0], effect=[0.10], sample_size=[1000], complianc
'''

# create empty values for results
results = []
ncomparisons = len(self.comparisons)

pvalues = {}
for c in range(len(self.comparisons)):
pvalues[c] = []
Expand All @@ -203,8 +200,8 @@ def get_power(self, baseline=[1.0], effect=[0.10], sample_size=[1000], complianc
for i in range(self.nsim):
# y = output, x = index of condition
y, x = self.__run_experiment(baseline=baseline, effect=effect,
sample_size=sample_size, compliance=compliance,
standard_deviation=standard_deviation)
sample_size=sample_size, compliance=compliance,
standard_deviation=standard_deviation)

# iterate over variants
l_pvalues = []
Expand Down Expand Up @@ -248,12 +245,11 @@ def get_power(self, baseline=[1.0], effect=[0.10], sample_size=[1000], complianc
}

if self.correction in correction_methods:
significant = correction_methods[self.correction](np.array(l_pvalues), self.alpha/pvalue_adjustment[self.alternative])
significant = correction_methods[self.correction](np.array(l_pvalues), self.alpha / pvalue_adjustment[self.alternative])

for v, p in enumerate(significant):
pvalues[v].append(p)

# results.append(int(np.sum(pvalues)) >= len(self.comparisons))
power = pd.DataFrame(pd.DataFrame(pvalues).mean()).reset_index()
power.columns = ['comparisons', 'power']
power['comparisons'] = power['comparisons'].map(dict(enumerate(self.comparisons)))
Expand Down Expand Up @@ -284,7 +280,7 @@ def grid_sim_power(self, baseline_rates=None, effects=None, sample_sizes=None,
"""

pdict = {'baseline': baseline_rates, 'effect': effects, 'sample_size': sample_sizes,
'compliance': compliances, 'standard_deviation': standard_deviations}
'compliance': compliances, 'standard_deviation': standard_deviations}
grid = self.__expand_grid(pdict)

parameters = list(grid.itertuples(index=False, name=None))
Expand Down Expand Up @@ -314,7 +310,7 @@ def grid_sim_power(self, baseline_rates=None, effects=None, sample_sizes=None,

results['index'] = index
results = results.pivot(index=['index'], columns=['comparisons'], values=['power'])
results.columns = [str((i,j)) for i,j in self.comparisons]
results.columns = [str((i, j)) for i, j in self.comparisons]

grid = pd.concat([grid, results], axis=1)
grid.sample_size = grid.sample_size.map(str)
Expand All @@ -328,10 +324,10 @@ def plot_power(self, data):
Plot statistical power by scenario
'''

value_vars = [str((i,j)) for i,j in self.comparisons]
value_vars = [str((i, j)) for i, j in self.comparisons]

cols = ['baseline', 'effect', 'sample_size', 'compliance', 'standard_deviation',
'variants', 'comparisons', 'nsim', 'alpha', 'alternative', 'metric', 'relative_effect']
'variants', 'comparisons', 'nsim', 'alpha', 'alternative', 'metric', 'relative_effect']

temp = pd.melt(data, id_vars=cols, var_name='comparison', value_name='power', value_vars=value_vars)

Expand All @@ -352,7 +348,7 @@ def __expand_grid(self, dictionary):
Auxiliary function to expand a dictionary
'''
return pd.DataFrame([row for row in itertools.product(*dictionary.values())],
columns=dictionary.keys())
columns=dictionary.keys())

def bonferroni(self, pvals, alpha=0.05):
"""A function for controlling the FWER at some level alpha using the
Expand All @@ -370,7 +366,7 @@ def bonferroni(self, pvals, alpha=0.05):
True if a hypothesis is rejected, False if not.
"""
m, pvals = len(pvals), np.asarray(pvals)
return pvals < alpha/float(m)
return pvals < alpha / float(m)

def hochberg(self, pvals, alpha=0.05):
"""A function for controlling the FWER using Hochberg's procedure.
Expand All @@ -391,7 +387,7 @@ def hochberg(self, pvals, alpha=0.05):
# sort the p-values into ascending order
ind = np.argsort(pvals)

test = [p <= alpha/(m+1-(k+1)) for k, p in enumerate(pvals[ind])]
test = [p <= alpha / (m + 1 - (k + 1)) for k, p in enumerate(pvals[ind])]
significant = np.zeros(np.shape(pvals), dtype='bool')
significant[ind[0:np.sum(test)]] = True
return significant
Expand All @@ -415,12 +411,12 @@ def holm_bonferroni(self, pvals, alpha=0.05):

m, pvals = len(pvals), np.asarray(pvals)
ind = np.argsort(pvals)
test = [p > alpha/(m+1-k) for k, p in enumerate(pvals[ind])]
test = [p > alpha / (m + 1 - k) for k, p in enumerate(pvals[ind])]

"""The minimal index k is m-np.sum(test)+1 and the hypotheses 1, ..., k-1
"""The minimal index k is m-np.sum(test) + 1 and the hypotheses 1, ..., k-1
are rejected. Hence m-np.sum(test) gives the correct number."""
significant = np.zeros(np.shape(pvals), dtype='bool')
significant[ind[0:m-np.sum(test)]] = True
significant[ind[0:m - np.sum(test)]] = True
return significant

def sidak(self, pvals, alpha=0.05):
Expand All @@ -440,7 +436,7 @@ def sidak(self, pvals, alpha=0.05):
True if a hypothesis is rejected, False if not.
"""
n, pvals = len(pvals), np.asarray(pvals)
return pvals < 1. - (1.-alpha) ** (1./n)
return pvals < 1. - (1. - alpha) ** (1. / n)

def lsu(self, pvals, q=0.05):
"""The (non-adaptive) one-stage linear step-up procedure (LSU) for
Expand All @@ -462,8 +458,8 @@ def lsu(self, pvals, q=0.05):

m = len(pvals)
sort_ind = np.argsort(pvals)
k = [i for i, p in enumerate(pvals[sort_ind]) if p < (i+1.)*q/m]
k = [i for i, p in enumerate(pvals[sort_ind]) if p < (i + 1.) * q / m]
significant = np.zeros(m, dtype='bool')
if k:
significant[sort_ind[0:k[-1]+1]] = True
significant[sort_ind[0:k[-1] + 1]] = True
return significant
Loading

0 comments on commit 185b11b

Please sign in to comment.