-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTMLE_Functions_Meta.R
323 lines (271 loc) · 12.9 KB
/
TMLE_Functions_Meta.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
##############
# TMLE_Functions_Meta.R
# R code to obtain point estimates with TMLE
# Modified from Stage2_Functions.R in https://github.com/LauraBalzer/TwoStageTMLE
#-----------------------------------------------------#-----------------------------------------------------
# do.TMLE: master function to run TMLE for point estimation
#
# input:
# goal - aRR: arithmetic risk ratio; O/W risk difference,
# target of inference: cluster-level ("clust") or pooled-indv effect ("indv") (target)
# training data (train),
# prespecified adjustment variables for the conditional mean outcome (QAdj),
# prespecified adjustment variables for the propensity score (gAdj),
# initial estimator of the conditional mean outcome (Q.out),
# estimator of the propensity score (p.out),
# indicator to print updates (verbose)
#
# output: list
# training data augmented with estimates,
# prespecified adjustment variables for the conditional mean outcome (QAdj),
# prespecified adjustment variables for the propensity score (gAdj),
# initial estimator of the conditional mean outcome (Q.out),
# estimator of the propensity score (p.out),
# estimated fluctuation coef (epsilon),
#-----------------------------------------------------#-----------------------------------------------------
# UPDATES
# expand the candidate prediction functions with Qform (outcome regression) and gform (pscore)
# (1) 'glm' for main terms;
# (2) 'step' for stepwise;
# (3) 'step.interaction' for stepwise interaction
# (4) 'lasso' for LASSO;
# (5) 'mars' for multivariate adaptive regression splines (MARS)
# (6) 'mars.corP' for MARS after screening
# See do.Init.Qbar for details
do.TMLE <- function(goal, target='NOT.WORKING', sample.effect, train,
                    QAdj, Qform='glm', gAdj=NULL, gform='glm',
                    Q.out=NULL, p.out=NULL,
                    scale_value, scale_value_min,
                    doing.CV=F, verbose=F) {
  # Master TMLE routine: fits the initial outcome regression, computes the
  # clever covariate from the propensity score, targets the initial fit,
  # forms point estimates on the original outcome scale, and estimates the
  # variance from the influence curve.
  #
  # Returns a list with the augmented training data, the fitted objects and
  # adjustment sets for Qbar and the pscore, the fluctuation coefficient(s)
  # eps, the treatment-specific means R1/R0, and the variance components.

  # Step 1: initial estimation of Qbar(A,W) = E(Y|A,W)
  init.Q <- do.Init.Qbar(train=train, QAdj=QAdj, Qform=Qform,
                         glm.out=Q.out, verbose=verbose)

  # Step 2: propensity score and two-dimensional clever covariate
  clever <- get.clever.cov(train=init.Q$train, gAdj=gAdj, gform=gform,
                           p.out=p.out, verbose=verbose)
  train <- clever$train

  # Step 3: targeting - estimate the fluctuation coefficient(s) and update
  # the initial predictions
  eps <- get.epsilon(train=train, goal=goal, verbose=verbose)
  train <- do.targeting(train=train, eps=eps, goal=goal)

  # Step 4: point estimation of the treatment-specific means
  indv.level.data <- nrow(train) > length(unique(train$id))
  if (indv.level.data & target=='clust') {
    # Data are at the individual level but the target is the cluster-level
    # effect: aggregate the weighted predictions within cluster (weighted
    # sum by id), then average the cluster-level endpoints.
    if (!doing.CV) print('data=indv; target=clust')
    R1 <- mean(aggregate(data.frame(train$alpha*train$Qbar1W.star),
                         by=list(train$id), sum)[,2])
    R0 <- mean(aggregate(data.frame(train$alpha*train$Qbar0W.star),
                         by=list(train$id), sum)[,2])
  } else {
    # Otherwise take the weighted mean across all rows.
    # future work: robustify so that dont need weights that sum to J
    R1 <- mean(train$alpha*train$Qbar1W.star)
    R0 <- mean(train$alpha*train$Qbar0W.star)
  }
  # Transform the estimates back to the original (pre-scaling) outcome scale.
  R1 <- R1*(scale_value - scale_value_min) + scale_value_min
  R0 <- R0*(scale_value - scale_value_min) + scale_value_min

  # Step 5: influence-curve-based variance estimation
  variance.out <- get.IC.variance(goal=goal, target=target, Vdata=train,
                                  R1=R1, R0=R0,
                                  scale_value = scale_value,
                                  scale_value_min = scale_value_min,
                                  doing.CV = doing.CV,
                                  sample.effect = sample.effect)

  list(train=train,
       QAdj=init.Q$QAdj, Qform=init.Q$Qform, Q.out=init.Q$glm.out,
       gAdj=clever$gAdj, gform=clever$gform, p.out=clever$p.out,
       eps=eps, R1=R1, R0=R0,
       var.R1=variance.out$var.R1,
       var.R0=variance.out$var.R0,
       var.pair=variance.out$var.pair,
       var.break=variance.out$var.break)
}
#-----------------------------------------------------#-----------------------------------------------------
# do.Init.Qbar - function to do initial estimation of E[Y|A,W] = Qbar(A,W)
# input: data set, adjustment variable(s), outcome regression fit, verbose
# output: adjustment variable(s), outcome regression fit
# data set augmented w/ initial predictions: Qbar(A,W), Qbar(1,W) and Qbar(0,W)
#-----------------------------------------------------#-----------------------------------------------------
# UPDATE - expand the candidate functions for the outcome regression Qform
# Initial (untargeted) estimation of Qbar(A,W) = E[Y|A,W].
# Fits (or reuses) a candidate outcome regression and returns predictions at
# the observed A and at the counterfactual values A=1 and A=0.
# NOTE(review): 'lasso' requires glmnet, 'mars'/'mars.corP' require earth and
# screen.corP (SuperLearner) to be loaded by the caller - not attached here.
do.Init.Qbar<- function(train, QAdj, Qform='glm', glm.out=NULL, verbose=F){
# No adjustment set supplied: fall back to the unadjusted fit via the
# constant column 'U' (assumed present in train).
if( is.null(QAdj) ){
QAdj<- 'U'
}
train.temp <- train[, c(QAdj, 'A', 'Y')]
# Counterfactual copies of the data with treatment set to 1 and 0,
# used below to predict Qbar(1,W) and Qbar(0,W).
X1<- X0<- train.temp
X1$A<-1; X0$A<- 0
# needed for penalized regression
# (glmnet takes a numeric design matrix, not a formula; -1 drops intercept)
Xl <- model.matrix(~-1 +. , subset(train.temp, select=-Y) )
X1l <- model.matrix(~-1 +. , subset(X1, select=-Y))
X0l <- model.matrix(~-1 +. , subset(X0, select=-Y))
# glm.out=NULL means fit on this training set; otherwise reuse the
# previously fitted object (e.g. when applied to a validation split).
if( is.null(glm.out) ){
# fit using the training data
# run main terms regression
# The main-terms fit is the base model; 'step'/'step.interaction' refine it.
glm.out<- suppressWarnings( glm( Y~. , family='binomial', data=train.temp, weights=train$alpha ) )
if (Qform=='step'){ # stepwise
glm.out <- step(glm.out, direction = "both", trace = 0, k = 2)
} else if (Qform=='step.interaction'){
glm.out <- step(glm.out, scope=Y~.^2, direction = "both", trace = 0, k = 2)
} else if (Qform=='lasso'){
# NOTE(review): sum(unique(train$Y))>2 looks like a proxy for "Y is not
# binary" (binary 0/1 gives sum 1); length(unique(...)) may have been
# intended - verify against the scaling of Y upstream.
familyl <- ifelse(sum(unique(train$Y))>2, 'gaussian', 'binomial')
glm.out <- glmnet(x=Xl, y=train$Y, weights=train$alpha,
family=familyl, alpha=1, nlambda = 100)
} else if(Qform %in% c('mars', 'mars.corP')){
# using default settings of SL.earth in SuperLearner
X <- subset(train.temp, select=-Y)
if(Qform=='mars.corP'){
# keep only covariates passing the correlation p-value screen
X <- X[,screen.corP(Y=train$Y, X=X, family='binomial')==T]
}
glm.out <- earth(x = X, y=train$Y, weights=train$alpha,
degree = 2, penalty = 3,
nk = max(21, 2 * ncol(X) + 1), pmethod = "backward", nfold = 0,
ncross = 1, minspan = 0, endspan = 0,
glm = list(family = binomial))
}
if(verbose) print(glm.out)
}
# get initial predictions
if(Qform=='lasso'){
# from LASSO
# predict at the smallest lambda on the fitted path (least penalized fit)
QbarAW <- predict(glm.out, newx=Xl, type='response', s = min(glm.out$lambda) )
Qbar1W <- predict(glm.out, newx=X1l, type='response', s = min(glm.out$lambda) )
Qbar0W <- predict(glm.out, newx=X0l, type='response', s = min(glm.out$lambda) )
}else{
# for glms
# (earth objects also accept newdata/type='response' here)
QbarAW <- predict(glm.out, newdata=train.temp, type='response')
Qbar1W <- predict(glm.out, newdata=X1, type='response')
Qbar0W <- predict(glm.out, newdata=X0, type='response')
}
Qbar <- data.frame(QbarAW, Qbar1W, Qbar0W)
colnames(Qbar) <- c('QbarAW', 'Qbar1W', 'Qbar0W')
# Return the fitted object and the training data augmented with the
# initial predictions.
list(QAdj=QAdj, Qform=Qform, glm.out=glm.out, train=cbind(train, Qbar) )
}
#-----------------------------------------------------#-----------------------------------------------------
# get.clever.cov - function calculate the clever covariate
# input:
# data set, adjustment variable(s), pscore regression, verbose
# output:
# adjustment variable(s), pscore regression,
# data set augmented with pscore & clever covariate (H.AW, H.1W, H.0W)
#-----------------------------------------------------#-----------------------------------------------------
get.clever.cov<- function(train, gAdj, gform, p.out=NULL, verbose=FALSE,
                          g.lower=0.025, g.upper=0.975){
  # Estimate the propensity score g(W) = P(A=1|W) and compute the
  # two-dimensional clever covariate (H.1W, H.0W) plus their difference H.AW.
  #
  # input:
  #   train   - data set (must contain A, alpha, the gAdj columns, and 'U')
  #   gAdj    - adjustment variable(s); NULL gives the unadjusted fit via 'U'
  #   gform   - 'glm', 'step', 'step.interaction', 'lasso', 'mars', 'mars.corP'
  #   p.out   - previously fitted pscore object to reuse (NULL = fit here)
  #   verbose - print the fitted pscore model
  #   g.lower, g.upper - truncation bounds for the estimated pscore
  #     (defaults 0.025/0.975 preserve the original hard-coded behavior)
  # output: list with gAdj, gform, p.out, and train augmented with
  #   pscore, H.1W, H.0W, H.AW
  # NOTE(review): 'lasso' requires glmnet, 'mars'/'mars.corP' require earth
  # and screen.corP to be loaded by the caller.
  if( is.null(gAdj) ){
    gAdj <- 'U'
  }
  train.temp <- train[, c(gAdj, 'A')]
  # needed for penalized regression (glmnet wants a matrix, no intercept)
  Xl <- model.matrix(~-1 +. , subset(train.temp, select=-A) )
  if( is.null(p.out) ){ # fit pscore on training set
    # run main terms regression as the base model
    p.out<- suppressWarnings( glm( A~. , family='binomial', data= train.temp, weights=train$alpha) )
    if (gform=='step'){ # stepwise
      p.out <- step(p.out, direction = "both", trace = 0, k = 2)
    } else if (gform=='step.interaction'){
      p.out <- step(p.out, scope=A~.^2, direction = "both", trace = 0, k = 2)
    } else if (gform=='lasso'){
      p.out <- glmnet(x=Xl, y=train$A, weights=train$alpha,
                      family='binomial', alpha=1, nlambda = 100)
    } else if(gform %in% c('mars', 'mars.corP')){
      # using default settings of SL.earth in SuperLearner
      X <- subset(train.temp, select=-A)
      if(gform=='mars.corP'){
        # keep only covariates passing the correlation p-value screen
        X <- X[,screen.corP(Y=train$A, X=X, family='binomial')==TRUE]
      }
      p.out <- earth(x = X, y=train$A, weights=train$alpha,
                     degree = 2, penalty = 3,
                     nk = max(21, 2 * ncol(X) + 1), pmethod = "backward", nfold = 0,
                     ncross = 1, minspan = 0, endspan = 0,
                     glm = list(family = binomial))
    }
    if(verbose){ print(p.out)}
  }
  # now use p.out to get estimated pscores
  if(gform!='lasso'){
    pscore <- predict(p.out, newdata= train.temp, type="response")
  }else{
    # least-penalized fit on the lambda path
    pscore <- predict(p.out, newx=Xl, type='response', s = min(p.out$lambda) )
  }
  # bound g away from 0/1 - should not bind in a randomized trial
  pscore <- pmin(pmax(pscore, g.lower), g.upper)
  A.train <- train$A
  # Clever covariate is two-dimensional: indicator/pscore for each arm
  H.1W <- A.train/pscore
  H.0W <- (1-A.train)/(1-pscore )
  # their difference is the one-dimensional covariate used for the RD target
  H.AW <- H.1W - H.0W
  p.df <- data.frame(pscore, H.1W , H.0W , H.AW)
  colnames(p.df) <- c('pscore', 'H.1W' , 'H.0W' , 'H.AW')
  list(gAdj=gAdj, gform=gform, p.out=p.out, train=data.frame(train, p.df) )
}
#-----------------------------------------------------#-----------------------------------------------------
# get.epsilon - function calculate the fluctuation coefficient
# input:
# data set, goal with 'aRR'=arithmetic RR, verbose
# output:
# estimated fluctuation coefficient (eps)
#-----------------------------------------------------#-----------------------------------------------------
get.epsilon <- function(train, goal, verbose=F){
  # Estimate the fluctuation coefficient(s) for the TMLE targeting step by
  # logistic regression of Y on the clever covariate(s) with the initial
  # logit-prediction as offset (no intercept).
  #
  # input:  train - data with A, Y, alpha, QbarAW, H.AW, H.0W, H.1W
  #         goal  - 'RD' uses the 1-dim covariate H.AW;
  #                 otherwise (e.g. aRR) the 2-dim covariate (H.0W, H.1W)
  #         verbose - print the estimated coefficient(s)
  # output: named numeric eps ('H.AW', or c('H.0W','H.1W'))
  A.train<- train$A
  Y.train<- train$Y
  # Skip the fluctuation fit if the outcome is constant (all 0 or all 1)
  # within either arm - the logistic update would be degenerate.
  # (|| not |: these are scalar conditions and should short-circuit.)
  Skip.update <- mean(Y.train[A.train==1])==0 || mean(Y.train[A.train==0])==0 ||
    mean(Y.train[A.train==1])==1 || mean(Y.train[A.train==0])==1
  if(goal=='RD'){
    # if going after RD, then use a 1-dim clever covariate
    if(!Skip.update){
      logitUpdate<- suppressWarnings(
        glm(Y.train ~ -1 +offset(qlogis(train$QbarAW )) + train$H.AW, family="binomial", weights=train$alpha))
      # coef() accessor instead of partial-matching $coef
      eps<- coef(logitUpdate)
    } else{
      eps<- 0
    }
    names(eps) <- 'H.AW'
  }else{
    # targeting the risk or odds ratio requires a two-dimensional clever covariate
    if( !Skip.update ){
      logitUpdate<- suppressWarnings(
        glm(Y.train ~ -1 +offset(qlogis(train$QbarAW )) + train$H.0W + train$H.1W, family="binomial", weights=train$alpha))
      eps<- coef(logitUpdate)
    } else{
      eps <- c(0,0)
    }
    names(eps)<- c('H.0W', 'H.1W')
  }
  if(verbose) print(eps)
  eps
}
#-----------------------------------------------------#-----------------------------------------------------
# do.targeting - function to update initial estimators of QbarAW
# input:
# data set (train), fluctuation coefficient (eps), goal (aRR= arithmetic risk ratio; otherwise RD)
# output: data.frame w/ targeted predictions: Qbar*(A,W), Qbar*(1,W), Qbar*(0,W)
#-----------------------------------------------------#-----------------------------------------------------
do.targeting <- function(train, eps, goal){
  # Update the initial predictions on the logit scale with the estimated
  # fluctuation coefficient(s) eps, producing the targeted predictions
  # QbarAW.star, Qbar1W.star, Qbar0W.star appended to train.
  prob.txt <- train$pscore      # estimated pscore g(W)
  prob.con <- 1 - prob.txt      # its complement
  if(goal=='RD'){
    # one-dimensional fluctuation through the clever covariate H.AW
    QbarAW.star <- plogis( qlogis(train$QbarAW) + eps*train$H.AW )
    Qbar1W.star <- plogis( qlogis(train$Qbar1W) + eps/prob.txt )
    Qbar0W.star <- plogis( qlogis(train$Qbar0W) - eps/prob.con )
  } else {
    # two-dimensional fluctuation through (H.0W, H.1W)
    QbarAW.star <- plogis( qlogis(train$QbarAW) + eps['H.0W']*train$H.0W + eps['H.1W']*train$H.1W )
    Qbar0W.star <- plogis( qlogis(train$Qbar0W) + eps['H.0W']/prob.con )
    Qbar1W.star <- plogis( qlogis(train$Qbar1W) + eps['H.1W']/prob.txt )
  }
  data.frame(train, QbarAW.star, Qbar1W.star, Qbar0W.star)
}