forked from aurelienbibaut/C-TMLE-continuous_index
-
Notifications
You must be signed in to change notification settings - Fork 0
/
C-TMLE_truncation.R
201 lines (184 loc) · 10.5 KB
/
C-TMLE_truncation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
source('TMLE_scripts/utilities.R')
source('TMLE_scripts/generate_data.R')
library(speedglm); library(caret)
# Initial model fit for the Q part of the likelihood
fit_Q1_n_d <- function(observed_data.df, d0, intercept_only = F){
if(intercept_only){
Q1_n_d.model_fit <- speedglm(L1 ~ A0, subset(observed_data.df, A0 == 1),
family = binomial())
}else{
Q1_n_d.model_fit <- speedglm(L1 ~ A0, subset(observed_data.df, A0 == 1),
family = binomial())
}
Q1_n_d.fitted_values <- predict(Q1_n_d.model_fit, newdata = observed_data.df,
type = 'response')
list(model_fit = Q1_n_d.model_fit,
fitted_values = Q1_n_d.fitted_values)
}
# Build a sequence of TMLEs with increasing fits
# and return the corresponding fluctuation models
C_TMLE.build_TMLEs_sequence <- function(observed_data.df, d0, gns_d, Q1_n_d, max_its = Inf){
fluctuation_models <- list()
# First targeting step
C_d <- 1 / gns_d[, 1] # clever covariate, setting A0 to d0(A0)
Q1_n_d.logit <- logit(Q1_n_d)
observed_data.df <- cbind(observed_data.df, C_d = C_d,
offset = Q1_n_d.logit)
epsilon <- speedglm(L1 ~ offset(offset) + C_d - 1,
subset(observed_data.df, A0 == 1),
family = binomial())$coefficients[1]
Q1_n_d_star.logit <- Q1_n_d.logit + epsilon * C_d
# Subsequent targeting steps
Qns_d.logit <- list(Q1_n_d.logit)
Qn_d_stars.logit <- list(Q1_n_d_star.logit)
fluctuation_models[[1]] <- as.matrix(t(c(clever_covariate.id = 1,
epsilon = epsilon)),
nrow = 1)
if(max_its == 1) return(fluctuation_models)
for(k in 2:min(ncol(gns_d), max_its)){
# Perform targeting step based on gn_d_delta, fluctuating Qn_{k-1}
observed_data.df$offset <- Qns_d.logit[[length(Qns_d.logit)]]
C_d <- 1 / gns_d[, k] # clever covariate, setting A0 to d0(A0)
epsilon <- speedglm(L1 ~ offset(offset) + C_d - 1,
subset(observed_data.df, A0 == 1),
family = binomial())$coefficients[1]
candidate_Q_n_d_star.logit <- Qns_d.logit[[length(Qns_d.logit)]] + epsilon * C_d
# Does the candidate Q_n_star have a better goodness of fit than the current latest Q_n_star?
candidate_Q_n_d_star <- expit(candidate_Q_n_d_star.logit)
candidate_Q_n_d_star.loss.summand <-
- (observed_data.df$A0 == d0(observed_data.df$L0)) * (observed_data.df$L1 * log(candidate_Q_n_d_star) +
(1 - observed_data.df$L1) * log(1 - candidate_Q_n_d_star))
current_Q_n_d_star <- expit(Qns_d.logit[[length(Qns_d.logit)]])
current_Q_n_d_star.loss.summand <-
- (observed_data.df$A0 == d0(L0)) * (observed_data.df$L1 * log(current_Q_n_d_star) +
(1 - observed_data.df$L1) * log(1 - current_Q_n_d_star))
# It's okay if Q(Y | A, W) = 1 and Y = 1, or Q(Y | A, W) = 0 and Y = 0. Thus I take the corresponding NaN's out
candidate_Q_n_d_star.loss.summand[is.nan(candidate_Q_n_d_star.loss.summand)] <- 0
current_Q_n_d_star.loss.summand[is.nan(current_Q_n_d_star.loss.summand)] <- 0
candidate_Q_n_d_star.loss <- sum(candidate_Q_n_d_star.loss.summand)
current_Q_n_d_star.loss<- sum(current_Q_n_d_star.loss.summand)
if(candidate_Q_n_d_star.loss < current_Q_n_d_star.loss){
cat("k = ", k, ", fluctuating Q_n_d_{k-1} improves the fit directly\n")
Qns_d.logit[[k]] <- Qns_d.logit[[k - 1]]
Qn_d_stars.logit[[k]] <- candidate_Q_n_d_star.logit
# The new fluctuation is obtained from the previous one by replacing
# the last clever covariate by the current one
fluctuation_models[[k]] <- fluctuation_models[[k - 1]]
fluctuation_models[[k]][nrow(fluctuation_models[[k]]), ] <- c(clever_covariate.id = k,
epsilon = epsilon)
}else{
cat("k =", k, ", fluctuating Qn_d_star_{k-1}\n")
# Fluctuate Q_n_star_{k-1} based on g_n_delta
observed_data.df$offset <- Qn_d_stars.logit[[k - 1]]
Qns_d.logit[[k]] <- Qn_d_stars.logit[[k - 1]]
epsilon <- speedglm(L1 ~ offset(offset) + C_d - 1,
subset(observed_data.df, A0 == 1),
family = binomial())$coefficients[1]
Qn_d_stars.logit[[k]] <- Qn_d_stars.logit[[k - 1]] + epsilon * C_d
# The new fluctation model is obtained from the previous one by
# adding to it the current clever covariate
fluctuation_models[[k]] <- rbind(fluctuation_models[[k - 1]],
c(k, epsilon))
}
}
fluctuation_models
}
# Compute the C-TMLE for given initial estimator, sequence of increasingly
# unbiased propensity scores, and fluctuation model
C_TMLE.predict <- function(Q1_d_n, gns_d, fluctuation_model){
C_ds <- 1 / gns_d
expit(logit(Q1_d_n) +
as.matrix(C_ds[, fluctuation_model[, "clever_covariate.id"]]) %*% fluctuation_model[, "epsilon.C_d"])
}
# Compute the cross validated log likelihood of a Q_d fit
test_set.log_likelihood <- function(Qn_d_star.test_set, test_set.df){
L0 <- test_set.df$L0; A0 <- test_set.df$A0; L1 <-test_set.df$L1
summand <- -((A0 == d0(L0)) * (L1 * log(Qn_d_star.test_set) +
(1 - L1) * log(1 - Qn_d_star.test_set)))
# It's okay if Q(Y | A, W) = 1 and Y = 1, or Q(Y | A, W) = 0 and Y = 0. Thus I take the corresponding NaN's out
summand[is.nan(summand)] <- 0
sum(summand)
}
# Compute increasingly unbiased sequence of propensity scores
# To be modified for one's particular application
fit_propensity_scores_sequence <- function(training_set.df, d0, test_set.df = NULL, max_degree = 3){
gns_d.training_set <- vector()
if(!is.null(test_set.df)){
gns_d.test_set <- vector()
}else{
gns_d.test_set <- NULL
}
for(deg in 1:max_degree){
gn.model_fit <- speedglm(A0 ~ 1 + poly(L0, degree = deg, raw = TRUE),
data = training_set.df, family = binomial())
gn.training_set <- predict(gn.model_fit, newdata = training_set.df,
type = 'response')
if(!is.null(test_set.df)) gn.test_set <- predict(gn.model_fit, newdata = test_set.df,
type = 'response')
gns_d.training_set <- cbind(gns_d.training_set,
(training_set.df$A0 == d0(training_set.df$L0)) * gn.training_set +
(training_set.df$A0 != d0(training_set.df$L0)) * (1 - gn.training_set))
if(!is.null(test_set.df)) gns_d.test_set <- cbind(gns_d.test_set,
(test_set.df$A0 == d0(test.df$L0)) * gn.test_set +
(test_set.df$A0 != d0(test.df$L0)) * (1 - gn.test_set))
}
gns_d.training_set <- cbind(g_to_g_delta(0.2, gns_d.training_set[, 1]),
g_to_g_delta(0.1, gns_d.training_set[, 1]),
g_to_g_delta(0.01, gns_d.training_set[, 1]), gns_d.training_set)
if(!is.null(test_set.df)) gns_d.test_set <- cbind(g_to_g_delta(0.2, gns_d.test_set[, 1]),
g_to_g_delta(0.1, gns_d.test_set[, 1]),
g_to_g_delta(0.01, gns_d.test_set[, 1]), gns_d.test_set)
list(gns_d.training_set = gns_d.training_set,
gns_d.test_set = gns_d.test_set)
}
# Main C-TMLE function
C_TMLE <- function(observed_data.df, d0, V = 5, Q.intercept_only = F){
# Make V folds of the observed dataset
n <- nrow(observed_data.df)
folds <- createFolds(1:n, V)
test_sets.loss <- matrix(NA, nrow = 6, ncol = V) # loss of each TMLE in the sequence on each test set
# Compute the sequence of TMLEs for each fold
# and its log likelihood loss on the test set
for(fold_id in 1:length(folds)){
cat("Fold ", fold_id, "\n")
test_set.df <- observed_data.df[folds[[fold_id]], ]
training_set.df <- observed_data.df[setdiff(1:n, folds[[fold_id]]), ]
# Fit the sequence of TMLEs on the training set
Q1_d_n.fit <- fit_Q1_n_d(training_set.df, d0, intercept_only = Q.intercept_only)
gns_d <- fit_propensity_scores_sequence(training_set.df, d0, test_set.df, 3)
fluctuation_models <- C_TMLE.build_TMLEs_sequence(training_set.df, d0,
gns_d$gns_d.training_set, Q1_d_n.fit$fitted_values)
# Compute the loss of each TMLE on the test set
Q1_d_n.test_set <- predict(Q1_d_n.fit$model_fit, newdata = test_set.df,
type = 'response')
Q_d_n_stars.test_set <- lapply(fluctuation_models,
function(fluctuation_model) C_TMLE.predict(Q1_d_n.test_set,
gns_d$gns_d.test_set,
fluctuation_model))
test_sets.loss[, fold_id] <- unlist(lapply(Q_d_n_stars.test_set,
function(Q_d_n_star.test_set) test_set.log_likelihood(Q_d_n_star.test_set,
test_set.df)))
}
# Find best TMLE in the sequence based on the cross validated losses
CV_losses <- apply(test_sets.loss, 1, sum)
best_index <- which.min(CV_losses)
cat("Best TMLE's index:", best_index, "\n")
# Return the corresponding TMLE based on the full data
Q1_d_n.full_data <- fit_Q1_n_d(observed_data.df, d0, intercept_only = Q.intercept_only)$fitted_values
gns_d.full_data <- fit_propensity_scores_sequence(observed_data.df, d0, test_set.df, 3)
fluctuation_models.full_data <- C_TMLE.build_TMLEs_sequence(training_set.df, d0,
gns_d$gns_d.training_set,
Q1_d_n.fit$fitted_values,
max_its = best_index)
C_TMLE.predict(Q1_d_n.test_set,
gns_d$gns_d.test_set, fluctuation_models.full_data[[best_index]])
}
# Test the C-TMLE procedure
observed_data <- generate_data('L0_exp', 2, 4, 1, 2, 0.5, 1e4,
alpha1 = -0.01, alpha2 = 1,
beta3 = 1)
d0 <- alwaysTreated0
observed_data.df <- data.frame(L0 = observed_data$L0,
A0 = observed_data$A0,
L1 = observed_data$L1)
C_TMLE.Q_d_n_star <- C_TMLE(observed_data.df, d0, V = 5, Q.intercept_only = F)