-
Notifications
You must be signed in to change notification settings - Fork 1
/
createoutcomes.Rmd
448 lines (344 loc) · 18 KB
/
createoutcomes.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
---
title: "DoD Military Community and Family Policy Military OneSource eNewsletter"
author: "Paul Testa and Jake Bowers"
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output:
html_document:
toc: TRUE
---
```{r init,echo=F}
library(knitr)
opts_chunk$set(eval=T,echo=F,results="hide",message=F,warning=F,cache=T)
```
\tableofcontents
# Setup
- Download "designdata.csv" and "subscriptiondata.csv" onto local machine
- Load files and libaries
```{r setup}
wrkdat<-read.csv("data/designdata.csv",as.is=TRUE)
subscribers<-read.csv("data/subscriptiondata.csv",as.is=TRUE)
```
# Data
First check data
```{r clean}
stopifnot(nrow(wrkdat)==491879) # 491879 observations
# Make sure emails are unique:
stopifnot(length(unique(wrkdat$emailhash))==dim(data)[1])
stopifnot(length(unique(subscribers$emailhash))==dim(subscribers)[1])
stopifnot(nrow(subscribers)==7759) # 7759 subscriptions
stopifnot(sum(wrkdat$emailhash%in%subscribers$emailhash)==5563) # 5563 subscribers in initial email list
# Rename Treatment indicator
names(wrkdat)[3]<-"treatment"
```
# Mark unmatchable subscribers
If a subscriber has no match on email or full name or last name to any email or name in the experimental pool, note this. These people are unmatchable. They may be people who subscribed but were outside the study. Or they may be people in the study who subscribed changing their names and email addresses. For now, we exclude these people from matching. We'll return to them later.
```{r}
subscribers$noemail2match <- !(subscribers$emailhash %in% wrkdat$emailhash)
subscribers$noname2match <- !(subscribers$name1hash %in% na.omit(unlist(wrkdat[,grep("name[0-9].?hash",names(wrkdat),value=TRUE)])))
subscribers$nolastname2match <- !(subscribers$Last.Name1hash %in% na.omit(unlist(wrkdat[,grep("Last.Name[0-9].?hash",names(wrkdat),value=TRUE)])))
subscribers$noway2match <- subscribers$noemail2match & subscribers$noname2match & subscribers$nolastname2match
stopifnot(sum(subscribers$noway2match)==177)
```
There are `r sum(subscribers$noway2match)` people who have no email, no name,
and no last name that overlap with anyone in the original experimental pool
(this includes looking accross up to 24 different names and up to 24 different
last names). If these people subscribed but were not part of the experiment, we
can just exclude them. If these people were part of the experiment, then we
have 744 positive outcomes with unknown relationship with treatment assignment.
The question we will ask is whether our main results will change (and how much)
given a few ways to distribute these responses across treatments and
batches.
## Match on email
Then, match those sent email treatments to subscribers by email address.
```{r}
# Matches by email
stopifnot(sum(wrkdat$emailhash%in%subscribers$emailhash)==5563) # 5563
# Matches by name1hash
stopifnot(sum(wrkdat$name1hash%in%subscribers$name1hash)==7951) # 7951 subscribers in
# Additional variables
## table(wrkdat$Order.Within.Condition)
table(wrkdat$Condition.Order.Within.Batch)
### Make sure that each row is a unique email address
stopifnot(nrow(wrkdat)==length(unique(wrkdat$emailhash)))
stopifnot(nrow(subscribers)==length(unique(subscribers$emailhash)))
row.names(wrkdat)<-wrkdat$emailhash
row.names(subscribers)<-subscribers$emailhash
## Make sure that Subscribe.Date has no missing values or extreme values.
stopifnot(all(!is.na(subscribers$Subscribe.Date)))
sort(unique(subscribers$Subscribe.Date))[1:10]
rev(sort(unique(subscribers$Subscribe.Date)))[1:10]
```
First match on email addresses. If an email address in the design data ends up
in the subscriber file, then we know that person subscribed to the newsletter.
```{r}
matchedemails<-intersect(row.names(wrkdat),row.names(subscribers))
wrkdat[matchedemails,"sdate"]<-subscribers[matchedemails,"Subscribe.Date"]
stopifnot(sum(!is.na(wrkdat$sdate))==5563)
subscribers$noemailmatch<- !(row.names(subscribers) %in% matchedemails)
stopifnot(sum(!subscribers$noemailmatch)==5563)
## Excluding the people for whom there is no point in matching for now
subdat<-subscribers[subscribers$noemailmatch & !subscribers$noway2match,]
```
So, we have 2019 subscribers who were not matched on email address.
## Match on full name within batch
Next, among those 2169 unmatched subscribers, try to match on name within
batch. This is tricky but we start simple. Add batch information to the
subscription file to add in merging otherwise we will have *at least* 900
subscriptions with no match to the experimental treatments.
We assume that if you have subscribed after the mailing of batch A but before
batch B, then you were randomized to batch A.
```{r}
table(wrkdat$Batch)
table(as.Date(subscribers$Subscribe.Date,format="%m/%d/%y"),exclude=c())
```
Add start date of batch using information provided by the agency.^[After cleaning the file in a spreadsheet to remove extraneous information.]
```{r}
batchdates<-read.csv("batchdates.csv",as.is=TRUE,header=TRUE,strip.white=TRUE)
## Remove empty rows
batchdates[batchdates==""]<-NA ## replace empty strings with NA
emptyrow<-apply(batchdates,1,function(x){ all(is.na(x))})
badrows<-apply(batchdates,1,function(x){ tmp<-grep("TAR",x); if(length(tmp)==0){ return(FALSE) } else { TRUE } })
batchdates<-batchdates[!emptyrow & !badrows,]
batchdates$Batch<-rep(1:82,each=2)
stopifnot(all(na.omit(batchdates$Batch.Number-batchdates$Batch)==0))
## Fill in treatment information
batchdates$eMail.to.send[seq(2,164,by=2)]<-batchdates$eMail.to.send[seq(1,163,by=2)]
batchdates$eMail.to.send.1[seq(2,164,by=2)]<-batchdates$eMail.to.send.1[seq(1,163,by=2)]
batchdates$eMail.to.send.2[seq(2,164,by=2)]<-batchdates$eMail.to.send.2[seq(1,163,by=2)]
batchdates$eMail.to.send.3[seq(2,164,by=2)]<-batchdates$eMail.to.send.3[seq(1,163,by=2)]
batchdates$eMail.to.send.4[seq(2,164,by=2)]<-batchdates$eMail.to.send.4[seq(1,163,by=2)]
batchdates$eMail.to.send.5[seq(2,164,by=2)]<-batchdates$eMail.to.send.5[seq(1,163,by=2)]
## Pull off the actual send times and change the data to have multiple rows for batches rather than columns
bdlong<-reshape(batchdates,direction="long",v.names=c("eMail.to.send","Details"),
varying=list(eMail.to.send=grep("eMail.to.send",names(batchdates),value=TRUE),
Details=grep("Details",names(batchdates),value=TRUE)))
bdlongActual <- bdlong[grep("Actual",bdlong$Details),]
## Three actual send times were not recorded (we assume they were actually sent). So use the set time.
##bdlongActual[bdlongActual$Batch==16&bdlongActual$eMail.to.send=="A",] <- bdlong[bdlong$Batch==16&bdlong$eMail.to.send=="A"&bdlong$Batch.Number==16,"Details"]
## Strip off "PM"
bdlongActual[bdlongActual$Batch==16&bdlongActual$eMail.to.send=="A","Details"] <- bdlong["31.4","Details"]
bdlongActual[bdlongActual$Batch==36&bdlongActual$eMail.to.send=="E","Details"] <- bdlong["71.5","Details"]
bdlongActual[bdlongActual$Batch==80&bdlongActual$eMail.to.send=="F","Details"] <- bdlong["159.6","Details"]
## Make sure that each treatment is associated with one and only one batch
stopifnot(all(with(bdlongActual,table(Batch,eMail.to.send))==1))
## Convert Details into dates
bdlongActual$dateString<-sapply(strsplit(bdlongActual$Details,":\ "),function(x){ x[2] })
bdlongActual$date <- as.Date(bdlongActual$dateString,format="%d-%b-%Y %H:%M:%S")
bdlongActual[grep("PM",bdlongActual$dateString),"date"]<-as.Date(bdlongActual$dateString[grep("PM",bdlongActual$dateString)],
,format="%m/%d/%Y %H:%M:%S %p")
stopifnot(sum(is.na(bdlongActual$date))==0)
table(bdlongActual$date)
```
Now, the subscription data only has day of subscription, no time. So, in order
to map date of subscription onto batch among those not matched by email, we
have to collapse batches into a day (often there were two batches per day). We
hope that we can still back out original batch after doing the matching by name
and collapsed-batch. If not, we will lose some statistical power because our
blocking variable (batch) will be less fine grained, but we will probably gain
more because we can increase the sample size and also will reduce our problems
with unmatchable subscribers (unknown relationships between treatement
assignment and outcome).
```{r}
subdat$dateDay<-as.Date(subdat$Subscribe.Date,format="%m/%d/%y")
subscribers$dateDay<-as.Date(subscribers$Subscribe.Date,format="%m/%d/%y")
## Batch Day/Collapsed Batch to Date mapping
bdlongActual$dateDay <- round(bdlongActual$date)
stopifnot(length(unique(bdlongActual$dateDay))==41)
batchdaydat<-data.frame(dateDay=unique(bdlongActual$dateDay),
batchday=rank(unique(bdlongActual$dateDay)))
row.names(batchdaydat)<-batchdaydat$dateDay
## Just add appropriate batchday info to the subdat data
subdat<-merge(subdat,batchdaydat,all.x=TRUE,sort=FALSE)
## Now we have some subscription dates that are not on a mailing day
sum(is.na(subdat$batchday))
stopifnot(length(unique(subdat$emailhash))==nrow(subdat))
row.names(subdat)<-subdat$emailhash
intermaildates<-unique(subdat$dateDay[is.na(subdat$batchday)])
## Choose the mailing date that is closest to the subscription day, yet before the subscription day
getcloseday<-function(x){
# x is date subscribed, a scalar
y <- batchdaydat$dateDay
thediff<-x - y
return( as.Date(max(y[thediff>=0])) )
}
getcloseday(subdat$dateDay[1])
## This is a very weird hack to ensure that the newdate column stayed a Date
subdat$newdate<-NA
class(subdat$newdate)<-"Date"
for(i in 1:length(subdat$dateDay)){
subdat$newdate[i]<-getcloseday(subdat$dateDay[i])
}
stopifnot(all(with(subdat[!is.na(subdat$batchday),],dateDay-newdate)==0))
subdat$batchday2<-NA
subdat$batchday2 <- batchdaydat[as.character(subdat$newdate),"batchday"]
## Check that each newdate is associated with one batch and only one batch
thetab<-with(subdat,table(newdate,batchday2))
stopifnot(all(c(unique(thetab[upper.tri(thetab)]), unique(thetab[lower.tri(thetab)]))==0))
stopifnot(nrow(subdat)==2019)
```
Now, make a collapsed batchday variable on the design data:
```{r}
wrkdat$batchday2<-as.numeric(cut(wrkdat$Batch,seq(0,82,by=2)))
## Make sure that the new batch collects exactly two of the original batches
tmptab<-with(wrkdat,table(batchday2,Batch))
stopifnot(all(apply(tmptab,1,function(x){ sum(x!=0) }) == 2 ))
stopifnot(all(apply(tmptab,2,function(x){ sum(x!=0) }) == 1 ))
```
Now match subscribers who have the same name *and* batch number as experimental pool subjects.
```{r}
## Make a dataset called expool for those people in the experimental pool not matched on email
expool<-wrkdat[is.na(wrkdat$sdate),]
stopifnot(nrow(expool)==486316)
## Go batch by batch just for clarity. There are probably faster vectorized ways.
lastnms<-grep("Last.Name[0-9].?hash$",names(wrkdat),value=TRUE)
expbatches <- split(expool[,c("emailhash","name1hash","sdate","batchday2",lastnms)],expool$batchday2)
subbatches <- split(subdat,subdat$batchday2)
stopifnot(length(expbatches)==length(subbatches))
## Remove subscriber names that are duplicated within batch
uniqsubs<-lapply(subbatches,function(dat){
thetab <- table(dat$name1hash)
dups <- names(thetab[thetab>1])
if(length(dups)==0){
message("ok")
return(dat)
} else {
message(length(dups))
return(dat[!(dat$name1hash %in% dups),])
}
})
## How many names do we remove? About 4*2=8.
table(sapply(subbatches,nrow)-sapply(uniqsubs,nrow))
## Now merge from the subscribers onto the experimental pool batch by batch
merge2list<-list()
for(i in 1:length(expbatches)){
message(i)
merge2list[[i]]<- merge(expbatches[[i]],uniqsubs[[i]],by="name1hash",all.x=TRUE,sort=FALSE)
stopifnot(nrow(merge2list[[i]])==nrow(expbatches[[i]]))
uniqsubs[[i]]$matched <- uniqsubs[[i]]$name1hash %in% expbatches[[i]]$name1hash
}
sum(sapply(merge2list,nrow))
mergeexpool<-do.call("rbind",merge2list)
stopifnot(nrow(mergeexpool)==486316)
mergeexpool$sdate<-mergeexpool$Subscribe.Date
row.names(mergeexpool)<-mergeexpool$emailhash
subdat2<-do.call("rbind",uniqsubs)
row.names(subdat2)<-subdat2$emailhash ## subdat2 take unmatched by email and adds full names
stopifnot(sum(!subdat2$matched)==1206)
subdat[row.names(subdat2),"name1matched"]<-subdat2$matched
rm(expool,expbatches,uniqsubs,merge2list,i,subbatches)
```
So we have 1206 subscribers who are not matched after matching on full name and email address.
## Match on last name within batch
Now, do one last matching on unique last names within batch. Some inspection of
the data suggested that people called "Jake Bowers" might sometimes use "J
Bowers" or "Jacob Bowers". So, among those not already matched, and among those
last names that are unique within batch, do one last merge.
```{r}
subdat2<-subdat2[!subdat2$matched,] ## for next round of matching, exclude people already matched
## First remove last names that are duplicated within batch among subscribers (about 39)
subdat2$duplastname<-unsplit(lapply(split(subdat2,subdat2$batchday2),function(dat){
thetab <- table(dat$Last.Name1hash)
dups <- names(thetab[thetab>1])
dat$Last.Name1hash %in% dups
}),subdat2$batchday2)
sub2batches<-split(subdat2[!subdat2$duplastname,],subdat2[!subdat2$duplastname,"batchday2"])
mergeexpool$Last.Name1hash<-mergeexpool$Last.Name1hash.x
mergeexpool$emailhash<-mergeexpool$emailhash.x
mergeexpool$batchday2<-mergeexpool$batchday2.x
expool2batches<-split(mergeexpool[,c("emailhash",lastnms,"batchday2","sdate")],mergeexpool$batchday2)
## Find the subscriber last names which uniquely match *any* of the experimental pool last names within batchday
whichsub2list<-list()
for(i in 1:length(expool2batches)){
message(i)
whichsub2list[[i]]<-apply(expool2batches[[i]][,lastnms],1,function(x){
##res<-sub2batches[[i]]$Subscribe.Date[sub2batches[[i]]$Last.Name1hash %in% x]
res<- which(sub2batches[[i]]$Last.Name1hash %in% x)
if(length(res)==0 | length(res)>1){
## Not a match if no match or if more than one unique match
return(NA)
} else {
return(res)
}
})
thetab <- table(whichsub2list[[i]])
uniqnms <- as.numeric(names(thetab[thetab==1]))
poolrows <- names(whichsub2list[[i]][whichsub2list[[i]] %in% uniqnms])
expool2batches[[i]]$sdate3<-NA
expool2batches[[i]][poolrows,"sdate3"]<- sub2batches[[i]][uniqnms,"Subscribe.Date"]
sub2batches[[i]]$nolastnmmatch<-TRUE
sub2batches[[i]][uniqnms,"nolastnmmatch"]<-FALSE
}
subdat3<-do.call("rbind",sub2batches)
stopifnot(nrow(subdat2[!subdat2$duplastname,])==nrow(subdat3))
row.names(subdat3)<-subdat3$emailhash ## subdat3 are those unmatched by email or full name within batch and matches by last names
mergeexpool2 <- do.call("rbind",expool2batches)
stopifnot(nrow(mergeexpool2)==nrow(mergeexpool))
rm(expool2batches,sub2batches)
mergeexpool2$finalsdate<-with(mergeexpool2,ifelse(is.na(sdate3),sdate,sdate3))
table(is.na(mergeexpool2$finalsdate))
table(is.na(mergeexpool2$sdate))
table(is.na(mergeexpool2$sdate3))
stopifnot(length(unique(mergeexpool2$emailhash))==nrow(mergeexpool2))
row.names(mergeexpool2)<-mergeexpool2$emailhash
```
## Put it all back together
Put the experimental pool back together.
```{r}
nrow(wrkdat)
sum(!is.na(wrkdat$sdate))
nrow(mergeexpool) ## The those in experimental pool with no email match.
nrow(mergeexpool2) ## Adds name and last name matches
setdiff(names(mergeexpool),names(wrkdat))
wrkdat[row.names(mergeexpool2),"sdateNms"]<-mergeexpool2$finalsdate
table(is.na(wrkdat$sdateNms),is.na(wrkdat$sdate))
wrkdat$finalsdate<-with(wrkdat,ifelse(is.na(sdate),sdateNms,sdate))
wrkdat$subscribed<-as.numeric(!is.na(wrkdat$finalsdate))
table(wrkdat$subscribed)
nrow(subscribers)-sum(wrkdat$subscribed)
```
Looks like we will have about 1177 subscribers Which subscribers are still not matched? What (pseudo)batches were they in?
```{r}
nrow(subscribers) ## 7759 subscribers
sum(subscribers$noway2match) ## 177 no match possible, maybe not in the experiment at all?
sum(!subscribers$noemailmatch) ## 5563 email matches: 7759 - 5563 - 177 = 2019 unmatched at this point
sum(subdat$name1matched,na.rm=TRUE) ## 805 of the no email matches matched on name within batch
sum(!subdat3$nolastnmmatch) ## 196 of the no name and no email matches matched on at least one last name within batch
## So, 2019-805-196+177 = 1195 unmatched
subdat3$lastnmmatch<-!subdat3$nolastnmmatch
## About 2019 people were not matched by email and also had no overlap in name with the experimental pool at all
setdiff(names(subdat),names(subscribers)) ## subdat are those unmatched by email and unmatchable by name
subscribers[row.names(subdat),"batchday2"]<-subdat$batchday2
subscribers[row.names(subdat),"newdate"]<-subdat$newdate
##setdiff(names(subdat2),names(subscribers))
subscribers[row.names(subdat),"name1matched"]<-subdat$name1matched
subscribers[is.na(subscribers$name1matched),"name1matched"]<-FALSE
setdiff(names(subdat3),names(subscribers))
subscribers[row.names(subdat3),"lastnmmatch"]<-subdat3$lastnmmatch
subscribers[is.na(subscribers$lastnmmatch),"lastnmmatch"]<-FALSE
nrow(subscribers)
thetab<- with(subscribers,table(possmatch=!noway2match,emailmatch=!noemailmatch,name1matched,lastnmmatch))
## The FALSE, FALSE, FALSE top row shows the unmatched
ftable(thetab,col.vars="possmatch")
subscribers$matched<-with(subscribers,{ !noemailmatch | name1matched | lastnmmatch })
table(subscribers$matched,exclude=c())
## Total number of subscriptions on the experimental pool file and the total number on our file.
sum(wrkdat$subscribed) - sum(subscribers$matched)
```
There is a discrepancy of 18 cases between what we have assigned to the experimental pool and what we calculate as final matches among the subscribers. This is a very small amount given the total number of missing data, so, for now, we will proceed with analysis and return to sleuth this problem out during the bounding analysis.
```{r}
## This is a very weird hack to ensure that the newdate column stayed a Date
subscribers$newdate<-NA
class(subscribers$newdate)<-"Date"
for(i in 1:length(subscribers$dateDay)){
subscribers$newdate[i]<-getcloseday(subscribers$dateDay[i])
}
subscribers$batchday2<-NA
subscribers$batchday2 <- batchdaydat[as.character(subscribers$newdate),"batchday"]
## The unmatched people are distributed unevenly across the batchdays
with(subscribers[!subscribers$matched,],table(batchday2,exclude=c()))
```
# Save files for analysis
```{r}
write.csv(wrkdat,file="data/wrkdat.csv")
write.csv(subscribers,file="data/subscribersPlus.csv")
```