diff --git a/day11-MVMatching.Rmd b/day11-MVMatching.Rmd index d2ff890..77b62a5 100644 --- a/day11-MVMatching.Rmd +++ b/day11-MVMatching.Rmd @@ -32,6 +32,8 @@ output: reveal_plugins: ["notes","search","zoom","chalkboard"] pandoc_args: [ "--csl", "chicago-author-date.csl" ] css: icpsr-revealjs.css + includes: + in_header: icpsr-revealjs.html --- @@ -144,6 +146,7 @@ $\\newcommand{\\permsd}{\\ensuremath{\\sigma_{\\Pdistsym}}}$ $\\newcommand{\\dZ}[1]{\\ensuremath{d_{Z}[{#1}]}}$ $\\newcommand{\\tz}[1]{\\ensuremath{t_{{z}}[#1]}}$ $\\newcommand{\\tZ}[1]{\\ensuremath{t_{{Z}}[#1]}} $ +$\\newcommand{\\bX}[1]{\\ensuremath{\\mathbf{X}[#1]}} $ '` ## Today @@ -252,7 +255,7 @@ text(2,-1.2,label=round(dist(rbind(colMeans(Xsd),Xsd["407",])),2)) -The Mahalanobis distance \citep{mahalanobis1930test}, avoids the scale and correlation problem in the euclidean distance.^[For more see ] $dist_M = \sqrt{ (\bm{x} - \bm{\bar{x}})^T \bm{M}^{-1} (\bm{y} - \bm{\bar{y}}) }$ where $\bm{M}=\begin{bmatrix} var(x) & cov(x,y) \\ cov(x,y) & var(y) \end{bmatrix}$ +The Mahalanobis distance \citep{mahalanobis1930test} avoids the scale and correlation problems of the Euclidean distance.^[For more see ] $dist_M(\mathbf{x},\mathbf{y}) = \sqrt{ (\mathbf{x} - \mathbf{y})^T \mathbf{M}^{-1} (\mathbf{x} - \mathbf{y}) }$ where $\mathbf{M}=\begin{bmatrix} var(x) & cov(x,y) \\ cov(x,y) & var(y) \end{bmatrix}$ Here, using simulated data: The contour lines show points with the same @@ -396,6 +399,63 @@ setdiffsfm1 ``` +## Balance testing by hand + +```{r} +## NB: meddat$fmPs and xb2 are created in chunks that appear later in this file; run (or move) those chunks before this one. +d.stat<-function(zz, mm, ss){ + ## this is the d statistic (harmonic mean weighted diff of means statistic) + ## from Hansen and Bowers 2008 + h.fn<-function(n, m){(m*(n-m))/n} + myssn<-apply(mm, 2, function(x){sum((zz-unsplit(tapply(zz, ss, mean), ss))*x)}) + hs<-tapply(zz, ss, function(z){h.fn(m=sum(z), n=length(z))}) + mywtsum<-sum(hs) + myadjdiff<-myssn/mywtsum + return(myadjdiff) +} + +newexp <- function(zz,ss){ + newzz <- unsplit(lapply(split(zz,ss),sample),ss) + return(newzz) +} + +## Test: newexp() should permute treatment only within strata, leaving the treatment-by-stratum margins unchanged + +thetab <- table(meddat$nhTrt,meddat$fmMhRank,exclude=c()) +test1 <- newexp(meddat$nhTrt,meddat$fmMhRank) +thetabtest <- table(test1,meddat$fmMhRank,exclude=c()) +all.equal(thetab, thetabtest, check.attributes=FALSE) + +d.dist<-replicate(100, d.stat(zz=newexp(zz=meddat$nhTrt,ss=meddat$fmPs), + mm=meddat[,c("nhPopD","nhAboveHS")], ss=meddat$fmPs)) + + +obsds <- d.stat(zz=meddat$nhTrt,mm=meddat[,c("nhPopD","nhAboveHS")],ss=meddat$fmPs) + +d2.stat <- function(dstats,ddist=NULL,theinvcov=NULL){ + ## dstats is the vector of d statistics + ## ddist is the matrix of the null reference distributions of the d statistics + if(is.null(theinvcov) & !is.null(ddist)){ + as.numeric( t(dstats) %*% solve(cov(t(ddist))) %*% dstats) + } else { + as.numeric( t(dstats) %*% theinvcov %*% dstats) + } +} + +invCovDDist <- solve(cov(t(d.dist))) +obs.d2<- d2.stat(obsds,d.dist,invCovDDist) + +d2.dist<-apply(d.dist, 2, function(thed){ + d2.stat(thed,theinvcov=invCovDDist) + }) +## The chi-squared reference distribution only uses a one-sided p-value going in the positive direction +d2p<-mean(d2.dist>=obs.d2) +cbind(obs.d2,d2p) +xb2$overall[,] + + +``` + + ## Matching on the Mahalanobis Distance @@ -444,6 +504,10 @@ psdist <- match_on(theglm,data=meddat) psdist[1:4,1:4] fmPs <- fullmatch(psdist,data=meddat) summary(fmPs,min.controls=0,max.controls=Inf) +xb2 <- balanceTest(nhTrt~nhPopD+nhAboveHS + strata(fmMh) + strata(fmMhRank) + strata(fmPs), data=meddat, + report="all") +xb2$overall[,] +meddat$fmPs <- factor(fmPs) ``` ## Matching on the propensity score @@ -455,6 +519,7 @@
simpdist <- outer(thepscore,thepscore,function(x,y){ abs(x-y) }) mad(thepscore[meddat$nhTrt==1]) mad(thepscore[meddat$nhTrt==0]) optmatch:::match_on_szn_scale(thepscore,Tx=meddat$nhTrt) +simpdist["101",c("401","402","403")] simpdist["101",c("401","402","403")]/.9137 psdist["101",c("401","402","403")] ``` @@ -474,6 +539,36 @@ xb4 <- balanceTest(update(balfmla,.~.+strata(fmMh)+strata(fmMhRank)+ strata(fmPs data=meddat,report="all",p.adjust.method="none") xb4$overall[,] xb4$results[,"p",] + +xb2$results[,,"fmPs"] +setdiffs <- meddat %>% group_by(fmPs) %>% summarize(md=mean(nhPopD[nhTrt==1]) - mean(nhPopD[nhTrt==0]), + nb=n(), + nTb = sum(nhTrt), + nCb = sum(1-nhTrt), + hwt = ( 2*( nCb * nTb ) / (nTb + nCb)) +) + +with(setdiffs,sum(md*nTb/sum(nTb))) + +xb2a <- xBalance(nhTrt ~ nhPopD + nhAboveHS,strata=list(fmPs=~fmPs),data=meddat,report="all") +xb2a$results +obststat <- with(setdiffs, sum(md*hwt/sum(hwt))) + + +newexp <- function(zz,ss){ + newzz <- unsplit(lapply(split(zz,ss),sample),ss) + return(newzz) +} + + +with(meddat,newexp(zz=nhTrt,ss=fmPs)) + +weighted.mean(setdiffsHR$mneddiffs,w=setdiffsHR$hwt) +thelmversion <- + lm(HomRate08~nhTrt+fmEx1,data=droplevels(meddat[!is.na(meddat$fmEx1),])) + + + ``` ## Can you do better? @@ -494,6 +589,7 @@ The optmatch package allows calipers (which forbids certain pairs from being mat ```{r} quantile(as.vector(psdist),seq(0,1,.1)) psdistCal <- psdist + caliper(psdist,2) +as.matrix(psdist)[5:10,5:10] as.matrix(psdistCal)[5:10,5:10] ``` ## Calipers diff --git a/day12-MatchingTools.Rmd b/day12-MatchingTools.Rmd index aa9ec30..aaf5f72 100644 --- a/day12-MatchingTools.Rmd +++ b/day12-MatchingTools.Rmd @@ -3,6 +3,7 @@ title: Matching Tools and Approaches date: '`r format(Sys.Date(), "%B %d, %Y")`' author: ICPSR 2019 Session 2 bibliography: + - BIB/abbrev-long.bib - BIB/refs.bib - BIB/master.bib - BIB/misc.bib @@ -23,14 +24,6 @@ output: in_header: - defs-all.sty pandoc_args: [ "--csl", "chicago-author-date.csl" ] - revealjs::revealjs_presentation: - slide_level: 2 - incremental: true - transition: none - fig_caption: true - self_contained: false - reveal_plugins: ["notes","search","zoom","chalkboard"] - pandoc_args: [ "--csl", "chicago-author-date.csl" ] --- @@ -99,38 +92,40 @@ library(mvtnorm) ## Today - 1. Agenda: - 2. Reading for tomorrow and next week: DOS 8--9, 13 and \cite[\S~9.5]{gelman2006dau}, and \cite{hans:04} \cite{ho:etal:07} - 3. Questions arising from the reading or assignments or life? + 1. Agenda: Spiral back to optimal, full matching versus greedy matching, + with-replacement, without-replacement, fixed ratio, etc..; spiral back to + details of propensity scores; give you some practice time so that new + questions can be raised. + 2. Reading for tomorrow and next week: DOS 8--9, 13 and + \cite[\S~9.5]{gelman2006dau}, and \cite{hans:04}, \cite{ho:etal:07} + 3. Additional reading for today \cite{hans:klop:06} and citations therein for + technical details on optimal, full matching. + 4. Questions arising from the reading or assignments or life? # But first, review: - -\begin{frame}[fragile] -\frametitle{What have we done so far?} - -The problem of attrition in randomized studies (simple experiments, -encouragement designs, etc.) - -\medskip +## What have we done so far? Making the case for adequate adjustment in observational studies -\begin{itemize} -\item The potential for making this case for one or more variables using -stratification. -\item Using optimal, full matching technology to make and evaluate -stratifications. 
-\item Mahalanobis distance to compare units on $\bm{x}$. -\item The propensity score to compare units on $\hat{Z} \leftarrow \bm{x}$ -\end{itemize} - -\end{frame} + - The potential for making this case for one or more variables using + stratification. + - Using optimal, full matching technology to make and evaluate + stratifications --- stratification reduces extrapolation and interpolation + without requiring models of adjustment (models relating $\bm{x}$ to $Z$ and + $Y$). + - Scalar distance --- to represent substantive knowledge about key + alternative explanations. + - Mahalanobis distance to compare units on $\bm{x}$ --- to try to include all + covariates equally. + - The propensity score to compare units on $\hat{Z} \leftarrow \bm{x}$ --- to + downweight covariates not relevant for confounding. + - Assessing stratifications using the block-randomized design as a standard + of comparison. (What does this mean in practice? What does `xBalance` or `balanceTest` do?) # Matching on Many Covariates: Using Propensity Scores -\begin{frame} -\frametitle{The propensity score} +## The propensity score Given covariates $\mathbf{x} (=(x_1, \ldots, x_k))$, and a treatment variable $Z$, $Z(u) \in \{0, 1\}$, $\PP (Z \vert \mathbf{x})$ is known as the (true) \textbf{propensity score} (PS). @@ -140,67 +135,68 @@ with an estimated PS, $\hat{\PP} (Z \vert \mathbf{x})$ or $\hat{\phi}(\mathbf{x})$. Theoretically, propensity-score strata or matched sets both -\begin{enumerate} -\item reduce extrapolation; and -\item balance each of $x_1, \ldots, x_k$. -\end{enumerate} + + 1. reduce extrapolation; and + 2. balance each of $x_1, \ldots, x_k$. + They do this by making the comparison more "experiment-like", at least in terms of $x_1, \ldots, x_k$. -Theory \citep{rosrub83} also tells us that in the absence of hidden bias, such a stratification -item supports unbiased estimation of treatment effects. +Theory [@rosrub83] also tells us that in the **absence of hidden bias**, such a stratification +supports unbiased estimation of treatment effects. -\end{frame} -\begin{frame} -\frametitle{Propensity scoring in practice} +## Propensity scoring in practice -\begin{columns} -\column{.65\linewidth} -\begin{itemize} -\item Fitted propensity scores help identify extrapolation. -\item In practice, stratification on $\hat{\phi}(\mathbf{x})$ -helps balance each of $x_1, \ldots, x_k$. -\item There are \emph{lots of cases} in which adjustment with the propensity -score fails to generate estimates that square with those of -randomized studies. -\end{itemize} -\column{.35\linewidth} \igrphx{meddatpsplot} -\end{columns} + - Fitted propensity scores help identify extrapolation (see the sketch below). + - In practice, stratification on $\hat{\phi}(\mathbf{x})$ +helps balance each of $x_1, \ldots, x_k$ compared to no stratification. -\begin{itemize} -\item There are various reasons for this, starting with: lots of observational studies that -don't measure quite enough $x$es. -\item (Propensity scores address bias on measured variables, not -unmeasured ones.) \textit{hidden bias}. -\end{itemize} -\end{frame} +There are \emph{lots of cases} in which adjustment with the propensity score alone fails to generate estimates that agree with those of randomized studies. + +There are various reasons for this, starting with: + + - lots of observational studies that don't measure quite enough $x$es, or the right $x$es, or the right $x$es measured in the right way + - **hidden biases** --- propensity scores address bias on measured variables, not unmeasured ones.
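
To make the first bullet above concrete, here is a small added sketch (not from the original slides; `psglm` and `psscore` are illustrative names) of the kind of overlap check that the `meddatpsplot` figure summarizes, using the same logistic regression that is fit in the chunks below: compare fitted scores for treated and control neighborhoods and look for treated units lying outside the range of the controls --- those are the comparisons that only extrapolation can "adjust."

```{r psoverlapsketch, eval=FALSE}
## Sketch: inspect overlap/extrapolation on the fitted propensity score
psglm <- glm(nhTrt ~ nhPopD + nhAboveHS, data = meddat, family = binomial(link = "logit"))
meddat$psscore <- predict(psglm)  ## linear predictor (log-odds) scale
boxplot(psscore ~ nhTrt, data = meddat,
        xlab = "Metrocable neighborhood (nhTrt)",
        ylab = "Fitted propensity score (log odds)")
with(meddat, tapply(psscore, nhTrt, range))  ## treated scores outside the control range signal extrapolation
```

Stratifying or matching on the score, as in the rest of these notes, is one way to confine comparisons to regions where both groups have support.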
+ -\begin{frame}[fragile] -\frametitle{More intuition about the propensity score} -A propensity score is the output of a function of covariates as they relate to $Z$ (the "treatment" or "intervention"). Why reduce the dimension of $\bm{x}$ in this way rather than, say, using Mahalanobis distance? + +## More intuition about the propensity score + +A propensity score is the output of a function of covariates as they relate to +$Z$ (the "treatment" or "intervention"). Why reduce the dimension of $\bm{x}$ +in this way rather than, say, using Mahalanobis distance? \medskip Recall that an experiment breaks the relationship between $Z$ and $\bm{x}=\{ x_1,x_2,\ldots \}$ but not between $\bm{x}$ and $Y$ or $y_1,y_0$. +\includegraphics[width=.25\textwidth]{xyzdiagram.pdf} -\begin{tikzcd}[column sep=small] +```{r tikzarrows, eval=FALSE, include=FALSE, engine='tikz', engine.opts=list(template="icpsr-tikz2pdf.tex")} +\usetikzlibrary{arrows} +\begin{tikzcd}[ampersand replacement=\&, column sep=small] Z \arrow[from=1-1,to=1-3] & & Y \\ - & \bm{x} \arrow[from=2-2,to=1-1, "\text{0 if Z rand}"] \arrow[from=2-2,to=1-3] & + & \mathbf{x} \arrow[from=2-2,to=1-1, "\text{0 if Z rand}"] \arrow[from=2-2,to=1-3] & \end{tikzcd} +``` Making strata of units who are similar on the propensity score reduces (or removes) -the relationship between $Z$ and the relevant $\bm{x}$ within strata (either the -units have similar values for $\bm{x}$ or the particular $x$s do not have a +the relationship between $Z$ and the relevant $\mathbf{x}$ within strata (either the +units have similar values for $\bm{x}$ or the particular $x$s which do not have a strong (linear, additive) relationship with $Z$). -\end{frame} +## Matching on the propensity score -```{r echo=FALSE, cache=TRUE} +Common practice uses logistic regression to make the scores. (We will be +using something other than logistic regression in the future because of +separation problems / overfitting when the number of covariates increases +relative to sample size.) + +```{r loadmeddat, include=FALSE, echo=FALSE, cache=TRUE} load(url("http://jakebowers.org/Data/meddat.rda")) meddat<- mutate(meddat, HomRate03=(HomCount2003/Pop2003)*1000, @@ -208,13 +204,6 @@ meddat<- mutate(meddat, row.names(meddat) <- meddat$nh ``` -## Matching on the propensity score - -Common practice is to use logistic regression to make the scores. (We will be -using something other than logistic regression in the future because of -separation problems / overfitting when the number of covariates increases -relative to sample size.) - ```{r} theglm <- glm(nhTrt~nhPopD+nhAboveHS,data=meddat,family=binomial(link='logit')) thepscore <- theglm$linear.predictor @@ -258,157 +247,6 @@ optmatch:::match_on_szn_scale(thepscore,Tx=meddat$nhTrt) simpdist["101",c("401","402","403")]/.9137 psdist["101",c("401","402","403")] ``` - - -# Matching Tricks of the Trade: Calipers, Exact Matching - -## Calipers - -The optmatch package allows calipers (which disallow certain pairs from being matched).^[You can also implement penalties by hand by directly setting elements of the matrices to be `Inf`. Or you can implement penalties by just inflating the size of different distance matrix entries.] Here, for example, we disallow comparisons which differ by more than 2 standard deviations on the propensity score. 
- -```{r} -quantile(as.vector(psdist),seq(0,1,.1)) -psdistCal <- psdist + caliper(psdist,2) -as.matrix(psdistCal)[5:10,5:10] -``` -## Calipers - -The optmatch package allows calipers (which disallow certain pairs from being matched).^[You can implement penalties by hand.] Here, for example, we disallow comparisons which differ by more than 2 standard deviations on the propensity score. - -```{r} -fmCal1 <- fullmatch(psdist+caliper(psdist,2),data=meddat,tol=.00001) -summary(fmCal1,min.controls=0,max.controls=Inf) -pmCal1 <- pairmatch(psdist+caliper(psdist,2),data=meddat, remove.unmatchables=TRUE) -``` - -## Calipers - -Another example: We may want to primarily match on mahalanobis distance but -disallow any pairs with extreme propensity distance and/or extreme differences -in baseline homicide rates (here using many covariates all together). - -```{r} -thecovs <- unique(c(names(meddat)[c(5:7,9:24)],"HomRate03")) -balfmla<-reformulate(thecovs,response="nhTrt") -mhdist <- match_on(balfmla,data=meddat,method="rank_mahalanobis") -tmpHom03 <- meddat$HomRate03 -names(tmpHom03) <- rownames(meddat) -absdistHom03 <- match_on(tmpHom03, z = meddat$nhTrt,data=meddat) -quantile(as.vector(absdistHom03),seq(0,1,.1)) -## Apply the calipers to the distance matrix -distCal <- mhdist + caliper(psdist,2) + caliper(absdistHom03,2) -as.matrix(distCal)[5:10,5:10] -``` - -## Calipers - -```{r} -as.matrix(distCal)[5:10,5:10] -fmCal2 <- fullmatch(distCal,data=meddat,tol=.00001) -summary(fmCal2,min.controls=0,max.controls=Inf) -``` - - -## Exact Matching - -We often have covariates that are categorical/nominal and on which we really care about strong balance. One approach to solve this problem is match **exactly** on one or more of such covariates. If `fullmatch` or `match_on` is going slow, this is also an approach to speed things up. - -```{r echo=FALSE} -meddat$classLowHi <- ifelse(meddat$nhClass %in% c(2,3),"hi","lo") -``` - -```{r} -dist2 <- mhdist + exactMatch(nhTrt~classLowHi,data=meddat) -dist3 <- dist2 + caliper(psdist,3) -## or mhdist <- match_on(balfmla,within=exactMatch(nhTrt~classLowHi,data=meddat),data=meddat,method="rank_mahalanobis") -## or fmEx1 <- fullmatch(update(balfmla,.~.+strata(classLowHi)),data=meddat,method="rank_mahalanobis") -fmEx1 <- fullmatch(dist3,data=meddat,tol=.00001) -summary(fmEx1,min.controls=0,max.controls=Inf) - -xbEx1 <- - balanceTest(update(balfmla,.~.+strata(fmEx1)),data=meddat,report="all") - - -xbEx1 - -meddat$fmEx1 <- fmEx1 - -setdiffsHR <- meddat %>% filter(!is.na(fmEx1)) %>% group_by(fmEx1) %>% summarize(mneddiffs = - mean(HomRate08[nhTrt==1]) - - mean(HomRate08[nhTrt==0]), - nb = n(), - pr = mean(nhTrt==1), - nTb = sum(nhTrt==1), - nCb = nb - nTb, - hwt = ( 2*( nCb * nTb ) / (nTb + nCb))) - - -setdiffsHR - - -weighted.mean(setdiffsHR$mneddiffs,w=setdiffsHR$hwt) -thelmversion <- - lm(HomRate08~nhTrt+fmEx1,data=droplevels(meddat[!is.na(meddat$fmEx1),])) - -coef(thelmversion)[2] -``` - -## Exact Matching - -```{r} -print(fmEx1,grouped=T) -``` -## Exact Matching - -\scriptsize -```{r} -ftable(Class=meddat$classLowHi,Trt=meddat$nhTrt,fmEx1,col.vars=c("Class","Trt")) -``` - -## Back to matching on a scalar - -Why would matching on a scalar like `nhAboveHS` yield so many sets? 
- -```{r} -table(round(meddat$nhAboveHS,4),meddat$nhTrt,exclude=c()) -``` - -```{r} -tmpHS <- meddat$nhAboveHS -names(tmpHS) <- rownames(meddat) -absdistHS <- match_on(tmpHS, z = meddat$nhTrt,data=meddat) -absHSMatch <- fullmatch(absdistHS,data=meddat,tol=.0000001) -summary(absHSMatch,min.controls=0,max.controls=Inf) -meddat$absHSMatch <- absHSMatch -setdiffsHS <- meddat %>% group_by(absHSMatch) %>% summarize(mneddiffs = - mean(nhAboveHS[nhTrt==1]) - - mean(nhAboveHS[nhTrt==0]), - mnnhAboveHS = mean(nhAboveHS), - minnhAboveHS = min(nhAboveHS), - maxnhAboveHS = max(nhAboveHS), - setsize=n()) - -setdiffsHS -``` - -## Back to matching on a scalar - -These sets make sense because the individual neighborhoods have different -values for baseline homicide rate. - -```{r} -absHomMatch <- fullmatch(absdistHom03,data=meddat) -meddat$absHomMatch <- absHomMatch -setdiffsHM <- meddat %>% group_by(absHomMatch) %>% summarize(mneddiffs = - mean(HomRate03[nhTrt==1]) - - mean(HomRate03[nhTrt==0]), - mnHomRate03 = mean(HomRate03), - minHomRate03 = min(HomRate03), - maxHomRate03 = max(HomRate03)) - -setdiffsHM -``` - # Major Matching modes ## Optimal (communitarian) vs greedy (individualistic) matching @@ -425,7 +263,7 @@ Tom & 1& 1 & 20 & $\infty$ \\ \hline \end{tabular} \end{center} -```{r results="hide", warnings=FALSE, messages=FALSE} +```{r results="hide", warning=FALSE, messages=FALSE} bookmat <- matrix(c(0,1,1,10,10,0,10,10,1,1,20,Inf),nrow=3,byrow=TRUE) dimnames(bookmat) <- list(c("Ben","Jake","Tom"),c("Mo","John","Terry","Pat")) fullmatch(bookmat) @@ -463,7 +301,7 @@ match keeps all the obs, and has mean distance (1+0+1)/3=.67. } \note{ \begin{itemize} \item This is what many understand by matching. [Greedy pair match follows.] - \item Sometimes matching means ``with-replacement'' matching. We'll permit that too, but in a structured way, and keeping track of how many replacements. + \item Sometimes matching means ``with-replacement'' matching. We'll permit that too, but in a structured way, and keeping track of how many replacements. \item Why keep track of the replacements? --B/c they affect standard errors of the matched comparison. (``Effective sample size'' to be introduced a few slides later.) \end{itemize} } @@ -733,26 +571,6 @@ New site\\ \end{tikzpicture} - -## The World of Matching Today - -This is an active and productive research area (i.e. lots of new ideas, not everyone agrees -about everything). Here are just a few of the different approaches to creating -strata or sets or selecting subsets. - - - The work on cardinality matching and fine balance - - ) - - The work on speedier approximate full matching - - - The work on using genetic algorithms to (1) find approximate strata - with-replacement and (2) to find - an approximation to a completely randomized study (i.e. best subset - selection) - - The work on coarsened exact matching - - The work on entropy-balancing approach to causal effects . - - The covariate-balancing propensity score (CBPS) . - # Information and Balance: Matching structure and effective sample size ## Matching with a varying number of controls and full matching @@ -966,26 +784,20 @@ plot(graph_from_adjacency_matrix(blah2,mode="undirected",diag=FALSE), ## Matching so as to maximize effective sample size +Effective sample size for this match = $2 * 4/3 + 5* 3/2 = 10.17$ --- compare to 7 for pairs, 9.33 for triples. 
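
To see where that arithmetic comes from (an added sketch; it assumes, as the sentence above implies, that `fmnuke` produced two sets with one treated and two control plants plus five sets with one treated and three controls), each set contributes its harmonic-mean weight $2 n_{Tb} n_{Cb} / (n_{Tb} + n_{Cb})$:

```{r essarithmetic, eval=FALSE}
## Harmonic-mean weight of a set with nTb treated and nCb control units
hwt <- function(nTb, nCb) { 2 * (nTb * nCb) / (nTb + nCb) }
hwt(1, 2)                        ## each 1:2 set counts for 4/3
hwt(1, 3)                        ## each 1:3 set counts for 3/2
2 * hwt(1, 2) + 5 * hwt(1, 3)    ## 10.17, versus 7 for 7 pairs and 9.33 for 7 triples
```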
+ \tiny -```{r} +```{r tidy=FALSE} effectiveSampleSize(fmnuke) -nukewts <- cbind(nuke.nopt,fmnuke) %>% - group_by(fmnuke) %>% summarise(nb = n(), - nTb = sum(pr), - nCb = nb - nTb, +nukewts <- cbind(nuke.nopt,fmnuke) %>% group_by(fmnuke) %>% summarise(nb = n(), + nTb = sum(pr), nCb = nb - nTb, hwt = ( 2*( nCb * nTb ) / (nTb + nCb))) nukewts -dim(nukewts) sum(nukewts$hwt) -``` - -## Matching so as to maximize effective sample size - -```{r} stratumStructure(fmnuke) ``` -So effective sample size for this match = $2 * 4/3 + 5* 3/2 = 10.17$ --- compare to 7 for pairs, 9.33 for triples. +## Matching so as to maximize effective sample size ```{r} nukemh <- match_on(pr~date+cap,data=nuke.nopt) @@ -1058,29 +870,67 @@ str(mds) quantile(unlist(mds)) ``` +# Work in Class -## Playing with Caliper Matching -```{r eval=FALSE} +## Can you do better? + +**Challenge:** What is your best matched design for the question about the +effect of the Metrocable intervention on Homicides in 2008? ("Best" in that it +(1) might be defensible in substantive terms and (2) compares most favorably +with a block-randomized experiment in regard to observed covariates.) + +# Other Matching Technology + +## Playing with Caliper Matching -install.packages("/Library/gurobi702/mac64/R/gurobi_7.0-2.tgz",repos=NULL) +```{r designmatchtry, eval=TRUE} +## install.packages("/Library/gurobi810/mac64/R/gurobi_8.1-0_R_3.5.0.tgz",repos=NULL) library(gurobi) library(designmatch) -distmat <- as.matrix(dist3) -distmat[is.infinite(distmat)]<-1000 -res <- bmatch(t_ind=meddat$nhTrt,dist_mat=distmat, - n_controls=1, - solver=list(name='gurobi',approximate=0)) +thecovs <- unique(c(names(meddat)[c(5:7,9:24)],"HomRate03")) +balfmla<-reformulate(thecovs,response="nhTrt") + +distmat <- as.matrix(psdist) +distmat[is.infinite(distmat)]<-1000*max(distmat) +z <- as.vector(meddat$nhTrt) +mom_covs <- fill.NAs(update(balfmla,.~-nhClass+.),data=meddat) +mom_tols <- round(absstddif(mom_covs,z,20),2) +momlist <- list(covs=mom_covs,tols=mom_tols) +fine_covs <- matrix(meddat$nhClass,ncol=1) +finelist <- list(covs=fine_covs) + +# solverlist <- list(name='gurobi',approximate=0,t_max=1000,round_cplex=0,trace=1) +solverlist <- list(name='glpk',approximate=1,t_max=1000,round_cplex=0,trace=1) + +res <- bmatch(t_ind=z,dist_mat=NULL, ##distmat, + mom=momlist, + #fine=finelist, + solver=solverlist) ``` +## RCBalance -## Summary: +```{r} +unloadNamespace("dplyr") +unloadNamespace("plyr") +library(plyr) +library(dplyr) +library(rcbalance) -What do you think?
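+## Added note (an interpretation of the rcbalance package, not part of the original slides): build.dist.struct() turns the covariates into a treated-by-control distance structure; rcbalance() then looks for an optimal match that also pursues fine balance on the variables named in fb.list (here nhClass); exclude.treated=TRUE permits dropping treated units that cannot be matched acceptably.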
+mydist <- build.dist.struct(z=z,X=meddat[,thecovs]) +rcb <- rcbalance(mydist,fb.list=list("nhClass"), + treated.info=meddat[meddat$nhTrt==1,], + control.info=meddat[meddat$nhTrt==0,], +exclude.treated=TRUE) + +rcb$matches + +``` ## References diff --git a/day9-AdjustmentBalance.Rmd b/day9-AdjustmentBalance.Rmd index b2df34f..ce5ba59 100644 --- a/day9-AdjustmentBalance.Rmd +++ b/day9-AdjustmentBalance.Rmd @@ -3,7 +3,7 @@ title: Statistical Adjustment and Assessment of Adjustment in Observational Stud date: '`r format(Sys.Date(), "%B %d, %Y")`' author: ICPSR 2019 Session 2 bibliography: - - refs.bib + - BIB/refs.bib - BIB/master.bib - BIB/misc.bib fontsize: 10pt @@ -21,8 +21,8 @@ output: template: icpsr.beamer incremental: true includes: - in_header: - - defs-all.sty + in_header: + - defs-all.sty pandoc_args: [ "--csl", "chicago-author-date.csl" ] revealjs::revealjs_presentation: slide_level: 2 @@ -32,6 +32,8 @@ output: self_contained: false reveal_plugins: ["notes","search","zoom","chalkboard"] pandoc_args: [ "--csl", "chicago-author-date.csl" ] + includes: + in_header: icpsr-revealjs.html --- @@ -149,12 +151,10 @@ library(optmatch) ``` ## Today -\begin{enumerate} - \item Agenda: + 1. Agenda: The problem of covariance adjustment to reduce "bias"/ confounding. \textbf{How can we answer the question about whether we have adjusted enough.} A simple approach: stratification on one categorical variable (and interaction effects). A more complex approach: find sets that are as similar as possible in terms of a continuous variable (bipartite matching). Balance assessment after stratification. -\item Reading for tomorrow and next week: DOS 8--9, 13 and \cite[\S~9.5]{gelman2006dau}, and \cite{ho:etal:07} -\item Questions arising from the reading or assignments or life? -\end{enumerate} + 2. Reading for tomorrow and next week: DOS 8--9, 13 and \cite[\S~9.5]{gelman2006dau}, and @ho:etal:07. + 3. Questions arising from the reading or assignments or life? # Strategies for Causal Inference @@ -193,10 +193,8 @@ The problem of covariance adjustment to reduce "bias"/ confounding. \textbf{How ## The Neyman-Rubin Model for (simple) experiments -\vfill This is what randomization ensures: -$$ (\alert<4>{y_t, y_c},\alert<2>{X}) \perp \alert<3>{Z} $$ -\vfill +$$ ({y_t, y_c},{X}) \perp {Z} $$ I.e., each of $X$, $y_{c}$ and $y_{t}$ is balanced between treatment and control groups (in expectation; given variability from randomization). @@ -607,7 +605,7 @@ coef(lm1a)[2] ## Exactly what does this kind of adjustment do? -So, how would you explain what it means to "control for HS-Education" here? +So, how would you explain what it means to "control for HS-Education" here? ```{r} plot(eZX,eYX) ``` -## Did we adjust enough? +## Did we adjust enough? Maybe adding some more information to the plot can help us decide whether, and to what extent, we effectively "controlled for" the proportion of the neighborhood with more than High School education. Specifically, we might be interested in assessing extrapolation/interpolation problems arising from our linear assumptions.
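
Here is one added sketch of how such information might be put on the plot (not from the original slides; it assumes that `eZX` and `eYX` are the residuals from regressing the treatment and `HomRate08`, respectively, on `nhAboveHS`, as in the preceding chunks): overlay the least-squares line, a flexible smoother, and a rug of the data, so that disagreement between the two lines, or fitted values in regions with little data, flags interpolation and extrapolation problems.

```{r adjenoughsketch, eval=FALSE}
## Sketch: compare the linear adjustment with a smoother on the residual-versus-residual plot
plot(eZX, eYX,
     xlab = "Treatment residual (nhTrt after removing nhAboveHS)",
     ylab = "Outcome residual (HomRate08 after removing nhAboveHS)")
abline(lm(eYX ~ eZX), lwd = 2)          ## the slope OLS uses for the "adjusted" effect
lines(loess.smooth(eZX, eYX), lty = 2)  ## a smoother; gaps from the line suggest the linear fit misleads
rug(eZX)                                ## sparse regions are where adjustment leans on extrapolation
```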
@@ -750,11 +748,11 @@ scatter3d(HomRate08~nhAboveHS+nhPopD, ## The Problem of Using the Linear Model for Adjustment - - Problem of Interepretability: "Controlling for" is "removing (additive) linear relationships" it is not "holding constant" + - Problem of Interpretability: "Controlling for" is "removing (additive) linear relationships"; it is not "holding constant" - Problem of Diagnosis and Assessment: What is the standard against which we can compare a given linear covariance adjustment specification? - Problem of extrapolation and interpolation: Often known as "common support", too. - Problems of overly influential points and curse of dimensionality: As dimensions increase, odds of influential point increase (ex. bell curve in one dimension, one very influential point in 2 dimensions); also real limits on number of covariates (roughly $\sqrt{n}$ for OLS). - - Problems of bias: + - Problems of bias: \begin{equation} Y_i = \beta_0 + \beta_1 Z_i + e_i (\#eq:olsbiv) \end{equation} diff --git a/day9-AdjustmentBalance.pdf b/day9-AdjustmentBalance.pdf index be2b794..2f2f475 100644 Binary files a/day9-AdjustmentBalance.pdf and b/day9-AdjustmentBalance.pdf differ diff --git a/defs-all.sty b/defs-all.sty index 0b48622..cb5dad3 100755 --- a/defs-all.sty +++ b/defs-all.sty @@ -45,26 +45,18 @@ \newcommand{\covp}{\ensuremath{\mathbf{Cov}_{P}}} \newcommand{\covpn}{\ensuremath{\mathbf{Cov}_{P_{n}}}} \newcommand{\covq}{\ensuremath{\mathbf{Cov}_{Q}}} - \newcommand{\hatvar}{\ensuremath{\widehat{\mathrm{Var}}}} \newcommand{\hatcov}{\ensuremath{\widehat{\mathrm{Cov}}}} - \newcommand{\sehat}{\ensuremath{\widehat{\mathrm{se}}}} - \newcommand{\combdiff}[1]{\ensuremath{\Delta_{{z}}[#1]}} \newcommand{\Combdiff}[1]{\ensuremath{\Delta_{{Z}}[#1]}} - \newcommand{\psvec}{\ensuremath{\varphi}} \newcommand{\psvecgc}{\ensuremath{\tilde{\varphi}}} - - \newcommand{\atob}[2]{\ensuremath{#1\!\! :\!\!
#2}} \newcommand{\stratA}{\ensuremath{\mathbf{S}}} \newcommand{\stratAnumstrat}{\ensuremath{S}} \newcommand{\sAsi}{\ensuremath{s}} - \newcommand{\permsd}{\ensuremath{\sigma_{\Pdistsym}}} -% \newcommand{\dz}[1]{\ensuremath{d_{z}[{#1}]}} \newcommand{\dZ}[1]{\ensuremath{d_{Z}[{#1}]}} \newcommand{\tz}[1]{\ensuremath{t_{{z}}[#1]}} \newcommand{\tZ}[1]{\ensuremath{t_{{Z}}[#1]}} @@ -86,11 +78,12 @@ \newenvironment{Column}[1][.5\linewidth]{\begin{column}{#1}}{\end{column}} -\usepackage{tikz} -\usepackage{tikz-cd} -\usepackage{textpos} +%\usepackage{tikz} +%\usepackage{tikz-cd} +%\usepackage{textpos} -\usetikzlibrary{arrows} % also see \tikzstyle spec below after \begin{doc} +\usepgflibrary{arrows} % also see \tikzstyle spec below after \begin{doc} +%\usetikzlibrary{arrows} % also see \tikzstyle spec below after \begin{doc} \newcommand{\mlpnode}[1]{\tikz[baseline=-.5ex] \coordinate (#1) {};} %\newcommand{\mlpnode}[1]{\raisebox{.5ex}{\pnode{#1}}} diff --git a/icpsr-revealjs.css b/icpsr-revealjs.css index d5eb74f..0f28ad5 100644 --- a/icpsr-revealjs.css +++ b/icpsr-revealjs.css @@ -1,5 +1,5 @@ .reveal, -.reveal h1,{ font-size: 2.5em; } +.reveal h1,{ font-size: 2em; } .reveal h2, .reveal h3, .reveal h4, @@ -8,3 +8,20 @@ font-family: "Fira Sans"; } +.scrollable-slide { + height: 800px; + overflow-y: auto !important; +} +::-webkit-scrollbar { + width: 6px; +} +::-webkit-scrollbar-track { + -webkit-box-shadow: inset 0 0 6px rgba(0,0,0,0.3); +} +::-webkit-scrollbar-thumb { + background-color: #333; +} +::-webkit-scrollbar-corner { + background-color: #333; +} + diff --git a/icpsr-tikz2pdf.tex b/icpsr-tikz2pdf.tex new file mode 100644 index 0000000..bdb35c3 --- /dev/null +++ b/icpsr-tikz2pdf.tex @@ -0,0 +1,20 @@ +\documentclass{article} +\include{preview} +\usepackage[pdftex,active,tightpage]{preview} +\usepackage{amsmath,amssymb,bm} +\usepackage{notation} +\usepackage{tikz} +\usepackage{tikz} +\usepackage{tikz-cd} +\usepackage{textpos} +\usetikzlibrary{arrows} % also see \tikzstyle spec below after \begin{doc} +\usepgflibrary{arrows} % also see \tikzstyle spec below after \begin{doc} +\usetikzlibrary{matrix} + +\tikzstyle{every picture}+=[remember picture] + +\begin{document} +\begin{preview} +%% TIKZ_CODE %% +\end{preview} +\end{document} diff --git a/icpsr.beamer b/icpsr.beamer index fa33c3c..ffae2a3 100644 --- a/icpsr.beamer +++ b/icpsr.beamer @@ -24,9 +24,7 @@ $endif$ \usepackage{tikz-cd} \usepackage{textpos} \usepackage{ifxetex,ifluatex} -\usepackage{fixltx2e} % provides \textsubscript -\usepackage{ifxetex,ifluatex} -\usepackage{fixltx2e} % provides \textsubscript +% \usepackage{fixltx2e} % provides \textsubscript \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex \usepackage[$if(fontenc)$$fontenc$$else$T1$endif$]{fontenc} \usepackage[utf8]{inputenc} @@ -90,7 +88,7 @@ $endfor$ \setbeamertemplate{sections/subsections in toc}[circle] \setbeamersize{description width=2ex} % \setbeamersize{text margin left=1ex,text margin right=1ex} -\setbeamersize{text margin left=2ex,text margin right=1ex} +\setbeamersize{text margin left=2ex,text margin right=2ex} %%\setbeamersize{ParSkip 1ex plus 1pt minus 1pt} %% %% %% \setbeamertemplate{caption}[numbered] \setbeamertemplate{caption label separator}{:} diff --git a/images/xyzdiagram.pdf b/images/xyzdiagram.pdf new file mode 100644 index 0000000..6621203 Binary files /dev/null and b/images/xyzdiagram.pdf differ