diff --git a/multisite/.Rprofile b/multisite/.Rprofile new file mode 100644 index 0000000..81b960f --- /dev/null +++ b/multisite/.Rprofile @@ -0,0 +1 @@ +source("renv/activate.R") diff --git a/multisite/multisite.Rmd b/multisite/multisite.Rmd new file mode 100644 index 0000000..8456ced --- /dev/null +++ b/multisite/multisite.Rmd @@ -0,0 +1,543 @@ +--- +bibliography: ./refs.bib +link-citations: true +numbersections: true +colorlinks: true +secnumdepth: 2 +number-sections: true +linkReferences: true +output: + bookdown::html_document2: + number_sections: true + toc: true + theme: journal + extensions: +implicit_figure + code_folding: hide + fig_caption: true + includes: + after_body: ../linking_script.html +--- + + + + +# Introduction + +## What is a multisite or block randomized trial? + +A multisite or block-randomized trial is a randomized experiment "in which sample members are randomly assigned to a program or a control group *within* each of a number of sites" (@Raudenbush2015). + +For illustration, this guide will focus on multisite educational trials, although multisite trials are not unique to education. +Multisite trials are a subset of multilevel randomized controlled trials (RCTs), in which units are nested within hierarchical structures, such as students nested within schools nested within districts. +This guide uses as an illustrative example the case where each site is a school, although sites could also be districts or classrooms; thus the terms "site" and "school" are used interchangeably. + +An advantage of multisite trials is that they allow a researcher to study average impact across +units or sites, while also getting a sense of heterogeneity across sites (@Raudenbush2015). +However, the opportunities provided by multisite trials also come with their own challenges. +Much of the rest of this guide will discuss the choices that researchers must make when analyzing multisite trials, and the consequences of these choices. + +## Preliminaries: estimands, estimators, and estimates +Before diving in, let's introduce the definitions of estimand, estimator, and estimate. +These concepts are sometimes conflated, but disentangling them increases clarity and understanding. +The main distinction is that the *estimand* is the goal, while the *estimator* is the analysis we do in order to reach that goal. + +An **estimand** is an unobserved quantity of interest about which the researcher wishes to learn. +In this guide, the only type of estimand considered is the overall average treatment effect (ATE). +Other options include focusing on the treatment effect for only a subgroup, or calculating a different summary, such as an odds ratio. +After choosing an estimand, the researcher chooses an **estimator**, which is a method used to calculate the final **estimate**; the estimate should tell the researcher something about the estimand. +Finally, the researcher must also choose a standard error estimator if she wants to summarize how the estimates might vary if the research design or underlying data generating process were repeated. + +First, to provide context, let's consider an example. +The researcher decides their *estimand* will be the average treatment effect for the pool of subjects in the experiment. +In this example, the researchers observe all of the subjects for whom they want to estimate an effect.
+As with any causal analysis, the researchers do not observe the control outcomes of the subjects assigned to the active treatment, or the treated outcomes of the subjects assigned to the control treatment. +Thus, causal inference is sometimes referred to as a missing data problem, because it is impossible to observe both potential outcomes (the potential outcome given active treatment and the potential outcome given control treatment). +See [10 Things to Know About Causal Inference](https://egap.org/resource/10-things-to-know-about-causal-inference/){target="_blank"} +and [10 Types of Treatment Effect You Should Know About](https://egap.org/resource/10-types-of-treatment-effect-you-should-know-about/){target="_blank"} +for a discussion of other common estimands. + +Given an estimand, the researchers choose their *estimator* to be the coefficient from an OLS regression of the observed outcome on site-specific fixed effects and the treatment indicator. +To calculate standard errors, they use Huber-White robust standard errors. +All these choices result in a point *estimate* (e.g. the program increased reading scores by $5$ points) and a measure of uncertainty (e.g. a standard error of $2$ points). + +Next, we'll also need some notation. +This guide follows the Neyman-Rubin potential outcomes notation (@Neyman1923, @Imbens2015). +The outcomes are $Y_{ij}$ for unit $i$ in site $j$. +The potential outcomes are $Y_{ij}(1)$: the outcome given active treatment, and $Y_{ij}(0)$: the outcome given control treatment. +The quantity $B_{ij}$ is the unit-level intention-to-treat effect (ITT) $B_{ij} = Y_{ij}(1) - Y_{ij}(0)$. +If there is no noncompliance, the ITT is the ATE, as defined above. +Then $B_j$ is the average impact at site $j$, $B_j = 1/N_j \sum_{i = 1}^{N_j} B_{ij}$ where $N_j$ is the number of units at site $j$. +Finally, $N = \sum_{j = 1}^{J} N_j$. + +This guide is structured around the choices an analyst must make concerning estimands and estimators, and the resulting consequences. +The choice of estimand impacts the substantive conclusion that a researcher makes. +The choice of estimator and standard error estimator results in different statistical properties, including a potential tradeoff between bias and variance. +This guide summarizes material using the framework provided by @Miratrix2020. + +# A multisite trial is a type of blocked or stratified randomized experiment. + +## A multisite trial is fundamentally a blocked or stratified RCT. + +A multisite trial is a blocked RCT with 2 levels: randomization occurs at the student level (level 1) within blocks defined by sites/schools (level 2). +For example, in a study of a new online math tool for high school students, students are randomized individually within each high school. +Perhaps half of students at each school are assigned to the status quo / control treatment (no additional math practice), +and half are assigned to the active treatment (an offer of additional math practice at home using an online tool). + +Because of the direct correspondence between multisite trials and blocked experiments, statistical properties of blocked experiments also translate directly to multisite experiments. +The main difference between a traditional blocked RCT and a multisite experiment is that in many blocked RCTs, the researcher is able to choose the blocks. +For example, in a clinical trial, a researcher may decide to block based on gender or specific age categories.
+Blocking can help increase statistical power overall or ensure statistical power to assess effects within subgroups (such as those defined by time of entering the study, or defined by other important covariates that might predict the outcome) [@moore2012multivariate;@moore2013;@bowers2011mem]. +@Pashley2021 makes the distinction between **fixed blocks**, where the number and covariate distribution of blocks is chosen by the researcher, and **structural blocks**, where natural groupings determine the number of blocks and their covariate distributions. +Multisite experiments have structural blocks, such as districts, schools, or classrooms. +The type of block can impact variance estimation, as shown in @Pashley2021 and @Pashley2022. + +The [EGAP Metaketa Projects](https://egap.org/our-work-0/the-metaketa-initiative/){target="_blank"} +are also multisite trials: the 5--7 countries that contain sites for each study are fixed and chosen in advance by the different research teams. + +## A multisite trial is not a cluster-randomized trial + +A different type of RCT is a [cluster-randomized +design](https://egap.org/resource/10-things-to-know-about-cluster-randomization/){target="_blank"} +, +in which entire schools are assigned to either the active treatment or control +treatment. +[This video explains the difference between cluster and +block-randomized designs](https://youtu.be/bL2U9z8hX1k){target="_blank"}. +In a multisite trial, treatment is assigned **within a block to individual units**. +In a cluster-randomized trial, treatment is assigned to **groups** of units. +Some designs [combine cluster- and +block-randomization](https://declaredesign.org/r/designlibrary/reference/block_cluster_two_arm_designer.html){target="_blank"} +. + +Another design that is not a multisite or block-randomized trial is an experiment that takes place in only one school and assigns individual students to active treatment and control treatment: this study has only one site and thus differences +between sites do not matter in this design. + +## Why choose a multisite or block-randomized trial design? + +In most contexts, blocking reduces estimation error over an unblocked (completely randomized) experiment [@moore2012multivariate; @gerber2012field]. +Thus, blocked experiments generally offer higher statistical power than unblocked experiments. +Blocking is most helpful in increasing precision and statistical power in the setting where there is variation in the outcome, and where the blocks are related to this variation. + +In multisite trials, as compared to block-randomized trials with researcher-constructed blocks, the researcher cannot purposely construct blocks to reduce variation, because the blocks are defined by pre-existing sites. +However, the researcher can hope, and often expect, that sites naturally explain some of the variation in the outcome. +For example, if some schools tend to have larger impacts than others, and the size of the impact is related to the average income of families attending that school, then blocked randomization using the school as a block improves efficiency over complete randomization. + +Randomizing with purposefully created blocks or pre-existing sites also helps analysts learn about how treatment effects may vary across the sites or groups of people categorized into the blocks.
+If a new treatment should help the lowest performing students most, but in any given study most students are not the lowest performing, then researchers may prefer to create blocks of students within schools with the students divided by their previous performance. +This blocking within site would allow comparisons of the treatment effects on the relatively rare lowest performing students with the treatment effects on the relatively rare highest performing students. + +## Why not block? + +Often, in a multisite trial with treatment administered by site administrators (like principals of schools), an analyst has no choice but to randomize within site. +In other studies, whether and how to construct blocks is a choice. +@Pashley2022 shows that blocking is generally beneficial, but also explores settings in which it may be harmful. +Blocking does result in fewer degrees of freedom, but in practice this reduction is rarely an issue, unless an experiment is very small [@Imai2008]. +Any use of blocking requires that an analyst keep track of the blocks and reflect them in the subsequent analysis: in many circumstances estimating average treatment effects from a block-randomized experiment while ignoring the blocks will yield biased estimates of the underlying targeted estimands (see ["The trouble with 'controlling for blocks'"](https://declaredesign.org/blog/biased-fixed-effects.html) and ["Estimating Average Treatment Effects in Block Randomized Experiments"](https://egap.org/resource/sd-block-rand) for demonstrations of bias arising from different approaches to weighting by blocks). + +# Analysis can either target the population in the experiment, or a broader population. + +The first choice a researcher must make in defining their estimand is the population of interest. +The researcher may want to focus on the **finite population**: only those units in the experimental pool or sample. +Alternatively, they can expand their estimand to consider the **super population**. +A super population framework considers the units in the experiment to be a sample from a broader, unobserved population, and targets the impact in this larger population. + +A researcher might be interested in a finite population framework if most or all of the population is included in the study. +For example, a state-level policymaker considering results from a statewide trial may only be interested in the impact on schools in their own state. +Similarly, if an organization is evaluating itself and includes all of its own sites, they would use a finite population framework. +An additional common case for a finite population framework is proof-of-concept or pilot studies. +A researcher may be running a small study to test whether an intervention is worth exploring in a larger trial. +They may have even specifically selected a set of units assumed to be a worst case scenario to see whether there is still a measurable impact in such a group. +Finally, many field experiments use a finite population framework out of necessity. +The units and sites available for study may not arrive via any known or replicable sampling process, sometimes called a "convenience sample." + +A super population framework is of interest when a researcher plans to report estimates of the effect on units not included in the given study. +For many trials, the end goal is not to study the units at hand, but rather to provide predictions of the likely impacts if the intervention were expanded.
+For example, a state-level policymaker with access to a trial performed on only a subset of schools in their state might prefer a super population framework. +However, one challenge of the super population framework is that it assumes that sites are randomly sampled from the broader population of interest. +As noted above, sites are often selected based on availability rather than a random sampling approach. +Thus, when taking a super population framework with sites that are not randomly sampled, the population we are making inference about becomes fuzzy. +We may not be able to generalize to the whole population of interest, but instead can only generalize to a broader population of units that could have feasibly been included in the study. + +One of the main consequences of the choice of population framework is the amount of uncertainty in the final estimates. +This topic will be discussed in more detail later in the guide. +When accounting for sites randomly sampled via a known sampling process from a super population, we naturally have an additional source of uncertainty deriving from which units were selected for the study at hand: randomization to +treatment is one source of randomness, and sampling from the population is another source of randomness. +Although the point estimates from either perspective will often be the same, the breadth of intervals will generally be larger for super population studies. + +For more discussion of the consequences of the super population and finite population frameworks, see @Schochet2016 and @Pashley2021. + +# The average site effect is not the same as the average person effect. + +The second choice a researcher makes is the target of inference: is the researcher interested in the **average student**, or +the **average site** (@Miratrix2020)? + +When we consider the average student impact, we weight each student equally. +Thus, larger sites have a larger influence on the final result. +For example, if one very large site is an outlier, the impact at that site will heavily drive the final results. +Taking this approach makes sense from a utilitarian perspective, i.e., if the benefit of the intervention is equal to the total sum of benefits across all people. +Average student impact might be of interest to a high-level policymaker, such as a state official. +The average student impact is +\[ +\frac{1}{N} \sum_{j = 1}^{J} \sum_{i = 1}^{N_j} B_{ij} = \sum_{j = 1}^{J} \frac{N_j}{N} B_j. +\] + +When we consider the average site impact, we weight each site equally. +Thus, larger sites receive the same weight as smaller sites. +A site-level decision maker, such as a school principal, might be more interested in the average site impact, so that site size does not influence the final answer. +The average site impact is +\[ +\frac{1}{J} \sum_{j = 1}^{J} B_j. +\] +Note that in the case where all sites are of the same size, or all sites have the same impact, these two estimands are the same. + +To summarize the previous two sections, there have been two axes of choices: the population of interest (FP or SP, for finite and super population), and the target of inference (persons or sites). +These choices result in four possible estimands: FP-persons, SP-persons, FP-sites, and SP-sites. + +# There are many widely-used estimators that target the same estimands, including design-based, linear regression, and multilevel models. + +After choosing an *estimand*, the researcher must then choose an *estimator*, a process to arrive at the estimate of interest.
+There are three main categories of estimators: **design-based**, **linear regression**, and **multilevel modeling**. +Linear regression and multilevel modeling are both model-based approaches to statistical inference.^[ +Linear regression can be used as a tool in both design-based approaches (to calculate the difference in means) +and model-based approaches (to estimate the parameters of a Normal data-generating process). +In general, this guide considers linear regression as used in a model-based approach.] +In model-based approaches, the researcher estimates the parameters of a likelihood function that is chosen to represent the natural stochastic process that generates the outcomes in the study. +See @rubin:1990b for more discussion of the differences between design- and model-based approaches to statistical inference. + +The different categories of estimator differ both philosophically and practically. +Each category assumes a different source of randomness, and thus has a different statistical justification. + +**Design-based** estimators specifically target the four estimands outlined above. +The only source of uncertainty is assumed to be the treatment assignment: which units happened to be assigned to the active treatment, and which happened to be assigned to the control treatment. +This assumption is the reason for their name; the uncertainty in the estimates is by design, from the purposeful randomization of units. +Using design-based estimators is also sometimes called Neymanian inference, as the estimators and properties were first introduced by Neyman (@Neyman1923). + +**Linear regression** estimators are the most familiar to many researchers. +With these estimators, the observed outcomes are assumed to be a linear function of the treatment assignment, (optionally) site-specific effects, (optionally) covariates, and random error. +In standard regression theory, the only source of randomness is the error term. +The covariates, which in the case of RCTs include the treatment indicator, are considered fixed. +This assumption is in direct contrast to the design-based framework, in which the treatment assignment is considered random. +In econometric theory, the randomness in the error term in regression models is sometimes viewed as deriving from sampling from a larger population. + +**Multilevel model** estimators are a generalization of linear regression. +When assuming *fixed effects*, as in a standard regression model, each site's parameter is considered to be fixed and independent. +When assuming *random effects*, as in a multilevel model, each site's parameter is assumed to be drawn from a shared distribution of site impacts. +Most standard statistical software assumes a Normal distribution to model the site-specific impacts. +Multilevel models can incorporate both random site-level intercepts, and random site-level coefficients (in our case, these are site-specific treatment impacts). +Now, uncertainty stems both from the individual-level random error term and from the site-level parameters being considered random. +In general, multilevel models naturally lend themselves to a super population framework, because they already incorporate the assumption that sites are being randomly drawn from a broader, unobserved population. +Multilevel models are also called mixed effects models or mixed models, where a mixed model has a combination of fixed and random effects. +For a more comprehensive look at multilevel models, see @Raudenbush2015.
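+
+To make these three categories concrete, the following sketch shows one way each might be implemented in R, using a small simulated dataset (the variable names and the simulation are hypothetical, and this is a sketch rather than the exact estimators used in any particular study): the design-based estimators aggregate site-level differences in means with person or site weights, the fixed effects regression uses `lm()`, and a multilevel model can be fit with the `lme4` package. These particular specifications correspond to the FE and RIRC models defined below.
+
+```{r, eval = FALSE}
+library(lme4)
+
+# Hypothetical multisite data: students nested in J sites, treatment assigned within site
+set.seed(2023)
+J   <- 20
+n_j <- sample(20:100, J, replace = TRUE)                     # site sizes
+dat <- do.call(rbind, lapply(1:J, function(j) {
+  data.frame(site = j,
+             T    = sample(rep(0:1, length.out = n_j[j])),   # half of each site treated
+             Y    = rnorm(n_j[j], mean = 0.1 * j))           # arbitrary control outcomes
+}))
+dat$Y    <- dat$Y + 0.3 * dat$T                              # add a treatment effect
+dat$site <- factor(dat$site)
+
+# 1) Design-based: difference in means within each site, then weight across sites
+site_est   <- tapply(dat$Y[dat$T == 1], dat$site[dat$T == 1], mean) -
+              tapply(dat$Y[dat$T == 0], dat$site[dat$T == 0], mean)
+ate_person <- weighted.mean(site_est, w = n_j)               # person-weighted (w_j proportional to N_j)
+ate_site   <- mean(site_est)                                 # site-weighted (equal w_j)
+
+# 2) Linear regression: site fixed effects with a common treatment coefficient (the FE model)
+fit_fe <- lm(Y ~ T + site, data = dat)
+
+# 3) Multilevel model: random intercepts and random treatment coefficients (the RIRC model)
+fit_rirc <- lmer(Y ~ T + (1 + T | site), data = dat)
+```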
+ +Let's examine a few popular linear regression and multilevel models in more detail. +Note that these models as presented do not include covariates, but covariates can easily be incorporated to increase power if the analyst is willing to increase bias by a small amount in exchange (often a very small amount if the experiment is large enough) [@lin2013agnostic]. + +## Linear regression model assumptions + +**Fixed effects with a constant treatment (FE)** + +With this model, the researcher assumes that there are site-specific fixed effects (intercepts), but a common overall ATE. +The assumed model is +\[ +Y_{ij} = \sum_{k = 1}^{J} \alpha_k \text{Site}_{k,ij} + \beta T_{ij} + e_{ij}, +\] +where $\text{Site}_{k,ij}$ is an indicator for unit $ij$ being in site $k$ (out of $J$ sites), $T_{ij}$ is a treatment indicator, and $e_{ij}$ is an $iid$ error term. +For more discussion, see @Raudenbush2015. + +**Fixed effects with interactions (FE-inter)** + +With this model, the researcher assumes site-specific heterogeneous treatment effects, so in addition to fitting a separate fixed *intercept* for each site, a separate treatment impact *coefficient* is estimated for each site. +\[ +Y_{ij} = \sum_{k = 1}^{J} \alpha_k \text{Site}_{k,ij} + +\sum_{k = 1}^{J} \beta_k \text{Site}_{k,ij} T_{ij} + e_{ij} +\] +Given the series of site-specific treatment impact estimates $\hat{\beta}_j$, these estimates are then averaged, weighting sites either equally (see @Clark2011) or by site size. + +## Multilevel model assumptions + +Once an analyst selects a multilevel model, they must decide, for both the site intercepts and the site impacts: what is considered random, and what is considered fixed? + +**Fixed intercept, random treatment coefficient (FIRC)** + +This model is similar to the fixed effects models above, but assumes that the site impact $\beta_j$ is drawn from a shared distribution. +\begin{align*} +\text{Level 1}\qquad & Y_{ij} = \sum_{k = 1}^{J} \alpha_k +\text{Site}_{k,ij} + \beta_j T_{ij} + e_{ij}\\ +\text{Level 2}\qquad & \beta_j = \beta + b_j +\end{align*} +See @Raudenbush2015 and @Bloom2017. + +**Random intercept, random treatment coefficient (RIRC)** + +This model further generalizes to assume that both the site intercept and site impact are drawn from shared distributions. +\begin{align*} +\text{Level 1}\qquad & Y_{ij} = A_j + \beta_j T_{ij} + e_{ij}\\ +\text{Level 2}\qquad & \beta_j = \beta + b_j\\ +& A_j = \alpha + a_j +\end{align*} + +**Random intercept, constant treatment coefficient (RICC)** + +Finally, this model assumes that the site intercepts are drawn from a shared distribution, but the treatment impact is common across sites. +\begin{align*} +\text{Level 1}\qquad & Y_{ij} = A_j + \beta T_{ij} + e_{ij}\\ +\text{Level 2}\qquad & A_j = \alpha + a_j\\ +\end{align*} +As noted previously, the multilevel framework generally corresponds to the super population perspective. +However, for RICC models, the site *impacts* are not assumed to be drawn from a super population; only the site *intercepts* are assumed to be random. +Thus, when it comes to estimating treatment impacts, RICC models actually take a finite population perspective. + +There are also weighted versions of both traditional regressions and multilevel models. +For example, a fixed-effects model can weight each person by their inverse chance of treatment to help increase precision.
+Weighted regression for traditional regression is discussed in @Miratrix2020, and weighted regression for multilevel models is discussed in @Raudenbush2020. + + +# Some estimators attempt to reduce variance by increasing bias. + +Each category of estimator (design, regression, and multilevel) results in a different estimation approach. +One way to characterize the categories is by the weights induced by the choice of estimator. +The properties of each estimator also result in different consequences for bias and variance. +Design-based estimators are unbiased, but may not always afford the most precise estimates. +In general, model-based estimators accept some bias in exchange for reduced variance. +Thus, they can sometimes have a lower mean squared error than design-based estimators. +One way that model-based estimators increase precision is through the easy incorporation of covariates. +Although design-based estimators can also incorporate covariates, it is not always as straightforward. +Covariate adjustment methods for design-based estimators end up being equivalent to a weighted regression approach. + + +## Design-based estimators + +Design-based estimators are the most straightforward, as they are composed of simple weighted combinations of means. +First, the site-specific treatment impact estimates $\hat{B_j}$ are calculated by taking differences in means between the active treatment and control treatment groups for each site. +Then, the overall estimate is a weighted combination of these estimates, using either person or site weights. + +The design-based estimators are +\begin{align*} +\hat{\beta}_{DB-persons} &= \sum_{j = 1}^{J} \frac{N_j}{N} \hat{B_j} \\ +\hat{\beta}_{DB-sites} &= \sum_{j = 1}^{J} \frac{1}{J} \hat{B_j} +\end{align*} +Design-based estimators are generally *unbiased* for their corresponding estimands (person-weighted or site-weighted). +Unbiasedness does not hold for one super population model; see @Pashley2022 for more details. + + +## Linear regression estimators + +Consider the FE model (fixed effects with a constant treatment). +This regression model results in a *precision-weighted* estimate, in which each site impact is weighted by the precision with which that site's impact is estimated. +The estimator is +\[ +\hat{\beta}_{FE} = \sum_{j = 1}^{J} \frac{N_j p_j (1 - p_j)}{Z} \hat{B_j}, +\] +where $p_j$ is the proportion treated at site $j$. +The quantity $Z$ is a normalizing constant: $Z = \sum_{j = 1}^{J} N_j p_j (1-p_j)$ to ensure the weights sum to one. +In this model, the weights include $p_j$, which carries information about the precision of the estimate for that site: +$N_j p_j (1 - p_j)$ is proportional to the inverse of $Var(\hat{\beta_j})$. +This expression shows that sites with larger $N_j$, or with $p_j$ closer to $0.5$, have larger weights. + +The FE estimator is not generally unbiased for either person-weighted or site-weighted estimands. +If the impact size $B_j$ is related to the weights ($N_j p_j (1 - p_j)$), then the estimator could be biased. +For example, if sites that treat a higher proportion of their units also have larger impacts, then $B_j$ can be related to $p_j (1- p_j)$. +This setting is plausible if, for example, sites with more resources to intervene on more students also implement the intervention more effectively. +If larger sites are more effective, then $B_j$ can be related to $N_j p_j (1- p_j)$. + +Instead, the FE estimator is unbiased for an estimand that weights the site impacts by $N_j p_j (1- p_j)$.
+However, this estimand does not have a natural substantive interpretation. +Although the FE estimator is generally biased for the estimands of interest, it may have increased precision and thus a lower mean squared error. + +In contrast, the FE-inter model ends up with weights identical to those of the design-based estimators, depending on whether the estimated site impacts are weighted equally or by site size. + +## Multilevel model estimators + +Multilevel models also result in precision weighting, but in these models the estimated precision also takes into account the assumed underlying variance in site impacts. +For example, consider the FIRC model: +\[ +\hat{\beta}_{ML-FIRC*} = \sum_{j = 1}^{J} \frac{1}{Z} +\left(\frac{\sigma^2}{N_j p_j ( 1 - p_j)} + \tau^2\right)^{-1} \hat{B_j}, +\] +where $Z$ is again a normalizing constant, $Z = \sum_{j = 1}^{J} \left(\frac{\sigma^2}{N_j p_j ( 1 - p_j)} + \tau^2\right)^{-1}$. +This equation assumes that $b_j$ has known variance $\tau^2$, and $e_{ij}$ has known variance $\sigma^2$. +In general, we do not know these quantities, and instead must estimate them. +However, we can see that the implied precision weights incorporate the additional uncertainty assumed in the value of $b_j$. + +The RIRC model imposes the same structure on the site impacts, and thus the weights are similar to the FIRC model. +The RICC model assumes a constant treatment impact, and thus is essentially equivalent to the fixed effects with constant treatment model (FE) when it comes to estimating the treatment impact. + +We summarize the weights below. + +| Weight name | Weight | Estimators | +| ----- | ----- | ----- | +| Unbiased person-weighting | $w_j \propto N_j$ | $\hat{\beta}_{DB-FP-person}$, $\hat{\beta}_{DB-SP-person}$, $\hat{\beta}_{FE-weight-person}$, $\hat{\beta}_{FE-inter-person}$ | +| Fixed-effect precision-weighting | $w_j \propto N_j p_j (1 - p_j)$ | $\hat{\beta}_{FE}$, $\hat{\beta}_{FE-HW}$, $\hat{\beta}_{FE-CR}$, $\hat{\beta}_{ML-RICC}$ (approximately) | +| Random-effect precision-weighting | $w_j \propto \left[\hat{\tau}^2 + \hat{\sigma}^2 / (N_j p_j (1 - p_j))\right]^{-1}$ (approximately) | $\hat{\beta}_{ML-FIRC}$, $\hat{\beta}_{ML-RIRC}$ | +| Unbiased site-weighting | $w_j \propto 1$ | $\hat{\beta}_{DB-FP-site}$, $\hat{\beta}_{DB-SP-site}$, $\hat{\beta}_{FE-weight-site}$, $\hat{\beta}_{FE-inter-site}$ | + +# For each estimator that produces a point estimate, there may be multiple options for estimating standard errors. + +The difference between the finite population and super population frameworks comes into focus when calculating the standard error of various estimators. +In general, the super population framework results in larger estimates of error because of the additional uncertainty induced by assuming the sites observed are randomly drawn from a larger population. +In general, variation can be characterized by either *within-site* variation or *between-site* variation. +In the finite population framework, estimators calculate variation *within* sites, and then average this variation across sites. +In the super population framework, estimators look at the variation *between* sites to "capture both any within-site estimation error along with the uncertainty associated with sampling sites from a larger population" (@Miratrix2020). +For both approaches, modeling assumptions can stabilize uncertainty estimation procedures, but also risk inducing bias if the modeling assumptions are wrong.
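+
+As a rough illustration of this within- versus between-site distinction, the following sketch (continuing the hypothetical simulated example from above, and using simplified formulas that convey the intuition rather than the exact estimators referenced below) contrasts the two approaches for the site-weighted design-based estimator.
+
+```{r, eval = FALSE}
+# Finite population idea: estimate the sampling variance of each site's
+# difference in means (s^2_T / n_T + s^2_C / n_C), then combine across sites
+# using the (equal) site weights.
+within_var <- sapply(split(dat, dat$site), function(d) {
+  var(d$Y[d$T == 1]) / sum(d$T == 1) + var(d$Y[d$T == 0]) / sum(d$T == 0)
+})
+se_fp_site <- sqrt(sum(within_var) / J^2)
+
+# Super population idea: the spread of the site-level estimates themselves
+# reflects both within-site estimation error and site-to-site impact variation.
+se_sp_site <- sqrt(var(site_est) / J)
+```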
+ +For design-based estimators in the finite population framework, Neyman developed a conservative estimator for the standard error using the observed outcomes. +First, within-site uncertainty is estimated, and then these estimates are averaged with weights according to the target estimand. +The super population framework induces more complicated expressions that take into account the additional population variance. +The details of standard errors for super population design-based estimators are beyond the scope of this guide. + +For linear regression estimators, the traditional way to calculate standard errors is using classical regression theory. +We term this a model-based standard error approach, as these standard errors rely on the assumed model of $iid$ errors. +Alternatively, heteroscedasticity-robust standard errors (Huber-White) or cluster robust standard errors relax this $iid$ +assumption (see @Weiss2019 and @Richburg-Hayes2008). +Robust standard errors fall into a design-based approach instead of a model-based approach [@lin2013agnostic; Chapter 3 of @gerber2012field]. +Huber-White standard errors correspond to the finite population framework, while the asymptotic theory justifying traditional cluster robust standard errors corresponds to the super population framework with regard to the clusters. In a [cluster-randomized trial](https://egap.org/resource/10-things-to-know-about-cluster-randomization/), treatment is assigned to clusters, so there is also a finite-population-of-clusters perspective on cluster robust standard errors that is approximated in what are commonly known as CR2 standard errors [@Pustejovsky2018]. + +To briefly summarize this correspondence, first consider the motivation behind robust standard error estimators. +In the FE model, treatment effects are assumed to be constant across sites. +Thus, if there is truly treatment effect heterogeneity, units in different sites will have different amounts of variation, and this variation will be incorporated into the error term. +The assumption of $iid$ errors will be broken. +Huber-White standard errors allow for heteroscedasticity in the residuals while still assuming that sites are fixed, +which fits into a finite population framework: in fact @lin2013agnostic shows that the standard error derived by @Neyman1923 on finite-population and design-based principles is the same as the HC2 standard error. + +In contrast, for cluster robust standard errors, "the conventional adjustments, often implicitly, assume that the clusters in the sample are only a small fraction of the clusters in the population of interest" (@Abadie2017). +Using cluster robust standard errors accounts for both correlation of individuals within sites, and different amounts of variation across sites. +This strategy generally results in larger standard errors. +For more discussion of cluster robust standard errors, see @Abadie2017 and @Pustejovsky2018. + +Finally, the details of standard error estimation for multilevel modeling are outside the scope of this guide. +Generally, maximum likelihood theory is applied, which "requires a complete model for both the random effects and the residual variances" (@Miratrix2020). +FIRC and RIRC models naturally produce standard errors under the super population framework, while RICC essentially takes a +finite population framework because the treatment impacts are not assumed to be drawn from a super population, as they are assumed to be constant across sites.
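+
+To close this section, here is a sketch of how the regression-based standard error options above might be computed in R, again using the hypothetical simulated example; the `estimatr` and `clubSandwich` packages are one convenient option among several, and this is an illustration rather than a recommended analysis.
+
+```{r, eval = FALSE}
+library(estimatr)
+library(clubSandwich)
+
+# Classical (iid, model-based) standard errors from the fixed effects fit
+summary(fit_fe)$coefficients["T", ]
+
+# Huber-White (HC2) standard errors: heteroscedasticity-robust,
+# corresponding to a finite population perspective
+lm_robust(Y ~ T + site, data = dat, se_type = "HC2")
+
+# Cluster-robust (CR2) standard errors, clustering on site,
+# with the small-sample adjustment of Pustejovsky and Tipton (2018)
+coef_test(fit_fe, vcov = "CR2", cluster = dat$site)
+```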
+ +# The analyst's choices of estimand, estimator, and standard error estimator matter in some cases, and matter less in others. + +After discussing the different choices a researcher can make in analyzing a multisite trial, a big question remains: how do these choices impact empirical results? +Which of these choices have a substantial impact on the conclusion we reach, and which do not matter as much? +@Miratrix2020 conducted an empirical study to investigate these questions using 12 large multisite trials. + +## Point estimates + +First, they consider the impact on point estimates. +The authors ask, "to what extent can the choice of estimator of the overall average treatment effect result in a different impact estimate?" +In general, the authors find that the choice of estimator can substantially impact the point estimates, although the degree of impact depends on the choice. +The authors reach the following conclusions. + +**Person-weighted estimands can result in a different conclusion than site-weighted estimands.** + +In some trials, estimates resulting from person-weighted estimands differed substantially from estimates resulting from site-weighted estimands. +These discrepancies could be due to a difference in the true underlying values of the estimands, but they could also be due to estimation error. +Through empirical exploration, they found that the difference is likely due to the estimands themselves being different. +They found that "the range of estimates across all estimators is rarely meaningfully larger than the range between the person- and site-weighted estimates alone." + +**For person-weighted estimands, the choice of estimator generally does not matter.** + +The unbiased design-based estimator and the precision-weighted fixed effect estimator both target the person-weighted estimand. +There was little difference in estimates between these estimators. +Most likely, "this implies that the potential bias in the bias-precision tradeoff to the fixed effect estimators is negligible in practice." +Other [authors](https://egap.org/resource/sd-block-rand/){target="_blank"} have been able to create situations in which the bias-precision tradeoff is more severe. + +**For site-weighted estimands, the choice of estimator can matter.** + +FIRC estimates did differ from the unbiased design-based site estimator. +FIRC can be seen as an adaptive estimator: when there is little observed variation between sites, it tends to be more similar to the person-weighted estimate than to the site-weighted estimate. + +**Different estimators have different bias-variance tradeoffs.** + +Finally, the authors consider the empirical bias-variance tradeoff of different estimators, and find: + +- FE estimators have little bias, but also do not improve precision much over design-based estimators. +- To further investigate, they conducted a simulation study and found: + - FIRC tends to have lower mean squared error than design-based estimators. + - Larger site impact heterogeneity results in more biased estimates for FIRC. + - Even with more site impact heterogeneity, the mean squared error for FIRC estimators is still generally lower. + - Coverage for design-based estimators is more reliable, especially when site size is variable and site size is correlated with impact. + +## Standard errors + +The second question concerns the choice of standard error estimators.
+The authors ask, "to what extent can the choice of estimator of the standard error of the overall average treatment effect result in a different estimated standard error?" + +As with the point estimates, the choice of standard error estimator can substantially impact the estimated standard error. +The authors reach the following conclusions. + +**The choice of estimand impacts the standard error.** + +Super population estimators generally have larger standard errors than finite population estimators. +Site-weighted estimators generally have larger standard errors than person-weighted estimators. + +**Given a particular estimand, the choice of estimator matters in some contexts and not others.** + +For finite population estimands (including both person and site-weighted estimands) or super population person-weighted estimands, the choice of standard error estimator generally does not matter. +@Miratrix2020 found that estimators that attempt to improve precision by trading bias may not actually result in gains in precision in practice. +The use of robust standard errors also does not differ much from non-robust standard errors in practice. + +For super population site-weighted estimands, the choice of standard error estimator can matter a lot. +In most cases, standard error estimates differed substantially between the design-based super population estimator and FIRC. +The authors further conclude that for super population site-weighted estimands, the wide-ranging standard error estimates stem from instability in estimation. +Through a simulation study, they find that super population standard errors can underestimate the true error. +The design-based super population standard error estimator is particularly prone to underestimate the standard error compared to multilevel models, and can be unstable, in that it estimates a wide range of different values across simulations. + +# The choice of estimator impacts power. + +Given the discussion thus far, it is not surprising that modeling choices made by the analyst also impact power. + +First, we define an important quantity in power calculations: the intraclass correlation coefficient (ICC). +Broadly, variation can be categorized into *within*-site variation, and *between*-site variation. +Blocking helps account for between-site variation. +In educational trials, the ICC is the proportion of variance in the outcome that lies *between* sites (@Schochet2016). +The ICC is defined as the ratio of the variance at the site level to the overall variance of the individual outcomes. +This quantity plays a different role in block-randomized trial power analysis depending on the target of inference chosen by the analyst. +ICC is also used in the design and analysis of cluster-randomized trials. + +We consider two different estimators and how they impact power. +First, consider a version of the FE model that has been expanded to include level 1 (student) covariates. +The standard error for the ATE estimator is + +\[ +SE = \sqrt{\frac{(1-\text{ICC})(1-R^2_{1})}{\bar{T}(1 - \bar{T}) J \bar{n}}}, +\] + +where $ICC$ is the intraclass correlation, $R_1^2$ is the proportion of variation explained by level 1 (student) covariates, $\bar{T}$ is the average proportion of units treated per site, $J$ is the number of sites, and $\bar{n}$ is the average number of students per site. +For more information about this standard error expression, see the technical appendix of @Hunter2022. + +In contrast, consider the RIRC model.
+The standard error for the ATE estimator is +\[ +SE = \sqrt{\frac{\text{ICC} \omega}{J} + \frac{(1-\text{ICC})(1-R^2_{1})}{\bar{T}(1 - \bar{T}) J \bar{n}}}, +\] +where $\omega$ is the ratio of the impact variation to the control outcome variation. +We can see that in doing super population inference, the standard error has an additional term which is nonnegative, so it will be at least as large as the standard error from finite population inference. +A larger standard error will result in lower power. + +Examining these standard error formulae also gives a better understanding of what factors impact power. +For example, student-level covariates with more explanatory power (higher $R_1^2$) decrease the standard error, and thus increase power. +Additionally, the individual-level covariates do not impact the super population term; they only help to reduce the component of standard error corresponding to the finite population. +However, site-level covariates, which would be denoted $R_2^2$, do not impact power. +Site-level covariates are not useful in these models because there are already site-level effects, so the addition of covariates at that level does not provide more information. + +To calculate power for multisite trials, users can use the PowerUpR package (@Dong2013). +The package also calculates sample size requirements and minimum detectable effect size. +The newly developed PUMP package (@Hunter2022) extends the functionality of PowerUpR to experiments with multiple outcomes, in addition to providing user-friendly tools for exploring the sensitivity of power to different assumptions. + + +# Takeaway advice for researchers on multisite trials + +Many research plans and analyses do not clearly specify an estimand. +This lack of clarity can both obscure the goal and result in poor analysis choices. +For example, different estimands imply different power analyses, but the choice is often not taken into account; super population estimands in particular result in larger standard errors, and thus often require larger sample sizes to be adequately powered. +Additionally, different estimands require different estimators, so not defining an estimand makes it difficult for readers to judge the validity of the analysis. +@Miratrix2020 shows that the choice of estimand, estimator, and standard error estimator can matter (albeit in some cases more than others), in that the choice can impact the final conclusion reached by a study. + +This guide did not focus on the problem of model misspecification. +For the empirical estimates from the multisite trials considered, model-based and design-based approaches did not result in substantially different answers. +However, it is conceivable that there are contexts in which these estimators could differ, and further investigation into this area is warranted. + +Though this guide set up analyzing an RCT as a series of dichotomous choices, one way forward is for researchers to report more than one estimand. +Presenting a finite population person-weighted estimand is almost always compelling. +Then, the researcher may choose to also present a site-weighted estimand, or to expand their conclusion to a super population estimand. +In some cases, different estimands may result in the same conclusion.
+However, it is possible that for some studies, there is evidence of a significant effect in the finite population, but the additional uncertainty of the super population estimation means there is insufficient evidence concerning the impact in a broader population. +In these cases, only reporting one of the finite population or super population estimands does not portray the full nuance of the results. + + +# References
+ + + + + + +
+ +
+ + + +
+

1 Introduction

+
+

1.1 What is a multisite or block randomized trial?

+

A multisite or block-randomized trial is a randomized experiment “in which sample members are randomly assigned to a program or a control group within each of a number of sites” (S. W. Raudenbush and Bloom (2015)).

+

For illustration, this guide will focus on multisite educational trials, although multisite trials are not unique to education. +Multisite trials are a subset of multilevel randomized controlled trials (RCTs), in which units are nested within hierarchical structures, such as students nested within schools nested within districts. +This guide uses as an illustrative example the case where each site is a school, although they could also be districts or classrooms; thus the term “site” and “school” are used interchangeably.

+

An advantage of multisite trials is that they allow a researcher to study average impact across +units or sites, while also getting a sense of heterogeneity across sites (S. W. Raudenbush and Bloom (2015)). +However, the opportunities provided by multisite trials also come with their own challenges. +Much of the rest of this guide will discuss the choices that researchers must make when analyzing multisite trials, and the consequences of these choices.

+
+
+

1.2 Preliminaries: estimands, estimators, and estimates

+

Before diving in, let’s introduce the definitions of estimand, estimator, and estimate. +These concepts are sometimes conflated, but disentangling them increases clarity and understanding. +The main distinction is that the estimand is the goal, while the estimator is the analysis we do in order to reach that goal.

+

An estimand is an unobserved quantity of interest about which the researcher wishes to learn. +In this guide, the only type of estimand considered is the overall average treatment effect (ATE). +Other options include focusing on treatment effect for only a subgroup, or calculating a different summary, such as an odds ratio. +After choosing an estimand, the researcher chooses an estimator, which is a method used to calculate the final estimate which should tell the researcher something about the estimand. +Finally, the researcher must also choose a standard error estimator if she wants to summarize how the estimates might vary if the research design or underlying data generating process were repeated.

+

First, to provide context, let’s consider an example. +The researcher decides their estimand will be the average treatment effect for the pool of subjects in the experiment. +In this example, the researchers observe all of the subjects for whom they want to estimate an effect. +As with any causal analysis, the researchers do not observe the control outcomes of the subjects assigned to the active treatment, or the treated outcomes of the subjects assigned to the control treatment. +Thus, causal inference is sometimes referenced as a missing data problem, because it is impossible to observe both potential outcomes (the potential outcome given active treatment and the potential outcome given control treatment). +See 10 Things to Know About Causal Inference +and 10 Types of Treatment Effect You Should Know About +for a discussion of other common estimands.

+

Given an estimand, the researchers choose their estimator to be the coefficient from an OLS regression of the observed outcome on site-specific fixed effects and the treatment indicator. +To calculate standard errors, they use Huber-White robust standard errors. +All these choices result in a point estimate (e.g. the program increased reading scores by \(5\) points) and a measure of uncertainty (e.g. a standard error of \(2\) points).

+

Next, we’ll also need some notation. +This guide follows the Neyman-Rubin potential outcomes notation (Splawa-Neyman, Dabrowska, and Speed (1923/1990), Imbens and Rubin (2015)). +The outcomes are \(Y_{ij}\) for unit \(i\) in site \(j\). +The potential outcomes are \(Y_{ij}(1)\): the outcome given active treatment, and \(Y_{ij}(0)\): the outcome given control treatment. +The quantity \(B_{ij}\) is the unit-level intention-to-treat effect (ITT) \(B_{ij} = Y_{ij}(1) - Y_{ij}(0)\). +If there is no noncompliance, the ITT is the ATE, as defined above. +Then \(B_j\) is the average impact at site \(j\), \(B_j = 1/N_j \sum_{i = 1}^{N_j} B_{ij}\) where \(N_j\) is the number of units at site \(j\). +Finally, \(N = \sum_{j = 1}^{J} N_j\).

+

This guide is structured around the choices an analyst must make concerning estimand and estimators, and the resulting consequences. +The choice of estimand impacts the substantive conclusion that a researcher makes. +The choice of estimator and standard error estimator results in different statistical properties, including a potential tradeoff between bias and variance. +This guide summarizes material using the framework provided by Miratrix, Weiss, and Henderson (2021).

+
+
+
+

2 A multisite trial is a type of a blocked or stratified randomized experiment.

+
+

2.1 A multisite trial is fundamentally a blocked or stratified RCT.

+

A multisite trial is a blocked RCT with 2 levels: randomization occurs at the student level (level 1) within blocks defined by sites/schools (level 2). +For example, in a study of a new online math tool for high school students, randomization occurs at the student level within blocks defined by sites/schools. +Perhaps half of students at each school are assigned to the status quo / control treatment (no additional math practice), +and half are assigned to theactive treatment (an offer of additional math practice at home using an online tool).

+

Because of the direct correspondence between multisite trials and blocked experiments, statistical properties of blocked experiments also translate directly to multisite experiments. +The main difference between a traditional blocked RCT and a multisite experiment is that in many blocked RCTs, the researcher is able to choose the blocks. +For example, in a clinical trial, a researcher may decide to block based on gender or specific age categories. +Blocking can help increase statistical power overall or ensure statistical power to assess effects within subgroups (such as those defined by time of entering the study, or defined by other important covariates that might predict the outcome) (Moore 2012; Moore and Moore 2013; Bowers 2011). +Pashley and Miratrix (2021) makes the distinction between fixed blocks, where the number and covariate distribution of blocks is chosen by the researcher, and structural blocks, where natural groupings determine the number of blocks and their covariate distributions. +Multisite experiments have structural blocks, such as districts, schools, or classrooms. +The type of block can impact variance estimation, as shown in Pashley and Miratrix (2021) and Pashley and Miratrix (2022).

+

The EGAP Metaketa Projects +are also multisite trials: the 5–7 countries that contain sites for each study are fixed and chosen in advance by the different research teams.

+
+
+

2.2 A multisite trial is not a cluster-randomized trial

+

A different type of RCT is a cluster-randomized +design +, +in which entire schools are assigned to either the active treatment or control +treatment. +This video explains the difference between cluster and +block-randomized designs. +In a multisite trial trial, treatment is assigned within a block to individual units. +In a cluster-randomized trial, treatment is assigned to groups of units. +Some designs combine cluster- and +block-randomization +.

+

Another design that is not a multisite or block-randomized trial is an experiment that takes place in only one school and assigns individual students to active treatment and control treatment: this study has only one site and thus differences +between sites do not matter in this design.

+
+
+

2.3 Why choose a multisite or block-randomized trial design?

+

In most contexts, blocking reduces estimation error over an unblocked (completely randomized) experiment (Moore 2012; Gerber and Green 2012). +Thus, blocked experiments generally offer higher statistical power than unblocked experiments. +Blocking is most helpful in increasing precision and statistical power in the setting where there is variation in the outcome, and where the blocks are related to this variation.

+

In multisite trials as compared to block-randomized trials, the researcher cannot purposely construct blocks to reduce variation, because they are defined by pre-existing sites. +However, the researcher can hope, and often expect, that sites naturally explain some between-site variation. +For example, if some schools tend to have larger impacts than others, and the size of the impact is related to the average income of families attending that school, then blocked randomization using the school as a block improves efficiency over complete randomization.

+

Randomizing with purposefully created blocks or pre-existing sites also helps analysts learn about how treatment effects may vary across the sites or groups of people categorized into the blocks. +If a new treatment should help the lowest performing students most, but in any given study most students are not the lowest performing, then researchers may prefer to create blocks of students within schools with the students divided by their previous performance. +This blocking within site would allow comparisons of the treatment effects on the relatively rare lowest performing students with the treatment effects on the relatively rare highest performing students.

+
+
+

2.4 Why not block?

+

Often, in a multisite trial with treatment administered by site administrators (like principals of schools), an analyst has no choice but to randomize within site. +In other studies, the construction and choice of blocking criteria is a choice. +Pashley and Miratrix (2022) shows that blocking is generally beneficial, but also explores settings in which it may be harmful. +Blocking does result in fewer degrees of freedom, but in practice this reduction is rarely an issue, unless an experiment is very small (Imai, King, and Stuart 2008). +Any use of blocking requires that an analyst keep track of the blocks and also that an analyst reflect the blocks in subsequent analysis: in many circumstances estimating average treatment effects from a block-randomized experiment while ignoring the blocks will yield biased estimates of the underlying targeted estimands (See “The trouble with ‘controlling for blocks’” and “Estimating Average Treatment Effects in Block Randomized Experiments” for demostrations of bias arising from different approaches to weighting by blocks.)

+
+
+
+

3 Analysis can either target the population in the experiment, or a broader population.

+

The first choice a researcher must make in defining their estimand is the population of interest. +The researcher may want to focus on the finite population: only those units in the experimental pool or sample. +Alternatively, they can expand their estimand to consider the super population. +A super population framework considers the units in the experiment to be a sample from a broader, unobserved population, and targets the impact in this larger population.

+

A researcher might be interested in a finite population framework if most or all of the population is included in the study. For example, a state-level policymaker considering results from a statewide trial may only be interested in the impact on schools in their own state. Similarly, if an organization is evaluating itself and includes all of its own sites, it would use a finite population framework. Another common case for a finite population framework is a proof-of-concept or pilot study. A researcher may be running a small study to test whether an intervention is worth exploring in a larger trial. They may even have specifically selected a set of units assumed to be a worst-case scenario, to see whether there is still a measurable impact in such a group. Finally, many field experiments use a finite population framework out of necessity: the units and sites available for study may not arrive via any known or replicable sampling process, a situation sometimes called a “convenience sample.”

+

A super population framework is of interest when a researcher plans to report estimates of the effect on units not included in the given study. For many trials, the end goal is not to study the units at hand, but rather to provide predictions of the likely impacts if the intervention were expanded. For example, a state-level policymaker with access to a trial performed on only a subset of schools in their state might prefer a super population framework. However, one challenge of the super population framework is that it assumes that sites are randomly sampled from the broader population of interest. As noted above, sites are often selected based on availability rather than through a random sampling approach. Thus, when we take a super population framework even though sites are not randomly sampled, the population we are making inference about becomes fuzzy. We may not be able to generalize to the whole population of interest, but instead can only generalize to a broader population of units that could feasibly have been included in the study.

+

One of the main consequences of the choice of population framework is the amount of uncertainty in the final estimates. This topic will be discussed in more detail later in the guide. When accounting for sites randomly sampled via a known sampling process from a super population, we naturally have an additional source of uncertainty deriving from which units were selected for the study at hand: randomization to treatment is one source of randomness, and sampling from the population is another. Although the point estimates from either perspective will often be the same, intervals will generally be wider under the super population framework.

+

For more discussion of the consequences of the super population and finite population frameworks, see Schochet (2016) and Pashley and Miratrix (2021).

+
+
+

4 The average site effect is not the same as the average person effect.

+

The second choice a researcher makes is the target of inference: is the researcher interested in the average student, or in the average site (Miratrix, Weiss, and Henderson 2021)?

+

When we consider the average student impact, we weight each student equally. Thus, larger sites have more influence on the final estimate. For example, if one very large site is an outlier, the impact at that site will heavily drive the final results. Taking this approach makes sense from a utilitarian perspective, i.e., if the benefit of the intervention is equal to the total sum of benefits across all people. Average student impact might be of interest to a high-level policymaker, such as a state official. The average student impact is
\[
\frac{1}{N} \sum_{j = 1}^{J} \sum_{i = 1}^{N_j} B_{ij} = \sum_{j = 1}^{J} \frac{N_j}{N} B_j.
\]

+

When we consider the average site impact, we weight each site equally. Thus, larger sites receive the same weight as smaller sites. A site-level decision maker, such as a school principal, might be more interested in the average site impact, so that site size does not influence the final answer. The average site impact is
\[
\frac{1}{J} \sum_{j = 1}^{J} B_j.
\]
Note that in the case where all sites are of the same size, or all sites have the same impact, these two estimands coincide.
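A tiny numerical sketch in R, with made-up site sizes and impacts, shows how far apart the two estimands can be when site size is related to impact size:

```r
# Hypothetical site sizes and true site-average impacts.
N_j <- c(1000, 200, 100)
B_j <- c(2, 8, 10)

# Person-weighted estimand: every student counts equally, so big sites dominate.
person_avg <- sum((N_j / sum(N_j)) * B_j)

# Site-weighted estimand: every site counts equally.
site_avg <- mean(B_j)

c(person = person_avg, site = site_avg)  # roughly 3.5 versus 6.7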

+

To summarize the previous two sections, there have been two axes of choices: the population of interest (FP or SP for finite and super population), and the target of inference (persons or sites). +These choices result in four possible estimands: FP-persons, SP-persons, FP-sites, and SP-sites.

+
+
+

5 There are many widely-used estimators that target the same estimands, including design-based, linear regression, and multilevel models.

+

After choosing an estimand, the researcher must then choose an estimator, a process to arrive at the estimate of interest. +There are three main categories of estimators: design based, linear regression, and multilevel modeling. +Linear regression and multilevel modeling are both model-based approaches to statistical inference.1 +In model-based approaches, the researcher estimates the parameter in a likelihood function that is chosen to represent the natural stochastic process that generates the outcomes in the study. +See Rubin (1990) for more discussion of the differences between design- and model-based approaches to statistical inference.

+

The different categories of estimator differ both philosophically and practically. +Each category assumes a different source of randomness, and thus has a different statistical justification.

+

Design-based estimators specifically target the four estimands outlined above. +The only source of uncertainty is assumed to be the treatment assignment: which units happened to be assigned to the active treatment, and which happened to be assigned to the control treatment. +This assumption is the reason for their name; the uncertainty in the estimates is by design, from the purposeful randomization of units. +Using design-based estimators is also sometimes called Neymanian inference, as the estimators and properties were first introduced by Neyman (Splawa-Neyman, Dabrowska, and Speed (1923/1990)).

+

Linear regression estimators are the most familiar to many researchers. With these estimators, the observed outcomes are assumed to be a linear function of the treatment assignment, (optionally) site-specific effects, (optionally) covariates, and random error. In standard regression theory, the only source of randomness is the error term. The covariates, which in the case of RCTs include the treatment indicator, are considered fixed. This assumption is in direct contrast to the design-based framework, in which the treatment assignment is considered random. In econometric theory, the randomness in the error term in regression models is sometimes viewed as deriving from sampling from a larger population.

+

Multilevel model estimators are a generalization of linear regression. +When assuming fixed effects, as in a standard regression model, each site’s parameter is considered to be fixed and independent. +When assuming random effects, as in a multilevel model, each site’s parameter is assumed to be drawn from a shared distribution of site impacts. +Most standard statistical software assumes a Normal distribution to model the site-specific impacts. +Multilevel models can incorporate both random site-level intercepts, and random site-level coefficients (in our cases, these are site-specific treatment impacts). +Now, uncertainty stems both from the individual-level random error term, and from the additional uncertainty of site-level parameters being considered random. +In general, multilevel models naturally lend themselves to a super population framework, because they already incorporate the assumption that sites are being randomly drawn from a broader, unobserved population. +Multilevel models are also called mixed effects models or mixed models, where a mixed model has a combination of fixed and random effects. +For a more comprehensive look at multilevel models, see S. W. Raudenbush and Bloom (2015).

+

Let’s examine a few popular models among linear regression and multilevel models in more detail. +Note that these models as presented do not include covariates, but covariates can easily be incorporated to increase power if the analyst is willing to increase bias by a small amount in exchange (often a very small amount if the experiment is large enough) (Lin 2013).

+
+

5.1 Linear regression model assumptions

+

Fixed effects with a constant treatment (FE)

+

With this model, the researcher assumes that there are site-specific fixed effects (intercepts), but a common overall ATE. The assumed model is
\[
Y_{ij} = \sum_{k = 1}^{J} \alpha_k \text{Site}_{k,ij} + \beta T_{ij} + e_{ij},
\]
where \(\text{Site}_{k,ij}\) is an indicator for unit \(ij\) being in site \(k\) (out of \(J\) sites), \(T_{ij}\) is a treatment indicator, and \(e_{ij}\) is an \(iid\) error term. For more discussion, see S. W. Raudenbush and Bloom (2015).

+

Fixed effects with interactions (FE-inter)

+

With this model, the researcher assumes site-specific heterogeneous treatment effects, so in addition to fitting a separate fixed effect for each site's intercept, a separate treatment impact coefficient is estimated for each site:
\[
Y_{ij} = \sum_{k = 1}^{J} \alpha_k \text{Site}_{k,ij} + \sum_{k = 1}^{J} \beta_k \text{Site}_{k,ij} T_{ij} + e_{ij}.
\]
Given a series of site-specific treatment estimates \(\hat{\beta}_j\), these estimates are then averaged, using either equal (simple) weights (see Clark and Silverberg 2011) or weights proportional to site size.
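As a rough illustration, the following R sketch simulates toy multisite data (all parameter values are made up) and fits both specifications with ordinary least squares; the FE-inter site coefficients are then averaged with site or person weights.

```r
# Toy multisite data: 20 sites of varying size, half of each site treated.
set.seed(123); J <- 20
n_j   <- sample(seq(20, 100, by = 20), J, replace = TRUE)
dat   <- data.frame(site = factor(rep(1:J, times = n_j)))
dat$T <- unlist(lapply(n_j, function(n) sample(rep(0:1, n / 2))))
dat$Y <- rnorm(J, sd = 0.4)[dat$site] +                  # site intercepts
         (0.2 + rnorm(J, sd = 0.1)[dat$site]) * dat$T +  # site-specific impacts
         rnorm(nrow(dat))                                # student-level noise

# FE: site fixed effects plus a single common treatment coefficient.
fe <- lm(Y ~ 0 + site + T, data = dat)
coef(fe)["T"]

# FE-inter: site fixed effects plus a separate treatment coefficient for each site.
fe_inter <- lm(Y ~ 0 + site + site:T, data = dat)
beta_j   <- coef(fe_inter)[grep(":T$", names(coef(fe_inter)))]

# Average the site-specific estimates with equal (site) or size-based (person) weights.
N_j <- as.numeric(table(dat$site))
c(site_weighted   = mean(beta_j),
  person_weighted = weighted.mean(beta_j, w = N_j))
```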

+
+
+

5.2 Multilevel model assumptions

+

Once an analyst selects a multilevel model, they must decide, for both the site intercepts and the site impacts: what is considered random, and what is considered fixed?

+

Fixed intercept, random treatment coefficient (FIRC)

+

This model is similar to the fixed effects models above, but assumes that the site impact \(\beta_j\) is drawn from a shared distribution.
\[\begin{align*}
\text{Level 1}\qquad & Y_{ij} = \sum_{k = 1}^{J} \alpha_k \text{Site}_{k,ij} + \beta_j T_{ij} + e_{ij}\\
\text{Level 2}\qquad & \beta_j = \beta + b_j
\end{align*}\]
See S. W. Raudenbush and Bloom (2015) and Bloom and Porter (2017).

+

Random intercept, random treatment coefficient (RIRC)

+

This model further generalizes to assume that both the site intercept and the site impact are drawn from shared distributions.
\[\begin{align*}
\text{Level 1}\qquad & Y_{ij} = A_j + \beta_j T_{ij} + e_{ij}\\
\text{Level 2}\qquad & \beta_j = \beta + b_j\\
& A_j = \alpha + a_j
\end{align*}\]

+

Random intercept, constant treatment coefficient (RICC)

+

Finally, this model assumes that the site intercepts are drawn from a shared distribution, but that the treatment impact is common to all sites.
\[\begin{align*}
\text{Level 1}\qquad & Y_{ij} = A_j + \beta T_{ij} + e_{ij}\\
\text{Level 2}\qquad & A_j = \alpha + a_j
\end{align*}\]
As noted previously, the multilevel framework generally corresponds naturally to the super population perspective. However, for RICC models, the site impacts are not assumed to be drawn from a super population; only the site intercepts are assumed to be random. Thus, when it comes to estimating treatment impacts, RICC models actually take a finite population perspective.
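The sketch below shows one way these three specifications might be fit with the lme4 package (assumed available) on toy simulated data. It is a simplified illustration; in particular, the FIRC fit here does not allow the arm-specific residual variances that some FIRC implementations use.

```r
library(lme4)  # assumed available; provides lmer()

# Toy multisite data, as in the earlier FE sketch (all values made up).
set.seed(123); J <- 20
n_j   <- sample(seq(20, 100, by = 20), J, replace = TRUE)
dat   <- data.frame(site = factor(rep(1:J, times = n_j)))
dat$T <- unlist(lapply(n_j, function(n) sample(rep(0:1, n / 2))))
dat$Y <- rnorm(J, sd = 0.4)[dat$site] +
         (0.2 + rnorm(J, sd = 0.1)[dat$site]) * dat$T + rnorm(nrow(dat))

# FIRC: fixed site intercepts, random treatment coefficient by site.
firc <- lmer(Y ~ 0 + site + T + (0 + T | site), data = dat)

# RIRC: random site intercepts and random treatment coefficients.
rirc <- lmer(Y ~ T + (1 + T | site), data = dat)

# RICC: random site intercepts, one common treatment coefficient.
ricc <- lmer(Y ~ T + (1 | site), data = dat)

# The overall impact estimate is the fixed-effect coefficient on T in each model.
c(FIRC = fixef(firc)["T"], RIRC = fixef(rirc)["T"], RICC = fixef(ricc)["T"])
```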

+

There are also weighted versions of both traditional regressions and multilevel models. For example, a fixed-effects model can weight each person by their inverse probability of treatment so that the estimator targets the desired (e.g., person-weighted) estimand. Weighted regression for traditional regression is discussed in Miratrix, Weiss, and Henderson (2021), and weighted regression for multilevel models is discussed in Raudenbush and Schwartz (2020).

+
+
+
+

6 Some estimators attempt to reduce variance by increasing bias.

+

Each category of estimator (design, regression, and multilevel) results in a different estimation approach. One way to characterize the categories is by the weights induced by the choice of estimator. The properties of each estimator also carry different consequences for bias and variance. Design-based estimators are unbiased, but may not always afford the most precise estimates. In general, model-based estimators trade bias for variance, so they can sometimes have a lower mean squared error than design-based estimators. One way that model-based estimators increase precision is through the easy incorporation of covariates. Design-based estimators can also incorporate covariates, but it is not always as straightforward, and the resulting covariate adjustment methods end up being equivalent to a weighted regression approach.

+
+

6.1 Design-based estimators

+

Design-based estimators are the most straightforward, as they are composed of simple weighted combinations of means. First, the site-specific treatment impact estimates \(\hat{B_j}\) are calculated by taking the difference in means between the active treatment and control treatment groups within each site. Then, the overall estimate is a combination of these site estimates, using either person or site weights.

+

The design-based estimators are
\[\begin{align*}
\hat{\beta}_{DB-persons} &= \sum_{j = 1}^{J} \frac{N_j}{N} \hat{B_j} \\
\hat{\beta}_{DB-sites} &= \sum_{j = 1}^{J} \frac{1}{J} \hat{B_j}
\end{align*}\]
Design-based estimators are generally unbiased for their corresponding estimands (person-weighted or site-weighted). Unbiasedness does not hold for one super population model; see Pashley and Miratrix (2022) for more details.
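A short sketch of the design-based estimators on toy data: compute each site's difference in means and then combine the site estimates with person or site weights. The simulated values are purely illustrative.

```r
# Toy multisite data, as in the earlier sketches (all values made up).
set.seed(123); J <- 20
n_j   <- sample(seq(20, 100, by = 20), J, replace = TRUE)
dat   <- data.frame(site = factor(rep(1:J, times = n_j)))
dat$T <- unlist(lapply(n_j, function(n) sample(rep(0:1, n / 2))))
dat$Y <- rnorm(J, sd = 0.4)[dat$site] +
         (0.2 + rnorm(J, sd = 0.1)[dat$site]) * dat$T + rnorm(nrow(dat))

# Site-by-site difference in means between treated and control students.
B_hat <- tapply(dat$Y[dat$T == 1], dat$site[dat$T == 1], mean) -
         tapply(dat$Y[dat$T == 0], dat$site[dat$T == 0], mean)
N_j   <- as.numeric(table(dat$site))

# Person-weighted and site-weighted design-based estimates.
c(DB_person = weighted.mean(B_hat, w = N_j),
  DB_site   = mean(B_hat))
```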

+
+
+

6.2 Linear regression estimators

+

Consider the FE model (fixed effects with a constant treatment). This regression model results in a precision-weighted estimate, in which each site impact is weighted by the estimated precision of that site's impact estimate. The estimator is
\[
\hat{\beta}_{FE} = \sum_{j = 1}^{J} \frac{N_j p_j (1 - p_j)}{Z} \hat{B_j},
\]
where \(p_j\) is the proportion treated at site \(j\). The quantity \(Z\) is a normalizing constant, \(Z = \sum_{j = 1}^{J} N_j p_j (1-p_j)\), which ensures the weights sum to one. In this model, the weights include \(p_j\), which carries information about the precision of the estimate for that site: \(N_j p_j (1 - p_j)\) is proportional to the inverse of \(Var(\hat{\beta_j})\). This expression shows that sites with larger \(N_j\), or with \(p_j\) closer to \(0.5\), receive larger weights.

+

The FE estimator is not generally unbiased for either the person-weighted or the site-weighted estimand. If the impact size \(B_j\) is related to the weights \(N_j p_j (1 - p_j)\), then the estimator can be biased. For example, if sites that treat a higher proportion of their units also have larger impacts, then \(B_j\) is related to \(p_j (1- p_j)\). This setting is plausible, for example, if sites with more resources both treat more of their students and implement the intervention more effectively. Similarly, if larger sites are more effective, then \(B_j\) is related to \(N_j p_j (1- p_j)\).

+

Instead, the FE estimator is unbiased for an estimand that weights the site impacts by \(N_j p_j (1- p_j)\). +However, this estimand does not have a natural substantive interpretation. +Although the FE estimator is generally biased for the estimands of interest, it may have increased precision and thus a lower mean squared error.
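To see what the precision weights do, here is a small numerical sketch with made-up site sizes, treatment proportions, and site-impact estimates; when impacts are related to the weights, the FE estimate lands away from both the person- and site-weighted averages.

```r
# Hypothetical site sizes, treatment proportions, and estimated site impacts.
N_j   <- c(1000, 200, 100)
p_j   <- c(0.8, 0.5, 0.5)
B_hat <- c(2, 8, 10)

# Implied fixed-effect precision weights, normalized to sum to one.
w_fe <- N_j * p_j * (1 - p_j)
w_fe <- w_fe / sum(w_fe)

c(FE_precision    = sum(w_fe * B_hat),
  person_weighted = weighted.mean(B_hat, w = N_j),
  site_weighted   = mean(B_hat))
# roughly 4.1 vs 3.5 vs 6.7 with these made-up numbers
```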

+

In contrast, the FE-inter model ends up with weights identical to those of the design-based estimators: weighting the estimated site impacts by site size reproduces the person-weighted estimator, and weighting them equally reproduces the site-weighted estimator.

+
+
+

6.3 Multilevel model estimators

+

Multilevel models also result in precision weighting, but in these models the estimated precision also takes into account the assumed underlying variance in site impacts. For example, consider the FIRC model:
\[
\hat{\beta}_{ML-FIRC*} = \sum_{j = 1}^{J} \frac{1}{Z}
\left(\frac{\sigma^2}{N_j p_j ( 1 - p_j)} + \tau^2\right)^{-1} \hat{B_j},
\]
where \(Z\) is again a normalizing constant, \(Z = \sum_{j = 1}^{J} \left(\frac{\sigma^2}{N_j p_j ( 1 - p_j)} + \tau^2\right)^{-1}\). This expression assumes that \(b_j\) has known variance \(\tau^2\), and \(e_{ij}\) has known variance \(\sigma^2\). In general, we do not know these quantities and must instead estimate them. Even so, we can see that the implied precision weights incorporate the additional uncertainty assumed in the value of \(b_j\).
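A small sketch, using made-up values for \(\sigma^2\) and \(\tau^2\), makes the contrast with the fixed-effect weights visible: the \(\tau^2\) term pulls the random-effect precision weights toward equality across sites.

```r
# Hypothetical inputs: site sizes, treatment proportions, residual and impact variances.
N_j    <- c(1000, 200, 100)
p_j    <- c(0.8, 0.5, 0.5)
sigma2 <- 1
tau2   <- 0.05

# Implied random-effect precision weights for the idealized FIRC estimator.
w_firc <- 1 / (sigma2 / (N_j * p_j * (1 - p_j)) + tau2)
w_firc <- w_firc / sum(w_firc)

# Fixed-effect precision weights, for comparison.
w_fe <- N_j * p_j * (1 - p_j)
w_fe <- w_fe / sum(w_fe)

round(rbind(FIRC = w_firc, FE = w_fe), 2)
# the largest site dominates less under FIRC-style weighting than under FE
```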

+

The RIRC model imposes the same structure on the site impacts, and thus its weights are similar to those of the FIRC model. The RICC model assumes a constant treatment impact, and thus is essentially equivalent to the fixed effects with constant treatment model (FE) when it comes to estimating the overall treatment impact.

+

We summarize the weights below.

| Weight name | Weight | Estimators |
|---|---|---|
| Unbiased person-weighting | \(w_j \propto N_j\) | \(\hat{\beta}_{DB-FP-person}\), \(\hat{\beta}_{DB-SP-person}\), \(\hat{\beta}_{FE-weight-person}\), \(\hat{\beta}_{FE-inter-person}\) |
| Fixed-effect precision-weighting | \(w_j \propto N_j p_j (1 - p_j)\) | \(\hat{\beta}_{FE}\), \(\hat{\beta}_{FE-HW}\), \(\hat{\beta}_{FE-CR}\), \(\hat{\beta}_{ML-RICC}\) (approximately) |
| Random-effect precision-weighting | \(w_j \propto \left[\hat{\sigma}^2 / (N_j p_j (1 - p_j)) + \hat{\tau}^2\right]^{-1}\) (approximately) | \(\hat{\beta}_{ML-FIRC}\), \(\hat{\beta}_{ML-RIRC}\) |
| Unbiased site-weighting | \(w_j \propto 1\) | \(\hat{\beta}_{DB-FP-site}\), \(\hat{\beta}_{DB-SP-site}\), \(\hat{\beta}_{FE-weight-site}\), \(\hat{\beta}_{FE-inter-site}\) |
+
+
+
+

7 For each estimator that produces a point estimate, there may be multiple options for estimating standard errors.

+

The difference between the finite population and super population frameworks comes into focus when calculating the standard error of the various estimators. In general, the super population framework results in larger estimates of error because of the additional uncertainty induced by assuming the sites observed are randomly drawn from a larger population. Variation can generally be characterized as either within-site variation or between-site variation. In the finite population framework, estimators calculate variation within sites and then average this variation across sites. In the super population framework, estimators look at the variation between sites to “capture both any within-site estimation error along with the uncertainty associated with sampling sites from a larger population” (Miratrix, Weiss, and Henderson 2021). For both approaches, modeling assumptions can stabilize uncertainty estimation procedures, but also risk inducing bias if the modeling assumptions are wrong.

+

For design-based estimators under the finite population framework, Neyman developed a conservative estimator of the standard error using the observed outcomes. First, within-site uncertainty is estimated, and then these estimates are averaged with weights according to the target estimand. The super population framework induces more complicated expressions that take into account the additional population variance. The details of standard errors for super population design-based estimators are beyond the scope of this guide.

+

For linear regression estimators, the traditional way to calculate standard errors is using classical regression theory. We term this a model-based standard error approach, as it relies on the assumed model of \(iid\) errors. Alternatively, heteroskedasticity-robust standard errors (Huber-White) or cluster-robust standard errors relax this \(iid\) assumption (see Weiss and Gupta (2017) and Richburg-Hayes and Bloom (2008)). Robust standard errors fall into a design-based approach instead of a model-based approach (Lin 2013; Chapter 3 of Gerber and Green 2012). Huber-White standard errors correspond to the finite population framework, while the asymptotic theory justifying traditional cluster-robust standard errors corresponds to the super population framework with regard to the clusters. In a cluster-randomized trial, treatment is assigned to clusters, so there is also a finite-population-of-clusters perspective on cluster-robust standard errors; this perspective is approximated by what are commonly known as CR2 standard errors (Pustejovsky and Tipton 2018).

+

To briefly summarize this correspondence, first consider the motivation behind robust standard error estimators. In the FE model, treatment effects are assumed to be constant across sites. Thus, if there is truly treatment effect heterogeneity, units in different sites will have different amounts of variation, and this variation will be absorbed into the error term, breaking the assumption of \(iid\) errors. Huber-White standard errors allow for heteroskedasticity in the residuals while still treating the sites as fixed, which fits into a finite population framework: in fact, Lin (2013) shows that the standard error derived by Splawa-Neyman, Dabrowska, and Speed (1923/1990) on finite population and design-based principles is the same as the HC2 standard error.

+

In contrast, for cluster robust standard errors, “the conventional adjustments, often implicitly, assume that the clusters in the sample are only a small fraction of the clusters in the population of interest” (Abadie et al. (2017)). +Using cluster robust standard errors accounts for both correlation of individuals within sites, and different amounts of variation across sites. +This strategy generally results in larger standard errors. +For more discussion of cluster robust standard errors, see Abadie et al. (2017) and Pustejovsky and Tipton (2018).
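The sketch below illustrates the mechanics on toy data: HC2 standard errors via the estimatr package and CR2 cluster-robust standard errors via the clubSandwich package (both assumed available). It is only an illustration, not a recommendation for any particular study.

```r
library(estimatr)      # assumed available; lm_robust() gives HC-type robust SEs
library(clubSandwich)  # assumed available; vcovCR() gives CR2 cluster-robust SEs

# Toy multisite data, as in the earlier sketches (all values made up).
set.seed(123); J <- 20
n_j   <- sample(seq(20, 100, by = 20), J, replace = TRUE)
dat   <- data.frame(site = factor(rep(1:J, times = n_j)))
dat$T <- unlist(lapply(n_j, function(n) sample(rep(0:1, n / 2))))
dat$Y <- rnorm(J, sd = 0.4)[dat$site] +
         (0.2 + rnorm(J, sd = 0.1)[dat$site]) * dat$T + rnorm(nrow(dat))

# FE model with HC2 (Huber-White style) standard errors.
fe_hc2 <- lm_robust(Y ~ 0 + site + T, data = dat, se_type = "HC2")
fe_hc2$std.error["T"]

# The same FE model with CR2 cluster-robust standard errors, clustering on site.
fe <- lm(Y ~ 0 + site + T, data = dat)
sqrt(diag(vcovCR(fe, cluster = dat$site, type = "CR2")))["T"]
```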

+

Finally, the details of standard error estimation for multilevel modeling are outside the scope of this guide. Generally, maximum likelihood theory is applied, which “requires a complete model for both the random effects and the residual variances” (Miratrix, Weiss, and Henderson 2021). FIRC and RIRC models naturally produce standard errors under the super population framework, while RICC essentially takes a finite population framework because the treatment impacts are not assumed to be drawn from a super population; they are assumed to be constant across sites.

+
+
+

8 The analyst’s choices of estimand, estimator, and standard error estimator matter in some cases, and matter less in others.

+

After discussing the different choices a researcher can make in analyzing a multisite trial, a big question remains: how do these choices impact empirical results? +Which of these choices have a substantial impact on the conclusion we reach, and which do not matter as much? +Miratrix, Weiss, and Henderson (2021) conducted an empirical study to investigate these questions using 12 large multisite trials.

+
+

8.1 Point estimates

+

First, they consider the impact on point estimates. +The authors ask, “to what extent can the choice of estimator of the overall average treatment effect result in a different impact estimate?” +In general, the authors find that the choice of estimator can substantially impact the point estimates, although the degree of impact depends on the choice. +The authors reach the following conclusions.

+

Person-weighted estimands can result in a different conclusion than site-weighted estimands.

+

In some trials, estimates resulting from person-weighted estimands differed substantially from estimates resulting from site-weighted estimands. +These discrepancies could be due to a difference in the true underlying values of the estimands, but they could also be due to estimation error from the estimation procedure. +Through empirical exploration, they found that the difference is likely due to the estimands themselves being different. +They found that “the range of estimates across all estimators is rarely meaningfully larger than the range between the person- and site-weighted estimates alone.”

+

For person-weighted estimands, the choice of estimator generally does not matter.

+

The unbiased design-based estimator and the precision-weighted fixed effect estimate both target the person-weighted estimand. +There was little difference in estimates between these estimators. +Most likely, “this implies that the potential bias in the bias-precision tradeoff to the fixed effect estimators is negligible in practice.” +Other authors have been able to create situations in which the bias-precision tradeoff is more severe.

+

For site-weighted estimands, the choice of estimator can matter.

+

FIRC estimates did differ from those of the unbiased design-based site estimator. FIRC can be seen as an adaptive estimator: when there is little observed variation between sites, it tends to be closer to the person-weighted estimate than to the site-weighted estimate.

+

Different estimators have different bias-variance tradeoffs.

+

Finally, the authors consider the empirical bias-variance tradeoff of different estimators, and find:

+
• FE estimators have little bias, but also do not improve precision much over design-based estimators.
• To further investigate, they conducted a simulation study and found:
  • FIRC tends to have lower mean squared error than design-based estimators.
  • Larger site impact heterogeneity results in more biased estimates for FIRC.
  • Even with more site impact heterogeneity, the mean squared error for FIRC estimators is still generally lower.
  • Coverage for design-based estimators is more reliable, especially when site size is variable and site size is correlated with impact.
+
+
+

8.2 Standard errors

+

The second question concerns the choice of standard error estimators. +The authors ask, “to what extent can the choice of estimator of the standard error of the overall average treatment effect result in a different estimated standard error?”

+

As with the point estimates, the choice of standard error estimator can substantially impact the estimated standard error. The authors reach the following conclusions.

+

The choice of estimand impacts the standard error.

+

Super population estimators generally have larger standard errors than finite population estimators. +Site-weighted estimators generally have larger standard errors than person-weighted estimators.

+

Given a particular estimand, the choice of estimator matters in some contexts and not others.

+

For finite population estimands (including both person- and site-weighted estimands) and for super population person-weighted estimands, the choice of standard error estimator generally does not matter. Miratrix, Weiss, and Henderson (2021) found that estimators that attempt to improve precision by accepting some bias may not actually deliver precision gains in practice. Robust standard errors also do not differ much from non-robust standard errors in practice.

+

For super population site-weighted estimands, the choice of standard error estimator can matter a lot. +In most cases, standard error estimates differed substantially between the design-based super population estimator and FIRC. +The authors further conclude that for super population site-weighted estimands, the wide-ranging standard error estimates stem from instability in estimation. +Through a simulation study, they find that super population standard errors can underestimate the true error. +The design-based super population standard error estimator is particularly prone to underestimate the standard error compared to multilevel models, and can be unstable, in that it estimates a wide range of different values across simulations.

+
+
+
+

9 The choice of estimator impacts power.

+

Given the discussion thus far, it is not surprising that the modeling choices made by the analyst also impact power.

+

First, we define an important quantity in power calculations: the intraclass correlation coefficient (ICC). Broadly, variation can be categorized into within-site variation and between-site variation. Blocking helps compensate for between-site variation. In educational trials, the ICC is the proportion of variance in the outcome that lies between sites (Schochet 2016); it is defined as the ratio of the site-level variance to the total variance of the individual outcomes. This quantity plays a different role in block-randomized trial power analysis depending on the target of inference chosen by the analyst. The ICC is also used in the design and analysis of cluster-randomized trials.
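One simple way to gauge the ICC from pilot or control-group data is to fit a random-intercept model and take the share of variance attributable to sites. The sketch below does this with lme4 on toy data (all values made up), using only the control group so that treatment effect variation is not folded into the site-level variance.

```r
library(lme4)  # assumed available

# Toy multisite data, as in the earlier sketches (all values made up).
set.seed(123); J <- 20
n_j   <- sample(seq(20, 100, by = 20), J, replace = TRUE)
dat   <- data.frame(site = factor(rep(1:J, times = n_j)))
dat$T <- unlist(lapply(n_j, function(n) sample(rep(0:1, n / 2))))
dat$Y <- rnorm(J, sd = 0.4)[dat$site] +
         (0.2 + rnorm(J, sd = 0.1)[dat$site]) * dat$T + rnorm(nrow(dat))

# Random-intercept model on the control group, then ICC = site variance / total variance.
fit <- lmer(Y ~ 1 + (1 | site), data = subset(dat, T == 0))
vc  <- as.data.frame(VarCorr(fit))
icc <- vc$vcov[vc$grp == "site"] / sum(vc$vcov)
icc
```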

+

We consider two different estimators and how they impact power. +First, consider a version of the FE model that has been expanded to include level 1 (student) covariates. +The standard error for the ATE estimator is

+

\[
SE = \sqrt{\frac{(1-\text{ICC})(1-R^2_{1})}{\bar{T}(1 - \bar{T}) J \bar{n}}},
\]

+

where \(ICC\) is the intraclass correlation, \(R_1^2\) is the proportion of variation explained by level 1 (student) covariates, \(\bar{T}\) is the average proportion of treated units per site, \(J\) is the number of sites, and \(\bar{n}\) is the average number of students per site. For more information about this standard error expression, see the technical appendix of Hunter, Miratrix, and Porter (2022).

+

In contrast, consider the RIRC model. The standard error for the ATE estimator is
\[
SE = \sqrt{\frac{\text{ICC} \, \omega}{J} + \frac{(1-\text{ICC})(1-R^2_{1})}{\bar{T}(1 - \bar{T}) J \bar{n}}},
\]
where \(\omega\) is the ratio between the impact variation and the control outcome variation. We can see that in doing super population inference, the standard error has an additional nonnegative term, so it will be at least as large as the standard error from finite population inference. A larger standard error results in lower power.
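The two expressions are easy to compare directly. The helper functions below are our own plug-in versions of the formulas above (the function and argument names are ours, not from any package), evaluated at made-up design parameters.

```r
# Finite population (FE-style) standard error of the ATE (standardized effect size units).
se_fe <- function(ICC, R2_1, Tbar, J, nbar) {
  sqrt((1 - ICC) * (1 - R2_1) / (Tbar * (1 - Tbar) * J * nbar))
}

# Super population (RIRC-style) standard error, adding the cross-site impact term.
se_rirc <- function(ICC, omega, R2_1, Tbar, J, nbar) {
  sqrt(ICC * omega / J + (1 - ICC) * (1 - R2_1) / (Tbar * (1 - Tbar) * J * nbar))
}

# Hypothetical design: 20 sites of 40 students, half treated, ICC = 0.15,
# student covariates explaining 30% of variance, impact-variation ratio omega = 0.3.
se_fe(ICC = 0.15, R2_1 = 0.3, Tbar = 0.5, J = 20, nbar = 40)                 # ~0.05
se_rirc(ICC = 0.15, omega = 0.3, R2_1 = 0.3, Tbar = 0.5, J = 20, nbar = 40)  # ~0.07
```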

+

Examining these standard error formulae also gives a better understanding of what factors impact power. +For example, having more explanatory power of student-level covariates (higher \(R_1^2\)) decreases the standard error, and thus increases power. +Additionally, the individual-level covariates do not impact the super population term; they only help to reduce the component of standard error corresponding to the finite population. +However, site-level covariates, which would be denoted \(R_2^2\), do not impact power. +Site-level covariates are not useful in these models because there are already site-level effects, so the addition of covariates at that level does not provide more information.

+

To calculate power for multisite trials, researchers can use the PowerUpR package (Dong and Maynard 2013), which also calculates sample size requirements and minimum detectable effect sizes. The newly developed PUMP package (Hunter, Miratrix, and Porter 2022) extends the functionality of PowerUpR to experiments with multiple outcomes, in addition to providing user-friendly tools for exploring the sensitivity of power to different assumptions.

+
+
+

10 Takeaway advice for researchers on multisite trials

+

Many research plans and analyses do not clearly specify an estimand. This lack of clarity can both obscure the goal and result in poor analysis choices. For example, different estimands imply different power analyses, but this is often not taken into account; super population estimands in particular result in larger standard errors, and thus often require larger sample sizes to be adequately powered. Additionally, different estimands require different estimators, so not defining an estimand makes it difficult for readers to judge the validity of the analysis. Miratrix, Weiss, and Henderson (2021) show that the choice of estimand, estimator, and standard error estimator can matter (albeit in some cases more than others), in that the choice can impact the final conclusion reached by a study.

+

This guide did not focus on the problem of model misspecification. +For the empirical estimates from the multisite trials considered, model-based and design-based approaches did not result in substantially different answers. +However, it is conceivable that there are contexts when these estimators could differ, and further investigation into this area is warranted.

+

Though this guide set up analyzing an RCT as a series of dichotomous choices, one way forward is for researchers to report more than one estimand. Presenting a finite population, person-weighted estimate is almost always compelling. The researcher may then choose to also present a site-weighted estimate, or to expand their conclusions to a super population estimand. In some cases, different estimands may result in the same conclusion. However, it is possible that for some studies there is evidence of a significant effect in the finite population, but the additional uncertainty of super population estimation means there is insufficient evidence concerning the impact in a broader population. In these cases, reporting only one of the finite population or super population estimands does not portray the full nuance of the results.

+
+
+

References

+
+
+Abadie, Alberto, Susan Athey, Guido W. Imbens, and Jeffrey Wooldridge. 2017. “When Should You Adjust Standard Errors for Clustering?” NBER. +
+
Bloom, H. S., S. W. Raudenbush, M. J. Weiss, and K. Porter. 2017. “Using Multisite Experiments to Study Cross-Site Variation in Treatment Effects: A Hybrid Approach with Fixed Intercepts and a Random Treatment Coefficient.” Journal of Research on Educational Effectiveness 10 (4): 817–42.
+
+Bowers, Jake. 2011. “Making Effects Manifest in Randomized Experiments.” In Cambridge Handbook of Experimental Political Science, edited by James N. Druckman, Donald P. Green, James H. Kuklinski, and Arthur Lupia. New York, NY: Cambridge University Press. +
+
Clark, M. A., P. Gleason, C. C. Tuttle, and M. K. Silverberg. 2011. “Do Charter Schools Improve Student Achievement? Evidence from a National Randomized Study.” Mathematica Policy Research, Inc.
+
+Dong, Nianbo, and Rebecca Maynard. 2013. “PowerUP!: A Tool for Calculating Minimum Detectable Effect Sizes and Minimum Required Sample Sizes for Experimental and Quasi-Experimental Design Studies.” Journal of Research on Educational Effectiveness 6 (1): 24–67. +
+
+Gerber, Alan S, and Donald P Green. 2012. Field experiments: Design, analysis, and interpretation. WW Norton. +
+
+Hunter, Kristen, Luke Miratrix, and Kristin Porter. 2022. “Power Under Multiplicity Project (PUMP): Estimating Power, Minimum Detectable Effect Size, and Sample Size When Adjusting for Multiple Outcomes.” arXiv. https://arxiv.org/abs/2112.15273. +
+
+Imai, Kosuke, Gary King, and Elizabeth A. Stuart. 2008. “Misunderstandings Between Experimentalists and Observationalists about Causal Inference.” Journal of the Royal Statistical Society: Series A 171 (2): 481–502. +
+
+Imbens, Guido W., and Donald B. Rubin. 2015. Causal Inference for Statistics, Social, and Biomedical Sciences: An Introduction. Cambridge University Press. +
+
Lin, Winston. 2013. “Agnostic notes on regression adjustments to experimental data: Reexamining Freedman’s critique.” The Annals of Applied Statistics 7 (1): 295–318.
+
+Miratrix, Luke E., Michael Weiss, and Brit Henderson. 2021. “An Applied Researcher’s Guide to Estimating Effects from Multisite Individually Randomized Trials: Estimands, Estimators, and Estimates.” Journal of Research on Educational Effectiveness 14. +
+
+Moore, Ryan T. 2012. “Multivariate Continuous Blocking to Improve Political Science Experiments.” Political Analysis 20 (4): 460–79. +
+
Moore, Ryan T, and Sally A Moore. 2013. “Blocking for Sequential Political Experiments.” Political Analysis 21: 507–23.
+
+Pashley, Nicole E., and Luke W. Miratrix. 2021. “Insights on Variance Estimation for Blocked and Matched Pairs Designs.” Journal of Educational and Behavioral Statistics 46 (3): 271–96. +
+
+———. 2022. “Block When You Can, Except When You Shouldn’t.” Journal of Educational and Behavioral Statistics 47 (1). +
+
+Pustejovsky, James E., and Elizabeth Tipton. 2018. “Small-Sample Methods for Cluster-Robust Variance Estimation and Hypothesis Testing in Fixed Effects Models.” Journal of Business & Economic Statistics 36 (4). +
+
+Raudenbush, S. W., and D. Schwartz. 2020. “Randomized Experiments in Education, with Implications for Multilevel Causal Inference.” Annual Review of Statistics and Its Application 7 (1). +
+
+Raudenbush, Stephen W., and Howard S. Bloom. 2015. “Learning about and from a Distribution of Program Impacts Using Multisite Trials.” American Journal of Evaluation 36: 475–99. +
+
Richburg-Hayes, L., M. Visher, and D. Bloom. 2008. “Do Learning Communities Effect Academic Outcomes? Evidence from an Experiment in a Community College.” Journal of Research on Educational Effectiveness 1 (1): 33–65.
+
+Rubin, D. B. 1990. “Formal Modes of Statistical Inference for Causal Effects.” Journal of Statistical Planning and Inference 25: 279–92. +
+
+Schochet, Peter Z. 2016. “Statistical Theory for the RCT-YES Software: Design-Based Causal Inference for RCTs.” +
+
Splawa-Neyman, Jerzy, Dorota M. Dabrowska, and Terence P. Speed. 1923/1990. “On the Application of Probability Theory to Agricultural Experiments. Essay on Principles. Section 9.” Statistical Science 5 (4): 465–72.
+
Weiss, M. J., A. Ratledge, C. Sommo, and H. Gupta. 2017. “Supporting Community College Students from Start to Degree Completion: Long-Term Evidence from a Randomized Trial of CUNY’s ASAP.” American Economic Journal: Applied Economics 11 (3).
+
+
+
+
+
    +
1. Linear regression can be used as a tool in both design-based approaches (to calculate the difference in means) and model-based approaches (to estimate the parameters of a Normal data-generating process). In general, this guide considers linear regression as used in a model-based approach.↩︎

+
+ + + + + +
+ + + + + + + + + + + + + + + + diff --git a/multisite/refs.bib b/multisite/refs.bib new file mode 100644 index 0000000..c9fa9a2 --- /dev/null +++ b/multisite/refs.bib @@ -0,0 +1,249 @@ +%% This BibTeX bibliography file was created using BibDesk. +%% https://bibdesk.sourceforge.io/ + +%% Created for jwbowers at 2022-05-24 14:43:18 -0500 + + +%% Saved with string encoding Unicode (UTF-8) + + + +@article{rubin:1990b, + author = {Rubin, D. B.}, + date-added = {2022-05-24 14:23:51 -0500}, + date-modified = {2022-05-24 14:23:51 -0500}, + journal = {Journal of Statistical Planning and Inference}, + keywords = {Randomization}, + pages = {279--292}, + title = {Formal Modes of Statistical Inference for Causal Effects}, + volume = {25}, + year = {1990}, + bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxA3Li4vLi4vLi4vLi4vLi4vRHJvcGJveC9QQVBFUlMvUnViaW4xOTkwe3J1YmluMTk5MGJ9LnBkZk8RAYAAAAAAAYAAAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAAAAAABCRAAB/////xlSdWJpbjE5OTB7cnViaW4xOTkwYn0ucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAAAAAAAAAAAAAAUAAwAACiBjdQAAAAAAAAAAAAAAAAAGUEFQRVJTAAIAOS86VXNlcnM6andib3dlcnM6RHJvcGJveDpQQVBFUlM6UnViaW4xOTkwe3J1YmluMTk5MGJ9LnBkZgAADgA0ABkAUgB1AGIAaQBuADEAOQA5ADAAewByAHUAYgBpAG4AMQA5ADkAMABiAH0ALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASADdVc2Vycy9qd2Jvd2Vycy9Ecm9wYm94L1BBUEVSUy9SdWJpbjE5OTB7cnViaW4xOTkwYn0ucGRmAAATAAEvAAAVAAIAD///AAAACAANABoAJABeAAAAAAAAAgEAAAAAAAAABQAAAAAAAAAAAAAAAAAAAeI=}} + +@article{freedman2008rae, + author = {Freedman, David A.}, + date-added = {2022-05-24 13:44:08 -0500}, + date-modified = {2022-05-24 13:44:08 -0500}, + journal = {Advances in Applied Mathematics}, + number = {2}, + pages = {180--193}, + publisher = {Elsevier}, + title = {{On regression adjustments to experimental data}}, + volume = {40}, + year = {2008}} + +@article{lin2013agnostic, + author = {Lin, Winston}, + date-added = {2022-05-24 13:43:00 -0500}, + date-modified = {2022-05-24 13:43:00 -0500}, + journal = {The Annals of Applied Statistics}, + number = {1}, + pages = {295--318}, + publisher = {Institute of Mathematical Statistics}, + title = {{Agnostic notes on regression adjustments to experimental data: Reexamining Freedman's critique}}, + volume = {7}, + year = {2013}} + +@incollection{bowers2011mem, + address = {New York, NY}, + author = {Bowers, Jake}, + booktitle = {Cambridge Handbook of Experimental Political Science}, + chapter = {32}, + date-added = {2022-05-24 13:10:38 -0500}, + date-modified = {2022-05-24 13:10:38 -0500}, + editor = {Druckman, James N. and Green, Donald P. and Kuklinski, James H. 
and Lupia, Arthur}, + publisher = {Cambridge University Press}, + title = {Making Effects Manifest in Randomized Experiments}, + year = {2011}} + +@article{moore2013, + author = {Moore, Ryan T and Moore, Sally A}, + date-added = {2022-05-24 13:08:29 -0500}, + date-modified = {2022-05-24 13:08:29 -0500}, + issn = {10471987}, + journal = {Political Analysis}, + pages = {507--523}, + pmid = {24143061}, + title = {Blocking for sequential political experiments}, + volume = {21}, + year = {2013}, + bdsk-url-1 = {http://dx.doi.org/10.1093/pan/mpt007}} + +@article{moore2012multivariate, + author = {Moore, Ryan T}, + date-added = {2022-05-24 13:08:24 -0500}, + date-modified = {2022-05-24 13:08:24 -0500}, + journal = {Political Analysis}, + number = {4}, + pages = {460--479}, + publisher = {Cambridge University Press}, + title = {Multivariate continuous blocking to improve political science experiments}, + volume = {20}, + year = {2012}} + +@book{gerber2012field, + author = {Gerber, Alan S and Green, Donald P}, + date-added = {2022-05-24 12:51:40 -0500}, + date-modified = {2022-05-24 12:51:40 -0500}, + publisher = {WW Norton}, + title = {{Field experiments: Design, analysis, and interpretation}}, + year = {2012}} + +@techreport{Abadie2017, + author = {Alberto Abadie and Susan Athey and Guido W. Imbens and Jeffrey Wooldridge}, + title = {When should you adjust standard errors for clustering?}, + institution = {NBER}, + year = {2017}, + bdsk-url-1 = {https://www-nber-org.ezp-prod1.hul.harvard.edu/papers/w24003}} + +@article{Bloom2017, + author = {Bloom, H. S., Raudenbush, S. W., Weiss, M. J., and Porter, K.}, + journal = {Journal of Research on Educational Effectiveness}, + number = {4}, + pages = {817-842}, + title = {Using multisite experiments to study cross-site variation in treatment effects: A hybrid approach with fixed intercepts and a random treatment coefficient}, + volume = {10}, + year = {2017}, + bdsk-url-1 = {https://doi.org/10.1080/19345747.2016.1264518}} + +@misc{Clark2011, + author = {Clark, M. A., Gleason, P., Tuttle, C. C., and Silverberg, M. K.}, + howpublished = {Mathematica Policy Research, Inc}, + title = {Do Charter Schools Improve Student Achievement? Evidence from a National Randomized Study}, + year = {2011}, + bdsk-url-1 = {https://eric.ed.gov/?id=ED528381}} + +@article{Dong2013, + author = {Dong, Nianbo and Maynard, Rebecca}, + issn = {1934-5747}, + journal = {Journal of Research on Educational Effectiveness}, + number = {1}, + pages = {24-67}, + title = {PowerUP!: A Tool for Calculating Minimum Detectable Effect Sizes and Minimum Required Sample Sizes for Experimental and Quasi-Experimental Design Studies}, + volume = {6}, + year = {2013}, + bdsk-url-1 = {https://doi.org/10.1080/19345747.2012.673143}} + +@misc{Hunter2022, + author = {Kristen Hunter and Luke Miratrix and Kristin Porter}, + journal = {arXiv}, + title = {Power Under Multiplicity Project (PUMP): Estimating Power, Minimum Detectable Effect Size, and Sample Size When Adjusting for Multiple Outcomes}, + url = {https://arxiv.org/abs/2112.15273}, + year = {2022}, + bdsk-url-1 = {https://arxiv.org/abs/2112.15273}} + +@book{Imbens2015, + author = {Guido W. Imbens and Donald B. Rubin}, + publisher = {Cambridge University Press}, + title = {Causal Inference for Statistics, Social, and Biomedical Sciences: {A}n Introduction}, + year = {2015}} + +@article{Imai2008, + author = {Kosuke Imai and Gary King and Elizabeth A. 
Stuart}, + journal = {Journal of the Royal Statistical Society: Series A}, + number = {2}, + pages = {481-502}, + title = {Misunderstandings between experimentalists and observationalists about causal inference}, + volume = {171}, + year = {2008}, + bdsk-url-1 = {https://doi.org/10.1111/j.1467-985X.2007.00527.x}} + +@article{Neyman1923, + author = {Jerzy Splawa-Neyman and Dortoa M Dabrowska and Terence P. Speed}, + journal = {Statistical Science}, + number = {4}, + pages = {465-472}, + publisher = {JSTOR}, + title = {On the Application of Probability Theory to Agricultural Experiments. {E}ssay on Principles. {S}ection 9.}, + volume = {5}, + year = {1923/1990}} + +@article{Miratrix2020, + author = {Luke E. Miratrix and Michael Weiss and Brit Henderson}, + journal = {Journal of Research on Educational Effectiveness}, + title = {An Applied Researcher's Guide to Estimating Effects From Multisite Individually Randomized Trials: Estimands, Estimators, and Estimates}, + volume = {14}, + issue = {1}, + year = {2021}, + bdsk-url-1 = {https://doi.org/10.1080/19345747.2020.1831115}} + +@article{Pashley2021, + author = {Nicole E. Pashley and Luke W. Miratrix}, + journal = {Journal of Educational and Behavioral Statistics}, + number = {3}, + pages = {271-296}, + title = {Insights on Variance Estimation for Blocked and Matched Pairs Designs}, + volume = {46}, + year = {2021}, + bdsk-url-1 = {https://doi.org/10.3102/1076998620946272}} + +@article{Pashley2022, + author = {Nicole E. Pashley and Luke W. Miratrix}, + journal = {Journal of Educational and Behavioral Statistics}, + number = {1}, + title = {Block when you can, except when you shouldn't}, + volume = {47}, + year = {2022}, + bdsk-url-1 = {https://doi.org/10.3102/10769986211027240}} + +@article{Pustejovsky2018, + author = {James E. Pustejovsky and Elizabeth Tipton}, + journal = {Journal of Business \& Economic Statistics}, + volume = {36}, + number = {4}, + title = {Small-Sample Methods for Cluster-Robust Variance Estimation and Hypothesis Testing in Fixed Effects Models}, + year = {2018}, + bdsk-url-1 = {https://doi.org/10.1080/07350015.2016.1247004}} + +@article{Raudenbush2007, + author = {Stephen W. Raudenbush and Andres Martinez and Jessaca Spybrook}, + journal = {Educational Evaluation and Policy Analysis}, + number = {1}, + pages = {5-29}, + title = {Strategies for Improving Precision in Group-Randomized Experiments}, + volume = {29}, + year = {2007}, + bdsk-url-1 = {https://doi.org/10.3102/0162373707299460}} + +@article{Raudenbush2015, + author = {Stephen W. Raudenbush and Howard S. Bloom}, + journal = {American Journal of Evaluation}, + pages = {475-499}, + title = {Learning About and From a Distribution of Program Impacts Using Multisite Trials}, + volume = {36}, + year = {2015}, + bdsk-url-1 = {https://doi.org/10.1177/1098214015600515}} + +@article{Raudenbush2020, + author = {Raudenbush, S. W., and Schwartz, D.}, + journal = {Annual Review of Statistics and Its Application}, + number = {1}, + title = {Randomized Experiments in Education, with Implications for Multilevel Causal Inference}, + volume = {7}, + year = {2020}, + bdsk-url-1 = {https://doi.org/10.1146/annurev-statistics-031219-041205}} + +@article{Richburg-Hayes2008, + author = {Richburg-Hayes, L., Visher, M., and Bloom, D.}, + journal = {Journal of Research on Educational Effectiveness}, + number = {1}, + pages = {33-65}, + title = {Do learning communities effect academic outcomes? 
Evidence from an experiment in a community college}, + volume = {1}, + year = {2008}, + bdsk-url-1 = {https://doi.org/10.1080/19345740701692472}} + +@misc{Schochet2016, + author = {Peter Z. Schochet}, + title = {Statistical Theory for the RCT-YES Software: Design-Based Causal Inference for RCTs}, + year = {2016}, + bdsk-url-1 = {https://www.mathematica.org/~/media/publications/pdfs/education/statistical_theory.pdf}} + +@article{Weiss2019, + author = {Weiss, M. J., Ratledge, A., Sommo, C., and Gupta, H.}, + journal = {Annual Economic Journal: Applied Economics}, + number = {3}, + title = {Supporting Community College Students from Start to Degree Completion: Long-Term Evidence from a Randomized Trial of CUNY's ASAP}, + volume = {11}, + year = {2017}, + bdsk-url-1 = {https://doi-org.ezp-prod1.hul.harvard.edu/10.1080/19345740701692472}} diff --git a/multisite/renv.lock b/multisite/renv.lock new file mode 100644 index 0000000..8ad143f --- /dev/null +++ b/multisite/renv.lock @@ -0,0 +1,196 @@ +{ + "R": { + "Version": "4.1.2", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cloud.r-project.org" + } + ] + }, + "Packages": { + "base64enc": { + "Package": "base64enc", + "Version": "0.1-3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "543776ae6848fde2f48ff3816d0628bc", + "Requirements": [] + }, + "digest": { + "Package": "digest", + "Version": "0.6.29", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "cf6b206a045a684728c3267ef7596190", + "Requirements": [] + }, + "evaluate": { + "Package": "evaluate", + "Version": "0.15", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "699a7a93d08c962d9f8950b2d7a227f1", + "Requirements": [] + }, + "fastmap": { + "Package": "fastmap", + "Version": "1.1.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "77bd60a6157420d4ffa93b27cf6a58b8", + "Requirements": [] + }, + "glue": { + "Package": "glue", + "Version": "1.6.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4f2596dfb05dac67b9dc558e5c6fba2e", + "Requirements": [] + }, + "highr": { + "Package": "highr", + "Version": "0.9", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8eb36c8125038e648e5d111c0d7b2ed4", + "Requirements": [ + "xfun" + ] + }, + "htmltools": { + "Package": "htmltools", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "526c484233f42522278ab06fb185cb26", + "Requirements": [ + "base64enc", + "digest", + "fastmap", + "rlang" + ] + }, + "jquerylib": { + "Package": "jquerylib", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5aab57a3bd297eee1c1d862735972182", + "Requirements": [ + "htmltools" + ] + }, + "jsonlite": { + "Package": "jsonlite", + "Version": "1.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d07e729b27b372429d42d24d503613a0", + "Requirements": [] + }, + "knitr": { + "Package": "knitr", + "Version": "1.37", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "a4ec675eb332a33fe7b7fe26f70e1f98", + "Requirements": [ + "evaluate", + "highr", + "stringr", + "xfun", + "yaml" + ] + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "cdc87ecd81934679d1557633d8e1fe51", + "Requirements": [] + }, + "renv": { + "Package": "renv", + "Version": "0.15.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "206c4ef8b7ad6fb1060d69aa7b9dfe69", + "Requirements": [] + }, + "rlang": { + "Package": "rlang", + "Version": "1.0.1", + "Source": 
"Repository", + "Repository": "CRAN", + "Hash": "3bf0219f19d9f5b3c682acbb3546a151", + "Requirements": [] + }, + "rmarkdown": { + "Package": "rmarkdown", + "Version": "2.11", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "320017b52d05a943981272b295750388", + "Requirements": [ + "evaluate", + "htmltools", + "jquerylib", + "jsonlite", + "knitr", + "stringr", + "tinytex", + "xfun", + "yaml" + ] + }, + "stringi": { + "Package": "stringi", + "Version": "1.7.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "bba431031d30789535745a9627ac9271", + "Requirements": [] + }, + "stringr": { + "Package": "stringr", + "Version": "1.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "0759e6b6c0957edb1311028a49a35e76", + "Requirements": [ + "glue", + "magrittr", + "stringi" + ] + }, + "tinytex": { + "Package": "tinytex", + "Version": "0.37", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "a80abeb527a977e4bef21873d29222dd", + "Requirements": [ + "xfun" + ] + }, + "xfun": { + "Package": "xfun", + "Version": "0.29", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "e2e5fb1a74fbb68b27d6efc5372635dc", + "Requirements": [] + }, + "yaml": { + "Package": "yaml", + "Version": "2.2.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4597f73aad7d32c2913ec33a345f900b", + "Requirements": [] + } + } +} diff --git a/multisite/renv/.gitignore b/multisite/renv/.gitignore new file mode 100644 index 0000000..275e4ca --- /dev/null +++ b/multisite/renv/.gitignore @@ -0,0 +1,6 @@ +library/ +local/ +cellar/ +lock/ +python/ +staging/ diff --git a/multisite/renv/activate.R b/multisite/renv/activate.R new file mode 100644 index 0000000..33833ee --- /dev/null +++ b/multisite/renv/activate.R @@ -0,0 +1,902 @@ + +local({ + + # the requested version of renv + version <- "0.15.2" + + # the project directory + project <- getwd() + + # figure out whether the autoloader is enabled + enabled <- local({ + + # first, check config option + override <- getOption("renv.config.autoloader.enabled") + if (!is.null(override)) + return(override) + + # next, check environment variables + # TODO: prefer using the configuration one in the future + envvars <- c( + "RENV_CONFIG_AUTOLOADER_ENABLED", + "RENV_AUTOLOADER_ENABLED", + "RENV_ACTIVATE_PROJECT" + ) + + for (envvar in envvars) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(tolower(envval) %in% c("true", "t", "1")) + } + + # enable by default + TRUE + + }) + + if (!enabled) + return(FALSE) + + # avoid recursion + if (identical(getOption("renv.autoloader.running"), TRUE)) { + warning("ignoring recursive attempt to run renv autoloader") + return(invisible(TRUE)) + } + + # signal that we're loading renv during R startup + options(renv.autoloader.running = TRUE) + on.exit(options(renv.autoloader.running = NULL), add = TRUE) + + # signal that we've consented to use renv + options(renv.consent = TRUE) + + # load the 'utils' package eagerly -- this ensures that renv shims, which + # mask 'utils' packages, will come first on the search path + library(utils, lib.loc = .Library) + + # check to see if renv has already been loaded + if ("renv" %in% loadedNamespaces()) { + + # if renv has already been loaded, and it's the requested version of renv, + # nothing to do + spec <- .getNamespaceInfo(.getNamespace("renv"), "spec") + if (identical(spec[["version"]], version)) + return(invisible(TRUE)) + + # otherwise, unload and attempt to load the correct version of renv + unloadNamespace("renv") + + } + + # load 
bootstrap tools + `%||%` <- function(x, y) { + if (is.environment(x) || length(x)) x else y + } + + bootstrap <- function(version, library) { + + # attempt to download renv + tarball <- tryCatch(renv_bootstrap_download(version), error = identity) + if (inherits(tarball, "error")) + stop("failed to download renv ", version) + + # now attempt to install + status <- tryCatch(renv_bootstrap_install(version, tarball, library), error = identity) + if (inherits(status, "error")) + stop("failed to install renv ", version) + + } + + renv_bootstrap_tests_running <- function() { + getOption("renv.tests.running", default = FALSE) + } + + renv_bootstrap_repos <- function() { + + # check for repos override + repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA) + if (!is.na(repos)) + return(repos) + + # check for lockfile repositories + repos <- tryCatch(renv_bootstrap_repos_lockfile(), error = identity) + if (!inherits(repos, "error") && length(repos)) + return(repos) + + # if we're testing, re-use the test repositories + if (renv_bootstrap_tests_running()) + return(getOption("renv.tests.repos")) + + # retrieve current repos + repos <- getOption("repos") + + # ensure @CRAN@ entries are resolved + repos[repos == "@CRAN@"] <- getOption( + "renv.repos.cran", + "https://cloud.r-project.org" + ) + + # add in renv.bootstrap.repos if set + default <- c(FALLBACK = "https://cloud.r-project.org") + extra <- getOption("renv.bootstrap.repos", default = default) + repos <- c(repos, extra) + + # remove duplicates that might've snuck in + dupes <- duplicated(repos) | duplicated(names(repos)) + repos[!dupes] + + } + + renv_bootstrap_repos_lockfile <- function() { + + lockpath <- Sys.getenv("RENV_PATHS_LOCKFILE", unset = "renv.lock") + if (!file.exists(lockpath)) + return(NULL) + + lockfile <- tryCatch(renv_json_read(lockpath), error = identity) + if (inherits(lockfile, "error")) { + warning(lockfile) + return(NULL) + } + + repos <- lockfile$R$Repositories + if (length(repos) == 0) + return(NULL) + + keys <- vapply(repos, `[[`, "Name", FUN.VALUE = character(1)) + vals <- vapply(repos, `[[`, "URL", FUN.VALUE = character(1)) + names(vals) <- keys + + return(vals) + + } + + renv_bootstrap_download <- function(version) { + + # if the renv version number has 4 components, assume it must + # be retrieved via github + nv <- numeric_version(version) + components <- unclass(nv)[[1]] + + methods <- if (length(components) == 4L) { + list( + renv_bootstrap_download_github + ) + } else { + list( + renv_bootstrap_download_cran_latest, + renv_bootstrap_download_cran_archive + ) + } + + for (method in methods) { + path <- tryCatch(method(version), error = identity) + if (is.character(path) && file.exists(path)) + return(path) + } + + stop("failed to download renv ", version) + + } + + renv_bootstrap_download_impl <- function(url, destfile) { + + mode <- "wb" + + # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715 + fixup <- + Sys.info()[["sysname"]] == "Windows" && + substring(url, 1L, 5L) == "file:" + + if (fixup) + mode <- "w+b" + + utils::download.file( + url = url, + destfile = destfile, + mode = mode, + quiet = TRUE + ) + + } + + renv_bootstrap_download_cran_latest <- function(version) { + + spec <- renv_bootstrap_download_cran_latest_find(version) + + message("* Downloading renv ", version, " ... 
", appendLF = FALSE) + + type <- spec$type + repos <- spec$repos + + info <- tryCatch( + utils::download.packages( + pkgs = "renv", + destdir = tempdir(), + repos = repos, + type = type, + quiet = TRUE + ), + condition = identity + ) + + if (inherits(info, "condition")) { + message("FAILED") + return(FALSE) + } + + # report success and return + message("OK (downloaded ", type, ")") + info[1, 2] + + } + + renv_bootstrap_download_cran_latest_find <- function(version) { + + # check whether binaries are supported on this system + binary <- + getOption("renv.bootstrap.binary", default = TRUE) && + !identical(.Platform$pkgType, "source") && + !identical(getOption("pkgType"), "source") && + Sys.info()[["sysname"]] %in% c("Darwin", "Windows") + + types <- c(if (binary) "binary", "source") + + # iterate over types + repositories + for (type in types) { + for (repos in renv_bootstrap_repos()) { + + # retrieve package database + db <- tryCatch( + as.data.frame( + utils::available.packages(type = type, repos = repos), + stringsAsFactors = FALSE + ), + error = identity + ) + + if (inherits(db, "error")) + next + + # check for compatible entry + entry <- db[db$Package %in% "renv" & db$Version %in% version, ] + if (nrow(entry) == 0) + next + + # found it; return spec to caller + spec <- list(entry = entry, type = type, repos = repos) + return(spec) + + } + } + + # if we got here, we failed to find renv + fmt <- "renv %s is not available from your declared package repositories" + stop(sprintf(fmt, version)) + + } + + renv_bootstrap_download_cran_archive <- function(version) { + + name <- sprintf("renv_%s.tar.gz", version) + repos <- renv_bootstrap_repos() + urls <- file.path(repos, "src/contrib/Archive/renv", name) + destfile <- file.path(tempdir(), name) + + message("* Downloading renv ", version, " ... ", appendLF = FALSE) + + for (url in urls) { + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (identical(status, 0L)) { + message("OK") + return(destfile) + } + + } + + message("FAILED") + return(FALSE) + + } + + renv_bootstrap_download_github <- function(version) { + + enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") + if (!identical(enabled, "TRUE")) + return(FALSE) + + # prepare download options + pat <- Sys.getenv("GITHUB_PAT") + if (nzchar(Sys.which("curl")) && nzchar(pat)) { + fmt <- "--location --fail --header \"Authorization: token %s\"" + extra <- sprintf(fmt, pat) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "curl", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { + fmt <- "--header=\"Authorization: token %s\"" + extra <- sprintf(fmt, pat) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "wget", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } + + message("* Downloading renv ", version, " from GitHub ... 
", appendLF = FALSE) + + url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version) + name <- sprintf("renv_%s.tar.gz", version) + destfile <- file.path(tempdir(), name) + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (!identical(status, 0L)) { + message("FAILED") + return(FALSE) + } + + message("OK") + return(destfile) + + } + + renv_bootstrap_install <- function(version, tarball, library) { + + # attempt to install it into project library + message("* Installing renv ", version, " ... ", appendLF = FALSE) + dir.create(library, showWarnings = FALSE, recursive = TRUE) + + # invoke using system2 so we can capture and report output + bin <- R.home("bin") + exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" + r <- file.path(bin, exe) + args <- c("--vanilla", "CMD", "INSTALL", "--no-multiarch", "-l", shQuote(library), shQuote(tarball)) + output <- system2(r, args, stdout = TRUE, stderr = TRUE) + message("Done!") + + # check for successful install + status <- attr(output, "status") + if (is.numeric(status) && !identical(status, 0L)) { + header <- "Error installing renv:" + lines <- paste(rep.int("=", nchar(header)), collapse = "") + text <- c(header, lines, output) + writeLines(text, con = stderr()) + } + + status + + } + + renv_bootstrap_platform_prefix <- function() { + + # construct version prefix + version <- paste(R.version$major, R.version$minor, sep = ".") + prefix <- paste("R", numeric_version(version)[1, 1:2], sep = "-") + + # include SVN revision for development versions of R + # (to avoid sharing platform-specific artefacts with released versions of R) + devel <- + identical(R.version[["status"]], "Under development (unstable)") || + identical(R.version[["nickname"]], "Unsuffered Consequences") + + if (devel) + prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") + + # build list of path components + components <- c(prefix, R.version$platform) + + # include prefix if provided by user + prefix <- renv_bootstrap_platform_prefix_impl() + if (!is.na(prefix) && nzchar(prefix)) + components <- c(prefix, components) + + # build prefix + paste(components, collapse = "/") + + } + + renv_bootstrap_platform_prefix_impl <- function() { + + # if an explicit prefix has been supplied, use it + prefix <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA) + if (!is.na(prefix)) + return(prefix) + + # if the user has requested an automatic prefix, generate it + auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) + if (auto %in% c("TRUE", "True", "true", "1")) + return(renv_bootstrap_platform_prefix_auto()) + + # empty string on failure + "" + + } + + renv_bootstrap_platform_prefix_auto <- function() { + + prefix <- tryCatch(renv_bootstrap_platform_os(), error = identity) + if (inherits(prefix, "error") || prefix %in% "unknown") { + + msg <- paste( + "failed to infer current operating system", + "please file a bug report at https://github.com/rstudio/renv/issues", + sep = "; " + ) + + warning(msg) + + } + + prefix + + } + + renv_bootstrap_platform_os <- function() { + + sysinfo <- Sys.info() + sysname <- sysinfo[["sysname"]] + + # handle Windows + macOS up front + if (sysname == "Windows") + return("windows") + else if (sysname == "Darwin") + return("macos") + + # check for os-release files + for (file in c("/etc/os-release", "/usr/lib/os-release")) + if (file.exists(file)) + return(renv_bootstrap_platform_os_via_os_release(file, sysinfo)) + + # check for redhat-release files + if 
(file.exists("/etc/redhat-release")) + return(renv_bootstrap_platform_os_via_redhat_release()) + + "unknown" + + } + + renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) { + + # read /etc/os-release + release <- utils::read.table( + file = file, + sep = "=", + quote = c("\"", "'"), + col.names = c("Key", "Value"), + comment.char = "#", + stringsAsFactors = FALSE + ) + + vars <- as.list(release$Value) + names(vars) <- release$Key + + # get os name + os <- tolower(sysinfo[["sysname"]]) + + # read id + id <- "unknown" + for (field in c("ID", "ID_LIKE")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + id <- vars[[field]] + break + } + } + + # read version + version <- "unknown" + for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + version <- vars[[field]] + break + } + } + + # join together + paste(c(os, id, version), collapse = "-") + + } + + renv_bootstrap_platform_os_via_redhat_release <- function() { + + # read /etc/redhat-release + contents <- readLines("/etc/redhat-release", warn = FALSE) + + # infer id + id <- if (grepl("centos", contents, ignore.case = TRUE)) + "centos" + else if (grepl("redhat", contents, ignore.case = TRUE)) + "redhat" + else + "unknown" + + # try to find a version component (very hacky) + version <- "unknown" + + parts <- strsplit(contents, "[[:space:]]")[[1L]] + for (part in parts) { + + nv <- tryCatch(numeric_version(part), error = identity) + if (inherits(nv, "error")) + next + + version <- nv[1, 1] + break + + } + + paste(c("linux", id, version), collapse = "-") + + } + + renv_bootstrap_library_root_name <- function(project) { + + # use project name as-is if requested + asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE") + if (asis) + return(basename(project)) + + # otherwise, disambiguate based on project's path + id <- substring(renv_bootstrap_hash_text(project), 1L, 8L) + paste(basename(project), id, sep = "-") + + } + + renv_bootstrap_library_root <- function(project) { + + prefix <- renv_bootstrap_profile_prefix() + + path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA) + if (!is.na(path)) + return(paste(c(path, prefix), collapse = "/")) + + path <- renv_bootstrap_library_root_impl(project) + if (!is.null(path)) { + name <- renv_bootstrap_library_root_name(project) + return(paste(c(path, prefix, name), collapse = "/")) + } + + renv_bootstrap_paths_renv("library", project = project) + + } + + renv_bootstrap_library_root_impl <- function(project) { + + root <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA) + if (!is.na(root)) + return(root) + + type <- renv_bootstrap_project_type(project) + if (identical(type, "package")) { + userdir <- renv_bootstrap_user_dir() + return(file.path(userdir, "library")) + } + + } + + renv_bootstrap_validate_version <- function(version) { + + loadedversion <- utils::packageDescription("renv", fields = "Version") + if (version == loadedversion) + return(TRUE) + + # assume four-component versions are from GitHub; three-component + # versions are from CRAN + components <- strsplit(loadedversion, "[.-]")[[1]] + remote <- if (length(components) == 4L) + paste("rstudio/renv", loadedversion, sep = "@") + else + paste("renv", loadedversion, sep = "@") + + fmt <- paste( + "renv %1$s was loaded from project library, but this project is configured to use renv %2$s.", + "Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.", + "Use `renv::restore(packages = \"renv\")` to install renv %2$s 
into the project library.", + sep = "\n" + ) + + msg <- sprintf(fmt, loadedversion, version, remote) + warning(msg, call. = FALSE) + + FALSE + + } + + renv_bootstrap_hash_text <- function(text) { + + hashfile <- tempfile("renv-hash-") + on.exit(unlink(hashfile), add = TRUE) + + writeLines(text, con = hashfile) + tools::md5sum(hashfile) + + } + + renv_bootstrap_load <- function(project, libpath, version) { + + # try to load renv from the project library + if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) + return(FALSE) + + # warn if the version of renv loaded does not match + renv_bootstrap_validate_version(version) + + # load the project + renv::load(project) + + TRUE + + } + + renv_bootstrap_profile_load <- function(project) { + + # if RENV_PROFILE is already set, just use that + profile <- Sys.getenv("RENV_PROFILE", unset = NA) + if (!is.na(profile) && nzchar(profile)) + return(profile) + + # check for a profile file (nothing to do if it doesn't exist) + path <- renv_bootstrap_paths_renv("profile", profile = FALSE) + if (!file.exists(path)) + return(NULL) + + # read the profile, and set it if it exists + contents <- readLines(path, warn = FALSE) + if (length(contents) == 0L) + return(NULL) + + # set RENV_PROFILE + profile <- contents[[1L]] + if (!profile %in% c("", "default")) + Sys.setenv(RENV_PROFILE = profile) + + profile + + } + + renv_bootstrap_profile_prefix <- function() { + profile <- renv_bootstrap_profile_get() + if (!is.null(profile)) + return(file.path("profiles", profile, "renv")) + } + + renv_bootstrap_profile_get <- function() { + profile <- Sys.getenv("RENV_PROFILE", unset = "") + renv_bootstrap_profile_normalize(profile) + } + + renv_bootstrap_profile_set <- function(profile) { + profile <- renv_bootstrap_profile_normalize(profile) + if (is.null(profile)) + Sys.unsetenv("RENV_PROFILE") + else + Sys.setenv(RENV_PROFILE = profile) + } + + renv_bootstrap_profile_normalize <- function(profile) { + + if (is.null(profile) || profile %in% c("", "default")) + return(NULL) + + profile + + } + + renv_bootstrap_path_absolute <- function(path) { + + substr(path, 1L, 1L) %in% c("~", "/", "\\") || ( + substr(path, 1L, 1L) %in% c(letters, LETTERS) && + substr(path, 2L, 3L) %in% c(":/", ":\\") + ) + + } + + renv_bootstrap_paths_renv <- function(..., profile = TRUE, project = NULL) { + renv <- Sys.getenv("RENV_PATHS_RENV", unset = "renv") + root <- if (renv_bootstrap_path_absolute(renv)) NULL else project + prefix <- if (profile) renv_bootstrap_profile_prefix() + components <- c(root, renv, prefix, ...) 
+ paste(components, collapse = "/") + } + + renv_bootstrap_project_type <- function(path) { + + descpath <- file.path(path, "DESCRIPTION") + if (!file.exists(descpath)) + return("unknown") + + desc <- tryCatch( + read.dcf(descpath, all = TRUE), + error = identity + ) + + if (inherits(desc, "error")) + return("unknown") + + type <- desc$Type + if (!is.null(type)) + return(tolower(type)) + + package <- desc$Package + if (!is.null(package)) + return("package") + + "unknown" + + } + + renv_bootstrap_user_dir <- function(path) { + dir <- renv_bootstrap_user_dir_impl(path) + chartr("\\", "/", dir) + } + + renv_bootstrap_user_dir_impl <- function(path) { + + # use R_user_dir if available + tools <- asNamespace("tools") + if (is.function(tools$R_user_dir)) + return(tools$R_user_dir("renv", "cache")) + + # try using our own backfill for older versions of R + envvars <- c("R_USER_CACHE_DIR", "XDG_CACHE_HOME") + for (envvar in envvars) { + root <- Sys.getenv(envvar, unset = NA) + if (!is.na(root)) { + path <- file.path(root, "R/renv") + return(path) + } + } + + # use platform-specific default fallbacks + if (Sys.info()[["sysname"]] == "Windows") + file.path(Sys.getenv("LOCALAPPDATA"), "R/cache/R/renv") + else if (Sys.info()[["sysname"]] == "Darwin") + "~/Library/Caches/org.R-project.R/R/renv" + else + "~/.cache/R/renv" + + } + + renv_json_read <- function(file = NULL, text = NULL) { + + text <- paste(text %||% read(file), collapse = "\n") + + # find strings in the JSON + pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]' + locs <- gregexpr(pattern, text)[[1]] + + # if any are found, replace them with placeholders + replaced <- text + strings <- character() + replacements <- character() + + if (!identical(c(locs), -1L)) { + + # get the string values + starts <- locs + ends <- locs + attr(locs, "match.length") - 1L + strings <- substring(text, starts, ends) + + # only keep those requiring escaping + strings <- grep("[[\\]{}:]", strings, perl = TRUE, value = TRUE) + + # compute replacements + replacements <- sprintf('"\032%i\032"', seq_along(strings)) + + # replace the strings + mapply(function(string, replacement) { + replaced <<- sub(string, replacement, replaced, fixed = TRUE) + }, strings, replacements) + + } + + # transform the JSON into something the R parser understands + transformed <- replaced + transformed <- gsub("[[{]", "list(", transformed) + transformed <- gsub("[]}]", ")", transformed) + transformed <- gsub(":", "=", transformed, fixed = TRUE) + text <- paste(transformed, collapse = "\n") + + # parse it + json <- parse(text = text, keep.source = FALSE, srcfile = NULL)[[1L]] + + # construct map between source strings, replaced strings + map <- as.character(parse(text = strings)) + names(map) <- as.character(parse(text = replacements)) + + # convert to list + map <- as.list(map) + + # remap strings in object + remapped <- renv_json_remap(json, map) + + # evaluate + eval(remapped, envir = baseenv()) + + } + + renv_json_remap <- function(json, map) { + + # fix names + if (!is.null(names(json))) { + lhs <- match(names(json), names(map), nomatch = 0L) + rhs <- match(names(map), names(json), nomatch = 0L) + names(json)[rhs] <- map[lhs] + } + + # fix values + if (is.character(json)) + return(map[[json]] %||% json) + + # handle true, false, null + if (is.name(json)) { + text <- as.character(json) + if (text == "true") + return(TRUE) + else if (text == "false") + return(FALSE) + else if (text == "null") + return(NULL) + } + + # recurse + if (is.recursive(json)) { + for (i in seq_along(json)) { + 
json[i] <- list(renv_json_remap(json[[i]], map)) + } + } + + json + + } + + # load the renv profile, if any + renv_bootstrap_profile_load(project) + + # construct path to library root + root <- renv_bootstrap_library_root(project) + + # construct library prefix for platform + prefix <- renv_bootstrap_platform_prefix() + + # construct full libpath + libpath <- file.path(root, prefix) + + # attempt to load + if (renv_bootstrap_load(project, libpath, version)) + return(TRUE) + + # load failed; inform user we're about to bootstrap + prefix <- paste("# Bootstrapping renv", version) + postfix <- paste(rep.int("-", 77L - nchar(prefix)), collapse = "") + header <- paste(prefix, postfix) + message(header) + + # perform bootstrap + bootstrap(version, libpath) + + # exit early if we're just testing bootstrap + if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) + return(TRUE) + + # try again to load + if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { + message("* Successfully installed and loaded renv ", version, ".") + return(renv::load()) + } + + # failed to download or load renv; warn the user + msg <- c( + "Failed to find an renv installation: the project will not be loaded.", + "Use `renv::activate()` to re-initialize the project." + ) + + warning(paste(msg, collapse = "\n"), call. = FALSE) + +})