"""Wrapper for doubly robust DiD estimators."""
from moderndid.core.preprocess import preprocess_drdid
from .container import DRDIDResult
from .estimators.drdid_imp_local_rc import drdid_imp_local_rc
from .estimators.drdid_imp_panel import drdid_imp_panel
from .estimators.drdid_imp_rc import drdid_imp_rc
from .estimators.drdid_panel import drdid_panel
from .estimators.drdid_rc import drdid_rc
from .estimators.drdid_trad_rc import drdid_trad_rc
[docs]
def drdid(
data,
yname,
tname,
idname=None,
treatname=None,
xformla=None,
panel=True,
est_method="imp",
weightsname=None,
boot=False,
boot_type="weighted",
n_boot=999,
inf_func=False,
trim_level=0.995,
):
r"""Compute the locally efficient doubly robust DiD estimator for the ATT.
Implements doubly robust difference-in-differences estimation following [2]_.
These estimators combine inverse probability weighting and outcome regression,
providing consistency if either the propensity score model or the outcome
regression model is correctly specified, but not necessarily both.
The parameter of interest is the average treatment effect on the treated (ATT)
in a two-period setting
.. math::
\tau = \mathbb{E}[Y_1(1) - Y_1(0) \mid D = 1].
Identification relies on a conditional parallel trends assumption requiring
that treated and comparison groups would have evolved similarly absent
treatment, conditional on covariates :math:`X`
.. math::
\mathbb{E}[Y_1(0) - Y_0(0) \mid D=1, X] = \mathbb{E}[Y_1(0) - Y_0(0) \mid D=0, X].
For panel data, the doubly robust estimand combines outcome regression
:math:`\mu_{0,\Delta}(X) = \mathbb{E}[\Delta Y \mid D=0, X]` with propensity
score weighting :math:`\pi(X) = \mathbb{P}(D=1 \mid X)`
.. math::
\tau^{dr} = \mathbb{E}\left[\left(\frac{D}{\mathbb{E}[D]}
- \frac{\pi(X)(1-D)/(1-\pi(X))}
{\mathbb{E}[\pi(X)(1-D)/(1-\pi(X))]}\right)
(\Delta Y - \mu_{0,\Delta}(X))\right].
Parameters
----------
data : pd.DataFrame | pl.DataFrame
The input data containing outcome, time, unit ID, treatment,
and optionally covariates and weights. Accepts both pandas and polars DataFrames.
yname : str
Name of the column containing the outcome variable.
tname : str
Name of the column containing the time periods (must have exactly 2 periods).
idname : str | None, default None
Name of the column containing the unit ID. Required if panel=True.
treatname : str
Name of the column containing the treatment group indicator.
For panel data: time-invariant indicator (1 if ever treated, 0 if never treated).
For repeated cross-sections: treatment status in the post-period.
xformla : str | None, default None
A formula for the covariates to include in the model.
Should be of the form "~ X1 + X2" (intercept is always included).
If None, equivalent to "~ 1" (intercept only).
panel : bool, default True
Whether the data is panel (True) or repeated cross-sections (False).
Panel data should be in long format with each row representing
a unit-time observation.
est_method : {"imp", "trad", "imp_local", "trad_local"}, default "imp"
The method to estimate the nuisance parameters.
- "imp": Uses weighted least squares to estimate outcome regressions and
inverse probability tilting to estimate the propensity score, leading to
the improved locally efficient DR DiD estimator. For panel data, this
corresponds to equation (3.1) in Sant'Anna and Zhao (2020). For repeated
cross-sections, this uses a single propensity score model.
- "trad": Uses OLS to estimate outcome regressions and maximum likelihood
to estimate propensity score, leading to the "traditional" locally
efficient DR DiD estimator.
- "imp_local": For repeated cross-sections only. Implements the locally
efficient estimator from equation (3.4) in Sant'Anna and Zhao (2020)
with separate outcome regressions for each group and time period.
- "trad_local": For repeated cross-sections only. Traditional DR DiD
estimator from equation (3.3) in Sant'Anna and Zhao (2020) that is
not locally efficient.
weightsname : str | None, default None
Name of the column containing sampling weights.
If None, all observations have equal weight.
Weights are normalized to have mean 1.
boot : bool, default False
Whether to compute bootstrap standard errors. If False,
analytical standard errors are reported.
boot_type : {"weighted", "multiplier"}, default "weighted"
Type of bootstrap to perform (only relevant if boot=True).
n_boot : int, default 999
Number of bootstrap repetitions (only relevant if boot=True).
inf_func : bool, default False
Whether to return the influence function values.
trim_level : float, default 0.995
The level of trimming for the propensity score.
Returns
-------
DRDIDResult
NamedTuple containing:
- **att**: The DR DiD point estimate.
- **se**: The DR DiD standard error.
- **uci**: The upper bound of a 95% confidence interval.
- **lci**: The lower bound of a 95% confidence interval.
- **boots**: Bootstrap draws of the ATT if boot=True.
- **att_inf_func**: Influence function values if inf_func=True.
- **call_params**: Original function call parameters.
- **args**: Arguments used in the estimation.
Examples
--------
Estimate the average treatment effect on the treated (ATT) using panel data from a job training
program. The data tracks the same individuals over time, before and after
some received training.
.. ipython::
:okwarning:
In [1]: import moderndid
...: from moderndid import load_nsw
...:
...: nsw_data = load_nsw()
...:
...: att_result = moderndid.drdid(
...: data=nsw_data,
...: yname="re",
...: tname="year",
...: idname="id",
...: treatname="experimental",
...: xformla="~ age + educ + black + married + nodegree + hisp + re74",
...: panel=True,
...: est_method="imp",
...: )
In [2]: print(att_result)
Notes
-----
When panel data are available (`panel=True`), the function implements the
locally efficient doubly robust DiD estimator for the ATT defined in
equation (3.1) in [2]_. This estimator makes use of a logistic propensity score
model for the probability of being in the treated group, and of a linear regression
model for the outcome evolution among the comparison units.
When only stationary repeated cross-section data are available (`panel=False`),
the function implements the locally efficient doubly robust DiD estimator
for the ATT defined in equation (3.4) in [2]_. This estimator makes use of a
logistic propensity score model for the probability of being in the treated group,
and of (separate) linear regression models for the outcome of both treated and
comparison units, in both pre and post-treatment periods.
When `est_method="imp"` (the default), the nuisance parameters are estimated
using the methods described in Sections 3.1 and 3.2 of [2]_.
The propensity score parameters are estimated using the inverse probability
tilting estimator proposed by [1]_, and the outcome regression coefficients are
estimated using weighted least squares.
When `est_method="trad"`, the propensity score parameters are estimated using
maximum likelihood, and the outcome regression coefficients are estimated
using ordinary least squares.
The main advantage of using `est_method="imp"` is that the resulting estimator
is not only locally efficient and doubly robust for the ATT, but it is also
doubly robust for inference; see [2]_ for details.
See Also
--------
ipwdid : Inverse propensity weighted DiD estimator.
ordid : Outcome regression DiD estimator.
References
----------
.. [1] Graham, B., Pinto, C., and Egel, D. (2012),
"Inverse Probability Tilting for Moment Condition Models with Missing Data."
Review of Economic Studies, vol. 79 (3), pp. 1053-1079.
https://doi.org/10.1093/restud/rdr047
.. [2] Sant'Anna, P. H. C. and Zhao, J. (2020),
"Doubly Robust Difference-in-Differences Estimators."
Journal of Econometrics, Vol. 219 (1), pp. 101-122.
https://doi.org/10.1016/j.jeconom.2020.06.003
"""
if treatname is None:
raise ValueError("treatname is required. Please specify the treatment column.")
if panel and idname is None:
raise ValueError("idname must be provided when panel=True")
valid_panel_methods = ("imp", "trad")
valid_rc_methods = ("imp", "trad", "imp_local", "trad_local")
if panel:
if est_method not in valid_panel_methods:
if est_method in valid_rc_methods:
raise ValueError(
f"est_method='{est_method}' is only available for repeated cross-sections "
f"(panel=False). For panel data, use 'imp' or 'trad'."
)
raise ValueError(f"est_method='{est_method}' is not valid. For panel data, must be 'imp' or 'trad'.")
else:
if est_method not in valid_rc_methods:
raise ValueError(
f"est_method='{est_method}' is not valid. For repeated cross-sections, "
f"must be one of: 'imp', 'trad', 'imp_local', 'trad_local'."
)
if not 0 < trim_level < 1:
raise ValueError(f"trim_level={trim_level} is not valid. Must be between 0 and 1 (exclusive).")
if boot_type not in ("weighted", "multiplier"):
raise ValueError(f"boot_type='{boot_type}' is not valid. Must be 'weighted' or 'multiplier'.")
if not isinstance(n_boot, int) or n_boot < 1:
raise ValueError(f"n_boot={n_boot} is not valid. Must be a positive integer.")
call_params = {
"yname": yname,
"tname": tname,
"idname": idname,
"treatname": treatname,
"xformla": xformla,
"data_shape": data.shape,
"panel": panel,
"est_method": est_method,
"weightsname": weightsname,
"boot": boot,
"boot_type": boot_type,
"n_boot": n_boot,
"inf_func": inf_func,
"trim_level": trim_level,
}
dp = preprocess_drdid(
data=data,
yname=yname,
tname=tname,
treat_col=treatname,
idname=idname if panel else None,
xformla=xformla,
panel=panel,
weightsname=weightsname,
boot=boot,
boot_type=boot_type,
biters=n_boot,
inf_func=inf_func,
est_method=est_method,
trim_level=trim_level,
)
if panel:
if est_method == "imp":
result = drdid_imp_panel(
y1=dp.y1,
y0=dp.y0,
d=dp.D,
covariates=dp.covariates,
i_weights=dp.weights,
boot=boot,
boot_type=boot_type,
nboot=n_boot,
influence_func=inf_func,
trim_level=trim_level,
)
else:
result = drdid_panel(
y1=dp.y1,
y0=dp.y0,
d=dp.D,
covariates=dp.covariates,
i_weights=dp.weights,
boot=boot,
boot_type=boot_type,
nboot=n_boot,
influence_func=inf_func,
trim_level=trim_level,
)
else:
if est_method == "imp":
result = drdid_imp_rc(
y=dp.y,
post=dp.post,
d=dp.D,
covariates=dp.covariates,
i_weights=dp.weights,
boot=boot,
boot_type=boot_type,
nboot=n_boot,
influence_func=inf_func,
trim_level=trim_level,
)
elif est_method == "trad":
result = drdid_rc(
y=dp.y,
post=dp.post,
d=dp.D,
covariates=dp.covariates,
i_weights=dp.weights,
boot=boot,
boot_type=boot_type,
nboot=n_boot,
influence_func=inf_func,
trim_level=trim_level,
)
elif est_method == "imp_local":
result = drdid_imp_local_rc(
y=dp.y,
post=dp.post,
d=dp.D,
covariates=dp.covariates,
i_weights=dp.weights,
boot=boot,
boot_type=boot_type,
nboot=n_boot,
influence_func=inf_func,
trim_level=trim_level,
)
else:
result = drdid_trad_rc(
y=dp.y,
post=dp.post,
d=dp.D,
covariates=dp.covariates,
i_weights=dp.weights,
boot=boot,
boot_type=boot_type,
nboot=n_boot,
influence_func=inf_func,
trim_level=trim_level,
)
args = result.args.copy()
args.update(
{
"panel": panel,
"estMethod": est_method,
"normalized": True,
"boot": boot,
"boot_type": boot_type,
"nboot": n_boot,
"type": "dr",
"trim_level": trim_level,
}
)
return DRDIDResult(
att=result.att,
se=result.se,
uci=result.uci,
lci=result.lci,
boots=result.boots,
att_inf_func=result.att_inf_func,
call_params=call_params,
args=args,
)