"""Nonparametric instrumental variables estimation."""
import warnings
import numpy as np
from .confidence_bands import compute_ucb
from .estimators import npiv_est
from .selection import npiv_choose_j
[docs]
def npiv(
data=None,
yname=None,
xname=None,
wname=None,
y=None,
x=None,
w=None,
x_eval=None,
x_grid=None,
alpha=0.05,
basis="tensor",
biters=99,
j_x_degree=3,
j_x_segments=None,
k_w_degree=4,
k_w_segments=None,
k_w_smooth=2,
knots="uniform",
ucb_h=True,
ucb_deriv=True,
deriv_index=1,
deriv_order=1,
check_is_fullrank=False,
w_min=None,
w_max=None,
x_min=None,
x_max=None,
seed=None,
):
r"""Estimate nonparametric instrumental variables model with uniform confidence bands.
Estimates the structural function :math:`h_0` and its derivatives in the
nonparametric IV model
.. math::
\mathbb{E}[Y - h_0(X) \mid W] = 0 \quad \text{(a.s.)}
where :math:`Y` is a scalar outcome, :math:`X` is a (possibly endogenous)
regressor vector, and :math:`W` is a vector of instrumental variables. The
function is approximated by a B-spline sieve :math:`h_0(x) \approx (\psi^J(x))' c_J`
and coefficients are estimated by two-stage least squares using :math:`K`
B-spline basis functions of :math:`W` as instruments
.. math::
\hat{c}_J = (\boldsymbol{\Psi}_J' \mathbf{P}_K \boldsymbol{\Psi}_J)^{-}
\boldsymbol{\Psi}_J' \mathbf{P}_K \mathbf{Y},
where :math:`\mathbf{P}_K = \mathbf{B}_K (\mathbf{B}_K' \mathbf{B}_K)^{-} \mathbf{B}_K'`
projects onto the instrument space. Function and derivative estimates are then given by
.. math::
\hat{h}_J(x) = (\psi^J(x))' \hat{c}_J, \quad
\partial^a \hat{h}_J(x) = (\partial^a \psi^J(x))' \hat{c}_J.
When ``j_x_segments`` is None, a bootstrap implementation of Lepski's
method selects the sieve dimension :math:`\tilde{J}` that adapts to the
unknown smoothness of :math:`h_0` and instrument strength, achieving the
minimax sup-norm convergence rate for both :math:`h_0` and its derivatives.
The adaptive CCK procedure then constructs honest uniform confidence bands
that guarantee coverage uniformly over a class of data-generating processes.
When a fixed ``j_x_segments`` is supplied, the standard undersmoothing approach
of [1]_ is used instead.
Parameters
----------
data : DataFrame, optional
Input data. Accepts any object implementing the Arrow PyCapsule Interface
(``__arrow_c_stream__``), including polars, pandas, pyarrow Table, and cudf
DataFrames. When provided, ``yname``, ``xname``, and ``wname`` are required.
yname : str, optional
Name of the outcome column in ``data``.
xname : str or list of str, optional
Name(s) of the endogenous regressor column(s) in ``data``.
wname : str or list of str, optional
Name(s) of the instrumental variable column(s) in ``data``.
y : ndarray of shape (n,), optional
Outcome variable. Required when ``data`` is not provided.
x : ndarray of shape (n,) or (n, p_x), optional
Endogenous regressors. Automatically promoted to 2-d if needed.
Required when ``data`` is not provided.
w : ndarray of shape (n,) or (n, p_w), optional
Instrumental variables. Requires :math:`K \geq J`.
Required when ``data`` is not provided.
x_eval : ndarray of shape (m, p_x), optional
Points at which to evaluate :math:`\hat{h}` and its derivatives. If
None, evaluates at the sample points ``x``.
x_grid : ndarray, optional
Alias for ``x_eval``. Ignored when ``x_eval`` is provided.
alpha : float, default=0.05
Significance level for :math:`100(1-\alpha)\%` confidence bands.
basis : {"tensor", "additive", "glp"}, default="tensor"
Multivariate basis construction for :math:`X`:
- ``"tensor"``: Full tensor product of univariate B-splines.
- ``"additive"``: Sum of univariate B-splines (additive model).
- ``"glp"``: Generalized linear product (hierarchical interactions).
biters : int, default=99
Number of multiplier bootstrap draws for critical value computation.
Each draw generates i.i.d. :math:`N(0,1)` weights
:math:`(\varpi_i)_{i=1}^n` to form bootstrap sup-:math:`t` statistics.
j_x_degree : int, default=3
Degree of B-spline basis for :math:`X` (order
:math:`r = \text{degree} + 1`). For UCBs of first derivatives, degree
:math:`\geq 2` is required; for second derivatives, :math:`\geq 3`.
j_x_segments : int, optional
Number of segments for the :math:`X` basis, determining sieve dimension
:math:`J`. When None, the data-driven Lepski procedure selects
:math:`\tilde{J}` adaptively. Supplying a fixed value triggers the
undersmoothing UCB approach.
k_w_degree : int, default=4
Degree of B-spline basis for :math:`W`. Defaults to
``j_x_degree + 1`` because the reduced form
:math:`\mathbb{E}[h_0(X) \mid W]` is smoother than :math:`h_0`.
k_w_segments : int, optional
Number of segments for the instrument basis. When None, chosen
proportionally to ``j_x_segments`` via the resolution-level mapping
:math:`l_w = \lceil (l + q) \, d / d_w \rceil`, where :math:`q` is controlled by ``k_w_smooth``.
k_w_smooth : int, default=2
Controls the resolution gap :math:`q` between the :math:`X` and
:math:`W` bases in the data-driven procedure. Larger values yield more
instrument basis functions relative to the :math:`X` basis.
knots : {"uniform", "quantiles"}, default="uniform"
Knot placement strategy:
- ``"uniform"``: Equally spaced knots on the support.
- ``"quantiles"``: Knots at empirical quantiles of the data.
ucb_h : bool, default=True
Compute uniform confidence bands for :math:`\hat{h}`.
ucb_deriv : bool, default=True
Compute uniform confidence bands for :math:`\partial^a \hat{h}`.
deriv_index : int, default=1
Which component of :math:`X` to differentiate with respect to
(1-based indexing).
deriv_order : int, default=1
Order :math:`|a|` of the derivative (1 = first, 2 = second, etc.).
check_is_fullrank : bool, default=False
Verify that the basis matrices :math:`\boldsymbol{\Psi}_J` and
:math:`\mathbf{B}_K` have full column rank before estimation.
w_min, w_max : float, optional
Override the support bounds for :math:`W`. Defaults to data range.
x_min, x_max : float, optional
Override the support bounds for :math:`X`. Defaults to data range.
seed : int, optional
Random seed for bootstrap reproducibility.
Returns
-------
NPIVResult
Named tuple with the following fields:
- **h** -- Estimated :math:`\hat{h}_J(x)` at evaluation points.
- **deriv** -- Estimated :math:`\partial^a \hat{h}_J(x)`.
- **h_lower**, **h_upper** -- Lower/upper UCB for :math:`h_0`.
- **h_lower_deriv**, **h_upper_deriv** -- Lower/upper UCB for
:math:`\partial^a h_0`.
- **beta** -- Sieve coefficient vector :math:`\hat{c}_J`.
- **asy_se** -- Pointwise asymptotic standard errors
:math:`\hat{\sigma}_J(x)`.
- **deriv_asy_se** -- Pointwise asymptotic standard errors
:math:`\hat{\sigma}_J^a(x)` for derivatives.
- **cv**, **cv_deriv** -- Bootstrap critical values
:math:`z_{1-\alpha}^*` used for band construction.
- **residuals** -- TSLS residuals
:math:`\hat{u}_{i,J} = Y_i - \hat{h}_J(X_i)`.
- **j_x_degree**, **j_x_segments** -- Basis parameters for :math:`X`
(segments may differ from input when data-driven).
- **k_w_degree**, **k_w_segments** -- Basis parameters for :math:`W`.
- **args** -- Diagnostic dictionary. When data-driven selection is
used, includes ``j_x_seg``, ``k_w_seg``, ``j_hat_max``,
``theta_star``, and other selection diagnostics.
Examples
--------
The Engel dataset contains household expenditure shares and income measures
for 1655 households. We estimate a nonparametric Engel curve relating food
share (``food``) to log-expenditure (``logexp``), using log-wages
(``logwages``) as an instrument for potentially endogenous expenditure:
.. ipython::
:okwarning:
In [1]: import numpy as np
...: from moderndid import npiv, load_engel
...:
...: df = load_engel()
...: df.head()
Estimate the structural function with 5 B-spline segments and 95% uniform
confidence bands. The output is an ``NPIVResult`` containing the estimated
function, confidence bands, derivatives, and diagnostics:
.. ipython::
:okwarning:
In [2]: result = npiv(
...: data=df,
...: yname="food",
...: xname="logexp",
...: wname="logwages",
...: j_x_segments=5,
...: biters=500,
...: seed=42,
...: )
...: print(f"Estimates at {len(result.h)} points")
...: print(f"h[:5] = {result.h[:5]}")
...: print(f"95% UCB critical value: {result.cv:.3f}")
...: print(f"Basis: degree={result.j_x_degree}, segments={result.j_x_segments}")
The derivative of the Engel curve (the marginal propensity to spend on food)
is estimated simultaneously:
.. ipython::
In [3]: print(f"deriv[:5] = {result.deriv[:5]}")
...: print(f"Derivative UCB critical value: {result.cv_deriv:.3f}")
You can also pass numpy arrays directly instead of a DataFrame:
.. ipython::
:okwarning:
In [4]: rng = np.random.default_rng(0)
...: n = 200
...: w = rng.uniform(0, 1, (n, 1))
...: x = w + 0.2 * rng.normal(0, 1, (n, 1))
...: y = np.sin(2 * np.pi * x).ravel() + 0.1 * rng.normal(0, 1, n)
...: result = npiv(y=y, x=x, w=w, j_x_segments=4, biters=500, seed=0)
...: result.h.shape
See Also
--------
npiv_est : Core sieve TSLS estimation (no confidence bands).
compute_ucb : Multiplier bootstrap confidence band construction.
npiv_choose_j : Data-driven sieve dimension selection.
References
----------
.. [1] Chen, X., & Christensen, T. M. (2018). Optimal sup-norm rates and
uniform inference on nonlinear functionals of nonparametric IV
regression. *Quantitative Economics*, 9(1), 39-84.
.. [2] Chen, X., Christensen, T. M., & Kankanala, S. (2024). Adaptive
estimation and uniform confidence bands for nonparametric structural
functions and elasticities. *Review of Economic Studies*.
https://arxiv.org/abs/2107.11869.
.. [3] Newey, W. K., & Powell, J. L. (2003). Instrumental variable
estimation of nonparametric models. *Econometrica*, 71(5), 1565-1578.
"""
if data is not None:
if y is not None or x is not None or w is not None:
raise ValueError("Cannot specify both 'data' and array arguments (y, x, w)")
if yname is None or xname is None or wname is None:
raise ValueError("When 'data' is provided, 'yname', 'xname', and 'wname' are required")
from moderndid.core.dataframe import to_polars
df = to_polars(data)
y = df[yname].to_numpy()
if isinstance(xname, str):
xname = [xname]
x = df.select(xname).to_numpy()
if isinstance(wname, str):
wname = [wname]
w = df.select(wname).to_numpy()
elif y is None or x is None or w is None:
raise ValueError("Must provide either 'data' with column names, or array arguments (y, x, w)")
y = np.asarray(y)
x = np.asarray(x)
w = np.asarray(w)
if y.ndim > 1:
y = y.ravel()
if len(y) != y.size:
raise ValueError("y must be a 1-dimensional array")
x = np.atleast_2d(x)
w = np.atleast_2d(w)
n = len(y)
if x.shape[0] != n or w.shape[0] != n:
raise ValueError("All input arrays must have the same number of observations")
p_x = x.shape[1]
if x_eval is None and x_grid is not None:
warnings.warn("Using x_grid as x_eval", UserWarning)
x_eval = x_grid
if x_eval is not None:
x_eval = np.atleast_2d(x_eval)
if x_eval.shape[1] != p_x:
raise ValueError("x_eval must have same number of columns as x")
if alpha <= 0 or alpha >= 1:
raise ValueError("alpha must be between 0 and 1")
if biters < 1:
raise ValueError("biters must be positive")
if j_x_degree < 0:
raise ValueError("j_x_degree must be non-negative")
if k_w_degree < 0:
raise ValueError("k_w_degree must be non-negative")
if deriv_order < 0:
raise ValueError("deriv_order must be non-negative")
if deriv_index < 1 or deriv_index > p_x:
raise ValueError(f"deriv_index must be between 1 and {p_x}")
if basis not in ("tensor", "additive", "glp"):
raise ValueError("basis must be one of: 'tensor', 'additive', 'glp'")
if knots not in ("uniform", "quantiles"):
raise ValueError("knots must be 'uniform' or 'quantiles'")
if n < 50:
warnings.warn(f"Small sample size (n={n}) may lead to unreliable results", UserWarning)
if 0 < j_x_degree < deriv_order:
warnings.warn(
f"deriv_order ({deriv_order}) > j_x_degree ({j_x_degree}), derivative will be zero everywhere",
UserWarning,
)
data_driven = j_x_segments is None
selection_result = None
if data_driven:
try:
selection_result = npiv_choose_j(
y=y,
x=x,
w=w,
x_grid=x_grid,
j_x_degree=j_x_degree,
k_w_degree=k_w_degree,
k_w_smooth=k_w_smooth,
knots=knots,
basis=basis,
x_min=x_min,
x_max=x_max,
w_min=w_min,
w_max=w_max,
grid_num=50,
biters=biters if biters > 0 else 99,
check_is_fullrank=check_is_fullrank,
seed=seed,
)
j_x_segments = selection_result["j_x_seg"]
k_w_segments = selection_result["k_w_seg"]
except (ValueError, RuntimeError, np.linalg.LinAlgError) as e:
warnings.warn(
f"Data-driven selection failed: {e}. Using default values.",
UserWarning,
)
j_x_segments = max(3, min(int(np.ceil(n ** (1 / (2 * j_x_degree + p_x)))), 10))
k_w_segments = None
args = {"data_driven": data_driven}
if selection_result:
args.update(selection_result)
if ucb_h or ucb_deriv:
result = compute_ucb(
y=y,
x=x,
w=w,
x_eval=x_eval,
alpha=alpha,
biters=biters,
basis=basis,
j_x_degree=j_x_degree,
j_x_segments=j_x_segments,
k_w_degree=k_w_degree,
k_w_segments=k_w_segments,
knots=knots,
ucb_h=ucb_h,
ucb_deriv=ucb_deriv,
deriv_index=deriv_index,
deriv_order=deriv_order,
w_min=w_min,
w_max=w_max,
x_min=x_min,
x_max=x_max,
seed=seed,
selection_result=selection_result,
)
else:
result = npiv_est(
y=y,
x=x,
w=w,
x_eval=x_eval,
basis=basis,
j_x_degree=j_x_degree,
j_x_segments=j_x_segments,
k_w_degree=k_w_degree,
k_w_segments=k_w_segments,
knots=knots,
deriv_index=deriv_index,
deriv_order=deriv_order,
check_is_fullrank=check_is_fullrank,
w_min=w_min,
w_max=w_max,
x_min=x_min,
x_max=x_max,
)
if selection_result:
result.args.update(selection_result)
result.args["data_driven"] = True
return result