Source code for moderndid.npiv.npiv

"""Nonparametric instrumental variables estimation."""

import warnings

import numpy as np

from .confidence_bands import compute_ucb
from .estimators import npiv_est
from .selection import npiv_choose_j


[docs] def npiv( data=None, yname=None, xname=None, wname=None, y=None, x=None, w=None, x_eval=None, x_grid=None, alpha=0.05, basis="tensor", biters=99, j_x_degree=3, j_x_segments=None, k_w_degree=4, k_w_segments=None, k_w_smooth=2, knots="uniform", ucb_h=True, ucb_deriv=True, deriv_index=1, deriv_order=1, check_is_fullrank=False, w_min=None, w_max=None, x_min=None, x_max=None, seed=None, ): r"""Estimate nonparametric instrumental variables model with uniform confidence bands. Estimates the structural function :math:`h_0` and its derivatives in the nonparametric IV model .. math:: \mathbb{E}[Y - h_0(X) \mid W] = 0 \quad \text{(a.s.)} where :math:`Y` is a scalar outcome, :math:`X` is a (possibly endogenous) regressor vector, and :math:`W` is a vector of instrumental variables. The function is approximated by a B-spline sieve :math:`h_0(x) \approx (\psi^J(x))' c_J` and coefficients are estimated by two-stage least squares using :math:`K` B-spline basis functions of :math:`W` as instruments .. math:: \hat{c}_J = (\boldsymbol{\Psi}_J' \mathbf{P}_K \boldsymbol{\Psi}_J)^{-} \boldsymbol{\Psi}_J' \mathbf{P}_K \mathbf{Y}, where :math:`\mathbf{P}_K = \mathbf{B}_K (\mathbf{B}_K' \mathbf{B}_K)^{-} \mathbf{B}_K'` projects onto the instrument space. Function and derivative estimates are then given by .. math:: \hat{h}_J(x) = (\psi^J(x))' \hat{c}_J, \quad \partial^a \hat{h}_J(x) = (\partial^a \psi^J(x))' \hat{c}_J. When ``j_x_segments`` is None, a bootstrap implementation of Lepski's method selects the sieve dimension :math:`\tilde{J}` that adapts to the unknown smoothness of :math:`h_0` and instrument strength, achieving the minimax sup-norm convergence rate for both :math:`h_0` and its derivatives. The adaptive CCK procedure then constructs honest uniform confidence bands that guarantee coverage uniformly over a class of data-generating processes. When a fixed ``j_x_segments`` is supplied, the standard undersmoothing approach of [1]_ is used instead. Parameters ---------- data : DataFrame, optional Input data. Accepts any object implementing the Arrow PyCapsule Interface (``__arrow_c_stream__``), including polars, pandas, pyarrow Table, and cudf DataFrames. When provided, ``yname``, ``xname``, and ``wname`` are required. yname : str, optional Name of the outcome column in ``data``. xname : str or list of str, optional Name(s) of the endogenous regressor column(s) in ``data``. wname : str or list of str, optional Name(s) of the instrumental variable column(s) in ``data``. y : ndarray of shape (n,), optional Outcome variable. Required when ``data`` is not provided. x : ndarray of shape (n,) or (n, p_x), optional Endogenous regressors. Automatically promoted to 2-d if needed. Required when ``data`` is not provided. w : ndarray of shape (n,) or (n, p_w), optional Instrumental variables. Requires :math:`K \geq J`. Required when ``data`` is not provided. x_eval : ndarray of shape (m, p_x), optional Points at which to evaluate :math:`\hat{h}` and its derivatives. If None, evaluates at the sample points ``x``. x_grid : ndarray, optional Alias for ``x_eval``. Ignored when ``x_eval`` is provided. alpha : float, default=0.05 Significance level for :math:`100(1-\alpha)\%` confidence bands. basis : {"tensor", "additive", "glp"}, default="tensor" Multivariate basis construction for :math:`X`: - ``"tensor"``: Full tensor product of univariate B-splines. - ``"additive"``: Sum of univariate B-splines (additive model). - ``"glp"``: Generalized linear product (hierarchical interactions). biters : int, default=99 Number of multiplier bootstrap draws for critical value computation. Each draw generates i.i.d. :math:`N(0,1)` weights :math:`(\varpi_i)_{i=1}^n` to form bootstrap sup-:math:`t` statistics. j_x_degree : int, default=3 Degree of B-spline basis for :math:`X` (order :math:`r = \text{degree} + 1`). For UCBs of first derivatives, degree :math:`\geq 2` is required; for second derivatives, :math:`\geq 3`. j_x_segments : int, optional Number of segments for the :math:`X` basis, determining sieve dimension :math:`J`. When None, the data-driven Lepski procedure selects :math:`\tilde{J}` adaptively. Supplying a fixed value triggers the undersmoothing UCB approach. k_w_degree : int, default=4 Degree of B-spline basis for :math:`W`. Defaults to ``j_x_degree + 1`` because the reduced form :math:`\mathbb{E}[h_0(X) \mid W]` is smoother than :math:`h_0`. k_w_segments : int, optional Number of segments for the instrument basis. When None, chosen proportionally to ``j_x_segments`` via the resolution-level mapping :math:`l_w = \lceil (l + q) \, d / d_w \rceil`, where :math:`q` is controlled by ``k_w_smooth``. k_w_smooth : int, default=2 Controls the resolution gap :math:`q` between the :math:`X` and :math:`W` bases in the data-driven procedure. Larger values yield more instrument basis functions relative to the :math:`X` basis. knots : {"uniform", "quantiles"}, default="uniform" Knot placement strategy: - ``"uniform"``: Equally spaced knots on the support. - ``"quantiles"``: Knots at empirical quantiles of the data. ucb_h : bool, default=True Compute uniform confidence bands for :math:`\hat{h}`. ucb_deriv : bool, default=True Compute uniform confidence bands for :math:`\partial^a \hat{h}`. deriv_index : int, default=1 Which component of :math:`X` to differentiate with respect to (1-based indexing). deriv_order : int, default=1 Order :math:`|a|` of the derivative (1 = first, 2 = second, etc.). check_is_fullrank : bool, default=False Verify that the basis matrices :math:`\boldsymbol{\Psi}_J` and :math:`\mathbf{B}_K` have full column rank before estimation. w_min, w_max : float, optional Override the support bounds for :math:`W`. Defaults to data range. x_min, x_max : float, optional Override the support bounds for :math:`X`. Defaults to data range. seed : int, optional Random seed for bootstrap reproducibility. Returns ------- NPIVResult Named tuple with the following fields: - **h** -- Estimated :math:`\hat{h}_J(x)` at evaluation points. - **deriv** -- Estimated :math:`\partial^a \hat{h}_J(x)`. - **h_lower**, **h_upper** -- Lower/upper UCB for :math:`h_0`. - **h_lower_deriv**, **h_upper_deriv** -- Lower/upper UCB for :math:`\partial^a h_0`. - **beta** -- Sieve coefficient vector :math:`\hat{c}_J`. - **asy_se** -- Pointwise asymptotic standard errors :math:`\hat{\sigma}_J(x)`. - **deriv_asy_se** -- Pointwise asymptotic standard errors :math:`\hat{\sigma}_J^a(x)` for derivatives. - **cv**, **cv_deriv** -- Bootstrap critical values :math:`z_{1-\alpha}^*` used for band construction. - **residuals** -- TSLS residuals :math:`\hat{u}_{i,J} = Y_i - \hat{h}_J(X_i)`. - **j_x_degree**, **j_x_segments** -- Basis parameters for :math:`X` (segments may differ from input when data-driven). - **k_w_degree**, **k_w_segments** -- Basis parameters for :math:`W`. - **args** -- Diagnostic dictionary. When data-driven selection is used, includes ``j_x_seg``, ``k_w_seg``, ``j_hat_max``, ``theta_star``, and other selection diagnostics. Examples -------- The Engel dataset contains household expenditure shares and income measures for 1655 households. We estimate a nonparametric Engel curve relating food share (``food``) to log-expenditure (``logexp``), using log-wages (``logwages``) as an instrument for potentially endogenous expenditure: .. ipython:: :okwarning: In [1]: import numpy as np ...: from moderndid import npiv, load_engel ...: ...: df = load_engel() ...: df.head() Estimate the structural function with 5 B-spline segments and 95% uniform confidence bands. The output is an ``NPIVResult`` containing the estimated function, confidence bands, derivatives, and diagnostics: .. ipython:: :okwarning: In [2]: result = npiv( ...: data=df, ...: yname="food", ...: xname="logexp", ...: wname="logwages", ...: j_x_segments=5, ...: biters=500, ...: seed=42, ...: ) ...: print(f"Estimates at {len(result.h)} points") ...: print(f"h[:5] = {result.h[:5]}") ...: print(f"95% UCB critical value: {result.cv:.3f}") ...: print(f"Basis: degree={result.j_x_degree}, segments={result.j_x_segments}") The derivative of the Engel curve (the marginal propensity to spend on food) is estimated simultaneously: .. ipython:: In [3]: print(f"deriv[:5] = {result.deriv[:5]}") ...: print(f"Derivative UCB critical value: {result.cv_deriv:.3f}") You can also pass numpy arrays directly instead of a DataFrame: .. ipython:: :okwarning: In [4]: rng = np.random.default_rng(0) ...: n = 200 ...: w = rng.uniform(0, 1, (n, 1)) ...: x = w + 0.2 * rng.normal(0, 1, (n, 1)) ...: y = np.sin(2 * np.pi * x).ravel() + 0.1 * rng.normal(0, 1, n) ...: result = npiv(y=y, x=x, w=w, j_x_segments=4, biters=500, seed=0) ...: result.h.shape See Also -------- npiv_est : Core sieve TSLS estimation (no confidence bands). compute_ucb : Multiplier bootstrap confidence band construction. npiv_choose_j : Data-driven sieve dimension selection. References ---------- .. [1] Chen, X., & Christensen, T. M. (2018). Optimal sup-norm rates and uniform inference on nonlinear functionals of nonparametric IV regression. *Quantitative Economics*, 9(1), 39-84. .. [2] Chen, X., Christensen, T. M., & Kankanala, S. (2024). Adaptive estimation and uniform confidence bands for nonparametric structural functions and elasticities. *Review of Economic Studies*. https://arxiv.org/abs/2107.11869. .. [3] Newey, W. K., & Powell, J. L. (2003). Instrumental variable estimation of nonparametric models. *Econometrica*, 71(5), 1565-1578. """ if data is not None: if y is not None or x is not None or w is not None: raise ValueError("Cannot specify both 'data' and array arguments (y, x, w)") if yname is None or xname is None or wname is None: raise ValueError("When 'data' is provided, 'yname', 'xname', and 'wname' are required") from moderndid.core.dataframe import to_polars df = to_polars(data) y = df[yname].to_numpy() if isinstance(xname, str): xname = [xname] x = df.select(xname).to_numpy() if isinstance(wname, str): wname = [wname] w = df.select(wname).to_numpy() elif y is None or x is None or w is None: raise ValueError("Must provide either 'data' with column names, or array arguments (y, x, w)") y = np.asarray(y) x = np.asarray(x) w = np.asarray(w) if y.ndim > 1: y = y.ravel() if len(y) != y.size: raise ValueError("y must be a 1-dimensional array") x = np.atleast_2d(x) w = np.atleast_2d(w) n = len(y) if x.shape[0] != n or w.shape[0] != n: raise ValueError("All input arrays must have the same number of observations") p_x = x.shape[1] if x_eval is None and x_grid is not None: warnings.warn("Using x_grid as x_eval", UserWarning) x_eval = x_grid if x_eval is not None: x_eval = np.atleast_2d(x_eval) if x_eval.shape[1] != p_x: raise ValueError("x_eval must have same number of columns as x") if alpha <= 0 or alpha >= 1: raise ValueError("alpha must be between 0 and 1") if biters < 1: raise ValueError("biters must be positive") if j_x_degree < 0: raise ValueError("j_x_degree must be non-negative") if k_w_degree < 0: raise ValueError("k_w_degree must be non-negative") if deriv_order < 0: raise ValueError("deriv_order must be non-negative") if deriv_index < 1 or deriv_index > p_x: raise ValueError(f"deriv_index must be between 1 and {p_x}") if basis not in ("tensor", "additive", "glp"): raise ValueError("basis must be one of: 'tensor', 'additive', 'glp'") if knots not in ("uniform", "quantiles"): raise ValueError("knots must be 'uniform' or 'quantiles'") if n < 50: warnings.warn(f"Small sample size (n={n}) may lead to unreliable results", UserWarning) if 0 < j_x_degree < deriv_order: warnings.warn( f"deriv_order ({deriv_order}) > j_x_degree ({j_x_degree}), derivative will be zero everywhere", UserWarning, ) data_driven = j_x_segments is None selection_result = None if data_driven: try: selection_result = npiv_choose_j( y=y, x=x, w=w, x_grid=x_grid, j_x_degree=j_x_degree, k_w_degree=k_w_degree, k_w_smooth=k_w_smooth, knots=knots, basis=basis, x_min=x_min, x_max=x_max, w_min=w_min, w_max=w_max, grid_num=50, biters=biters if biters > 0 else 99, check_is_fullrank=check_is_fullrank, seed=seed, ) j_x_segments = selection_result["j_x_seg"] k_w_segments = selection_result["k_w_seg"] except (ValueError, RuntimeError, np.linalg.LinAlgError) as e: warnings.warn( f"Data-driven selection failed: {e}. Using default values.", UserWarning, ) j_x_segments = max(3, min(int(np.ceil(n ** (1 / (2 * j_x_degree + p_x)))), 10)) k_w_segments = None args = {"data_driven": data_driven} if selection_result: args.update(selection_result) if ucb_h or ucb_deriv: result = compute_ucb( y=y, x=x, w=w, x_eval=x_eval, alpha=alpha, biters=biters, basis=basis, j_x_degree=j_x_degree, j_x_segments=j_x_segments, k_w_degree=k_w_degree, k_w_segments=k_w_segments, knots=knots, ucb_h=ucb_h, ucb_deriv=ucb_deriv, deriv_index=deriv_index, deriv_order=deriv_order, w_min=w_min, w_max=w_max, x_min=x_min, x_max=x_max, seed=seed, selection_result=selection_result, ) else: result = npiv_est( y=y, x=x, w=w, x_eval=x_eval, basis=basis, j_x_degree=j_x_degree, j_x_segments=j_x_segments, k_w_degree=k_w_degree, k_w_segments=k_w_segments, knots=knots, deriv_index=deriv_index, deriv_order=deriv_order, check_is_fullrank=check_is_fullrank, w_min=w_min, w_max=w_max, x_min=x_min, x_max=x_max, ) if selection_result: result.args.update(selection_result) result.args["data_driven"] = True return result