Source code for moderndid.npiv.npiv

"""Nonparametric instrumental variables estimation."""

import warnings

import numpy as np

from .confidence_bands import compute_ucb
from .estimators import npiv_est
from .selection import npiv_choose_j



[docs]
def npiv(
    data=None,
    yname=None,
    xname=None,
    wname=None,
    y=None,
    x=None,
    w=None,
    x_eval=None,
    x_grid=None,
    alpha=0.05,
    basis="tensor",
    biters=99,
    j_x_degree=3,
    j_x_segments=None,
    k_w_degree=4,
    k_w_segments=None,
    k_w_smooth=2,
    knots="uniform",
    ucb_h=True,
    ucb_deriv=True,
    deriv_index=1,
    deriv_order=1,
    check_is_fullrank=False,
    w_min=None,
    w_max=None,
    x_min=None,
    x_max=None,
    seed=None,
):
    r"""Estimate nonparametric instrumental variables model with uniform confidence bands.

    Estimates the structural function :math:`h_0` and its derivatives in the
    nonparametric IV model

    .. math::
        \mathbb{E}[Y - h_0(X) \mid W] = 0 \quad \text{(a.s.)}

    where :math:`Y` is a scalar outcome, :math:`X` is a (possibly endogenous)
    regressor vector, and :math:`W` is a vector of instrumental variables. The
    function is approximated by a B-spline sieve :math:`h_0(x) \approx (\psi^J(x))' c_J`
    and coefficients are estimated by two-stage least squares using :math:`K`
    B-spline basis functions of :math:`W` as instruments

    .. math::
        \hat{c}_J = (\boldsymbol{\Psi}_J' \mathbf{P}_K \boldsymbol{\Psi}_J)^{-}
        \boldsymbol{\Psi}_J' \mathbf{P}_K \mathbf{Y},

    where :math:`\mathbf{P}_K = \mathbf{B}_K (\mathbf{B}_K' \mathbf{B}_K)^{-} \mathbf{B}_K'`
    projects onto the instrument space. Function and derivative estimates are then given by

    .. math::
        \hat{h}_J(x) = (\psi^J(x))' \hat{c}_J, \quad
        \partial^a \hat{h}_J(x) = (\partial^a \psi^J(x))' \hat{c}_J.

    When ``j_x_segments`` is None, a bootstrap implementation of Lepski's
    method selects the sieve dimension :math:`\tilde{J}` that adapts to the
    unknown smoothness of :math:`h_0` and instrument strength, achieving the
    minimax sup-norm convergence rate for both :math:`h_0` and its derivatives.

    The adaptive CCK procedure then constructs honest uniform confidence bands
    that guarantee coverage uniformly over a class of data-generating processes.
    When a fixed ``j_x_segments`` is supplied, the standard undersmoothing approach
    of [1]_ is used instead.

    Parameters
    ----------
    data : DataFrame, optional
        Input data. Accepts any object implementing the Arrow PyCapsule Interface
        (``__arrow_c_stream__``), including polars, pandas, pyarrow Table, and cudf
        DataFrames. When provided, ``yname``, ``xname``, and ``wname`` are required.
    yname : str, optional
        Name of the outcome column in ``data``.
    xname : str or list of str, optional
        Name(s) of the endogenous regressor column(s) in ``data``.
    wname : str or list of str, optional
        Name(s) of the instrumental variable column(s) in ``data``.
    y : ndarray of shape (n,), optional
        Outcome variable. Required when ``data`` is not provided.
    x : ndarray of shape (n,) or (n, p_x), optional
        Endogenous regressors. Automatically promoted to 2-d if needed.
        Required when ``data`` is not provided.
    w : ndarray of shape (n,) or (n, p_w), optional
        Instrumental variables. Requires :math:`K \geq J`.
        Required when ``data`` is not provided.
    x_eval : ndarray of shape (m, p_x), optional
        Points at which to evaluate :math:`\hat{h}` and its derivatives. If
        None, evaluates at the sample points ``x``.
    x_grid : ndarray, optional
        Alias for ``x_eval``. Ignored when ``x_eval`` is provided.
    alpha : float, default=0.05
        Significance level for :math:`100(1-\alpha)\%` confidence bands.
    basis : {"tensor", "additive", "glp"}, default="tensor"
        Multivariate basis construction for :math:`X`:

        - ``"tensor"``: Full tensor product of univariate B-splines.
        - ``"additive"``: Sum of univariate B-splines (additive model).
        - ``"glp"``: Generalized linear product (hierarchical interactions).
    biters : int, default=99
        Number of multiplier bootstrap draws for critical value computation.
        Each draw generates i.i.d. :math:`N(0,1)` weights
        :math:`(\varpi_i)_{i=1}^n` to form bootstrap sup-:math:`t` statistics.
    j_x_degree : int, default=3
        Degree of B-spline basis for :math:`X` (order
        :math:`r = \text{degree} + 1`). For UCBs of first derivatives, degree
        :math:`\geq 2` is required; for second derivatives, :math:`\geq 3`.
    j_x_segments : int, optional
        Number of segments for the :math:`X` basis, determining sieve dimension
        :math:`J`. When None, the data-driven Lepski procedure selects
        :math:`\tilde{J}` adaptively. Supplying a fixed value triggers the
        undersmoothing UCB approach.
    k_w_degree : int, default=4
        Degree of B-spline basis for :math:`W`. Defaults to
        ``j_x_degree + 1`` because the reduced form
        :math:`\mathbb{E}[h_0(X) \mid W]` is smoother than :math:`h_0`.
    k_w_segments : int, optional
        Number of segments for the instrument basis. When None, chosen
        proportionally to ``j_x_segments`` via the resolution-level mapping
        :math:`l_w = \lceil (l + q) \, d / d_w \rceil`, where :math:`q` is controlled by ``k_w_smooth``.
    k_w_smooth : int, default=2
        Controls the resolution gap :math:`q` between the :math:`X` and
        :math:`W` bases in the data-driven procedure. Larger values yield more
        instrument basis functions relative to the :math:`X` basis.
    knots : {"uniform", "quantiles"}, default="uniform"
        Knot placement strategy:

        - ``"uniform"``: Equally spaced knots on the support.
        - ``"quantiles"``: Knots at empirical quantiles of the data.
    ucb_h : bool, default=True
        Compute uniform confidence bands for :math:`\hat{h}`.
    ucb_deriv : bool, default=True
        Compute uniform confidence bands for :math:`\partial^a \hat{h}`.
    deriv_index : int, default=1
        Which component of :math:`X` to differentiate with respect to
        (1-based indexing).
    deriv_order : int, default=1
        Order :math:`|a|` of the derivative (1 = first, 2 = second, etc.).
    check_is_fullrank : bool, default=False
        Verify that the basis matrices :math:`\boldsymbol{\Psi}_J` and
        :math:`\mathbf{B}_K` have full column rank before estimation.
    w_min, w_max : float, optional
        Override the support bounds for :math:`W`. Defaults to data range.
    x_min, x_max : float, optional
        Override the support bounds for :math:`X`. Defaults to data range.
    seed : int, optional
        Random seed for bootstrap reproducibility.

    Returns
    -------
    NPIVResult
        Named tuple with the following fields:

        - **h** -- Estimated :math:`\hat{h}_J(x)` at evaluation points.
        - **deriv** -- Estimated :math:`\partial^a \hat{h}_J(x)`.
        - **h_lower**, **h_upper** -- Lower/upper UCB for :math:`h_0`.
        - **h_lower_deriv**, **h_upper_deriv** -- Lower/upper UCB for
          :math:`\partial^a h_0`.
        - **beta** -- Sieve coefficient vector :math:`\hat{c}_J`.
        - **asy_se** -- Pointwise asymptotic standard errors
          :math:`\hat{\sigma}_J(x)`.
        - **deriv_asy_se** -- Pointwise asymptotic standard errors
          :math:`\hat{\sigma}_J^a(x)` for derivatives.
        - **cv**, **cv_deriv** -- Bootstrap critical values
          :math:`z_{1-\alpha}^*` used for band construction.
        - **residuals** -- TSLS residuals
          :math:`\hat{u}_{i,J} = Y_i - \hat{h}_J(X_i)`.
        - **j_x_degree**, **j_x_segments** -- Basis parameters for :math:`X`
          (segments may differ from input when data-driven).
        - **k_w_degree**, **k_w_segments** -- Basis parameters for :math:`W`.
        - **args** -- Diagnostic dictionary. When data-driven selection is
          used, includes ``j_x_seg``, ``k_w_seg``, ``j_hat_max``,
          ``theta_star``, and other selection diagnostics.

    Examples
    --------
    The Engel dataset contains household expenditure shares and income measures
    for 1655 households. We estimate a nonparametric Engel curve relating food
    share (``food``) to log-expenditure (``logexp``), using log-wages
    (``logwages``) as an instrument for potentially endogenous expenditure:

    .. ipython::
        :okwarning:

        In [1]: import numpy as np
           ...: from moderndid import npiv, load_engel
           ...:
           ...: df = load_engel()
           ...: df.head()

    Estimate the structural function with 5 B-spline segments and 95% uniform
    confidence bands. The output is an ``NPIVResult`` containing the estimated
    function, confidence bands, derivatives, and diagnostics:

    .. ipython::
        :okwarning:

        In [2]: result = npiv(
           ...:     data=df,
           ...:     yname="food",
           ...:     xname="logexp",
           ...:     wname="logwages",
           ...:     j_x_segments=5,
           ...:     biters=500,
           ...:     seed=42,
           ...: )
           ...: print(f"Estimates at {len(result.h)} points")
           ...: print(f"h[:5] = {result.h[:5]}")
           ...: print(f"95% UCB critical value: {result.cv:.3f}")
           ...: print(f"Basis: degree={result.j_x_degree}, segments={result.j_x_segments}")

    The derivative of the Engel curve (the marginal propensity to spend on food)
    is estimated simultaneously:

    .. ipython::

        In [3]: print(f"deriv[:5] = {result.deriv[:5]}")
           ...: print(f"Derivative UCB critical value: {result.cv_deriv:.3f}")

    You can also pass numpy arrays directly instead of a DataFrame:

    .. ipython::
        :okwarning:

        In [4]: rng = np.random.default_rng(0)
           ...: n = 200
           ...: w = rng.uniform(0, 1, (n, 1))
           ...: x = w + 0.2 * rng.normal(0, 1, (n, 1))
           ...: y = np.sin(2 * np.pi * x).ravel() + 0.1 * rng.normal(0, 1, n)
           ...: result = npiv(y=y, x=x, w=w, j_x_segments=4, biters=500, seed=0)
           ...: result.h.shape

    See Also
    --------
    npiv_est : Core sieve TSLS estimation (no confidence bands).
    compute_ucb : Multiplier bootstrap confidence band construction.
    npiv_choose_j : Data-driven sieve dimension selection.

    References
    ----------

    .. [1] Chen, X., & Christensen, T. M. (2018). Optimal sup-norm rates and
        uniform inference on nonlinear functionals of nonparametric IV
        regression. *Quantitative Economics*, 9(1), 39-84.

    .. [2] Chen, X., Christensen, T. M., & Kankanala, S. (2024). Adaptive
        estimation and uniform confidence bands for nonparametric structural
        functions and elasticities. *Review of Economic Studies*.
        https://arxiv.org/abs/2107.11869.

    .. [3] Newey, W. K., & Powell, J. L. (2003). Instrumental variable
        estimation of nonparametric models. *Econometrica*, 71(5), 1565-1578.
    """
    if data is not None:
        if y is not None or x is not None or w is not None:
            raise ValueError("Cannot specify both 'data' and array arguments (y, x, w)")
        if yname is None or xname is None or wname is None:
            raise ValueError("When 'data' is provided, 'yname', 'xname', and 'wname' are required")
        from moderndid.core.dataframe import to_polars

        df = to_polars(data)
        y = df[yname].to_numpy()
        if isinstance(xname, str):
            xname = [xname]
        x = df.select(xname).to_numpy()
        if isinstance(wname, str):
            wname = [wname]
        w = df.select(wname).to_numpy()
    elif y is None or x is None or w is None:
        raise ValueError("Must provide either 'data' with column names, or array arguments (y, x, w)")

    y = np.asarray(y)
    x = np.asarray(x)
    w = np.asarray(w)

    if y.ndim > 1:
        y = y.ravel()
        if len(y) != y.size:
            raise ValueError("y must be a 1-dimensional array")

    x = np.atleast_2d(x)
    w = np.atleast_2d(w)

    n = len(y)
    if x.shape[0] != n or w.shape[0] != n:
        raise ValueError("All input arrays must have the same number of observations")

    p_x = x.shape[1]

    if x_eval is None and x_grid is not None:
        warnings.warn("Using x_grid as x_eval", UserWarning)
        x_eval = x_grid

    if x_eval is not None:
        x_eval = np.atleast_2d(x_eval)
        if x_eval.shape[1] != p_x:
            raise ValueError("x_eval must have same number of columns as x")

    if alpha <= 0 or alpha >= 1:
        raise ValueError("alpha must be between 0 and 1")

    if biters < 1:
        raise ValueError("biters must be positive")

    if j_x_degree < 0:
        raise ValueError("j_x_degree must be non-negative")

    if k_w_degree < 0:
        raise ValueError("k_w_degree must be non-negative")

    if deriv_order < 0:
        raise ValueError("deriv_order must be non-negative")

    if deriv_index < 1 or deriv_index > p_x:
        raise ValueError(f"deriv_index must be between 1 and {p_x}")

    if basis not in ("tensor", "additive", "glp"):
        raise ValueError("basis must be one of: 'tensor', 'additive', 'glp'")

    if knots not in ("uniform", "quantiles"):
        raise ValueError("knots must be 'uniform' or 'quantiles'")

    if n < 50:
        warnings.warn(f"Small sample size (n={n}) may lead to unreliable results", UserWarning)

    if 0 < j_x_degree < deriv_order:
        warnings.warn(
            f"deriv_order ({deriv_order}) > j_x_degree ({j_x_degree}), derivative will be zero everywhere",
            UserWarning,
        )

    data_driven = j_x_segments is None
    selection_result = None
    if data_driven:
        try:
            selection_result = npiv_choose_j(
                y=y,
                x=x,
                w=w,
                x_grid=x_grid,
                j_x_degree=j_x_degree,
                k_w_degree=k_w_degree,
                k_w_smooth=k_w_smooth,
                knots=knots,
                basis=basis,
                x_min=x_min,
                x_max=x_max,
                w_min=w_min,
                w_max=w_max,
                grid_num=50,
                biters=biters if biters > 0 else 99,
                check_is_fullrank=check_is_fullrank,
                seed=seed,
            )
            j_x_segments = selection_result["j_x_seg"]
            k_w_segments = selection_result["k_w_seg"]

        except (ValueError, RuntimeError, np.linalg.LinAlgError) as e:
            warnings.warn(
                f"Data-driven selection failed: {e}. Using default values.",
                UserWarning,
            )
            j_x_segments = max(3, min(int(np.ceil(n ** (1 / (2 * j_x_degree + p_x)))), 10))
            k_w_segments = None

    args = {"data_driven": data_driven}
    if selection_result:
        args.update(selection_result)

    if ucb_h or ucb_deriv:
        result = compute_ucb(
            y=y,
            x=x,
            w=w,
            x_eval=x_eval,
            alpha=alpha,
            biters=biters,
            basis=basis,
            j_x_degree=j_x_degree,
            j_x_segments=j_x_segments,
            k_w_degree=k_w_degree,
            k_w_segments=k_w_segments,
            knots=knots,
            ucb_h=ucb_h,
            ucb_deriv=ucb_deriv,
            deriv_index=deriv_index,
            deriv_order=deriv_order,
            w_min=w_min,
            w_max=w_max,
            x_min=x_min,
            x_max=x_max,
            seed=seed,
            selection_result=selection_result,
        )
    else:
        result = npiv_est(
            y=y,
            x=x,
            w=w,
            x_eval=x_eval,
            basis=basis,
            j_x_degree=j_x_degree,
            j_x_segments=j_x_segments,
            k_w_degree=k_w_degree,
            k_w_segments=k_w_segments,
            knots=knots,
            deriv_index=deriv_index,
            deriv_order=deriv_order,
            check_is_fullrank=check_is_fullrank,
            w_min=w_min,
            w_max=w_max,
            x_min=x_min,
            x_max=x_max,
        )

    if selection_result:
        result.args.update(selection_result)
        result.args["data_driven"] = True

    return result