Source code for moderndid.npiv.selection

"""Data-driven dimension selection for nonparametric instrumental variables."""

from .lepski import npiv_j, npiv_jhat_max



[docs]
def npiv_choose_j(
    y,
    x,
    w,
    x_grid=None,
    j_x_degree=3,
    k_w_degree=4,
    k_w_smooth=2,
    knots="uniform",
    basis="tensor",
    x_min=None,
    x_max=None,
    w_min=None,
    w_max=None,
    grid_num=50,
    biters=99,
    check_is_fullrank=False,
    seed=None,
):
    r"""Select optimal B-spline dimensions.

    Implements the full data-driven selection procedure from [1]_, combining a maximum
    dimension selection step with a Lepski-style test. This procedure selects a
    data-driven sieve dimension, :math:`\tilde{J}`, that is sup-norm rate-adaptive. This means
    it adapts to unknown features of the data-generating process (e.g., smoothness of
    :math:`h_0`, instrument strength) to achieve the minimax optimal convergence rate in sup-norm,

    .. math::
        \sup_x |\hat{h}_{\tilde{J}}(x) - h_0(x)|.

    The procedure involves two main steps. First, determine a maximum feasible dimension, :math:`\hat{J}_{\max}`,
    based on the sample size and an estimate of the sieve measure of ill-posedness, :math:`\hat{s}_J^{-1}`.
    This step defines the search grid :math:`\hat{\mathcal{J}}` as

    .. math::
        \hat{J}_{\max} = \min \left\{ J \in \mathcal{T} : J \sqrt{\log J} \hat{s}_J^{-1} \leq c \sqrt{n} \right\}.

    Second, use a Lepski-style method with a multiplier bootstrap to select the optimal dimension
    :math:`\hat{J}` from the grid :math:`\hat{\mathcal{J}}`. This is done by comparing estimates across different
    dimensions and selecting the smallest dimension that is not statistically different from estimates
    at larger dimensions. The final choice is :math:`\tilde{J} = \min\{\hat{J}, \hat{J}_n\}`,
    where :math:`\hat{J}_n` is a slightly smaller, more conservative dimension for stability.

    .. math::
        \hat{J} = \min \left\{ J \in \hat{\mathcal{J}} : \sup_{x, J_2 > J} \left|
        \frac{\hat{h}_J(x) - \hat{h}_{J_2}(x)}{\hat{\sigma}_{J, J_2}(x)} \right|
        \leq \theta_{1-\hat{\alpha}}^* \right\}.

    Parameters
    ----------
    y : ndarray
        Dependent variable vector.
    x : ndarray
        Endogenous regressor matrix.
    w : ndarray
        Instrument matrix.
    x_grid : ndarray, optional
        Grid points for evaluation. If None, created automatically.
    j_x_degree : int, default=3
        Degree of B-spline basis for :math:`X`.
    k_w_degree : int, default=4
        Degree of B-spline basis for :math:`W`.
    k_w_smooth : int, default=2
        Smoothness parameter for :math:`K` selection.
    knots : {"uniform", "quantiles"}, default="uniform"
        Knot placement method.
    basis : {"tensor", "additive", "glp"}, default="tensor"
        Type of basis for multivariate :math:`X`:

        - "tensor": Full tensor product of univariate bases
        - "additive": Sum of univariate bases
        - "glp": Generalized linear product (hierarchical)
    x_min, x_max, w_min, w_max : float, optional
        Range limits for basis construction.
    grid_num : int, default=50
        Number of grid points for evaluation.
    biters : int, default=99
        Number of bootstrap replications for confidence bands.
    check_is_fullrank : bool, default=False
        Whether to check if basis matrices have full rank.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    dict
        Dictionary containing:

        - **j_x_segments**: Selected number of segments for :math:`X`
        - **k_w_segments**: Corresponding segments for :math:`W`
        - **j_tilde**: Selected dimension
        - **theta_star**: Bootstrap critical value
        - **j_hat_max**: Maximum feasible dimension
        - Additional diagnostic information

    Examples
    --------
    Select the sieve dimension adaptively for the Engel curve IV problem.
    The procedure determines the maximum feasible dimension and then applies
    the Lepski test to choose the optimal number of B-spline segments:

    .. ipython::
        :okwarning:

        In [1]: import numpy as np
           ...: from moderndid import load_engel
           ...: from moderndid.npiv import npiv_choose_j
           ...:
           ...: df = load_engel()
           ...: y = df["food"].to_numpy()
           ...: x = df["logexp"].to_numpy().reshape(-1, 1)
           ...: w = df["logwages"].to_numpy().reshape(-1, 1)
           ...: sel = npiv_choose_j(y=y, x=x, w=w, biters=50, seed=42)
           ...: print(f"Selected segments: {sel['j_x_seg']}")
           ...: print(f"Selected dimension (J_tilde): {sel['j_tilde']}")
           ...: print(f"Max feasible dimension: {sel['j_hat_max']}")
           ...: print(f"Bootstrap critical value: {sel['theta_star']:.3f}")

    See Also
    --------
    npiv_jhat_max : Compute maximum feasible dimension
    npiv_j : Lepski-style test for dimension selection

    References
    ----------

    .. [1] Chen, X., Christensen, T. M., & Kankanala, S. (2024).
        Adaptive Estimation and Uniform Confidence Bands for Nonparametric
        Structural Functions and Elasticities. https://arxiv.org/abs/2107.11869.
    """
    # Step 1: Compute maximum feasible J and data-driven grid
    tmp1 = npiv_jhat_max(
        x=x,
        w=w,
        j_x_degree=j_x_degree,
        k_w_degree=k_w_degree,
        k_w_smooth=k_w_smooth,
        knots=knots,
        basis=basis,
        x_min=x_min,
        x_max=x_max,
        w_min=w_min,
        w_max=w_max,
    )

    j_x_segments_set = tmp1["j_x_segments_set"]
    k_w_segments_set = tmp1["k_w_segments_set"]
    alpha_hat = tmp1["alpha_hat"]

    # Step 2: Run Lepski test with bootstrap
    tmp2 = npiv_j(
        y=y,
        x=x,
        w=w,
        x_grid=x_grid,
        j_x_degree=j_x_degree,
        k_w_degree=k_w_degree,
        j_x_segments_set=j_x_segments_set,
        k_w_segments_set=k_w_segments_set,
        knots=knots,
        basis=basis,
        x_min=x_min,
        x_max=x_max,
        w_min=w_min,
        w_max=w_max,
        grid_num=grid_num,
        biters=biters,
        alpha=alpha_hat,
        check_is_fullrank=check_is_fullrank,
        seed=seed,
    )

    return {
        "j_hat_max": tmp1["j_hat_max"],
        "j_hat_n": tmp2["j_hat_n"],
        "j_hat": tmp2["j_hat"],
        "j_tilde": tmp2["j_tilde"],
        "j_x_seg": tmp2["j_x_seg"],
        "k_w_seg": tmp2["k_w_seg"],
        "j_x_segments_set": j_x_segments_set,
        "k_w_segments_set": k_w_segments_set,
        "theta_star": tmp2["theta_star"],
    }