# Source code for moderndid.core.panel

"""Panel data utility functions."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any

import polars as pl

from moderndid.core.dataframe import from_polars, to_polars
from moderndid.core.format import (
    _make_table,
    adjust_separators,
    format_footer,
    format_section_header,
    format_title,
)
from moderndid.core.preprocess.utils import (
    get_first_difference as _get_first_difference_impl,
)
from moderndid.core.preprocess.utils import (
    get_group as _get_group_impl,
)
from moderndid.core.preprocess.utils import (
    is_balanced_panel as _is_balanced_panel_impl,
)
from moderndid.core.preprocess.utils import (
    make_balanced_panel as _make_balanced_panel_impl,
)

__all__ = [
    "PanelDiagnostics",
    "are_varying",
    "assign_rc_ids",
    "complete_data",
    "deduplicate_panel",
    "diagnose_panel",
    "fill_panel_gaps",
    "get_first_difference",
    "get_group",
    "has_gaps",
    "is_balanced_panel",
    "make_balanced_panel",
    "panel_to_wide",
    "scan_gaps",
    "wide_to_panel",
]


[docs] @dataclass class PanelDiagnostics: """Structured report returned by :func:`diagnose_panel`. Attributes ---------- n_units : int Number of unique cross-sectional units. n_periods : int Number of unique time periods. n_observations : int Total row count. is_balanced : bool Whether every unit is observed in every period. n_duplicate_unit_time : int Number of rows that share a unit-time pair with another row. n_unbalanced_units : int Units observed in fewer than *n_periods* periods. n_gaps : int Missing unit-time combinations in the full cross-product. n_missing_rows : int Rows containing at least one null value. n_single_period_units : int Units observed in only one period. n_early_treated : int or None Units already treated in the first observed period. ``None`` when no treatment column is provided. treatment_time_varying : bool or None Whether the treatment indicator changes within units. ``None`` when no treatment column is provided. suggestions : list[str] Actionable remediation messages. """ n_units: int n_periods: int n_observations: int is_balanced: bool n_duplicate_unit_time: int n_unbalanced_units: int n_gaps: int n_missing_rows: int n_single_period_units: int n_early_treated: int | None treatment_time_varying: bool | None suggestions: list[str] = field(default_factory=list) def __repr__(self) -> str: # pragma: no cover """Return a formatted string representation.""" return _format_panel_diagnostics(self) def __str__(self) -> str: # pragma: no cover """Return a human-readable summary.""" return _format_panel_diagnostics(self)
def diagnose_panel(
    data: Any,
    idname: str,
    tname: str,
    treatname: str | None = None,
) -> PanelDiagnostics:
    """Inspect panel data for problems that commonly break estimation.

    The checks cover duplicate unit-time pairs, unbalanced units, gaps in
    the unit-by-period grid, rows with missing values, single-period units
    and units already treated in the first period. When a treatment column
    is supplied (and present in the data), the report also records whether
    treatment varies within units, which usually means :func:`get_group`
    is needed to derive the group-timing variable.

    Each detected problem adds one entry to ``suggestions`` naming the
    remediation function (e.g. :func:`deduplicate_panel`,
    :func:`fill_panel_gaps`, :func:`make_balanced_panel`).

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    tname : str
        Time period column.
    treatname : str or None
        Treatment indicator column. Treatment checks are skipped (fields
        left as ``None``) when this is ``None`` or not a column of *data*.

    Returns
    -------
    PanelDiagnostics
        Structured report with counts and actionable suggestions.

    Examples
    --------
    .. ipython::

        In [1]: from moderndid import diagnose_panel, load_favara_imbs
           ...:
           ...: df = load_favara_imbs()
           ...: diag = diagnose_panel(df, idname="county", tname="year", treatname="inter_bra")
           ...: diag

    See Also
    --------
    deduplicate_panel : Remove duplicate unit-time pairs.
    fill_panel_gaps : Insert null rows for missing pairs.
    make_balanced_panel : Drop units not observed in every period.
    get_group : Derive group-timing from a binary treatment indicator.
    """
    frame = to_polars(data)

    unit_count = frame[idname].n_unique()
    period_count = frame[tname].n_unique()
    total_rows = len(frame)
    balanced_flag = bool(_is_balanced_panel_impl(frame, tname, idname))

    # One row per distinct (unit, period) pair drives both the duplicate and gap counts.
    distinct_pairs = len(frame.unique(subset=[idname, tname]))
    duplicate_rows = total_rows - distinct_pairs
    gap_count = unit_count * period_count - distinct_pairs

    periods_per_unit = frame.group_by(idname).agg(pl.col(tname).n_unique().alias("_n"))["_n"]
    unbalanced_units = int((periods_per_unit < period_count).sum())
    single_period_units = int((periods_per_unit == 1).sum())

    rows_with_nulls = total_rows - len(frame.drop_nulls())

    early_treated: int | None = None
    varies_within_unit: bool | None = None
    if treatname is not None and treatname in frame.columns:
        earliest = frame[tname].min()
        treated_at_start = frame.filter((pl.col(tname) == earliest) & (pl.col(treatname) > 0))
        early_treated = int(treated_at_start[idname].n_unique())

        treat_levels = frame.group_by(idname).agg(pl.col(treatname).n_unique().alias("_nu"))
        varies_within_unit = bool((treat_levels["_nu"] > 1).any())

    # (triggered, message) pairs, listed in the order they should be reported.
    checks = [
        (
            duplicate_rows > 0,
            f"Call deduplicate_panel() to remove {duplicate_rows} duplicate unit-time pairs",
        ),
        (
            gap_count > 0,
            f"Call fill_panel_gaps() to fill {gap_count} missing unit-time pairs",
        ),
        (
            # Balancing is only suggested once duplicates are out of the way.
            unbalanced_units > 0 and duplicate_rows == 0,
            f"Call make_balanced_panel() to drop {unbalanced_units} units not observed in all periods",
        ),
        (
            rows_with_nulls > 0,
            f"{rows_with_nulls} rows contain missing values and will be dropped during preprocessing",
        ),
        (
            single_period_units > 0,
            f"Call complete_data() or make_balanced_panel() to drop {single_period_units} units observed in only one period",
        ),
        (
            early_treated is not None and early_treated > 0,
            f"{early_treated} units are already treated in the first period and will be dropped during preprocessing",
        ),
        (
            bool(varies_within_unit),
            "Treatment varies within units — verify this is expected or call get_group()",
        ),
    ]
    advice: list[str] = [message for triggered, message in checks if triggered]

    return PanelDiagnostics(
        n_units=unit_count,
        n_periods=period_count,
        n_observations=total_rows,
        is_balanced=balanced_flag,
        n_duplicate_unit_time=duplicate_rows,
        n_unbalanced_units=unbalanced_units,
        n_gaps=gap_count,
        n_missing_rows=rows_with_nulls,
        n_single_period_units=single_period_units,
        n_early_treated=early_treated,
        treatment_time_varying=varies_within_unit,
        suggestions=advice,
    )
def get_group(data: Any, idname: str, tname: str, treatname: str, treat_period: int | None = None) -> Any:
    """Derive the treatment-timing column ``"G"`` from a treatment indicator.

    Staggered difference-in-differences estimators such as :func:`att_gt`
    need a *group* variable (``gname``) holding the first period in which
    each unit is treated. Datasets often carry a binary indicator that
    flips from 0 to 1 at treatment onset instead; this function turns that
    indicator into ``"G"``. Treated units get the first period where the
    indicator is positive, never-treated units get 0.

    For a *static* indicator (e.g. a region dummy equal to 1 in every
    period), first-switch detection would wrongly assign the earliest
    observed period. Pass ``treat_period`` in that case: every unit with a
    positive *treatname* value in any period gets ``G = treat_period`` and
    all others get ``G = 0``.

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    tname : str
        Time period column.
    treatname : str
        Binary treatment indicator column.
    treat_period : int or None
        Known treatment onset period. When given, first-switch detection is
        bypassed as described above.

    Returns
    -------
    DataFrame
        Original columns plus ``"G"``, in the same format as *data*.

    Examples
    --------
    Indicator that switches on at a specific period:

    .. ipython::

        In [1]: from moderndid import get_group, load_favara_imbs
           ...:
           ...: df = load_favara_imbs()
           ...: df = get_group(df, idname="county", tname="year", treatname="inter_bra")
           ...: df.select("county", "year", "inter_bra", "G").head(10)

    Static indicator with a known onset:

    .. ipython::

        In [2]: from moderndid import get_group, load_cai2016
           ...:
           ...: df = load_cai2016()
           ...: df = get_group(df, idname="hhno", tname="year",
           ...:                treatname="treatment", treat_period=2003)
           ...: df.select("hhno", "year", "treatment", "G").head(10)

    See Also
    --------
    att_gt : Estimate group-time average treatment effects.
    diagnose_panel : Check whether treatment varies within units.
    """
    # The shared implementation does the work; convert back to the caller's frame type.
    return from_polars(
        _get_group_impl(data, idname, tname, treatname, treat_period=treat_period),
        data,
    )
def get_first_difference(data: Any, idname: str, yname: str, tname: str) -> Any:
    r"""Append a ``"dy"`` column holding the first-differenced outcome.

    The first difference :math:`\Delta Y_{it} = Y_{it} - Y_{i,t-1}` is
    taken within each unit and removes time-invariant unit fixed effects.
    :func:`att_gt` performs this step internally; this wrapper exposes it
    so the transformed data can be inspected before estimation.

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    yname : str
        Outcome column.
    tname : str
        Time period column.

    Returns
    -------
    DataFrame
        Original columns plus ``"dy"``, in the same format as *data*.

    Examples
    --------
    .. ipython::

        In [1]: from moderndid import get_first_difference, load_favara_imbs
           ...:
           ...: df = load_favara_imbs()
           ...: df = get_first_difference(df, idname="county", yname="Dl_vloans_b", tname="year")
           ...: df.select("county", "year", "Dl_vloans_b", "dy").head(10)

    See Also
    --------
    att_gt : Estimate group-time average treatment effects.
    """
    # Delegate to the preprocessing helper, then restore the caller's frame type.
    return from_polars(_get_first_difference_impl(data, idname, yname, tname), data)
def make_balanced_panel(data: Any, idname: str, tname: str) -> Any:
    """Keep only the units that are observed in every time period.

    Many difference-in-differences estimators need a strictly balanced
    panel. With ``allow_unbalanced_panel=False`` (the default in
    :func:`att_gt`) the preprocessing pipeline calls this automatically.
    Calling it up front shows how many units would be dropped, so you can
    choose between balancing, gap-filling with :func:`fill_panel_gaps`, or
    a softer threshold via :func:`complete_data`.

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    tname : str
        Time period column.

    Returns
    -------
    DataFrame
        Balanced panel in the same format as *data*.

    Examples
    --------
    .. ipython::

        In [1]: from moderndid import make_balanced_panel, load_favara_imbs
           ...:
           ...: df = load_favara_imbs()
           ...: balanced = make_balanced_panel(df, idname="county", tname="year")
           ...: print(f"Before: {df.shape[0]} rows, After: {balanced.shape[0]} rows")

    See Also
    --------
    complete_data : Keep units observed in at least *min_periods* periods.
    fill_panel_gaps : Insert null rows instead of dropping units.
    is_balanced_panel : Check whether the panel is already balanced.
    """
    # Delegate to the preprocessing helper, then restore the caller's frame type.
    return from_polars(_make_balanced_panel_impl(data, idname, tname), data)
def is_balanced_panel(data: Any, idname: str, tname: str) -> bool:
    """Check whether the panel is balanced.

    A balanced panel has exactly one observation for every unit-period
    combination. Use this as a quick check before passing data to an
    estimator; for an unbalanced panel, :func:`make_balanced_panel` drops
    incomplete units and :func:`fill_panel_gaps` inserts null rows for the
    missing pairs.

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    tname : str
        Time period column.

    Returns
    -------
    bool
        ``True`` if every unit is observed in every period.

    Examples
    --------
    .. ipython::

        In [1]: from moderndid import is_balanced_panel, load_favara_imbs
           ...:
           ...: df = load_favara_imbs()
           ...: is_balanced_panel(df, idname="county", tname="year")

    See Also
    --------
    make_balanced_panel : Drop units not observed in every period.
    diagnose_panel : Full diagnostic battery including balance checks.
    """
    # NOTE: the helper is called with (tname, idname), the same order diagnose_panel uses.
    # Coerce to a builtin bool so the annotated return type holds whatever
    # truthy type the helper returns, matching diagnose_panel.
    return bool(_is_balanced_panel_impl(data, tname, idname))
def deduplicate_panel(data: Any, idname: str, tname: str, strategy: str = "last") -> Any:
    """Remove duplicate unit-time pairs.

    Duplicate unit-time rows cause hard errors in the preprocessing
    pipeline because the data cannot be reshaped or differenced
    unambiguously. Run :func:`diagnose_panel` to count the duplicates, then
    call this function to resolve them before estimation.

    The surviving rows keep the order in which their unit-time pair first
    appears in *data*, and the columns keep their original order for every
    strategy.

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    tname : str
        Time period column.
    strategy : ``"first"`` | ``"last"`` | ``"mean"``
        How to resolve duplicates. ``"mean"`` averages numeric columns and
        keeps the first value for non-numeric columns.

    Returns
    -------
    DataFrame
        Deduplicated panel in the same format as *data*.

    Raises
    ------
    ValueError
        If *strategy* is not one of ``"first"``, ``"last"``, ``"mean"``.

    Examples
    --------
    .. ipython::

        In [1]: import polars as pl
           ...: from moderndid import deduplicate_panel, load_favara_imbs
           ...:
           ...: df = load_favara_imbs()
           ...: df_with_dups = pl.concat([df, df.head(5)])
           ...: deduped = deduplicate_panel(df_with_dups, idname="county", tname="year")
           ...: print(f"Before: {df_with_dups.shape[0]} rows, After: {deduped.shape[0]} rows")

    See Also
    --------
    diagnose_panel : Detect duplicates before removing them.
    """
    if strategy not in ("first", "last", "mean"):
        msg = f"strategy must be 'first', 'last', or 'mean', got {strategy!r}"
        raise ValueError(msg)

    df = to_polars(data)
    keys = [idname, tname]

    if strategy != "mean":
        # maintain_order=True makes the output row order deterministic.
        result = df.unique(subset=keys, keep=strategy, maintain_order=True)
    else:
        aggs: list[pl.Expr] = [
            pl.col(c).mean() if df[c].dtype.is_numeric() else pl.col(c).first()
            for c in df.columns
            if c not in keys
        ]
        # group_by puts the keys first; select() restores the caller's column order.
        result = df.group_by(keys, maintain_order=True).agg(aggs).select(df.columns)

    return from_polars(result, data)
def fill_panel_gaps(data: Any, idname: str, tname: str) -> Any:
    """Rectangularise the panel by adding ``null`` rows for absent pairs.

    Where :func:`make_balanced_panel` drops incomplete units, this function
    *fills* the gaps instead, so every unit appears in every period. The
    inserted rows hold ``null`` in all columns other than the unit and
    time columns.

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    tname : str
        Time period column.

    Returns
    -------
    DataFrame
        Rectangular panel in the same format as *data*.

    Examples
    --------
    .. ipython::

        In [1]: from moderndid import fill_panel_gaps, has_gaps, load_favara_imbs
           ...:
           ...: df = load_favara_imbs()
           ...: print(has_gaps(df, idname="county", tname="year"))

        In [2]: filled = fill_panel_gaps(df, idname="county", tname="year")
           ...: print(f"Before: {df.shape[0]} rows, After: {filled.shape[0]} rows")

    See Also
    --------
    scan_gaps : Inspect which pairs are missing before filling.
    make_balanced_panel : Drop incomplete units instead of filling gaps.
    """
    panel = to_polars(data)

    # Full unit-by-period grid: every observed unit crossed with every observed period.
    unit_values = panel.select(idname).unique()
    period_values = panel.select(tname).unique()
    grid = unit_values.join(period_values, how="cross")

    # The left join keeps every grid cell; cells with no source row come back as nulls.
    return from_polars(grid.join(panel, on=[idname, tname], how="left"), data)
def complete_data(data: Any, idname: str, tname: str, min_periods: int | None = None) -> Any:
    """Retain units observed in at least *min_periods* distinct periods.

    A softer alternative to :func:`make_balanced_panel`: instead of
    requiring every unit to appear in *all* periods, a threshold keeps
    units with a reasonable amount of data. With ``min_periods=None`` the
    threshold is the number of distinct periods in the data, i.e. the
    behaviour of :func:`make_balanced_panel`.

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    tname : str
        Time period column.
    min_periods : int or None
        Minimum number of observed periods. ``None`` (default) means *all*
        periods, equivalent to :func:`make_balanced_panel`.

    Returns
    -------
    DataFrame
        Filtered panel in the same format as *data*. Empty input is
        returned unchanged.

    Examples
    --------
    .. ipython::

        In [1]: from moderndid import complete_data, load_favara_imbs
           ...:
           ...: df = load_favara_imbs()
           ...: filtered = complete_data(df, idname="county", tname="year", min_periods=10)
           ...: print(f"Before: {df.shape[0]} rows, After: {filtered.shape[0]} rows")

    See Also
    --------
    make_balanced_panel : Strict balancing (all periods required).
    """
    panel = to_polars(data)
    if panel.is_empty():
        return from_polars(panel, data)

    threshold = panel[tname].n_unique() if min_periods is None else min_periods

    periods_seen = panel.group_by(idname).agg(pl.col(tname).n_unique().alias("_n_periods"))
    retained_ids = periods_seen.filter(pl.col("_n_periods") >= threshold)[idname].to_list()

    return from_polars(panel.filter(pl.col(idname).is_in(retained_ids)), data)
def assign_rc_ids(data: Any) -> Any:
    """Attach a unique ``"rowid"`` column for repeated cross-section data.

    In a repeated cross-section every observation is a different
    individual, so no natural unit identifier exists. The sequential
    integer ``"rowid"`` added here can be passed as ``idname`` to
    :func:`att_gt` together with ``panel=False``.

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.

    Returns
    -------
    DataFrame
        Original data plus an integer ``"rowid"`` column, in the same
        format as *data*.

    Examples
    --------
    .. ipython::

        In [1]: from moderndid import assign_rc_ids, load_favara_imbs
           ...:
           ...: df = load_favara_imbs()
           ...: df = assign_rc_ids(df)
           ...: df.select("rowid", "county", "year").head(5)

    See Also
    --------
    att_gt : Pass ``panel=False`` for repeated cross-section estimation.
    """
    # The polars row index supplies the per-row identifier.
    return from_polars(to_polars(data).with_row_index("rowid"), data)
def are_varying(data: Any, idname: str, cols: list[str] | None = None) -> dict[str, bool]:
    """Report, per column, whether its value changes within any unit.

    Difference-in-differences estimators treat time-varying and
    time-invariant covariates differently. Time-invariant covariates (e.g.
    baseline demographics) suit the propensity score or outcome regression
    model, while time-varying ones need extra assumptions. This
    classification supports covariate selection before estimation.

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    cols : list[str] or None
        Columns to check. Defaults to all columns except *idname*.

    Returns
    -------
    dict[str, bool]
        Mapping of column name to ``True`` if the column takes more than
        one distinct value within at least one unit, ``False`` otherwise.

    See Also
    --------
    diagnose_panel : Full diagnostic battery including treatment variation.
    """
    frame = to_polars(data)
    targets = [name for name in frame.columns if name != idname] if cols is None else cols

    # Distinct-value count of every target column, one row per unit.
    distinct = frame.group_by(idname).agg([pl.col(name).n_unique().alias(name) for name in targets])

    return {name: bool((distinct[name] > 1).any()) for name in targets}
def scan_gaps(data: Any, idname: str, tname: str) -> Any:
    """List the unit-time combinations that are missing from the data.

    The result has one row per absent unit-period pair. Looking at these
    gaps helps decide between dropping incomplete units with
    :func:`make_balanced_panel` and filling them with null rows via
    :func:`fill_panel_gaps`. For a plain yes/no answer that does not
    materialise the gaps, use :func:`has_gaps`.

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    tname : str
        Time period column.

    Returns
    -------
    DataFrame
        Rows with ``idname`` and ``tname`` columns for every *absent* pair,
        returned in the same format as *data*.

    See Also
    --------
    has_gaps : Quick Boolean check for missing pairs.
    fill_panel_gaps : Insert null rows for missing pairs.
    """
    panel = to_polars(data)
    keys = [idname, tname]

    # Full unit-by-period grid versus the pairs actually observed.
    grid = panel.select(idname).unique().join(panel.select(tname).unique(), how="cross")
    observed = panel.select(keys).unique()

    # The anti join keeps only the grid cells that have no observed counterpart.
    return from_polars(grid.join(observed, on=keys, how="anti"), data)
def has_gaps(data: Any, idname: str, tname: str) -> bool:
    """Tell whether any unit-time pair is implicitly missing from the panel.

    A lightweight check that compares the number of distinct observed
    unit-period pairs with the size of the full cross-product. If it
    returns ``True``, :func:`scan_gaps` lists the missing pairs; they can
    then be filled with :func:`fill_panel_gaps` or removed by dropping
    incomplete units with :func:`make_balanced_panel`.

    Parameters
    ----------
    data : DataFrame
        Panel data. Accepts any object implementing the Arrow PyCapsule
        Interface (``__arrow_c_stream__``), including polars, pandas,
        pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    tname : str
        Time period column.

    Returns
    -------
    bool
        ``True`` if there are missing unit-time combinations.

    See Also
    --------
    scan_gaps : Materialise the missing unit-time pairs.
    fill_panel_gaps : Insert null rows for the missing pairs.
    """
    frame = to_polars(data)

    grid_size = frame[idname].n_unique() * frame[tname].n_unique()
    observed_pairs = frame.select([idname, tname]).unique().height

    return observed_pairs < grid_size
def panel_to_wide(data: Any, idname: str, tname: str, separator: str = "_") -> Any:
    """Reshape a long panel into wide format, one row per unit.

    Columns whose value changes within at least one unit are spread into
    one column per period, named ``<column><separator><period>``. Columns
    that are constant within every unit are kept as a single column.

    Parameters
    ----------
    data : DataFrame
        Panel data in long format. Accepts any object implementing the
        Arrow PyCapsule Interface (``__arrow_c_stream__``), including
        polars, pandas, pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    tname : str
        Time period column.
    separator : str
        String inserted between the variable name and time label in the
        wide column names. Default ``"_"``.

    Returns
    -------
    DataFrame
        Wide-format DataFrame with one row per unit, in the same format as
        *data*.

    Examples
    --------
    .. ipython::

        In [1]: from moderndid import make_balanced_panel, panel_to_wide, load_favara_imbs
           ...:
           ...: df = load_favara_imbs()
           ...: df = make_balanced_panel(df, idname="county", tname="year")
           ...: wide = panel_to_wide(df, idname="county", tname="year")
           ...: wide.head(5)

    See Also
    --------
    wide_to_panel : Inverse operation (wide to long).
    """
    long_df = to_polars(data)

    payload = [name for name in long_df.columns if name not in (idname, tname)]
    if not payload:
        # Nothing besides id and time: the wide form is just the set of units.
        return from_polars(long_df.select(idname).unique(), data)

    # A column is "moving" when some unit holds more than one distinct value of it.
    distinct = long_df.group_by(idname).agg([pl.col(name).n_unique().alias(name) for name in payload])
    changes = {name: bool((distinct[name] > 1).any()) for name in payload}
    fixed = [name for name in payload if not changes[name]]
    moving = [name for name in payload if changes[name]]

    if fixed:
        wide = long_df.group_by(idname).agg([pl.col(name).first() for name in fixed])
    else:
        wide = long_df.select(idname).unique()

    if not moving:
        return from_polars(wide, data)

    spread = long_df.select([idname, tname, *moving]).pivot(
        on=tname,
        index=idname,
        values=moving,
        separator=separator,
    )
    if len(moving) == 1:
        # With a single value column the pivoted columns are prefixed here by hand,
        # so the naming matches the multi-column case.
        (only,) = moving
        spread = spread.rename({name: f"{only}{separator}{name}" for name in spread.columns if name != idname})

    return from_polars(wide.join(spread, on=idname), data)
def wide_to_panel(
    data: Any,
    idname: str,
    stub_names: list[str],
    separator: str = "_",
    tname: str = "time",
) -> Any:
    """Unpivot wide-format data into a long panel.

    Gathers time-varying columns back into long format, using the stub
    names and separator to decide which wide columns belong to which
    variable and period. All other columns (except *idname*) are treated as
    time-invariant and repeated for every period.

    When one stub is a prefix of another (e.g. ``"y"`` and ``"y_lag"``), a
    column is assigned to the longest matching stub, so ``y_lag_1`` is read
    as stub ``"y_lag"``, period ``"1"``. A stub with no column for some
    period is filled with nulls for that period.

    Parameters
    ----------
    data : DataFrame
        Wide-format data. Accepts any object implementing the Arrow
        PyCapsule Interface (``__arrow_c_stream__``), including polars,
        pandas, pyarrow Table, and cudf DataFrames.
    idname : str
        Unit identifier column.
    stub_names : list[str]
        Variable-name prefixes that identify the time-varying columns. For
        example, ``["y", "x"]`` will match ``y_1``, ``y_2``, ``x_1``,
        ``x_2``, etc.
    separator : str
        Delimiter between the stub and the period label. Default ``"_"``.
    tname : str
        Name for the created time column. Default ``"time"``.

    Returns
    -------
    DataFrame
        Long-format panel sorted by *idname* and *tname*, in the same
        format as *data*. Period labels are converted to ``int`` when all
        of them parse as integers, else ``float`` when all parse as
        floats, else left as strings.

    Raises
    ------
    ValueError
        If no column of *data* matches any ``<stub><separator>`` prefix.

    Examples
    --------
    .. ipython::

        In [1]: from moderndid import make_balanced_panel, panel_to_wide, wide_to_panel, load_favara_imbs
           ...:
           ...: df = load_favara_imbs()
           ...: df = make_balanced_panel(df, idname="county", tname="year")
           ...: wide = panel_to_wide(df, idname="county", tname="year")
           ...: long = wide_to_panel(wide, idname="county", stub_names=["Dl_vloans_b", "Dl_hpi"], tname="year")
           ...: long.head(10)

    See Also
    --------
    panel_to_wide : Inverse operation (long to wide).
    """
    df = to_polars(data)

    # Try the longest prefixes first so that overlapping stubs ("y" vs "y_lag")
    # resolve to the most specific one. sorted() is stable, so equal-length
    # stubs keep the caller's order.
    prefixes = sorted(
        ((f"{stub}{separator}", stub) for stub in stub_names),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    wide_lookup: dict[tuple[str, str], str] = {}  # (stub, period) -> wide column name
    varying_cols: set[str] = set()
    periods: set[str] = set()
    for col in df.columns:
        if col == idname:
            continue
        for prefix, stub in prefixes:
            if col.startswith(prefix):
                period = col[len(prefix) :]
                wide_lookup[(stub, period)] = col
                varying_cols.add(col)
                periods.add(period)
                break

    if not periods:
        # Fail with an actionable message rather than an empty-concat error further down.
        msg = (
            f"no columns match stub_names {stub_names!r} with separator {separator!r}; "
            "nothing to unpivot"
        )
        raise ValueError(msg)

    # Prefer numeric period labels: int if every label parses, then float, else plain strings.
    for caster in (int, float):
        try:
            periods_sorted = sorted(periods, key=caster)
        except ValueError:
            continue
        cast_period = caster
        break
    else:
        periods_sorted = sorted(periods)
        cast_period = str

    constant_cols = [c for c in df.columns if c != idname and c not in varying_cols]

    frames: list[pl.DataFrame] = []
    for period in periods_sorted:
        select_exprs: list[pl.Expr] = [
            pl.col(idname),
            pl.lit(cast_period(period)).alias(tname),
            *(pl.col(const) for const in constant_cols),
        ]
        for stub in stub_names:
            source = wide_lookup.get((stub, period))
            if source is None:
                # This stub has no column for this period: emit nulls.
                select_exprs.append(pl.lit(None).alias(stub))
            else:
                select_exprs.append(pl.col(source).alias(stub))
        frames.append(df.select(select_exprs))

    # "vertical_relaxed" coerces to a common supertype, so null-filled or
    # differently-typed per-period columns still stack.
    result = pl.concat(frames, how="vertical_relaxed").sort([idname, tname])
    return from_polars(result, data)
def _format_panel_diagnostics(diag: PanelDiagnostics) -> str:
    """Build the text report shown for a :class:`PanelDiagnostics` instance.

    The report is a title, a two-column ``Metric``/``Value`` table, an
    optional ``Suggestions`` section (only when *diag* has suggestions) and
    a footer, joined with newlines after separator adjustment.
    """

    def _yes_no(flag: bool | None) -> str:
        # None means the check was not run.
        if flag is None:
            return "N/A"
        if flag:
            return "Yes"
        return "No"

    def _count(value: int | None) -> str:
        # None means the check was not run.
        return "N/A" if value is None else str(value)

    out = format_title("Panel Diagnostics")

    metrics = [
        ("Units", str(diag.n_units)),
        ("Periods", str(diag.n_periods)),
        ("Observations", str(diag.n_observations)),
        ("Balanced", _yes_no(diag.is_balanced)),
        ("Duplicate unit-time pairs", str(diag.n_duplicate_unit_time)),
        ("Unbalanced units", str(diag.n_unbalanced_units)),
        ("Gaps", str(diag.n_gaps)),
        ("Rows with missing values", str(diag.n_missing_rows)),
        ("Single-period units", str(diag.n_single_period_units)),
        ("Early-treated units", _count(diag.n_early_treated)),
        ("Treatment time-varying", _yes_no(diag.treatment_time_varying)),
    ]
    rendered = _make_table(
        ["Metric", "Value"],
        metrics,
        {"Metric": "l", "Value": "r"},
    )
    out.extend(["", *rendered.split("\n")])

    if diag.suggestions:
        out.extend(format_section_header("Suggestions"))
        out.extend(f" {hint}" for hint in diag.suggestions)

    out.extend(format_footer())
    return "\n".join(adjust_separators(out))