Source code for actuarialpy.columns

"""Small DataFrame validation helpers.

ActuarialPy intentionally avoids wrapping ordinary pandas operations unless the
helper adds validation or actuarial-specific safeguards.
"""

from __future__ import annotations

from collections.abc import Iterable
from typing import Any

import numpy as np
import pandas as pd


def as_list(value: Any) -> list[Any]:
    """Return value as a list. Strings are treated as single values."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    if isinstance(value, tuple):
        return list(value)
    if isinstance(value, set):
        return list(value)
    if isinstance(value, str):
        return [value]
    if isinstance(value, Iterable):
        return list(value)
    return [value]


def validate_columns(df: pd.DataFrame, cols: str | Iterable[str]) -> None:
    """Raise ValueError if any required columns are missing."""
    required = as_list(cols)
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")


def ensure_unique_keys(df: pd.DataFrame, keys: str | Iterable[str], *, name: str = "data") -> None:
    """Raise ValueError if key columns are not unique."""
    key_list = as_list(keys)
    validate_columns(df, key_list)
    duplicates = df[df.duplicated(key_list, keep=False)]
    if not duplicates.empty:
        examples = duplicates[key_list].drop_duplicates().head(10).to_dict("records")
        raise ValueError(f"{name} has duplicate keys for {key_list}. Examples: {examples}")



[docs]
def factor_lookup(
    df: pd.DataFrame,
    factors: pd.DataFrame,
    keys: str | Iterable[str],
    *,
    factor_col: str,
    default: float | None = None,
) -> np.ndarray:
    """Join a factor onto ``df`` by value on one or more existing key columns.

    The single factor-join primitive behind grouped completion, seasonality, and
    :func:`adjust`. ``factors`` is a tidy table containing ``keys`` and ``factor_col``;
    each row of ``df`` is matched on its ``keys`` values. The factor table must be unique
    on ``keys`` -- a duplicate would fan rows out on the join -- so this raises otherwise.
    Returns a float array aligned to ``df``'s row order (the frame's own index never
    participates). An absent key gives ``default`` (``NaN`` when ``default`` is ``None``
    -- a surfaced gap, never silently filled).
    """
    key_cols = as_list(keys)
    if not key_cols:
        raise ValueError("keys must name at least one column")
    validate_columns(factors, key_cols + [factor_col])
    validate_columns(df, key_cols)
    ensure_unique_keys(factors, key_cols, name="factor table")
    if len(key_cols) == 1:
        lookup = factors.set_index(key_cols[0])[factor_col]
        factor = np.array(df[key_cols[0]].map(lookup), dtype="float64")
    else:
        lookup = factors.set_index(key_cols)[factor_col]
        row_keys = pd.MultiIndex.from_frame(df[key_cols])
        factor = np.array(lookup.reindex(row_keys), dtype="float64")
    if default is not None:
        factor = np.where(np.isnan(factor), float(default), factor)
    return factor



def grouped_factor_lookup(
    df: pd.DataFrame,
    factors: pd.DataFrame,
    by: str | Iterable[str],
    key_values: Any,
    *,
    key_col: str,
    factor_col: str,
) -> np.ndarray:
    """Look up a per-segment factor by ``(group..., key)``, joining by value.

    Thin wrapper over :func:`factor_lookup` for the case where the key is a *derived*
    quantity (``key_values``, positional in row order) rather than an existing column --
    e.g. a season extracted from a date, or a development period. ``factors`` is a tidy
    table with grouping column(s) ``by``, a key column (``key_col``) and ``factor_col``;
    it must be unique on ``by + [key_col]``. Returns a float array with ``NaN`` where the
    ``(group, key)`` pair is absent; order is preserved regardless of index.
    """
    by_cols = as_list(by)
    if not by_cols:
        raise ValueError("Pass by=... naming the grouping column(s) for a per-segment factor table.")
    key_frame = df[by_cols].reset_index(drop=True).copy()
    key_frame[key_col] = key_values
    return factor_lookup(key_frame, factors, by_cols + [key_col], factor_col=factor_col)


def sum_columns(df: pd.DataFrame, cols: str | Iterable[str], *, min_count: int = 1) -> pd.Series:
    """Validate and sum one or more DataFrame columns row-wise.

    This is kept as a small internal-friendly utility because many actuarial
    functions accept several expense or revenue columns. For simple user code,
    pandas syntax such as ``df[cols].sum(axis=1)`` is usually sufficient.
    """
    cols_list = as_list(cols)
    if not cols_list:
        raise ValueError("cols must contain at least one column")
    validate_columns(df, cols_list)
    return df[cols_list].sum(axis=1, min_count=min_count)


_DATE_NAME_TOKENS = {"date", "month", "period", "year", "quarter", "week", "yearmonth", "yyyymm"}
_DATE_AFFIX_TOKENS = ("date", "month", "period", "quarter", "week", "year")


def is_date_like(series: pd.Series, name: str) -> bool:
    """Heuristic test for a date/time column.

    Returns True if the column has a datetime or period dtype, or its name matches a
    common date token (e.g. ``month``, ``paid_month``, ``effective_date``). Used to
    place date columns first in summary output.
    """
    if pd.api.types.is_datetime64_any_dtype(series) or isinstance(series.dtype, pd.PeriodDtype):
        return True
    lowered = name.lower()
    if lowered in _DATE_NAME_TOKENS:
        return True
    return any(lowered.startswith(tok + "_") or lowered.endswith("_" + tok) for tok in _DATE_AFFIX_TOKENS)


def per_exposure_name(stem: str, exposure_col: str) -> str:
    """Output column name for a per-exposure quantity: ``{stem}_per_{exposure_col}``.

    Naming is mechanical and domain-free. Domain conventions (a health shop's
    ``_pmpm``) belong to the caller and are applied via the ``labels`` /
    ``profile`` options on the output views, never inferred from column names.
    """
    return f"{stem}_per_{exposure_col}"