# Source code for actuarialpy.banding

"""Size-banding primitives.

Bucket rows into size bands by any numeric column (subscriber count, member
count, exposure, premium, total insured value, ...) and summarize experience by
band. Band edges are always a parameter, since different analyses use different
cut points (e.g. one scheme with six buckets and a coarser one with four).
"""

from __future__ import annotations

from collections.abc import Iterable, Sequence

import numpy as np
import pandas as pd

from actuarialpy.columns import validate_columns
from actuarialpy.experience import summarize_experience


def _default_labels(edges: Sequence[float]) -> list[str]:
    """Build readable labels from left-closed band edges.

    ``[0, 51, 76, 151, inf]`` -> ``["0-50", "51-75", "76-150", "151+"]``.
    """
    labels: list[str] = []
    for i in range(len(edges) - 1):
        lo = edges[i]
        hi = edges[i + 1]
        if np.isinf(hi):
            labels.append(f"{int(lo)}+")
        else:
            labels.append(f"{int(lo)}-{int(hi) - 1}")
    return labels



# [docs]  (Sphinx viewcode link marker; not part of the module source)
def assign_band(
    df: pd.DataFrame,
    value_col: str,
    bands: Sequence[float],
    *,
    labels: Sequence[str] | None = None,
    band_col: str = "band",
    right: bool = False,
    copy: bool = True,
) -> pd.DataFrame:
    """Add an ordered categorical band column derived from ``value_col``.

    ``bands`` is the list of bin edges handed to :func:`pandas.cut`; ``n``
    edges define ``n - 1`` bands. With the default ``right=False`` the bands
    are left-closed, e.g. ``bands=[0, 51, 76, inf]`` gives ``[0, 51)``,
    ``[51, 76)``, ``[76, inf)``; ending the edges with ``float("inf")`` leaves
    the top band open.

    ``labels`` names the bands in order. When omitted, names are generated by
    ``_default_labels``, which is written for left-closed edges. The band
    column is written to ``band_col`` as an ordered categorical so later
    group-bys and sorts follow band order.

    With ``copy=True`` (the default) the input frame is left untouched and a
    copy is returned; with ``copy=False`` the column is added to ``df`` itself
    and ``df`` is returned.

    Raises ``ValueError`` if fewer than two edges are given or if the number
    of labels does not equal the number of bands.
    """
    validate_columns(df, [value_col])

    cut_points = list(bands)
    n_bands = len(cut_points) - 1
    if n_bands < 1:
        raise ValueError("bands must contain at least two edges (one band).")

    band_names = _default_labels(cut_points) if labels is None else labels
    if len(band_names) != n_bands:
        raise ValueError(f"Expected {n_bands} labels for {len(cut_points)} edges, got {len(band_names)}.")

    # Work on a private copy unless the caller opted into in-place mutation.
    if copy:
        out = df.copy()
    else:
        out = df

    assigned = pd.cut(
        out[value_col],
        bins=cut_points,
        labels=list(band_names),
        right=right,
        include_lowest=True,
        ordered=True,
    )
    out[band_col] = assigned
    return out




# [docs]  (Sphinx viewcode link marker; not part of the module source)
def summarize_by_band(
    df: pd.DataFrame,
    value_col: str,
    bands: Sequence[float],
    *,
    labels: Sequence[str] | None = None,
    expense_cols: str | Iterable[str],
    revenue_cols: str | Iterable[str],
    exposure_cols: str | Iterable[str] | None = None,
    band_col: str = "band",
    ratio_col: str | None = None,
    right: bool = False,
    profile: str | None = None,
) -> pd.DataFrame:
    """Band the rows of ``df`` and summarize experience per band.

    The rows are first banded with :func:`assign_band` (always on a copy, so
    ``df`` is not modified), then passed to
    :func:`~actuarialpy.experience.summarize_experience` grouped by
    ``band_col``. ``expense_cols``, ``revenue_cols``, ``exposure_cols``,
    ``ratio_col`` and ``profile`` are forwarded to that function unchanged, so
    the output columns are whatever it produces.

    The band column of the result is re-cast to an ordered categorical that
    carries every band from the assignment step, and rows are returned sorted
    by band with a fresh integer index.

    NOTE(review): the result is intended to have one row per band including
    empty ones; whether empty bands actually appear depends on how
    ``summarize_experience`` groups categorical keys -- confirm there.
    """
    with_bands = assign_band(
        df,
        value_col,
        bands,
        labels=labels,
        band_col=band_col,
        right=right,
        copy=True,
    )

    experience_options = {
        "groupby": band_col,
        "expense_cols": expense_cols,
        "revenue_cols": revenue_cols,
        "exposure_cols": exposure_cols,
        "ratio_col": ratio_col,
        "profile": profile,
    }
    per_band = summarize_experience(with_bands, **experience_options)

    # Re-impose the full, ordered category list so sorting follows band order
    # rather than label text.
    band_order = list(with_bands[band_col].cat.categories)
    per_band[band_col] = pd.Categorical(
        per_band[band_col],
        categories=band_order,
        ordered=True,
    )
    in_band_order = per_band.sort_values(band_col)
    return in_band_order.reset_index(drop=True)