Source code for actuarialpy.experience

"""Grouped actuarial experience summaries."""

from __future__ import annotations

from collections.abc import Iterable

import pandas as pd

from actuarialpy.columns import as_list, is_date_like, per_exposure_name, sum_columns, validate_columns
from actuarialpy.metrics import loss_ratio, per_exposure
from actuarialpy.profiles import apply_profile_labels, get_profile_defaults

_ID_LIKE_EXPOSURE_NAMES = {"member_id", "subscriber_id", "group_id", "employee_id", "policy_id", "claim_id"}


def _validate_exposures(exposures: list[str]) -> None:
    bad = [col for col in exposures if col.lower() in _ID_LIKE_EXPOSURE_NAMES or col.lower().endswith("_id")]
    if bad:
        raise ValueError(
            "Exposure columns must be numeric exposure measures, not identifiers. "
            f"Invalid exposure column(s): {bad}. For member-level monthly data, create a "
            "member_months column equal to 1 and use exposure_cols='member_months'."
        )


def _order_summary_columns(
    summary: pd.DataFrame,
    *,
    groups: list[str],
    expenses: list[str],
    revenues: list[str],
    exposures: list[str],
    total_expense_name: str,
    total_revenue_name: str,
    ratio_col: str,
    expense_per_names: list[str],
    revenue_per_names: list[str],
) -> pd.DataFrame:
    """Reorder summary columns into a consistent, readable layout.

    Order: date-like grouping columns, then other grouping columns, then exposure
    (volume), then the full expense block (components, total, then per-exposure rates),
    then the full revenue block, and finally the ratio. This keeps each total next to
    its own per-exposure rate (e.g. total expense beside expense PMPM) and is identical
    across every view. Any unexpected columns are appended rather than dropped.
    """
    date_groups = [g for g in groups if is_date_like(summary[g], g)]
    other_groups = [g for g in groups if g not in date_groups]
    expense_block = list(expenses) + [total_expense_name] + list(expense_per_names)
    revenue_block = list(revenues) + [total_revenue_name] + list(revenue_per_names)
    preferred = (
        date_groups + other_groups + list(exposures)
        + expense_block + revenue_block + [ratio_col]
    )

    seen: set[str] = set()
    ordered: list[str] = []
    for col in preferred:
        if col in summary.columns and col not in seen:
            seen.add(col)
            ordered.append(col)
    for col in summary.columns:  # preserve anything not explicitly ordered
        if col not in seen:
            seen.add(col)
            ordered.append(col)
    return summary[ordered]


[docs] def summarize_experience( df: pd.DataFrame, *, groupby: str | Iterable[str] | None = None, expense_cols: str | Iterable[str], revenue_cols: str | Iterable[str], exposure_cols: str | Iterable[str] | None = None, ratio_col: str | None = None, ratio_name: str | None = None, total_expense_name: str = "total_expense", total_revenue_name: str = "total_revenue", profile: str | None = None, labels: dict[str, str] | None = None, ) -> pd.DataFrame: """Summarize experience by grouping columns. Amounts and exposures are aggregated first. Ratios and per-exposure metrics are calculated after aggregation, which avoids averaging row-level ratios. By default the ratio column is named ``loss_ratio`` (general across lines of business); the ``health`` profile names it ``mlr`` and ``life`` ``benefit_ratio``. ``profile`` only supplies light defaults and does not rename total expense or total revenue. """ groups = as_list(groupby) expenses = as_list(expense_cols) revenues = as_list(revenue_cols) exposures = as_list(exposure_cols) _validate_exposures(exposures) validate_columns(df, groups + expenses + revenues + exposures) if ratio_col is not None and ratio_name is not None: raise ValueError("Specify only one of ratio_col or ratio_name.") if ratio_name is not None: ratio_col = ratio_name if ratio_col is None: ratio_col = get_profile_defaults(profile).get("ratio_col", "loss_ratio") amount_cols = list(dict.fromkeys(expenses + revenues + exposures)) if groups: summary = df[groups + amount_cols].groupby(groups, dropna=False, as_index=False).sum(numeric_only=True) else: summary = pd.DataFrame({col: [df[col].sum()] for col in amount_cols}) summary[total_expense_name] = sum_columns(summary, expenses) summary[total_revenue_name] = sum_columns(summary, revenues) summary[ratio_col] = loss_ratio(summary[total_expense_name], summary[total_revenue_name]) expense_per_names: list[str] = [] revenue_per_names: list[str] = [] for exposure in exposures: expense_per = per_exposure_name(total_expense_name, exposure) revenue_per = per_exposure_name(total_revenue_name, exposure) summary[expense_per] = per_exposure(summary[total_expense_name], summary[exposure]) summary[revenue_per] = per_exposure(summary[total_revenue_name], summary[exposure]) expense_per_names.append(expense_per) revenue_per_names.append(revenue_per) summary = _order_summary_columns( summary, groups=groups, expenses=expenses, revenues=revenues, exposures=exposures, total_expense_name=total_expense_name, total_revenue_name=total_revenue_name, ratio_col=ratio_col, expense_per_names=expense_per_names, revenue_per_names=revenue_per_names, ) return apply_profile_labels(summary, profile=profile, labels=labels)
[docs] def summarize_views( df: pd.DataFrame, *, views: dict[str, str | Iterable[str] | None], expense_cols: str | Iterable[str], revenue_cols: str | Iterable[str], exposure_cols: str | Iterable[str] | None = None, ratio_col: str | None = None, ratio_name: str | None = None, total_expense_name: str = "total_expense", total_revenue_name: str = "total_revenue", profile: str | None = None, ) -> dict[str, pd.DataFrame]: """Create multiple experience summary views from the same input data.""" return { name: summarize_experience( df, groupby=groupby, expense_cols=expense_cols, revenue_cols=revenue_cols, exposure_cols=exposure_cols, ratio_col=ratio_col, ratio_name=ratio_name, total_expense_name=total_expense_name, total_revenue_name=total_revenue_name, profile=profile, ) for name, groupby in views.items() }
[docs] def status_summary( df: pd.DataFrame, *, status_col: str, entity_col: str | None = None, expense_cols: str | Iterable[str], revenue_cols: str | Iterable[str], exposure_cols: str | Iterable[str] | None = None, profile: str | None = None, ) -> pd.DataFrame: """Summarize experience by status, optionally adding entity counts.""" validate_columns(df, [status_col] + ([entity_col] if entity_col else [])) summary = summarize_experience( df, groupby=status_col, expense_cols=expense_cols, revenue_cols=revenue_cols, exposure_cols=exposure_cols, profile=profile, ) if entity_col: counts = df.groupby(status_col, dropna=False)[entity_col].nunique().reset_index(name="entity_count") summary = counts.merge(summary, on=status_col, how="right") return summary