Source code for actuarialpy.claimants

"""Claimant and large-risk concentration summaries."""

from __future__ import annotations

from collections.abc import Iterable, Sequence

import pandas as pd

from actuarialpy.columns import as_list, per_exposure_name, sum_columns, validate_columns
from actuarialpy.metrics import per_exposure, safe_divide


def summarize_claimants(
    df: pd.DataFrame,
    *,
    claimant_col: str,
    amount_cols: str | Iterable[str],
    groupby: str | Iterable[str] | None = None,
    exposure_col: str | None = None,
    amount_name: str = "total_expense",
) -> pd.DataFrame:
    """Roll experience rows up to one row per claimant (within each group).

    ``claimant_col`` identifies the entity being summarized: a member ID, a
    policy ID, a claim group ID, or any other identifier. This is a purely
    descriptive aggregation; amounts are not capped, pooled, or adjusted.

    Args:
        df: Row-level experience data.
        claimant_col: Column identifying the claimant/member/risk.
        amount_cols: One or more amount columns to sum per claimant.
        groupby: Optional extra grouping column(s) placed before
            ``claimant_col`` in the grouping key.
        exposure_col: Optional exposure column; when given (and non-empty) it
            is summed per claimant and a per-exposure column is added.
        amount_name: Name of the combined-amount column in the result.

    Returns:
        A new DataFrame with the grouping keys, the summed amount (and
        exposure) columns, ``amount_name``, and, when ``exposure_col`` is
        given, a per-exposure column named by ``per_exposure_name``.
    """
    group_keys = as_list(groupby)
    amount_fields = as_list(amount_cols)
    exposure_fields = [exposure_col] if exposure_col else []
    entity_keys = group_keys + [claimant_col]

    validate_columns(df, entity_keys + amount_fields + exposure_fields)

    # dict.fromkeys de-duplicates while keeping first-seen order, so a column
    # listed both as an amount and as the exposure is only selected once.
    value_fields = list(dict.fromkeys(amount_fields + exposure_fields))

    # dropna=False keeps rows whose grouping keys are missing.
    grouped = df[entity_keys + value_fields].groupby(
        entity_keys, dropna=False, as_index=False
    )
    summary = grouped.sum(numeric_only=True)

    # sum_columns is a project helper; presumably a row-wise total of the
    # listed columns (confirm against actuarialpy.columns).
    summary[amount_name] = sum_columns(summary, amount_fields)

    if not exposure_col:
        return summary

    rate = per_exposure(summary[amount_name], summary[exposure_col])
    summary[per_exposure_name(amount_name, exposure_col)] = rate
    return summary
def top_claimants(
    df: pd.DataFrame,
    *,
    claimant_col: str,
    amount_cols: str | Iterable[str] | None = None,
    amount_col: str | None = None,
    groupby: str | Iterable[str] | None = None,
    n: int = 25,
    amount_name: str = "total_expense",
) -> pd.DataFrame:
    """Return the top ``n`` claimants by amount, optionally within each group.

    Two input modes are supported:

    * ``amount_col`` given: ``df`` is used as-is (no aggregation), so it
      should already hold one row per claimant. ``amount_cols`` is ignored.
    * ``amount_col`` omitted: ``df`` is first aggregated with
      ``summarize_claimants`` using ``amount_cols``, and ranking is done on
      the resulting ``amount_name`` column.

    Args:
        df: Input data.
        claimant_col: Column identifying the claimant/member/risk.
        amount_cols: Amount column(s) to aggregate when ``amount_col`` is
            not supplied.
        amount_col: Pre-aggregated amount column to rank on.
        groupby: Optional grouping column(s); ranking, totals and cumulative
            shares are then computed separately within each group.
        n: Number of claimants to keep (per group when grouped).
        amount_name: Name of the combined-amount column produced in
            aggregation mode.

    Returns:
        A DataFrame sorted by group (ascending) and amount (descending) with
        added columns ``rank`` (1-based position in that order),
        ``share_of_total`` (amount divided by the total over *all* rows of
        the group or frame, not only the kept ones) and ``cumulative_share``
        (running sum of ``share_of_total`` over the kept rows).

    Raises:
        ValueError: If ``n`` is not positive, or if neither ``amount_col``
            nor ``amount_cols`` is supplied.
    """
    if n <= 0:
        raise ValueError("n must be positive")

    groups = as_list(groupby)
    if amount_col is None:
        if amount_cols is None:
            raise ValueError("Pass either amount_col or amount_cols.")
        base = summarize_claimants(
            df,
            claimant_col=claimant_col,
            amount_cols=amount_cols,
            groupby=groups,
            amount_name=amount_name,
        )
        amount_col = amount_name
    else:
        validate_columns(df, groups + [claimant_col, amount_col])
        base = df[groups + [claimant_col, amount_col]].copy()

    # A stable sort keeps tied amounts in input order, so ranks are
    # reproducible. Missing amounts sort last (pandas default).
    ascending = [True] * len(groups) + [False]
    base = base.sort_values(
        groups + [amount_col], ascending=ascending, kind="mergesort"
    )

    if groups:
        # Rank by position within each sorted group. Unlike
        # ``rank(...).astype(int)`` this does not fail when an amount is NaN,
        # and it matches how the ungrouped branch below assigns ranks.
        base["rank"] = base.groupby(groups, dropna=False).cumcount() + 1
        totals = (
            base.groupby(groups, dropna=False)[amount_col]
            .sum()
            .reset_index(name="_group_total")
        )
        base = base.merge(totals, on=groups, how="left")
        base = base[base["rank"] <= n].copy()
        base["share_of_total"] = safe_divide(base[amount_col], base["_group_total"])
        base["cumulative_share"] = base.groupby(groups, dropna=False)[
            "share_of_total"
        ].cumsum()
        return base.drop(columns=["_group_total"])

    base["rank"] = range(1, len(base) + 1)
    # The total is taken before truncating so shares are relative to everyone.
    total = base[amount_col].sum()
    base = base[base["rank"] <= n].copy()
    base["share_of_total"] = safe_divide(base[amount_col], total)
    base["cumulative_share"] = base["share_of_total"].cumsum()
    return base
def large_claimant_flags(
    df: pd.DataFrame,
    *,
    amount_col: str = "total_expense",
    thresholds: Sequence[float] = (50_000, 100_000, 250_000),
) -> pd.DataFrame:
    """Add one boolean flag column per threshold to a copy of ``df``.

    A row is flagged when its ``amount_col`` value is at or above the
    threshold (the comparison is ``>=``). Columns are named
    ``is_over_<label>``: whole-number thresholds use their integer form
    (``is_over_50000``); other thresholds use their string form with ``.``
    replaced by ``_`` (``is_over_2_5``).

    Args:
        df: Input data; it is not modified.
        amount_col: Column holding the amount compared to each threshold.
        thresholds: Amount cut-offs to flag.

    Returns:
        A copy of ``df`` with the flag columns added.
    """
    validate_columns(df, [amount_col])

    def _suffix(cutoff: float) -> str:
        # Whole numbers drop the trailing ".0"; fractions swap "." for "_".
        if float(cutoff).is_integer():
            return str(int(cutoff))
        return str(cutoff).replace(".", "_")

    flagged = df.copy()
    for cutoff in thresholds:
        column = f"is_over_{_suffix(cutoff)}"
        flagged[column] = flagged[amount_col] >= cutoff
    return flagged
def claim_concentration(
    df: pd.DataFrame,
    *,
    amount_col: str = "total_expense",
    groupby: str | Iterable[str] | None = None,
    top_n: Sequence[int] = (10, 25),
    thresholds: Sequence[float] = (50_000, 100_000, 250_000),
) -> pd.DataFrame:
    """Summarize how concentrated total amounts are among top claimants.

    The input should generally be one row per claimant within the requested
    grouping level, such as the output of ``summarize_claimants``; rows are
    not aggregated here.

    Args:
        df: Claimant-level data.
        amount_col: Column holding each claimant's amount.
        groupby: Optional grouping column(s); one output row per group.
            Rows with missing group keys are kept as their own group.
        top_n: For each ``n``, report the amount and share of the ``n``
            largest rows (``top_<n>_amount``, ``top_<n>_share``).
        thresholds: For each threshold, report the count, amount and share of
            rows at or above it (``count_over_<label>``,
            ``amount_over_<label>``, ``share_over_<label>``). Whole-number
            thresholds are labelled by their integer form; others by their
            string form with ``.`` replaced by ``_``.

    Returns:
        A DataFrame with one row overall, or one row per group in order of
        first appearance. Columns are the group keys (if any),
        ``claimant_count``, ``total_amount`` and the ``top_n``/threshold
        metrics. A grouped call on empty input returns an empty frame that
        still carries these columns.
    """
    groups = as_list(groupby)
    validate_columns(df, groups + [amount_col])

    # Labels and sizes do not depend on the group, so build them once.
    sizes = list(top_n)
    labelled_thresholds = [
        (
            str(int(threshold))
            if float(threshold).is_integer()
            else str(threshold).replace(".", "_"),
            threshold,
        )
        for threshold in thresholds
    ]

    def summarize(part: pd.DataFrame) -> dict[str, float]:
        # Only the amount column matters; sort it largest-first once.
        amounts = part[amount_col].sort_values(ascending=False)
        total = amounts.sum()
        row: dict[str, float] = {
            "claimant_count": len(amounts),
            "total_amount": total,
        }
        for n in sizes:
            top_amount = amounts.head(n).sum()
            row[f"top_{n}_amount"] = top_amount
            row[f"top_{n}_share"] = safe_divide(top_amount, total)
        for label, threshold in labelled_thresholds:
            mask = amounts >= threshold
            threshold_amount = amounts[mask].sum()
            row[f"count_over_{label}"] = int(mask.sum())
            row[f"amount_over_{label}"] = threshold_amount
            row[f"share_over_{label}"] = safe_divide(threshold_amount, total)
        return row

    if not groups:
        return pd.DataFrame([summarize(df)])

    rows = []
    for keys, part in df.groupby(groups, dropna=False, sort=False):
        # Depending on the pandas version a single grouping key may be
        # yielded as a scalar rather than a 1-tuple.
        if not isinstance(keys, tuple):
            keys = (keys,)
        rows.append({**dict(zip(groups, keys, strict=True)), **summarize(part)})

    if not rows:
        # No groups means the input had no rows. Return the same columns a
        # non-empty call would produce instead of a column-less frame.
        columns = list(dict.fromkeys(groups + list(summarize(df.head(0)))))
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(rows)