# Source code for actuarialpy.reserving

"""Reserving and claims-development tools.

Claims-development primitives that sit upstream of experience analysis: development
period measurement, development (completion) triangles, the IBNR identity, and
completion-factor validation.

ActuarialPy keeps factor *estimation* and factor *application* separate. The work of
turning transactional or development data into a triangle, measuring the development
period, and the completed/paid identity lives here, alongside :func:`completion_factors`.
Applying a factor is a single multiplication, but it hinges on a join -- each row's
development period matched to the right factor -- and a factor arriving in an arbitrary
external table can be joined many ways. :func:`apply_completion` therefore commits to one
well-defined contract: factors keyed by development period, each row's development period
taken as ``development_months(incurred, valuation)`` (or an explicit ``development_col``),
joined by value so the frame's index is irrelevant and a convention mismatch surfaces as
``NaN`` rather than silent corruption. Factors from this module's own pipeline satisfy
that contract by construction; estimate them here, then complete in your pipeline or
via ``Experience.complete``.
"""

from __future__ import annotations

import warnings
from dataclasses import dataclass
from typing import Any

import numpy as np
import pandas as pd

from actuarialpy.columns import as_list, grouped_factor_lookup, validate_columns



[docs]
def development_months(incurred_date, valuation_date):
    """Count whole calendar months from the incurred (origin) date to valuation.

    Both arguments go through ``pd.to_datetime`` and may each be a scalar, a Series,
    or array-like, in any mix (for instance a column of incurred dates against one
    valuation date). Only the year and month of each date enter the calculation, so
    the day of month never changes the count; a valuation earlier than the incurred
    date gives a negative number. The result is a Series when either argument is a
    Series, and a scalar when both are scalars.
    """
    origin = pd.to_datetime(incurred_date)
    as_of = pd.to_datetime(valuation_date)

    def _year_and_month(dates):
        # A Series exposes datetime fields through ``.dt``; a Timestamp or a
        # DatetimeIndex exposes them directly on the object.
        fields = getattr(dates, "dt", dates)
        return fields.year, fields.month

    origin_year, origin_month = _year_and_month(origin)
    as_of_year, as_of_month = _year_and_month(as_of)
    year_gap = as_of_year - origin_year
    month_gap = as_of_month - origin_month
    return year_gap * 12 + month_gap



# Backwards-compatible alias: ``lag_months`` is the same function object as
# ``development_months``. "development" is the preferred cross-domain term.
lag_months = development_months



[docs]
def ibnr(completed, paid):
    """Return IBNR, defined by the identity ``completed - paid``.

    The subtraction is applied as-is, so it works element-wise on scalars or Series.
    ``completed`` and ``paid`` must be stated on the same basis; the difference is the
    amount that bridges paid-to-date to the completed (ultimate) figure.
    """
    outstanding = completed - paid
    return outstanding




[docs]
def validate_completion_factors(
    factors: pd.DataFrame, factor_col: str = "completion_factor", *, method: str = "divide"
) -> None:
    """Check that completion-factor values fit the chosen convention.

    Under ``method="divide"`` (completed = paid / factor) every factor is expected to
    satisfy ``0 < factor <= 1``; under ``method="multiply"`` (completed = paid * factor)
    every factor is expected to satisfy ``factor >= 1``. Intended as a sanity check on
    estimated factors before they are applied.

    Checks run in this order: the column exists (via ``validate_columns``), no value is
    missing, ``method`` is recognised, and the values are in range. Returns ``None``
    when everything passes.

    Raises:
        ValueError: if any factor is missing, if ``method`` is neither ``"divide"`` nor
            ``"multiply"``, or if any factor is outside the range for ``method``.
    """
    validate_columns(factors, [factor_col])
    column = factors[factor_col]

    missing_mask = column.isna()
    if missing_mask.any():
        raise ValueError(f"{missing_mask.sum()} completion factors are missing")

    if method not in ("divide", "multiply"):
        raise ValueError("method must be either 'divide' or 'multiply'")

    if method == "divide":
        out_of_range = (column <= 0) | (column > 1)
        complaint = "divide-method completion factors should generally satisfy 0 < factor <= 1"
    else:
        out_of_range = column < 1
        complaint = "multiply-method completion factors should generally satisfy factor >= 1"

    if out_of_range.any():
        raise ValueError(complaint)




[docs]
def make_completion_triangle(
    df: pd.DataFrame,
    *,
    origin_col: str,
    valuation_col: str,
    amount_col: str,
    cumulative: bool = True,
    index_name: str = "origin_period",
    development_name: str = "development_month",
) -> pd.DataFrame:
    """Pivot development data into an origin-by-development (completion) triangle.

    Rows are origin months (``origin_col`` converted to a monthly period); columns are
    development periods, measured in whole months from origin to valuation with
    :func:`development_months`. Each cell is the sum of ``amount_col`` over the input
    rows that fall in that (origin, development period) pair, and ``amount_col`` is
    read as the *incremental* amount for the cell.

    With ``cumulative=True`` (the default, and the usual basis for estimating
    development/completion factors) the cells are accumulated left to right along each
    origin row. Pass ``cumulative=False`` to get the incremental triangle back, or when
    the input amounts are already cumulative-to-date snapshots. Combinations that have
    no input rows come out of the pivot as ``NaN``.

    The input is a compact development aggregate (one row per origin x valuation);
    transaction/line-level data is not required. ``df`` itself is not modified: the
    helper columns are added to a copy.
    """
    validate_columns(df, [origin_col, valuation_col, amount_col])

    # Work on a copy so the two derived key columns never leak into the caller's frame.
    work = df.copy()
    work[index_name] = pd.to_datetime(work[origin_col]).dt.to_period("M")
    work[development_name] = development_months(work[origin_col], work[valuation_col])

    cell_keys = [index_name, development_name]
    cell_totals = work.groupby(cell_keys, dropna=False)[amount_col].sum().reset_index()
    wide = cell_totals.pivot(index=index_name, columns=development_name, values=amount_col)
    wide = wide.sort_index(axis=1)
    return wide.cumsum(axis=1) if cumulative else wide




[docs]
@dataclass(frozen=True)
class ChainLadder:
    """Chain-ladder development pattern fitted from a cumulative triangle.

    Fit with :meth:`fit` from a cumulative development triangle (for example the
    output of :func:`make_completion_triangle` with ``cumulative=True``):

    - ``age_to_age`` -- link (age-to-age) factors, indexed by their starting development period.
    - ``cdf`` -- cumulative development factor to ultimate by development period, including the
      tail.
    - ``completion_factors`` -- ``1 / cdf`` by development period: the proportion of ultimate
      emerged by each development period. These are divide-convention factors in ``(0, 1]``
      (``completed = paid / factor``), so they line up with
      :func:`validate_completion_factors` and downstream completion.

    Use :meth:`project` to apply the pattern to a triangle and get per-origin
    ultimate and IBNR.
    """

    age_to_age: pd.Series
    cdf: pd.Series
    completion_factors: pd.Series
    tail: float
    method: str


[docs]
    @classmethod
    def fit(cls, triangle: pd.DataFrame, *, method: str = "volume", tail: float = 1.0) -> ChainLadder:
        """Estimate the development pattern from a cumulative triangle.

        ``method`` is ``"volume"`` (volume-weighted age-to-age factors, the
        default) or ``"simple"`` (straight average of individual link ratios).
        ``tail`` (>= 1) extends development beyond the latest observed development period.
        """
        if method not in ("volume", "simple"):
            raise ValueError("method must be 'volume' or 'simple'")
        if tail < 1.0:
            raise ValueError("tail must be >= 1.0")
        if not isinstance(triangle, pd.DataFrame):
            raise TypeError("triangle must be a pandas DataFrame")

        tri = triangle.sort_index(axis=1)
        cols = list(tri.columns)
        if len(cols) < 2:
            raise ValueError("triangle must have at least two development periods")
        if tri.shape[0] < 2:
            raise ValueError("triangle must have at least two origin periods")

        # age-to-age (link) factors between each pair of adjacent development periods
        ratios: dict[object, float] = {}
        for start, end in zip(cols[:-1], cols[1:], strict=True):
            pair = tri[[start, end]].dropna()
            if pair.empty:
                raise ValueError(f"no overlapping origins to estimate the {start}->{end} development factor")
            if method == "volume":
                start_sum = float(pair[start].sum())
                if start_sum == 0:
                    raise ValueError(f"zero cumulative at development period {start}; cannot estimate {start}->{end} factor")
                ratios[start] = float(pair[end].sum()) / start_sum
            else:
                ratios[start] = float((pair[end] / pair[start]).mean())
        age_to_age = pd.Series(ratios, name="age_to_age")

        # cumulative development factors to ultimate (with tail), accumulating back
        cdf_vals: dict[object, float] = {cols[-1]: float(tail)}
        running = float(tail)
        for start in reversed(cols[:-1]):
            running *= age_to_age[start]
            cdf_vals[start] = running
        cdf = pd.Series(cdf_vals, name="cdf").reindex(cols)

        completion = (1.0 / cdf).rename("completion_factor")
        return cls(
            age_to_age=age_to_age,
            cdf=cdf,
            completion_factors=completion,
            tail=float(tail),
            method=method,
        )



[docs]
    def project(self, triangle: pd.DataFrame) -> pd.DataFrame:
        """Project ultimate and IBNR per origin by applying the fitted pattern.

        For each origin, takes its latest observed cumulative amount and multiplies
        by the cumulative development factor at that development period. Returns one row per origin
        with the latest development period, latest cumulative, development factor applied,
        ultimate, and IBNR (ultimate minus latest).
        """
        tri = triangle.sort_index(axis=1)
        records: list[dict[str, float]] = []
        origins: list[object] = []
        for origin, row in tri.iterrows():
            observed = row.dropna()
            if observed.empty:
                continue
            latest_development = max(observed.index)
            if latest_development not in self.cdf.index:
                raise ValueError(f"no development factor for development period {latest_development}; fit on a matching triangle")
            latest = float(observed.loc[latest_development])
            factor = float(self.cdf.loc[latest_development])
            ultimate = latest * factor
            origins.append(origin)
            records.append({
                "latest_development": latest_development,
                "latest": latest,
                "development_factor": factor,
                "ultimate": ultimate,
                "ibnr": ultimate - latest,
            })
        return pd.DataFrame.from_records(records, index=pd.Index(origins, name=tri.index.name))





[docs]
def completion_factors(triangle: pd.DataFrame, *, method: str = "volume", tail: float = 1.0) -> pd.Series:
    """Chain-ladder completion factors by development period.

    A thin convenience over :class:`ChainLadder`: fits the pattern to a cumulative
    triangle with the given ``method`` and ``tail`` and returns only its
    ``completion_factors`` -- the proportion of ultimate emerged by each development
    period (``1 / cdf``). These are divide-convention factors in ``(0, 1]``
    (``completed = paid / factor``). Use :class:`ChainLadder` directly for the full
    pattern and per-origin ultimate/IBNR.
    """
    pattern = ChainLadder.fit(triangle, method=method, tail=tail)
    return pattern.completion_factors



def _emerged_factor(
    df: pd.DataFrame,
    factors: pd.Series | pd.DataFrame,
    *,
    date_col: str | None,
    valuation_date: Any,
    development_col: str | None,
    by_cols: list[str],
    factor_col: str,
    development_name: str,
) -> np.ndarray:
    """Look up each row's completion factor (proportion emerged), with the tail rule.

    The development period of a row comes from ``development_col`` when given,
    otherwise from ``development_months(df[date_col], valuation_date)``. The factor is
    then matched by value: a flat Series is mapped on development period, while a
    DataFrame of per-segment factors goes through ``grouped_factor_lookup`` on
    ``by_cols`` plus development period.

    Tail rule: a row whose development period is greater than the largest period of
    the flat Series is set to ``1.0``. In the per-segment case, a row that found no
    factor and lies past its own group's largest period is set to ``1.0``. Rows that
    found no factor for any other reason (an interior gap, or a group absent from the
    table) keep ``NaN``.

    Raises:
        ValueError: if any row has a negative development period.
    """
    if development_col is None:
        # Broadcast the valuation date over the frame's index so the result is a Series.
        valuation = pd.Series(pd.to_datetime(valuation_date), index=df.index)
        periods = development_months(df[date_col], valuation).to_numpy()
    else:
        periods = pd.to_numeric(df[development_col]).to_numpy()
    if (periods < 0).any():
        raise ValueError("Negative development period: some rows have an incurred date after valuation_date.")

    if not isinstance(factors, pd.DataFrame):
        # Flat pattern: one Series indexed by development period.
        last_period = int(pd.Index(factors.index).max())
        emerged = np.array(pd.Series(periods).map(factors), dtype="float64")  # NaN where absent
        emerged[periods > last_period] = 1.0  # beyond the fitted triangle -> complete
        return emerged

    # Per-segment pattern: join on the grouping column(s) plus development period.
    emerged = grouped_factor_lookup(
        df, factors, by_cols, periods, key_col=development_name, factor_col=factor_col
    )
    single_key = len(by_cols) == 1
    group_last = factors.groupby(by_cols[0] if single_key else by_cols)[development_name].max()
    if single_key:
        row_keys = df[by_cols[0]].to_numpy()
    else:
        row_keys = pd.MultiIndex.from_frame(df[by_cols].reset_index(drop=True))
    row_last = group_last.reindex(row_keys).to_numpy()
    # An absent group has a NaN last period, so the comparison is False and it stays NaN.
    past_end = np.isnan(emerged) & (periods > row_last)
    emerged[past_end] = 1.0
    return emerged


def _cape_cod_elr(
    paid: np.ndarray, exposure: np.ndarray, emerged: np.ndarray, df: pd.DataFrame, by_cols: list[str]
) -> np.ndarray:
    """Cape Cod expected loss ratio, ``sum(paid) / sum(exposure * emerged)``, per row.

    This is the Stanard-Buhlmann "used-up premium" loss ratio. With no ``by_cols`` a
    single ratio is computed over the whole frame and repeated for every row; with
    ``by_cols`` one ratio is computed per segment and broadcast back to the rows of
    that segment. A row whose ``exposure * emerged`` is ``NaN`` contributes to neither
    the numerator nor the denominator.

    Returns a float array with one loss ratio per row of ``paid``.
    """
    used_up = exposure * emerged
    usable = ~np.isnan(used_up)
    # Blank out both sides of unusable rows so they drop out of numerator and denominator.
    paid_counted = np.where(usable, paid, np.nan)
    used_counted = np.where(usable, used_up, np.nan)

    if not by_cols:
        overall = np.nansum(paid_counted) / np.nansum(used_counted)
        return np.full(len(paid), overall)

    columns = {"_paid": paid_counted, "_used": used_counted}
    for name in by_cols:
        columns[name] = df[name].to_numpy()
    by_segment = pd.DataFrame(columns).groupby(by_cols, dropna=False)
    # min_count=1 keeps an all-NaN segment as NaN instead of summing it to zero.
    paid_total = by_segment["_paid"].sum(min_count=1)
    used_total = by_segment["_used"].sum(min_count=1)
    segment_elr = paid_total / used_total

    if len(by_cols) == 1:
        return df[by_cols[0]].map(segment_elr).to_numpy(dtype="float64")
    row_keys = pd.MultiIndex.from_frame(df[by_cols].reset_index(drop=True))
    return np.array(segment_elr.reindex(row_keys), dtype="float64")



[docs]
def apply_completion(
    df: pd.DataFrame,
    factors: pd.Series | pd.DataFrame,
    *,
    value_col: str,
    date_col: str | None = None,
    valuation_date: Any = None,
    development_col: str | None = None,
    by: str | list[str] | None = None,
    factor_col: str = "completion_factor",
    development_name: str = "development_month",
    out_col: str | None = None,
    copy: bool = True,
) -> pd.DataFrame:
    """Complete a paid amount to estimated ultimate using completion factors.

    Each row's development period is read from ``development_col`` when supplied, and
    otherwise computed as ``development_months(df[date_col], valuation_date)`` -- the
    same convention :func:`make_completion_triangle` uses, so factors produced by
    :func:`completion_factors` or :func:`completion_factors_by` join by construction.
    The completed amount is ``paid / factor`` (divide convention, factors in ``(0, 1]``).

    ``factors`` is one of:

    - a flat Series indexed by development period (one pattern for the whole frame), or
    - a tidy DataFrame of per-segment factors -- grouping column(s), a development-period
      column (``development_name``) and a factor column (``factor_col``), the shape
      :func:`completion_factors_by` returns -- joined on ``by`` plus development period.
      The table must be unique on ``by + [development]`` (a duplicate would fan out the
      data); this is checked.

    Matching is by value rather than index alignment, so the frame's own index does not
    matter. A row past its (group's) largest development period is treated as fully
    complete (factor ``1.0``); a period inside the fitted range but absent stays ``NaN``
    -- a surfaced gap; a row whose group is missing from the factor table stays ``NaN``;
    a negative development period (incurred after ``valuation_date``) raises.

    The result is written to ``out_col`` (default ``f"{value_col}_completed"``) on a copy
    of ``df``, or on ``df`` itself when ``copy=False``, and that frame is returned.

    Raises:
        ValueError: if neither ``development_col`` nor both ``date_col`` and
            ``valuation_date`` are supplied.
    """
    have_dates = date_col is not None and valuation_date is not None
    if development_col is None and not have_dates:
        raise ValueError(
            "Provide development_col, or both date_col and valuation_date, to determine each row's development period."
        )

    by_cols = as_list(by)
    period_source = date_col if development_col is None else development_col
    validate_columns(df, [value_col, period_source, *by_cols])

    result = df.copy() if copy else df
    emerged = _emerged_factor(
        result,
        factors,
        date_col=date_col,
        valuation_date=valuation_date,
        development_col=development_col,
        by_cols=by_cols,
        factor_col=factor_col,
        development_name=development_name,
    )
    target = out_col or f"{value_col}_completed"
    result[target] = result[value_col].to_numpy() / emerged
    return result




[docs]
def develop_ultimate(
    df: pd.DataFrame,
    factors: pd.Series | pd.DataFrame,
    *,
    method: str = "bornhuetter_ferguson",
    value_col: str,
    date_col: str | None = None,
    valuation_date: Any = None,
    development_col: str | None = None,
    apriori_col: str | None = None,
    exposure_col: str | None = None,
    by: str | list[str] | None = None,
    factor_col: str = "completion_factor",
    development_name: str = "development_month",
    out_col: str | None = None,
    copy: bool = True,
) -> pd.DataFrame:
    """Develop a paid amount to estimated ultimate with a selected reserving method.

    Every method starts from the same quantity -- the proportion emerged at each row's
    development period, joined exactly as :func:`apply_completion` joins it (flat Series
    or per-segment table, rows beyond the triangle fully emerged). The methods differ
    only in how that proportion is combined with paid-to-date and an *a priori*
    expectation:

    - ``"chain_ladder"`` -- ``paid / emerged``. Uses no a priori; equivalent to
      :func:`apply_completion`. Volatile for immature periods (a thin latest diagonal
      drives the whole tail).
    - ``"bornhuetter_ferguson"`` -- ``paid + apriori * (1 - emerged)``. The unemerged
      part comes from the a priori rather than the data, which keeps green periods
      stable. Requires ``apriori_col`` (an expected ultimate per row -- an input, e.g. a
      plan, budget, or manual times exposure).
    - ``"benktander"`` -- a second Bornhuetter-Ferguson pass that uses the BF ultimate
      as the a priori: ``paid + bf * (1 - emerged)``. A credibility blend between BF and
      chain ladder (weight ``emerged`` on chain ladder). Requires ``apriori_col``.
    - ``"cape_cod"`` -- Bornhuetter-Ferguson with a data-derived a priori: one expected
      loss ratio per segment, ``sum(paid) / sum(exposure * emerged)``, times each row's
      exposure. Requires ``exposure_col`` (an on-level premium / exposure per row). The
      loss ratio is mechanical; the exposure base is an input.

    The library applies the method; choosing the a priori or the exposure base is up to
    the caller. Supply either ``development_col`` or both ``date_col`` and
    ``valuation_date``; pass ``by`` together with a per-segment factor table (Cape Cod
    then derives one loss ratio per segment). The result is written to ``out_col``
    (default ``f"{value_col}_ultimate"``) on a copy of ``df``, or on ``df`` itself when
    ``copy=False``, and that frame is returned.

    Raises:
        ValueError: for an unknown ``method``, when the development period cannot be
            determined from the arguments, or when the column a method requires
            (``apriori_col`` / ``exposure_col``) is not given.
    """
    methods = {"chain_ladder", "bornhuetter_ferguson", "benktander", "cape_cod"}
    if method not in methods:
        raise ValueError(f"method must be one of {sorted(methods)}; got {method!r}.")
    have_dates = date_col is not None and valuation_date is not None
    if development_col is None and not have_dates:
        raise ValueError(
            "Provide development_col, or both date_col and valuation_date, to determine each row's development period."
        )

    by_cols = as_list(by)
    period_source = date_col if development_col is None else development_col
    required = [value_col, period_source, *by_cols]
    uses_apriori = method in ("bornhuetter_ferguson", "benktander")
    if uses_apriori:
        if apriori_col is None:
            raise ValueError(f"method={method!r} requires apriori_col (an expected ultimate per row).")
        required.append(apriori_col)
    if method == "cape_cod":
        if exposure_col is None:
            raise ValueError("method='cape_cod' requires exposure_col (an on-level premium / exposure per row).")
        required.append(exposure_col)
    validate_columns(df, required)

    result = df.copy() if copy else df
    emerged = _emerged_factor(
        result,
        factors,
        date_col=date_col,
        valuation_date=valuation_date,
        development_col=development_col,
        by_cols=by_cols,
        factor_col=factor_col,
        development_name=development_name,
    )
    paid = result[value_col].to_numpy(dtype="float64")
    unemerged = 1.0 - emerged

    if method == "chain_ladder":
        ultimate = paid / emerged
    elif uses_apriori:
        apriori = result[apriori_col].to_numpy(dtype="float64")
        ultimate = paid + apriori * unemerged
        if method == "benktander":
            # Second pass: the BF ultimate becomes the a priori.
            ultimate = paid + ultimate * unemerged
    else:  # cape_cod
        exposure = result[exposure_col].to_numpy(dtype="float64")
        loss_ratio = _cape_cod_elr(paid, exposure, emerged, result, by_cols)
        ultimate = paid + exposure * loss_ratio * unemerged

    result[out_col or f"{value_col}_ultimate"] = ultimate
    return result




[docs]
class InsufficientDataWarning(UserWarning):
    """Warning category for segments with too little data to fit.

    Emitted when such a segment is skipped or given the aggregate pattern instead of
    its own fit. As a ``UserWarning`` subclass it can be filtered with the standard
    :mod:`warnings` machinery, e.g.
    ``warnings.filterwarnings("ignore", category=InsufficientDataWarning)``.
    """




[docs]
def chain_ladder_by(
    df: pd.DataFrame,
    *,
    groupby: str | list[str],
    origin_col: str,
    valuation_col: str,
    amount_col: str,
    cumulative: bool = True,
    method: str = "volume",
    tail: float = 1.0,
    on_insufficient: str = "raise",
    warn: bool = True,
) -> dict[Any, ChainLadder]:
    """Fit one chain-ladder development pattern for each segment of ``df``.

    ``df`` is split by ``groupby``; for every segment a development triangle is built
    (see :func:`make_completion_triangle`) and a :class:`ChainLadder` is fitted to it.
    The result is ``{segment_key: ChainLadder}``, where the key is a scalar for a single
    grouping column and a tuple for several.

    A segment whose fit raises ``ValueError`` (fewer than two origins or development
    periods, a zero cumulative, and so on) is treated according to ``on_insufficient``:

    - ``"raise"`` (default): raise a ``ValueError`` naming the failing segment.
    - ``"skip"``: leave the segment out of the result.
    - ``"aggregate"``: give the segment the pooled pattern fitted on the whole frame.

    With ``"skip"`` or ``"aggregate"``, an :class:`InsufficientDataWarning` listing the
    affected segments is emitted when ``warn`` is true; ``warn=False`` suppresses it
    (the standard :mod:`warnings` filters apply as well). To drop thin segments
    silently, use ``on_insufficient="skip", warn=False``.

    Raises:
        ValueError: for an unknown ``on_insufficient``, an empty ``groupby``, a segment
            that cannot be fitted under ``"raise"``, or an aggregate fallback pattern
            that cannot be fitted under ``"aggregate"``.
    """
    if on_insufficient not in ("raise", "skip", "aggregate"):
        raise ValueError("on_insufficient must be 'raise', 'skip', or 'aggregate'")
    group_cols = as_list(groupby)
    if not group_cols:
        raise ValueError("groupby must name at least one column")
    validate_columns(df, group_cols + [origin_col, valuation_col, amount_col])

    def fit_rows(rows: pd.DataFrame) -> ChainLadder:
        # Triangle first, then the pattern, with the caller's settings throughout.
        return ChainLadder.fit(
            make_completion_triangle(
                rows,
                origin_col=origin_col,
                valuation_col=valuation_col,
                amount_col=amount_col,
                cumulative=cumulative,
            ),
            method=method,
            tail=tail,
        )

    # The pooled fallback is fitted up front so a bad whole-frame fit fails early.
    fallback: ChainLadder | None = None
    if on_insufficient == "aggregate":
        try:
            fallback = fit_rows(df)
        except ValueError as exc:
            raise ValueError(f"cannot fit the aggregate fallback pattern: {exc}") from exc

    grouping = group_cols[0] if len(group_cols) == 1 else group_cols
    fitted: dict[Any, ChainLadder] = {}
    thin_segments: list[Any] = []
    for segment, rows in df.groupby(grouping, sort=True):
        try:
            pattern = fit_rows(rows)
        except ValueError as exc:
            if on_insufficient == "raise":
                raise ValueError(f"segment {segment!r} cannot be fit: {exc}") from exc
            thin_segments.append(segment)
            if fallback is not None:
                fitted[segment] = fallback
        else:
            fitted[segment] = pattern

    if warn and thin_segments:
        if on_insufficient == "aggregate":
            action = "using the aggregate pattern for"
        else:
            action = "skipping"
        warnings.warn(
            f"{action} {len(thin_segments)} segment(s) with insufficient data: {thin_segments}",
            InsufficientDataWarning,
            stacklevel=2,
        )
    return fitted




[docs]
def completion_factors_by(
    df: pd.DataFrame,
    *,
    groupby: str | list[str],
    origin_col: str,
    valuation_col: str,
    amount_col: str,
    cumulative: bool = True,
    method: str = "volume",
    tail: float = 1.0,
    on_insufficient: str = "raise",
    warn: bool = True,
    development_name: str = "development_month",
) -> pd.DataFrame:
    """Per-segment completion factors laid out as a tidy table.

    Fits the segments with :func:`chain_ladder_by` and flattens the fitted patterns to
    one row per (segment, development period), ready to review, pivot, or join. The
    columns are the grouping column(s), ``development_name``, and
    ``completion_factor``. ``on_insufficient`` and ``warn`` are passed through to
    :func:`chain_ladder_by` unchanged. When no segment yields a pattern, an empty frame
    with those columns is returned.
    """
    group_cols = as_list(groupby)
    patterns = chain_ladder_by(
        df,
        groupby=groupby,
        origin_col=origin_col,
        valuation_col=valuation_col,
        amount_col=amount_col,
        cumulative=cumulative,
        method=method,
        tail=tail,
        on_insufficient=on_insufficient,
        warn=warn,
    )

    def _rows():
        for segment, pattern in patterns.items():
            # Single-column groupings give scalar keys; wrap them to zip against group_cols.
            labels = segment if isinstance(segment, tuple) else (segment,)
            segment_fields = dict(zip(group_cols, labels, strict=True))
            for period, value in pattern.completion_factors.items():
                yield {**segment_fields, development_name: period, "completion_factor": float(value)}

    records: list[dict[str, Any]] = list(_rows())
    if records:
        return pd.DataFrame.from_records(records)
    return pd.DataFrame(columns=group_cols + [development_name, "completion_factor"])