Source code for actuarialpy.cohorts

"""Cohort and duration summaries."""

from __future__ import annotations

from collections.abc import Iterable

import pandas as pd

from actuarialpy.columns import as_list, validate_columns
from actuarialpy.experience import summarize_experience
from actuarialpy.periods import add_duration_column, add_period_column


def cohort_summary(
    df: pd.DataFrame,
    *,
    entity_col: str,
    date_col: str,
    start_date_col: str,
    duration_months: int = 12,
    groupby: str | Iterable[str] | None = None,
    expense_cols: str | Iterable[str],
    revenue_cols: str | Iterable[str],
    exposure_cols: str | Iterable[str] | None = None,
    profile: str | None = None,
) -> pd.DataFrame:
    """Summarize the first ``duration_months`` months of each entity.

    Rows are restricted to duration months ``1..duration_months`` counted
    from each entity's own start date (month 1 is the start month), so
    entities are compared by tenure rather than by calendar time.

    Besides the experience metrics, the result carries coverage columns
    that make partial (not-yet-mature) cohorts easy to spot and exclude:

    - ``months_observed``: number of distinct duration months present.
    - ``last_month``: latest experience month observed; together with
      ``first_month`` it gives the available range.
    - ``complete``: ``True`` when ``months_observed == duration_months``.

    For example, to keep only cohorts with a full first year::

        cohorts = exp.cohort(entity_col="group", start_date_col="effective_date")
        mature = cohorts[cohorts["complete"]]

    Args:
        df: Experience rows.
        entity_col: Column identifying the entity.
        date_col: Column holding the experience date of each row.
        start_date_col: Column holding each entity's start date.
        duration_months: Length of the tenure window, in months.
        groupby: Extra grouping column(s) appended to the cohort keys.
        expense_cols: Passed through to ``summarize_experience``.
        revenue_cols: Passed through to ``summarize_experience``.
        exposure_cols: Passed through to ``summarize_experience``.
        profile: Passed through to ``summarize_experience``.

    Returns:
        One row per cohort key, ordered as: grouping keys
        (``entity_col``, ``first_month``, ``cohort_year``, then ``groupby``),
        the three coverage columns, then the remaining metric columns.
    """
    extra_keys = as_list(groupby)
    validate_columns(df, [entity_col, date_col, start_date_col] + extra_keys)

    with_duration = add_duration_column(
        df,
        start_date_col,
        date_col,
        duration_col="duration_month",
        one_based=True,
    )

    # Keep only rows inside the tenure window 1..duration_months.
    at_or_after_start = with_duration["duration_month"] >= 1
    within_horizon = with_duration["duration_month"] <= duration_months
    windowed = with_duration[at_or_after_start & within_horizon].copy()

    # Cohort labels are both derived from the entity start date.
    start_dates = pd.to_datetime(windowed[start_date_col])
    windowed["first_month"] = start_dates.dt.to_period("M")
    windowed["cohort_year"] = start_dates.dt.year

    group_keys = [entity_col, "first_month", "cohort_year"] + extra_keys

    metrics = summarize_experience(
        windowed,
        groupby=group_keys,
        expense_cols=expense_cols,
        revenue_cols=revenue_cols,
        exposure_cols=exposure_cols,
        profile=profile,
    )

    # Coverage: how much of the window each cohort key actually has.
    grouped = windowed.groupby(group_keys, dropna=False)
    coverage = grouped.agg(
        months_observed=("duration_month", "nunique"),
        last_month=(date_col, "max"),
    ).reset_index()
    coverage["last_month"] = pd.to_datetime(coverage["last_month"]).dt.to_period("M")
    coverage["complete"] = coverage["months_observed"] == duration_months

    merged = metrics.merge(coverage, on=group_keys, how="left")

    # Keys first, then coverage, then every remaining metric column.
    leading_cols = group_keys + ["months_observed", "last_month", "complete"]
    metric_cols = [col for col in merged.columns if col not in leading_cols]
    return merged[leading_cols + metric_cols]
def cohort_summary_by_period(
    cohort_df: pd.DataFrame,
    *,
    cohort_date_col: str = "first_month",
    freq: str = "Q",
    entity_col: str | None = None,
    expense_col: str = "total_expense",
    revenue_col: str = "total_revenue",
    exposure_cols: str | Iterable[str] | None = None,
) -> pd.DataFrame:
    """Roll entity-level cohort summaries into cohort month/quarter/year buckets.

    Args:
        cohort_df: Entity-level cohort rows; the input frame is copied, not
            modified.
        cohort_date_col: Column holding the cohort date or period.
        freq: Target period frequency for the ``cohort_period`` bucket.
        entity_col: Optional entity identifier. When given, an
            ``entity_count`` column with the number of distinct entities per
            ``cohort_period`` is added.
        expense_col: Expense column passed to ``summarize_experience``.
        revenue_col: Revenue column passed to ``summarize_experience``.
        exposure_cols: Exposure column(s) passed to ``summarize_experience``.

    Returns:
        The ``summarize_experience`` result grouped by ``cohort_period``;
        when ``entity_col`` is given, ``cohort_period`` and ``entity_count``
        come first.

    Raises:
        ValueError: If ``cohort_date_col`` is not a column of ``cohort_df``.
    """
    if cohort_date_col not in cohort_df.columns:
        raise ValueError(f"Missing required column: {cohort_date_col}")

    temp = cohort_df.copy()

    # Decide by column dtype rather than by peeking at the first value:
    # ``.iloc[0]`` raises IndexError on an empty frame, and a leading NaT in
    # a period-dtype column is not a ``pd.Period`` instance, which would send
    # a genuine period column down the wrong branch.
    if isinstance(temp[cohort_date_col].dtype, pd.PeriodDtype):
        temp["cohort_period"] = temp[cohort_date_col].dt.asfreq(freq)
    else:
        temp = add_period_column(temp, cohort_date_col, freq, "cohort_period", copy=False)

    exposures = as_list(exposure_cols)
    summary = summarize_experience(
        temp,
        groupby="cohort_period",
        expense_cols=expense_col,
        revenue_cols=revenue_col,
        exposure_cols=exposures,
    )

    if entity_col:
        counts = (
            temp.groupby("cohort_period", dropna=False)[entity_col]
            .nunique()
            .reset_index(name="entity_count")
        )
        # Right merge keeps exactly the rows of the experience summary while
        # putting the count columns first.
        summary = counts.merge(summary, on="cohort_period", how="right")

    return summary
def duration_summary(
    df: pd.DataFrame,
    *,
    entity_col: str,
    date_col: str,
    start_date_col: str,
    expense_cols: str | Iterable[str],
    revenue_cols: str | Iterable[str],
    exposure_cols: str | Iterable[str] | None = None,
    max_duration_month: int | None = None,
) -> pd.DataFrame:
    """Summarize experience by duration month since entity start.

    Rows get a one-based ``duration_month``; rows with a duration month
    below 1 are dropped, and the remaining rows are summarized per
    duration month.

    Args:
        df: Experience rows.
        entity_col: Entity identifier. Accepted as part of the signature;
            this function's body does not reference it.
        date_col: Column holding the experience date of each row.
        start_date_col: Column holding each entity's start date.
        expense_cols: Passed through to ``summarize_experience``.
        revenue_cols: Passed through to ``summarize_experience``.
        exposure_cols: Passed through to ``summarize_experience``.
        max_duration_month: Optional inclusive upper bound on the duration
            month; ``None`` keeps every duration month from 1 upward.

    Returns:
        The ``summarize_experience`` result grouped by ``duration_month``.
    """
    with_duration = add_duration_column(
        df,
        start_date_col,
        date_col,
        duration_col="duration_month",
        one_based=True,
    )

    # Discard rows with a duration month below 1.
    started = with_duration["duration_month"] >= 1
    scoped = with_duration[started].copy()

    if max_duration_month is not None:
        within_cap = scoped["duration_month"] <= max_duration_month
        scoped = scoped[within_cap]

    return summarize_experience(
        scoped,
        groupby="duration_month",
        expense_cols=expense_cols,
        revenue_cols=revenue_cols,
        exposure_cols=exposure_cols,
    )