Source code for ncaa_eval.transform.sequential

"""Sequential feature transformations for NCAA basketball game data.

Provides rolling windows, EWMA, momentum, streak, per-possession, and
Four Factor features computed from chronologically ordered game data.

* :class:`DetailedResultsLoader` — loads box-score CSVs and provides
  per-team, per-season game views in long format.
* :class:`SequentialTransformer` — orchestrates all sequential feature
  computation steps in temporal order without data leakage.

Design invariants:
- No imports from ``ncaa_eval.ingest`` — pure CSV-loading transform layer.
- No ``df.iterrows()`` — vectorized pandas operations throughout.
- ``mypy --strict`` compliant: all types fully annotated.
- No hardcoded data paths — accept Path parameters.
"""

from __future__ import annotations

import logging
from pathlib import Path

import pandas as pd  # type: ignore[import-untyped]

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Module-level constants
# ---------------------------------------------------------------------------

_COUNTING_STATS: tuple[str, ...] = (
    "fgm",
    "fga",
    "fgm3",
    "fga3",
    "ftm",
    "fta",
    "oreb",
    "dreb",
    "ast",
    "to",
    "stl",
    "blk",
    "pf",
    "score",
    "opp_score",
)

_LONG_COLS: tuple[str, ...] = (
    "season",
    "day_num",
    "team_id",
    "opp_id",
    "won",
    "loc_encoded",
    "num_ot",
    "is_tournament",
    "score",
    "opp_score",
    "fgm",
    "fga",
    "fgm3",
    "fga3",
    "ftm",
    "fta",
    "oreb",
    "dreb",
    "ast",
    "to",
    "stl",
    "blk",
    "pf",
    "opp_oreb",
    "opp_dreb",
)


# ---------------------------------------------------------------------------
# Wide-to-long reshape helper
# ---------------------------------------------------------------------------


def _reshape_to_long(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
    """Reshape a W/L-columnar game DataFrame to long (per-team) format.

    Each input row (one game) becomes two output rows — one per team.
    Uses vectorized rename + concat; no iterrows.

    Args:
        df: Raw game DataFrame with W/L prefixed columns.
        is_tournament: True for tournament games, False for regular season.

    Returns:
        Long-format DataFrame with one row per (team, game).
    """
    common_cols = {"Season": "season", "DayNum": "day_num", "NumOT": "num_ot"}

    w_rename = {
        "WTeamID": "team_id",
        "LTeamID": "opp_id",
        "WScore": "score",
        "LScore": "opp_score",
        "WFGM": "fgm",
        "WFGA": "fga",
        "WFGM3": "fgm3",
        "WFGA3": "fga3",
        "WFTM": "ftm",
        "WFTA": "fta",
        "WOR": "oreb",
        "WDR": "dreb",
        "WAst": "ast",
        "WTO": "to",
        "WStl": "stl",
        "WBlk": "blk",
        "WPF": "pf",
        "LOR": "opp_oreb",
        "LDR": "opp_dreb",
    }
    l_rename = {
        "LTeamID": "team_id",
        "WTeamID": "opp_id",
        "LScore": "score",
        "WScore": "opp_score",
        "LFGM": "fgm",
        "LFGA": "fga",
        "LFGM3": "fgm3",
        "LFGA3": "fga3",
        "LFTM": "ftm",
        "LFTA": "fta",
        "LOR": "oreb",
        "LDR": "dreb",
        "LAst": "ast",
        "LTO": "to",
        "LStl": "stl",
        "LBlk": "blk",
        "LPF": "pf",
        "WOR": "opp_oreb",
        "WDR": "opp_dreb",
    }

    w_df = df.rename(columns={**common_cols, **w_rename})
    w_df["won"] = True
    # WLoc from winner's perspective: H=winner at home, A=winner away, N=neutral
    w_df["loc_encoded"] = df["WLoc"].map({"H": 1, "A": -1, "N": 0})

    l_df = df.rename(columns={**common_cols, **l_rename})
    l_df["won"] = False
    # For loser: invert H/A (winner home → loser away)
    l_df["loc_encoded"] = df["WLoc"].map({"H": -1, "A": 1, "N": 0})

    for side in (w_df, l_df):
        side["is_tournament"] = is_tournament

    return pd.concat(
        [w_df[list(_LONG_COLS)], l_df[list(_LONG_COLS)]],
        ignore_index=True,
    )


# ---------------------------------------------------------------------------
# DetailedResultsLoader
# ---------------------------------------------------------------------------



[docs]
class DetailedResultsLoader:
    """Loads detailed box-score results and provides per-team game views.

    Reads ``MRegularSeasonDetailedResults.csv`` and
    ``MNCAATourneyDetailedResults.csv`` into a combined long-format DataFrame
    with one row per (team, game).

    Box-score stats are only available from the 2003 season onwards.
    Pre-2003 seasons return empty DataFrames from :meth:`get_team_season`.
    """

    def __init__(self, df: pd.DataFrame) -> None:
        self._df = df  # Long-format, all seasons


[docs]
    @classmethod
    def from_csvs(cls, regular_path: Path, tourney_path: Path) -> DetailedResultsLoader:
        """Construct a loader from the two Kaggle detailed-results CSV paths.

        Args:
            regular_path: Path to ``MRegularSeasonDetailedResults.csv``.
            tourney_path: Path to ``MNCAATourneyDetailedResults.csv``.

        Returns:
            :class:`DetailedResultsLoader` instance with combined data.
        """
        reg_df = pd.read_csv(regular_path)
        tur_df = pd.read_csv(tourney_path)
        long = pd.concat(
            [
                _reshape_to_long(reg_df, is_tournament=False),
                _reshape_to_long(tur_df, is_tournament=True),
            ],
            ignore_index=True,
        )
        return cls(long)



[docs]
    def get_season_long_format(self, season: int) -> pd.DataFrame:
        """Return all games for a season in long format.

        Args:
            season: Season year (e.g., 2023).

        Returns:
            DataFrame sorted by ``(day_num, team_id)``, reset index.
        """
        mask = self._df["season"] == season
        return self._df[mask].sort_values(["day_num", "team_id"]).reset_index(drop=True)



[docs]
    def get_team_season(self, team_id: int, season: int) -> pd.DataFrame:
        """Return all games for one team in one season, sorted by day_num.

        Args:
            team_id: Canonical Kaggle TeamID integer.
            season: Season year (e.g., 2023).

        Returns:
            DataFrame sorted by ``day_num`` ascending, reset index.
            Returns empty DataFrame if team or season not found.
        """
        mask = (self._df["team_id"] == team_id) & (self._df["season"] == season)
        return self._df[mask].sort_values("day_num").reset_index(drop=True)




# ---------------------------------------------------------------------------
# OT rescaling helper
# ---------------------------------------------------------------------------



[docs]
def apply_ot_rescaling(
    team_games: pd.DataFrame,
    stats: tuple[str, ...] = _COUNTING_STATS,
) -> pd.DataFrame:
    """Rescale all counting stats to 40-minute equivalent for OT games.

    Applies: ``stat_adj = stat × 40 / (40 + 5 × num_ot)``
    Regulation games (num_ot=0) are unchanged (multiplier = 1.0).

    Returns a copy; does not modify the input DataFrame in-place.

    Args:
        team_games: Per-team game DataFrame containing a ``num_ot`` column.
        stats: Tuple of stat column names to rescale.

    Returns:
        Copy of ``team_games`` with rescaled stat columns.
    """
    df = team_games.copy()
    multiplier = 40.0 / (40.0 + 5.0 * df["num_ot"])
    available = [s for s in stats if s in df.columns]
    df[available] = df[available].mul(multiplier, axis=0)
    return df



# ---------------------------------------------------------------------------
# Time-decay weighting helper
# ---------------------------------------------------------------------------



[docs]
def compute_game_weights(
    day_nums: pd.Series,
    reference_day_num: int | None = None,
) -> pd.Series:
    """BartTorvik time-decay weights: 1% per day after 40 days old; floor 60%.

    Formula: ``weight = max(0.6, 1 − 0.01 × max(0, days_ago − 40))``

    Args:
        day_nums: Series of game day numbers (ascending order).
        reference_day_num: Reference point for ``days_ago``.
            Defaults to ``max(day_nums)``.

    Returns:
        Series of weights in [0.6, 1.0] for each game.
    """
    if day_nums.empty:
        return pd.Series([], dtype=float)
    ref = int(day_nums.max()) if reference_day_num is None else reference_day_num
    days_ago = ref - day_nums
    weight = (1.0 - 0.01 * (days_ago - 40).clip(lower=0)).clip(lower=0.6)
    return weight



# ---------------------------------------------------------------------------
# Rolling window features
# ---------------------------------------------------------------------------



[docs]
def compute_rolling_stats(
    team_games: pd.DataFrame,
    windows: list[int],
    stats: tuple[str, ...],
    weights: pd.Series | None = None,
) -> pd.DataFrame:
    """Compute rolling mean features for all specified windows and stats.

    No future data leakage: rolling window at position i only uses
    rows at positions ≤ i (pandas ``rolling`` default closed='right').

    Args:
        team_games: Per-team game DataFrame (sorted by day_num ascending).
        windows: List of window sizes (e.g., [5, 10, 20]).
        stats: Tuple of stat column names.
        weights: Optional per-game weights for weighted rolling mean.

    Returns:
        DataFrame with columns ``rolling_{w}_{stat}`` and
        ``rolling_full_{stat}`` (expanding mean).
    """
    result = {}

    for w in windows:
        for stat in stats:
            if stat not in team_games.columns:
                continue
            col = team_games[stat]
            if weights is not None:
                # Weighted rolling mean: sum(stat*w)/sum(w) over window
                num = (col * weights).rolling(w, min_periods=1).sum()
                den = weights.rolling(w, min_periods=1).sum()
                result[f"rolling_{w}_{stat}"] = num / den
            else:
                result[f"rolling_{w}_{stat}"] = col.rolling(w, min_periods=1).mean()

    # Full-season aggregate (expanding mean)
    for stat in stats:
        if stat in team_games.columns:
            result[f"rolling_full_{stat}"] = team_games[stat].expanding().mean()

    return pd.DataFrame(result, index=team_games.index)



# ---------------------------------------------------------------------------
# EWMA features
# ---------------------------------------------------------------------------



[docs]
def compute_ewma_stats(
    team_games: pd.DataFrame,
    alphas: list[float],
    stats: tuple[str, ...],
) -> pd.DataFrame:
    """Compute EWMA features for all specified alphas and stats.

    Uses ``adjust=False`` for standard exponential smoothing:
    ``value_t = α × obs_t + (1−α) × value_{t−1}``

    Args:
        team_games: Per-team game DataFrame (sorted by day_num ascending).
        alphas: List of smoothing factors (e.g., [0.15, 0.20]).
        stats: Tuple of stat column names.

    Returns:
        DataFrame with columns ``ewma_{alpha_str}_{stat}`` where
        ``alpha_str`` replaces the decimal point with 'p'
        (e.g., ``ewma_0p15_score``).
    """
    result = {}
    for alpha in alphas:
        alpha_str = f"{alpha:.2f}".replace(".", "p")
        for stat in stats:
            if stat in team_games.columns:
                result[f"ewma_{alpha_str}_{stat}"] = team_games[stat].ewm(alpha=alpha, adjust=False).mean()
    return pd.DataFrame(result, index=team_games.index)



# ---------------------------------------------------------------------------
# Momentum feature
# ---------------------------------------------------------------------------



[docs]
def compute_momentum(
    team_games: pd.DataFrame,
    alpha_fast: float,
    alpha_slow: float,
    stats: tuple[str, ...],
) -> pd.DataFrame:
    """Compute ewma_fast − ewma_slow momentum for each stat.

    Positive momentum means recent performance is above the longer-term
    trend (improving form into tournament).

    Args:
        team_games: Per-team game DataFrame (sorted by day_num ascending).
        alpha_fast: Fast EWMA smoothing factor (larger → more reactive).
        alpha_slow: Slow EWMA smoothing factor (smaller → smoother baseline).
        stats: Tuple of stat column names.

    Returns:
        DataFrame with columns ``momentum_{stat}``.
    """
    available = [s for s in stats if s in team_games.columns]
    fast = team_games[available].ewm(alpha=alpha_fast, adjust=False).mean()
    slow = team_games[available].ewm(alpha=alpha_slow, adjust=False).mean()
    result = fast - slow
    result.columns = [f"momentum_{s}" for s in result.columns]
    return result



# ---------------------------------------------------------------------------
# Streak feature
# ---------------------------------------------------------------------------



[docs]
def compute_streak(won: pd.Series) -> pd.Series:
    """Compute signed win/loss streak.

    Returns +N for a winning streak of N games, −N for a losing streak.
    Vectorized using cumsum-based grouping; no iterrows.

    Args:
        won: Boolean Series of game outcomes (True = win), sorted by day_num.

    Returns:
        Integer Series named ``"streak"``.
    """
    if won.empty:
        return pd.Series([], dtype=int, name="streak")

    # Force group break at position 0 by filling NaN shift with the opposite
    group = (won != won.shift(fill_value=not bool(won.iloc[0]))).cumsum()
    streak_len = won.groupby(group).cumcount() + 1
    return streak_len.where(won, -streak_len).rename("streak")



# ---------------------------------------------------------------------------
# Per-possession normalization
# ---------------------------------------------------------------------------



[docs]
def compute_possessions(team_games: pd.DataFrame) -> pd.Series:
    """Compute possession count: FGA − OR + TO + 0.44 × FTA.

    Zero or negative possession counts (rare in short fixtures) are
    replaced with NaN to prevent division-by-zero downstream.

    Args:
        team_games: Per-team game DataFrame with box-score columns.

    Returns:
        Series named ``"possessions"``.
    """
    poss = team_games["fga"] - team_games["oreb"] + team_games["to"] + 0.44 * team_games["fta"]
    # Guard: 0 or negative possessions → NaN
    return poss.where(poss > 0, other=float("nan")).rename("possessions")




[docs]
def compute_per_possession_stats(
    team_games: pd.DataFrame,
    stats: tuple[str, ...],
    possessions: pd.Series,
) -> pd.DataFrame:
    """Normalize counting stats by possessions (per-100 possessions).

    Args:
        team_games: Per-team game DataFrame.
        stats: Tuple of stat column names to normalize.
        possessions: Series of possession counts (NaN for guard rows).

    Returns:
        DataFrame with columns ``{stat}_per100``.
    """
    result = {}
    for stat in stats:
        if stat in team_games.columns:
            result[f"{stat}_per100"] = team_games[stat] * 100.0 / possessions
    return pd.DataFrame(result, index=team_games.index)



# ---------------------------------------------------------------------------
# Four Factors
# ---------------------------------------------------------------------------



[docs]
def compute_four_factors(
    team_games: pd.DataFrame,
    possessions: pd.Series,
) -> pd.DataFrame:
    """Compute Dean Oliver's Four Factors efficiency ratios.

    - ``efg_pct``: Effective field goal % = (FGM + 0.5 × FGM3) / FGA
    - ``orb_pct``: Offensive rebound % = OR / (OR + opp_DR)
    - ``ftr``: Free throw rate = FTA / FGA
    - ``to_pct``: Turnover % = TO / possessions

    All denominators are guarded against zero (returns NaN when zero).

    Args:
        team_games: Per-team game DataFrame with box-score columns.
        possessions: Series of possession counts (used for TO%).

    Returns:
        DataFrame with columns ``["efg_pct", "orb_pct", "ftr", "to_pct"]``.
    """
    fga = team_games["fga"].replace(0, float("nan"))
    orb_den = (team_games["oreb"] + team_games["opp_dreb"]).replace(0, float("nan"))
    return pd.DataFrame(
        {
            "efg_pct": (team_games["fgm"] + 0.5 * team_games["fgm3"]) / fga,
            "orb_pct": team_games["oreb"] / orb_den,
            "ftr": team_games["fta"] / fga,
            "to_pct": team_games["to"] / possessions,
        },
        index=team_games.index,
    )



# ---------------------------------------------------------------------------
# SequentialTransformer
# ---------------------------------------------------------------------------



[docs]
class SequentialTransformer:
    """Orchestrates all sequential feature computation steps.

    Applies OT rescaling, time-decay weighting, rolling windows, EWMA,
    momentum, streak, per-possession normalization, and Four Factors to
    a per-team game history in chronological order.

    All features respect temporal ordering — no feature for game N uses
    data from games N+1 or later.
    """

    def __init__(
        self,
        windows: list[int] | None = None,
        alphas: list[float] | None = None,
        alpha_fast: float = 0.20,
        alpha_slow: float = 0.10,
        stats: tuple[str, ...] | None = None,
    ) -> None:
        """Initialise with optional custom parameters.

        Args:
            windows: Rolling window sizes. Defaults to [5, 10, 20].
            alphas: EWMA smoothing factors. Defaults to [0.15, 0.20].
            alpha_fast: Fast EWMA alpha for momentum. Defaults to 0.20.
            alpha_slow: Slow EWMA alpha for momentum. Defaults to 0.10.
            stats: Counting stat columns. Defaults to ``_COUNTING_STATS``.
        """
        self._windows: list[int] = windows if windows is not None else [5, 10, 20]
        self._alphas: list[float] = alphas if alphas is not None else [0.15, 0.20]
        self._alpha_fast = alpha_fast
        self._alpha_slow = alpha_slow
        self._stats: tuple[str, ...] = stats if stats is not None else _COUNTING_STATS


[docs]
    def transform(
        self,
        team_games: pd.DataFrame,
        reference_day_num: int | None = None,
    ) -> pd.DataFrame:
        """Compute all sequential features for a team's game history.

        Input must be sorted by ``day_num`` ascending to ensure temporal
        integrity (no future data leakage).

        Orchestration order (critical for correctness):
        1. OT rescaling (before any aggregation)
        2. Time-decay weights
        3. Rolling stats (on OT-rescaled stats, with weights)
        4. EWMA (on OT-rescaled stats)
        5. Momentum
        6. Streak (on original won column)
        7. Possessions + per-possession stats
        8. Four Factors

        Args:
            team_games: Per-team game DataFrame sorted by ``day_num``.
            reference_day_num: Reference day for time-decay weights.
                Defaults to the last game's ``day_num``.

        Returns:
            New DataFrame with all feature columns appended to originals.
            Preserves input row order.
        """
        if team_games.empty:
            return team_games.copy()

        # Step 1: OT rescaling (before any aggregation)
        scaled = apply_ot_rescaling(team_games, stats=self._stats)

        # Step 2: Time-decay weights (computed on original day_nums)
        weights = compute_game_weights(team_games["day_num"], reference_day_num)

        # Step 3: Rolling features (on OT-rescaled stats, with time-decay)
        rolling = compute_rolling_stats(scaled, self._windows, self._stats, weights)

        # Step 4: EWMA (on OT-rescaled stats)
        ewma = compute_ewma_stats(scaled, self._alphas, self._stats)

        # Step 5: Momentum
        momentum = compute_momentum(scaled, self._alpha_fast, self._alpha_slow, self._stats)

        # Step 6: Streak (on original won column, not rescaled)
        streak = compute_streak(team_games["won"])

        # Step 7: Possessions + per-possession stats
        possessions = compute_possessions(scaled)
        per_poss = compute_per_possession_stats(scaled, self._stats, possessions)

        # Step 8: Four Factors
        four_factors = compute_four_factors(scaled, possessions)

        return pd.concat(
            [
                team_games,
                rolling,
                ewma,
                momentum,
                streak.to_frame(),
                per_poss,
                four_factors,
                possessions.to_frame(),
            ],
            axis=1,
        )