Source code for ncaa_eval.model.base

"""Model abstract base classes and configuration.

Defines the ``Model`` ABC, the ``StatefulModel`` template subclass for
sequential-update models, and the ``ModelConfig`` Pydantic base used by
every model's hyperparameter schema.
"""

from __future__ import annotations

import abc
import datetime
from pathlib import Path
from typing import Any, Literal, Self

import pandas as pd  # type: ignore[import-untyped]
from pydantic import BaseModel

from ncaa_eval.ingest.schema import Game
from ncaa_eval.transform.feature_serving import CalibrationMethod, FeatureConfig



[docs]
class ModelConfig(BaseModel):
    """Base configuration shared by all model implementations.

    Subclasses add model-specific hyperparameters as additional fields.
    """

    model_name: str
    calibration_method: CalibrationMethod | None = None




[docs]
class Model(abc.ABC):
    """Abstract base class for all NCAA prediction models.

    Every model — stateful or stateless — must implement these five
    methods so that the training CLI, evaluation engine, and persistence
    layer can treat all models uniformly.

    Attributes:
        feature_config: Declarative specification of which feature blocks
            the model expects.  Set by subclass ``__init__``.
    """

    feature_config: FeatureConfig


[docs]
    @abc.abstractmethod
    def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
        """Train the model on feature matrix *X* and labels *y*."""
        ...



[docs]
    @abc.abstractmethod
    def predict_proba(self, X: pd.DataFrame) -> pd.Series:
        """Return P(team_a wins) in [0, 1] for each row of *X*."""
        ...



[docs]
    @abc.abstractmethod
    def save(self, path: Path) -> None:
        """Persist the trained model to *path*."""
        ...



[docs]
    @classmethod
    @abc.abstractmethod
    def load(cls, path: Path) -> Self:
        """Load a previously-saved model from *path*."""
        ...



[docs]
    @abc.abstractmethod
    def get_config(self) -> ModelConfig:
        """Return the Pydantic-validated configuration for this model."""
        ...



[docs]
    def get_feature_importances(self) -> list[tuple[str, float]] | None:
        """Return feature name/importance pairs, or ``None`` if unavailable.

        The default returns ``None``.  Models that support feature importances
        (e.g. XGBoost) should override this method.
        """
        return None




# ---------------------------------------------------------------------------
# Location encoding helpers
# ---------------------------------------------------------------------------
_LOC_FROM_ENCODING: dict[int, Literal["H", "A", "N"]] = {1: "H", -1: "A", 0: "N"}



[docs]
class StatefulModel(Model):
    """Template base for models that process games sequentially.

    Concrete methods ``fit`` and ``predict_proba`` are provided as
    template methods.  Subclasses implement the abstract hooks:

    * ``update(game)`` — absorb a single game result
    * ``_predict_one(team_a_id, team_b_id)`` — return P(team_a wins)
    * ``start_season(season)`` — reset / prepare for a new season
    * ``get_state()`` / ``set_state(state)`` — snapshot / restore ratings
    """

    # ------------------------------------------------------------------
    # Concrete template methods
    # ------------------------------------------------------------------


[docs]
    def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
        """Reconstruct games from *X*/*y* and update sequentially.

        Reconstructs Game objects from the feature matrix and labels, then
        iterates chronologically, calling start_season() on season boundaries
        and update() per game.
        """
        games = self._to_games(X, y)
        current_season: int | None = None
        for game in games:
            if game.season != current_season:
                self.start_season(game.season)
                current_season = game.season
            self.update(game)



[docs]
    def predict_proba(self, X: pd.DataFrame) -> pd.Series:
        """Call ``_predict_one`` per row using ``itertuples``."""
        preds: list[float] = [self._predict_one(row.team_a_id, row.team_b_id) for row in X.itertuples()]
        return pd.Series(preds, index=X.index)


    # ------------------------------------------------------------------
    # Concrete helper
    # ------------------------------------------------------------------

    @staticmethod
    def _to_games(X: pd.DataFrame, y: pd.Series) -> list[Game]:
        """Reconstruct :class:`Game` objects from the feature DataFrame.

        Hoists optional-column checks outside the row loop to avoid O(n)
        column-existence tests, decodes ``loc_encoding`` via a lookup dict,
        resolves winner/loser ordering from the binary label, and coerces
        date values using ``pd.isna`` to handle ``None``, float ``NaN``,
        and ``pd.NaT`` uniformly.

        Args:
            X: Feature matrix with columns: ``team_a_id``, ``team_b_id``,
                ``season``, ``day_num``, ``date``, ``loc_encoding``,
                ``game_id``, ``is_tournament``.  Optionally ``w_score``,
                ``l_score``, ``num_ot``.
            y: Binary label — ``1`` (or ``True``) means team_a won.

        Returns:
            Chronologically-ordered list of :class:`Game` objects ready for
            sequential model processing via :meth:`update` and
            :meth:`start_season`.
        """
        # Hoist column-existence checks outside the loop (O(1) each, not O(n))
        has_scores = "w_score" in X.columns and "l_score" in X.columns
        has_num_ot = "num_ot" in X.columns

        games: list[Game] = []
        for row in X.itertuples():
            idx = row.Index
            team_a_won = bool(y.loc[idx])

            team_a_id = int(row.team_a_id)
            team_b_id = int(row.team_b_id)

            if team_a_won:
                w_team_id, l_team_id = team_a_id, team_b_id
            else:
                w_team_id, l_team_id = team_b_id, team_a_id

            loc_enc = int(row.loc_encoding)
            if loc_enc not in _LOC_FROM_ENCODING:
                msg = f"Unknown loc_encoding {loc_enc!r}; expected one of {sorted(_LOC_FROM_ENCODING)}"
                raise ValueError(msg)
            loc = _LOC_FROM_ENCODING[loc_enc]

            # Scores: use real values if present, else dummy
            w_score = int(row.w_score) if has_scores else 1
            l_score = int(row.l_score) if has_scores else 0
            num_ot = int(row.num_ot) if has_num_ot else 0

            # Date handling: pd.isna covers None, float NaN, and pd.NaT uniformly
            raw_date = row.date
            date_val: datetime.date | None = None
            if not pd.isna(raw_date):
                date_val = pd.Timestamp(raw_date).date()

            games.append(
                Game(
                    game_id=str(row.game_id),
                    season=int(row.season),
                    day_num=int(row.day_num),
                    date=date_val,
                    w_team_id=w_team_id,
                    l_team_id=l_team_id,
                    w_score=w_score,
                    l_score=l_score,
                    loc=loc,
                    num_ot=num_ot,
                    is_tournament=bool(row.is_tournament),
                )
            )
        return games

    # ------------------------------------------------------------------
    # Public API for external consumers
    # ------------------------------------------------------------------


[docs]
    def predict_matchup(self, team_a_id: int, team_b_id: int) -> float:
        """Return P(team_a wins) for a single matchup.

        Delegates to the ``_predict_one`` abstract hook.
        """
        return self._predict_one(team_a_id, team_b_id)


    # ------------------------------------------------------------------
    # Abstract hooks for subclasses
    # ------------------------------------------------------------------

    @abc.abstractmethod
    def _predict_one(self, team_a_id: int, team_b_id: int) -> float:
        """Return P(team_a wins) given team IDs."""
        ...


[docs]
    @abc.abstractmethod
    def update(self, game: Game) -> None:
        """Absorb the result of a single game."""
        ...



[docs]
    @abc.abstractmethod
    def start_season(self, season: int) -> None:
        """Called before the first game of each season."""
        ...



[docs]
    @abc.abstractmethod
    def get_state(self) -> dict[str, Any]:
        """Return a serialisable snapshot of internal ratings."""
        ...



[docs]
    @abc.abstractmethod
    def set_state(self, state: dict[str, Any]) -> None:
        """Restore internal ratings from a snapshot."""
        ...