# Source code for ncaa_eval.model.xgboost_model

"""XGBoost gradient-boosting model — reference stateless model.

Wraps :class:`xgboost.XGBClassifier` behind the :class:`Model` ABC,
providing ``fit`` / ``predict_proba`` / ``save`` / ``load`` with XGBoost's
native UBJSON persistence format.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Annotated, Literal, Self

import pandas as pd  # type: ignore[import-untyped]
from pydantic import Field
from sklearn.model_selection import train_test_split  # type: ignore[import-untyped]
from xgboost import XGBClassifier

from ncaa_eval.model._feature_config_io import load_feature_config, save_feature_config
from ncaa_eval.model.base import Model, ModelConfig
from ncaa_eval.model.registry import register_model
from ncaa_eval.transform.feature_serving import (
    BatchRatingType,
    FeatureConfig,
    OrdinalCompositeMethod,
)


class XGBoostModelConfig(ModelConfig):
    """Pydantic hyperparameter set for the XGBoost gradient-boosting model.

    Default values follow ``specs/research/modeling-approaches.md`` §5.5
    and §6.4.

    **Label balance:** when training labels are imbalanced (e.g. team_a is
    always the winner), set ``scale_pos_weight = count(y==0) / count(y==1)``.
    When team assignment is randomised before training, leave it as ``None``
    (XGBoost's own default of 1.0 then applies).
    """

    # Registry key; fixed literal so the model registry can discriminate configs.
    model_name: Literal["xgboost"] = "xgboost"
    n_estimators: int = 500
    max_depth: int = 5
    learning_rate: float = 0.05
    subsample: float = 0.8
    colsample_bytree: float = 0.8
    min_child_weight: int = 3
    reg_alpha: float = 0.0
    reg_lambda: float = 1.0
    early_stopping_rounds: int = 50
    # Fraction of the training data held out for early stopping; must be in (0, 1).
    validation_fraction: Annotated[float, Field(gt=0.0, lt=1.0)] = 0.1
    scale_pos_weight: float | None = None  # None → XGBoost default (1.0)
@register_model("xgboost")
class XGBoostModel(Model):
    """XGBoost binary classifier wrapping :class:`XGBClassifier`.

    A *stateless* model: it implements :class:`Model` directly, with no
    ``StatefulModel`` lifecycle hooks.

    **Label balance convention:** the feature server typically assigns
    ``team_a = w_team_id`` (the winner), so ``y`` may be heavily biased
    toward 1. Either randomise team assignment before training
    (recommended) or set ``scale_pos_weight`` in the config to
    ``count(y==0) / count(y==1)``. The default ``scale_pos_weight`` is
    ``None`` (XGBoost default = 1.0), appropriate when team assignment is
    randomised.
    """

    def __init__(
        self,
        config: XGBoostModelConfig | None = None,
        *,
        batch_rating_types: tuple[BatchRatingType, ...] = ("srs",),
        graph_features_enabled: bool = False,
        ordinal_composite: OrdinalCompositeMethod | None = None,
    ) -> None:
        """Initialize XGBoost model with optional configuration.

        Builds an :class:`XGBClassifier` from the config hyperparameters,
        always using ``objective="binary:logistic"``; ``scale_pos_weight``
        is forwarded only when explicitly provided in the config.

        Args:
            config: Pydantic config; a default :class:`XGBoostModelConfig`
                is used when ``None``.
            batch_rating_types: Which batch rating systems to include.
            graph_features_enabled: Whether to compute graph centrality
                features.
            ordinal_composite: Composite method for ordinal systems.
        """
        self._config = config or XGBoostModelConfig()
        self._is_fitted = False
        self.feature_names_: list[str] = []
        self.feature_config = FeatureConfig(
            batch_rating_types=batch_rating_types,
            graph_features_enabled=graph_features_enabled,
            ordinal_composite=ordinal_composite,
        )
        cfg = self._config
        params: dict[str, object] = {
            "n_estimators": cfg.n_estimators,
            "max_depth": cfg.max_depth,
            "learning_rate": cfg.learning_rate,
            "subsample": cfg.subsample,
            "colsample_bytree": cfg.colsample_bytree,
            "min_child_weight": cfg.min_child_weight,
            "reg_alpha": cfg.reg_alpha,
            "reg_lambda": cfg.reg_lambda,
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "early_stopping_rounds": cfg.early_stopping_rounds,
            "random_state": 42,  # fixed seed for reproducible training
        }
        # Only pass scale_pos_weight when set, so XGBoost's own default (1.0)
        # applies otherwise.
        if cfg.scale_pos_weight is not None:
            params["scale_pos_weight"] = cfg.scale_pos_weight
        self._clf = XGBClassifier(**params)
[docs] def fit(self, X: pd.DataFrame, y: pd.Series) -> None: """Train on feature matrix *X* and binary labels *y*. Automatically splits *X* into train/validation sets using ``validation_fraction`` from the config. The validation set is used for early stopping via ``eval_set``. **Label balance convention:** ``team_a`` assignment in the feature server is typically ``w_team_id`` (the winner). If labels are imbalanced, either randomise team assignment upstream or set ``scale_pos_weight`` = ``count(y==0) / count(y==1)`` in the ``XGBoostModelConfig``. Raises: ValueError: If *X* is empty. """ if X.empty: msg = "Cannot fit on an empty DataFrame" raise ValueError(msg) X_train, X_val, y_train, y_val = train_test_split( X, y, test_size=self._config.validation_fraction, random_state=42, stratify=y, ) self.feature_names_ = list(X.columns) self._clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False) self._is_fitted = True
[docs] def predict_proba(self, X: pd.DataFrame) -> pd.Series: """Return P(team_a wins) for each row of *X*. Raises: RuntimeError: If called before :meth:`fit`. """ if not self._is_fitted: msg = "Model must be fitted before calling predict_proba" raise RuntimeError(msg) probs = pd.Series(self._clf.predict_proba(X)[:, 1], index=X.index) return probs
[docs] def save(self, path: Path) -> None: """Persist the trained model to *path* directory. Writes four files: - ``model.ubj`` — XGBoost native UBJSON format (stable across versions) - ``config.json`` — Pydantic-serialised hyperparameter config - ``feature_names.json`` — JSON array of feature column names - ``feature_config.json`` — FeatureConfig sidecar Raises: RuntimeError: If called before :meth:`fit`. """ if not self._is_fitted: msg = "Model must be fitted before saving" raise RuntimeError(msg) path.mkdir(parents=True, exist_ok=True) self._clf.save_model(str(path / "model.ubj")) (path / "config.json").write_text(self._config.model_dump_json()) (path / "feature_names.json").write_text(json.dumps(self.feature_names_)) save_feature_config(self.feature_config, path)
[docs] @classmethod def load(cls, path: Path) -> Self: """Load a previously-saved XGBoost model from *path*. Raises: FileNotFoundError: If either ``config.json`` or ``model.ubj`` is missing. """ config_path = path / "config.json" model_path = path / "model.ubj" missing = [p for p in (config_path, model_path) if not p.exists()] if missing: missing_names = ", ".join(p.name for p in missing) msg = f"Incomplete save at {path!r}: missing {missing_names}. The save may have been interrupted." raise FileNotFoundError(msg) config = XGBoostModelConfig.model_validate_json(config_path.read_text()) instance = cls(config) instance._clf.load_model(str(model_path)) instance._is_fitted = True feature_names_path = path / "feature_names.json" if feature_names_path.exists(): instance.feature_names_ = json.loads(feature_names_path.read_text()) loaded_fc = load_feature_config(path) if loaded_fc is not None: instance.feature_config = loaded_fc return instance
[docs] def get_config(self) -> XGBoostModelConfig: """Return the Pydantic-validated configuration for this model.""" return self._config
[docs] def get_feature_importances(self) -> list[tuple[str, float]] | None: """Return feature name/importance pairs from the fitted classifier.""" if not self._is_fitted or not self.feature_names_: return None importances = self._clf.feature_importances_ return list(zip(self.feature_names_, importances.tolist()))