# Source code for ncaa_eval.model.xgboost_model

"""XGBoost gradient-boosting model — reference stateless model.

Wraps :class:`xgboost.XGBClassifier` behind the :class:`Model` ABC,
providing ``fit`` / ``predict_proba`` / ``save`` / ``load`` with XGBoost's
native UBJSON persistence format.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Annotated, Literal, Self

import pandas as pd  # type: ignore[import-untyped]
from pydantic import Field
from sklearn.model_selection import train_test_split  # type: ignore[import-untyped]
from xgboost import XGBClassifier

from ncaa_eval.model._feature_config_io import load_feature_config, save_feature_config
from ncaa_eval.model.base import Model, ModelConfig
from ncaa_eval.model.registry import register_model
from ncaa_eval.transform.feature_serving import (
    BatchRatingType,
    FeatureConfig,
    OrdinalCompositeMethod,
)


class XGBoostModelConfig(ModelConfig):
    """Pydantic hyperparameter set for the XGBoost gradient-boosting model.

    Default values follow ``specs/research/modeling-approaches.md`` §5.5
    and §6.4.

    **Label balance:** when training labels are imbalanced (e.g. team_a is
    always the winner), set ``scale_pos_weight = count(y==0) / count(y==1)``.
    When team assignment is randomised before training, leave it as ``None``
    (XGBoost's own default of 1.0 then applies).
    """

    # Registry key; fixed literal so the model registry can discriminate configs.
    model_name: Literal["xgboost"] = "xgboost"
    n_estimators: int = 500
    max_depth: int = 5
    learning_rate: float = 0.05
    subsample: float = 0.8
    colsample_bytree: float = 0.8
    min_child_weight: int = 3
    reg_alpha: float = 0.0
    reg_lambda: float = 1.0
    early_stopping_rounds: int = 50
    # Fraction of the training data held out for early stopping; must be in (0, 1).
    validation_fraction: Annotated[float, Field(gt=0.0, lt=1.0)] = 0.1
    scale_pos_weight: float | None = None  # None → XGBoost default (1.0)
@register_model("xgboost")
class XGBoostModel(Model):
    """XGBoost binary classifier wrapping :class:`XGBClassifier`.

    A *stateless* model: it implements :class:`Model` directly, with no
    ``StatefulModel`` lifecycle hooks.

    **Label balance convention:** the feature server typically assigns
    ``team_a = w_team_id`` (the winner), so ``y`` may be heavily biased
    toward 1. Either randomise team assignment before training
    (recommended) or set ``scale_pos_weight`` in the config to
    ``count(y==0) / count(y==1)``. The default ``scale_pos_weight`` is
    ``None`` (XGBoost default = 1.0), appropriate when team assignment is
    randomised.
    """

    def __init__(
        self,
        config: XGBoostModelConfig | None = None,
        *,
        batch_rating_types: tuple[BatchRatingType, ...] = ("srs",),
        graph_features_enabled: bool = False,
        ordinal_composite: OrdinalCompositeMethod | None = None,
    ) -> None:
        """Initialize XGBoost model with optional configuration.

        Builds an :class:`XGBClassifier` from the config hyperparameters,
        always using ``objective="binary:logistic"``; ``scale_pos_weight``
        is forwarded only when explicitly provided in the config.

        Args:
            config: Pydantic config; a default :class:`XGBoostModelConfig`
                is used when ``None``.
            batch_rating_types: Which batch rating systems to include.
            graph_features_enabled: Whether to compute graph centrality
                features.
            ordinal_composite: Composite method for ordinal systems.
        """
        self._config = config or XGBoostModelConfig()
        self._is_fitted = False
        self.feature_names_: list[str] = []
        self.feature_config = FeatureConfig(
            batch_rating_types=batch_rating_types,
            graph_features_enabled=graph_features_enabled,
            ordinal_composite=ordinal_composite,
        )
        cfg = self._config
        params: dict[str, object] = {
            "n_estimators": cfg.n_estimators,
            "max_depth": cfg.max_depth,
            "learning_rate": cfg.learning_rate,
            "subsample": cfg.subsample,
            "colsample_bytree": cfg.colsample_bytree,
            "min_child_weight": cfg.min_child_weight,
            "reg_alpha": cfg.reg_alpha,
            "reg_lambda": cfg.reg_lambda,
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "early_stopping_rounds": cfg.early_stopping_rounds,
            "random_state": 42,  # fixed seed for reproducible training
        }
        # Only pass scale_pos_weight when set, so XGBoost's own default (1.0)
        # applies otherwise.
        if cfg.scale_pos_weight is not None:
            params["scale_pos_weight"] = cfg.scale_pos_weight
        self._clf = XGBClassifier(**params)
[docs] def fit(self, X: pd.DataFrame, y: pd.Series) -> None: """Train on feature matrix *X* and binary labels *y*. Automatically splits *X* into train/validation sets using ``validation_fraction`` from the config. The validation set is used for early stopping via ``eval_set``. **Label balance convention:** ``team_a`` assignment in the feature server is typically ``w_team_id`` (the winner). If labels are imbalanced, either randomise team assignment upstream or set ``scale_pos_weight`` = ``count(y==0) / count(y==1)`` in the ``XGBoostModelConfig``. Raises: ValueError: If *X* is empty. """ if X.empty: msg = "Cannot fit on an empty DataFrame" raise ValueError(msg) X_train, X_val, y_train, y_val = train_test_split( X, y, test_size=self._config.validation_fraction, random_state=42, stratify=y, ) self.feature_names_ = list(X.columns) self._clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False) self._is_fitted = True
[docs] def predict_proba(self, X: pd.DataFrame) -> pd.Series: """Return P(team_a wins) for each row of *X*. Raises: RuntimeError: If called before :meth:`fit`. """ if not self._is_fitted: msg = "Model must be fitted before calling predict_proba" raise RuntimeError(msg) probs = pd.Series(self._clf.predict_proba(X)[:, 1], index=X.index) return probs
[docs] def save(self, path: Path) -> None: """Persist the trained model to *path* directory. Writes four files: - ``model.ubj`` — XGBoost native UBJSON format (stable across versions) - ``config.json`` — Pydantic-serialised hyperparameter config - ``feature_names.json`` — JSON array of feature column names - ``feature_config.json`` — FeatureConfig sidecar Raises: RuntimeError: If called before :meth:`fit`. """ if not self._is_fitted: msg = "Model must be fitted before saving" raise RuntimeError(msg) path.mkdir(parents=True, exist_ok=True) self._clf.save_model(str(path / "model.ubj")) (path / "config.json").write_text(self._config.model_dump_json()) (path / "feature_names.json").write_text(json.dumps(self.feature_names_)) save_feature_config(self.feature_config, path)
[docs] @classmethod def load(cls, path: Path) -> Self: """Load a previously-saved XGBoost model from *path*. Raises: FileNotFoundError: If either ``config.json`` or ``model.ubj`` is missing. """ config_path = path / "config.json" model_path = path / "model.ubj" missing = [p for p in (config_path, model_path) if not p.exists()] if missing: missing_names = ", ".join(p.name for p in missing) msg = f"Incomplete save at {path!r}: missing {missing_names}. The save may have been interrupted." raise FileNotFoundError(msg) config = XGBoostModelConfig.model_validate_json(config_path.read_text()) instance = cls(config) instance._clf.load_model(str(model_path)) instance._is_fitted = True feature_names_path = path / "feature_names.json" if feature_names_path.exists(): instance.feature_names_ = json.loads(feature_names_path.read_text()) loaded_fc = load_feature_config(path) if loaded_fc is not None: instance.feature_config = loaded_fc return instance
[docs] def get_config(self) -> XGBoostModelConfig: """Return the Pydantic-validated configuration for this model.""" return self._config
[docs] def get_feature_importances(self) -> list[tuple[str, float]] | None: """Return feature name/importance pairs from the fitted classifier.""" if not self._is_fitted or not self.feature_names_: return None importances = self._clf.feature_importances_ return list(zip(self.feature_names_, importances.tolist()))