"""XGBoost gradient-boosting model — reference stateless model.
Wraps :class:`xgboost.XGBClassifier` behind the :class:`Model` ABC,
providing ``fit`` / ``predict_proba`` / ``save`` / ``load`` with XGBoost's
native UBJSON persistence format.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Annotated, Literal, Self
import pandas as pd # type: ignore[import-untyped]
from pydantic import Field
from sklearn.model_selection import train_test_split # type: ignore[import-untyped]
from xgboost import XGBClassifier
from ncaa_eval.model._feature_config_io import load_feature_config, save_feature_config
from ncaa_eval.model.base import Model, ModelConfig
from ncaa_eval.model.registry import register_model
from ncaa_eval.transform.feature_serving import (
BatchRatingType,
FeatureConfig,
OrdinalCompositeMethod,
)
class XGBoostModelConfig(ModelConfig):
    """Hyperparameters for the XGBoost gradient-boosting model.

    Defaults from ``specs/research/modeling-approaches.md`` §5.5 and §6.4.

    **Label balance:** Set ``scale_pos_weight = count(y==0) / count(y==1)``
    when training labels are imbalanced (e.g. team_a is always the winner).
    Leave as ``None`` (XGBoost default = 1.0) when team assignment is
    randomised before training.
    """

    # Registry discriminator: routes this config to the "xgboost" model.
    model_name: Literal["xgboost"] = "xgboost"
    # Upper bound on boosting rounds; early stopping may use fewer.
    n_estimators: int = 500
    # Maximum depth of each boosted tree.
    max_depth: int = 5
    # Shrinkage (eta) applied to each tree's contribution.
    learning_rate: float = 0.05
    # Row subsample ratio per boosting round.
    subsample: float = 0.8
    # Column subsample ratio per tree.
    colsample_bytree: float = 0.8
    # Minimum sum of instance weights (hessian) required in a child node.
    min_child_weight: int = 3
    # L1 regularisation term on leaf weights.
    reg_alpha: float = 0.0
    # L2 regularisation term on leaf weights.
    reg_lambda: float = 1.0
    # Stop if validation logloss fails to improve for this many rounds.
    early_stopping_rounds: int = 50
    # Fraction of training rows held out for the early-stopping eval set.
    validation_fraction: Annotated[float, Field(gt=0.0, lt=1.0)] = 0.1
    scale_pos_weight: float | None = None  # None → XGBoost default (1.0)
@register_model("xgboost")
class XGBoostModel(Model):
    """XGBoost binary classifier wrapping :class:`XGBClassifier`.

    This is a *stateless* model — it implements :class:`Model` directly
    (no ``StatefulModel`` lifecycle hooks).

    **Label balance convention:** The feature server typically assigns
    ``team_a = w_team_id`` (the winner), so ``y`` may be heavily biased
    toward 1. Callers should either randomise team assignment before
    training (recommended) or set ``scale_pos_weight`` in the config to
    ``count(y==0) / count(y==1)``. The default ``scale_pos_weight`` is
    ``None`` (XGBoost default = 1.0), appropriate when team assignment is
    randomised.
    """

    def __init__(
        self,
        config: XGBoostModelConfig | None = None,
        *,
        batch_rating_types: tuple[BatchRatingType, ...] = ("srs",),
        graph_features_enabled: bool = False,
        ordinal_composite: OrdinalCompositeMethod | None = None,
    ) -> None:
        """Initialize XGBoost model with optional configuration.

        Builds an :class:`XGBClassifier` from config hyperparameters, setting
        ``objective="binary:logistic"`` and ``scale_pos_weight`` only when
        explicitly provided.

        Args:
            config: Pydantic config; defaults to
                :class:`XGBoostModelConfig` when ``None``.
            batch_rating_types: Which batch rating systems to include.
            graph_features_enabled: Whether to compute graph centrality features.
            ordinal_composite: Composite method for ordinal systems.
        """
        self._config = config or XGBoostModelConfig()
        self._is_fitted = False
        self.feature_names_: list[str] = []
        self.feature_config = FeatureConfig(
            batch_rating_types=batch_rating_types,
            graph_features_enabled=graph_features_enabled,
            ordinal_composite=ordinal_composite,
        )
        kwargs: dict[str, object] = dict(
            n_estimators=self._config.n_estimators,
            max_depth=self._config.max_depth,
            learning_rate=self._config.learning_rate,
            subsample=self._config.subsample,
            colsample_bytree=self._config.colsample_bytree,
            min_child_weight=self._config.min_child_weight,
            reg_alpha=self._config.reg_alpha,
            reg_lambda=self._config.reg_lambda,
            objective="binary:logistic",
            eval_metric="logloss",
            early_stopping_rounds=self._config.early_stopping_rounds,
            random_state=42,  # fixed seed for reproducible training
        )
        # Omit scale_pos_weight when unset so XGBoost applies its own
        # default (1.0) instead of receiving an explicit None.
        if self._config.scale_pos_weight is not None:
            kwargs["scale_pos_weight"] = self._config.scale_pos_weight
        self._clf = XGBClassifier(**kwargs)

    def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
        """Train on feature matrix *X* and binary labels *y*.

        Automatically splits *X* into train/validation sets using
        ``validation_fraction`` from the config. The validation set is
        used for early stopping via ``eval_set``.

        **Label balance convention:** ``team_a`` assignment in the feature
        server is typically ``w_team_id`` (the winner). If labels are
        imbalanced, either randomise team assignment upstream or set
        ``scale_pos_weight`` = ``count(y==0) / count(y==1)`` in the
        ``XGBoostModelConfig``.

        Raises:
            ValueError: If *X* is empty.
        """
        if X.empty:
            msg = "Cannot fit on an empty DataFrame"
            raise ValueError(msg)
        # Stratified split keeps the label ratio identical in both halves so
        # the early-stopping logloss is measured on a representative sample.
        X_train, X_val, y_train, y_val = train_test_split(
            X,
            y,
            test_size=self._config.validation_fraction,
            random_state=42,
            stratify=y,
        )
        self.feature_names_ = list(X.columns)
        self._clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        self._is_fitted = True

    def predict_proba(self, X: pd.DataFrame) -> pd.Series:
        """Return P(team_a wins) for each row of *X*.

        Raises:
            RuntimeError: If called before :meth:`fit`.
        """
        if not self._is_fitted:
            msg = "Model must be fitted before calling predict_proba"
            raise RuntimeError(msg)
        # Column 1 of predict_proba is P(y == 1), i.e. P(team_a wins).
        probs = pd.Series(self._clf.predict_proba(X)[:, 1], index=X.index)
        return probs

    def save(self, path: Path) -> None:
        """Persist the trained model to *path* directory.

        Writes four files:

        - ``model.ubj`` — XGBoost native UBJSON format (stable across versions)
        - ``config.json`` — Pydantic-serialised hyperparameter config
        - ``feature_names.json`` — JSON array of feature column names
        - ``feature_config.json`` — FeatureConfig sidecar

        Raises:
            RuntimeError: If called before :meth:`fit`.
        """
        if not self._is_fitted:
            msg = "Model must be fitted before saving"
            raise RuntimeError(msg)
        path.mkdir(parents=True, exist_ok=True)
        self._clf.save_model(str(path / "model.ubj"))
        (path / "config.json").write_text(self._config.model_dump_json())
        (path / "feature_names.json").write_text(json.dumps(self.feature_names_))
        save_feature_config(self.feature_config, path)

    @classmethod
    def load(cls, path: Path) -> Self:
        """Load a previously-saved XGBoost model from *path*.

        Raises:
            FileNotFoundError: If either ``config.json`` or ``model.ubj`` is missing.
        """
        config_path = path / "config.json"
        model_path = path / "model.ubj"
        # Check both required files up front so a partial save produces one
        # clear error naming everything that is missing.
        missing = [p for p in (config_path, model_path) if not p.exists()]
        if missing:
            missing_names = ", ".join(p.name for p in missing)
            msg = f"Incomplete save at {path!r}: missing {missing_names}. The save may have been interrupted."
            raise FileNotFoundError(msg)
        config = XGBoostModelConfig.model_validate_json(config_path.read_text())
        instance = cls(config)
        instance._clf.load_model(str(model_path))
        instance._is_fitted = True
        # Optional sidecars: older saves may lack them, so tolerate absence.
        feature_names_path = path / "feature_names.json"
        if feature_names_path.exists():
            instance.feature_names_ = json.loads(feature_names_path.read_text())
        loaded_fc = load_feature_config(path)
        if loaded_fc is not None:
            instance.feature_config = loaded_fc
        return instance

    def get_config(self) -> XGBoostModelConfig:
        """Return the Pydantic-validated configuration for this model."""
        return self._config

    def get_feature_importances(self) -> list[tuple[str, float]] | None:
        """Return feature name/importance pairs from the fitted classifier.

        Returns ``None`` when the model is unfitted or feature names are
        unknown (e.g. loaded from a save without ``feature_names.json``).
        """
        if not self._is_fitted or not self.feature_names_:
            return None
        importances = self._clf.feature_importances_
        # strict=True surfaces a name/importance length mismatch instead of
        # silently truncating the pairing.
        return list(zip(self.feature_names_, importances.tolist(), strict=True))