# Source code for ncaa_eval.model.logistic_regression

"""Minimal logistic regression model — test fixture for the Model contract.

This is NOT a production model.  It exists solely to demonstrate and
test the stateless ``Model`` interface in ~30 lines of logic.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Literal, Self

import joblib  # type: ignore[import-untyped]
import numpy as np
import pandas as pd  # type: ignore[import-untyped]
from sklearn.linear_model import LogisticRegression  # type: ignore[import-untyped]

from ncaa_eval.model._feature_config_io import load_feature_config, save_feature_config
from ncaa_eval.model.base import Model, ModelConfig
from ncaa_eval.model.registry import register_model
from ncaa_eval.transform.feature_serving import (
    BatchRatingType,
    FeatureConfig,
    OrdinalCompositeMethod,
)


class LogisticRegressionConfig(ModelConfig):
    """Hyperparameters for the logistic regression test fixture."""

    # Registry discriminator: pins this config to the "logistic_regression" model.
    model_name: Literal["logistic_regression"] = "logistic_regression"
    # Inverse regularization strength, forwarded verbatim to sklearn.
    C: float = 1.0  # noqa: N815 — sklearn convention
    # Solver iteration cap, forwarded verbatim to sklearn.
    max_iter: int = 200
@register_model("logistic_regression")
class LogisticRegressionModel(Model):
    """Thin wrapper around sklearn ``LogisticRegression``."""

    def __init__(
        self,
        config: LogisticRegressionConfig | None = None,
        *,
        batch_rating_types: tuple[BatchRatingType, ...] = ("srs",),
        graph_features_enabled: bool = False,
        ordinal_composite: OrdinalCompositeMethod | None = None,
    ) -> None:
        """Initialize logistic regression model with optional configuration.

        Args:
            config: Pydantic config; defaults to
                :class:`LogisticRegressionConfig` when ``None``.
            batch_rating_types: Which batch rating systems to include.
            graph_features_enabled: Whether to compute graph centrality features.
            ordinal_composite: Composite method for ordinal systems.
        """
        if config is None:
            config = LogisticRegressionConfig()
        self._config = config
        self._clf = LogisticRegression(C=config.C, max_iter=config.max_iter)
        self.feature_config = FeatureConfig(
            batch_rating_types=batch_rating_types,
            graph_features_enabled=graph_features_enabled,
            ordinal_composite=ordinal_composite,
        )
        # Filled in by fit(); persisted by save() so importances survive a round-trip.
        self.feature_names_: list[str] = []

    def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
        """Train the model on feature matrix *X* and labels *y*."""
        # Record the training column order before delegating to sklearn.
        self.feature_names_ = X.columns.tolist()
        self._clf.fit(X, y)

    def predict_proba(self, X: pd.DataFrame) -> pd.Series:
        """Return P(team_a wins) in [0, 1] for each row of *X*."""
        # Column 1 of sklearn's (n, 2) output is the positive class.
        positive_class = self._clf.predict_proba(X)[:, 1]
        return pd.Series(positive_class, index=X.index)

    def save(self, path: Path) -> None:
        """Persist the trained classifier, config, and feature config to *path*."""
        path.mkdir(parents=True, exist_ok=True)
        joblib.dump(self._clf, path / "model.joblib")
        (path / "config.json").write_text(self._config.model_dump_json())
        (path / "feature_names.json").write_text(json.dumps(self.feature_names_))
        save_feature_config(self.feature_config, path)

    @classmethod
    def load(cls, path: Path) -> Self:
        """Load a previously-saved model from *path*."""
        raw_config = (path / "config.json").read_text()
        instance = cls(LogisticRegressionConfig.model_validate_json(raw_config))
        instance._clf = joblib.load(path / "model.joblib")
        # Feature names are optional on disk (older saves may lack them).
        names_file = path / "feature_names.json"
        if names_file.exists():
            instance.feature_names_ = json.loads(names_file.read_text())
        # Likewise the feature config: keep __init__'s default when absent.
        restored_fc = load_feature_config(path)
        if restored_fc is not None:
            instance.feature_config = restored_fc
        return instance

    def get_feature_importances(self) -> list[tuple[str, float]] | None:
        """Return absolute coefficient values as feature importance."""
        # Unavailable before fit(): no column names or no learned coefficients.
        if not (self.feature_names_ and hasattr(self._clf, "coef_")):
            return None
        magnitudes = np.abs(self._clf.coef_[0]).tolist()
        return list(zip(self.feature_names_, magnitudes))

    def get_config(self) -> LogisticRegressionConfig:
        """Return the Pydantic-validated configuration."""
        return self._config