# Source code for ncaa_eval.evaluation.kaggle_export

"""Kaggle March Machine Learning Mania submission CSV export.

Generates a CSV string with ``ID,Pred`` columns covering all C(n,2)
pairwise matchups for men's D1 teams in a given season.  The ``ID``
column uses Kaggle format ``YYYY_TeamID1_TeamID2`` (lower ID first)
and ``Pred`` is the win probability for TeamID1.
"""

from __future__ import annotations

import io
from collections.abc import Sequence
from itertools import combinations

import numpy as np
import numpy.typing as npt

# Kaggle March Machine Learning Mania uses day_num=136 as the Round-of-64 start.
# This is the canonical neutral-site context for all-pairs probability export.
# NOTE(review): not referenced by format_kaggle_submission below; presumably
# consumed by callers that build the probability matrix — confirm.
KAGGLE_NEUTRAL_DAY_NUM: int = 136


def format_kaggle_submission(
    season: int,
    team_ids: Sequence[int],
    prob_matrix: npt.NDArray[np.float64],
) -> str:
    """Format a probability matrix as a Kaggle submission CSV string.

    Args:
        season: Tournament season year (e.g. 2025).
        team_ids: Unique team IDs corresponding to matrix rows/columns, given
            in matrix order (they do not need to be sorted).
        prob_matrix: n×n pairwise probability matrix where ``P[i,j]``
            is P(team_ids[i] beats team_ids[j]).

    Returns:
        CSV string with header ``ID,Pred`` and C(n,2) data rows.  Rows are
        emitted in ascending ``(lower ID, higher ID)`` order and ``Pred`` is
        the probability that the lower-ID team wins.

    Raises:
        ValueError: If fewer than two teams are given, if the matrix shape
            doesn't match the team count, or if ``team_ids`` contains
            duplicate IDs.
    """
    n = len(team_ids)
    if n < 2:
        msg = f"Need at least 2 teams to generate matchups, got {n}"
        raise ValueError(msg)
    if prob_matrix.shape != (n, n):
        msg = f"prob_matrix shape {prob_matrix.shape} != ({n}, {n})"
        raise ValueError(msg)

    # Build index: team_id → matrix row/column
    idx = {tid: i for i, tid in enumerate(team_ids)}
    # A duplicate ID would collapse in the dict above while still producing a
    # self-matchup row (e.g. ``2025_1101_1101``) below, so reject it up front.
    if len(idx) != n:
        msg = f"team_ids must be unique, got {n - len(idx)} duplicate(s)"
        raise ValueError(msg)

    buf = io.StringIO()
    buf.write("ID,Pred\n")

    # Sorted team IDs so pairs are emitted in Kaggle order (lower ID first);
    # the index map translates each ID back to its original matrix position.
    sorted_ids = sorted(team_ids)
    for low_id, high_id in combinations(sorted_ids, 2):
        pred = float(prob_matrix[idx[low_id], idx[high_id]])
        buf.write(f"{season}_{low_id}_{high_id},{pred}\n")

    return buf.getvalue()