Source code for ncaa_eval.evaluation.kaggle_export

"""Kaggle March Machine Learning Mania submission CSV export.

Generates a CSV string with ``ID,Pred`` columns covering all C(n,2)
pairwise matchups for men's D1 teams in a given season.  The ``ID``
column uses Kaggle format ``YYYY_TeamID1_TeamID2`` (lower ID first)
and ``Pred`` is the win probability for TeamID1.
"""

from __future__ import annotations

import io
from collections.abc import Sequence
from itertools import combinations

import numpy as np
import numpy.typing as npt

# Day number used as the canonical neutral-site context for the all-pairs
# probability export.  Kaggle March Machine Learning Mania uses day_num=136
# as the Round-of-64 start.
# NOTE(review): not referenced by format_kaggle_submission below; presumably
# consumed by callers that build the probability matrix — confirm.
KAGGLE_NEUTRAL_DAY_NUM: int = 136



[docs]
def format_kaggle_submission(
    season: int,
    team_ids: Sequence[int],
    prob_matrix: npt.NDArray[np.float64],
) -> str:
    """Format a probability matrix as a Kaggle submission CSV string.

    Rows are emitted for every unordered pair of teams, ordered by
    ascending ``(lower ID, higher ID)``.  Each row's ``ID`` is
    ``{season}_{lower ID}_{higher ID}`` and its ``Pred`` is read from
    ``prob_matrix`` at the row of the lower ID and the column of the
    higher ID, regardless of the order in which ``team_ids`` was given.

    Probability values are written as-is; they are not range-checked.

    Args:
        season: Tournament season year (e.g. 2025).
        team_ids: Team IDs corresponding to matrix rows/columns.  Must
            contain at least two IDs and no duplicates.
        prob_matrix: n×n pairwise probability matrix where ``P[i,j]``
            is P(team_ids[i] beats team_ids[j]).

    Returns:
        CSV string with header ``ID,Pred`` and C(n,2) data rows, each
        terminated by a newline.

    Raises:
        ValueError: If fewer than two team IDs are given, if the matrix
            shape doesn't match the team count, or if ``team_ids``
            contains a duplicate ID.
    """
    n = len(team_ids)
    if n < 2:
        msg = f"Need at least 2 teams to generate matchups, got {n}"
        raise ValueError(msg)
    if prob_matrix.shape != (n, n):
        msg = f"prob_matrix shape {prob_matrix.shape} != ({n}, {n})"
        raise ValueError(msg)

    # Build index: team_id → matrix row/column.  A repeated ID would make
    # this mapping ambiguous and would produce self-matchup rows such as
    # ``2025_1101_1101``, so reject it up front.
    idx: dict[int, int] = {}
    for position, tid in enumerate(team_ids):
        if tid in idx:
            msg = f"Duplicate team ID {tid} in team_ids"
            raise ValueError(msg)
        idx[tid] = position

    buf = io.StringIO()
    buf.write("ID,Pred\n")

    # combinations() over the sorted IDs yields (low, high) pairs in
    # ascending order, which is the ID layout Kaggle expects.
    buf.writelines(
        f"{season}_{low_id}_{high_id},"
        f"{float(prob_matrix[idx[low_id], idx[high_id]])}\n"
        for low_id, high_id in combinations(sorted(team_ids), 2)
    )

    return buf.getvalue()