"""ESPN data source connector backed by the cbbpy scraper library.
The `EspnConnector` fetches current/recent season game data from ESPN
via cbbpy. It does **not** provide team or season master data — those come
exclusively from the Kaggle connector. A ``team_name_to_id`` mapping (built
from Kaggle's MTeams.csv) is required for translating ESPN team names into
Kaggle integer IDs.
"""
from __future__ import annotations
import datetime
import logging
from typing import Literal, cast
import cbbpy.mens_scraper as ms # type: ignore[import-untyped]
import pandas as pd # type: ignore[import-untyped]
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
from ncaa_eval.ingest.connectors.base import (
Connector,
DataFormatError,
)
from ncaa_eval.ingest.fuzzy import fuzzy_match_team
from ncaa_eval.ingest.schema import Game
logger = logging.getLogger(__name__)
# Expected columns from cbbpy get_team_schedule() output.
_SCHEDULE_COLUMNS = {"game_id", "game_day", "game_result", "team", "opponent"}
def _parse_game_result(result_str: str) -> tuple[int, int] | None:
"""Parse a cbbpy ``game_result`` string like ``'W 75-60'``.
Returns ``(team_score, opponent_score)`` or ``None`` if unparseable.
"""
if not isinstance(result_str, str) or not result_str.strip():
return None
parts = result_str.strip().split()
if len(parts) != 2:
return None
scores = parts[1].split("-")
if len(scores) != 2:
return None
try:
return int(scores[0]), int(scores[1])
except ValueError:
return None
def _resolve_team_id(
name: str,
lower_map: dict[str, int],
original_mapping: dict[str, int],
) -> int | None:
"""Resolve an ESPN team name to a Kaggle team ID.
Tries exact match first, then falls back to fuzzy matching via
:func:`~ncaa_eval.ingest.fuzzy.fuzzy_match_team`.
Args:
name: ESPN team name to resolve.
lower_map: Pre-computed lowercase-keyed mapping (avoids per-call rebuild).
original_mapping: Original mapping with original-case keys (used for fuzzy).
Returns:
Kaggle team ID if resolved, or ``None`` if no match was found.
"""
# Exact match (case-insensitive).
exact = lower_map.get(name.lower())
if exact is not None:
return exact
# Fuzzy fallback via centralized utility.
result = fuzzy_match_team(name, original_mapping)
if result is not None:
return result
logger.warning("espn: no team ID match for '%s'", name)
return None
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=2, max=30),
    retry=retry_if_exception_type(Exception),
    reraise=True,
)
def _fetch_single_team_schedule(team_name: str, season: int) -> pd.DataFrame | None:
    """Fetch a single team's schedule with retry on transient failures.

    Returns the schedule DataFrame, or ``None`` when cbbpy yields
    nothing usable (a non-DataFrame result or an empty frame).
    """
    schedule = ms.get_team_schedule(team_name, season)
    if not isinstance(schedule, pd.DataFrame) or schedule.empty:
        return None
    return schedule
class EspnConnector(Connector):
    """Connector for ESPN game data via the cbbpy scraper.

    Provides game results only; team and season master data come
    exclusively from the Kaggle connector (see module docstring).

    Args:
        team_name_to_id: Mapping from team name strings to Kaggle TeamIDs.
        season_day_zeros: Mapping from season year to DayZero date.
    """

    def __init__(
        self,
        team_name_to_id: dict[str, int],
        season_day_zeros: dict[int, datetime.date],
    ) -> None:
        self._team_name_to_id = team_name_to_id
        self._season_day_zeros = season_day_zeros
        # Pre-compute lowercase map once to avoid O(N×M) rebuilds during parsing.
        self._lower_team_map: dict[str, int] = {k.lower(): v for k, v in team_name_to_id.items()}

    # -- Games --------------------------------------------------------------

    def fetch_games(self, season: int) -> list[Game]:
        """Fetch game results for *season* from ESPN via cbbpy.

        Uses ``get_team_schedule()`` for each team in the mapping and
        deduplicates by ESPN game ID.

        Returns:
            Parsed ``Game`` models; empty list when nothing was fetched.
        """
        df = self._fetch_schedule_df(season)
        if df is None or df.empty:
            return []
        return self._parse_schedule_df(df, season)

    # -- internal -----------------------------------------------------------

    def _fetch_schedule_df(self, season: int) -> pd.DataFrame | None:
        """Load a season schedule DataFrame from cbbpy via per-team schedules.

        ``get_games_season`` is intentionally avoided here: it fetches
        boxscores and play-by-play for every game (thousands of no-timeout
        HTTP requests) and returns a game-info schema that is incompatible
        with the schedule columns expected by ``_parse_schedule_df``.
        ``get_team_schedule`` returns the correct schedule-format schema
        (``team``, ``opponent``, ``game_result``, …) with one request per team.
        """
        return self._fetch_per_team(season)

    def _fetch_per_team(self, season: int) -> pd.DataFrame | None:
        """Fetch schedules for each team in the mapping and concatenate.

        Iterates through each team in the mapping, calls
        ``_fetch_single_team_schedule()`` (which retries transient
        failures), collects successful DataFrames, concatenates them, and
        deduplicates by ESPN game_id to eliminate cross-team overlap.

        Returns:
            The combined, deduplicated schedule DataFrame, or ``None``
            when no team yielded any data.
        """
        frames: list[pd.DataFrame] = []
        total = len(self._team_name_to_id)
        failed_teams: list[str] = []
        for team_name in self._team_name_to_id:
            # Keep the try body minimal: only the fetch can raise; list
            # bookkeeping happens outside the exception guard.
            try:
                df = _fetch_single_team_schedule(team_name, season)
            except Exception:  # noqa: BLE001
                logger.warning(
                    "espn: get_team_schedule('%s', %d) failed after retries",
                    team_name,
                    season,
                    exc_info=True,
                )
                failed_teams.append(team_name)
                continue
            if df is not None:
                frames.append(df)
            else:
                # Empty schedule returned (no exception, but no data).
                failed_teams.append(team_name)
        success = len(frames)
        failed = len(failed_teams)
        # AC #5-6: Summary reporting (first five failures shown to keep logs short).
        if failed > 0:
            logger.warning(
                "espn: fetched %d/%d teams for season %d (%d failed): %s",
                success,
                total,
                season,
                failed,
                failed_teams[:5],
            )
        else:
            logger.info("espn: fetched %d/%d teams for season %d (%d failed)", success, total, season, failed)
        if not frames:
            return None
        combined = pd.concat(frames, ignore_index=True)
        # Deduplicate by ESPN game_id (each game appears in both teams' schedules).
        if "game_id" in combined.columns:
            combined = combined.drop_duplicates(subset=["game_id"])
        return combined

    def _parse_schedule_df(self, df: pd.DataFrame, season: int) -> list[Game]:
        """Convert a cbbpy schedule DataFrame into Game models.

        Validates the required columns, then iterates rows and delegates
        per-row conversion (score parsing, team-ID resolution, ordering,
        date/day_num, location) to ``_row_to_game``.

        Raises:
            DataFormatError: If required schedule columns are missing.
        """
        missing = _SCHEDULE_COLUMNS - set(df.columns)
        if missing:
            msg = f"espn: schedule DataFrame missing columns: {sorted(missing)}"
            raise DataFormatError(msg)
        day_zero = self._season_day_zeros.get(season)
        games: list[Game] = []
        seen_ids: set[str] = set()
        for row in df.itertuples(index=False):
            espn_game_id = str(row.game_id)
            game_id = f"espn_{espn_game_id}"
            # Defensive dedup: _fetch_per_team already drops duplicate
            # game_ids, but this keeps parsing safe on any input frame.
            if game_id in seen_ids:
                continue
            seen_ids.add(game_id)
            game = self._row_to_game(row, season, day_zero, espn_game_id, game_id)
            if game is not None:
                games.append(game)
        return games

    def _row_to_game(
        self,
        row: object,
        season: int,
        day_zero: datetime.date | None,
        espn_game_id: str,
        game_id: str,
    ) -> Game | None:
        """Build one ``Game`` from a schedule row; ``None`` when the row is unusable."""
        # Parse scores from game_result.
        parsed = _parse_game_result(str(getattr(row, "game_result", "")))
        if parsed is None:
            logger.debug("espn: skipping game %s — unparseable result", espn_game_id)
            return None
        team_score, opp_score = parsed
        # Resolve team IDs (exact then fuzzy).
        team_tid = _resolve_team_id(str(row.team), self._lower_team_map, self._team_name_to_id)
        opp_tid = _resolve_team_id(str(row.opponent), self._lower_team_map, self._team_name_to_id)
        if team_tid is None or opp_tid is None:
            logger.warning("espn: skipping game %s — unresolved team(s)", espn_game_id)
            return None
        if team_tid == opp_tid:
            logger.warning("espn: skipping game %s — same team ID for both sides", espn_game_id)
            return None
        # Determine winner/loser ordering.
        if team_score > opp_score:
            w_team_id, l_team_id = team_tid, opp_tid
            w_score, l_score = team_score, opp_score
        elif opp_score > team_score:
            w_team_id, l_team_id = opp_tid, team_tid
            w_score, l_score = opp_score, team_score
        else:
            # Tie — shouldn't happen in basketball but skip gracefully.
            logger.warning("espn: skipping game %s — tied scores", espn_game_id)
            return None
        # Parse date and compute day_num (0 when either side is unknown).
        game_date = self._parse_date(getattr(row, "game_day", None))
        day_num = 0
        if game_date is not None and day_zero is not None:
            day_num = (game_date - day_zero).days
        # Determine location.
        loc = self._infer_loc(row, team_tid, w_team_id)
        return Game(
            game_id=game_id,
            season=season,
            day_num=day_num,
            date=game_date,
            w_team_id=w_team_id,
            l_team_id=l_team_id,
            w_score=w_score,
            l_score=l_score,
            loc=loc,
        )

    @staticmethod
    def _parse_date(value: object) -> datetime.date | None:
        """Best-effort date parsing from cbbpy game_day values.

        Returns ``None`` for missing/NaN values or anything pandas
        cannot coerce into a timestamp.
        """
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return None
        try:
            ts = pd.Timestamp(value)
            if pd.isna(ts):
                return None
            return cast("datetime.date", ts.date())
        except Exception:  # noqa: BLE001
            logger.debug("espn: could not parse date value %r for game", value)
            return None

    @staticmethod
    def _infer_loc(
        row: object,
        team_tid: int,
        w_team_id: int,
    ) -> Literal["H", "A", "N"]:
        """Infer game location from available ESPN context.

        Accepts any row-like object (named tuple from ``itertuples`` or
        ``pd.Series``). Falls back to ``"N"`` (neutral) when location
        cannot be determined. The returned code is from the *winner's*
        perspective, matching the Kaggle loc convention.
        """
        # Some DataFrames include a 'home_away' or 'is_neutral' column.
        # Single getattr-with-default (ruff B009) replaces hasattr+getattr;
        # a present-but-None value falls through exactly as before.
        neutral = getattr(row, "is_neutral", None)
        if neutral is True or (neutral is not None and str(neutral).lower() in ("true", "1", "yes")):
            return "N"
        home_away = getattr(row, "home_away", None)
        if home_away is not None:
            ha = str(home_away).lower()
            if ha == "home":
                # The row's team was home; map to the winner's perspective.
                return "H" if team_tid == w_team_id else "A"
            if ha == "away":
                return "A" if team_tid == w_team_id else "H"
        # Default to neutral when ambiguous.
        return "N"