# Source code for ncaa_eval.ingest.connectors.espn

"""ESPN data source connector backed by the cbbpy scraper library.

The `EspnConnector` fetches current/recent season game data from ESPN
via cbbpy.  It does **not** provide team or season master data — those come
exclusively from the Kaggle connector.  A ``team_name_to_id`` mapping (built
from Kaggle's MTeams.csv) is required for translating ESPN team names into
Kaggle integer IDs.
"""

from __future__ import annotations

import datetime
import logging
from typing import Literal, cast

import cbbpy.mens_scraper as ms  # type: ignore[import-untyped]
import pandas as pd  # type: ignore[import-untyped]
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

from ncaa_eval.ingest.connectors.base import (
    Connector,
    DataFormatError,
)
from ncaa_eval.ingest.fuzzy import fuzzy_match_team
from ncaa_eval.ingest.schema import Game

logger = logging.getLogger(__name__)

# Expected columns from cbbpy get_team_schedule() output.  Validated up front
# in EspnConnector._parse_schedule_df; a missing column raises DataFormatError.
_SCHEDULE_COLUMNS = {"game_id", "game_day", "game_result", "team", "opponent"}


def _parse_game_result(result_str: str) -> tuple[int, int] | None:
    """Parse a cbbpy ``game_result`` string like ``'W 75-60'``.

    Returns ``(team_score, opponent_score)`` or ``None`` if unparseable.
    """
    if not isinstance(result_str, str) or not result_str.strip():
        return None
    parts = result_str.strip().split()
    if len(parts) != 2:
        return None
    scores = parts[1].split("-")
    if len(scores) != 2:
        return None
    try:
        return int(scores[0]), int(scores[1])
    except ValueError:
        return None


def _resolve_team_id(
    name: str,
    lower_map: dict[str, int],
    original_mapping: dict[str, int],
) -> int | None:
    """Resolve an ESPN team name to a Kaggle team ID.

    Resolution order: case-insensitive exact lookup first, then the
    centralized fuzzy matcher (:func:`~ncaa_eval.ingest.fuzzy.fuzzy_match_team`).

    Args:
        name: ESPN team name to resolve.
        lower_map: Pre-computed lowercase-keyed mapping (avoids per-call rebuild).
        original_mapping: Original mapping with original-case keys (used for fuzzy).

    Returns:
        Kaggle team ID if resolved, or ``None`` if no match was found.
    """
    # Fast path: case-insensitive exact match.
    try:
        return lower_map[name.lower()]
    except KeyError:
        pass

    # Slow path: fuzzy matching against the original-case keys.
    fuzzy_id = fuzzy_match_team(name, original_mapping)
    if fuzzy_id is None:
        logger.warning("espn: no team ID match for '%s'", name)
    return fuzzy_id


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=2, max=30),
    retry=retry_if_exception_type(Exception),
    reraise=True,
)
def _fetch_single_team_schedule(team_name: str, season: int) -> pd.DataFrame | None:
    """Fetch one team's schedule via cbbpy, retrying transient failures.

    Up to three attempts with exponential backoff; the final exception is
    re-raised to the caller.  NOTE(review): retries on *any* Exception, so
    non-transient errors are retried too — presumably intentional for a
    scraper, but worth confirming.

    Returns:
        The schedule DataFrame, or ``None`` when cbbpy returns something
        that is not a non-empty DataFrame.
    """
    schedule = ms.get_team_schedule(team_name, season)
    has_data = isinstance(schedule, pd.DataFrame) and not schedule.empty
    return schedule if has_data else None


class EspnConnector(Connector):
    """Connector for ESPN game data via the cbbpy scraper.

    Provides game results only; team and season master data come from the
    Kaggle connector (see module docstring).

    Args:
        team_name_to_id: Mapping from team name strings to Kaggle TeamIDs.
        season_day_zeros: Mapping from season year to DayZero date.
    """

    def __init__(
        self,
        team_name_to_id: dict[str, int],
        season_day_zeros: dict[int, datetime.date],
    ) -> None:
        self._team_name_to_id = team_name_to_id
        self._season_day_zeros = season_day_zeros
        # Pre-compute lowercase map once to avoid O(N×M) rebuilds during parsing.
        self._lower_team_map: dict[str, int] = {k.lower(): v for k, v in team_name_to_id.items()}

    # -- Games --------------------------------------------------------------

    def fetch_games(self, season: int) -> list[Game]:
        """Fetch game results for *season* from ESPN via cbbpy.

        Uses `get_team_schedule()` for each team in the mapping and
        deduplicates by ESPN game ID.

        Returns:
            Parsed :class:`Game` models; empty list when nothing was fetched.
        """
        df = self._fetch_schedule_df(season)
        if df is None or df.empty:
            return []
        return self._parse_schedule_df(df, season)

    # -- internal -----------------------------------------------------------

    def _fetch_schedule_df(self, season: int) -> pd.DataFrame | None:
        """Load a season schedule DataFrame from cbbpy via per-team schedules.

        `get_games_season` is intentionally avoided here: it fetches boxscores
        and play-by-play for every game (thousands of no-timeout HTTP requests)
        and returns a game-info schema that is incompatible with the schedule
        columns expected by `_parse_schedule_df`.  `get_team_schedule` returns
        the correct schedule-format schema (`team`, `opponent`, `game_result`,
        …) with one request per team.
        """
        return self._fetch_per_team(season)

    def _fetch_per_team(self, season: int) -> pd.DataFrame | None:
        """Fetch schedules for each team in the mapping and concatenate.

        Iterates through each team in the mapping, calls
        _fetch_single_team_schedule() with retry logic, collects successful
        DataFrames, concatenates them, and deduplicates by ESPN game_id to
        eliminate cross-team overlap.  Logs a per-season summary of how many
        teams succeeded/failed.

        Returns:
            Combined schedule DataFrame, or ``None`` when no team returned data.
        """
        frames: list[pd.DataFrame] = []
        total = len(self._team_name_to_id)
        failed_teams: list[str] = []
        for team_name in self._team_name_to_id:
            try:
                df = _fetch_single_team_schedule(team_name, season)
                if df is not None:
                    frames.append(df)
                else:
                    # Empty schedule returned (no exception, but no data).
                    failed_teams.append(team_name)
            except Exception:  # noqa: BLE001
                # Retries are exhausted inside _fetch_single_team_schedule;
                # a failure here is final for this team, so record and move on.
                logger.warning(
                    "espn: get_team_schedule('%s', %d) failed after retries",
                    team_name,
                    season,
                    exc_info=True,
                )
                failed_teams.append(team_name)
                continue
        success = len(frames)
        failed = len(failed_teams)
        # AC #5-6: Summary reporting (only the first five failures are named).
        if failed > 0:
            first_five = failed_teams[:5]
            logger.warning(
                "espn: fetched %d/%d teams for season %d (%d failed): %s",
                success,
                total,
                season,
                failed,
                first_five,
            )
        else:
            logger.info("espn: fetched %d/%d teams for season %d (%d failed)", success, total, season, failed)
        if not frames:
            return None
        combined = pd.concat(frames, ignore_index=True)
        # Deduplicate by ESPN game_id (each game appears in both teams' schedules).
        if "game_id" in combined.columns:
            combined = combined.drop_duplicates(subset=["game_id"])
        return combined

    def _parse_schedule_df(self, df: pd.DataFrame, season: int) -> list[Game]:
        """Convert a cbbpy schedule DataFrame into Game models.

        Iterates schedule rows, parses game results to extract scores,
        resolves both team names to Kaggle IDs via exact then fuzzy matching,
        determines winner/loser ordering, infers location, computes day_num
        from game date and day-zero, then constructs Game models.  Rows with
        unparseable results, unresolved teams, duplicate IDs, or tied scores
        are skipped with a log message.

        Raises:
            DataFormatError: If the DataFrame is missing any expected
                schedule column (see ``_SCHEDULE_COLUMNS``).
        """
        missing = _SCHEDULE_COLUMNS - set(df.columns)
        if missing:
            msg = f"espn: schedule DataFrame missing columns: {sorted(missing)}"
            raise DataFormatError(msg)
        day_zero = self._season_day_zeros.get(season)
        games: list[Game] = []
        seen_ids: set[str] = set()
        for row in df.itertuples(index=False):
            espn_game_id = str(row.game_id)
            game_id = f"espn_{espn_game_id}"
            # Belt-and-braces dedup: _fetch_per_team already drops duplicate
            # game_ids, but guard again in case this is called with raw data.
            if game_id in seen_ids:
                continue
            seen_ids.add(game_id)
            # Parse scores from game_result.
            parsed = _parse_game_result(str(getattr(row, "game_result", "")))
            if parsed is None:
                logger.debug("espn: skipping game %s — unparseable result", espn_game_id)
                continue
            team_score, opp_score = parsed
            # Resolve team IDs.
            team_name = str(row.team)
            opp_name = str(row.opponent)
            team_tid = _resolve_team_id(team_name, self._lower_team_map, self._team_name_to_id)
            opp_tid = _resolve_team_id(opp_name, self._lower_team_map, self._team_name_to_id)
            if team_tid is None or opp_tid is None:
                logger.warning("espn: skipping game %s — unresolved team(s)", espn_game_id)
                continue
            if team_tid == opp_tid:
                # Fuzzy matching can map two different names to the same ID.
                logger.warning("espn: skipping game %s — same team ID for both sides", espn_game_id)
                continue
            # Determine winner/loser ordering.
            if team_score > opp_score:
                w_team_id, l_team_id = team_tid, opp_tid
                w_score, l_score = team_score, opp_score
            elif opp_score > team_score:
                w_team_id, l_team_id = opp_tid, team_tid
                w_score, l_score = opp_score, team_score
            else:
                # Tie — shouldn't happen in basketball but skip gracefully.
                logger.warning("espn: skipping game %s — tied scores", espn_game_id)
                continue
            # Parse date and compute day_num (0 when date or day-zero unknown).
            game_date = self._parse_date(getattr(row, "game_day", None))
            day_num = 0
            if game_date is not None and day_zero is not None:
                day_num = (game_date - day_zero).days
            # Determine location (expressed relative to the winning team).
            loc = self._infer_loc(row, team_tid, w_team_id)
            games.append(
                Game(
                    game_id=game_id,
                    season=season,
                    day_num=day_num,
                    date=game_date,
                    w_team_id=w_team_id,
                    l_team_id=l_team_id,
                    w_score=w_score,
                    l_score=l_score,
                    loc=loc,
                ),
            )
        return games

    @staticmethod
    def _parse_date(value: object) -> datetime.date | None:
        """Best-effort date parsing from cbbpy game_day values.

        Returns ``None`` for missing/NaN values or anything pandas cannot
        interpret as a timestamp.
        """
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return None
        try:
            ts = pd.Timestamp(value)
            if pd.isna(ts):
                return None
            return cast("datetime.date", ts.date())
        except Exception:  # noqa: BLE001
            logger.debug("espn: could not parse date value %r for game", value)
            return None

    @staticmethod
    def _infer_loc(
        row: object,
        team_tid: int,
        w_team_id: int,
    ) -> Literal["H", "A", "N"]:
        """Infer game location from available ESPN context.

        Accepts any row-like object (named tuple from ``itertuples`` or
        ``pd.Series``).  The result is relative to the *winning* team
        (``"H"`` = winner was home), matching the winner-first Game fields.
        Falls back to ``"N"`` (neutral) when location cannot be determined.
        """
        # Some DataFrames include a 'home_away' or 'is_neutral' column.
        if hasattr(row, "is_neutral"):
            val = getattr(row, "is_neutral")
            if val is True or str(val).lower() in ("true", "1", "yes"):
                return "N"
        if hasattr(row, "home_away"):
            ha = str(getattr(row, "home_away")).lower()
            if ha == "home":
                # The row's team was home.
                return "H" if team_tid == w_team_id else "A"
            if ha == "away":
                return "A" if team_tid == w_team_id else "H"
        # Default to neutral when ambiguous.
        return "N"