"""ESPN data source connector backed by the cbbpy scraper library.
The `EspnConnector` fetches current/recent season game data from ESPN
via cbbpy. It does **not** provide team or season master data — those come
exclusively from the Kaggle connector. A ``team_name_to_id`` mapping (built
from Kaggle's MTeams.csv) is required for translating ESPN team names into
Kaggle integer IDs.
"""
from __future__ import annotations
import datetime
import logging
from typing import Literal, cast
import cbbpy.mens_scraper as ms # type: ignore[import-untyped]
import pandas as pd # type: ignore[import-untyped]
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
from ncaa_eval.ingest.connectors.base import (
Connector,
DataFormatError,
)
from ncaa_eval.ingest.fuzzy import fuzzy_match_team
from ncaa_eval.ingest.schema import Game
logger = logging.getLogger(__name__)
# Expected columns from cbbpy get_team_schedule() output.
_SCHEDULE_COLUMNS = {"game_id", "game_day", "game_result", "team", "opponent"}
def _parse_game_result(result_str: str) -> tuple[int, int] | None:
"""Parse a cbbpy ``game_result`` string like ``'W 75-60'``.
Returns ``(team_score, opponent_score)`` or ``None`` if unparseable.
"""
if not isinstance(result_str, str) or not result_str.strip():
return None
parts = result_str.strip().split()
if len(parts) != 2:
return None
scores = parts[1].split("-")
if len(scores) != 2:
return None
try:
return int(scores[0]), int(scores[1])
except ValueError:
return None
def _resolve_team_id(
name: str,
lower_map: dict[str, int],
original_mapping: dict[str, int],
) -> int | None:
"""Resolve an ESPN team name to a Kaggle team ID.
Tries exact match first, then falls back to fuzzy matching via
:func:`~ncaa_eval.ingest.fuzzy.fuzzy_match_team`.
Args:
name: ESPN team name to resolve.
lower_map: Pre-computed lowercase-keyed mapping (avoids per-call rebuild).
original_mapping: Original mapping with original-case keys (used for fuzzy).
Returns:
Kaggle team ID if resolved, or ``None`` if no match was found.
"""
# Exact match (case-insensitive).
exact = lower_map.get(name.lower())
if exact is not None:
return exact
# Fuzzy fallback via centralized utility.
result = fuzzy_match_team(name, original_mapping)
if result is not None:
return result
logger.warning("espn: no team ID match for '%s'", name)
return None
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=2, max=30),
    retry=retry_if_exception_type(Exception),
    reraise=True,
)
def _fetch_single_team_schedule(team_name: str, season: int) -> pd.DataFrame | None:
    """Fetch a single team's schedule with retry on transient failures.

    Returns the schedule DataFrame, or ``None`` when cbbpy yields
    nothing usable (a non-DataFrame result or an empty frame).
    """
    schedule = ms.get_team_schedule(team_name, season)
    if not isinstance(schedule, pd.DataFrame) or schedule.empty:
        return None
    return schedule
class EspnConnector(Connector):
    """Connector for ESPN game data via the cbbpy scraper.

    Provides game results only; team and season master data come
    exclusively from the Kaggle connector (see module docstring).

    Args:
        team_name_to_id: Mapping from team name strings to Kaggle TeamIDs.
        season_day_zeros: Mapping from season year to DayZero date.
    """

    def __init__(
        self,
        team_name_to_id: dict[str, int],
        season_day_zeros: dict[int, datetime.date],
    ) -> None:
        self._team_name_to_id = team_name_to_id
        self._season_day_zeros = season_day_zeros
        # Pre-compute lowercase map once to avoid O(N×M) rebuilds during parsing.
        self._lower_team_map: dict[str, int] = {k.lower(): v for k, v in team_name_to_id.items()}

    # -- Games --------------------------------------------------------------

    def fetch_games(self, season: int) -> list[Game]:
        """Fetch game results for *season* from ESPN via cbbpy.

        Uses ``get_team_schedule()`` for each team in the mapping and
        deduplicates by ESPN game ID.

        Returns:
            Parsed ``Game`` models; empty list when nothing was fetched.
        """
        df = self._fetch_schedule_df(season)
        if df is None or df.empty:
            return []
        return self._parse_schedule_df(df, season)

    # -- internal -----------------------------------------------------------

    def _fetch_schedule_df(self, season: int) -> pd.DataFrame | None:
        """Load a season schedule DataFrame from cbbpy via per-team schedules.

        ``get_games_season`` is intentionally avoided here: it fetches
        boxscores and play-by-play for every game (thousands of no-timeout
        HTTP requests) and returns a game-info schema that is incompatible
        with the schedule columns expected by ``_parse_schedule_df``.
        ``get_team_schedule`` returns the correct schedule-format schema
        (``team``, ``opponent``, ``game_result``, …) with one request per team.
        """
        return self._fetch_per_team(season)

    def _fetch_per_team(self, season: int) -> pd.DataFrame | None:
        """Fetch schedules for each team in the mapping and concatenate.

        Iterates through each team in the mapping, calls
        ``_fetch_single_team_schedule()`` (which retries transient
        failures), collects successful DataFrames, concatenates them, and
        deduplicates by ESPN game_id to eliminate cross-team overlap.

        Returns:
            The combined, deduplicated schedule DataFrame, or ``None``
            when no team yielded any data.
        """
        frames: list[pd.DataFrame] = []
        total = len(self._team_name_to_id)
        failed_teams: list[str] = []
        for team_name in self._team_name_to_id:
            # Keep the try body minimal: only the fetch can raise; list
            # bookkeeping happens outside the exception guard.
            try:
                df = _fetch_single_team_schedule(team_name, season)
            except Exception:  # noqa: BLE001
                logger.warning(
                    "espn: get_team_schedule('%s', %d) failed after retries",
                    team_name,
                    season,
                    exc_info=True,
                )
                failed_teams.append(team_name)
                continue
            if df is not None:
                frames.append(df)
            else:
                # Empty schedule returned (no exception, but no data).
                failed_teams.append(team_name)
        success = len(frames)
        failed = len(failed_teams)
        # AC #5-6: Summary reporting (first five failures shown to keep logs short).
        if failed > 0:
            logger.warning(
                "espn: fetched %d/%d teams for season %d (%d failed): %s",
                success,
                total,
                season,
                failed,
                failed_teams[:5],
            )
        else:
            logger.info("espn: fetched %d/%d teams for season %d (%d failed)", success, total, season, failed)
        if not frames:
            return None
        combined = pd.concat(frames, ignore_index=True)
        # Deduplicate by ESPN game_id (each game appears in both teams' schedules).
        if "game_id" in combined.columns:
            combined = combined.drop_duplicates(subset=["game_id"])
        return combined

    def _parse_schedule_df(self, df: pd.DataFrame, season: int) -> list[Game]:
        """Convert a cbbpy schedule DataFrame into Game models.

        Validates the required columns, then iterates rows and delegates
        per-row conversion (score parsing, team-ID resolution, ordering,
        date/day_num, location) to ``_row_to_game``.

        Raises:
            DataFormatError: If required schedule columns are missing.
        """
        missing = _SCHEDULE_COLUMNS - set(df.columns)
        if missing:
            msg = f"espn: schedule DataFrame missing columns: {sorted(missing)}"
            raise DataFormatError(msg)
        day_zero = self._season_day_zeros.get(season)
        games: list[Game] = []
        seen_ids: set[str] = set()
        for row in df.itertuples(index=False):
            espn_game_id = str(row.game_id)
            game_id = f"espn_{espn_game_id}"
            # Defensive dedup: _fetch_per_team already drops duplicate
            # game_ids, but this keeps parsing safe on any input frame.
            if game_id in seen_ids:
                continue
            seen_ids.add(game_id)
            game = self._row_to_game(row, season, day_zero, espn_game_id, game_id)
            if game is not None:
                games.append(game)
        return games

    def _row_to_game(
        self,
        row: object,
        season: int,
        day_zero: datetime.date | None,
        espn_game_id: str,
        game_id: str,
    ) -> Game | None:
        """Build one ``Game`` from a schedule row; ``None`` when the row is unusable."""
        # Parse scores from game_result.
        parsed = _parse_game_result(str(getattr(row, "game_result", "")))
        if parsed is None:
            logger.debug("espn: skipping game %s — unparseable result", espn_game_id)
            return None
        team_score, opp_score = parsed
        # Resolve team IDs (exact then fuzzy).
        team_tid = _resolve_team_id(str(row.team), self._lower_team_map, self._team_name_to_id)
        opp_tid = _resolve_team_id(str(row.opponent), self._lower_team_map, self._team_name_to_id)
        if team_tid is None or opp_tid is None:
            logger.warning("espn: skipping game %s — unresolved team(s)", espn_game_id)
            return None
        if team_tid == opp_tid:
            logger.warning("espn: skipping game %s — same team ID for both sides", espn_game_id)
            return None
        # Determine winner/loser ordering.
        if team_score > opp_score:
            w_team_id, l_team_id = team_tid, opp_tid
            w_score, l_score = team_score, opp_score
        elif opp_score > team_score:
            w_team_id, l_team_id = opp_tid, team_tid
            w_score, l_score = opp_score, team_score
        else:
            # Tie — shouldn't happen in basketball but skip gracefully.
            logger.warning("espn: skipping game %s — tied scores", espn_game_id)
            return None
        # Parse date and compute day_num (0 when either side is unknown).
        game_date = self._parse_date(getattr(row, "game_day", None))
        day_num = 0
        if game_date is not None and day_zero is not None:
            day_num = (game_date - day_zero).days
        # Determine location.
        loc = self._infer_loc(row, team_tid, w_team_id)
        return Game(
            game_id=game_id,
            season=season,
            day_num=day_num,
            date=game_date,
            w_team_id=w_team_id,
            l_team_id=l_team_id,
            w_score=w_score,
            l_score=l_score,
            loc=loc,
        )

    @staticmethod
    def _parse_date(value: object) -> datetime.date | None:
        """Best-effort date parsing from cbbpy game_day values.

        Returns ``None`` for missing/NaN values or anything pandas
        cannot coerce into a timestamp.
        """
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return None
        try:
            ts = pd.Timestamp(value)
            if pd.isna(ts):
                return None
            return cast("datetime.date", ts.date())
        except Exception:  # noqa: BLE001
            logger.debug("espn: could not parse date value %r for game", value)
            return None

    @staticmethod
    def _infer_loc(
        row: object,
        team_tid: int,
        w_team_id: int,
    ) -> Literal["H", "A", "N"]:
        """Infer game location from available ESPN context.

        Accepts any row-like object (named tuple from ``itertuples`` or
        ``pd.Series``). Falls back to ``"N"`` (neutral) when location
        cannot be determined. The returned code is from the *winner's*
        perspective, matching the Kaggle loc convention.
        """
        # Some DataFrames include a 'home_away' or 'is_neutral' column.
        # Single getattr-with-default (ruff B009) replaces hasattr+getattr;
        # a present-but-None value falls through exactly as before.
        neutral = getattr(row, "is_neutral", None)
        if neutral is True or (neutral is not None and str(neutral).lower() in ("true", "1", "yes")):
            return "N"
        home_away = getattr(row, "home_away", None)
        if home_away is not None:
            ha = str(home_away).lower()
            if ha == "home":
                # The row's team was home; map to the winner's perspective.
                return "H" if team_tid == w_team_id else "A"
            if ha == "away":
                return "A" if team_tid == w_team_id else "H"
        # Default to neutral when ambiguous.
        return "N"