def games(game_ids: list[int], *, timeout: float = 10.0) -> dict[int, pl.DataFrame]:
    """
    Retrieves live game data for a collection of live game ID's in parallel

    @params:
    - game_ids: collection of game IDs for which to retrieve live data
    - timeout: seconds to wait on each HTTP request before giving up

    @returns:
    - dict[int, pl.DataFrame]: dictionary of game_id to game dataframe; a game
      whose request or conversion fails is reported on stdout and left out
    """

    def _game(game_id: int) -> pl.DataFrame:
        live_game_url = GAME_ENDPOINT_ROOT.format(game_id=game_id)
        # Without a timeout a stalled connection would pin a worker thread,
        # and therefore this whole call, indefinitely.
        response = requests.get(live_game_url, timeout=timeout)
        if response.status_code != 200:
            raise BadResponseCode(url=live_game_url, bad_code=response.status_code)
        return _game_to_df(response.json())

    game_data: dict[int, pl.DataFrame] = {}
    if not game_ids:
        return game_data

    with ThreadPoolExecutor() as executor:
        # Remember which game each future belongs to so that a failure can be
        # attributed to a game even though no result comes back for it.
        future_to_id = {executor.submit(_game, gid): gid for gid in game_ids}
        for future in as_completed(future_to_id):
            game_id = future_to_id[future]
            try:
                game_data[game_id] = future.result()
            except Exception as e:
                # Best effort by design: one bad game must not sink the batch.
                print(f"An error occurred while processing game {game_id}: {e}")
    return game_data


def _game_to_df(game_data: dict) -> pl.DataFrame:
    """
    Converts a dictionary of game_data into polars data frame format.

    @params:
    - game_data: game information in dictionary format

    @returns:
    - pl.DataFrame: dataframe containing data from all of the plays in game_data,
      each row tagged with the game's id and official date
    """
    game_id = game_data.get("gamePk")
    game_date = game_data.get("gameData", {}).get("datetime", {}).get("officialDate")
    all_plays = game_data.get("liveData", {}).get("plays", {}).get("allPlays", [])

    rows = []
    for play in all_plays:
        for event_data in _extract_play_data(play, game_data):
            rows.append({**event_data, "game_id": game_id, "game_date": game_date})

    return pl.DataFrame(rows)


def _extract_play_data(play: dict, game_data: dict) -> list[dict]:
    """
    Extracts data from a single play.

    @params:
    - play: dictionary containing play data
    - game_data: full game data dictionary

    @returns:
    - list[dict]: list of extracted event data dictionaries, one per pitch /
      called event, plus one per event whose count shows four balls
    """
    play_events = play.get("playEvents", [])

    # _get_team_data resolves teams through play["game"]["gameData"]. Attach
    # the feed under that key (on a shallow copy, keeping any "game" entry the
    # play already has) so the team columns are populated instead of None.
    play_with_game = {"game": game_data, **play} if game_data else play

    ab_data = []
    for n, event in enumerate(play_events):
        if event.get("isPitch") or "call" in event.get("details", {}):
            ab_data.append(_extract_event_data(play_with_game, event, n, play_events))
        elif event.get("count", {}).get("balls") == 4:
            ab_data.append(_extract_walk_data(play_with_game, event, game_data))

    return ab_data
def _extract_event_data(play: dict, event: dict, n: int, play_events: list) -> dict:
    """
    Builds the flat record for one event of a play.

    @params:
    - play: dictionary containing play data
    - event: dictionary containing event data
    - n: index of the event in the play
    - play_events: list of all events in the play

    @returns:
    - dict: extracted event data
    """
    swing_codes = ["X", "F", "S", "D", "E", "T", "W"]
    whiff_codes = ["S", "T", "W"]

    matchup = play.get("matchup", {})
    batter = matchup.get("batter", {})
    pitcher = matchup.get("pitcher", {})
    details = event.get("details", {})
    pitch_type = details.get("type", {})
    code = details.get("code")

    # The swing / whiff flags are only defined when the event carries a code.
    if code:
        is_swing = code in swing_codes
        is_whiff = code in whiff_codes
    else:
        is_swing = np.nan
        is_whiff = np.nan

    # The row is assembled section by section; insertion order fixes the
    # column order of the resulting frame.
    row = {
        "batter_id": batter.get("id"),
        "batter_name": batter.get("fullName"),
        "batter_hand": matchup.get("batSide", {}).get("code"),
        "pitcher_id": pitcher.get("id"),
        "pitcher_name": pitcher.get("fullName"),
        "pitcher_hand": matchup.get("pitchHand", {}).get("code"),
    }
    row.update(_get_team_data(play.get("about", {}), play))
    row.update(_get_count_data(play, event, n))
    row.update(
        {
            "play_description": details.get("description"),
            "play_code": code,
            "in_play": details.get("isInPlay"),
            "is_strike": details.get("isStrike"),
            "is_swing": is_swing,
            "is_whiff": is_whiff,
            "is_ball": details.get("isBall"),
            "is_review": details.get("hasReview"),
            "pitch_type": pitch_type.get("code"),
            "pitch_description": pitch_type.get("description"),
        }
    )
    row.update(_get_pitch_data(event.get("pitchData", {})))
    row.update(_get_hit_data(event.get("hitData", {})))
    row.update(
        {
            "index_play": event.get("index"),
            "play_id": event.get("playId"),
            "start_time": event.get("startTime"),
            "end_time": event.get("endTime"),
            "is_pitch": event.get("isPitch"),
            "type_type": event.get("type"),
        }
    )
    # Only the play's last event receives the at-bat result.
    row.update(_get_ab_result(play, n, len(play_events) - 1))
    return row
def _extract_walk_data(play: dict, event: dict, game_data: dict) -> dict:
    """
    Extracts data for a walk event.

    @params:
    - play: dictionary containing play data
    - event: dictionary containing event data
    - game_data: full game data dictionary, used to resolve the team columns

    @returns:
    - dict: extracted walk event data; every pitch / hit / result column that
      does not apply to a walk row is filled with np.nan
    """
    matchup = play.get("matchup", {})
    about = play.get("about", {})
    count = event.get("count", {})
    result = play.get("result", {})
    batter = matchup.get("batter", {})
    pitcher = matchup.get("pitcher", {})

    # _get_team_data reads teams from play["game"]["gameData"]. game_data used
    # to be ignored here, which left the team columns None; expose it under
    # that key on a shallow copy (an existing play["game"] entry wins).
    team_source = {"game": game_data, **play} if game_data else play

    count_keys = ["strikes", "balls", "outs"]
    base_data = {
        "batter_id": batter.get("id"),
        "batter_name": batter.get("fullName"),
        "batter_hand": matchup.get("batSide", {}).get("code"),
        "pitcher_id": pitcher.get("id"),
        "pitcher_name": pitcher.get("fullName"),
        "pitcher_hand": matchup.get("pitchHand", {}).get("code"),
        **_get_team_data(about, team_source),
        # The event's count fills both the before and the after columns.
        **{k: count.get(k) for k in count_keys},
        **{f"{k}_after": count.get(k) for k in count_keys},
        "index_play": event.get("index"),
        "play_id": event.get("playId"),
        "start_time": event.get("startTime"),
        "end_time": event.get("endTime"),
        "is_pitch": event.get("isPitch"),
        "type_type": event.get("type"),
        "event": result.get("event"),
        "event_type": result.get("eventType"),
    }

    # Fill remaining fields with np.nan
    remaining_fields = [
        "play_description",
        "play_code",
        "in_play",
        "is_strike",
        "is_swing",
        "is_whiff",
        "is_ball",
        "is_review",
        "pitch_type",
        "pitch_description",
        "ab_number",
        "start_speed",
        "end_speed",
        "sz_top",
        "sz_bot",
        "x",
        "y",
        "ax",
        "ay",
        "az",
        "pfxx",
        "pfxz",
        "px",
        "pz",
        "vx0",
        "vy0",
        "vz0",
        "x0",
        "y0",
        "z0",
        "zone",
        "type_confidence",
        "plate_time",
        "extension",
        "spin_rate",
        "spin_direction",
        "vb",
        "ivb",
        "hb",
        "launch_speed",
        "launch_angle",
        "launch_distance",
        "launch_location",
        "trajectory",
        "hardness",
        "hit_x",
        "hit_y",
        "type_ab",
        "rbi",
        "away_score",
        "home_score",
        "is_out",
    ]
    base_data.update(dict.fromkeys(remaining_fields, np.nan))

    return base_data
_get_team_data(about: dict, play: dict) -> dict: + """ + Extracts team data for batter and pitcher. + + @params: + - about: dictionary containing game state information + - play: dictionary containing play data + + @returns: + - dict: team data for batter and pitcher + """ + game_data = play.get("game", {}).get("gameData", {}) + is_top_inning = about.get("isTopInning") + + batter_team = "away" if is_top_inning else "home" + pitcher_team = "home" if is_top_inning else "away" + + return { + "batter_team": game_data.get("teams", {}) + .get(batter_team, {}) + .get("abbreviation"), + "batter_team_id": game_data.get("teams", {}).get(batter_team, {}).get("id"), + "pitcher_team": game_data.get("teams", {}) + .get(pitcher_team, {}) + .get("abbreviation"), + "pitcher_team_id": game_data.get("teams", {}).get(pitcher_team, {}).get("id"), + } + + +def _get_count_data(play: dict, event: dict, n: int) -> dict: + """ + Extracts count data for an event. + + @params: + - play: dictionary containing play data + - event: dictionary containing event data + - n: index of the event in the play + + @returns: + - dict: count data before and after the event + """ + play_events = play.get("playEvents", []) + count = event.get("count", {}) + + if n == 0: + prev_count = {"strikes": 0, "balls": 0, "outs": count.get("outs")} + else: + prev_count = play_events[n - 1].get("count", {}) + + return { + "strikes": prev_count.get("strikes"), + "balls": prev_count.get("balls"), + "outs": prev_count.get("outs"), + "strikes_after": count.get("strikes"), + "balls_after": count.get("balls"), + "outs_after": count.get("outs"), + } + + +def _get_pitch_data(pitch_data: dict) -> dict: + """ + Extracts pitch data from the pitch_data dictionary. 
+ + @params: + - pitch_data: dictionary containing pitch data + + @returns: + - dict: extracted pitch data + """ + coordinates = pitch_data.get("coordinates", {}) + breaks = pitch_data.get("breaks", {}) + + return { + "start_speed": pitch_data.get("startSpeed"), + "end_speed": pitch_data.get("endSpeed"), + "sz_top": pitch_data.get("strikeZoneTop"), + "sz_bot": pitch_data.get("strikeZoneBottom"), + "x": coordinates.get("x"), + "y": coordinates.get("y"), + "ax": coordinates.get("aX"), + "ay": coordinates.get("aY"), + "az": coordinates.get("aZ"), + "pfxx": coordinates.get("pfxX"), + "pfxz": coordinates.get("pfxZ"), + "px": coordinates.get("pX"), + "pz": coordinates.get("pZ"), + "vx0": coordinates.get("vX0"), + "vy0": coordinates.get("vY0"), + "vz0": coordinates.get("vZ0"), + "x0": coordinates.get("x0"), + "y0": coordinates.get("y0"), + "z0": coordinates.get("z0"), + "zone": pitch_data.get("zone"), + "type_confidence": pitch_data.get("typeConfidence"), + "plate_time": pitch_data.get("plateTime"), + "extension": pitch_data.get("extension"), + "spin_rate": breaks.get("spinRate"), + "spin_direction": breaks.get("spinDirection"), + "vb": breaks.get("breakVertical"), + "ivb": breaks.get("breakVerticalInduced"), + "hb": breaks.get("breakHorizontal"), + } + + +def _get_hit_data(hit_data: dict) -> dict: + """ + Extracts hit data from the hit_data dictionary. 
+ + @params: + - hit_data: dictionary containing hit data + + @returns: + - dict: extracted hit data + """ + coordinates = hit_data.get("coordinates", {}) + + return { + "launch_speed": hit_data.get("launchSpeed"), + "launch_angle": hit_data.get("launchAngle"), + "launch_distance": hit_data.get("totalDistance"), + "launch_location": hit_data.get("location"), + "trajectory": hit_data.get("trajectory"), + "hardness": hit_data.get("hardness"), + "hit_x": coordinates.get("coordX"), + "hit_y": coordinates.get("coordY"), + } + + +def _get_ab_result(play: dict, n: int, last_event_index: int) -> dict: + """ + Extracts at-bat result data. + + @params: + - play: dictionary containing play data + - n: index of the current event + - last_event_index: index of the last event in the play + + @returns: + - dict: at-bat result data + """ + result = play.get("result", {}) + + if n == last_event_index: + return { + "type_ab": result.get("type"), + "event": result.get("event"), + "event_type": result.get("eventType"), + "rbi": result.get("rbi"), + "away_score": result.get("awayScore"), + "home_score": result.get("homeScore"), + "is_out": result.get("isOut"), + } + return { + "type_ab": np.nan, + "event": np.nan, + "event_type": np.nan, + "rbi": np.nan, + "away_score": np.nan, + "home_score": np.nan, + "is_out": np.nan, + } diff --git a/pybaseball_live/schedule.py b/pybaseball_live/schedule.py index 13bf79b..2a20a93 100644 --- a/pybaseball_live/schedule.py +++ b/pybaseball_live/schedule.py @@ -6,7 +6,7 @@ from .utils import ENDPOINT_URL from .exceptions import BadResponseCode -__all__ = ["schedule"] +__all__ = ["schedule", "schedule_range"] SCHEDULE_URL = ENDPOINT_URL.format(endpoint="schedule") @@ -20,7 +20,7 @@ def schedule( years: list[int] = [datetime.datetime.now().year], sport_ids: list[int] = [1], - game_types: list[int] = ["R"], + game_types: list[str] = ["R"], ) -> Optional[pl.DataFrame]: """ Retrieves the schedule of baseball games based on the specified parameters. 
def schedule_range(
    sport_ids: list[int],
    game_types: list[str],
    start_dt: Optional[datetime.date] = None,
    end_dt: Optional[datetime.date] = None,
) -> Optional[pl.DataFrame]:
    """
    Retrieves the schedule restricted to games dated between start_dt and
    end_dt (both inclusive).

    @params:
    - sport_ids: sport IDs forwarded to schedule()
    - game_types: game type codes forwarded to schedule()
    - start_dt: first date to keep; defaults to today's date at call time
    - end_dt: last date to keep; defaults to today's date at call time

    @returns:
    - Optional[pl.DataFrame]: the filtered schedule, or None when schedule()
      returns None
    """
    # Resolve the defaults per call. A `datetime.datetime.now()` expression in
    # the signature is evaluated once, when the module is imported, so a
    # long-running process would keep using the day it started on.
    today = datetime.datetime.now().date()
    if start_dt is None:
        start_dt = today
    if end_dt is None:
        end_dt = today

    # schedule() works per year: request every year the range touches, then
    # trim the result down to the exact dates.
    years = list(range(start_dt.year, end_dt.year + 1))
    if (s := schedule(years=years, game_types=game_types, sport_ids=sport_ids)) is None:
        return None

    game_day = pl.col("date").dt.date()
    return s.filter((game_day >= start_dt) & (game_day <= end_dt))
import pytest
import polars as pl
import numpy as np
from unittest.mock import patch, MagicMock
from pybaseball_live.game import (
    games,
    _game_to_df,
    _extract_play_data,
    _extract_event_data,
    _extract_walk_data,
    _get_team_data,
    _get_count_data,
    _get_pitch_data,
    _get_hit_data,
    _get_ab_result,
)


@pytest.fixture
def sample_game_data():
    """Game payload with a single top-of-inning play made of two pitch events."""

    def pitch_event(index, call_code, description, strikes, start_speed):
        # Both sample pitches share one ball and no outs; only the strike
        # count, call and speed differ.
        return {
            "details": {
                "call": {"code": call_code},
                "description": description,
            },
            "count": {"balls": 1, "strikes": strikes, "outs": 0},
            "pitchData": {"startSpeed": start_speed},
            "index": index,
            "playId": f"play{index + 1}",
            "isPitch": True,
        }

    teams = {
        "away": {"abbreviation": "NYY", "id": 1},
        "home": {"abbreviation": "BOS", "id": 2},
    }
    matchup = {
        "batter": {"id": 123, "fullName": "John Doe"},
        "pitcher": {"id": 456, "fullName": "Jane Smith"},
        "batSide": {"code": "R"},
        "pitchHand": {"code": "L"},
    }
    play = {
        "result": {
            "type": "atBat",
            "event": "Single",
            "eventType": "single",
        },
        "about": {"isTopInning": True},
        "matchup": matchup,
        "playEvents": [
            pitch_event(0, "B", "Ball", 0, 90.5),
            pitch_event(1, "S", "Strike", 1, 92.0),
        ],
    }

    return {
        "gamePk": 12345,
        "gameData": {
            "datetime": {"officialDate": "2023-07-01"},
            "teams": teams,
        },
        "liveData": {"plays": {"allPlays": [play]}},
    }
def test_games():
    """A 200 response yields exactly one frame, keyed by the requested game id."""
    with patch("pybaseball_live.game.requests.get") as mock_get:
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {"gamePk": 12345}
        mock_get.return_value = mock_response

        result = games([12345])

    mock_get.assert_called_once()
    assert isinstance(result, dict)
    assert set(result) == {12345}
    assert isinstance(result[12345], pl.DataFrame)


def test_games_skips_failed_requests():
    """A non-200 response is reported and the game is left out of the result."""
    with patch("pybaseball_live.game.requests.get") as mock_get:
        mock_get.return_value = MagicMock(status_code=500)

        result = games([12345])

    assert result == {}


def test_game_to_df(sample_game_data):
    """Every extracted event becomes one row tagged with game id and date."""
    df = _game_to_df(sample_game_data)
    assert isinstance(df, pl.DataFrame)
    assert df.height == 2
    assert df["game_id"].to_list() == [12345, 12345]
    assert df["game_date"].to_list() == ["2023-07-01", "2023-07-01"]


def test_extract_play_data(sample_game_data):
    """Both pitch events of the sample play are extracted, in order."""
    play = sample_game_data["liveData"]["plays"]["allPlays"][0]
    result = _extract_play_data(play, sample_game_data)
    assert isinstance(result, list)
    assert len(result) == 2  # Two pitch events in the sample data
    assert [row["play_id"] for row in result] == ["play1", "play2"]


def test_extract_event_data(sample_game_data):
    """The first pitch carries matchup, count and pitch values from the fixture."""
    play = sample_game_data["liveData"]["plays"]["allPlays"][0]
    event = play["playEvents"][0]
    result = _extract_event_data(play, event, 0, play["playEvents"])
    assert isinstance(result, dict)
    assert result["batter_id"] == 123
    assert result["batter_name"] == "John Doe"
    assert result["batter_hand"] == "R"
    assert result["pitcher_id"] == 456
    assert result["pitcher_hand"] == "L"
    # First event of the play: the count before it is empty.
    assert (result["balls"], result["strikes"]) == (0, 0)
    assert (result["balls_after"], result["strikes_after"]) == (1, 0)
    assert result["play_description"] == "Ball"
    assert result["start_speed"] == 90.5
    assert result["is_pitch"] is True
    # Not the last event, so no at-bat result is attached yet.
    assert np.isnan(result["type_ab"])


def test_extract_walk_data(sample_game_data):
    """A four-ball event keeps matchup/result data and blanks the pitch columns."""
    play = sample_game_data["liveData"]["plays"]["allPlays"][0]
    # Simulate a walk on a copy of the first event.
    event = {**play["playEvents"][0], "count": {"balls": 4, "strikes": 0, "outs": 0}}
    result = _extract_walk_data(play, event, sample_game_data)
    assert isinstance(result, dict)
    assert result["batter_id"] == 123
    assert result["pitcher_id"] == 456
    assert result["balls"] == 4
    assert result["balls_after"] == 4
    assert result["event"] == "Single"
    assert result["event_type"] == "single"
    assert np.isnan(result["start_speed"])
    assert np.isnan(result["is_out"])


def test_get_team_data(sample_game_data):
    """Batting/pitching sides follow the inning half."""
    # _get_team_data looks the teams up under play["game"]["gameData"].
    play = {
        **sample_game_data["liveData"]["plays"]["allPlays"][0],
        "game": sample_game_data,
    }
    top = _get_team_data({"isTopInning": True}, play)
    assert top == {
        "batter_team": "NYY",
        "batter_team_id": 1,
        "pitcher_team": "BOS",
        "pitcher_team_id": 2,
    }
    bottom = _get_team_data({"isTopInning": False}, play)
    assert bottom["batter_team"] == "BOS"
    assert bottom["pitcher_team"] == "NYY"


def test_get_count_data(sample_game_data):
    """The count before an event comes from the previous event (or 0-0 at start)."""
    play = sample_game_data["liveData"]["plays"]["allPlays"][0]
    first, second = play["playEvents"]

    assert _get_count_data(play, first, 0) == {
        "strikes": 0,
        "balls": 0,
        "outs": 0,
        "strikes_after": 0,
        "balls_after": 1,
        "outs_after": 0,
    }
    assert _get_count_data(play, second, 1) == {
        "strikes": 0,
        "balls": 1,
        "outs": 0,
        "strikes_after": 1,
        "balls_after": 1,
        "outs_after": 0,
    }
def test_get_pitch_data():
    """Known keys are mapped to their columns; absent keys come back as None."""
    pitch_data = {
        "startSpeed": 90.5,
        "endSpeed": 83.2,
        "strikeZoneTop": 3.5,
        "strikeZoneBottom": 1.5,
        "coordinates": {"x": 0.5, "y": 2.0},
        "breaks": {"spinRate": 2500},
    }
    result = _get_pitch_data(pitch_data)
    assert isinstance(result, dict)
    assert result["start_speed"] == 90.5
    assert result["end_speed"] == 83.2
    assert result["sz_top"] == 3.5
    assert result["sz_bot"] == 1.5
    assert result["x"] == 0.5
    assert result["y"] == 2.0
    assert result["spin_rate"] == 2500
    # Not present in the input.
    assert result["zone"] is None
    assert result["hb"] is None


def test_get_hit_data():
    """Hit metrics and landing coordinates are mapped to their columns."""
    hit_data = {
        "launchSpeed": 95.0,
        "launchAngle": 25.0,
        "totalDistance": 350,
        "location": "7",
        "coordinates": {"coordX": 125.5, "coordY": 200.3},
    }
    result = _get_hit_data(hit_data)
    assert result == {
        "launch_speed": 95.0,
        "launch_angle": 25.0,
        "launch_distance": 350,
        "launch_location": "7",
        "trajectory": None,
        "hardness": None,
        "hit_x": 125.5,
        "hit_y": 200.3,
    }


def test_get_ab_result(sample_game_data):
    """Only the last event of a play receives the at-bat result."""
    play = sample_game_data["liveData"]["plays"]["allPlays"][0]
    result_last = _get_ab_result(play, 1, 1)  # Last event
    result_not_last = _get_ab_result(play, 0, 1)  # Not last event

    assert result_last["type_ab"] == "atBat"
    assert result_last["event"] == "Single"
    assert result_last["event_type"] == "single"
    # The sample result carries no rbi entry.
    assert result_last["rbi"] is None

    assert set(result_not_last) == set(result_last)
    assert all(np.isnan(value) for value in result_not_last.values())