-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
224 lines (184 loc) · 10 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import pandas as pd
from gather_data import fetch_player_game_logs, compile_team_data, get_team_stats, \
get_active_players, fetch_all_player_game_logs, get_top_90_players, get_roster, get_pitcher_roster, \
fetch_all_pitcher_game_logs_pybaseball, get_player_season_pa
from handle_edgecases import remove_non_numeric_rows, clean_and_format_date_statcast, modify_innings_pitched, determine_start_inning
from calculate_stats import calculate_hits_per_out, calculate_hits_per_pa, calculate_hits_per_out_statcast
import os
import pickle
# Working directory at launch; later steps combine it with Folder to look for cached files.
pathName = os.getcwd()
# How many of the top batters are printed as a sample and passed to the game-log fetch.
playersUsed = 50
# Data directory (relative to the working directory) with a trailing separator, so it
# can be prefixed directly onto a file name. os.path.join yields 'DataFiles\\' on
# Windows -- the value that used to be hard-coded -- and 'DataFiles/' elsewhere.
Folder = os.path.join('DataFiles', '')

# Get top batters for this year
top_90_player_names = get_top_90_players()
print(f"Found {len(top_90_player_names)} top players.")
print(top_90_player_names[:playersUsed])  # Sample: the first `playersUsed` names

# Main script
year = 2023

# Batting roster (for IDs); it is indexed by player name further down (roster[name]).
roster = get_roster(year)
# Inverse of `roster` (value -> key), used later to turn an ID back into a name.
roster_dict = {v: k for k, v in roster.items()}
print(f"Found {len(roster)} players in the roster.")
print(list(roster.items())[:playersUsed])  # Sample: the first `playersUsed` entries

# Remove players from top_90_player_names if they are not in the roster
top_90_player_names = [name for name in top_90_player_names if name in roster]
print(f"Found {len(top_90_player_names)} top players in the roster.")
print(top_90_player_names[:playersUsed])  # Sample: the first `playersUsed` names
# Get top batters' game logs, using the pickle in Folder as a cache.
filename = 'top_player_game_logs' + str(year) + '.pkl'

# IDs for each of the top players. Computed once, outside the cache check, because
# later steps need the IDs whether or not the game logs are already cached.
top_90_player_ids = [roster[name] for name in top_90_player_names if name in roster]
print(top_90_player_ids[:playersUsed])

# Relative path used for both writing and reading the cache.
game_logs_pickle = Folder + filename
# os.path.join supplies the platform's separator instead of a hard-coded backslash.
if not os.path.exists(os.path.join(pathName, game_logs_pickle)):
    # Cache miss: fetch logs for the first `playersUsed` players and store them
    # as a pickle (the cache) and as a CSV copy.
    top_player_game_logs = fetch_all_player_game_logs(top_90_player_ids[:playersUsed], year)
    top_player_game_logs.to_pickle(game_logs_pickle)
    top_player_game_logs.to_csv(f'{Folder}top_player_game_logs' + str(year) + '.csv')
else:
    # NOTE: unpickling can execute arbitrary code; only load caches this script wrote.
    top_player_game_logs = pd.read_pickle(game_logs_pickle)
# Pitching roster for the season, with its size and a short sample echoed to the console.
pitcher_roster = get_pitcher_roster(year)
pitcher_count = len(pitcher_roster)
print(f"Found {pitcher_count} pitchers in the roster.")
pitcher_sample = list(pitcher_roster.items())[:10]
print(pitcher_sample)  # the first 10 entries

# Date window that is embedded in the names of the cached data files.
start_date, end_date = f"{year}-04-01", f"{year}-10-01"
# Get top batter PA information, using the pickle in Folder as a cache.
filename = f'top_batters_pas_{start_date}_to_{end_date}.pkl'
# Relative path used for both writing and reading the cache.
pa_pickle = Folder + filename
# os.path.join supplies the platform's separator instead of a hard-coded backslash.
if not os.path.exists(os.path.join(pathName, pa_pickle)):
    # One frame of plate appearances per player, each tagged with the player's roster ID
    dataFrames = []
    for player_id in top_90_player_ids:
        player_name = roster_dict[player_id]  # Look the name up from the ID
        # The second element of the returned pair is not used by this script.
        pa_data, _ = get_player_season_pa(player_name, year)
        pa_data['bbref_id'] = player_id
        dataFrames.append(pd.DataFrame(pa_data))
    top_player_pas = pd.concat(dataFrames, ignore_index=True)
    top_player_pas.to_pickle(pa_pickle)
    top_player_pas.to_csv(f'{Folder}top_batters_pas_{start_date}_to_{end_date}.csv')
else:
    # NOTE: unpickling can execute arbitrary code; only load caches this script wrote.
    top_player_pas = pd.read_pickle(pa_pickle)
# Get pitcher data, using the pickle in Folder as a cache.
filename = f'all_pitchers_game_logs_{start_date}_to_{end_date}.pkl'
# Relative path used for both writing and reading the cache.
pitcher_pickle = Folder + filename
# os.path.join supplies the platform's separator instead of a hard-coded backslash.
if not os.path.exists(os.path.join(pathName, pitcher_pickle)):
    print('Pitching file does not exist. Creating')
    # Reuses the `pitcher_roster` fetched earlier in the script for this same
    # `year`, rather than requesting (and printing) the roster a second time.
    pitcher_data_game_logs = fetch_all_pitcher_game_logs_pybaseball(list(pitcher_roster.values()), year)
    print(pitcher_data_game_logs.head())

    # Save to pickle
    with open(pitcher_pickle, 'wb') as f:
        pickle.dump(pitcher_data_game_logs, f)

    # Save to CSV
    pitcher_data_game_logs.to_csv(f'{Folder}all_pitchers_game_logs_{start_date}_to_{end_date}.csv', index=False)
else:
    print('Loading pitcher game logs')
    # NOTE: unpickling can execute arbitrary code; only load caches this script wrote.
    pitcher_data_game_logs = pd.read_pickle(pitcher_pickle)
# Keep only the rows that have a value in 'events'.
print('Selecting eventful pitches')
pitcher_data_game_logs = pitcher_data_game_logs.dropna(subset=['events'])

print('Cleaning pitcher date')
pitcher_data_game_logs = clean_and_format_date_statcast(pitcher_data_game_logs)

print('Finding starting pitcher handedness')
# Team codes rewritten so the pitcher logs' 'Tm' values line up with the batter
# logs' 'Opp' values in the merge below; codes that are not listed pass through.
team_abbr_map = dict(
    TB='TBR',
    SD='SDP',
    KC='KCR',
    WSH='WSN',
    AZ='ARI',
    CWS='CHW',
    SF='SFG',
)

# One 'p_throws' value per (date, team): the first row of each group.
# NOTE(review): this is only the starter's hand if the rows of each group are
# ordered with the starter first -- confirm against the ordering of the pitcher data.
first_throw_per_game = pitcher_data_game_logs.groupby(['DateTime', 'Tm'])['p_throws'].first()
starting_pitcher_handedness = first_throw_per_game.reset_index()

# Translate the team codes before matching against the batter logs.
starting_pitcher_handedness['Tm'] = starting_pitcher_handedness['Tm'].map(
    lambda abbr: team_abbr_map.get(abbr, abbr)
)

# Attach the opposing pitcher's hand to every batter game. Both frames carry a
# 'Tm' column, so pandas suffixes them: 'Tm_x' (batter logs) and 'Tm_y' (pitcher side).
top_player_game_logs = top_player_game_logs.merge(
    starting_pitcher_handedness,
    how='left',
    left_on=['DateTime', 'Opp'],
    right_on=['DateTime', 'Tm'],
)
top_player_game_logs.info()
print(top_player_game_logs.loc[1:20])

print('Determining inning starts')
pitcher_data_game_logs = determine_start_inning(pitcher_data_game_logs)
# Put together the matrix for machine learning
# Calculate this for last game, 3 games, last 7 games, and for the season. For starting pitchers, "games" are when they started
# 1. The player's hits per game
# 2. The player's hit per plate appearance against the handedness of the opposing starting pitcher (LHP or RHP)
# 3. The opposing starting pitcher's hits given per out
# 4. The opposing team's bullpen hits given per out

# Initialize an empty list to store the dataframes for each player and game
dataframes = []

# Convert 'Tm_x' (the batter logs' own 'Tm' column, suffixed by the earlier merge) to string
top_player_game_logs['Tm_x'] = top_player_game_logs['Tm_x'].astype(str)
# Convert 'Unnamed: 5' column to string so it can be searched for '@'
top_player_game_logs['Unnamed: 5'] = top_player_game_logs['Unnamed: 5'].astype(str)

# Determine the home team: when 'Unnamed: 5' contains '@' the opponent is the home
# team, otherwise the player's own team is. Done column-wise rather than with a
# row-by-row apply, which also keeps the assignment valid for an empty frame.
is_away = top_player_game_logs['Unnamed: 5'].str.contains('@', regex=False)
top_player_game_logs['home_team'] = top_player_game_logs['Opp'].where(is_away, top_player_game_logs['Tm_x'])

# Read the stadium_hits.csv file
stadium_hits_df = pd.read_csv(f'{Folder}stadium_hits.csv')

# Merge the stadium_hits_df with the top_player_game_logs dataframe, keyed on the home team
top_player_game_logs = pd.merge(top_player_game_logs, stadium_hits_df, left_on='home_team', right_on='Baseball Reference Acronym', how='left')
# Build one single-row dataframe per (player, game) and collect them in `dataframes`.
# Look-back windows: the most recent 1, 3 and 7 earlier games, plus all earlier games.
games_list = [1, 3, 7, 'All']

for player_id in top_90_player_ids:
    player_game_logs = top_player_game_logs[top_player_game_logs['Player'] == player_id]
    # The player's plate appearances do not depend on which game is being
    # processed, so select them once per player rather than once per game.
    pa_data = top_player_pas[top_player_pas['bbref_id'] == player_id]

    for _, game in player_game_logs.iterrows():
        game_date = game['DateTime']
        opposing_team = game['Opp']

        # 1. The player's hits per game and hit streak
        # Only games strictly before this one, most recent first.
        hit_data = player_game_logs[player_game_logs['DateTime'] < game_date].sort_values('DateTime', ascending=False)

        # Count consecutive earlier games with at least one hit, newest first.
        # The loop variable must not be named `game`: that would overwrite the
        # row being processed, and the 'Hits', 'Stadium_Hits' and 'p_throws'
        # lookups below would then read an earlier game instead of this one.
        hit_streak = 0
        for _, prev_game in hit_data.iterrows():
            if prev_game['H'] > 0:
                hit_streak += 1
            else:
                break

        hits_per_game_stats = {}
        PAs_per_game_batter_stats = {}
        for games in games_list:
            # 'All' uses every earlier game; a number uses that many most recent ones.
            window = hit_data if games == 'All' else hit_data.head(games)
            hits_per_game_stats[f"{games}_games"] = window['H'].mean()
            PAs_per_game_batter_stats[f"{games}_games"] = pd.to_numeric(window['PA']).mean()

        # 2. The player's hit per plate appearance against the handedness of the opposing starting pitcher (LHP or RHP)
        # A filtered dataframe is never None, so "no data" has to be detected with .empty.
        if pa_data.empty:
            hits_per_pa_stats = {f"{games}_games_{hand}": None for games in games_list for hand in ['L', 'R']}
        else:
            hits_per_pa_stats = calculate_hits_per_pa(pa_data, game_date, games_list)

        # 3 & 4. The opposing pitching hits given per out
        hits_per_out_stats = calculate_hits_per_out_statcast(pitcher_data_game_logs, opposing_team, game_date, games_list)

        # Combine all the stats into a single dataframe for the player and game
        game_stats = {
            'Player': player_id,
            'Date': game_date,
            'Opposing_Team': opposing_team,
            **{f"Hits_Per_Game_{k}": v for k, v in hits_per_game_stats.items()},
            **{f"PAs_Per_Game_{k}": v for k, v in PAs_per_game_batter_stats.items()},  # Add PAs per game
            # Keep only the split that matches this game's opposing pitcher hand;
            # a hand with no matching key yields None.
            **{f"Hits_Per_PA_{games}_games": hits_per_pa_stats.get(f"{games}_games_{game['p_throws']}", None) for games in games_list},
            **hits_per_out_stats,
            'Hits': game['H'],
            'Stadium_Hits': game['H_stadium'],
            'Hit_Streak': hit_streak,  # Add hit streak
        }
        dataframes.append(pd.DataFrame(game_stats, index=[0]))
# Concatenate all the single-row dataframes into a single dataframe
final_dataframe = pd.concat(dataframes, ignore_index=True)

# Common path stem for both output files, so the CSV and the pickle cannot drift apart.
stats_base = f'{Folder}player_game_stats_{start_date}_to_{end_date}'

# Save the final dataframe as CSV and report the path that was actually written
final_dataframe.to_csv(f'{stats_base}.csv', index=False)
print(f"Final dataframe saved to '{stats_base}.csv'")

# Save the same dataframe as a pickle
with open(f'{stats_base}.pkl', 'wb') as f:
    pickle.dump(final_dataframe, f)