Skip to content

Latest commit

 

History

History
3136 lines (2887 loc) · 64 KB

Google Case Study - Ryder Cup (3).md

File metadata and controls

3136 lines (2887 loc) · 64 KB
#Web Scraping ESPN Public Data
import pandas
import pandas as pd
import requests
pd.set_option('display.max_columns', None) #so we can see all columns in a wide dataframe
import time
import numpy as np
url_2023_season = 'https://site.web.api.espn.com/apis/common/v3/sports/golf/pga/statistics/byathlete?region=us&lang=en&contentorigin=espn&isqualified=true&page=1&limit=50&sort=general.amount%3Adesc&season=2023'
url_2022_season = 'https://site.web.api.espn.com/apis/common/v3/sports/golf/pga/statistics/byathlete?region=us&lang=en&contentorigin=espn&isqualified=true&page=1&limit=50&sort=general.amount%3Adesc&season=2022'
url_2021_season = 'https://site.web.api.espn.com/apis/common/v3/sports/golf/pga/statistics/byathlete?region=us&lang=en&contentorigin=espn&isqualified=true&page=1&limit=50&sort=general.amount%3Adesc&season=2021'
r_2023 = requests.get(url=url_2023_season).json()
r_2022 = requests.get(url=url_2022_season).json()
r_2021 = requests.get(url=url_2021_season).json()
#combine into a collection of dictionaries
all_dicts = [r_2021, r_2022, r_2023]
#adding the value headers manually
table_headers = ['Earnings','Cup','Evnts','Rnds','Cuts','Top10','Wins','Score','DDIS','DACC','GIR','PUTTS','SAND','BIRDS']
print(table_headers)
['Earnings', 'Cup', 'Evnts', 'Rnds', 'Cuts', 'Top10', 'Wins', 'Score', 'DDIS', 'DACC', 'GIR', 'PUTTS', 'SAND', 'BIRDS']
#pull stats for 2023
values_2023 =[]
for athlete in r_2023['athletes']:
       for category in athlete['categories']:
           x = category['totals'][0:14]
           values_2023.append(x)

#turn into dataframe
df_values_2023 = pd.DataFrame(values_2023, columns=table_headers)
#player info is in a different section
info_categories = r_2023['athletes'][0]['athlete'].keys()
info_2023 = []

for athlete in r_2023['athletes']:
    row = []
    for category in info_categories:
        if category in athlete['athlete']:
            row.append(athlete['athlete'][category])
        else:
            row.append(None)
    info_2023.append(row)

#create a player info df
info_df_2023 = pd.DataFrame(info_2023, columns=info_categories)
#combine both dataframes
df_2023 = pd.concat([info_df_2023,df_values_2023], axis=1)
#repeat for seasons 2021 and 2022
values_2022 =[]
for athlete in r_2022['athletes']:
       for category in athlete['categories']:
           x = category['totals'][0:14]
           values_2022.append(x)

df_values_2022 = pd.DataFrame(values_2022, columns=table_headers)

#2022 athletes
info_categories_2022 = r_2022['athletes'][0]['athlete'].keys()
info_2022 = []

for athlete in r_2022['athletes']:
    row = []
    for category in info_categories_2022:
        if category in athlete['athlete']:
            row.append(athlete['athlete'][category])
        else:
            row.append(None)
    info_2022.append(row)
    
info_df_2022 = pd.DataFrame(info_2022, columns=info_categories)

df_2022 = pd.concat([info_df_2022,df_values_2022], axis=1)
#2021
values_2021 =[]
for athlete in r_2021['athletes']:
       for category in athlete['categories']:
           x = category['totals'][0:14]
           values_2021.append(x)

df_values_2021 = pd.DataFrame(values_2021, columns=table_headers)

#2021 athletes
info_categories_2021 = r_2021['athletes'][0]['athlete'].keys()
info_2021 = []

for athlete in r_2021['athletes']:
    row = []
    for category in info_categories_2021:
        if category in athlete['athlete']:
            row.append(athlete['athlete'][category])
        else:
            row.append(None)
    info_2021.append(row)
    
info_df_2021 = pd.DataFrame(info_2021, columns=info_categories)

df_2021 = pd.concat([info_df_2021,df_values_2021], axis=1)
#combine all three dataframes using concat
golf_df_concat = pd.concat([df_2021, df_2022, df_2023])

#reformat on earnings, we'll have to remove commas and $ signs
golf_df_concat['Earnings'] = golf_df_concat['Earnings'].str.replace(',', '').str.replace('$', '')
/Users/joeyglenn4/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
#reformat categories from strings to numbers
golf_df_concat[['Earnings', 'Cup', 'Evnts', 'Rnds', 'Cuts', 'Top10', 'Wins']] = golf_df_concat[['Earnings', 'Cup', 'Evnts', 'Rnds', 'Cuts', 'Top10', 'Wins']].apply(pd.to_numeric)
golf_df_concat['Score'] = pd.to_numeric(golf_df_concat['Score'], errors='coerce')
golf_df_concat['DDIS'] = pd.to_numeric(golf_df_concat['DDIS'], errors='coerce')
golf_df_concat['DACC'] = pd.to_numeric(golf_df_concat['DACC'], errors='coerce')
golf_df_concat['GIR'] = pd.to_numeric(golf_df_concat['GIR'], errors='coerce')
golf_df_concat['PUTTS'] = pd.to_numeric(golf_df_concat['PUTTS'], errors='coerce')
golf_df_concat['SAND'] = pd.to_numeric(golf_df_concat['SAND'], errors='coerce')
golf_df_concat['BIRDS'] = pd.to_numeric(golf_df_concat['BIRDS'], errors='coerce')
golf_df_agg = golf_df_concat.groupby('displayName').agg({'Earnings':'sum','Cup':'sum','Evnts':'sum','Rnds':'sum','Cuts':'sum','Top10':'sum','Wins':'sum','Score':'mean','DDIS':'mean','DACC':'mean','GIR':'mean','PUTTS':'mean','SAND':'mean','BIRDS':'mean'})
#Quick Save
golf_df_agg.to_excel('golf_player_data.xlsx', index=True)
#Data check
golf_df_agg.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Earnings Cup Evnts Rnds Cuts Top10 Wins Score DDIS DACC GIR PUTTS SAND BIRDS
displayName
Aaron Wise 3454521 -12 24 86 19 4 0 69.40 308.30 59.2 69.10 1.748 0.0 43.00
Abraham Ancer 5816565 -10 27 98 24 9 1 69.50 290.40 71.1 69.30 1.732 0.0 50.80
Adam Hadwin 4373841 1182 39 134 31 8 0 69.95 295.45 63.8 67.75 1.746 0.0 55.35
Adam Schenk 1803965 584 18 61 12 1 0 70.40 304.50 53.5 66.00 1.808 0.0 50.90
Adam Scott 2913198 -4 20 72 18 5 0 70.10 312.40 55.3 67.70 1.742 0.0 52.70
golf_df_agg.tail()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Earnings Cup Evnts Rnds Cuts Top10 Wins Score DDIS DACC GIR PUTTS SAND BIRDS
displayName
Viktor Hovland 13957192 713 56 193 50 17 2 69.566667 305.133333 63.933333 67.566667 1.735000 0.0 51.10
Webb Simpson 2783012 1195 21 74 18 6 0 69.200000 292.500000 67.100000 69.400000 1.734000 0.0 61.70
Will Zalatoris 12892434 3680 49 163 38 17 1 69.750000 311.050000 55.900000 70.200000 1.755500 0.0 48.55
Wyndham Clark 1948049 535 15 56 13 4 0 69.700000 313.000000 54.000000 67.900000 1.734000 0.0 52.80
Xander Schauffele 15608924 585 53 181 47 20 3 69.400000 305.300000 59.433333 69.200000 1.726667 0.0 61.10
#what are the lowest/highest average scores
golf_df_agg.describe()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Earnings Cup Evnts Rnds Cuts Top10 Wins Score DDIS DACC GIR PUTTS SAND BIRDS
count 9.200000e+01 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.0 92.000000
mean 6.776147e+06 1015.673913 33.836957 112.554348 26.097826 8.152174 1.152174 69.535326 300.308514 60.072283 66.812138 1.734138 0.0 52.900362
std 5.671968e+06 825.992618 17.431358 59.594853 14.576106 6.182099 1.459601 3.709235 17.933058 4.986574 3.976978 0.094267 0.0 6.790910
min 1.730193e+06 -18.000000 11.000000 34.000000 7.000000 1.000000 0.000000 34.650000 147.750000 35.550000 35.250000 0.872500 0.0 27.100000
25% 2.903400e+06 546.250000 20.000000 65.000000 14.000000 4.000000 0.000000 69.600000 296.000000 56.825000 65.875000 1.729917 0.0 48.900000
50% 4.533484e+06 817.500000 27.500000 91.000000 20.500000 6.000000 1.000000 69.833333 302.616667 60.575000 67.025000 1.742000 0.0 52.400000
75% 8.521836e+06 1311.000000 47.250000 154.500000 37.250000 10.000000 1.000000 70.137500 307.075000 63.037500 68.850000 1.757250 0.0 56.950000
max 3.061599e+07 3680.000000 75.000000 260.000000 64.000000 30.000000 6.000000 71.900000 323.700000 71.100000 72.066667 1.808000 0.0 78.300000
#Minimum average score is 34.55, lowest score in PGA was shot by Furyk (58)
#find that entry
unreal_golfer = golf_df_agg[golf_df_agg['Score'] == 34.55]
print(unreal_golfer)
Empty DataFrame
Columns: [Earnings, Cup, Evnts, Rnds, Cuts, Top10, Wins, Score, DDIS, DACC, GIR, PUTTS, SAND, BIRDS]
Index: []
#Tom Kim is good... but he's not that good
unreal_golfer_2023 = df_2023[df_2023['displayName'] == 'Tom Kim']
print(unreal_golfer_2023)
         id               uid                                  guid firstName  \
19  4602673  s:1100~a:4602673  cce7aed3-de73-3aef-886c-688ba5f9ea88       Tom   

   lastName displayName shortName  debutYear  \
19      Kim     Tom Kim    T. Kim        NaN   

                                                links  \
19  [{'language': 'en-US', 'rel': ['playercard', '...   

                                             headshot  \
19  {'href': 'https://a.espncdn.com/i/headshots/go...   

                                               status  age  \
19  {'id': '1', 'name': 'Active', 'type': 'active'...   20   

                                                 flag    Earnings  Cup Evnts  \
19  {'href': 'https://a.espncdn.com/i/teamlogos/co...  $3,237,766  917    12   

   Rnds Cuts Top10 Wins Score   DDIS  DACC   GIR  PUTTS SAND BIRDS  
19   42   11     3    1  69.3  295.5  71.1  70.5  1.745    0  54.2  
#ESPN incorrectly entered his stats as 0 in 2021-22
unreal_golfer_2022 = df_2022[df_2022['displayName'] == 'Tom Kim']
print(unreal_golfer_2022)
         id               uid                                  guid firstName  \
44  4602673  s:1100~a:4602673  cce7aed3-de73-3aef-886c-688ba5f9ea88       Tom   

   lastName displayName shortName  debutYear  \
44      Kim     Tom Kim    T. Kim        NaN   

                                                links  \
44  [{'language': 'en-US', 'rel': ['playercard', '...   

                                             headshot  \
44  {'href': 'https://a.espncdn.com/i/headshots/go...   

                                               status  age  \
44  {'id': '1', 'name': 'Active', 'type': 'active'...   20   

                                                 flag    Earnings   Cup Evnts  \
44  {'href': 'https://a.espncdn.com/i/teamlogos/co...  $2,824,580  1154    11   

   Rnds Cuts Top10 Wins Score DDIS DACC  GIR  PUTTS SAND BIRDS  
44    0   10     3    1  00.0  0.0  0.0  0.0  0.000    0   0.0  
#Clean record, locking in 2023 stats. You could argue that this is inconsistent, but Tom Kim won't qualify for the final dataset anyways.
#The dataset will be filtered for American candidates for the Ryder Cup.
golf_df_agg.loc['Tom Kim']
Earnings    6.062346e+06
Cup         2.071000e+03
Evnts       2.300000e+01
Rnds        4.200000e+01
Cuts        2.100000e+01
Top10       6.000000e+00
Wins        2.000000e+00
Score       3.465000e+01
DDIS        1.477500e+02
DACC        3.555000e+01
GIR         3.525000e+01
PUTTS       8.725000e-01
SAND        0.000000e+00
BIRDS       2.710000e+01
Name: Tom Kim, dtype: float64
golf_df_agg.loc['Tom Kim', "Score"] = 69.1
golf_df_agg.loc['Tom Kim', "DDIS"] = 295.1
golf_df_agg.loc['Tom Kim', "DACC"] = 69.7
golf_df_agg.loc['Tom Kim', "GIR"] = 70.5
golf_df_agg.loc['Tom Kim', "PUTTS"] = 1.737
golf_df_agg.loc['Tom Kim', "SAND"] = 0
golf_df_agg.loc['Tom Kim', "BIRDS"] = 55.8
#Check Tom's updated stats
golf_df_agg.loc['Tom Kim']
Earnings    6062346.000
Cup            2071.000
Evnts            23.000
Rnds             42.000
Cuts             21.000
Top10             6.000
Wins              2.000
Score            69.100
DDIS            295.100
DACC             69.700
GIR              70.500
PUTTS             1.737
SAND              0.000
BIRDS            55.800
Name: Tom Kim, dtype: float64
#check for any outliers
golf_df_agg.describe()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Earnings Cup Evnts Rnds Cuts Top10 Wins Score DDIS DACC GIR PUTTS SAND BIRDS
count 9.200000e+01 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.000000 92.0 92.000000
mean 6.776147e+06 1015.673913 33.836957 112.554348 26.097826 8.152174 1.152174 69.909783 301.910145 60.443478 67.195290 1.743534 0.0 53.212319
std 5.671968e+06 825.992618 17.431358 59.594853 14.576106 6.182099 1.459601 0.495253 7.971194 4.374593 2.206917 0.025273 0.0 6.228607
min 1.730193e+06 -18.000000 11.000000 34.000000 7.000000 1.000000 0.000000 68.933333 280.700000 50.800000 57.800000 1.646000 0.0 42.600000
25% 2.903400e+06 546.250000 20.000000 65.000000 14.000000 4.000000 0.000000 69.600000 296.000000 57.050000 65.975000 1.730250 0.0 49.075000
50% 4.533484e+06 817.500000 27.500000 91.000000 20.500000 6.000000 1.000000 69.833333 302.616667 60.600000 67.075000 1.742000 0.0 52.600000
75% 8.521836e+06 1311.000000 47.250000 154.500000 37.250000 10.000000 1.000000 70.137500 307.075000 63.162500 69.016667 1.757250 0.0 56.950000
max 3.061599e+07 3680.000000 75.000000 260.000000 64.000000 30.000000 6.000000 71.900000 323.700000 71.100000 72.066667 1.808000 0.0 78.300000
df_2023_2 = df_2023.set_index('displayName')
#create a new column for golfers who are American and currently in the top 50
American_names = ['Scottie Scheffler','Max Homa','Kurt Kitayama','Keegan Bradley','Tom Hoge','Collin Morikawa','Patrick Cantlay','Tony Finau','Sahith Theegala','Rickie Fowler','Taylor Moore','Chris Kirk','Harris English','Jordan Spieth','Brian Harman','Taylor Montgomery','Russel Henley','Justin Thomas','Keith Mitchell','Andrew Putnam','Xander Schauffele','Wyndham Clark','Adam Schenk','Sam Burns','Denny McCarthy','Brendon Todd','Hayden Buckley','Justin Suh','Brandon Wu','Davis Thompson']


golf_df_agg['is_qualified'] = False  
golf_df_agg.loc[golf_df_agg.index.isin(American_names), 'is_qualified'] = True
#remove all unqualified candidates, also first cut is a pun
first_cut = golf_df_agg[golf_df_agg['is_qualified'] == True]
first_cut
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Earnings Cup Evnts Rnds Cuts Top10 Wins Score DDIS DACC GIR PUTTS SAND BIRDS is_qualified
displayName
Adam Schenk 1803965 584 18 61 12 1 0 70.400000 304.500000 53.500000 66.000000 1.808000 0.0 50.900000 True
Andrew Putnam 2314939 679 17 57 13 3 0 69.500000 283.100000 63.000000 68.700000 1.755000 0.0 61.700000 True
Brendon Todd 1742013 522 14 47 9 3 0 69.900000 280.700000 66.700000 65.500000 1.717000 0.0 59.700000 True
Brian Harman 8816347 1899 69 234 54 13 0 69.800000 292.466667 67.166667 66.633333 1.755000 0.0 59.966667 True
Chris Kirk 3177760 1024 15 48 11 4 1 69.800000 295.800000 61.600000 66.600000 1.697000 0.0 63.900000 True
Collin Morikawa 15965913 3040 53 174 43 20 2 69.600000 296.533333 68.700000 69.966667 1.743667 0.0 52.300000 True
Denny McCarthy 4693126 1591 45 154 35 7 0 69.700000 293.300000 61.850000 65.900000 1.740500 0.0 55.350000 True
Harris English 9063496 530 43 145 33 10 2 70.000000 296.850000 62.500000 64.750000 1.742000 0.0 49.300000 True
Hayden Buckley 1836289 594 14 44 8 3 0 69.800000 304.500000 64.400000 69.400000 1.755000 0.0 45.600000 True
Jordan Spieth 14858548 612 58 198 47 19 2 69.866667 303.300000 55.133333 66.133333 1.728667 0.0 56.366667 True
Justin Thomas 15326162 345 53 194 48 19 2 69.600000 307.633333 55.966667 66.933333 1.729667 0.0 60.766667 True
Keegan Bradley 10811950 3305 64 204 45 14 1 70.133333 301.766667 62.100000 67.533333 1.756667 0.0 46.166667 True
Keith Mitchell 5093720 1513 40 134 32 9 0 69.700000 310.850000 63.750000 67.250000 1.750000 0.0 54.700000 True
Kurt Kitayama 5693388 1040 12 34 7 3 1 70.400000 306.400000 55.000000 66.500000 1.764000 0.0 57.100000 True
Max Homa 16514432 3053 63 208 50 16 5 69.933333 302.733333 60.600000 65.800000 1.724333 0.0 51.000000 True
Patrick Cantlay 21172810 784 53 176 45 23 6 69.100000 306.366667 62.433333 70.566667 1.737000 0.0 53.533333 True
Rickie Fowler 3209972 741 12 42 11 4 0 69.600000 305.500000 57.700000 69.400000 1.707000 0.0 44.000000 True
Sahith Theegala 6899820 870 47 167 40 11 0 69.950000 302.400000 54.550000 67.050000 1.733000 0.0 51.300000 True
Sam Burns 17659657 971 62 201 45 20 5 69.766667 308.033333 58.100000 66.866667 1.720667 0.0 50.100000 True
Scottie Scheffler 30615994 1821 65 222 55 27 6 69.033333 308.433333 62.600000 71.466667 1.724333 0.0 44.400000 True
Taylor Montgomery 2382435 783 16 56 14 4 0 69.300000 304.900000 55.000000 62.800000 1.646000 0.0 52.500000 True
Taylor Moore 2825077 865 16 55 11 1 1 70.400000 301.200000 57.400000 65.600000 1.790000 0.0 48.200000 True
Tom Hoge 8109202 741 48 146 30 10 1 69.550000 296.150000 63.150000 68.250000 1.729000 0.0 52.050000 True
Tony Finau 15456084 953 63 214 52 18 4 69.433333 304.966667 58.400000 69.533333 1.729667 0.0 55.066667 True
Wyndham Clark 1948049 535 15 56 13 4 0 69.700000 313.000000 54.000000 67.900000 1.734000 0.0 52.800000 True
Xander Schauffele 15608924 585 53 181 47 20 3 69.400000 305.300000 59.433333 69.200000 1.726667 0.0 61.100000 True
first_cut.describe()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Earnings Cup Evnts Rnds Cuts Top10 Wins Score DDIS DACC GIR PUTTS SAND BIRDS
count 2.600000e+01 26.000000 26.000000 26.000000 26.000000 26.000000 26.000000 26.000000 26.000000 26.000000 26.000000 26.000000 26.0 26.000000
mean 9.369234e+06 1153.076923 39.538462 132.769231 31.153846 11.000000 1.615385 69.744872 301.410897 60.182051 67.393590 1.736301 0.0 53.456410
std 7.495006e+06 831.900808 21.033746 70.764006 17.469270 7.828154 2.001538 0.353986 7.747845 4.333838 1.977024 0.029878 0.0 5.528432
min 1.742013e+06 345.000000 12.000000 34.000000 7.000000 1.000000 0.000000 69.033333 280.700000 53.500000 62.800000 1.646000 0.0 44.000000
25% 2.913248e+06 598.500000 16.000000 56.000000 12.250000 4.000000 0.000000 69.562500 296.612500 56.325000 66.033333 1.724917 0.0 50.300000
50% 7.504511e+06 824.500000 46.000000 150.000000 34.000000 10.000000 1.000000 69.733333 303.900000 61.100000 66.991667 1.733500 0.0 52.650000
75% 1.542360e+07 1394.750000 56.750000 197.000000 46.500000 18.750000 2.000000 69.925000 306.150000 62.900000 69.075000 1.753750 0.0 56.916667
max 3.061599e+07 3305.000000 69.000000 234.000000 55.000000 27.000000 6.000000 70.400000 313.000000 68.700000 71.466667 1.808000 0.0 63.900000
new_df = df_2023.set_index('displayName')
first_cut.update(new_df[['Cup']])
first_cut['Cup'] = first_cut['Cup'].astype(int)
# 6 golfers automatically qualify for the American team on points, 
#if points were taken today, this is who would auto qualify
first_six = first_cut.nlargest(6,'Cup')
index_list = first_six.index.tolist()
first_six
#print(index_list)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Earnings Cup Evnts Rnds Cuts Top10 Wins Score DDIS DACC GIR PUTTS SAND BIRDS is_qualified
displayName
Scottie Scheffler 30615994 1844 65 222 55 27 6 69.033333 308.433333 62.6 71.466667 1.724333 0.0 44.400000 True
Max Homa 16514432 1801 63 208 50 16 5 69.933333 302.733333 60.6 65.800000 1.724333 0.0 51.000000 True
Keegan Bradley 10811950 1153 64 204 45 14 1 70.133333 301.766667 62.1 67.533333 1.756667 0.0 46.166667 True
Kurt Kitayama 5693388 1040 12 34 7 3 1 70.400000 306.400000 55.0 66.500000 1.764000 0.0 57.100000 True
Chris Kirk 3177760 1024 15 48 11 4 1 69.800000 295.800000 61.6 66.600000 1.697000 0.0 63.900000 True
Sam Burns 17659657 984 62 201 45 20 5 69.766667 308.033333 58.1 66.866667 1.720667 0.0 50.100000 True
# remove the top 6 from the choices, they're on the team
USA_team = []
USA_team.extend(index_list)
# select rows where displayName is not in USA_team
second_cut = first_cut.drop(index=USA_team)
second_cut
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Earnings Cup Evnts Rnds Cuts Top10 Wins Score DDIS DACC GIR PUTTS SAND BIRDS is_qualified
displayName
Adam Schenk 1803965 584 18 61 12 1 0 70.400000 304.500000 53.500000 66.000000 1.808000 0.0 50.900000 True
Andrew Putnam 2314939 679 17 57 13 3 0 69.500000 283.100000 63.000000 68.700000 1.755000 0.0 61.700000 True
Brendon Todd 1742013 522 14 47 9 3 0 69.900000 280.700000 66.700000 65.500000 1.717000 0.0 59.700000 True
Brian Harman 8816347 739 69 234 54 13 0 69.800000 292.466667 67.166667 66.633333 1.755000 0.0 59.966667 True
Collin Morikawa 15965913 860 53 174 43 20 2 69.600000 296.533333 68.700000 69.966667 1.743667 0.0 52.300000 True
Denny McCarthy 4693126 480 45 154 35 7 0 69.700000 293.300000 61.850000 65.900000 1.740500 0.0 55.350000 True
Harris English 9063496 536 43 145 33 10 2 70.000000 296.850000 62.500000 64.750000 1.742000 0.0 49.300000 True
Hayden Buckley 1836289 594 14 44 8 3 0 69.800000 304.500000 64.400000 69.400000 1.755000 0.0 45.600000 True
Jordan Spieth 14858548 628 58 198 47 19 2 69.866667 303.300000 55.133333 66.133333 1.728667 0.0 56.366667 True
Justin Thomas 15326162 377 53 194 48 19 2 69.600000 307.633333 55.966667 66.933333 1.729667 0.0 60.766667 True
Keith Mitchell 5093720 450 40 134 32 9 0 69.700000 310.850000 63.750000 67.250000 1.750000 0.0 54.700000 True
Patrick Cantlay 21172810 821 53 176 45 23 6 69.100000 306.366667 62.433333 70.566667 1.737000 0.0 53.533333 True
Rickie Fowler 3209972 741 12 42 11 4 0 69.600000 305.500000 57.700000 69.400000 1.707000 0.0 44.000000 True
Sahith Theegala 6899820 869 47 167 40 11 0 69.950000 302.400000 54.550000 67.050000 1.733000 0.0 51.300000 True
Taylor Montgomery 2382435 783 16 56 14 4 0 69.300000 304.900000 55.000000 62.800000 1.646000 0.0 52.500000 True
Taylor Moore 2825077 865 16 55 11 1 1 70.400000 301.200000 57.400000 65.600000 1.790000 0.0 48.200000 True
Tom Hoge 8109202 755 48 146 30 10 1 69.550000 296.150000 63.150000 68.250000 1.729000 0.0 52.050000 True
Tony Finau 15456084 976 63 214 52 18 4 69.433333 304.966667 58.400000 69.533333 1.729667 0.0 55.066667 True
Wyndham Clark 1948049 535 15 56 13 4 0 69.700000 313.000000 54.000000 67.900000 1.734000 0.0 52.800000 True
Xander Schauffele 15608924 617 53 181 47 20 3 69.400000 305.300000 59.433333 69.200000 1.726667 0.0 61.100000 True
#highest earners
top_earners = second_cut.nlargest(6, 'Earnings')
top_earners
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Earnings Cup Evnts Rnds Cuts Top10 Wins Score DDIS DACC GIR PUTTS SAND BIRDS is_qualified
displayName
Patrick Cantlay 21172810 821 53 176 45 23 6 69.100000 306.366667 62.433333 70.566667 1.737000 0.0 53.533333 True
Collin Morikawa 15965913 860 53 174 43 20 2 69.600000 296.533333 68.700000 69.966667 1.743667 0.0 52.300000 True
Xander Schauffele 15608924 617 53 181 47 20 3 69.400000 305.300000 59.433333 69.200000 1.726667 0.0 61.100000 True
Tony Finau 15456084 976 63 214 52 18 4 69.433333 304.966667 58.400000 69.533333 1.729667 0.0 55.066667 True
Justin Thomas 15326162 377 53 194 48 19 2 69.600000 307.633333 55.966667 66.933333 1.729667 0.0 60.766667 True
Jordan Spieth 14858548 628 58 198 47 19 2 69.866667 303.300000 55.133333 66.133333 1.728667 0.0 56.366667 True
second_cut['Earnings_per_event'] = second_cut['Earnings'] / second_cut['Evnts']
second_cut['Points_per_event'] = second_cut['Cup'] / second_cut['Evnts']
second_cut['Top10_per_event'] = second_cut['Top10'] / second_cut['Evnts']
second_cut['Wins_per_event'] = second_cut['Wins'] / second_cut['Evnts']

second_cut
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Earnings Cup Evnts Rnds Cuts Top10 Wins Score DDIS DACC GIR PUTTS SAND BIRDS is_qualified Earnings_per_event Points_per_event Top10_per_event Wins_per_event
displayName
Adam Schenk 1803965 584 18 61 12 1 0 70.400000 304.500000 53.500000 66.000000 1.808000 0.0 50.900000 True 100220.277778 32.444444 0.055556 0.000000
Andrew Putnam 2314939 679 17 57 13 3 0 69.500000 283.100000 63.000000 68.700000 1.755000 0.0 61.700000 True 136172.882353 39.941176 0.176471 0.000000
Brendon Todd 1742013 522 14 47 9 3 0 69.900000 280.700000 66.700000 65.500000 1.717000 0.0 59.700000 True 124429.500000 37.285714 0.214286 0.000000
Brian Harman 8816347 739 69 234 54 13 0 69.800000 292.466667 67.166667 66.633333 1.755000 0.0 59.966667 True 127773.144928 10.710145 0.188406 0.000000
Collin Morikawa 15965913 860 53 174 43 20 2 69.600000 296.533333 68.700000 69.966667 1.743667 0.0 52.300000 True 301243.641509 16.226415 0.377358 0.037736
Denny McCarthy 4693126 480 45 154 35 7 0 69.700000 293.300000 61.850000 65.900000 1.740500 0.0 55.350000 True 104291.688889 10.666667 0.155556 0.000000
Harris English 9063496 536 43 145 33 10 2 70.000000 296.850000 62.500000 64.750000 1.742000 0.0 49.300000 True 210778.976744 12.465116 0.232558 0.046512
Hayden Buckley 1836289 594 14 44 8 3 0 69.800000 304.500000 64.400000 69.400000 1.755000 0.0 45.600000 True 131163.500000 42.428571 0.214286 0.000000
Jordan Spieth 14858548 628 58 198 47 19 2 69.866667 303.300000 55.133333 66.133333 1.728667 0.0 56.366667 True 256181.862069 10.827586 0.327586 0.034483
Justin Thomas 15326162 377 53 194 48 19 2 69.600000 307.633333 55.966667 66.933333 1.729667 0.0 60.766667 True 289172.867925 7.113208 0.358491 0.037736
Keith Mitchell 5093720 450 40 134 32 9 0 69.700000 310.850000 63.750000 67.250000 1.750000 0.0 54.700000 True 127343.000000 11.250000 0.225000 0.000000
Patrick Cantlay 21172810 821 53 176 45 23 6 69.100000 306.366667 62.433333 70.566667 1.737000 0.0 53.533333 True 399486.981132 15.490566 0.433962 0.113208
Rickie Fowler 3209972 741 12 42 11 4 0 69.600000 305.500000 57.700000 69.400000 1.707000 0.0 44.000000 True 267497.666667 61.750000 0.333333 0.000000
Sahith Theegala 6899820 869 47 167 40 11 0 69.950000 302.400000 54.550000 67.050000 1.733000 0.0 51.300000 True 146804.680851 18.489362 0.234043 0.000000
Taylor Montgomery 2382435 783 16 56 14 4 0 69.300000 304.900000 55.000000 62.800000 1.646000 0.0 52.500000 True 148902.187500 48.937500 0.250000 0.000000
Taylor Moore 2825077 865 16 55 11 1 1 70.400000 301.200000 57.400000 65.600000 1.790000 0.0 48.200000 True 176567.312500 54.062500 0.062500 0.062500
Tom Hoge 8109202 755 48 146 30 10 1 69.550000 296.150000 63.150000 68.250000 1.729000 0.0 52.050000 True 168941.708333 15.729167 0.208333 0.020833
Tony Finau 15456084 976 63 214 52 18 4 69.433333 304.966667 58.400000 69.533333 1.729667 0.0 55.066667 True 245334.666667 15.492063 0.285714 0.063492
Wyndham Clark 1948049 535 15 56 13 4 0 69.700000 313.000000 54.000000 67.900000 1.734000 0.0 52.800000 True 129869.933333 35.666667 0.266667 0.000000
Xander Schauffele 15608924 617 53 181 47 20 3 69.400000 305.300000 59.433333 69.200000 1.726667 0.0 61.100000 True 294508.000000 11.641509 0.377358 0.056604
# Great! We have our second_cut dataframe which features golfers who qualify to fill the last 6 spots
# We can see there are some consistent names at the top of results categories
second_cut.index.name = 'displayName'
names = second_cut.index
print(names)
Index(['Adam Schenk', 'Andrew Putnam', 'Brendon Todd', 'Brian Harman',
       'Collin Morikawa', 'Denny McCarthy', 'Harris English', 'Hayden Buckley',
       'Jordan Spieth', 'Justin Thomas', 'Keith Mitchell', 'Patrick Cantlay',
       'Rickie Fowler', 'Sahith Theegala', 'Taylor Montgomery', 'Taylor Moore',
       'Tom Hoge', 'Tony Finau', 'Wyndham Clark', 'Xander Schauffele'],
      dtype='object', name='displayName')
#let's export this in csv and take it to tableau
second_cut.to_csv('/Users/joeyglenn4/Desktop/Case Study/second_cut.csv', index=True)