-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtop-prospects.py
36 lines (30 loc) · 1.2 KB
/
top-prospects.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Link to PFF's 2025 NFL Draft Big Board
link = 'https://www.pff.com/news/draft-2025-nfl-draft-board-big-board'
# Get the page and convert it to a BeautifulSoup object
prospects = requests.get(link)
prospects_soup = BeautifulSoup(prospects.text, 'html.parser')
# Scrape page for prospects
top_300 = prospects_soup.select(
'strong'
)
# Turn into something I can work with
prospect_range = range(0, len(top_300))
top_300 = [top_300[x].text for x in prospect_range]
top_300 = pd.DataFrame(top_300)[8:308]
top_300.reset_index(drop=True, inplace=True)
# Use regular expressions to clean up the data
top_300['Rank'] = top_300.index + 1
top_300[0] = top_300[0].str.replace(r'^\d+\.\s*', '', regex = True)
top_300['Position'] = top_300[0].str.extract(r'^([A-Z/]+)')
top_300[0] = top_300[0].str.replace(r'^([A-Z/]+)', '', regex = True)
top_300['Name'] = top_300[0].str.extract(r'^(.*?),')
top_300[0] = top_300[0].str.replace(r'^(.*?),', '', regex = True)
top_300.rename(columns = {0: 'School'}, inplace = True)
top_300 = top_300[['Rank', 'Name', 'Position', 'School']]
top_300.set_index('Rank', inplace = True)
# Print top 300 prospects
print(top_300)