-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebscraping_revised.py
72 lines (60 loc) · 3.24 KB
/
webscraping_revised.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# ============================================================================
# Getting financial data from yahoo finance using webscraping
# Author - Mayank Rasu
# Please report bugs/issues in the Q&A section
# =============================================================================
import requests
from bs4 import BeautifulSoup
import pandas as pd
tickers = ["AAPL", "MSFT"]  # list of tickers whose financial data needs to be extracted
financial_dir = {}

# Seconds to wait on each HTTP request; without a timeout requests.get can block forever.
_REQUEST_TIMEOUT = 30

# Page names of the balance sheet, income statement and cashflow statement, in scraping order.
_STATEMENT_PAGES = ("balance-sheet", "financials", "cash-flow")

# Class strings used to locate the data containers. They are matched verbatim, so they
# are copied exactly (including the trailing space in the key statistics table class).
_STATEMENT_TABLE_CLASS = "M(0) Mb(10px) Whs(n) BdEnd Bdc($seperatorColor) D(itb)"
_STATEMENT_ROW_CLASS = "rw-expnded"
_STATISTICS_TABLE_CLASS = "W(100%) Bdcl(c) Mt(10px) "


def _fetch_soup(ticker, page_name):
    """Download one yahoo finance page for ``ticker`` and return it parsed.

    ``page_name`` is the path segment after the ticker (e.g. ``"balance-sheet"``).
    Raises whatever ``requests.get`` raises, including a timeout error once
    ``_REQUEST_TIMEOUT`` seconds pass without a response.
    """
    url = 'https://in.finance.yahoo.com/quote/' + ticker + '/' + page_name + '?p=' + ticker
    page = requests.get(url, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(page.content, 'html.parser')


def _row_cells(row):
    """Return the text fragments of ``row`` as a list, in document order."""
    return row.get_text(separator='|').split("|")


def _scrape_statement(ticker, page_name):
    """Return ``{row label: first value}`` for one financial statement page.

    Rows with fewer than two text fragments are skipped: they carry no value,
    and indexing the second fragment would raise ``IndexError``.
    """
    data = {}
    soup = _fetch_soup(ticker, page_name)
    for table in soup.find_all("div", {"class": _STATEMENT_TABLE_CLASS}):
        for row in table.find_all("div", {"class": _STATEMENT_ROW_CLASS}):
            cells = _row_cells(row)
            if len(cells) > 1:
                data[cells[0]] = cells[1]
    return data


def _scrape_key_statistics(ticker):
    """Return ``{row label: last fragment}`` from the key statistics page.

    Rows with a single text fragment are skipped so that a label is never
    stored as its own value.
    """
    data = {}
    soup = _fetch_soup(ticker, "key-statistics")
    for table in soup.find_all("table", {"class": _STATISTICS_TABLE_CLASS}):
        for row in table.find_all("tr"):
            cells = _row_cells(row)
            if len(cells) > 1:
                data[cells[0]] = cells[-1]
    return data


for ticker in tickers:
    temp_dir = {}
    # getting balance sheet, income statement and cashflow statement data from
    # yahoo finance for the given ticker; a label seen on a later page
    # overwrites the value stored for the same label from an earlier page
    for page_name in _STATEMENT_PAGES:
        temp_dir.update(_scrape_statement(ticker, page_name))
    # getting key statistics data from yahoo finance for the given ticker
    temp_dir.update(_scrape_key_statistics(ticker))
    # combining all extracted information with the corresponding ticker
    financial_dir[ticker] = temp_dir
# storing information in pandas dataframe: one column per ticker, one row per
# scraped label (cells are NaN where a ticker has no value for that label)
combined_financials = pd.DataFrame(financial_dir)
tickers = combined_financials.columns
for ticker in tickers:
    # Drop every row whose value for this ticker contains a lowercase letter.
    # na=False makes str.contains return a plain boolean mask (missing cells
    # count as "no match", so those rows are kept); this avoids relying on
    # fillna() to downcast an object-dtype result before it is negated with ~.
    has_lowercase = combined_financials[ticker].str.contains("[a-z]", na=False)
    combined_financials = combined_financials[~has_lowercase]