# scrapeKAP.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)


def fon_data(url_end):
    """
    Retrieves fund data from the KAP website for the specified fund type.

    Parameters
    ----------
    url_end : str
        The endpoint for the fund type, e.g. 'YF', 'EYF', 'OKS'.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the code, title, and founder of the funds
        listed on the KAP website.

    Notes
    -----
    - Sends a GET request to the KAP website and parses the HTML content to
      extract the relevant fund data.
    - The extracted code, title, and founder values are organized into a
      three-column DataFrame.
    """
    base_url = "https://www.kap.org.tr/tr/YatirimFonlari/"
    url = base_url + url_end

    # Send a GET request to the URL
    response = requests.get(url)
    response.raise_for_status()  # Raise an error for bad status codes

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all anchor tags
    href_tags = soup.find_all('a')

    # Keep only anchors whose parent element carries one of the table-cell classes
    desired_classes = ["comp-cell _04 vtable", "comp-cell _08 vtable", "comp-cell _009 vtable"]
    href_list = []
    for tag in href_tags:
        parent_class = tag.find_parent().get('class', [])
        parent_class_name = " ".join(parent_class)  # Join the class list into a single string
        if parent_class_name in desired_classes:
            href_list.append(tag.get_text(strip=True))

    # The anchors arrive in repeating groups of three (code, title, founder);
    # distribute them across three columns accordingly.
    data = {'Column1': [], 'Column2': [], 'Column3': []}
    for i, href in enumerate(href_list):
        remainder = (i + 1) % 3
        if remainder == 1:
            data['Column1'].append(href)
            data['Column2'].append('')
            data['Column3'].append('')
        elif remainder == 2:
            data['Column2'][-1] = href
        else:
            data['Column3'][-1] = href

    df = pd.DataFrame(data)
    df.columns = ["CODE", "TITLE", "FOUNDER"]
    return df
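
# A minimal usage sketch for fon_data, not part of the original script
# (requires live network access to kap.org.tr; 'YF' is one of the endpoint
# codes iterated over in get_all() below):
#
#   yf = fon_data('YF')
#   print(yf.head())  # expected columns: CODE, TITLE, FOUNDER
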
def get_fund_detail(url_name):
    """
    Retrieves specific fund details from the KAP website.

    Parameters
    ----------
    url_name : str
        The URL of the fund detail page.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the details extracted from the specified page.

    Notes
    -----
    - Sends a GET request to the given KAP page and extracts the relevant
      details using BeautifulSoup.
    - The extracted data is organized into a two-column DataFrame: the fund
      title and the requested detail.
    """
    # Send a GET request to the URL
    response = requests.get(url_name)
    response.raise_for_status()  # Raise an error for bad status codes

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all anchor tags whose class contains the table-row marker
    anchor_tags = soup.find_all('a', class_=lambda x: x and 'w-clearfix w-inline-block a-table-row' in x)

    # Each row holds a <span> with the fund title and <div> cells with the details
    data = []
    for tag in anchor_tags:
        span_tag = tag.find('span')
        if span_tag:
            span_text = span_tag.get_text(strip=True)
            div_tags = tag.find_all('div', class_='comp-cell-row-div vtable infoColumn')
            if div_tags:
                data.append([span_text, div_tags[0].get_text(strip=True)])

    # Create a DataFrame to store the extracted data
    df = pd.DataFrame(data, columns=['Column1', 'Column2'])
    return df
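
# A minimal usage sketch for get_fund_detail, not part of the original script
# (requires live network access; the URL below is the ISIN endpoint used in
# get_all()):
#
#   isin = get_fund_detail("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_ISIN")
#   isin.columns = ["TITLE", "ISIN"]
#   print(isin.head())
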
def get_fund_detail2(url_name):
    """
    Retrieves additional fund details from the KAP website when the page
    spreads the information over multiple columns.

    Parameters
    ----------
    url_name : str
        The URL of the fund detail page.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the fund title and the relevant detail, extracted
        from a multi-column table.

    Notes
    -----
    - Similar to `get_fund_detail`, but collects every column of each row
      before reducing the result to the two columns of interest.
    - The first scraped row is used as the header and then dropped.
    """
    # Send a GET request to the URL
    response = requests.get(url_name)
    response.raise_for_status()  # Raise an error for bad status codes

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all anchor tags whose class contains the table-row marker
    anchor_tags = soup.find_all('a', class_=lambda x: x and 'w-clearfix w-inline-block a-table-row' in x)

    # Collect the <span> title and every <div> cell of each row
    data = []
    for tag in anchor_tags:
        span_tag = tag.find('span')
        row_data = []
        if span_tag:
            span_text = span_tag.get_text(strip=True)
            row_data.append(span_text)
            div_tags = tag.find_all('div', class_='comp-cell-row-div vtable infoColumn')
            for div_tag in div_tags:
                row_data.append(div_tag.get_text(strip=True))
            data.append(row_data)

    # Rows can have different lengths, so size the DataFrame to the longest one
    max_columns = max(len(row) for row in data)
    columns = [f'Column{i}' for i in range(1, max_columns + 1)]
    df = pd.DataFrame(data, columns=columns)

    # Use the first row as the header, then drop it
    df.columns = df.iloc[0]
    df = df[1:]

    # Keep only the first and third columns (the fund title and the detail of interest)
    df = df.iloc[:, [0, 2]]
    return df
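
# A minimal usage sketch for get_fund_detail2, not part of the original script
# (requires live network access; the URL below is one of the multi-column
# endpoints used in get_all()):
#
#   pmc = get_fund_detail2("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_portfoy_yon_kurulus")
#   pmc.columns = ["TITLE", "MANAGER"]
#   print(pmc.head())
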
def get_all():
    """
    Retrieves and merges fund data from multiple categories on the KAP website.

    Returns
    -------
    pd.DataFrame
        A merged DataFrame covering the various fund categories, including
        code, title, kind, and further details.

    Notes
    -----
    - Collects the fund lists for every fund type and concatenates them into
      a single DataFrame.
    - Additional details such as ISIN, manager, risk value, and IPO date are
      fetched separately and merged in on the fund title.
    """
    url_ends = ['YF', 'EYF', 'OKS', 'BYF', 'GMF', 'GSF', 'YYF', 'VFF', 'KFF', 'PFF']

    # Collect the fund list for each type and stack them into one DataFrame
    data1 = pd.DataFrame()
    for url_end in url_ends:
        df = fon_data(url_end)
        df["KIND"] = url_end
        data1 = pd.concat([data1, df], ignore_index=True)

    # For 'YYF' funds the scraped FOUNDER value is the representative,
    # so move it into its own column and drop FOUNDER from this table
    data1['REPRESENTATIVE'] = np.nan
    data1.loc[data1['KIND'] == 'YYF', 'REPRESENTATIVE'] = data1['FOUNDER']
    data1.drop('FOUNDER', axis=1, inplace=True)

    data2_founder1 = get_fund_detail("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_kurucu_unvan")
    data2_founder1.columns = ["TITLE", "FOUNDER"]
    data2_founder2 = get_fund_detail("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_kurucu_unvan_2")
    data2_founder2.columns = ["TITLE", "FOUNDER"]
    data2_founder = pd.concat([data2_founder1, data2_founder2], ignore_index=True)

    data2_pmc1 = get_fund_detail("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_portfoy_ticaret_unvan")
    data2_pmc1.columns = ["TITLE", "MANAGER"]
    data2_pmc2 = get_fund_detail2("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_portfoy_yon_kurulus")
    data2_pmc2.columns = ["TITLE", "MANAGER"]
    data2_pmc3 = get_fund_detail("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_yonetici_unvan")
    data2_pmc3.columns = ["TITLE", "MANAGER"]
    data2_pmc = pd.concat([data2_pmc1, data2_pmc2, data2_pmc3], ignore_index=True)

    data2_isin = get_fund_detail("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_ISIN")
    data2_isin.columns = ["TITLE", "ISIN"]
    data2_rd = get_fund_detail("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_fonun_risk_degeri")
    data2_rd.columns = ["TITLE", "RD"]
    data2_type = get_fund_detail("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_fon_tur")
    data2_type.columns = ["TITLE", "TYPE"]
    data2_ini = get_fund_detail("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_fon_icerigi")
    data2_ini.columns = ["TITLE", "INTEREST"]
    data2_auditor = get_fund_detail("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_bdk")
    data2_auditor.columns = ["TITLE", "AUDITOR"]

    data2_ipo1 = get_fund_detail("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_halka_arz1")
    data2_ipo1.columns = ["TITLE", "IPO_DATE"]
    data2_ipo2 = get_fund_detail2("https://www.kap.org.tr/tr/fonlarTumKalemler/kpy81_acc1_halka_arz2")
    data2_ipo2.columns = ["TITLE", "IPO_DATE"]
    data2_ipo = pd.concat([data2_ipo1, data2_ipo2], ignore_index=True)

    # Treat placeholder values (non-strings or anything containing '-') as missing
    data2_ipo['IPO_DATE'] = data2_ipo['IPO_DATE'].apply(
        lambda x: np.nan if (not isinstance(x, str) or '-' in x) else x
    )

    # Merge all DataFrames on the 'TITLE' column
    dfs = [data1, data2_founder, data2_isin, data2_rd, data2_pmc, data2_type, data2_ini, data2_auditor, data2_ipo]
    funds = dfs[0]
    for df in dfs[1:]:
        funds = pd.merge(funds, df, on='TITLE', how='outer')

    funds['IPO_DATE'] = funds['IPO_DATE'].replace(' ', None)
    funds['IPO_DATE'] = pd.to_datetime(funds['IPO_DATE'], format='%d/%m/%Y', errors='coerce')
    return funds

funds = get_all()
funds.info()
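
# Optional persistence sketch, not part of the original script (the filename
# is an arbitrary choice):
#
#   funds.to_csv('kap_funds.csv', index=False)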