-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_scraping.py
207 lines (131 loc) · 6.47 KB
/
web_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup
#import numpy as np
import pandas as pd
# Google account credentials handed to login_to_medium() by the script entry
# point. Both are single-space placeholders here; set them before running.
USER = ' '
PASS = ' '
def login_to_medium(driver, email, password):
    """Sign in to Medium via "Sign in with Google" and open the stats page.

    Args:
        driver: Selenium WebDriver instance that controls the browser.
        email: Google account e-mail typed into the identifier field.
        password: Google account password typed into the password field.

    The function returns nothing; on return the driver has been pointed at
    ``https://medium.com/me/stats``. Selenium raises its own lookup errors
    if any of the expected page elements is missing.
    """
    # open Medium's sign-in page
    driver.get('https://medium.com/m/signin')
    # choose "Sign in", then "Sign in with Google"
    driver.find_element_by_xpath(".//button[contains(., 'Sign in')]").click()
    driver.find_element_by_xpath(".//button[contains(., 'Sign in with Google')]").click()

    # NOTE(review): HTML ids are case-sensitive. Google's e-mail input is
    # presumably "identifierId" (the original looked up "identifierID") --
    # confirm against the live sign-in page.
    email_field = driver.find_element_by_id("identifierId")
    email_field.send_keys(email)
    # move on to the password step
    driver.find_element_by_id("identifierNext").click()
    # give the password form a moment to appear
    time.sleep(1)

    password_field = driver.find_element_by_name("password")
    # type the caller's password (the original sent the literal text "password")
    password_field.send_keys(password)
    driver.find_element_by_id("passwordNext").click()
    # wait for the sign-in redirect before navigating away
    time.sleep(3)

    # navigate to the stats page
    driver.get('https://medium.com/me/stats')
#scrolling to bottom of the page to view all pubs
def scroll_down(driver, scroll_pause_time=0.5):
    """Scroll to the bottom of the page until its height stops growing.

    Args:
        driver: object exposing ``execute_script(js)`` (a Selenium WebDriver).
        scroll_pause_time: seconds to wait after each scroll so newly loaded
            content can extend the page. Defaults to the original 0.5.

    The loop ends the first time a scroll leaves
    ``document.body.scrollHeight`` unchanged.
    """
    # height of the page before any scrolling
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # jump to the current bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # let the page load more rows
        time.sleep(scroll_pause_time)
        # an unchanged height means nothing new was appended
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
def extract_info(driver):
    """Read the Medium stats table from the page and return it as a DataFrame.

    Args:
        driver: Selenium WebDriver currently showing the stats page.

    Returns:
        pandas.DataFrame with the columns 'Title', 'Publication', 'Read Time',
        'Views', 'Reads', 'Read Ratio' and 'Fans', one row per story.

    Side effect: the driver is quit as soon as the table HTML has been read.
    """
    # table body element of the stats page
    table_body = driver.find_element_by_class_name('js-statsTableBody')
    # WebElement exposes get_attribute(); "getattribute" does not exist
    raw_html = table_body.get_attribute('innerHTML')
    # the browser is not needed once the HTML is in hand
    driver.quit()
    return _parse_stats_table(raw_html)


def _publication_name(h2tag):
    """Return the publication listed after a story title tag, or None."""
    for sibling in h2tag.next_siblings:
        tag_name = getattr(sibling, 'name', None)
        if tag_name is None:
            # skip bare text nodes so every title yields exactly one entry
            continue
        if tag_name == 'h2':
            # reached the next story without finding any details element
            break
        # Example: "In Python in Plain English View story Details"
        #   split('View story')[0] -> "In Python in Plain English "
        #   [3:]                   -> "Python in Plain English "
        prefix = sibling.text.split('View story')[0]
        return prefix[3:] if prefix else None
    return None


def _parse_stats_table(raw_html):
    """Turn the stats table's inner HTML into a DataFrame of story metrics."""
    soup = BeautifulSoup(raw_html, 'html.parser')

    # one <h2> per story; it carries the title
    h2tags = soup.find_all('h2')
    story_titles = [h2tag.text for h2tag in h2tags]
    # reading time lives in the span's title attribute, e.g. "11 min read"
    read_times = [span.get('title')
                  for span in soup.find_all('span', {'class': 'readingTime'})]
    # publication names, aligned one-to-one with the titles
    publications = [_publication_name(h2tag) for h2tag in h2tags]

    # Numeric cells. The filter keeps short values or ones containing '.';
    # NOTE(review): presumably this drops long hidden sort values -- confirm.
    num_vals = [span.text
                for span in soup.find_all('span', {'class': 'sortableTable-value'})
                if len(span.text) < 13 or '.' in span.text]
    # values repeat in groups of four: views, reads, read ratio, fans
    views = num_vals[::4]
    reads = num_vals[1::4]
    read_ratio = num_vals[2::4]
    fans = num_vals[3::4]

    # The key must be 'Publication': that is the name used when the columns
    # are selected below and when the rows are printed later.
    # pandas raises ValueError here if the lists differ in length.
    df = pd.DataFrame(data={'Title': story_titles, 'Read Time': read_times,
                            'Publication': publications, 'Views': views,
                            'Reads': reads, 'Read Ratio': read_ratio,
                            'Fans': fans})
    # reorder the columns
    df = df[['Title', 'Publication', 'Read Time', 'Views', 'Reads',
             'Read Ratio', 'Fans']]

    # Convert the columns that are fully numeric; leave the rest untouched.
    # Same outcome as to_numeric(errors='ignore') without the deprecated option.
    for column in df.columns:
        try:
            df[column] = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            pass

    # keep only the number: "11 min read" -> 11
    df['Read Time'] = df['Read Time'].apply(lambda x: int(x.split()[0]))
    return df
def show_info(df, delay=0.3):
    """Print each story's statistics to stdout, pausing between stories.

    Args:
        df: DataFrame with the columns 'Title', 'Read Time', 'Publication',
            'Views', 'Reads', 'Read Ratio' and 'Fans'.
        delay: seconds to sleep after each story. Defaults to the original 0.3.

    NOTE(review): the source's indentation was lost; this assumes only the
    banner line is limited to the first row -- confirm against the original.
    """
    # enumerate() gives the row position, so the banner is printed once even
    # when the DataFrame's index labels do not start at 0
    for position, (_, row) in enumerate(df.iterrows()):
        if position == 0:
            print("**************************************")
        print('Title:', row['Title'])
        print('Read Time:', row['Read Time'])
        print('Publication:', row['Publication'])
        print('Views:', row['Views'])
        print('Reads:', row['Reads'])
        print('Read Ratio:', row['Read Ratio'])
        print('Fans:', row['Fans'])
        print("Sukanya's Writer Statistics @ Medium")
        time.sleep(delay)
if __name__ == "__main__":
    # Script entry point: log in, load the whole stats table, print it and
    # save it as CSV in the current working directory.
    output_csv = 'mywriterstats.csv'
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        # logs in
        login_to_medium(driver, USER, PASS)
        # scroll until every story row is loaded
        scroll_down(driver)
        # extract_info() quits the driver itself once the table is read
        df = extract_info(driver)
    except Exception:
        # do not leave the browser open when a step fails part-way
        driver.quit()
        raise
    show_info(df)
    # export data as csv to the local machine
    df.to_csv(output_csv, index=False)
    # report the file that was actually written
    print("Your writer stats at Medium is created and saved in {}!".format(output_csv))