-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml_to_xcel.py
71 lines (54 loc) · 2.11 KB
/
html_to_xcel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import xlsxwriter
import os
# Simple script that dumps all products and prices to Excel / CSV,
# using selenium to load the pages and BeautifulSoup to parse them.
usr = os.getlogin()

# Set up the Excel workbook (targeting the current user's Desktop) with a
# single worksheet.
xlsx_path = 'C:\\Users\\' + usr + '\\Desktop\\scrapper.xlsx'
workbook = xlsxwriter.Workbook(xlsx_path)
worksheet = workbook.add_worksheet()

# Bold header row: names in column A, prices in column B.
bold = workbook.add_format({'bold': True})
for cell, title in (('A1', 'names'), ('B1', 'prices')):
    worksheet.write(cell, title, bold)
# Accumulators filled in by the scraping steps further down.
products = []     # names of the products
prices = []       # prices of the products
column_list = []  # category links found on the start page
url = 'https://shulinka.co.il/'  # some url

# Start Chrome in headless mode, i.e. without opening a window.
option = webdriver.ChromeOptions()
option.add_argument('headless')
# NOTE(review): passing the chromedriver path positionally depends on the
# installed Selenium version accepting it - confirm against the version in use.
chromedriver_path = 'C:\\Users\\' + usr + '\\Downloads\\chromedriver_win32\\chromedriver.exe'
driver = webdriver.Chrome(chromedriver_path, options=option)

# Load the start page and parse the HTML the browser ended up with.
driver.get(url)
content = driver.page_source
soup = BeautifulSoup(content, features="html.parser")
# Collect the links to the product-category pages found on the start page.
# href=True skips anchors that have no href attribute; for those,
# link.get('href') returns None and the substring test would raise TypeError.
for link in soup.find_all('a', href=True):
    link_name = link['href']
    if 'https://shulinka.co.il/product-category/' not in link_name:
        continue
    # The same category can be linked several times on one page (menus etc.);
    # keep only the first occurrence so no category is scraped twice.
    if link_name in column_list:
        continue
    column_list.append(link_name)
    print(link_name)
# Next worksheet row to write. Row 0 already holds the bold 'names' / 'prices'
# header cells (A1 / B1), so data starts at row 1 instead of overwriting them.
i = 1
# Visit every category page and record each product's name and price.
for link in column_list:
    # Reuse the browser session that is already open instead of launching a
    # new Chrome per category: that left the first session running and paid
    # the browser start-up cost for every link.
    driver.get(link)
    content = driver.page_source
    soup = BeautifulSoup(content, features="html.parser")
    for name in soup.find_all('h2', attrs={'class': 'woocommerce-loop-product__title'}):
        # The price is read from the element right after the title. A product
        # without a following sibling gets an empty price rather than
        # crashing, which also keeps `products` and `prices` the same length.
        price_tag = name.find_next_sibling()
        price = price_tag.text if price_tag is not None else ''
        print(name.text)
        print(price)
        products.append(name.text)
        prices.append(price)
        worksheet.write(i, 0, name.text)
        worksheet.write(i, 1, price)
        i = i + 1
# Optional: also export the scraped data as CSV through a pandas DataFrame.
# The original author noted problems with Hebrew text in this step.
# NOTE(review): if the problem is Excel mis-decoding the CSV, writing it with
# encoding='utf-8-sig' may help - confirm before changing the output format.
try:
    df = pd.DataFrame({'Product Name': products, 'Price': prices})
    print(df)
    df.to_csv('products.csv', index=False, encoding='utf-8')
finally:
    # Always release the browser and close the workbook, even when the
    # optional export above fails; otherwise the cleanup calls would be
    # skipped and the Excel workbook would never be closed.
    try:
        driver.quit()
    finally:
        workbook.close()