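"""webcrawler.py

A small Yelp scraper: it walks the paginated search results for a given Yelp
search URL, visits each listed business page, and collects the business title,
phone number, location and website, optionally exporting them to an Excel file.
"""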
from os import system
import argparse

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

data = []  # collected rows: [title, phone number, location, website]
def yelp_spider(max_pages=0, URL=''):
    """Fetch the Yelp search result pages and scrape every business listed on them."""
    page = 0
    pages = []
    print('\n')
    # Yelp paginates search results 10 businesses at a time via the &start= parameter.
    while page <= max_pages:
        pages.append(requests.get(URL + '&start=' + str(page * 10)))
        print('page: ', page + 1, ' scanned')
        page += 1
    print('\n')
    page = 0
    while page <= max_pages:
        plain_text = pages[page].text
        soup = BeautifulSoup(plain_text, 'html.parser')  # parse the page so the business links can be extracted
        it = 0
        for link in soup.find_all('a', {'class': 'css-1m051bw'}):
            title = link.string
            href = 'https://www.yelp.com' + link.get('href')
            if it < 3:  # the first few anchors are sponsored ads, so skip them
                it += 1
                continue
            print('title: ', title)
            get_inside_data(href, title)
        page += 1
        print('page: ', page, ' Done\n')
def get_inside_data(item_url, title):
    """Open a single business page and pull out its phone number, address and website."""
    page_source_code = requests.get(item_url)
    plain_text = page_source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    sibs = soup.find_all('div', attrs={'class': 'css-1vhakgw border--top__09f24__exYYb border-color--default__09f24__NPAKY'})
    website_addr = ''
    ph_no = ''
    location = ''
    for divs in sibs:
        text = str(divs.text)
        # phone number
        if 'Phone' in text:
            ph_no = text.replace('Phone number', '')
            print('phone No: ', ph_no)
        # shop address
        if 'Get Directions' in text:
            location = text.replace('Get Directions', '')
            print('location: ', location)
        # website address
        soup2 = BeautifulSoup(str(divs), 'html.parser')
        for j in soup2.find_all('a', attrs={'class': 'css-1um3nx'}):
            if 'biz' in str(j.get('href')):
                website_addr = 'https://www.yelp.com' + j.get('href')
                print('website: ', website_addr)
                break
    data.append([title, ph_no, location, website_addr])
    print("\n###################################################\n")
def args_parsed():
    """Parse the command-line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', type=str, required=True,
                        help='(Required): URL of the Yelp search results page to start parsing.')
    parser.add_argument('-p', type=int, default=1,
                        help='(Optional): How many pages to scrape (default: 1), e.g. -p 2 scans the first 2 pages of the given link.')
    parser.add_argument('-o', type=str, default='file.xlsx', required=False,
                        help='(Optional): Name of the Excel file the collected data is exported to.')
    args = parser.parse_args()
    return args
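# Example invocation (the URL below is only an illustration; any Yelp search
# results URL works the same way):
#   python webcrawler.py -u "https://www.yelp.com/search?find_desc=plumbers&find_loc=Austin" -p 2 -o results.xlsx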
def save_data(filename='file.xlsx'):
    """Write the collected rows to an Excel workbook."""
    wb = Workbook()   # create a workbook object
    ws = wb.active    # use its active worksheet
    ws.append(['TITLE OF BUSINESS', 'PHONE NUMBER', 'LOCATION', 'WEBSITE'])
    for row in data:
        ws.append(row)  # each list becomes a new row of cells
    wb.save(filename=filename)  # save to the Excel file
    print('Output produced as: ', filename)
if __name__ == '__main__':
    system('color 0b')  # set the console colour (Windows cmd only)
    args = args_parsed()  # parse the arguments once and reuse them
    yelp_spider(args.p - 1, args.u)
    if args.o:
        save_data(args.o)
    print('\n\nby: The UPPERCASE GUY.\n\n')