-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbooking_hotels.py
227 lines (177 loc) · 8.69 KB
/
booking_hotels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import json
import time
import logging
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class Scraper():
    """Scrapes Booking.com search results and hotel detail pages.

    Uses headless Firefox (Selenium) to render the JavaScript-heavy search
    results, then plain ``requests`` + BeautifulSoup for each hotel's detail
    page. Results are returned as plain dicts suitable for JSON dumping.
    NOTE(review): the CSS class names below are Booking.com's generated
    classes and will break when the site redeploys — keep them centralized.
    """

    def __init__(self):
        # Configure file logging once; {}-style formatting per the file's convention.
        logging.basicConfig(
            filename="Scraper.log",
            level=logging.INFO,
            format="{asctime} - {levelname} - {message}",
            style="{")

    def init_driver(self):
        """Create and return a headless Firefox WebDriver.

        --disable-gpu / --no-sandbox make the browser usable in
        containerized / CI environments.
        """
        options = Options()
        options.add_argument('--headless')
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        return webdriver.Firefox(options=options)

    def get_rooms(self, page_source):
        """Parse a hotel detail page.

        Args:
            page_source: raw HTML of the hotel's Booking.com page.

        Returns:
            dict with keys: 'address', 'images' (list of image URLs),
            'description', 'aminties' (amenity strings; key spelling kept
            for backward compatibility), and three parallel lists
            'room_type', '#individual', '#bedrooms'.
        """
        room_detail = {}
        page = BeautifulSoup(page_source, 'lxml')

        address = page.find('span', class_='hp_address_subtitle js-hp_address_subtitle jq_tooltip')
        # BUG FIX: fallback was misspelled 'Unkown'; normalized to match the
        # 'Unknown' sentinel used everywhere else in this file.
        room_detail['address'] = address.text if address is not None else 'Unknown'

        # Collect image URLs from the three photo-grid layouts Booking.com uses.
        imgs = []
        for cell in page.find_all('div', class_='bh-photo-grid-thumb-cell'):
            if cell.img is not None:
                imgs.append(cell.img['src'])
        for link in page.find_all('a', class_='bh-photo-grid-item bh-photo-grid-side-photo active-image'):
            if link.img is not None:
                imgs.append(link.img['src'])
        main_photo = page.find('a', class_='bh-photo-grid-item bh-photo-grid-photo1 active-image')
        if main_photo is not None and main_photo.img is not None:
            imgs.append(main_photo.img['src'])
        room_detail['images'] = imgs

        description = page.find('p', class_='a53cbfa6de b3efd73f69')
        room_detail['description'] = description.text if description is not None else 'Unknown'

        amenity_spans = page.find_all('span', class_='a5a5a75131')
        # find_all never yields None elements, so .text is always safe here.
        room_detail['aminties'] = [span.text for span in amenity_spans]

        # Room rows: three parallel lists so row i describes one room offer.
        room_detail['room_type'] = []
        room_detail['#individual'] = []
        room_detail['#bedrooms'] = []
        for room in page.find_all('div', class_='ed14448b9f b817090550 e7f103ee9e'):
            if room is None or room.a is None:
                continue
            room_type = room.a.span.text
            bedroom = room.find('span', class_='baf7cb1417')
            bedroom = 'Unknown' if bedroom is None else bedroom.text
            # BUG FIX: original used `bedroom is 'Unknown'` — identity
            # comparison on a str literal only works by CPython interning
            # accident (and warns on 3.8+). Use equality.
            people = 'Unknown' if bedroom == 'Unknown' else bedroom.split()[0]
            room_detail['room_type'].append(room_type)
            room_detail['#individual'].append(people)
            room_detail['#bedrooms'].append(bedroom)
        return room_detail

    def get_hotels(self, city, city_code, page_numbers=1):
        """Scrape up to ``page_numbers`` result pages (25 hotels each) for a city.

        Args:
            city: human-readable city name, stored on each hotel record.
            city_code: Booking.com numeric destination id.
            page_numbers: how many 25-result pages to fetch (default 1).

        Returns:
            dict mapping hotel title -> hotel-detail dict.
        """
        driver = self.init_driver()
        hotels = {}
        try:
            for i in range(page_numbers):
                # offset paginates in steps of 25 results per page.
                url = f'https://www.booking.com/searchresults.en-gb.html?label=gog235jc-1FCAEiBWhvdGVsKIICOOgHSDNYA2hDiAEBmAEJuAEZyAEP2AEB6AEB-AENiAIBqAIDuAKZ7-2tBsACAdICJDQ5NDY3ZTNjLTZiYzMtNDFjNi05MGQ4LTk0NjAwNGNiZWRlY9gCBuACAQ&sid=0ca19a26ae77e73a83fd4cdc2f1ce287&aid=397594&city=-{city_code}&offset={i*25}'
                logging.info(f"Fetching page {i+1} for {city}...")
                driver.get(url)
                try:
                    # Wait until at least one hotel card is rendered by JS.
                    WebDriverWait(driver, 30).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, '.c82435a4b8.a178069f51.a6ae3c2b40.a18aeea94d.d794b7a0f7.f53e278e95.c6710787a4'))
                    )
                except TimeoutException as e:
                    logging.error(f"Timeout while loading page {i+1}: {e}")
                    continue

                soup = BeautifulSoup(driver.page_source, 'lxml')
                hotel_cards = soup.find_all('div', class_="c82435a4b8 a178069f51 a6ae3c2b40 a18aeea94d d794b7a0f7 f53e278e95 c6710787a4")
                for hotel_card in hotel_cards:
                    if hotel_card is None:
                        continue
                    # Per-hotel record; title is required, everything else optional.
                    hotel = {'city': city, 'country': "Egypt"}
                    title = hotel_card.find('div', class_='f6431b446c a15b38c233')
                    if title is None:
                        continue
                    title = title.text.strip()

                    img = hotel_card.find('img', class_='f9671d49b1')
                    # BUG FIX: the image URL was computed but never stored.
                    hotel['image'] = img['src'] if img is not None else 'no_image_found'

                    average_score = hotel_card.find('div', class_='a3b8729ab1 d86cee9b25')
                    hotel['average_score'] = average_score.text if average_score is not None else 'Unknown'

                    grade = hotel_card.find('div', class_='a3b8729ab1 e6208ee469 cb2cbb3ccb')
                    hotel['grade'] = grade.text if grade is not None else 'Unknown'

                    reviewers = hotel_card.find('div', class_='abf093bdfe f45d8e4c32 d935416c47')
                    hotel['#reviewers'] = reviewers.text if reviewers is not None else 'Unknown'

                    # BUG FIX: original chained .find(...).a['href'] and crashed
                    # with AttributeError on any card missing the link wrapper.
                    link_div = hotel_card.find('div', class_='a5922b8ca1')
                    hotel_link = link_div.a['href'] if link_div is not None and link_div.a is not None else None
                    if hotel_link:
                        try:
                            response = requests.get(hotel_link, timeout=10)
                            response.raise_for_status()
                            hotel.update(self.get_rooms(response.text))
                        except requests.exceptions.HTTPError as e:
                            if response.status_code == 429:
                                # Rate limited: back off. NOTE(review): no retry
                                # is attempted afterwards — the hotel is kept
                                # without room details, matching prior behavior.
                                logging.warning("Too many requests. Retrying after a delay")
                                time.sleep(60)
                            else:
                                logging.error(f"Http error occurred: {e}")
                        except requests.exceptions.RequestException as e:
                            # Network-level failure: skip this hotel entirely.
                            logging.error(f'Failed to fetch hotel details for {title}: {e}')
                            continue
                    hotels[title] = hotel

                logging.info(f"Page {i+1} for {city} completed with {len(hotel_cards)} hotels found.")
                # Be polite to the server between result pages.
                time.sleep(2)
        finally:
            # Ensure the browser is closed even if a page raises unexpectedly.
            driver.quit()
        return hotels
if __name__ == '__main__':
    # City name -> Booking.com numeric destination id.
    cities = {'cairo' : '290692',
              'alex': '290263',
              'hurghada': '290029',
              'sharm-el-sheikh':'302053',
              'ain-sokhna': '900040497',
              'dahab': '293084',
              'el-alamein':'289704',
              'marsa-matruh': '298644',
              'luxor':'290821',
              'aswan':'291535'
              }
    # Number of 25-result pages to fetch per city.
    pages = {'cairo' : 40,
             'alex': 19,
             'hurghada': 40,
             'sharm-el-sheikh': 16,
             'ain-sokhna': 9,
             'dahab': 11,
             'el-alamein': 20,
             'marsa-matruh': 5,
             'luxor': 10,
             'aswan': 6
             }
    # BUG FIX: strip whitespace so "cairo " still matches the lookup key.
    input_city = input('enter the city to get its hotels: \n> ').strip().lower()
    scraper = Scraper()
    if input_city in cities:
        try:
            logging.info(f"Scraping {input_city}...")
            hotels = scraper.get_hotels(input_city, cities[input_city], pages[input_city])
            # Persist the scraped data as pretty-printed JSON.
            with open(f'eg_{input_city}_hotels.json', 'w') as json_file:
                json.dump(hotels, json_file, indent=2)
            logging.info(f"Scraping for {input_city} completed. Data saved.")
        except Exception as e:
            # Top-level boundary: log and keep the log file as the record.
            logging.error(f"Error scraping {input_city}: {e}")
    else:
        # BUG FIX: message said "Country" but the lookup key is a city.
        logging.error(f"City {input_city} not found in the list.")