-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathget_links.py
58 lines (48 loc) · 1.5 KB
/
get_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from lxml import etree
from bs4 import BeautifulSoup
from parsel import Selector
from selenium.webdriver.common.keys import Keys
import requests
import getpass
from selenium import webdriver
import time
import random
import pandas
import time
# Crawl the immoweb.be "house for sale" search result pages with Firefox and
# write every listing link found to Dataset/links.txt, one URL per line.
print("1-start")
url = 'https://www.immoweb.be'
driver = webdriver.Firefox()
try:
    # Element lookups below wait up to 30 s for the element to appear.
    driver.implicitly_wait(30)
    print("2-open browser")
    driver.get(url)
    # Short random pause before interacting with the freshly loaded page.
    time.sleep(random.uniform(1.0, 2.0))
    print("3-get url")
    # The generic find_element("xpath", ...) form is used because the
    # find_element_by_* helpers were removed in Selenium 4.3+; this form
    # works on both Selenium 3 and 4.
    driver.find_element('xpath', '//*[@id="uc-btn-accept-banner"]').click()
    print("4-coockie is clicked")

    # Build each page URL from a template instead of string-replacing the
    # previous page number, which would also rewrite any other matching
    # digits in the URL.
    search_url = ('https://www.immoweb.be/en/search/house/for-sale'
                  '?countries=BE&page=%d&orderBy=relevance')
    results_xpath = ('/html/body/div[1]/div[2]/div/main/div/div[2]/div'
                     '/div[3]/div/div/div[1]/div/ul')

    with open('Dataset/links.txt', 'w', encoding='utf-8') as f:
        for i in range(1, 334):
            if i == 1:
                print("5 - Search Lauched")
            if i % 10 == 0:
                # Longer pause on every tenth page.
                time.sleep(10)
            url = search_url % i
            driver.get(url)
            for result_list in driver.find_elements('xpath', results_xpath):
                soup = BeautifulSoup(
                    result_list.get_attribute('outerHTML'), 'lxml')
                for elem in soup.find_all('a'):
                    url1 = elem.get('href')
                    # Anchors without an href yield None; skip them rather
                    # than crash on the substring test.
                    if url1 and "www.immoweb.be" in url1:
                        f.write("%s\n" % url1)
            print("page: ", i)

    print("6-did you get the link?")
    print("the browser will be closed in a few seconds")
    # NOTE: this only sets the element lookup timeout; it does not pause.
    driver.implicitly_wait(30)
finally:
    # Always end the WebDriver session, even if the crawl failed part-way,
    # so no browser/driver process is left running.
    driver.quit()
print("Finish - browser is closed")