-
Notifications
You must be signed in to change notification settings - Fork 6
/
Extracting_URL's_HTML.py
51 lines (36 loc) · 1.24 KB
/
Extracting_URL's_HTML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def print_list(l):
    """Print every element of ``l`` on its own line.

    Args:
        l: Iterable whose items are each passed to ``print``.
    """
    for item in l:
        print(item)
# Request headers sent with the page fetch; '<Name>' and '<email>' are
# placeholder values.
headers = {
    'User-Agent': '<Name>',
    'From': '<email>',
}

# Collecting the page from the server and parsing it
url = 'http://careers.ecil.co.in/login.php'
# A timeout keeps the script from hanging forever on an unresponsive server.
page = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx responses instead of scraping an error page.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html5lib')

# prettify() will format the html code in a tree structure
# print(soup.prettify())

images = []  # to store urls of the images
pdfs = []  # to store pdf urls
# Base URL that the scraped links are resolved against.
root = 'http://careers.ecil.co.in/'
# Collect the src of every <img> tag and store it as an absolute URL.
for item in soup.find_all('img'):
    link = item.get('src')
    # item.get() returns None when the tag has no src attribute; skip those
    # (and empty values) instead of failing while building the URL.
    if not link:
        continue
    # urljoin resolves relative paths against root and leaves links that are
    # already absolute untouched, unlike plain string concatenation.
    images.append(urljoin(root, link))
# Collect the href of every <a> tag, keeping only links that end in '.pdf',
# and store each as an absolute URL.
for item in soup.find_all('a'):
    link = item.get('href')
    # item.get() returns None when the anchor has no href. The extension is
    # compared case-insensitively so '.PDF' links are kept as well.
    if link and link.lower().endswith('.pdf'):
        # urljoin resolves relative paths against root and leaves links that
        # are already absolute untouched.
        pdfs.append(urljoin(root, link))
# Print each collection under its heading, followed by a dashed rule and
# then one URL per line.
for heading, urls in (("Image URL's", images), ("\nPDF's URL's", pdfs)):
    print(heading)
    print('------------------------------------------------------------')
    print_list(urls)