# EmailAddress-Scraper.py
# + Ensure the script can locate and extract information from the "Impressum" page of each domain.
# + Handle different variations of email addresses (@, [ät], at, and so on...).
# + Extract email addresses from images (some webmasters embed the address as a JPEG or PNG to prevent scraping).
# + This is an industry project; these steps need to be performed on the URLs listed below.
import re
from io import BytesIO
from urllib.parse import urljoin

import pandas
import requests
from bs4 import BeautifulSoup
from PIL import Image
import pytesseract
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options

# Run Chrome headless so the scraper works without a visible browser window.
chrome_options = Options()
chrome_options.add_argument("--headless")

# Path to the local Tesseract binary used by pytesseract for OCR.
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\USER\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
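# Portability sketch (an assumption, not part of the original script): the path
# above is a Windows install location; on other machines the Tesseract binary
# can instead be looked up on PATH and used only if it is found.
import shutil
_tesseract_on_path = shutil.which('tesseract')
if _tesseract_on_path:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_on_path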
urls = [
"peersociallending.com",
"kreditvergleich-kostenlos.net",
"matblog.de",
"malta-tours.de",
"wiseclerk.com",
"urlaub-in-thailand.com",
"findle.top",
"niederrheinzeitung.de",
"finanziell-umdenken.blogspot.com",
"midbio.org",
"klaudija.de",
"pc-welt.wiki",
"websitevalue.co.uk",
"freizeitcafe.info",
"ladenbau.de",
"bierspot.de",
"biboxs.com",
"finance-it-blog.de",
"guenstigerkreditvergleich.com",
"cloudbiz.one",
"frag-den-heimwerker.com",
"fintech-intel.com",
"selbst-schuld.com",
"eltemkredit.com",
"binoro.de",
"siteurl.org",
"frachiseportal.at",
"finlord.cz",
"vj-coach.de",
"mountainstatescfc.org",
"crowdstreet.de"
]
# Pass 1: visit each domain and collect the link to its Impressum page.
driver = webdriver.Chrome(options=chrome_options)
impressums = []
for url in urls:
    try:
        link = 'https://' + url
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        found = False
        for anchor in soup.find_all('a'):
            if 'Impressum' in anchor.text:
                href = anchor.get('href')
                if not href:
                    continue
                # Some websites link their Impressum with a full URL (https://...)
                # and some with only a relative path; urljoin handles both cases.
                impressums.append(urljoin(link, href))
                found = True
                print(link, ": Impressum found!")
                break
        if not found:
            print(link, ": Impressum doesn't exist!")
    # Some websites may be unreachable or return an error (e.g. 404).
    except WebDriverException:
        print(link, ": Page doesn't exist!")
# Remove duplicates from the list of Impressum links.
impressums = list(set(impressums))
driver.quit()
pattern1 = r'\b\S+\[at\]\S+\b'     # addresses obfuscated with [at]
pattern2 = r'\b\S+@\S+\b'          # plain addresses containing @
pattern3 = r'\b\S+\s*@\s*\S+'      # addresses with spaces around the @ (abc @ example.com)
pattern4 = r'[\w.-]+\(a\)[\w.-]+'  # addresses obfuscated with (a)
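# Sketch (an assumption, not used in the original script): the requirements above
# also mention "[ät]" / "at" obfuscations; a pattern like this could be added to
# the paragraph scan below if such variants occur, e.g. "info [ät] example.de".
pattern5 = r'[\w.-]+\s*(?:\[ät\]|\(at\)|\bat\b)\s*[\w.-]+'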
# Pass 2: visit each Impressum page and extract email addresses from
# paragraphs, table cells, and images (via OCR).
driver = webdriver.Chrome(options=chrome_options)
mails, impressumsList = [], []
for impressum in impressums:
    print(impressum)
    try:
        driver.get(impressum)
    except WebDriverException:
        continue
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Look for plain or obfuscated addresses in <p> elements.
    for p in soup.find_all('p'):
        if '[at]' in p.text:
            mails.append(re.findall(pattern1, p.text))
            impressumsList.append(impressum)
        if '[@]' in p.text:
            mails.append([p.text])
            impressumsList.append(impressum)
        elif '@' in p.text:
            if len(re.findall(pattern2, p.text)) > 0:
                mails.append(re.findall(pattern2, p.text))
                impressumsList.append(impressum)
                break
            else:
                mails.append(re.findall(pattern3, p.text))
                impressumsList.append(impressum)
    # Some pages put the address in a table cell instead of a paragraph.
    for td in soup.find_all('td'):
        if '@' in td.text:
            mails.append(list(set(re.findall(pattern2, td.text))))
            impressumsList.append(impressum)
            break
    # Some webmasters embed the address in an image, so run OCR on each image.
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            continue
        try:
            # Image sources are often relative paths, so resolve them against the page URL.
            response = requests.get(urljoin(impressum, src))
            image = Image.open(BytesIO(response.content))
            text = pytesseract.image_to_string(image)
            if '@' in text or '(a)' in text:
                if len(re.findall(pattern2, text)) > 0:
                    mails.append(re.findall(pattern2, text))
                else:
                    mails.append(re.findall(pattern4, text))
                impressumsList.append(impressum)
                break
        except Exception:
            # Unreachable image URL or data Pillow/Tesseract cannot read.
            continue
driver.quit()
# Flatten the per-page results and normalise obfuscated addresses, keeping each
# email paired with the Impressum page it was found on.
rows = []
for impressum, emails in zip(impressumsList, mails):
    for email in emails:
        cleaned = (email.replace(".comTel", ".com").replace(".deTel", ".de")
                        .replace("[@]", "@").replace("[at]", "@")
                        .replace(' ', '').replace("(a)", "@"))
        rows.append((impressum, cleaned))
emailDomains = pandas.DataFrame(rows, columns=['Domains', 'Emails'])
# Clean the website addresses so only the bare domain name remains.
emailDomains['Domains'] = emailDomains['Domains'].apply(
    lambda x: x.replace('https://www.', '').replace('http://www.', '')
               .replace('https://', '').replace('http://', ''))
emailDomains['Domains'] = emailDomains['Domains'].apply(lambda x: x.split('.')[0])
emailDomains.to_csv('EmailAddress-Processed.csv', index=False)
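# Usage note (environment assumptions, not stated in the original script): running
# this file needs a Chrome installation that Selenium can drive, a local Tesseract
# install for pytesseract, and the Python packages beautifulsoup4, pandas, selenium,
# requests, Pillow, and pytesseract.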