-
Notifications
You must be signed in to change notification settings - Fork 0
/
SEO_Scraper.py
122 lines (99 loc) · 3.31 KB
/
SEO_Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import requests
from bs4 import BeautifulSoup
import urllib
import json
def parse_data(allData):
g = 0
Data = []
l = {}
for i in range(0, len(allData)):
link = allData[i].find('a').get('href')
if (link is not None):
if (link.find('https') != -1 and link.find('http') == 0 and link.find('aclk') == -1):
g = g+1
l["link"] = link
try:
l["title"] = allData[i].find('h3').text
except:
l["title"] = None
Data.append(l)
l = {}
else:
continue
else:
continue
return Data
def display_results(json_data):
print(json.dumps(json_data, indent=2))
def extract_info(URL):
seo_info = {}
html = requests.get(URL)
soup = BeautifulSoup(html.text, 'html.parser')
seo_info["link"] = URL
try:
seo_info["metatitle"] = (soup.find('title')).get_text()
except:
seo_info["metatitle"] = None
try:
seo_info["metadescription"] = soup.find(
'meta', attrs={'name': 'description'})["content"]
except:
seo_info["metadescription"] = None
try:
seo_info["robots directives"] = soup.find('meta', attrs={'name': 'robots'})[
"content"].split(",")
except:
seo_info["robots directives"] = None
try:
seo_info["viewport"] = soup.find(
'meta', attrs={'name': 'viewport'})["content"]
except:
seo_info["viewport"] = None
try:
seo_info["charset"] = soup.find(
'meta', attrs={'charset': True})["charset"]
except:
seo_info["charset"] = None
try:
seo_info["html language"] = soup.find('html')["lang"]
except:
seo_info["html language"] = None
try:
seo_info["canonical"] = soup.find(
'link', attrs={'rel': 'canonical'})["href"]
except:
seo_info["canonical"] = None
try:
seo_info["list hreflangs"] = [[a['href'], a["hreflang"]]
for a in soup.find_all('link', href=True, hreflang=True)]
except:
seo_info["list hreflangs"] = None
try:
seo_info["mobile alternate"] = soup.find(
'link', attrs={'media': 'only screen and (max-width: 640px)'})["href"]
except:
seo_info["mobile alternate"] = None
return seo_info
def search_google(search_query):
query = urllib.parse.quote_plus(search_query)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
url = 'https://www.google.com/search?q={query}'.format(query=query)
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'html.parser')
allData = soup.find_all("div", {"class": "g"})
results = parse_data(allData)
seo_info_hold = []
err_links = []
for i in results:
try:
value = extract_info(i["link"])
seo_info_hold.append(value)
except:
err_links.append(i["link"])
print("Scraped SEO results : \n")
display_results(seo_info_hold)
if (len(err_links) > 0):
print("\nCan't able to scrape these links : \n")
# display_results(err_links)
return seo_info_hold;