forked from old-wan/chinaseotools
page.py
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
from furl import furl
from urllib.parse import urljoin
class page(object):
    '''Analyze a single page and produce an on-page SEO report.'''
    def __init__(self, url):
        self.url = url
    @property
    def host(self):
        return furl(self.url).host
    @property
    def getsoup(self):
        # Note: the page is fetched and re-parsed on every access of this property.
        resp = requests.get(self.url)
        soup = BeautifulSoup(resp.text, 'lxml')
        return soup
    def analyzeurl(self):
        '''URL analysis: flag URLs whose path is nested too deeply.'''
        # furl(...).path.segments is the list of path components,
        # e.g. /a/b/c.html -> ['a', 'b', 'c.html'].
        depth = len(furl(self.url).path.segments)
        if depth > 3:
            print('Complex URL structure:', self.url)
        return
    def gethtml(self):
        '''Return the (prettified) HTML source of the page.'''
        return self.getsoup.prettify()
    def gettext(self):
        '''Return the visible text of the page, with scripts and styles removed.'''
        newsoup = self.getsoup
        # Decompose each matched tag directly instead of repeatedly looking up
        # newsoup.style / newsoup.script, which only ever targets the first match.
        for style in newsoup.find_all('style'):
            style.decompose()
        for script in newsoup.find_all('script'):
            script.decompose()
        return " ".join(newsoup.get_text().split())
    def checkinsearch(self):
        '''Check whether the page is indexed by Baidu / Sogou / 360.'''
        pass
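    def _search_result_count(self, engine='https://www.baidu.com/s', param='wd'):
        # A minimal sketch, not part of the original tool: checkinsearch() above
        # is only a stub, and this hypothetical helper illustrates one possible
        # heuristic -- query a search engine for the exact URL and count result
        # blocks on the first results page. The default endpoint, parameter name
        # and the 'result' CSS class are assumptions about Baidu's markup and
        # can break without notice.
        headers = {'User-Agent': 'Mozilla/5.0'}
        resp = requests.get(engine, params={param: self.url}, headers=headers)
        soup = BeautifulSoup(resp.text, 'lxml')
        return len(soup.find_all('div', class_='result'))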
    @property
    def analyzetitle(self):
        '''Title analysis: flag titles that are missing, longer than 120 or shorter than 10 characters.'''
        tag = self.getsoup.title
        # Handle pages with no <title> (or an empty one) before measuring length.
        if tag is None or not tag.string:
            return 'Missing title'
        title = tag.string
        length = len(title)
        if length > 120:
            return 'Title too long: ' + title
        elif length < 10:
            return 'Title too short: ' + title
        else:
            return 'Page title: ' + title
    def analyzeheadings(self):
        '''Heading-tag analysis: there should be exactly one H1.'''
        headings = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        info = []
        mess = []
        soup = self.getsoup  # fetch once instead of once per heading level
        for heading in headings:
            for h in soup.find_all(heading):
                info.append({h.name: h.get_text()})
                mess.append(h.name)
        if mess.count('h1') != 1:
            print('The H1 tag is not unique')
        else:
            print('Exactly one H1 tag')
        return info
    def analyzeimages(self):
        '''
        Every image should have its own alt text;
        return the list of <img> tags that lack the alt attribute.
        '''
        imgs = []
        for img in self.getsoup.find_all('img'):
            if not img.has_attr('alt'):
                imgs.append(img)
        return imgs
    @property
    def links(self):
        '''Collect every hyperlink on the page, with relative URLs resolved to absolute ones.'''
        links = []
        for a in self.getsoup.find_all('a', href=True):
            # urljoin resolves relative hrefs against the page URL and leaves
            # absolute hrefs untouched.
            absolute = urljoin(self.url, a['href'])
            links.append({'url': absolute, 'text': a.get_text().strip()})
        return links
    @property
    def internallinks(self):
        '''Internal links: unique URLs on the page that point to the same host.'''
        ilinks = []
        for i in self.links:
            try:
                p = furl(i['url'])
            except ValueError:
                # Skip hrefs that furl cannot parse.
                continue
            if p.host == self.host:
                ilinks.append(i['url'])
        return list(set(ilinks))
    def externallinks(self):
        '''External links: URLs on the page that point to a different host.'''
        exlinks = []
        for i in self.links:
            try:
                p = furl(i['url'])
            except ValueError:
                continue
            if p.host != self.host:
                exlinks.append(i['url'])
        return exlinks
    def getreport(self):
        '''Print a short analysis report for the page.'''
        self.analyzeurl()
        # analyzetitle is a property returning a string, so print it rather than calling it.
        print(self.analyzetitle)
        missing_alt = self.analyzeimages()
        if missing_alt:
            print(len(missing_alt), 'image(s) missing alt text')
        self.analyzeheadings()
def main():
    url = "http://www.vrnew.com/"
    vrnew = page(url)
    print(vrnew.internallinks)
if __name__ == '__main__':
    main()