#!/usr/bin/python
# coding: utf-8
import requests
import jieba.analyse
from bs4 import BeautifulSoup
from furl import furl
class htmlpage(object):
    """
    target: single-page analysis tool
    """
    def __init__(self, url):
        self.url = url
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"}

    def get_furl(self):
        return furl(self.url)

    def get_scheme(self):
        return self.get_furl().scheme

    def get_host(self):
        return self.get_furl().host

    def get_resp(self):
        return requests.get(self.url, headers=self.headers)

    def get_soup(self):
        return BeautifulSoup(self.get_resp().text, 'lxml')

    def reduce_noise(self):
        """
        target: basic page noise reduction
        """
        newsoup = self.get_soup()
        # Decompose the matched tag itself, not just the first one in the tree.
        for style in newsoup.find_all('style'):
            style.decompose()
        for script in newsoup.find_all("script"):
            script.decompose()
        return newsoup

    def get_url(self):
        """
        target: return the URL being visited
        """
        return self.url

    def get_title(self):
        """
        target: get the page title
        """
        return self.get_soup().title.string

    def get_keywords(self):
        """
        target: get the page keywords
        """
        return self.get_soup().find_all('meta', attrs={'name': 'keywords'})[0]['content'].split(',')

    def get_description(self):
        """
        target: get the page description
        """
        return self.get_soup().find_all('meta', attrs={'name': 'description'})[0]['content']

    def get_content(self):
        """
        target: get the page content, mimicking how Baidu crawls it
        return: string content
        """
        newsoup = self.reduce_noise()
        content = newsoup.body.get_text().split()
        return " ".join(content)

    def url_re2abs(self, url):
        """relative link -> absolute link"""
        f = furl(url)
        if f.host:
            return url
        else:
            me = self.get_furl()
            me.path = str(f.path)
            return me.url

    def get_all_urls(self):
        """
        target: get all urls on the page
        """
        urls = []
        newsoup = self.reduce_noise()
        for i in newsoup.find_all('a'):
            href = i.get("href")
            if not href:
                continue  # skip anchors without an href
            anchor = i.string.strip() if i.string else i.string
            urls.append({'url': self.url_re2abs(href), 'anchor': anchor})
        return urls

    def get_internal_urls(self):
        """
        target: get on-site (internal) links and their anchor text
        """
        urls = []
        for url in self.get_all_urls():
            # get_all_urls() returns dicts, so inspect the 'url' key.
            if furl(url['url']).host in [None, self.get_host()]:
                urls.append(url)
        return urls

    def get_external_urls(self):
        """
        target: get off-site (external) links
        """
        urls = []
        for url in self.get_all_urls():
            if furl(url['url']).host not in [None, self.get_host()]:
                urls.append(url)
        return urls

    def check_in_baidu(self):
        """
        target: check whether the URL is indexed by Baidu
        # returns True if indexed, False otherwise
        """
        payload = {'wd': self.url}
        req = requests.get("https://www.baidu.com/s", params=payload, headers=self.headers)
        soup = BeautifulSoup(req.text, 'lxml')
        s = soup.body.get_text()
        # These literal strings are matched against Baidu's result page.
        if "没有找到该URL。您可以直接访问" in s:
            return False
        elif "很抱歉,没有找到" in s:
            return False
        else:
            return True

    def check_in_so(self):
        """
        target: check whether the URL is indexed by 360 Search (not implemented)
        """
        return None

    def check_in_sogou(self):
        """
        target: check whether the URL is indexed by Sogou (not implemented)
        """
        return None

    def get_words(self):
        """
        target: get the highest-frequency words in the body text
        """
        tags = jieba.analyse.extract_tags(self.get_content(), topK=20)
        print(",".join(tags))
        return tags

    def show(self):
        print("url:{}".format(self.get_url()))
        print("title:{}".format(self.get_title()))
        print("keywords:{}".format(self.get_keywords()))
        print("description:{}".format(self.get_description()))
        print("internal urls:{}".format(len(self.get_internal_urls())))
        print("external urls:{}".format(len(self.get_external_urls())))
        print(self.get_all_urls())
        print(self.check_in_baidu())
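

# Illustrative sketch (not part of the original tool): the <style>/<script>
# stripping that reduce_noise performs, shown on a literal HTML string so it
# runs without network access.
def demo_reduce_noise():
    html = ("<html><head><style>p{color:red}</style></head>"
            "<body><script>var x=1;</script><p>hello world</p></body></html>")
    soup = BeautifulSoup(html, 'lxml')
    # Decompose each matched tag directly, as reduce_noise does.
    for tag in soup.find_all(['style', 'script']):
        tag.decompose()
    print(soup.body.get_text().strip())  # -> hello world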


def main():
    url = "http://www.vrnew.com/index.php/News/newscontent/id/611.html"
    vrnew = htmlpage(url)
    for i in vrnew.get_all_urls():
        print(i['anchor'])
        print(i['url'])
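

# Another offline sketch: how url_re2abs resolves links. "example.com" and the
# paths below are placeholder values; no request is made, since __init__ only
# stores the URL.
def demo_url_re2abs():
    page = htmlpage("http://example.com/news/index.html")
    # An absolute link (it has a host) is returned unchanged.
    print(page.url_re2abs("http://other.com/a.html"))  # http://other.com/a.html
    # A host-less path is rebuilt on the page's scheme and host.
    print(page.url_re2abs("/about.html"))  # http://example.com/about.html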


if __name__ == '__main__':
    main()