-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsite.py
162 lines (133 loc) · 4.04 KB
/
site.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/python
# -*- coding: utf-8 -*-
import page
import sys
import os
import time
import re
import socket
import requests
import jieba
import jieba.analyse
import sqlite3
from bs4 import BeautifulSoup
from furl import furl
class website(object):
    """Collect basic SEO / infrastructure facts about a single website.

    An instance is bound to one bare domain name (no scheme, no path), for
    example ``"www.vrnew.com"``. Every URL built by this class uses plain
    ``http://``; https is not supported yet.
    """

    # Seconds to wait for a remote server before giving up, so an
    # unresponsive host cannot block the caller forever.
    REQUEST_TIMEOUT = 10

    # Trailing marker ("reverse whois lookup") removed from the contact and
    # e-mail cells of the whois page.
    _WHOIS_REVERSE_SUFFIX = "[whois反查]"

    # One row per whois field:
    # (result key, index of the matched cell, number of leading characters
    #  dropped from the cell text, whether the reverse-lookup marker is
    #  removed from the end).
    # NOTE(review): the dropped leading characters are presumably the cell's
    # label; confirm against the current page layout.
    _WHOIS_FIELDS = (
        ('register', 1, 3, False),
        ("contacter", 2, 3, True),
        ('mail', 3, 4, True),
        ('creattime', 4, 4, False),
        ('builttime', 5, 4, False),
        ('passtime', 6, 4, False),
        ('dnsdomain', 7, 5, False),
        ('dns', 8, 3, False),
    )

    def __init__(self, domain):
        """Remember the domain to analyse.

        Args:
            domain: bare host name such as ``"www.vrnew.com"``.
        """
        self.domain = domain

    def get_index(self):
        """Return the site's index URL (``http://<domain>``, no trailing slash)."""
        return "http://" + self.domain  # https is not supported yet

    def get_ip(self):
        """Return the IP address the domain resolves to, as a string.

        Uses the first entry returned by ``socket.getaddrinfo``.

        Raises:
            socket.gaierror: if the domain cannot be resolved.
        """
        first_entry = socket.getaddrinfo(self.domain, None)[0]
        # Each entry is (family, type, proto, canonname, sockaddr) and
        # sockaddr starts with the address string.
        return first_entry[4][0]

    @classmethod
    def _strip_whois_suffix(cls, text):
        """Return *text* without one trailing reverse-lookup marker, if present."""
        suffix = cls._WHOIS_REVERSE_SUFFIX
        # str.rstrip(suffix) would treat the marker as a *set of characters*
        # and also eat trailing letters of the real value (e.g. the "is" of
        # "bar.is"), so the suffix is removed explicitly instead.
        if text.endswith(suffix):
            return text[:-len(suffix)]
        return text

    def get_whois(self):
        """Fetch the domain's whois record from whois.chinaz.com.

        Returns:
            dict with the string keys ``register``, ``contacter``, ``mail``,
            ``creattime``, ``builttime``, ``passtime``, ``dnsdomain`` and
            ``dns``, each mapped to the text scraped from the page.

        Raises:
            requests.RequestException: on network failure or timeout.
            IndexError: if the page yields fewer cells than expected.
        """
        url = "http://whois.chinaz.com/" + self.domain
        headers = {'User-agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3018.4 Safari/537.36"}
        # NOTE(review): this is a hard-coded browser cookie that looks like it
        # contains a personal session token; it is kept unchanged to preserve
        # behaviour but should be removed or moved to configuration. It is
        # also passed as a single cookie *named* "Cookie" -- presumably a raw
        # Cookie header was intended; confirm.
        cookie = {"Cookie": "BAIDUID=0C5608327F1D060AF6434C52FEA7F30D:FG=1; BIDUPSID=CBC07341311B0A31165A5B0A42F8D373; PSTM=1481992218; BDUSS=XhLdlltaXpxaU1ySDdtZ0NuRUtwbnRzSjNub00ySi03QThtTVY5fndVZnJNcGRZSVFBQUFBJCQAAAAAAAAAAAEAAAB6-OY1xu~Xxc31sMvIpb-01u0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOulb1jrpW9YN; BAIDUCUID=++; __cfduid=dd608171a35a5a40e06fc0b9cdcc8e1131489502762; Hm_lvt_28a17f66627d87f1d046eae152a1c93d=1495719879; Hm_lpvt_28a17f66627d87f1d046eae152a1c93d=1495719879"}
        response = requests.get(
            url,
            headers=headers,
            cookies=cookie,
            timeout=self.REQUEST_TIMEOUT,
        )
        soup = BeautifulSoup(response.text, 'lxml')
        # Select once; every field is read from this same list of cells.
        cells = soup.select(".clearfix .bor-b1s")

        info = {}
        for key, index, skip, has_suffix in self._WHOIS_FIELDS:
            text = cells[index].get_text()[skip:]
            if has_suffix:
                text = self._strip_whois_suffix(text)
            info[key] = text
        return info

    def get_env(self):
        """Return the value of the ``Server`` response header of the index page.

        Raises:
            requests.RequestException: on network failure or timeout.
            KeyError: if the response carries no ``Server`` header.
        """
        response = requests.get(self.get_index(), timeout=self.REQUEST_TIMEOUT)
        return response.headers["Server"]

    def _robots_url(self):
        """Return the URL of the site's robots.txt file."""
        return self.get_index() + "/robots.txt"

    def get_robots(self):
        """Download the site's robots.txt and return its text.

        Raises:
            requests.RequestException: on network failure or timeout.
        """
        # The original concatenated the literal string "self.domain" and
        # omitted the "/" separator, so it never requested the real file.
        response = requests.get(self._robots_url(), timeout=self.REQUEST_TIMEOUT)
        return response.text

    def get_num_baidu_included(self):
        """Return the number of pages Baidu has indexed for the site.

        Not implemented yet: always returns 0. (The original note also
        mentions the ICP registrant name; nothing is looked up here.)
        """
        return 0

    def get_num_so_included(self):
        """Return the number of pages 360 Search has indexed for the site.

        Not implemented yet: always returns 0. (The original note also
        mentions the ICP filing number; nothing is looked up here.)
        """
        return 0

    def get_num_sogou_included(self):
        """Return the number of pages Sogou has indexed for the site.

        Not implemented yet: always returns 0.
        """
        return 0

    def get_urls(self):
        """Crawl every link of the site and record data about each one.

        Not implemented yet: always returns None. The original author's
        example of the intended record is a URL together with a list of
        (anchor text, count) pairs, e.g.
        ``url="http://www.vrnew.com", wordlist=[("home", 433), ("vr", 23)]``.
        """
        return None

    def create_file_sitemap(self):
        """Generate the sitemap file. Not implemented yet: returns None."""
        return None

    def create_file_404(self):
        """Generate the dead-link file. Not implemented yet: returns None."""
        return None

    def analyse_log(self):
        """Analyse the server log. Not implemented yet: returns None."""
        return None

    def check_friend(self):
        """Check which externally linked sites link back to this site.

        For every external URL found on the index page, the linked page is
        loaded and its own external URLs are inspected: if they contain this
        site's index URL the link is reported as a friend (reciprocal link),
        otherwise as not a friend. Results are printed; nothing is returned.
        """
        # The module only does ``import page``, so the bare name ``htmlpage``
        # used here was never defined and raised NameError.
        # NOTE(review): assumes ``page`` defines ``htmlpage`` -- confirm.
        from page import htmlpage

        index = self.get_index()
        for exurl in htmlpage(index).get_external_urls():
            if index in htmlpage(exurl).get_external_urls():
                print("{} is a friend ".format(exurl))
            else:
                print("{} is not a friend ".format(exurl))
        return None
def main():
    """Run a small smoke test for www.vrnew.com and print the result."""
    vrnew = website('www.vrnew.com')
    # The original called vrnew.get_absolute_url(...), but the website class
    # defines no such method, so the script stopped with AttributeError.
    # Print the index URL instead: that method exists and needs no network.
    print(vrnew.get_index())
# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()