get_website.py (forked from imfht/ScanSql)
#!/usr/bin/env python
# coding:utf-8
"""
Author: Fiht --<fiht@qq.com>
Purpose: collect target websites to attack
Created: 2016-04-20
"""
import sys
import re

import requests
from optparse import OptionParser
from bs4 import BeautifulSoup

try:
    # Python 2 only: force the default string encoding to UTF-8
    reload(sys)
    sys.setdefaultencoding('utf8')
except NameError:  # reload() is not a builtin on Python 3, which is fine
    pass

count = 0
#----------------------------------------------------------------------
def fuck_href(url):
    """Pull the site name out of a list-entry URL like '..._example.com.html'."""
    i = re.findall(r'_(.*?)\.html', url)
    if i:
        return i[0]
#----------------------------------------------------------------------
def fuck_weight(text):
    """Pull the Baidu weight digit out of the weight-icon path in an entry's HTML."""
    i = re.findall(r'themes/default/images/baidu/(.)\.gif', text)
    if i:
        return i[0]
#----------------------------------------------------------------------
def shit(shitDemo, num_want, weight=0, file=None):
    """Fetch one listing page and return the sites whose Baidu weight exceeds `weight`."""
    global count
    shit_list = []
    req = requests.get(shitDemo)
    soup = BeautifulSoup(req.text, 'lxml')
    tag = soup.find(class_='listCentent')
    if tag:
        for i in tag.contents:
            fuck_we = fuck_weight(str(i))
            if fuck_we and int(fuck_we) > weight:
                count = count + 1
                shit_list.append(fuck_href(str(i)))
    return shit_list
#----------------------------------------------------------------------
def myTest():
    """Quick manual test: print the domains found on a few chinaz tag pages."""
    for i in range(6):
        req = requests.get('http://top.chinaz.com/tag/211_%d.html' % i)
        soup = BeautifulSoup(req.text, 'lxml')
        for item in soup.findAll(class_='col-gray'):
            if item.string and 'www' in item.string:
                # str.strip() removes characters, not a prefix, so drop 'www.' explicitly
                print(item.string.replace('www.', '', 1))
def run_get(url, page, num_want, file=None, weight=0):
    """Walk the paged listing and write every matching site to `file` (stdout by default)."""
    global count
    count = num_want
    if file:
        fil = open(file, 'w+')
    else:
        fil = sys.stdout
    fil.write('Template URL %s\nSites with weight > %d\n' % (url, weight))
    for i in range(2, page + 1):
        try:
            lis = shit(url.replace('{page}', str(i)), num_want=num_want, file=fil, weight=weight)
            for a in lis:
                fil.write(a + '\n')
        except KeyboardInterrupt:
            print('Interrupt received')
            fil.close()
            sys.exit(0)
#run_get('http://search.top.chinaz.com/Search.aspx?p={page}&url=%E4%B8%AD%E5%9B%BD',10,100)
if __name__ == "__main__":
    parser = OptionParser(' %prog args')
    parser.add_option('-u', '--url', dest='url',
                      help='template URL, e.g. http://top.chinaz.com/diqu/index_ZhongQing_2.html -> '
                           'http://top.chinaz.com/diqu/index_ZhongQing_{page}.html, where {page} is '
                           'the paging variable 0->n')
    parser.add_option('-p', '--page', dest='page', type='int', default=10,
                      help='total number of pages to crawl')
    parser.add_option('-n', '--number', dest='num_want', default=9999999, type='int',
                      help='how many URLs to collect')
    parser.add_option('-w', '--weight', dest='weight', default=0, type='int',
                      help='weight filter: only list sites with weight > w (default 0)')
    parser.add_option('-f', '--file', dest='file', default=None,
                      help='write to a file instead of printing')
    parser.add_option('-s', '--search', dest='keyword', default=None,
                      help='collect websites matching a keyword')
    (options, args) = parser.parse_args()
    if options.url:  # crawl from a template URL
        if '{page}' in options.url:
            run_get(options.url, options.page, options.num_want, file=options.file, weight=options.weight)
        else:
            print('No {page} placeholder found in the template URL, please check it')
    elif options.keyword:  # crawl from a keyword search
        run_get('http://search.top.chinaz.com/Search.aspx?p={page}&url=%s' % options.keyword,
                options.page, num_want=options.num_want, file=options.file, weight=options.weight)
        sys.exit(0)
    else:
        parser.print_help()
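
# A minimal usage sketch, assuming the top.chinaz.com markup this script was
# written against in 2016 (the 'listCentent' list and the Baidu weight icons)
# is still in place; the output file name sites.txt is illustrative:
#
#   python get_website.py -u 'http://top.chinaz.com/diqu/index_ZhongQing_{page}.html' -p 10 -w 2 -f sites.txt
#   python get_website.py -s keyword -p 5 -n 100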