This repository has been archived by the owner on Jun 18, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
crawl_proxy_servers.py
71 lines (63 loc) · 2.35 KB
/
crawl_proxy_servers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
'''
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public
* License
* along with this program. If not, see
* <http://www.gnu.org/licenses/>.
'''
import requests
from bs4 import BeautifulSoup
# Accumulates working 'ip:port' proxy strings found by crawl_proxy_servers().
proxy_list = []
# 163.com endpoint that returns 'true' when the requesting IP is in mainland China.
test_url = 'http://ipservice.163.com/isFromMainland'
# Page listing candidate proxy servers to scrape.
crawl_url = 'http://cn-proxy.com'
def crawl_proxy_servers():
    """Scrape cn-proxy.com for proxies and append working ones to proxy_list.

    Fetches `crawl_url`, extracts ip/port pairs from the first
    'table-container' div, and keeps each proxy for which
    test_proxy_server() succeeds. Prints diagnostics as it goes.
    """
    # Timeout so the crawl cannot hang indefinitely on an unresponsive host.
    page = requests.get(crawl_url, timeout=10)
    if page.status_code != requests.codes.ok:
        print("wrong url")
        return
    bsobj = BeautifulSoup(page.text, 'html.parser')
    try:
        proxy_table = bsobj.find('div', {'class': 'table-container'})
        proxy_body = proxy_table.find('table').find('tbody')
        for tr in proxy_body.findAll('tr'):
            proxy_info = tr.findAll('td')
            # Guard against malformed rows (e.g. headers/ads) that lack
            # both an ip and a port cell — indexing them would raise
            # IndexError, which the AttributeError handler below misses.
            if len(proxy_info) < 2:
                continue
            ip = proxy_info[0]
            port = proxy_info[1]
            print(ip.get_text())
            print(port.get_text())
            proxy = ip.get_text()+':'+port.get_text()
            if test_proxy_server(proxy):
                proxy_list.append(proxy)
    except AttributeError as e:
        # A missing div/table/tbody makes one of the .find() results None.
        print('Tag not found-->'+str(e))
def test_proxy_server(proxy):
    """Return True if `proxy` ('ip:port') works and reports a mainland-China IP.

    Routes a GET to the 163.com mainland-IP check through the proxy; the
    endpoint answers the literal text 'true' for mainland IPs. Any request
    failure or non-OK status yields False.
    """
    proxies = {'http': proxy}
    try:
        # Only the request itself can raise — keep the try body minimal.
        response = requests.get(test_url, proxies=proxies, timeout=10)
    except requests.exceptions.RequestException as e:
        # RequestException is the base class of Timeout and ConnectionError,
        # so one clause covers every requests failure mode.
        print('[163 test]: some bad happened-->'+str(e))
        return False
    if response.status_code == requests.codes.ok:
        print(response.text)
        # The endpoint's body is the verdict itself.
        return response.text == 'true'
    print('[163 test]: return error code')
    return False
# Script entry point: crawl for proxies, then report every one that passed
# the mainland-China check.
if __name__ == '__main__':
    crawl_proxy_servers()
    print("final list:")
    print(proxy_list)