
Commit

Add files via upload
allrobot authored Mar 10, 2023
1 parent c7f6a10 commit 22da41c
Showing 2 changed files with 423 additions and 0 deletions.
141 changes: 141 additions & 0 deletions cool18/collect_all_html.py
@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
import asyncio
import datetime
import os
import random
import re
import sys
import traceback

import aiofiles
import aiohttp
from bs4 import BeautifulSoup

import processing_file
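
# Crawl cool18.com (禁忌书屋) bbs4 thread pages by tid and save each thread's
# raw HTML as <tid>.html; the companion processing_file module receives the
# output directory name through its dir_name attribute (see main() below).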


# Fetch a page and return [encoding, body]; after too many failures, [url, None]
async def fetch(session, url, contype, proxy=''):
    # Retry on errors; give up once more than 1000 attempts have failed
    error_counter = 0
    while True:
        error_counter += 1
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0)'
                              ' Gecko/20100101 Firefox/106.0',
            }
            # Send the User-Agent as a request header, not a query-string parameter
            async with session.get(url, headers=headers, proxy=proxy) as response:
                # print(f'【{str(datetime.datetime.now())[:16]}】:visiting {url}')
                # Pull the charset out of the Content-Type header, if any
                encoding = contype.search(response.headers.get('Content-Type', ''))
                encoding = encoding.group(1) if encoding else None
                return [encoding, await response.text()]
        # Connection errors
        except (aiohttp.ClientConnectorError, aiohttp.ServerTimeoutError, ConnectionResetError):
            if error_counter > 1000:
                message = f'【{str(datetime.datetime.now())[:16]}】:fetch() function\n' \
                          f'【{str(datetime.datetime.now())[:16]}】:unable to connect to {url}\n'
                async with aiofiles.open('../错误日志.txt', 'a+') as file:
                    await file.write(message)
                print('*' * 50)
                print(message)
                print('*' * 50)

                return [url, None]
            else:
                print('*' * 50)
                print(f'【{str(datetime.datetime.now())[:16]}】:{url} timed out, retrying...', end='\r', flush=True)
                print('*' * 50)
        # Any other error
        except Exception:
            if error_counter > 1000:
                message = f'【{str(datetime.datetime.now())[:16]}】:fetch() function\n' \
                          f'【{str(datetime.datetime.now())[:16]}】:unable to connect to {url}\n' \
                          f'【{str(datetime.datetime.now())[:16]}】:{traceback.format_exc()}\n'
                async with aiofiles.open('../错误日志.txt', 'a+') as file:
                    await file.write(message)
                print('*' * 50)
                print(message)
                print('*' * 50)
                return [url, None]
        # Back off for 5-19 seconds before the next attempt
        await asyncio.sleep(random.randrange(5, 20))
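
# A typical call, inside an open aiohttp session:
#   encoding, body = await fetch(session, url, contype, proxy=proxy)
# `body` is None once fetch() has given up on the URL.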

# Producer: fetch one thread page under the semaphore and save it as <tid>.html
# (the queue hand-off to consumer coroutines is kept below, commented out)
async def producer(session, url, semaphore, end_num, contype, proxy=''):
    tid = end_num.search(url).group(0)[4:]
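    # e.g. a url ending in 'tid=13586499' gives tid '13586499',
    # which becomes the file name 13586499.html below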
    # The semaphore caps how many producers are fetching at any one time
    async with semaphore:
        try:
            html = await asyncio.wait_for(fetch(session, url, contype, proxy), 12000.0)
            print(f'【{str(datetime.datetime.now())[:16]}】:visited {url}', end='\r', flush=True)
            # Skip failed fetches, and pages whose <head> carries a <script>
            # (those appear to be interstitial pages rather than thread content)
            if html[1] and not BeautifulSoup(html[1], 'lxml').select_one('head>script'):
                async with aiofiles.open(f'{tid}.html', 'w', encoding=html[0]) as f:
                    await f.write(html[1])
                print(f'【{str(datetime.datetime.now())[:16]}】:saved {url}', end='\r', flush=True)

            # await queue.put(html)
        except asyncio.TimeoutError:
            message = f'【{str(datetime.datetime.now())[:16]}】:producer() function\n' \
                      f'【{str(datetime.datetime.now())[:16]}】:no successful connection before the timeout; bad link\n'
            async with aiofiles.open('../错误日志.txt', 'a+') as file:
                await file.write(message)
            print(message)




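# producers(): find the newest thread id on the forum front page, then spawn
# one producer per tid in the range [index, newest_max_num].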
async def producers(concurrency_num: int):
    # Regex for the thread number (tid) in a URL
    end_num = re.compile(r'tid=\d+')
    # Regex for the page charset in the Content-Type header
    contype = re.compile(r'charset=([\w-]+)')

    # Local proxy server
    proxy = 'http://127.0.0.1:1162'
    # Forum home page (禁忌书屋)
    index_url = 'https://www.cool18.com/bbs4/'
    # file_queue = asyncio.Queue(maxsize=2000)
    print(f'【{str(datetime.datetime.now())[:16]}】:concurrency capped at {concurrency_num}')
    async with aiohttp.ClientSession() as session:
        # First tid to crawl; everything from here up to the newest thread is fetched
        index = 13586499
        num = 0
        semaphore = asyncio.Semaphore(concurrency_num)
        # Only parse the index page if the fetch actually returned a body
        index_page = await fetch(session, index_url, contype, proxy=proxy)
        if index_page[1]:
            # The newest thread link on the front page gives the upper tid bound
            newest_max_num = BeautifulSoup(index_page[1], 'lxml') \
                .select_one('#d_list > ul > li > a:nth-child(1)>font[color="grey"]') \
                .parent['href']
            newest_max_num = int(re.search(r'tid=(.*?)$', newest_max_num).group(1).strip())
        else:
            print(f'【{str(datetime.datetime.now())[:16]}】:could not reach the index page; check the network and proxy')
            sys.exit()
        # lock = [asyncio.Lock() for x in range(concurrency_num*5)]
        # consumer_task = [asyncio.create_task(consumer(file_queue, lock[_], session, proxy=proxy)) for _ in range(concurrency_num*5)]
        # consumer_task = asyncio.create_task(consumer(file_queue, session, proxy=proxy))
        tasks = []
        # Build one producer task per thread id
        for i in range(index, newest_max_num + 1):
            print(f'Building the URL list... added page {num}', end='\r', flush=True)
            num += 1
            url = f'https://www.cool18.com/bbs4/index.php?app=forum&act=threadview&tid={i}'
            tasks.append(producer(session, url, semaphore, end_num, contype, proxy))
        print(f'【{str(datetime.datetime.now())[:16]}】:crawler started')
        # gather() runs every task; the semaphore is what actually caps concurrency
        await asyncio.gather(*tasks)
        # await file_queue.put(None)
        # await file_queue.join()
        # [x.cancel() for x in consumer_task]
        # await asyncio.gather(*consumer_task, return_exceptions=True)


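# main(): create the output directory, point processing_file at it, and crawl.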
async def main():
    # Cap on the number of concurrent requests
    concurrency_num = 1000

    dir_name = '禁忌书屋'
    processing_file.dir_name = dir_name
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    os.chdir(dir_name)
    print(f'【{str(datetime.datetime.now())[:16]}】:directory {dir_name} is ready')
    await producers(concurrency_num)


if __name__ == '__main__':
    asyncio.run(main())