Skip to content

Commit

Permalink
Update and rename cool18/processing_file.py to cool18  酷18/processing…
Browse files Browse the repository at this point in the history
…_file.py

确保脚本路径存在“禁忌书屋”源码的文件夹
  • Loading branch information
allrobot authored Mar 11, 2023
1 parent 22da41c commit 6e0d659
Showing 1 changed file with 42 additions and 24 deletions.
66 changes: 42 additions & 24 deletions cool18/processing_file.py → cool18  酷18/processing_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@
sensitive_words = f'({f.read()})'.replace('\n', '|').replace('||', '')
sensitive_words = re.sub('\|\|$', '', sensitive_words)

# 含有敏感词,返回False
async def check_sensitive(input):
if re.search(f'(.*?){sensitive_words}(.*?)(作者|送达者|$)', input):
return False
else:
return True

# 去除非法字符
async def del_illegal_words(string):
Expand All @@ -37,33 +43,29 @@ async def del_illegal_words(string):

# 正文去掉网页元素
async def remove_html(text):
text = re.sub(r'<font(.*?)font>', '', text)
text = re.sub(r'<font([^\t]*)font>', '', text)
# text = re.sub(r'<center(.*?)center>', '', text)
text = re.sub(r'<br/>', '\n', text)
text = re.sub(r'<p>', '\n', text)
text = re.sub(r'<p(.*?)>', '\n', text)
text = re.sub(r'</p>', '\n', text)
text = re.sub(r'<b>(.*?)</b>', '\n', text)
text = text.replace("<pre>", '')
text = text.replace("<i>", '')
text = text.replace("</pre>", '')
return text


# 含有敏感词,返回False
async def check_sensitive(input):
if re.search(f'(.*?){sensitive_words}(.*?)(作者|送达者|$)', input):
return False
else:
return True


# 返回文件副本名
async def copy_name(filename):
num = 0
while os.path.exists(filename):
copy_num = re.search('(.*?)((.*?)).txt', filename)
copy_num = re.search('(.*?) - 副本((.*?)).txt', filename)
num += 1
if copy_num:
if copy_num.group(2).isnumeric():
num=int(copy_num.group(2))+1
filename = copy_num.group(1) + ' - 副本(' + str(num) + ').txt'
else:
filename = filename[:-4] + ' - 副本(' + str(num) + ').txt'
Expand All @@ -87,7 +89,7 @@ async def write_file(filename, content):
if not similarity_ratio >= 95:
newname = await copy_name(filename)
# await loop.run_in_executor(None, os.rename, filename, newname)
async with aiofiles.open(newname, 'w', errors='ignore') as file:
async with aiofiles.open(newname, 'w', encoding='utf-8', errors='ignore') as file:
await file.write(content)
return True
else:
Expand Down Expand Up @@ -118,10 +120,10 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
link = Currect_tid
tid_num=int(re.search('=\d+',link).group(0)[1:])
first_pre_article_content = await remove_html(str(content[0])) + '\n'
article_name = html.select('td.show_content>center>font')[0].text
article_name = await del_illegal_words(html.select('td.show_content>center>font')[0].text)
# 检测文章名是否包含敏感词
if await check_sensitive(article_name):
article_filename = new_dir_name+'\\'+(await del_illegal_words(article_name)) + '.txt'
article_filename = new_dir_name+'\\'+article_name + '.txt'
valid = False
if len(first_pre_article_content) > 0:
valid = True
Expand All @@ -130,7 +132,7 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
for li in follow_up: # 遍历回帖,记录较多字符的回帖
new_a = li.select_one('a')
# 检查文章大小
if int(byte.search(new_a.next_sibling).group(0)) > 10000:
if int(byte.search(new_a.next_sibling).group(1)) > 10000:
valid = True
if await check_sensitive(str(new_a.text)):
new_ = []
Expand All @@ -142,7 +144,7 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
except:
pass
else:
if len(first_pre_article_content) < 1500: # 字数少,跳过
if len(first_pre_article_content) < 2000: # 字数少,跳过
valid = False

# 该页面字数较多,存为txt文件
Expand All @@ -153,9 +155,9 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
if follow_up_links:
follow_up_links = follow_up_links[::-1]
# 重命名文章名,整合文章的小说名总不能带(1)之类的字
bracket_exist = del_bracket.search(article_filename)
if bracket_exist:
article_filename = bracket_exist.group(1) + bracket_exist.group(5) + '.txt'
# bracket_exist = del_bracket.search(article_filename)
# if bracket_exist:
# article_filename = bracket_exist.group(1) + bracket_exist.group(5) + '.txt'
# 打开回帖,将文章内容加到列表中
for article_link in follow_up_links:
follow_up_file=str(re.search('\d+',article_link[1]).group(0))+'.html'
Expand Down Expand Up @@ -188,16 +190,14 @@ async def processing_data(html, new_dir_name,byte,del_bracket,index,Currect_tid)
file = await write_file(article_filename, text)

if not file:
print(' '*100,end='\r',flush=True)
print(
f'【{str(datetime.datetime.now())[:16]}】:tid={tid_num} 保存过了 '
f'无需保存 {article_name}',end='\r',flush=True
f'无需保存 {article_name}'
)
else:
print(' '*100,end='\r',flush=True)
print(
f'【{str(datetime.datetime.now())[:16]}】:tid={tid_num} 已保存 '
f'已保存 {article_name}',end='\r',flush=True
f' {article_name}'
)
# os.exit(0)
except:
Expand Down Expand Up @@ -238,7 +238,7 @@ async def consumer(file_name,new_dir_name,semaphore,byte,del_bracket,index):


async def consumers(concurrency_num: int,new_dir_name):
byte = re.compile('\d+')
byte = re.compile('\((\d+) bytes\)')
del_bracket = re.compile('(.*?)(\(|()(.*?)(\)|))(.*?)$')
index = 'https://www.cool18.com/bbs4/'

Expand All @@ -259,8 +259,8 @@ async def consumers(concurrency_num: int,new_dir_name):


async def main():
# 限制并行访问量
concurrency_num = 3000
# 限制并行访问量为100
concurrency_num = 1000

new_dir_name=f'禁忌书屋小说 {str(datetime.datetime.now())[:10]}'
if not os.path.exists(new_dir_name):
Expand All @@ -275,6 +275,24 @@ async def main():
print(f'【{str(datetime.datetime.now())[:16]}】:目录已创建')
await consumers(concurrency_num,new_dir_name)

async def main1():
# 限制并行访问量为100
concurrency_num = 1000

new_dir_name=f'禁忌书屋小说 {str(datetime.datetime.now())[:10]}'
if not os.path.exists(new_dir_name):
os.mkdir(new_dir_name)
target_path='禁忌书屋'
temp_path=r'C:\Users\li\PycharmProjects\禁忌书屋'
temp_new_dir=r'C:\Users\li\Downloads\禁忌书屋'
if os.path.exists(temp_path):
os.chdir(temp_path)
else:
print('禁忌书屋目录并不存在!请确保你已完整保存禁忌书屋的全部帖子')
sys.exit()

print(f'【{str(datetime.datetime.now())[:16]}】:目录已创建')
await consumers(concurrency_num,temp_new_dir)


if __name__ == '__main__':
Expand Down

0 comments on commit 6e0d659

Please sign in to comment.