wuxia_dl.py
#!/usr/bin/env python
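"""Download a novel's chapters from wuxiaworld.com and bind them into a PDF.

Usage:
    python wuxia_dl.py [novel-slug]

Defaults to 'the-second-coming-of-gluttony' when no slug is given.
"""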
import bs4
import sys
from dataclasses import dataclass
import cloudscraper
import pdfkit
import os
import time
import subprocess
import datetime

#-------- constants --------
URL_CHAPTER = "https://www.wuxiaworld.com{chapPath}"
URL = "https://www.wuxiaworld.com/novel/{novelName}"
POLITENESS_FACTOR = 0.2  # delay between downloads, in seconds
#-------- /constants --------


@dataclass
class Chapter:
    """Class for storing chapters and links"""
    index: int
    name: str
    url: str


def clean_chapter_name(chapName):
    """
    Replace colons and other troublesome characters.
    (Currently unused helper.)
    """
    return chapName.replace(':', '_')


def get_chapter_content(chapter_html):
    """Return the nodes that make up the chapter body: everything inside
    #chapter-content between the leading <script> tag and the trailing
    navigation anchor."""
    soup = bs4.BeautifulSoup(chapter_html, 'html.parser')
    div = soup.find("div", {"id": "chapter-content"})
    node = div.find("script")
    content = []
    while True:
        node = node.next_sibling
        # Stop at the trailing navigation link (or the end of the div).
        if node is None or node.name == 'a':
            break
        content.append(node)
    return content


def make_pdf(chapter_list, name, scraper):
    # Dump wkhtmltopdf's default ToC stylesheet once, so pdfkit can
    # build a table of contents from the <h2> chapter headings.
    if not os.path.exists("default_toc.xsl"):
        with open('default_toc.xsl', 'w') as outfile:
            subprocess.call(['wkhtmltopdf', '--dump-default-toc-xsl'], stdout=outfile)
    toc = {
        'xsl-style-sheet': 'default_toc.xsl'
    }

    body = ''
    for index, chapter in enumerate(chapter_list):
        print(f"Processing chapter {index + 1}...")
        resp = scraper.get(URL_CHAPTER.format(chapPath=chapter.url))
        content = [str(c) for c in get_chapter_content(resp.text)]

        # Add this chapter to the body; every chapter after the first
        # starts on a fresh page.
        if index == 0:
            body += f'<h2>{chapter.name}</h2>' + "".join(content)
        else:
            body += f'<h2 style="page-break-before: always;">{chapter.name}</h2>' + "".join(content)
        time.sleep(POLITENESS_FACTOR)

    options = {
        'margin-bottom': '20mm',
        'footer-center': '[page]'
    }
    output = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
    </head>
    <body>
        {body}
        <p style="page-break-before: always;">
            <b>Extra metadata</b>
            <p>Downloaded from: <a href="{url}">{name}</a></p>
            <p>Created on: {date}</p>
            <p>Created using: <a href="https://github.com/sanskarchand/useful_scripts">wuxia_dl</a></p>
        </p>
    </body>
    </html>""".format(body=body,
                      date=str(datetime.datetime.now()),
                      name=name,
                      url=URL.format(novelName=name))
    pdfkit.from_string(output, f"{name}.pdf", toc=toc, options=options)


def extract_chapters(html_page):
    chapter_list = []
    soup = bs4.BeautifulSoup(html_page, 'html.parser')
    chapterItems = soup.find_all("li", attrs={"class": "chapter-item"})
    chapterAnchors = [chap.find('a') for chap in chapterItems]
    for idx, chap in enumerate(chapterAnchors):
        chapterName = chap.text.replace('\n', '')
        chapterURL = chap['href']
        chapter_list.append(Chapter(idx, chapterName, chapterURL))
    return chapter_list


def main():
    if len(sys.argv) == 1:
        novelName = 'the-second-coming-of-gluttony'
    else:
        novelName = sys.argv[1]

    scraper = cloudscraper.create_scraper()
    r = scraper.get(URL.format(novelName=novelName))
    if r.status_code != 200:
        print(f"Error: Could not get main page (Status {r.status_code})")
        sys.exit(1)

    chapter_list = extract_chapters(r.text)
    make_pdf(chapter_list, novelName, scraper)


if __name__ == '__main__':
    main()
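
# Note: besides the Python packages imported above (cloudscraper,
# beautifulsoup4, pdfkit), the wkhtmltopdf binary must be on PATH:
# it is invoked directly in make_pdf() to dump the default ToC
# stylesheet, and pdfkit uses it to render the final PDF.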