#!/usr/bin/env python
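"""
Download every chapter of a webnovelpub.com novel and compile them into a
single PDF (via pdfkit, which requires the wkhtmltopdf binary on PATH).

Usage:
    python webnovelpub_dl.py <novel-slug>

If no slug is given, a built-in default slug is used.
"""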
import requests
import bs4
import sys
from dataclasses import dataclass
import pdfkit
import os
import time
import subprocess
import datetime

# -------- constants --------
URL_BASE = "https://www.webnovelpub.com{rest}"
URL_MAIN = "https://www.webnovelpub.com/novel/{slug}/chapters/page-{page_num}"
POLITENESS_FACTOR = 0.2  # delay between downloads, in seconds
# -------- /constants --------

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux i686; rv:109.0) Gecko/20100101 Firefox/109.0"
}
@dataclass
class Chapter:
    """Class for storing chapters and links"""
    index: int
    name: str
    url: str
def clean_chapter_name(chap_name):
    """Replace colons and other characters that are troublesome in filenames."""
    return chap_name.replace(':', '_')
def get_chapter_content(chapter_html):
    """Extract the chapter's paragraph tags from the chapter page's HTML."""
    soup = bs4.BeautifulSoup(chapter_html, 'html.parser')
    div = soup.find("div", {"id": "chapter-container"})
    return div.find_all("p")
def make_pdf(chapter_list, slug):
    # Generate the default table-of-contents stylesheet if it is not cached yet.
    if not os.path.exists("default_toc.xsl"):
        with open('default_toc.xsl', 'w') as outfile:
            subprocess.call(['wkhtmltopdf', '--dump-default-toc-xsl'], stdout=outfile)

    toc = {
        'xsl-style-sheet': 'default_toc.xsl'
    }

    body = ''
    for index, chapter in enumerate(chapter_list):
        print(f"Processing chapter {index + 1}...")
        resp = requests.get(URL_BASE.format(rest=chapter.url), headers=headers)
        content = [str(c) for c in get_chapter_content(resp.text)]
        # Add this chapter to the body; every chapter after the first starts on a new page.
        if index == 0:
            body += f'<h2>{chapter.name}</h2>' + "".join(content)
        else:
            body += f'<h2 style="page-break-before: always;">{chapter.name}</h2>' + "".join(content)
        time.sleep(POLITENESS_FACTOR)

    options = {
        'margin-bottom': '20mm',
        'footer-center': '[page]'
    }

    output = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
    </head>
    <body>
    {body}
    <div style="page-break-before: always;">
        <b>Extra metadata</b>
        <p>Downloaded from: webnovelpub</p>
        <p>Created on: {date}</p>
        <p>Created using: <a href="https://github.com/sanskarchand/useful_scripts">webnovel</a></p>
    </div>
    </body>
    </html>""".format(body=body, date=str(datetime.datetime.now()))

    pdfkit.from_string(output, f"{slug}.pdf", toc=toc, options=options)
def extract_chapters(html_page, slug=None):
    """Walk every page of the chapter list and collect Chapter entries."""
    chapter_list = []
    soup = bs4.BeautifulSoup(html_page, 'html.parser')

    # The "skip to last" pagination link encodes the total number of pages
    # at the end of its href (".../page-N").
    numPagesCont = soup.find("li", {"class": "PagedList-skipToLast"})
    lastPageURL = numPagesCont.find("a")["href"]
    ind = lastPageURL.rfind("-")
    totalNumPages = int(lastPageURL[ind + 1:])

    for page_num in range(1, totalNumPages + 1):  # inclusive of the last page
        # Page 1 was already fetched by the caller; fetch the rest here.
        if page_num != 1:
            r = requests.get(URL_MAIN.format(slug=slug, page_num=page_num), headers=headers)
            if r.status_code != 200:
                print(f"Error: Could not get chapter list page {page_num} (Status {r.status_code}). Aborting...")
                sys.exit(1)
            soup = bs4.BeautifulSoup(r.text, 'html.parser')

        chapterListCurrent = soup.find_all("li", attrs={"data-chapterno": True})
        chapterAnchors = [chap.find('a') for chap in chapterListCurrent]
        for idx, chap in enumerate(chapterAnchors):
            chapter_list.append(Chapter(idx, chap["title"], chap['href']))
        time.sleep(POLITENESS_FACTOR)

    return chapter_list
def main():
    if len(sys.argv) == 1:
        slug = "the-regressed-demon-lord-is-kind-04022146"
    else:
        slug = sys.argv[1]

    print("Getting chapter list...")
    r = requests.get(URL_MAIN.format(slug=slug, page_num=1), headers=headers)
    if r.status_code != 200:
        print(f"Error: Could not get main page (Status {r.status_code})")
        sys.exit(1)

    chapter_list = extract_chapters(r.text, slug)
    print(f"There are {len(chapter_list)} chapters")
    make_pdf(chapter_list, slug)


if __name__ == '__main__':
    main()
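# Example invocation (the slug here is the script's built-in default):
#   python webnovelpub_dl.py the-regressed-demon-lord-is-kind-04022146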