-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetEbook.py
executable file
·140 lines (103 loc) · 3.75 KB
/
getEbook.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
from ebooklib import epub
import click
import re
def getPage(fictionUrl):
    """Download a web page and parse it into a BeautifulSoup tree.

    Args:
        fictionUrl: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` document built from the response body with the
        built-in ``html.parser``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {'User-Agent': "App-Scraper"}
    # ``headers`` has to be passed by keyword: the second positional parameter
    # of requests.get() is ``params``, which would put the User-Agent into the
    # query string instead of sending it as an HTTP header.
    # The timeout keeps a stalled connection from hanging the script forever.
    pageResponse = requests.get(fictionUrl, headers=headers, timeout=30)
    # Stop here with a clear HTTP error instead of parsing an error page and
    # failing later with an unrelated AttributeError.
    pageResponse.raise_for_status()
    return BeautifulSoup(pageResponse.content, 'html.parser')
# TODO: Make all the metadata one function. Dictionary?
def getTitle(soupPage):
    """Return the fiction title from a parsed fiction overview page.

    The title is the ``.string`` of the first ``<h1>`` element whose
    ``property`` attribute equals ``"name"``.
    """
    heading = soupPage.find('h1', property="name")
    return heading.string
def getAuthor(soupPage):
    """Return the author name from a parsed fiction overview page.

    Looks up the first ``<h4>`` element whose ``property`` attribute equals
    ``"author"`` and returns the ``.string`` of the first ``<a>`` inside it.
    """
    authorHeading = soupPage.find('h4', property="author")
    authorLink = authorHeading.find('a')
    return authorLink.string
def getCoverUrl(soupPage):
    """Return the URL of the cover image shown in the fiction header.

    The ``src`` of the first ``<img>`` inside the ``row fic-header`` div is
    used. A value starting with ``/`` is treated as site-relative and gets
    the Royal Road origin prepended; anything else is returned unchanged.
    """
    header = soupPage.find('div', class_="row fic-header")
    src = header.find('img').get('src')
    if not src.startswith("/"):
        return src
    return "https://www.royalroad.com" + src
def getChTitle(soupPage):
    """Return the chapter title: the ``.string`` of the page's first ``<h1>``."""
    firstHeading = soupPage.find('h1')
    return firstHeading.string
# TODO: Make this return html instead of \r (which it does for some reason?)
def getChContent(soupPage, stripDiv=False):
    """Return a chapter's body as an HTML string with sizing markup removed.

    The first ``<div class="chapter-content">`` element is serialised with
    ``str()`` and then cleaned up: ``width="..."`` attributes, the inline
    ``max-width: 100%; `` declaration and any resulting empty ``style=""``
    attribute are removed.

    Args:
        soupPage: Parsed chapter page.
        stripDiv: When true, also remove the wrapping ``<div ...>`` opening
            tag and the trailing ``</div>``, leaving only the inner HTML.

    Returns:
        The cleaned HTML as a ``str``.
    """
    chContent = str(soupPage.find('div', class_="chapter-content"))
    if stripDiv:
        # ``count`` is passed by keyword; passing it positionally is
        # deprecated as of Python 3.13.
        chContent = re.sub(r'\s*<.*?div[\w\W]*?>\s*', '', chContent, count=1)
        chContent = re.sub(r'\s*</div>\s*$', '', chContent)
    # Remove the attribute together with the whitespace *before* it only.
    # Consuming the whitespace on both sides would glue the neighbouring
    # tokens together (``<img width="1" height="2">`` -> ``<imgheight="2">``),
    # and requiring leading whitespace leaves ``data-width="..."`` intact.
    chContent = re.sub(r'\s+width="[^"]*"', '', chContent)
    chContent = chContent.replace('max-width: 100%; ', '')
    chContent = chContent.replace('style=""', '')
    return chContent
def getChLinks(soupPage):
    """Collect the absolute URL of every chapter listed on a fiction page.

    Every ``<a href=...>`` inside the element with ``id="chapters"`` is
    read. Each distinct href is kept once, in the order it first appears,
    and resolved against the Royal Road origin.

    Args:
        soupPage: Parsed fiction overview page.

    Returns:
        A list of absolute chapter URLs in table order, without repeats.
    """
    # Function-scope import so this block does not depend on a file-level one.
    from urllib.parse import urljoin

    chTable = soupPage.find(id="chapters")
    # dict.fromkeys() drops repeated hrefs while preserving order, so a
    # chapter that is linked more than once in the table is fetched once.
    chPaths = dict.fromkeys(
        link.get('href') for link in chTable.find_all('a', href=True)
    )
    # urljoin() prefixes site-relative paths and leaves hrefs that are
    # already absolute untouched, which plain concatenation would corrupt.
    return [urljoin("https://www.royalroad.com", path) for path in chPaths]
def createBook(outfile, title, author, chLinks):
    """Download every chapter and write them out as a single EPUB file.

    Args:
        outfile: Output file name without the ``.epub`` extension. When it is
            empty or ``None``, the title with spaces replaced by underscores
            is used instead.
        title: Book title stored in the EPUB metadata.
        author: Author name stored in the EPUB metadata.
        chLinks: Iterable of chapter URLs, in reading order.

    Raises:
        OSError: If ``cssFile.css`` cannot be read from the current working
            directory.
    """
    print('Creating Ebook...')

    # Load the stylesheet before any chapter is downloaded so that a missing
    # file fails immediately rather than after all the network work is done.
    cssPath = "style/nav.css"
    with open('cssFile.css', 'r', encoding="utf-8") as myfile:
        style = myfile.read()
    nav_css = epub.EpubItem(uid="style_nav", file_name=cssPath,
                            media_type="text/css", content=style)

    # create book and set metadata
    book = epub.EpubBook()
    book.set_title(title)
    book.set_language('en')
    book.add_author(author)

    # Chapter numbering (file names and progress output) starts at 1.
    for number, link in enumerate(chLinks, start=1):
        soupPage = getPage(link)  # TODO: some kind of input sanitization
        chTitle = getChTitle(soupPage)
        chContent = getChContent(soupPage)
        print(f'Getting Chapter {number}: {chTitle}')

        chapter = epub.EpubHtml(title=chTitle,
                                file_name=f"chap_{number}.xhtml")
        chapter.content = chContent
        # Adding the CSS file to the book only packages it; each chapter
        # must also reference it, otherwise the styles are never applied.
        chapter.add_link(href=cssPath, rel='stylesheet', type='text/css')
        book.add_item(chapter)
        book.toc.append(chapter)
        book.spine.append(chapter)

    # adds ncx and nav files
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # add CSS file
    book.add_item(nav_css)

    if not outfile:
        outfile = title.replace(' ', '_')
    epub.write_epub(f'{outfile}.epub', book, {})
    print(f'Created {outfile}.epub')
@click.command()
@click.argument('fiction_id')
@click.option(
    '-o', '--outfile',
    help='Name of file to output to. Do not include extension.',
)
def main(fiction_id, outfile):
    # Command-line entry point: fetch the overview page of the fiction with
    # the given id, read its title, author and chapter links, and build the
    # EPUB from them.
    # NOTE: documented with comments instead of a docstring on purpose;
    # click would display a docstring as the command's --help text.
    overviewPage = getPage("https://www.royalroad.com/fiction/" + fiction_id)
    # TODO: the cover image is not embedded yet; getCoverUrl(overviewPage)
    # can supply its URL once that is implemented.
    createBook(
        outfile,
        getTitle(overviewPage),
        getAuthor(overviewPage),
        getChLinks(overviewPage),
    )
# Run the CLI only when this file is executed as a script, so that importing
# the module (e.g. to reuse the scraping helpers) does not start a download.
if __name__ == "__main__":
    main()