Skip to content

Commit

Permalink
Merge pull request #49 from xtmu/dev
Browse files — browse the repository at this point in the history
feat: add title_mode option
  • Loading branch information
p0n1 authored Jun 13, 2024
2 parents 767f040 + 9858a58 commit d9f70c2
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 8 deletions.
30 changes: 22 additions & 8 deletions audiobook_generator/book_parsers/epub_book_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,6 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
content = item.get_content()
soup = BeautifulSoup(content, "lxml")
title = ""
title_levels = ['title', 'h1', 'h2', 'h3']
for level in title_levels:
if soup.find(level):
title = soup.find(level).text
break
raw = soup.get_text(strip=False)
logger.debug(f"Raw text: <{raw[:]}>")

Expand All @@ -71,9 +65,29 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
cleaned_text = re.sub(r'(?<=[a-zA-Z.,!?;”")])\d+', "", cleaned_text)
logger.debug(f"Cleaned text step 4: <{cleaned_text[:100]}>")

# fill in the title if it's missing
if title == "":
# Get proper chapter title
if self.config.title_mode == "auto":
title = ""
title_levels = ['title', 'h1', 'h2', 'h3']
for level in title_levels:
if soup.find(level):
title = soup.find(level).text
break
if title == "" or re.match(r'^\d{1,3}$',title) is not None:
title = cleaned_text[:60]
elif self.config.title_mode == "tag_text":
title = ""
title_levels = ['title', 'h1', 'h2', 'h3']
for level in title_levels:
if soup.find(level):
title = soup.find(level).text
break
if title == "":
title = "<blank>"
elif self.config.title_mode == "first_few":
title = cleaned_text[:60]
else:
raise ValueError("Unsupported title_mode")
logger.debug(f"Raw title: <{title}>")
title = self._sanitize_title(title, break_string)
logger.debug(f"Sanitized title: <{title}>")
Expand Down
1 change: 1 addition & 0 deletions audiobook_generator/config/general_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def __init__(self, args):
self.output_text = args.output_text
self.log = args.log
self.no_prompt = args.no_prompt
self.title_mode = args.title_mode

# Book parser specific arguments
self.newline_mode = args.newline_mode
Expand Down
6 changes: 6 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ def handle_args():
default="double",
help="Choose the mode of detecting new paragraphs: 'single' or 'double'. 'single' means a single newline character, while 'double' means two consecutive newline characters. (default: double, works for most ebooks but will detect less paragraphs for some ebooks)",
)
parser.add_argument(
"--title_mode",
choices=["auto", "tag_text", "first_few"],
default="auto",
help="Choose the parse mode for chapter title, 'tag_text' search 'title','h1','h2','h3' tag for title, 'first_few' set first 60 characters as title, 'auto' auto apply the best mode for current chapter.",
)
parser.add_argument(
"--chapter_start",
default=1,
Expand Down

0 comments on commit d9f70c2

Please sign in to comment.