Features: skip value enlargement and more verbose sitemap download process
bohdanbobrowski committed Nov 16, 2024
1 parent 216059b commit 9c41d66
Showing 3 changed files with 17 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -159,6 +159,8 @@ Example:
- [X] image download and attachment bug fixed (e.g. modernistyczny-poznan.blogspot.com)
- [X] improved resistance to http errors
- [X] dedicated crawler class for zeissikonveb.de
- [X] (in GUI) skip value is increased by the limit value, if one is set (see the sketch below)
- [X] download progress is much more verbose; in the GUI it can also be cancelled at any time


[» Complete Change Log here «](https://github.com/bohdanbobrowski/blog2epub/blob/master/CHANGELOG.md)
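
The new skip behaviour boils down to simple arithmetic: after a completed download, the skip offset becomes the limit plus any previously accumulated skip. A minimal standalone sketch, assuming the same string-typed limit and skip settings used by the GUI code shown below (the helper name next_skip and the example values are hypothetical):

def next_skip(limit: str | None, skip: str | None) -> int | None:
    """Skip offset for the next run: the current limit plus any
    previously accumulated skip; None when no positive limit is set."""
    if limit and int(limit) > 0:
        new_skip = int(limit)
        if skip and int(skip) > 0:
            new_skip += int(skip)
        return new_skip
    return None

assert next_skip("50", None) == 50   # first run with limit 50: skip advances to 50
assert next_skip("50", "50") == 100  # second run continues from article 51

Consecutive limited runs therefore page through a blog without re-downloading the same articles.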
10 changes: 10 additions & 0 deletions blog2epub/blog2epub_gui.py
@@ -421,6 +421,15 @@ def console_clear(self):
    def console_delete_last_line(self):
        self.console.text = "\n".join(self.console.text.split("\n")[:-1])

    @mainthread
    def _update_skip_value(self):
        # Advance the skip offset by the limit (plus any previous skip),
        # so the next download continues where this one ended.
        if self.blog2epub_settings.data.limit and int(self.blog2epub_settings.data.limit) > 0:
            skip = int(self.blog2epub_settings.data.limit)
            if self.blog2epub_settings.data.skip and int(self.blog2epub_settings.data.skip) > 0:
                skip += int(self.blog2epub_settings.data.skip)
            self.skip_entry.text = str(skip)
            self.save_settings()

    def _get_url(self):
        if urllib.parse.urlparse(self.url_entry.text):
            port, self.url_entry.text = prepare_port_and_url(self.url_entry.text)
@@ -440,6 +449,7 @@ def _download_ebook(self, blog2epub: Blog2Epub):
        self._update_tab_generate()
        if not blog2epub.crawler.cancelled:
            self.interface.print("Download completed.")
            self._update_skip_value()
            if platform != "android":
                notification.notify(
                    title="blog2epub - download completed",
5 changes: 5 additions & 0 deletions blog2epub/crawlers/default.py
@@ -223,6 +223,7 @@ def _break_the_loop(self):
        return False

    def _get_sitemap_url(self) -> str:
        self.interface.print("Analysing sitemaps", end="")
        robots_parser = robotparser.RobotFileParser()
        robots_parser.set_url(urljoin(self.url, "/robots.txt"))
        robots_parser.read()
@@ -296,6 +297,8 @@ def _get_pages_urls(self, sitemap_url: str) -> list[str] | None:
        sitemap = requests.get(sitemap_url)
        pages = None
        if sitemap.status_code == 404:
            self.interface.print("")
            self.interface.print("Sitemap not found!")
            pages = self._get_pages_from_blog_archive_widget()
        if sitemap.status_code == 200:
            sitemap_pages = []
@@ -310,7 +313,9 @@
or re.search("wp-sitemap-posts-(post|page)-[0-9]+.xml$", sub_sitemap)
or re.search("(post|page)-sitemap[0-9-]*.xml$", sub_sitemap)
):
self.interface.print(".", end="")
pages += self._get_pages_from_sub_sitemap(sub_sitemap)
self.interface.print("")
if pages is not None:
self.interface.print(f"Found {len(pages)} articles to crawl.")
try:
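
The added print calls above follow a simple dot-progress idiom: a header printed without a trailing newline, one dot per processed sub-sitemap, and a final empty print that closes the line. A minimal sketch of the pattern using the built-in print (the function name and sample file names are hypothetical; the crawler routes the same calls through self.interface.print, which accepts the same end keyword):

def report_sitemap_progress(sub_sitemaps: list[str]) -> None:
    """Print 'Analysing sitemaps' and one dot per sub-sitemap on a
    single line, then terminate the line."""
    print("Analysing sitemaps", end="")
    for _ in sub_sitemaps:
        print(".", end="")  # one dot per sub-sitemap shows the crawl is alive
    print("")               # newline so the next message starts on a fresh line

report_sitemap_progress(["post-sitemap1.xml", "post-sitemap2.xml"])
# output: Analysing sitemaps..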
