Skip to content

Commit

Permalink
Download canceling improvement
Browse files: browse the repository at this point in the history
  • Loading branch information
bohdanbobrowski committed Nov 13, 2024
1 parent 4743371 commit a5e6da1
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 2 deletions.
6 changes: 5 additions & 1 deletion blog2epub/crawlers/article_factory/abstract.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Optional
from typing import Optional, Callable

from lxml.html.soupparser import fromstring

Expand All @@ -19,6 +19,8 @@ def __init__(
dirs: DirModel,
language: str,
downloader: Downloader,
cancelled: bool = False,
download_callback: Optional[Callable] = None,
):
self.url = url
self.html: bytes = html_content
Expand All @@ -33,6 +35,8 @@ def __init__(
self.tree = fromstring("<div></div>")
self.images_list: list[ImageModel] = []
self.comments = "" # TODO: should be a list in the future
self.cancelled: bool = cancelled
self.download_callback = download_callback

@abstractmethod
def process(self) -> ArticleModel:
Expand Down
3 changes: 3 additions & 0 deletions blog2epub/crawlers/article_factory/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ def get_images(self) -> list[ImageModel]:
except IndexError:
image_description = ""
image_obj = ImageModel(url=image_url, description=image_description)
if self.download_callback:
if self.download_callback():
break
if self.downloader.download_image(image_obj):
self.images_list.append(image_obj)
self.interface.print(".", end="")
Expand Down
1 change: 1 addition & 0 deletions blog2epub/crawlers/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ def crawl(self):
dirs=self.dirs,
language=self.language,
downloader=self.downloader,
download_callback=self._break_the_loop
)
art = art_factory.process()
self.images = self.images + art.images
Expand Down
3 changes: 2 additions & 1 deletion blog2epub/crawlers/zeissikonveb.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,12 @@ def __init__(self, **kwargs):
self.patterns.content = [Pattern(xpath='//div[@id="Section1"]')]
self.patterns.content_cleanup = [
Pattern(xpath='//div[@data-kind="MENU"]'),
Pattern(regex=r"[\s]*\!important"),
Pattern(regex=r"background-color:rgba(255,255,255,1);color:rgba(156,156,156,1);"),
Pattern(regex=r"font-size:[\s0-9]+px\;"),
Pattern(regex=r"line-height:[\s0-9]+\;"),
Pattern(regex=r"font-family:Arial, Helvetica, sans-serif"),
Pattern(regex=r"<p/>"),
Pattern(regex=r"<p\/>"),
]
self.patterns.date = [
Pattern(regex=r"letzte Änderung[\s:]*([0-9]{1,2})[\.\s]*([A-Za-z]+)[\.\s]*([0-9]{4})"),
Expand Down

0 comments on commit a5e6da1

Please sign in to comment.