From a5e6da1331371e7f429912c59958435a8b992689 Mon Sep 17 00:00:00 2001 From: Bohdan Bobrowski Date: Wed, 13 Nov 2024 21:07:35 +0100 Subject: [PATCH] Download canceling improvement --- blog2epub/crawlers/article_factory/abstract.py | 6 +++++- blog2epub/crawlers/article_factory/default.py | 3 +++ blog2epub/crawlers/default.py | 1 + blog2epub/crawlers/zeissikonveb.py | 3 ++- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/blog2epub/crawlers/article_factory/abstract.py b/blog2epub/crawlers/article_factory/abstract.py index a397a6c..92eb20c 100644 --- a/blog2epub/crawlers/article_factory/abstract.py +++ b/blog2epub/crawlers/article_factory/abstract.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Optional +from typing import Optional, Callable from lxml.html.soupparser import fromstring @@ -19,6 +19,8 @@ def __init__( dirs: DirModel, language: str, downloader: Downloader, + cancelled: bool = False, + download_callback: Optional[Callable] = None, ): self.url = url self.html: bytes = html_content @@ -33,6 +35,8 @@ def __init__( self.tree = fromstring("
") self.images_list: list[ImageModel] = [] self.comments = "" # TODO: should be a list in the future + self.cancelled: bool = cancelled + self.download_callback = download_callback @abstractmethod def process(self) -> ArticleModel: diff --git a/blog2epub/crawlers/article_factory/default.py b/blog2epub/crawlers/article_factory/default.py index 74214ea..b720df3 100644 --- a/blog2epub/crawlers/article_factory/default.py +++ b/blog2epub/crawlers/article_factory/default.py @@ -79,6 +79,9 @@ def get_images(self) -> list[ImageModel]: except IndexError: image_description = "" image_obj = ImageModel(url=image_url, description=image_description) + if self.download_callback: + if self.download_callback(): + break if self.downloader.download_image(image_obj): self.images_list.append(image_obj) self.interface.print(".", end="") diff --git a/blog2epub/crawlers/default.py b/blog2epub/crawlers/default.py index caeeafb..4de798c 100644 --- a/blog2epub/crawlers/default.py +++ b/blog2epub/crawlers/default.py @@ -269,6 +269,7 @@ def crawl(self): dirs=self.dirs, language=self.language, downloader=self.downloader, + download_callback=self._break_the_loop ) art = art_factory.process() self.images = self.images + art.images diff --git a/blog2epub/crawlers/zeissikonveb.py b/blog2epub/crawlers/zeissikonveb.py index 2553198..2dc1069 100644 --- a/blog2epub/crawlers/zeissikonveb.py +++ b/blog2epub/crawlers/zeissikonveb.py @@ -31,11 +31,12 @@ def __init__(self, **kwargs): self.patterns.content = [Pattern(xpath='//div[@id="Section1"]')] self.patterns.content_cleanup = [ Pattern(xpath='//div[@data-kind="MENU"]'), + Pattern(regex=r"[\s]*\!important"), Pattern(regex=r"background-color:rgba(255,255,255,1);color:rgba(156,156,156,1);"), Pattern(regex=r"font-size:[\s0-9]+px\;"), Pattern(regex=r"line-height:[\s0-9]+\;"), Pattern(regex=r"font-family:Arial, Helvetica, sans-serif"), - Pattern(regex=r"

"), + Pattern(regex=r""), ] self.patterns.date = [ Pattern(regex=r"letzte Ă„nderung[\s:]*([0-9]{1,2})[\.\s]*([A-Za-z]+)[\.\s]*([0-9]{4})"),