Fixing image attachment and date parsing on Russian blogs
bohdanbobrowski committed Nov 13, 2024
1 parent 2162274 commit 61d81f9
Showing 10 changed files with 79 additions and 80 deletions.
6 changes: 4 additions & 2 deletions blog2epub/common/book.py
@@ -310,9 +310,11 @@ def __init__(self, article: ArticleModel, number: int, language: str):
lang=language, # type: ignore
) # type: ignore
tags = self._print_tags(article)
art_date = ""
art_date = "<p>"
if article.date is not None:
art_date = article.date.strftime("%d %B %Y, %H:%M")
art_date += f"<i>Created: {article.date.strftime("%d %B %Y, %H:%M")}</i><br/>"
art_date += f"<i>Accessed: {article.accessed.strftime("%d %B %Y, %H:%M")}</i>"
art_date += "</p>"
self.epub.content = (
f"<h2>{article.title}</h2>{tags}{art_date}" + f'<p><i><a href="{article.url}">{article.url}</a></i></p>'
)
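For reference, when both timestamps are set, the hunk above renders the chapter date block roughly as follows; the datetime values here are made up purely for illustration and are not from the commit.

from datetime import datetime

# Hypothetical sample values, only to show the markup produced by the hunk above.
created = datetime(2024, 11, 13, 9, 30)
accessed = datetime(2024, 11, 13, 10, 15)

art_date = "<p>"
art_date += f"<i>Created: {created.strftime('%d %B %Y, %H:%M')}</i><br/>"
art_date += f"<i>Accessed: {accessed.strftime('%d %B %Y, %H:%M')}</i>"
art_date += "</p>"
print(art_date)
# <p><i>Created: 13 November 2024, 09:30</i><br/><i>Accessed: 13 November 2024, 10:15</i></p>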
12 changes: 5 additions & 7 deletions blog2epub/common/language_tools.py
@@ -4,13 +4,8 @@
def translate_month(date: str, language: str) -> str:
date = date.lower()
if language == "pl":
date = date.replace("poniedziałek", "")
date = date.replace("wtorek", "")
date = date.replace("środa", "")
date = date.replace("czwartek", "")
date = date.replace("piątek", "")
date = date.replace("sobota", "")
date = date.replace("niedziela", "")
for dn in ["poniedziałek", "wtorek", "środa", "czwartek", "piątek", "sobota", "niedziela"]:
date = date.replace(dn, "")
date = date.strip(",")
date = date.strip()
replace_dict = {
@@ -34,6 +29,9 @@ def translate_month(date: str, language: str) -> str:
for key, val in replace_dict_short.items():
date = date.replace(key, val)
if language == "ru":
for dn in ["понедельник", "вторник", "среда", "четверг", "пятница", "суббота", "воскресенье"]:
date = date.replace(dn, "")
date = date.strip(",")
replace_dict = {
"января": "january",
"февраля": "february",
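For context, the Russian branch now mirrors the Polish one: weekday names are stripped before the genitive month names are mapped to English, so a generic date parser can cope with the result. A rough standalone sketch of the flow; the month map here is deliberately partial, since the real dictionary continues past the visible hunk.

# Partial illustration only; language_tools.py carries the full month dictionary.
RU_WEEKDAYS = ["понедельник", "вторник", "среда", "четверг", "пятница", "суббота", "воскресенье"]
RU_MONTHS = {"января": "january", "февраля": "february", "ноября": "november"}

def translate_month_ru(date: str) -> str:
    date = date.lower()
    for dn in RU_WEEKDAYS:  # drop the weekday, e.g. "воскресенье, ..."
        date = date.replace(dn, "")
    date = date.strip(",").strip()  # remove the leftover comma and whitespace
    for key, val in RU_MONTHS.items():
        date = date.replace(key, val)
    return date

print(translate_month_ru("воскресенье, 13 ноября 2024"))  # -> "13 november 2024"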
6 changes: 3 additions & 3 deletions blog2epub/crawlers/article_factory/abstract.py
@@ -13,15 +13,15 @@ class AbstractArticleFactory(ABC):
def __init__(
self,
url: str,
html_content: str,
html_content: bytes,
patterns: Optional[ContentPatterns],
interface: EmptyInterface,
dirs: DirModel,
language: str,
downloader: Downloader,
):
self.url = url
self.html = html_content
self.html: bytes = html_content
self.interface = interface
self.dirs: DirModel = dirs
self.language: Optional[str] = language
@@ -31,7 +31,7 @@ def __init__(
self.title: Optional[str] = None
self.tags: list[str] = []
self.tree = fromstring("<div></div>")
self.images_list = [ImageModel]
self.images_list: list[ImageModel] = []
self.comments = "" # TODO: should be a list in the future

@abstractmethod
53 changes: 24 additions & 29 deletions blog2epub/crawlers/article_factory/default.py
@@ -49,40 +49,35 @@ def get_date(self) -> Optional[datetime]:

def _remove_images(self, images_html: list[bytes], images_list: list[ImageModel]):
if len(images_html) > 0 and len(images_list) == len(images_html):
self.html = tostring(self.tree)
self.html: bytes = tostring(self.tree)
for key, img_obj in enumerate(images_list):
replace_pattern = f"#blog2epubimage#{img_obj.hash}#".encode()
image_html = images_html[key]
self.html = self.html.replace(image_html, replace_pattern)
self.tree = fromstring(self.html)

def get_images(self) -> list[ImageModel]:
images_list = []
images_html = []
for pattern in self.patterns.images:
if pattern.regex:
pass
elif pattern.xpath:
images_in_pattern = self.tree.xpath(pattern.xpath)
for image_element in images_in_pattern:
image_url = image_element.xpath("@src")[0]
try:
image_description = image_element.xpath("@alt")[0]
except IndexError:
image_description = ""
image_obj = ImageModel(url=image_url, description=image_description)
if self.downloader.download_image(image_obj):
images_list.append(image_obj)
images_html.append(tostring(image_element))
self._remove_images(images_html=images_html, images_list=images_list)
# images will be inserted back after cleaning the content
self.images_list = images_list
return images_list

def set_content(self, content):
self.content = content
self.html = content
self.tree = fromstring(self.html)
self.images_list = []
images_html: list[bytes] = []
if self.patterns is not None:
for pattern in self.patterns.images:
if pattern.regex:
pass
elif pattern.xpath:
images_in_pattern = self.tree.xpath(pattern.xpath)
for image_element in images_in_pattern:
image_url = image_element.xpath("@src")[0]
try:
image_description = image_element.xpath("@alt")[0]
except IndexError:
image_description = ""
image_obj = ImageModel(url=image_url, description=image_description)
if self.downloader.download_image(image_obj):
self.images_list.append(image_obj)
images_html.append(tostring(image_element))
self._remove_images(images_html=images_html, images_list=self.images_list)
# images will be inserted back after cleaning the content
return self.images_list

def _insert_images(self, article_content: str, images_list: list[ImageModel]) -> str:
for image in images_list:
@@ -172,12 +167,12 @@ def get_comments(self) -> str:
pass
return result_comments

def _content_cleanup(self, content: str) -> str:
def _content_cleanup(self, content: bytes) -> bytes:
"""This function removes from downloaded content unwanted patterns - like ads, etc."""
if self.patterns:
for pattern in self.patterns.content_cleanup:
if pattern.regex:
content = re.sub(pattern.regex, "", content)
content = re.sub(pattern.regex, b"", content)
return content

def _content_cleanup_xpath(self):
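One thing to double-check with the bytes switch in _content_cleanup above: re.sub raises a TypeError when a str pattern meets bytes content, so if ContentPatterns stores its cleanup regexes as plain strings they would need encoding first. A defensive sketch under that assumption (the isinstance guard is not part of this commit):

import re
from typing import Iterable, Union

def content_cleanup(content: bytes, cleanup_regexes: Iterable[Union[str, bytes]]) -> bytes:
    """Strip unwanted fragments (ads, trackers, ...) from the raw page bytes."""
    for regex in cleanup_regexes:
        # Normalise the pattern to bytes, since re.sub refuses to mix str and bytes.
        pattern = regex.encode() if isinstance(regex, str) else regex
        content = re.sub(pattern, b"", content)
    return content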
4 changes: 2 additions & 2 deletions blog2epub/crawlers/article_factory/wordpress.py
@@ -45,7 +45,7 @@ def _get_single_images(self, images_elements) -> list[ImageModel]:
if img_parent:
img_parent.replace(img, etree.Comment(f"#blog2epubimage#{img_parent.hash}#"))
self.tree = img_parent.getroottree()
self.html = etree.tostring(self.tree).decode("utf-8")
self.html = etree.tostring(self.tree)
images_list.append(image_obj)
return images_list

@@ -60,7 +60,7 @@ def _get_images_with_captions(self) -> list[ImageModel]:
img_parent = img.getparent()
img_parent.replace(img, etree.Comment(f"#blog2epubimage#{image_obj.hash}#"))
self.tree = img_parent.getroottree()
self.html = etree.tostring(self.tree).decode("utf-8")
self.html = etree.tostring(self.tree)
images_list.append(image_obj)
return images_list

41 changes: 6 additions & 35 deletions blog2epub/crawlers/default.py
@@ -62,7 +62,7 @@ def __init__(self, **kwargs):
xpath='//abbr[@itemprop="datePublished"]/@title',
),
Pattern(
xpath='//h2[@class="date-header"]/span/text()',
xpath='//*[@class="date-header"]/span/text()',
),
Pattern(
xpath='//meta[@property="article:published_time"]/@content',
@@ -168,37 +168,6 @@ def _add_tags(self, tags) -> None:
else:
self.tags[tag] = 1

def _atom_feed_loop(self):
# TODO: This needs refactor! Maybe separate crawler for atom feed?
for item in self.atom_feed.entries:
try:
self.article_counter += 1
art = self.article_factory_class(item.links[0].href, item.title.value, self)
if (
self.configuration.skip
and self.configuration.skip.isdigit()
and self.article_counter < int(self.configuration.skip)
):
self.interface.print("[skipping] " + art.title)
continue
art_no = len(self.articles) + 1
self.interface.print(f"{art_no}. {art.title}")
art.date = item.updated
if self.start:
self.end = art.date
else:
self.start = art.date
if item.content:
art.set_content(item.content.value)
art.get_images()
art.set_content(art.html)
self._add_tags(art.tags)
if self.configuration.limit and len(self.articles) >= int(self.configuration.limit):
break
except AttributeError as e:
self.interface.print(str(e))
self.interface.print("[article not recognized - skipping]")

def _break_the_loop(self):
if (
self.cancelled
@@ -235,7 +204,7 @@ def _check_for_sub_sitemaps(
sub_sitemaps = []
pages = []
for element in sitemap_pages:
if element.endswith(".xml"):
if element.endswith(".xml") or re.search("sitemap.xml\\?page=[0-9]+$", element):
sub_sitemaps.append(element)
else:
pages.append(element)
@@ -248,8 +217,10 @@ def _get_pages_urls(self, sitemap_url: str) -> list[str]:
sitemap_pages.append(sitemap_element.getchildren()[0].text) # type: ignore
sub_sitemaps, pages = self._check_for_sub_sitemaps(sitemap_pages)
for sub_sitemap in sub_sitemaps:
if re.search("wp-sitemap-posts-(post|page)-[0-9]+.xml$", sub_sitemap) or re.search(
"(post|page)-sitemap[0-9-]*.xml$", sub_sitemap
if (
re.search("sitemap.xml\\?page=[0-9]+$", sub_sitemap)
or re.search("wp-sitemap-posts-(post|page)-[0-9]+.xml$", sub_sitemap)
or re.search("(post|page)-sitemap[0-9-]*.xml$", sub_sitemap)
):
pages += self._get_pages_from_sub_sitemap(sub_sitemap)
self.interface.print(f"Found {len(pages)} articles to crawl.")
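The extra sitemap.xml?page=N pattern lets the crawler follow Blogger-style paginated sitemaps alongside the existing WordPress ones. A small check against some made-up URLs (the hostnames are illustrative, not from the commit):

import re

POST_SITEMAP_PATTERNS = [
    r"sitemap.xml\?page=[0-9]+$",                 # Blogger-style pagination
    r"wp-sitemap-posts-(post|page)-[0-9]+.xml$",  # WordPress core sitemaps
    r"(post|page)-sitemap[0-9-]*.xml$",           # plugin-generated sitemaps (e.g. Yoast)
]

urls = [
    "https://example.blogspot.com/sitemap.xml?page=2",
    "https://example.com/wp-sitemap-posts-post-1.xml",
    "https://example.com/post-sitemap3.xml",
    "https://example.com/wp-sitemap-taxonomies-category-1.xml",  # should be rejected
]

for url in urls:
    accepted = any(re.search(p, url) for p in POST_SITEMAP_PATTERNS)
    print(f"{url} -> {accepted}")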
2 changes: 1 addition & 1 deletion blog2epub/crawlers/wordpress.py
@@ -44,7 +44,7 @@ def _atom_feed_loop(self):
else:
self.start = art.date
if item.content:
art.set_content(item.content.value)
# art.set_content(item.content.value)
art.get_images()
else:
art.html = self.downloader.get_content(art.url)
1 change: 1 addition & 0 deletions blog2epub/models/book.py
@@ -41,6 +41,7 @@ class ArticleModel(BaseModel):
url: str
title: Optional[str]
date: Optional[datetime]
accessed: datetime = datetime.now()
content: Optional[str]
comments: Optional[str] # TODO: replace with List[CommentModel]
tags: list[str] = []
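One caveat worth flagging on the new accessed field: Pydantic evaluates a bare datetime.now() default once, at class-definition time, so every ArticleModel created in a run would share that single timestamp. A minimal sketch of the per-instance variant (not what this commit does, just the usual alternative):

from datetime import datetime
from pydantic import BaseModel, Field

class ArticleModel(BaseModel):
    # Other fields elided for brevity.
    # default_factory defers the datetime.now() call to each instantiation,
    # instead of freezing a single value when the class body is evaluated.
    accessed: datetime = Field(default_factory=datetime.now)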
13 changes: 13 additions & 0 deletions tests/integration/blog2epub/crawlers/test_default.py
@@ -47,3 +47,16 @@ def test_get_pages_urls(self, mock_configuration):
assert "https://bohdan.bobrowski.com.pl/wp-sitemap-taxonomies-category-1.xml" not in pages
assert "https://bohdan.bobrowski.com.pl/wp-sitemap-taxonomies-post_tag-1.xml" not in pages
assert "https://bohdan.bobrowski.com.pl/wp-sitemap-users-1.xml" not in pages

def test_rocket_garage_blogspot_com(self, mock_configuration):
# given
given_crawler = DefaultCrawler(
url="rocket-garage.blogspot.com",
interface=EmptyInterface(),
configuration=mock_configuration,
)
# when
sitemap_url = given_crawler._get_sitemap_url()
pages = given_crawler._get_pages_urls(sitemap_url=sitemap_url)
# then
assert len(pages) > 1000
21 changes: 20 additions & 1 deletion tests/integration/blog2epub/test_blog2epub_main.py
@@ -14,7 +14,6 @@ def mock_configuration() -> ConfigurationModel:
return ConfigurationModel(
destination_folder=tempfile.gettempdir(),
limit="2",
include_images=True,
)


@@ -59,3 +58,23 @@ def test_bohdan_bobrowski_com_pl_has_images(self, mock_configuration):
# then
assert len(ebook.book_data.articles) >= 2
assert ebook.book_data.articles[0].content.find("#blog2epubimage#") == -1


class TestBlog2EPubMainVelosov:
def test_velosov_can_parse_the_date(self, mock_configuration):
# given
given_blog2epub = Blog2Epub(
url="velosov.blogspot.com",
interface=EmptyInterface(),
configuration=mock_configuration,
)
# when
given_blog2epub.download()
ebook = Book(
book_data=given_blog2epub.crawler.get_book_data(),
interface=EmptyInterface(),
configuration=mock_configuration,
)
ebook.save()
# then
pass
