Split integration tests to separate files and add test for salam pax'…

…s blog
bohdanbobrowski · Nov 18, 2024 · 9054120 · 9054120
1 parent b2f7543
commit 9054120
Show file tree

Hide file tree

Showing 7 changed files with 46 additions and 10 deletions.
diff --git a/blog2epub/crawlers/article_factory/abstract.py b/blog2epub/crawlers/article_factory/abstract.py
@@ -21,6 +21,8 @@ def __init__(
         downloader: Downloader,
         cancelled: bool = False,
         download_callback: Optional[Callable] = None,
+        blog_title: Optional[str] = None,
+        blog_description: Optional[str] = None,
     ):
         self.url = url
         self.html: bytes = html_content
@@ -37,6 +39,8 @@ def __init__(
         self.comments = ""  # TODO: should be a list in the future
         self.cancelled: bool = cancelled
         self.download_callback = download_callback
+        self.blog_title: Optional[str] = blog_title
+        self.blog_description: Optional[str] = blog_description
 
     @abstractmethod
     def process(self) -> ArticleModel:

diff --git a/blog2epub/crawlers/article_factory/blogspot.py b/blog2epub/crawlers/article_factory/blogspot.py
@@ -0,0 +1,26 @@
+from typing import Optional
+
+from lxml.html.soupparser import fromstring
+
+from blog2epub.crawlers.article_factory.default import DefaultArticleFactory
+from blog2epub.models.book import ArticleModel
+
+
+class BlogspotArticleFactory(DefaultArticleFactory):
+    def get_title(self) -> Optional[str]:
+        title = super().get_title()
+        if self.blog_title is not None and title == self.blog_title:
+            title = "Pomidor"
+        return title
+
+    def process(self) -> ArticleModel:
+        self.tree = fromstring(self.html)
+        return ArticleModel(
+            url=self.url,
+            title=self.get_title(),
+            date=self.get_date(),
+            images=self.get_images(),
+            tags=self.get_tags(),
+            content=self.get_content(),
+            comments=self.get_comments(),
+        )
diff --git a/blog2epub/crawlers/article_factory/default.py b/blog2epub/crawlers/article_factory/default.py
@@ -16,14 +16,16 @@
 
 class DefaultArticleFactory(AbstractArticleFactory):
     def get_title(self) -> Optional[str]:
+        title = None
         if self.tree is not None and self.patterns is not None:
             for title_pattern in self.patterns.title:
                 if title_pattern.xpath:
                     title = self.tree.xpath(title_pattern.xpath)
-                    if len(title) > 0:
+                    if len(title) > 1:
                         title = title[0]
-                        return html.unescape(title.strip())
-        return None
+                        title = html.unescape(title.strip())
+                        break
+        return title
 
     def get_date(self) -> Optional[datetime]:
         result_date = None

diff --git a/blog2epub/crawlers/blogspot.py b/blog2epub/crawlers/blogspot.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding : utf-8 -*-
+
+from blog2epub.crawlers.article_factory.blogspot import BlogspotArticleFactory
 from blog2epub.crawlers.default import DefaultCrawler
 from blog2epub.models.content_patterns import Pattern
 
@@ -10,7 +12,7 @@ class BlogspotCrawler(DefaultCrawler):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.name = "blogger.com crawler"
-
+        self.article_factory_class = BlogspotArticleFactory
         self.patterns.content.append(
             Pattern(
                 xpath="//div[contains(@class, 'post-body')]",

diff --git a/blog2epub/crawlers/default.py b/blog2epub/crawlers/default.py
@@ -364,6 +364,7 @@ def crawl(self):
                     language=self.language,
                     downloader=self.downloader,
                     download_callback=self._break_the_loop,
+                    blog_title=self.title,
                 )
                 art = art_factory.process()
                 self.images = self.images + art.images

diff --git a/tests/integration/blog2epub/test_blog2epub_salam_pax.py b/tests/integration/blog2epub/test_blog2epub_salam_pax.py
@@ -16,11 +16,11 @@ def mock_configuration() -> ConfigurationModel:
     )
 
 
-class TestBlog2EPubMainSalamPax:
-    def test_velosov_can_parse_the_date(self, mock_configuration):
+class TestBlog2EPubSalamPax:
+    def test_salam_pax_get_different_titles(self, mock_configuration):
         # given
         given_blog2epub = Blog2Epub(
-            url="dear_raed.blogspot.com",
+            url="http://dear_raed.blogspot.com",
             interface=EmptyInterface(),
             configuration=mock_configuration,
             cache_folder="tests_cache",
@@ -32,6 +32,6 @@ def test_velosov_can_parse_the_date(self, mock_configuration):
             interface=EmptyInterface(),
             configuration=mock_configuration,
         )
-        ebook.save()
         # then
-        pass
+        assert len(ebook.book_data.articles) == 2
+        assert ebook.book_data.articles[0].title != ebook.book_data.articles[1].title
diff --git a/tests/integration/blog2epub/test_blog2epub_velosov.py b/tests/integration/blog2epub/test_blog2epub_velosov.py
@@ -34,4 +34,5 @@ def test_velosov_can_parse_the_date(self, mock_configuration):
         )
         ebook.save()
         # then
-        pass
+        assert len(ebook.book_data.articles) == 2
+        assert ebook.book_data.articles[0].title != ebook.book_data.articles[1].title