Skip to content

Commit

Permalink
Split integration tests to separate files and add test for salam pax's blog
Browse files Browse the repository at this point in the history
  • Loading branch information
bohdanbobrowski committed Nov 18, 2024
1 parent b2f7543 commit 9054120
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 10 deletions.
4 changes: 4 additions & 0 deletions blog2epub/crawlers/article_factory/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ def __init__(
downloader: Downloader,
cancelled: bool = False,
download_callback: Optional[Callable] = None,
blog_title: Optional[str] = None,
blog_description: Optional[str] = None,
):
self.url = url
self.html: bytes = html_content
Expand All @@ -37,6 +39,8 @@ def __init__(
self.comments = "" # TODO: should be a list in the future
self.cancelled: bool = cancelled
self.download_callback = download_callback
self.blog_title: Optional[str] = blog_title
self.blog_description: Optional[str] = blog_description

@abstractmethod
def process(self) -> ArticleModel:
Expand Down
26 changes: 26 additions & 0 deletions blog2epub/crawlers/article_factory/blogspot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from typing import Optional

from lxml.html.soupparser import fromstring

from blog2epub.crawlers.article_factory.default import DefaultArticleFactory
from blog2epub.models.book import ArticleModel


class BlogspotArticleFactory(DefaultArticleFactory):
    """Article factory used for Blogspot blogs.

    Reuses the extraction helpers inherited from ``DefaultArticleFactory`` and
    only adjusts how the article title is chosen.
    """

    def get_title(self) -> Optional[str]:
        """Return the inherited title, replaced when it equals the blog title.

        The title is taken from the parent implementation. When a blog title
        is known and the extracted title is equal to it, a fixed substitute
        string is returned instead.
        """
        extracted = super().get_title()
        same_as_blog = self.blog_title is not None and extracted == self.blog_title
        # NOTE(review): "Pomidor" looks like a placeholder value - every article
        # hitting this branch receives the same title; confirm the intended fallback.
        return "Pomidor" if same_as_blog else extracted

    def process(self) -> ArticleModel:
        """Parse ``self.html`` into ``self.tree`` and assemble an ``ArticleModel``."""
        self.tree = fromstring(self.html)
        # Dict literals evaluate their values in source order, so the getters
        # run in the same sequence as they would as direct keyword arguments.
        article_fields = {
            "url": self.url,
            "title": self.get_title(),
            "date": self.get_date(),
            "images": self.get_images(),
            "tags": self.get_tags(),
            "content": self.get_content(),
            "comments": self.get_comments(),
        }
        return ArticleModel(**article_fields)
8 changes: 5 additions & 3 deletions blog2epub/crawlers/article_factory/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,16 @@

class DefaultArticleFactory(AbstractArticleFactory):
def get_title(self) -> Optional[str]:
    """Return the article title from the first title pattern whose XPath matches.

    Title patterns are tried in order. Patterns without an ``xpath`` and
    patterns whose XPath yields no result are skipped. The first result of
    the first matching pattern is stripped of surrounding whitespace and
    HTML-unescaped.

    Returns:
        The cleaned title, or ``None`` when ``self.tree`` or ``self.patterns``
        is ``None`` or when no pattern produces a match.
    """
    if self.tree is None or self.patterns is None:
        return None
    for title_pattern in self.patterns.title:
        if not title_pattern.xpath:
            continue
        # NOTE(review): assumes the XPath selects text nodes, i.e. the result
        # is a list of strings - confirm against the configured patterns.
        matches = self.tree.xpath(title_pattern.xpath)
        if matches:
            # Always unwrap the first match (also for single-element results)
            # so ``strip`` is applied to a string rather than to the list.
            return html.unescape(matches[0].strip())
    # Never leak an empty XPath result list to the caller; no match is None.
    return None

def get_date(self) -> Optional[datetime]:
result_date = None
Expand Down
4 changes: 3 additions & 1 deletion blog2epub/crawlers/blogspot.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from blog2epub.crawlers.article_factory.blogspot import BlogspotArticleFactory
from blog2epub.crawlers.default import DefaultCrawler
from blog2epub.models.content_patterns import Pattern

Expand All @@ -10,7 +12,7 @@ class BlogspotCrawler(DefaultCrawler):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.name = "blogger.com crawler"

self.article_factory_class = BlogspotArticleFactory
self.patterns.content.append(
Pattern(
xpath="//div[contains(@class, 'post-body')]",
Expand Down
1 change: 1 addition & 0 deletions blog2epub/crawlers/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ def crawl(self):
language=self.language,
downloader=self.downloader,
download_callback=self._break_the_loop,
blog_title=self.title,
)
art = art_factory.process()
self.images = self.images + art.images
Expand Down
10 changes: 5 additions & 5 deletions tests/integration/blog2epub/test_blog2epub_salam_pax.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ def mock_configuration() -> ConfigurationModel:
)


class TestBlog2EPubMainSalamPax:
def test_velosov_can_parse_the_date(self, mock_configuration):
class TestBlog2EPubSalamPax:
def test_salam_pax_get_different_titles(self, mock_configuration):
# given
given_blog2epub = Blog2Epub(
url="dear_raed.blogspot.com",
url="http://dear_raed.blogspot.com",
interface=EmptyInterface(),
configuration=mock_configuration,
cache_folder="tests_cache",
Expand All @@ -32,6 +32,6 @@ def test_velosov_can_parse_the_date(self, mock_configuration):
interface=EmptyInterface(),
configuration=mock_configuration,
)
ebook.save()
# then
pass
assert len(ebook.book_data.articles) == 2
assert ebook.book_data.articles[0].title != ebook.book_data.articles[1].title
3 changes: 2 additions & 1 deletion tests/integration/blog2epub/test_blog2epub_velosov.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ def test_velosov_can_parse_the_date(self, mock_configuration):
)
ebook.save()
# then
pass
assert len(ebook.book_data.articles) == 2
assert ebook.book_data.articles[0].title != ebook.book_data.articles[1].title

0 comments on commit 9054120

Please sign in to comment.