Fixing image attachment and date parsing on Russian blogs
bohdanbobrowski committed Nov 13, 2024
1 parent 2162274 commit 61d81f9
Showing 10 changed files with 79 additions and 80 deletions.
6 changes: 4 additions & 2 deletions blog2epub/common/book.py
@@ -310,9 +310,11 @@ def __init__(self, article: ArticleModel, number: int, language: str):
lang=language, # type: ignore
) # type: ignore
tags = self._print_tags(article)
art_date = ""
art_date = "<p>"
if article.date is not None:
art_date = article.date.strftime("%d %B %Y, %H:%M")
art_date += f"<i>Created: {article.date.strftime("%d %B %Y, %H:%M")}</i><br/>"
art_date += f"<i>Accessed: {article.accessed.strftime("%d %B %Y, %H:%M")}</i>"
art_date += "</p>"
self.epub.content = (
f"<h2>{article.title}</h2>{tags}{art_date}" + f'<p><i><a href="{article.url}">{article.url}</a></i></p>'
)
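For reference, when both timestamps are set, the hunk above renders the chapter date block roughly as follows; the datetime values here are made up purely for illustration and are not from the commit.

from datetime import datetime

# Hypothetical sample values, only to show the markup produced by the hunk above.
created = datetime(2024, 11, 13, 9, 30)
accessed = datetime(2024, 11, 13, 10, 15)

art_date = "<p>"
art_date += f"<i>Created: {created.strftime('%d %B %Y, %H:%M')}</i><br/>"
art_date += f"<i>Accessed: {accessed.strftime('%d %B %Y, %H:%M')}</i>"
art_date += "</p>"
print(art_date)
# <p><i>Created: 13 November 2024, 09:30</i><br/><i>Accessed: 13 November 2024, 10:15</i></p>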
12 changes: 5 additions & 7 deletions blog2epub/common/language_tools.py
@@ -4,13 +4,8 @@
def translate_month(date: str, language: str) -> str:
date = date.lower()
if language == "pl":
date = date.replace("poniedziałek", "")
date = date.replace("wtorek", "")
date = date.replace("środa", "")
date = date.replace("czwartek", "")
date = date.replace("piątek", "")
date = date.replace("sobota", "")
date = date.replace("niedziela", "")
for dn in ["poniedziałek", "wtorek", "środa", "czwartek", "piątek", "sobota", "niedziela"]:
date = date.replace(dn, "")
date = date.strip(",")
date = date.strip()
replace_dict = {
@@ -34,6 +29,9 @@ def translate_month(date: str, language: str) -> str:
for key, val in replace_dict_short.items():
date = date.replace(key, val)
if language == "ru":
for dn in ["понедельник", "вторник", "среда", "четверг", "пятница", "суббота", "воскресенье"]:
date = date.replace(dn, "")
date = date.strip(",")
replace_dict = {
"января": "january",
"февраля": "february",
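For context, the Russian branch now mirrors the Polish one: weekday names are stripped before the genitive month names are mapped to English, so a generic date parser can cope with the result. A rough standalone sketch of the flow; the month map here is deliberately partial, since the real dictionary continues past the visible hunk.

# Partial illustration only; language_tools.py carries the full month dictionary.
RU_WEEKDAYS = ["понедельник", "вторник", "среда", "четверг", "пятница", "суббота", "воскресенье"]
RU_MONTHS = {"января": "january", "февраля": "february", "ноября": "november"}

def translate_month_ru(date: str) -> str:
    date = date.lower()
    for dn in RU_WEEKDAYS:  # drop the weekday, e.g. "воскресенье, ..."
        date = date.replace(dn, "")
    date = date.strip(",").strip()  # remove the leftover comma and whitespace
    for key, val in RU_MONTHS.items():
        date = date.replace(key, val)
    return date

print(translate_month_ru("воскресенье, 13 ноября 2024"))  # -> "13 november 2024"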
6 changes: 3 additions & 3 deletions blog2epub/crawlers/article_factory/abstract.py
@@ -13,15 +13,15 @@ class AbstractArticleFactory(ABC):
def __init__(
self,
url: str,
html_content: str,
html_content: bytes,
patterns: Optional[ContentPatterns],
interface: EmptyInterface,
dirs: DirModel,
language: str,
downloader: Downloader,
):
self.url = url
self.html = html_content
self.html: bytes = html_content
self.interface = interface
self.dirs: DirModel = dirs
self.language: Optional[str] = language
@@ -31,7 +31,7 @@ def __init__(
self.title: Optional[str] = None
self.tags: list[str] = []
self.tree = fromstring("<div></div>")
self.images_list = [ImageModel]
self.images_list: list[ImageModel] = []
self.comments = "" # TODO: should be a list in the future

@abstractmethod
53 changes: 24 additions & 29 deletions blog2epub/crawlers/article_factory/default.py
@@ -49,40 +49,35 @@ def get_date(self) -> Optional[datetime]:

def _remove_images(self, images_html: list[bytes], images_list: list[ImageModel]):
if len(images_html) > 0 and len(images_list) == len(images_html):
self.html = tostring(self.tree)
self.html: bytes = tostring(self.tree)
for key, img_obj in enumerate(images_list):
replace_pattern = f"#blog2epubimage#{img_obj.hash}#".encode()
image_html = images_html[key]
self.html = self.html.replace(image_html, replace_pattern)
self.tree = fromstring(self.html)

def get_images(self) -> list[ImageModel]:
images_list = []
images_html = []
for pattern in self.patterns.images:
if pattern.regex:
pass
elif pattern.xpath:
images_in_pattern = self.tree.xpath(pattern.xpath)
for image_element in images_in_pattern:
image_url = image_element.xpath("@src")[0]
try:
image_description = image_element.xpath("@alt")[0]
except IndexError:
image_description = ""
image_obj = ImageModel(url=image_url, description=image_description)
if self.downloader.download_image(image_obj):
images_list.append(image_obj)
images_html.append(tostring(image_element))
self._remove_images(images_html=images_html, images_list=images_list)
# images will be inserted back after cleaning the content
self.images_list = images_list
return images_list

def set_content(self, content):
self.content = content
self.html = content
self.tree = fromstring(self.html)
self.images_list = []
images_html: list[bytes] = []
if self.patterns is not None:
for pattern in self.patterns.images:
if pattern.regex:
pass
elif pattern.xpath:
images_in_pattern = self.tree.xpath(pattern.xpath)
for image_element in images_in_pattern:
image_url = image_element.xpath("@src")[0]
try:
image_description = image_element.xpath("@alt")[0]
except IndexError:
image_description = ""
image_obj = ImageModel(url=image_url, description=image_description)
if self.downloader.download_image(image_obj):
self.images_list.append(image_obj)
images_html.append(tostring(image_element))
self._remove_images(images_html=images_html, images_list=self.images_list)
# images will be inserted back after cleaning the content
return self.images_list

def _insert_images(self, article_content: str, images_list: list[ImageModel]) -> str:
for image in images_list:
@@ -172,12 +167,12 @@ def get_comments(self) -> str:
pass
return result_comments

def _content_cleanup(self, content: str) -> str:
def _content_cleanup(self, content: bytes) -> bytes:
"""This function removes from downloaded content unwanted patterns - like ads, etc."""
if self.patterns:
for pattern in self.patterns.content_cleanup:
if pattern.regex:
content = re.sub(pattern.regex, "", content)
content = re.sub(pattern.regex, b"", content)
return content

def _content_cleanup_xpath(self):
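One thing to double-check with the bytes switch in _content_cleanup above: re.sub raises a TypeError when a str pattern meets bytes content, so if ContentPatterns stores its cleanup regexes as plain strings they would need encoding first. A defensive sketch under that assumption (the isinstance guard is not part of this commit):

import re
from typing import Iterable, Union

def content_cleanup(content: bytes, cleanup_regexes: Iterable[Union[str, bytes]]) -> bytes:
    """Strip unwanted fragments (ads, trackers, ...) from the raw page bytes."""
    for regex in cleanup_regexes:
        # Normalise the pattern to bytes, since re.sub refuses to mix str and bytes.
        pattern = regex.encode() if isinstance(regex, str) else regex
        content = re.sub(pattern, b"", content)
    return content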
4 changes: 2 additions & 2 deletions blog2epub/crawlers/article_factory/wordpress.py
@@ -45,7 +45,7 @@ def _get_single_images(self, images_elements) -> list[ImageModel]:
if img_parent:
img_parent.replace(img, etree.Comment(f"#blog2epubimage#{img_parent.hash}#"))
self.tree = img_parent.getroottree()
self.html = etree.tostring(self.tree).decode("utf-8")
self.html = etree.tostring(self.tree)
images_list.append(image_obj)
return images_list

@@ -60,7 +60,7 @@ def _get_images_with_captions(self) -> list[ImageModel]:
img_parent = img.getparent()
img_parent.replace(img, etree.Comment(f"#blog2epubimage#{image_obj.hash}#"))
self.tree = img_parent.getroottree()
self.html = etree.tostring(self.tree).decode("utf-8")
self.html = etree.tostring(self.tree)
images_list.append(image_obj)
return images_list

41 changes: 6 additions & 35 deletions blog2epub/crawlers/default.py
@@ -62,7 +62,7 @@ def __init__(self, **kwargs):
xpath='//abbr[@itemprop="datePublished"]/@title',
),
Pattern(
xpath='//h2[@class="date-header"]/span/text()',
xpath='//*[@class="date-header"]/span/text()',
),
Pattern(
xpath='//meta[@property="article:published_time"]/@content',
@@ -168,37 +168,6 @@ def _add_tags(self, tags) -> None:
else:
self.tags[tag] = 1

def _atom_feed_loop(self):
# TODO: This needs refactor! Maybe separate crawler for atom feed?
for item in self.atom_feed.entries:
try:
self.article_counter += 1
art = self.article_factory_class(item.links[0].href, item.title.value, self)
if (
self.configuration.skip
and self.configuration.skip.isdigit()
and self.article_counter < int(self.configuration.skip)
):
self.interface.print("[skipping] " + art.title)
continue
art_no = len(self.articles) + 1
self.interface.print(f"{art_no}. {art.title}")
art.date = item.updated
if self.start:
self.end = art.date
else:
self.start = art.date
if item.content:
art.set_content(item.content.value)
art.get_images()
art.set_content(art.html)
self._add_tags(art.tags)
if self.configuration.limit and len(self.articles) >= int(self.configuration.limit):
break
except AttributeError as e:
self.interface.print(str(e))
self.interface.print("[article not recognized - skipping]")

def _break_the_loop(self):
if (
self.cancelled
@@ -235,7 +204,7 @@ def _check_for_sub_sitemaps(
sub_sitemaps = []
pages = []
for element in sitemap_pages:
if element.endswith(".xml"):
if element.endswith(".xml") or re.search("sitemap.xml\\?page=[0-9]+$", element):
sub_sitemaps.append(element)
else:
pages.append(element)
@@ -248,8 +217,10 @@ def _get_pages_urls(self, sitemap_url: str) -> list[str]:
sitemap_pages.append(sitemap_element.getchildren()[0].text) # type: ignore
sub_sitemaps, pages = self._check_for_sub_sitemaps(sitemap_pages)
for sub_sitemap in sub_sitemaps:
if re.search("wp-sitemap-posts-(post|page)-[0-9]+.xml$", sub_sitemap) or re.search(
"(post|page)-sitemap[0-9-]*.xml$", sub_sitemap
if (
re.search("sitemap.xml\\?page=[0-9]+$", sub_sitemap)
or re.search("wp-sitemap-posts-(post|page)-[0-9]+.xml$", sub_sitemap)
or re.search("(post|page)-sitemap[0-9-]*.xml$", sub_sitemap)
):
pages += self._get_pages_from_sub_sitemap(sub_sitemap)
self.interface.print(f"Found {len(pages)} articles to crawl.")
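The extra sitemap.xml?page=N pattern lets the crawler follow Blogger-style paginated sitemaps alongside the existing WordPress ones. A small check against some made-up URLs (the hostnames are illustrative, not from the commit):

import re

POST_SITEMAP_PATTERNS = [
    r"sitemap.xml\?page=[0-9]+$",                 # Blogger-style pagination
    r"wp-sitemap-posts-(post|page)-[0-9]+.xml$",  # WordPress core sitemaps
    r"(post|page)-sitemap[0-9-]*.xml$",           # plugin-generated sitemaps (e.g. Yoast)
]

urls = [
    "https://example.blogspot.com/sitemap.xml?page=2",
    "https://example.com/wp-sitemap-posts-post-1.xml",
    "https://example.com/post-sitemap3.xml",
    "https://example.com/wp-sitemap-taxonomies-category-1.xml",  # should be rejected
]

for url in urls:
    accepted = any(re.search(p, url) for p in POST_SITEMAP_PATTERNS)
    print(f"{url} -> {accepted}")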
2 changes: 1 addition & 1 deletion blog2epub/crawlers/wordpress.py
@@ -44,7 +44,7 @@ def _atom_feed_loop(self):
else:
self.start = art.date
if item.content:
art.set_content(item.content.value)
# art.set_content(item.content.value)
art.get_images()
else:
art.html = self.downloader.get_content(art.url)
1 change: 1 addition & 0 deletions blog2epub/models/book.py
@@ -41,6 +41,7 @@ class ArticleModel(BaseModel):
url: str
title: Optional[str]
date: Optional[datetime]
accessed: datetime = datetime.now()
content: Optional[str]
comments: Optional[str] # TODO: replace with List[CommentModel]
tags: list[str] = []
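One caveat worth flagging on the new accessed field: Pydantic evaluates a bare datetime.now() default once, at class-definition time, so every ArticleModel created in a run would share that single timestamp. A minimal sketch of the per-instance variant (not what this commit does, just the usual alternative):

from datetime import datetime
from pydantic import BaseModel, Field

class ArticleModel(BaseModel):
    # Other fields elided for brevity.
    # default_factory defers the datetime.now() call to each instantiation,
    # instead of freezing a single value when the class body is evaluated.
    accessed: datetime = Field(default_factory=datetime.now)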
13 changes: 13 additions & 0 deletions tests/integration/blog2epub/crawlers/test_default.py
@@ -47,3 +47,16 @@ def test_get_pages_urls(self, mock_configuration):
assert "https://bohdan.bobrowski.com.pl/wp-sitemap-taxonomies-category-1.xml" not in pages
assert "https://bohdan.bobrowski.com.pl/wp-sitemap-taxonomies-post_tag-1.xml" not in pages
assert "https://bohdan.bobrowski.com.pl/wp-sitemap-users-1.xml" not in pages

def test_rocket_garage_blogspot_com(self, mock_configuration):
# given
given_crawler = DefaultCrawler(
url="rocket-garage.blogspot.com",
interface=EmptyInterface(),
configuration=mock_configuration,
)
# when
sitemap_url = given_crawler._get_sitemap_url()
pages = given_crawler._get_pages_urls(sitemap_url=sitemap_url)
# then
assert len(pages) > 1000
21 changes: 20 additions & 1 deletion tests/integration/blog2epub/test_blog2epub_main.py
@@ -14,7 +14,6 @@ def mock_configuration() -> ConfigurationModel:
return ConfigurationModel(
destination_folder=tempfile.gettempdir(),
limit="2",
include_images=True,
)


@@ -59,3 +58,23 @@ def test_bohdan_bobrowski_com_pl_has_images(self, mock_configuration):
# then
assert len(ebook.book_data.articles) >= 2
assert ebook.book_data.articles[0].content.find("#blog2epubimage#") == -1


class TestBlog2EPubMainVelosov:
def test_velosov_can_parse_the_date(self, mock_configuration):
# given
given_blog2epub = Blog2Epub(
url="velosov.blogspot.com",
interface=EmptyInterface(),
configuration=mock_configuration,
)
# when
given_blog2epub.download()
ebook = Book(
book_data=given_blog2epub.crawler.get_book_data(),
interface=EmptyInterface(),
configuration=mock_configuration,
)
ebook.save()
# then
pass
