Skip to content

Commit

Permalink
Minor fixes in crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
bohdanbobrowski committed Nov 15, 2024
1 parent 0e7737d commit 216059b
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 14 deletions.
2 changes: 1 addition & 1 deletion blog2epub/common/cover.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def _split_too_long_parts(self, parts: list[str]) -> list[str]:

def _split_long_title(self) -> list[str]:
title = []
for char in ["|", "-", ":", " "]:
for char in ["|", "-", ":", "–", " "]:
if self.title.find(char) > -1:
title = self.title.split(char)
if len(title) > 0:
Expand Down
4 changes: 4 additions & 0 deletions blog2epub/crawlers/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ def __init__(self, **kwargs):
Pattern(
xpath="//article//img",
),
Pattern(
xpath="//img[contains(@class, 'wp-post-image')]",
),
],
)
self.downloader = Downloader(
Expand Down Expand Up @@ -187,6 +190,7 @@ def _get_header_images(self, tree) -> list[ImageModel]:
xpaths = [
'//*[contains(@class, "wp-block-image")]//img/@src',
'//div[@id="header"]/div/div/div/p[@class="description"]/span/img/@src',
"//img[contains(@class, 'wp-post-image')]/@src",
]
for xpath in xpaths:
for img in tree.xpath(xpath):
Expand Down
29 changes: 16 additions & 13 deletions blog2epub/models/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,30 @@

example_blogs = [
"http://archaia-ellada.blogspot.com",
"http://historicaltidbits.blogspot.com/",
"http://starybezpiek.blogspot.com/",
"https://19thcentury.wordpress.com/",
"https://cyclehistory.wordpress.com/",
"http://historicaltidbits.blogspot.com",
"http://starybezpiek.blogspot.com",
"https://19thcentury.wordpress.com",
"https://cyclehistory.wordpress.com",
"https://klubjagiellonski.pl",
"https://knippsen.blogspot.com",
"https://ksgedania.blogspot.com",
"https://motorbikes.blog",
"https://oldcam.wordpress.com",
"https://oldcamera.blog/",
"https://oldcamera.blog",
"https://python-bloggers.com",
"https://rocket-garage.blogspot.com",
"https://thevictoriancyclist.wordpress.com/",
"https://swiatmotocykli.pl",
"https://thevictoriancyclist.wordpress.com",
"https://velosov.blogspot.com",
"https://vintagebicycle.wordpress.com/",
"https://vowe.net/",
"https://vintagebicycle.wordpress.com",
"https://vowe.net",
"https://www.blog.homebrewing.pl",
"https://www.historyoftheancientworld.com/",
"https://www.mikeanderson.biz/",
"https://www.nomadicmatt.com/",
"https://www.returnofthecaferacers.com/",
"https://www.szarmant.pl/",
"https://www.historyoftheancientworld.com",
"https://www.infolotnicze.pl",
"https://www.mikeanderson.biz",
"https://www.nomadicmatt.com",
"https://www.returnofthecaferacers.com",
"https://www.szarmant.pl",
]


Expand Down

0 comments on commit 216059b

Please sign in to comment.