From d81505d800d4e11f9b70497ee13b41a573deba8c Mon Sep 17 00:00:00 2001
From: Philippe Chadenier
Date: Tue, 9 Jul 2024 11:35:25 +0200
Subject: [PATCH 1/2] Encode html_title

---
 newsplease/helper_classes/parse_crawler.py | 102 +++++++++++----------
 1 file changed, 56 insertions(+), 46 deletions(-)

diff --git a/newsplease/helper_classes/parse_crawler.py b/newsplease/helper_classes/parse_crawler.py
index 3917ad1b..b70729e5 100644
--- a/newsplease/helper_classes/parse_crawler.py
+++ b/newsplease/helper_classes/parse_crawler.py
@@ -1,20 +1,24 @@
 """
 This is a helper class for the crawler's parse methods
 """
+
 import logging
 import re
 import time
+from typing import Optional
 
 import scrapy
+from scrapy.http import Response
 
 # to improve performance, regex statements are compiled only once per module
-re_html = re.compile('text/html')
+re_html = re.compile("text/html")
 
 
 class ParseCrawler(object):
     """
     Helper class for the crawler's parse methods.
     """
+
     helper = None
     log = None
 
@@ -23,11 +27,11 @@ def __init__(self, helper):
         self.log = logging.getLogger(__name__)
 
     def pass_to_pipeline_if_article(
-            self,
-            response,
-            source_domain,
-            original_url,
-            rss_title=None
+        self,
+        response: Response,
+        source_domain: str,
+        original_url: str,
+        rss_title: Optional[str] = None,
     ):
         """
         Responsible for passing a NewscrawlerItem to the pipeline if the
@@ -40,51 +44,53 @@ def pass_to_pipeline_if_article(
         :return NewscrawlerItem: NewscrawlerItem to pass to the pipeline
         """
         if self.helper.heuristics.is_article(response, original_url):
-            return self.pass_to_pipeline(
-                response, source_domain, rss_title=None)
+            return self.pass_to_pipeline(response, source_domain, rss_title=rss_title)
 
     def pass_to_pipeline(
-            self,
-            response,
-            source_domain,
-            rss_title=None
+        self, response: Response, source_domain: str, rss_title: Optional[str] = None
     ):
-        timestamp = time.strftime('%Y-%m-%d %H:%M:%S',
-                                  time.gmtime(time.time()))
+        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time.time()))
 
-        relative_local_path = self.helper.savepath_parser \
-            .get_savepath(response.url)
+        relative_local_path = self.helper.savepath_parser.get_savepath(response.url)
 
         # Instantiate the crawler item class defined in the configuration
         article = self.helper.crawler_item_class()
-        article['local_path'] = self.helper.savepath_parser \
-            .get_formatted_relative_path(relative_local_path)
-        article['filename'] = self.helper.savepath_parser.get_filename(article['local_path'])
-        article['abs_local_path'] = self.helper.savepath_parser \
-            .get_abs_path(relative_local_path)
-        article['modified_date'] = timestamp
-        article['download_date'] = timestamp
-        article['source_domain'] = source_domain.encode("utf-8")
-        article['url'] = response.url
-        extracted_title = response.selector.xpath('//title/text()').extract_first()
-        article['html_title'] = extracted_title.encode("utf-8") if extracted_title is not None else ''
+        article["local_path"] = self.helper.savepath_parser.get_formatted_relative_path(
+            relative_local_path
+        )
+        article["filename"] = self.helper.savepath_parser.get_filename(
+            article["local_path"]
+        )
+        article["abs_local_path"] = self.helper.savepath_parser.get_abs_path(
+            relative_local_path
+        )
+        article["modified_date"] = timestamp
+        article["download_date"] = timestamp
+        article["source_domain"] = source_domain.encode("utf-8")
+        article["url"] = response.url
+        extracted_title = response.selector.xpath("//title/text()").extract_first()
+        if extracted_title is None:
+            extracted_title = ""
+        article["html_title"] = extracted_title.encode("utf-8")
+
         if rss_title is None:
-            article['rss_title'] = 'NULL'
+            article["rss_title"] = "NULL".encode("utf-8")
         else:
-            article['rss_title'] = rss_title.encode("utf-8")
+            article["rss_title"] = rss_title.encode("utf-8")
-        article['spider_response'] = response
-        article['article_title'] = 'NULL'
-        article['article_description'] = 'NULL'
-        article['article_text'] = 'NULL'
-        article['article_image'] = 'NULL'
-        article['article_author'] = 'NULL'
-        article['article_publish_date'] = 'NULL'
-        article['article_language'] = 'NULL'
+        article["spider_response"] = response
+        article["article_title"] = "NULL"
+        article["article_description"] = "NULL"
+        article["article_text"] = "NULL"
+        article["article_image"] = "NULL"
+        article["article_author"] = "NULL"
+        article["article_publish_date"] = "NULL"
+        article["article_language"] = "NULL"
         return article
 
     @staticmethod
-    def recursive_requests(response, spider, ignore_regex='',
-                           ignore_file_extensions='pdf'):
+    def recursive_requests(
+        response, spider, ignore_regex="", ignore_file_extensions="pdf"
+    ):
         """
         Manages recursive requests.
         Determines URLs to recursively crawl if they do not match certain file
@@ -103,10 +109,13 @@ def recursive_requests(response, spider, ignore_regex='',
         # or contain any of the given ignore_regex regexes
         return [
            scrapy.Request(response.urljoin(href), callback=spider.parse)
-            for href in response.css("a::attr('href')").extract() if re.match(
-                r'.*\.' + ignore_file_extensions +
-                r'$', response.urljoin(href), re.IGNORECASE
-            ) is None
+            for href in response.css("a::attr('href')").extract()
+            if re.match(
+                r".*\." + ignore_file_extensions + r"$",
+                response.urljoin(href),
+                re.IGNORECASE,
+            )
+            is None
             and len(re.match(ignore_regex, response.urljoin(href)).group(0)) == 0
         ]
 
@@ -117,10 +126,11 @@ def content_type(self, response):
         :param obj response: The scrapy response
         :return bool: Determines whether the response is of the correct type
         """
-        if not re_html.match(response.headers.get('Content-Type').decode('utf-8')):
+        if not re_html.match(response.headers.get("Content-Type").decode("utf-8")):
             self.log.warn(
-                "Dropped: %s's content is not of type "
-                "text/html but %s", response.url, response.headers.get('Content-Type')
+                "Dropped: %s's content is not of type text/html but %s",
+                response.url,
+                response.headers.get("Content-Type"),
             )
             return False
         else:

From 7404aa70829a6a46ed1f91adaddd97a474e1a4cf Mon Sep 17 00:00:00 2001
From: Philippe Chadenier
Date: Tue, 9 Jul 2024 11:46:13 +0200
Subject: [PATCH 2/2] Don't encode rss_title

---
 newsplease/helper_classes/parse_crawler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/newsplease/helper_classes/parse_crawler.py b/newsplease/helper_classes/parse_crawler.py
index b70729e5..4d257624 100644
--- a/newsplease/helper_classes/parse_crawler.py
+++ b/newsplease/helper_classes/parse_crawler.py
@@ -74,9 +74,9 @@ def pass_to_pipeline(
         article["html_title"] = extracted_title.encode("utf-8")
 
         if rss_title is None:
-            article["rss_title"] = "NULL".encode("utf-8")
+            article["rss_title"] = "NULL"
         else:
-            article["rss_title"] = rss_title.encode("utf-8")
+            article["rss_title"] = rss_title
         article["spider_response"] = response
         article["article_title"] = "NULL"
         article["article_description"] = "NULL"
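Net behavior the two patches converge on: html_title is always stored as UTF-8 bytes, with a missing <title> element mapped to an empty string before encoding, while rss_title is passed through unencoded as str, with "NULL" marking a missing feed title. Below is a minimal standalone sketch of that contract outside Scrapy; normalize_titles and the sample values are illustrative only, not part of the patch.

from typing import Dict, Optional, Union


def normalize_titles(
    extracted_title: Optional[str], rss_title: Optional[str]
) -> Dict[str, Union[bytes, str]]:
    """Mirror the title handling the two patches settle on."""
    # Patch 1: guard against pages without a <title>, then encode to bytes.
    if extracted_title is None:
        extracted_title = ""
    return {
        # Always UTF-8 bytes, never None.
        "html_title": extracted_title.encode("utf-8"),
        # Patch 2: left as a plain str; "NULL" is the missing-value marker.
        "rss_title": rss_title if rss_title is not None else "NULL",
    }


# Example: a page with a non-ASCII <title> and no RSS title.
item = normalize_titles("Étude sur les données", None)
assert item["html_title"] == "Étude sur les données".encode("utf-8")
assert item["rss_title"] == "NULL"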