From 5878a232738462d21b942b075b9b2efba0682bb8 Mon Sep 17 00:00:00 2001 From: Jonatan Jalle Steller Date: Wed, 4 Oct 2023 23:40:08 +0200 Subject: [PATCH] Fix content negotiation --- README.md | 2 +- classes/beacon.py | 4 ++-- classes/hydra.py | 4 ++-- helpers/clean.py | 2 +- helpers/download.py | 13 ++++++++----- helpers/status.py | 2 +- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index a9b0ea1..e3f8a2f 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ run the script without interaction. - `resources`: all resources listed in a Hydra API or a beacon file (requires `-source_url` or `-source_file`) - `resource_triples`: all RDF triples in resources listed in a Hydra API or a beacon file (requires `-source_url` or `-source_file`) - `-source_url ''`: use this entry-point URL to scrape content -- `-source_url_type ''`: request this content type when scraping content (defaults to `text/html`) +- `-source_url_type ''`: request this content type when scraping content (defaults to none) - `-source_file ''`: use the URLs contained in this beacon file to scrape content - `-target_folder ''`: download everything into this subfolder of `downloads` (defaults to timestamp) - `-resource_url_filter ''`: when listing resources, apply this string as a filter (defaults to none) diff --git a/classes/beacon.py b/classes/beacon.py index b00318a..a697826 100644 --- a/classes/beacon.py +++ b/classes/beacon.py @@ -37,13 +37,13 @@ class Beacon: non_rdf_resources_list = [] - def __init__(self, target_folder:str, resources_type:str = 'text/html', resources:list = []): + def __init__(self, target_folder:str, resources_type:str = '', resources:list = []): ''' Sets up a list of resources to process Parameters: target_folder (str): Name of the downloads subfolder to store files in - resources_type (str, optional): Content type to request when retrieving resources, defaults to 'text/html' + resources_type (str, optional): Content type to request when retrieving resources, defaults to none resources (list, optional): List of resources to retrieve, defaults to empty list ''' diff --git a/classes/hydra.py b/classes/hydra.py index 392b4c8..8ea9be4 100644 --- a/classes/hydra.py +++ b/classes/hydra.py @@ -46,14 +46,14 @@ class Hydra: number_of_lists = 0 - def __init__(self, target_folder:str, entry_point_url:str, entry_point_url_type:str = 'text/html'): + def __init__(self, target_folder:str, entry_point_url:str, entry_point_url_type:str = ''): ''' Sets up a Hydra entry point to process Parameters: target_folder (str): Name of the downloads subfolder to store files in entry_point_url (str): URL to use as an entry point for a scraping run - entry_point_url_type (str, optional): Content type to request when retrieving resources, defaults to 'text/html' + entry_point_url_type (str, optional): Content type to request when retrieving resources, defaults to none ''' # Assign variables diff --git a/helpers/clean.py b/helpers/clean.py index 9e39d1e..908e8a5 100644 --- a/helpers/clean.py +++ b/helpers/clean.py @@ -31,7 +31,7 @@ def clean_request(arguments:list) -> dict: request = { 'download': [], # May contain lists, list_triples, beacon, resources, resource_triples 'source_url': '', - 'source_url_type': 'text/html', + 'source_url_type': '', 'source_file': '', 'taget_folder': current_timestamp(), 'resource_url_filter': '', diff --git a/helpers/download.py b/helpers/download.py index 410cd60..c8d996d 100644 --- a/helpers/download.py +++ b/helpers/download.py @@ -13,13 +13,13 @@ from helpers.clean import clean_lines -def download_file(url:str, content_type:str = 'text/html') -> dict: +def download_file(url:str, content_type:str = '') -> dict: ''' Retrieves a file from a URL and returns the content Parameters: url (str): URL to download the file from - content_type (str): optional content type + content_type (str, optional): content type to request, defaults to none Returns: dict: Provides 'file_type', 'file_extension' and 'content' of the retrieved file @@ -27,9 +27,12 @@ def download_file(url:str, content_type:str = 'text/html') -> dict: # Retrieve URL content try: - request_header = { 'Content-Type': content_type } - request_object = request.Request(url, headers = request_header) - response = request.urlopen(request_object) + if content_type != '': + request_header = { 'Accept': content_type } + request_object = request.Request(url, headers = request_header) + response = request.urlopen(request_object) + else: + response = request.urlopen(url) # Check if response is invalid if response.status != 200: diff --git a/helpers/status.py b/helpers/status.py index 69fe69c..c52f8b9 100644 --- a/helpers/status.py +++ b/helpers/status.py @@ -72,7 +72,7 @@ def echo_help(): -source_url '': use this entry-point URL to scrape content --source_url_type '': request this content type when scraping content (defaults to `text/html`) +-source_url_type '': request this content type when scraping content (defaults to none) -source_file '': use the URLs contained in this beacon file to scrape content