From 5878a232738462d21b942b075b9b2efba0682bb8 Mon Sep 17 00:00:00 2001
From: Jonatan Jalle Steller <jonatan.steller@adwmainz.de>
Date: Wed, 4 Oct 2023 23:40:08 +0200
Subject: [PATCH] Fix content negotiation (send Accept, not Content-Type)

---
 README.md           |  2 +-
 classes/beacon.py   |  4 ++--
 classes/hydra.py    |  4 ++--
 helpers/clean.py    |  2 +-
 helpers/download.py | 13 ++++++++-----
 helpers/status.py   |  2 +-
 6 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/README.md b/README.md
index a9b0ea1..e3f8a2f 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ run the script without interaction.
   - `resources`: all resources listed in a Hydra API or a beacon file (requires `-source_url` or `-source_file`)
   - `resource_triples`: all RDF triples in resources listed in a Hydra API or a beacon file (requires `-source_url` or `-source_file`)
 - `-source_url '<url>'`: use this entry-point URL to scrape content
-- `-source_url_type '<string>'`: request this content type when scraping content (defaults to `text/html`)
+- `-source_url_type '<string>'`: request this content type when scraping content (defaults to none)
 - `-source_file '<path to file>'`: use the URLs contained in this beacon file to scrape content
 - `-target_folder '<name of folder>'`: download everything into this subfolder of `downloads` (defaults to timestamp)
 - `-resource_url_filter '<regular expression>'`: when listing resources, apply this string as a filter (defaults to none)
diff --git a/classes/beacon.py b/classes/beacon.py
index b00318a..a697826 100644
--- a/classes/beacon.py
+++ b/classes/beacon.py
@@ -37,13 +37,13 @@ class Beacon:
     non_rdf_resources_list = []
 
 
-    def __init__(self, target_folder:str, resources_type:str = 'text/html', resources:list = []):
+    def __init__(self, target_folder:str, resources_type:str = '', resources:list = []):
         '''
         Sets up a list of resources to process
 
             Parameters:
                 target_folder (str): Name of the downloads subfolder to store files in
-                resources_type (str, optional): Content type to request when retrieving resources, defaults to 'text/html'
+                resources_type (str, optional): Content type to request when retrieving resources, defaults to none
                 resources (list, optional): List of resources to retrieve, defaults to empty list
         '''
 
diff --git a/classes/hydra.py b/classes/hydra.py
index 392b4c8..8ea9be4 100644
--- a/classes/hydra.py
+++ b/classes/hydra.py
@@ -46,14 +46,14 @@ class Hydra:
     number_of_lists = 0
 
 
-    def __init__(self, target_folder:str, entry_point_url:str, entry_point_url_type:str = 'text/html'):
+    def __init__(self, target_folder:str, entry_point_url:str, entry_point_url_type:str = ''):
         '''
         Sets up a Hydra entry point to process
 
             Parameters:
                 target_folder (str): Name of the downloads subfolder to store files in
                 entry_point_url (str): URL to use as an entry point for a scraping run
-                entry_point_url_type (str, optional): Content type to request when retrieving resources, defaults to 'text/html'
+                entry_point_url_type (str, optional): Content type to request when retrieving resources, defaults to none
         '''
 
         # Assign variables
diff --git a/helpers/clean.py b/helpers/clean.py
index 9e39d1e..908e8a5 100644
--- a/helpers/clean.py
+++ b/helpers/clean.py
@@ -31,7 +31,7 @@ def clean_request(arguments:list) -> dict:
     request = {
         'download': [], # May contain lists, list_triples, beacon, resources, resource_triples
         'source_url': '',
-        'source_url_type': 'text/html',
+        'source_url_type': '',
         'source_file': '',
         'taget_folder': current_timestamp(),
         'resource_url_filter': '',
diff --git a/helpers/download.py b/helpers/download.py
index 410cd60..c8d996d 100644
--- a/helpers/download.py
+++ b/helpers/download.py
@@ -13,13 +13,13 @@
 from helpers.clean import clean_lines
 
 
-def download_file(url:str, content_type:str = 'text/html') -> dict:
+def download_file(url:str, content_type:str = '') -> dict:
     '''
     Retrieves a file from a URL and returns the content
 
         Parameters:
             url (str): URL to download the file from
-            content_type (str): optional content type
+            content_type (str, optional): content type to request, defaults to none
 
         Returns:
             dict: Provides 'file_type', 'file_extension' and 'content' of the retrieved file
@@ -27,9 +27,12 @@ def download_file(url:str, content_type:str = 'text/html') -> dict:
 
     # Retrieve URL content
     try:
-        request_header = { 'Content-Type': content_type }
-        request_object = request.Request(url, headers = request_header)
-        response = request.urlopen(request_object)
+        if content_type != '':
+            request_header = { 'Accept': content_type }
+            request_object = request.Request(url, headers = request_header)
+            response = request.urlopen(request_object)
+        else:
+            response = request.urlopen(url)
 
         # Check if response is invalid
         if response.status != 200:
diff --git a/helpers/status.py b/helpers/status.py
index 69fe69c..c52f8b9 100644
--- a/helpers/status.py
+++ b/helpers/status.py
@@ -72,7 +72,7 @@ def echo_help():
 
 -source_url '<url>': use this entry-point URL to scrape content
 
--source_url_type '<string>': request this content type when scraping content (defaults to `text/html`)
+-source_url_type '<string>': request this content type when scraping content (defaults to none)
 
 -source_file '<path to file>': use the URLs contained in this beacon file to scrape content