Fix content negotiation
jonatansteller committed Oct 4, 2023
1 parent ef9cdec commit 5878a23
Showing 6 changed files with 15 additions and 12 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -41,7 +41,7 @@ run the script without interaction.
- `resources`: all resources listed in a Hydra API or a beacon file (requires `-source_url` or `-source_file`)
- `resource_triples`: all RDF triples in resources listed in a Hydra API or a beacon file (requires `-source_url` or `-source_file`)
- `-source_url '<url>'`: use this entry-point URL to scrape content
-- `-source_url_type '<string>'`: request this content type when scraping content (defaults to `text/html`)
+- `-source_url_type '<string>'`: request this content type when scraping content (defaults to none)
- `-source_file '<path to file>'`: use the URLs contained in this beacon file to scrape content
- `-target_folder '<name of folder>'`: download everything into this subfolder of `downloads` (defaults to timestamp)
- `-resource_url_filter '<regular expression>'`: when listing resources, apply this string as a filter (defaults to none)
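As a usage illustration, a hypothetical invocation exercising the changed flag could look as follows; the entry-point script name (go.py) and the -download flag are assumptions, since neither appears in this diff:

    python go.py -download resources -source_url 'https://example.org/api' -source_url_type 'application/rdf+xml'

Leaving out -source_url_type now means no content type is requested at all, instead of the previous implicit text/html.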
classes/beacon.py (4 changes: 2 additions & 2 deletions)
@@ -37,13 +37,13 @@ class Beacon:
non_rdf_resources_list = []


-def __init__(self, target_folder:str, resources_type:str = 'text/html', resources:list = []):
+def __init__(self, target_folder:str, resources_type:str = '', resources:list = []):
'''
Sets up a list of resources to process
Parameters:
target_folder (str): Name of the downloads subfolder to store files in
-resources_type (str, optional): Content type to request when retrieving resources, defaults to 'text/html'
+resources_type (str, optional): Content type to request when retrieving resources, defaults to none
resources (list, optional): List of resources to retrieve, defaults to empty list
'''

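A brief sketch of what the new default means for callers, assuming only the constructor signature shown above (the rest of the class is outside this diff); the folder name and resource URL are illustrative:

    # With the new default, no content type is requested downstream
    # unless the caller asks for one explicitly
    beacon = Beacon('my_run', resources = ['https://example.org/resource/1'])
    beacon_html = Beacon('my_run', resources_type = 'text/html', resources = ['https://example.org/resource/1'])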
classes/hydra.py (4 changes: 2 additions & 2 deletions)
@@ -46,14 +46,14 @@ class Hydra:
number_of_lists = 0


-def __init__(self, target_folder:str, entry_point_url:str, entry_point_url_type:str = 'text/html'):
+def __init__(self, target_folder:str, entry_point_url:str, entry_point_url_type:str = ''):
'''
Sets up a Hydra entry point to process
Parameters:
target_folder (str): Name of the downloads subfolder to store files in
entry_point_url (str): URL to use as an entry point for a scraping run
-entry_point_url_type (str, optional): Content type to request when retrieving resources, defaults to 'text/html'
+entry_point_url_type (str, optional): Content type to request when retrieving resources, defaults to none
'''

# Assign variables
helpers/clean.py (2 changes: 1 addition & 1 deletion)
@@ -31,7 +31,7 @@ def clean_request(arguments:list) -> dict:
request = {
'download': [], # May contain lists, list_triples, beacon, resources, resource_triples
'source_url': '',
-'source_url_type': 'text/html',
+'source_url_type': '',
'source_file': '',
'target_folder': current_timestamp(),
'resource_url_filter': '',
helpers/download.py (13 changes: 8 additions & 5 deletions)
@@ -13,23 +13,26 @@
from helpers.clean import clean_lines


-def download_file(url:str, content_type:str = 'text/html') -> dict:
+def download_file(url:str, content_type:str = '') -> dict:
'''
Retrieves a file from a URL and returns the content
Parameters:
url (str): URL to download the file from
-content_type (str): optional content type
+content_type (str, optional): content type to request, defaults to none
Returns:
dict: Provides 'file_type', 'file_extension' and 'content' of the retrieved file
'''

# Retrieve URL content
try:
-request_header = { 'Content-Type': content_type }
-request_object = request.Request(url, headers = request_header)
-response = request.urlopen(request_object)
+if content_type != '':
+    request_header = { 'Accept': content_type }
+    request_object = request.Request(url, headers = request_header)
+    response = request.urlopen(request_object)
+else:
+    response = request.urlopen(url)

# Check if response is invalid
if response.status != 200:
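This hunk carries the substance of the commit: the old code sent the requested type as a Content-Type header, which describes a request body and plays no role in content negotiation; servers negotiate on the Accept header. A self-contained sketch of the corrected pattern, using illustrative names and URLs rather than the project's helpers:

    from urllib import request

    def fetch(url:str, content_type:str = '') -> bytes:
        # Send an Accept header only when a content type was requested;
        # otherwise let the server serve its default representation
        if content_type != '':
            request_object = request.Request(url, headers = { 'Accept': content_type })
        else:
            request_object = request.Request(url)
        with request.urlopen(request_object) as response:
            return response.read()

    # Request Turtle instead of the server's default HTML
    data = fetch('https://example.org/resource/1', 'text/turtle')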
helpers/status.py (2 changes: 1 addition & 1 deletion)
@@ -72,7 +72,7 @@ def echo_help():
-source_url '<url>': use this entry-point URL to scrape content
--source_url_type '<string>': request this content type when scraping content (defaults to `text/html`)
+-source_url_type '<string>': request this content type when scraping content (defaults to none)
-source_file '<path to file>': use the URLs contained in this beacon file to scrape content
