From 4c9518a36a22b614729fc389b516465d32d0e1a9 Mon Sep 17 00:00:00 2001 From: Chakradhar Bandla Date: Tue, 15 Oct 2024 15:33:49 +0100 Subject: [PATCH 1/7] stream apis commit1 --- environment.yml | 3 +- pridepy/files/files.py | 17 ++++++++++++ pridepy/pridepy.py | 54 +++++++++++++++++++++++++++++++++++- pridepy/project/project.py | 11 ++++++++ pridepy/util/api_handling.py | 22 +++++++++++++++ requirements.txt | 3 +- setup.py | 4 +-- 7 files changed, 109 insertions(+), 5 deletions(-) diff --git a/environment.yml b/environment.yml index fb9b158..746fe16 100644 --- a/environment.yml +++ b/environment.yml @@ -13,4 +13,5 @@ dependencies: - boto3 - botocore - tqdm - - urllib3 \ No newline at end of file + - urllib3 + - httpx \ No newline at end of file diff --git a/pridepy/files/files.py b/pridepy/files/files.py index fbf76f2..3df24b1 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -50,6 +50,7 @@ class Files: This class handles PRIDE API files endpoint. """ + V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3/" API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2" API_PRIVATE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2" PRIDE_ARCHIVE_FTP = "ftp.pride.ebi.ac.uk" @@ -62,6 +63,22 @@ class Files: def __init__(self): pass + async def stream_all_files_info(self, accession, output_file): + """ + get stream all project files from PRIDE API in JSON format + """ + request_url = (self.V3_API_BASE_URL + "/files/all") + headers = {"Accept": "application/JSON"} + await Util.stream_response_to_file(output_file, request_url, headers) + + async def stream_all_project_files_info(self, accession, output_file): + """ + get stream all project files from PRIDE API in JSON format + """ + request_url = (self.V3_API_BASE_URL + "projects/"+ accession + "/files/all") + headers = {"Accept": "application/JSON"} + await Util.stream_response_to_file(output_file, request_url, headers) + def get_all_paged_files( self, query_filter, page_size, page, sort_direction, sort_conditions ): diff --git a/pridepy/pridepy.py b/pridepy/pridepy.py index b15f69c..8583b7e 100644 --- a/pridepy/pridepy.py +++ b/pridepy/pridepy.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 - +import asyncio import logging import click from pridepy.files.files import Files @@ -254,6 +254,58 @@ def search_projects_by_keywords_and_filters( ) +@main.command() +@click.option( + "-o", + "--output_file", + required=True, + help="output file to save all the projects info", +) +def stream_projects_metadata(output_file): + """ + get all projects from PRIDE in JSON format + :return: + """ + project = Project() + asyncio.run(project.stream_all_projects(output_file)) + + +@main.command() +@click.option( + "-a", + "--accession", + required=True, + help="project accession", +) +@click.option( + "-o", + "--output_file", + required=True, + help="output file to save all the projects info", +) +def stream_all_project_files_info(accession, output_file): + """ + get all projects from PRIDE in JSON format + :return: + """ + files = Files() + asyncio.run(files.stream_all_project_files_info(accession, output_file)) + +@main.command() +@click.option( + "-o", + "--output_file", + required=True, + help="output file to save all the projects info", +) +def stream_all_files_info(output_file): + """ + get all projects from PRIDE in JSON format + :return: + """ + files = Files() + asyncio.run(files.stream_all_files_info( output_file)) + @main.command() @click.option( "-ps", diff --git a/pridepy/project/project.py b/pridepy/project/project.py index 68740b4..6498b94 100644 --- a/pridepy/project/project.py +++ b/pridepy/project/project.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +import asyncio + from pridepy.authentication.authentication import Authentication from pridepy.util.api_handling import Util @@ -9,6 +11,7 @@ class Project: """ API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2/" + V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3/" PRIVATE_API_BASE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2/" def __init__(self): @@ -39,6 +42,14 @@ def get_projects(self, page_size, page, sort_direction, sort_conditions): response = Util.get_api_call(request_url, headers) return response.json() + async def stream_all_projects(self, output_file): + """ + get stream of all projects from PRIDE API in JSON format + """ + request_url = (self.V3_API_BASE_URL + "projects/all") + headers = {"Accept": "application/JSON"} + await Util.stream_response_to_file(output_file, request_url, headers) + def get_reanalysis_projects_by_accession(self, accession): """ search PRIDE projects by reanalysis accession diff --git a/pridepy/util/api_handling.py b/pridepy/util/api_handling.py index 3362b5c..8d1951a 100644 --- a/pridepy/util/api_handling.py +++ b/pridepy/util/api_handling.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +import sys +import httpx import requests import logging from ratelimit import limits, sleep_and_retry @@ -30,6 +32,26 @@ def get_api_call(url, headers=None): ) return response + @staticmethod + @sleep_and_retry + @limits(calls=1000, period=50) + async def stream_response_to_file(out_file, url, headers=None): + async with httpx.AsyncClient() as client: + # Use a GET request with stream=True to handle streaming responses + async with client.stream("GET", url, headers=headers) as response: + # Check if the response is successful + response.raise_for_status() + try: + cfile = open(out_file, 'w') + # Iterate over the streaming content line by line + async for line in response.aiter_lines(): + if line: # Avoid printing empty lines (common with text/event-stream) + cfile.write(line) + cfile.close() + except PermissionError as e: + print("[ERROR] No permissions to write to:", out_file) + sys.exit(1) + @staticmethod @sleep_and_retry @limits(calls=1000, period=50) diff --git a/requirements.txt b/requirements.txt index cbcb2c5..6750e0a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ plotly boto3 botocore tqdm -urllib3 \ No newline at end of file +urllib3 +httpx \ No newline at end of file diff --git a/setup.py b/setup.py index bc884ff..171aa15 100644 --- a/setup.py +++ b/setup.py @@ -6,9 +6,9 @@ setup( name="pridepy", - version="0.0.4", + version="0.0.5", author="PRIDE Team", - author_email="pride-report@ebi.ac.uk", + author_email="pride-support@ebi.ac.uk", description="Python Client library for PRIDE Rest API", long_description=long_description, long_description_content_type="text/markdown", From ac03b3389c511a349bfdf898500f86c710bfea3a Mon Sep 17 00:00:00 2001 From: Chakradhar Bandla Date: Tue, 15 Oct 2024 18:11:52 +0100 Subject: [PATCH 2/7] stream projects & files metadata --- pridepy/files/files.py | 21 ++++++++++--------- pridepy/pridepy.py | 35 +++++++++----------------------- pridepy/project/project.py | 9 ++++++--- pridepy/util/api_handling.py | 39 +++++++++++++++++++++--------------- 4 files changed, 50 insertions(+), 54 deletions(-) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 3df24b1..09f4ec4 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -63,21 +63,22 @@ class Files: def __init__(self): pass - async def stream_all_files_info(self, accession, output_file): + async def stream_all_files_metadata(self, output_file, accession=None): """ get stream all project files from PRIDE API in JSON format """ - request_url = (self.V3_API_BASE_URL + "/files/all") + if accession is None: + request_url = ("{0}files/all".format(self.V3_API_BASE_URL)) + count_request_url = ("{0}files/count".format(self.V3_API_BASE_URL)) + else: + request_url = ("{0}projects/{1}/files/all".format(self.V3_API_BASE_URL, accession)) + count_request_url = ("{0}projects/{1}/files/count".format(self.V3_API_BASE_URL, accession)) headers = {"Accept": "application/JSON"} - await Util.stream_response_to_file(output_file, request_url, headers) + response = Util.get_api_call(count_request_url, headers) + total_records = response.json() - async def stream_all_project_files_info(self, accession, output_file): - """ - get stream all project files from PRIDE API in JSON format - """ - request_url = (self.V3_API_BASE_URL + "projects/"+ accession + "/files/all") - headers = {"Accept": "application/JSON"} - await Util.stream_response_to_file(output_file, request_url, headers) + regex_search_pattern = '"fileName"' + await Util.stream_response_to_file(output_file, total_records, regex_search_pattern, request_url, headers) def get_all_paged_files( self, query_filter, page_size, page, sort_direction, sort_conditions diff --git a/pridepy/pridepy.py b/pridepy/pridepy.py index 8583b7e..82dabbf 100644 --- a/pridepy/pridepy.py +++ b/pridepy/pridepy.py @@ -259,11 +259,11 @@ def search_projects_by_keywords_and_filters( "-o", "--output_file", required=True, - help="output file to save all the projects info", + help="output file to save all the projects metadata", ) def stream_projects_metadata(output_file): """ - get all projects from PRIDE in JSON format + Stream all projects metadata in JSON format to a file :return: """ project = Project() @@ -271,40 +271,25 @@ def stream_projects_metadata(output_file): @main.command() -@click.option( - "-a", - "--accession", - required=True, - help="project accession", -) @click.option( "-o", "--output_file", required=True, - help="output file to save all the projects info", + help="output file to save all the files metadata", ) -def stream_all_project_files_info(accession, output_file): - """ - get all projects from PRIDE in JSON format - :return: - """ - files = Files() - asyncio.run(files.stream_all_project_files_info(accession, output_file)) - -@main.command() @click.option( - "-o", - "--output_file", - required=True, - help="output file to save all the projects info", + "-a", + "--accession", + required=False, + help="project accession", ) -def stream_all_files_info(output_file): +def stream_files_metadata(accession, output_file): """ - get all projects from PRIDE in JSON format + Stream all files metadata in JSON format and write it to a file :return: """ files = Files() - asyncio.run(files.stream_all_files_info( output_file)) + asyncio.run(files.stream_all_files_metadata(output_file, accession)) @main.command() @click.option( diff --git a/pridepy/project/project.py b/pridepy/project/project.py index 6498b94..4e4c67d 100644 --- a/pridepy/project/project.py +++ b/pridepy/project/project.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -import asyncio from pridepy.authentication.authentication import Authentication from pridepy.util.api_handling import Util @@ -46,9 +45,13 @@ async def stream_all_projects(self, output_file): """ get stream of all projects from PRIDE API in JSON format """ - request_url = (self.V3_API_BASE_URL + "projects/all") + request_url = self.V3_API_BASE_URL + "projects/all" + count_request_url = self.V3_API_BASE_URL + "projects/count" headers = {"Accept": "application/JSON"} - await Util.stream_response_to_file(output_file, request_url, headers) + response = Util.get_api_call(count_request_url, headers) + total_records = response.json() + regex_search_pattern = '"projectDescription"' + await Util.stream_response_to_file(output_file, total_records, regex_search_pattern, request_url, headers) def get_reanalysis_projects_by_accession(self, accession): """ diff --git a/pridepy/util/api_handling.py b/pridepy/util/api_handling.py index 8d1951a..c1637d2 100644 --- a/pridepy/util/api_handling.py +++ b/pridepy/util/api_handling.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import re import sys import httpx @@ -6,6 +7,7 @@ import logging from ratelimit import limits, sleep_and_retry from requests.adapters import HTTPAdapter +from tqdm import tqdm from urllib3.util.retry import Retry @@ -35,22 +37,27 @@ def get_api_call(url, headers=None): @staticmethod @sleep_and_retry @limits(calls=1000, period=50) - async def stream_response_to_file(out_file, url, headers=None): - async with httpx.AsyncClient() as client: - # Use a GET request with stream=True to handle streaming responses - async with client.stream("GET", url, headers=headers) as response: - # Check if the response is successful - response.raise_for_status() - try: - cfile = open(out_file, 'w') - # Iterate over the streaming content line by line - async for line in response.aiter_lines(): - if line: # Avoid printing empty lines (common with text/event-stream) - cfile.write(line) - cfile.close() - except PermissionError as e: - print("[ERROR] No permissions to write to:", out_file) - sys.exit(1) + async def stream_response_to_file(out_file, total_records, regex_search_pattern, url, headers=None): + # Initialize the progress bar + with tqdm(total=total_records, unit_scale=True) as pbar: + async with httpx.AsyncClient() as client: + # Use a GET request with stream=True to handle streaming responses + async with client.stream("GET", url, headers=headers) as response: + # Check if the response is successful + response.raise_for_status() + try: + cfile = open(out_file, 'w') + # Iterate over the streaming content line by line + async for line in response.aiter_lines(): + if line: # Avoid printing empty lines (common with text/event-stream) + cfile.write(line + "\n") + # Check if the pattern exists in the string + if re.search(regex_search_pattern, line): + pbar.update(1) # Update progress bar by 1 for each detection + cfile.close() + except PermissionError as e: + print("[ERROR] No permissions to write to:", out_file) + sys.exit(1) @staticmethod @sleep_and_retry From ee6c69d25f5401cd75a20b0f80ab2c526fe4995f Mon Sep 17 00:00:00 2001 From: Chakradhar Bandla Date: Tue, 15 Oct 2024 18:28:27 +0100 Subject: [PATCH 3/7] minor var --- pridepy/files/files.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 09f4ec4..0de92ce 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -50,7 +50,7 @@ class Files: This class handles PRIDE API files endpoint. """ - V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3/" + V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3" API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2" API_PRIVATE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2" PRIDE_ARCHIVE_FTP = "ftp.pride.ebi.ac.uk" @@ -68,11 +68,11 @@ async def stream_all_files_metadata(self, output_file, accession=None): get stream all project files from PRIDE API in JSON format """ if accession is None: - request_url = ("{0}files/all".format(self.V3_API_BASE_URL)) - count_request_url = ("{0}files/count".format(self.V3_API_BASE_URL)) + request_url = ("{0}/files/all".format(self.V3_API_BASE_URL)) + count_request_url = ("{0}/files/count".format(self.V3_API_BASE_URL)) else: - request_url = ("{0}projects/{1}/files/all".format(self.V3_API_BASE_URL, accession)) - count_request_url = ("{0}projects/{1}/files/count".format(self.V3_API_BASE_URL, accession)) + request_url = ("{0}/projects/{1}/files/all".format(self.V3_API_BASE_URL, accession)) + count_request_url = ("{0}/projects/{1}/files/count".format(self.V3_API_BASE_URL, accession)) headers = {"Accept": "application/JSON"} response = Util.get_api_call(count_request_url, headers) total_records = response.json() From 1b7beb539b967c70cee59e9eea4584527903c925 Mon Sep 17 00:00:00 2001 From: Chakra Date: Tue, 15 Oct 2024 18:40:39 +0100 Subject: [PATCH 4/7] f-strings instead of format Co-authored-by: codiumai-pr-agent-pro[bot] <151058649+codiumai-pr-agent-pro[bot]@users.noreply.github.com> --- pridepy/files/files.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index 0de92ce..c4c2e3d 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -68,8 +68,8 @@ async def stream_all_files_metadata(self, output_file, accession=None): get stream all project files from PRIDE API in JSON format """ if accession is None: - request_url = ("{0}/files/all".format(self.V3_API_BASE_URL)) - count_request_url = ("{0}/files/count".format(self.V3_API_BASE_URL)) + request_url = f"{self.V3_API_BASE_URL}/files/all" + count_request_url = f"{self.V3_API_BASE_URL}/files/count" else: request_url = ("{0}/projects/{1}/files/all".format(self.V3_API_BASE_URL, accession)) count_request_url = ("{0}/projects/{1}/files/count".format(self.V3_API_BASE_URL, accession)) From 1946da7763b8145018958d10d6a3faa755fa1f40 Mon Sep 17 00:00:00 2001 From: Chakradhar Bandla Date: Tue, 15 Oct 2024 18:44:57 +0100 Subject: [PATCH 5/7] f-strings instead of forma --- pridepy/files/files.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pridepy/files/files.py b/pridepy/files/files.py index c4c2e3d..76fe6da 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -71,8 +71,8 @@ async def stream_all_files_metadata(self, output_file, accession=None): request_url = f"{self.V3_API_BASE_URL}/files/all" count_request_url = f"{self.V3_API_BASE_URL}/files/count" else: - request_url = ("{0}/projects/{1}/files/all".format(self.V3_API_BASE_URL, accession)) - count_request_url = ("{0}/projects/{1}/files/count".format(self.V3_API_BASE_URL, accession)) + request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/all" + count_request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/count" headers = {"Accept": "application/JSON"} response = Util.get_api_call(count_request_url, headers) total_records = response.json() From ba7b52c9588f7fabe63f0b607473839dc7f46748 Mon Sep 17 00:00:00 2001 From: Chakradhar Bandla Date: Tue, 15 Oct 2024 18:51:55 +0100 Subject: [PATCH 6/7] file operations with context manager --- pridepy/util/api_handling.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pridepy/util/api_handling.py b/pridepy/util/api_handling.py index c1637d2..7f7ace2 100644 --- a/pridepy/util/api_handling.py +++ b/pridepy/util/api_handling.py @@ -46,15 +46,14 @@ async def stream_response_to_file(out_file, total_records, regex_search_pattern, # Check if the response is successful response.raise_for_status() try: - cfile = open(out_file, 'w') - # Iterate over the streaming content line by line - async for line in response.aiter_lines(): - if line: # Avoid printing empty lines (common with text/event-stream) - cfile.write(line + "\n") - # Check if the pattern exists in the string - if re.search(regex_search_pattern, line): - pbar.update(1) # Update progress bar by 1 for each detection - cfile.close() + with open(out_file, 'w') as cfile: + # Iterate over the streaming content line by line + async for line in response.aiter_lines(): + if line: # Avoid printing empty lines (common with text/event-stream) + cfile.write(line + "\n") + # Check if the pattern exists in the string + if re.search(regex_search_pattern, line): + pbar.update(1) # Update progress bar by 1 for each detection except PermissionError as e: print("[ERROR] No permissions to write to:", out_file) sys.exit(1) From 67fd0b12f9cb4be73c37ac7c41686c32c60d2239 Mon Sep 17 00:00:00 2001 From: Chakradhar Bandla Date: Tue, 15 Oct 2024 20:07:39 +0100 Subject: [PATCH 7/7] update README.md about stream API --- README.md | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6e4a6ad..d41ce93 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,6 @@ $ pridepy download-all-public-raw-files -a PXD012353 -o /Users/yourname/Download ``` Download single file by name: - ```bash $ pridepy download-file-by-name -a PXD022105 -o /Users/yourname/Downloads/foldername/ -f checksum.txt -p globus ``` @@ -58,15 +57,27 @@ $ pridepy download-file-by-name -a PXD022105 -o /Users/yourname/Downloads/folder >**NOTE**: Currently we use Globus URLs (when `-p globus` is used) via HTTPS, not the Globus protocol. For more information about Globus, see [Globus documentation](https://www.globus.org/data-transfer). Search projects with keywords and filters - ```bash $ pridepy search-projects-by-keywords-and-filters --keyword accession:PXD012353 ``` -Search files with filters +Search files with filters ```bash $ pridepy get-files-by-filter --filter fileCategory.value==RAW ``` + +Stream metadata of all projects as json and write it to a file +```bash +$ pridepy stream-projects-metadata -o all_pride_projects.json +``` + +Stream metadata of all files as json and write it to a file. Project accession can be specified as an optional parameter +```bash +$ pridepy stream-files-metadata -o all_pride_files.json +OR +$ pridepy stream-files-metadata -o PXD005011_files.json -a PXD005011 +``` + Use the below command to view a list of commands available: ```bash @@ -83,7 +94,10 @@ Commands: get-files-by-project-accession get files by project accession... get-private-files Get private files by project... get-projects get paged projects :return: - get-projects-by-accession get projects by accession... + get-projects-by-accession get projects by accession... + stream-files-metadata Stream all files metadata in... + stream-projects-metadata Stream all projects metadata... + ``` # NOTE