Merge pull request #44 from PRIDE-Archive/stream_apis

Stream apis
PRIDE-Archive · Oct 15, 2024 · b1c55b5 · b1c55b5
2 parents 7751f9e + 67fd0b1
commit b1c55b5
Show file tree

Hide file tree

Showing 8 changed files with 122 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -50,23 +50,34 @@ $ pridepy download-all-public-raw-files -a PXD012353 -o /Users/yourname/Download
 ```
 
 Download single file by name:
-
 ```bash
 $ pridepy download-file-by-name -a PXD022105 -o /Users/yourname/Downloads/foldername/ -f checksum.txt -p globus
 ```
 
 >**NOTE**: Currently we use Globus URLs (when `-p globus` is used) via HTTPS, not the Globus protocol. For more information about Globus, see [Globus documentation](https://www.globus.org/data-transfer).
 
 Search projects with keywords and filters
-
 ```bash
 $ pridepy search-projects-by-keywords-and-filters --keyword accession:PXD012353
 ```
-Search files with filters
 
+Search files with filters
 ```bash
 $ pridepy get-files-by-filter --filter fileCategory.value==RAW
 ```
+
+Stream metadata of all projects as JSON and write it to a file:
+```bash
+$ pridepy stream-projects-metadata -o all_pride_projects.json
+```
+
+Stream metadata of all files as JSON and write it to a file. A project accession can optionally be given with `-a` to stream only the files of that project:
+```bash
+$ pridepy stream-files-metadata -o all_pride_files.json
+# OR
+$ pridepy stream-files-metadata -o PXD005011_files.json -a PXD005011
+```
+
 Use the below command to view a list of commands available:
 
 ```bash
@@ -83,7 +94,10 @@ Commands:
   get-files-by-project-accession  get files by project accession...
   get-private-files               Get private files by project...
   get-projects                    get paged projects :return:
-  get-projects-by-accession       get projects by accession...     
+  get-projects-by-accession       get projects by accession... 
+  stream-files-metadata           Stream all files metadata in...
+  stream-projects-metadata        Stream all projects metadata...
+
 ```
 # NOTE
 

diff --git a/environment.yml b/environment.yml
@@ -13,4 +13,5 @@ dependencies:
   - boto3
   - botocore
   - tqdm
-  - urllib3
+  - urllib3
+  - httpx
diff --git a/pridepy/files/files.py b/pridepy/files/files.py
@@ -50,6 +50,7 @@ class Files:
     This class handles PRIDE API files endpoint.
     """
 
+    V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3"
     API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2"
     API_PRIVATE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2"
     PRIDE_ARCHIVE_FTP = "ftp.pride.ebi.ac.uk"
@@ -62,6 +63,23 @@ class Files:
     def __init__(self):
         pass
 
+    async def stream_all_files_metadata(self, output_file, accession=None):
+        """
+        Stream files metadata from the PRIDE v3 API and write it to a file.
+
+        :param output_file: file the streamed metadata is written to
+        :param accession: optional project accession; when given, the
+            project-specific files endpoints are used instead of the global ones
+        """
+        # Both endpoints share the same prefix: the API root, or the project
+        # below it when an accession was supplied.
+        base_url = self.V3_API_BASE_URL
+        if accession is not None:
+            base_url = f"{base_url}/projects/{accession}"
+        request_url = f"{base_url}/files/all"
+        count_request_url = f"{base_url}/files/count"
+
+        headers = {"Accept": "application/JSON"}
+        # The decoded count response is handed over as the expected total.
+        total_records = Util.get_api_call(count_request_url, headers).json()
+
+        # Every streamed line containing "fileName" is counted as one record.
+        await Util.stream_response_to_file(output_file, total_records, '"fileName"', request_url, headers)
+
     def get_all_paged_files(
         self, query_filter, page_size, page, sort_direction, sort_conditions
     ):

diff --git a/pridepy/pridepy.py b/pridepy/pridepy.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-
+import asyncio
 import logging
 import click
 from pridepy.files.files import Files
@@ -254,6 +254,43 @@ def search_projects_by_keywords_and_filters(
     )
 
 
+@main.command()
+@click.option("-o", "--output_file", required=True, help="output file to save all the projects metadata")
+def stream_projects_metadata(output_file):
+    """
+    Stream all projects metadata in JSON format to a file
+    :return:
+    """
+    # The docstring above is shown by click as the command help, so it is
+    # kept verbatim.
+    # stream_all_projects is a coroutine function; asyncio.run drives the
+    # returned coroutine to completion.
+    streaming_job = Project().stream_all_projects(output_file)
+    asyncio.run(streaming_job)
+
+
+@main.command()
+@click.option("-o", "--output_file", required=True, help="output file to save all the files metadata")
+@click.option("-a", "--accession", required=False, help="project accession")
+def stream_files_metadata(accession, output_file):
+    """
+    Stream all files metadata in JSON format and write it to a file
+    :return:
+    """
+    # The docstring above is shown by click as the command help, so it is
+    # kept verbatim.
+    # accession is None when -a is not given; the Files method then streams
+    # the metadata of all files instead of a single project's files.
+    streaming_job = Files().stream_all_files_metadata(output_file, accession)
+    asyncio.run(streaming_job)
+
 @main.command()
 @click.option(
     "-ps",

diff --git a/pridepy/project/project.py b/pridepy/project/project.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+
 from pridepy.authentication.authentication import Authentication
 from pridepy.util.api_handling import Util
 
@@ -9,6 +10,7 @@ class Project:
     """
 
     API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2/"
+    V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3/"
     PRIVATE_API_BASE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2/"
 
     def __init__(self):
@@ -39,6 +41,18 @@ def get_projects(self, page_size, page, sort_direction, sort_conditions):
         response = Util.get_api_call(request_url, headers)
         return response.json()
 
+    async def stream_all_projects(self, output_file):
+        """
+        Stream all projects from the PRIDE v3 API and write them to a file.
+
+        :param output_file: file the streamed projects are written to
+        """
+        headers = {"Accept": "application/JSON"}
+        base_url = self.V3_API_BASE_URL
+        # The decoded count response is handed over as the expected total.
+        total_records = Util.get_api_call(f"{base_url}projects/count", headers).json()
+        # Every streamed line containing "projectDescription" is counted as
+        # one record.
+        await Util.stream_response_to_file(
+            output_file,
+            total_records,
+            '"projectDescription"',
+            f"{base_url}projects/all",
+            headers,
+        )
+
     def get_reanalysis_projects_by_accession(self, accession):
         """
         search PRIDE projects by reanalysis accession

diff --git a/pridepy/util/api_handling.py b/pridepy/util/api_handling.py
@@ -1,9 +1,13 @@
 #!/usr/bin/env python
+import re
+import sys
 
+import httpx
 import requests
 import logging
 from ratelimit import limits, sleep_and_retry
 from requests.adapters import HTTPAdapter
+from tqdm import tqdm
 from urllib3.util.retry import Retry
 
 
@@ -30,6 +34,30 @@ def get_api_call(url, headers=None):
             )
         return response
 
+    @staticmethod
+    @sleep_and_retry
+    @limits(calls=1000, period=50)
+    async def stream_response_to_file(out_file, total_records, regex_search_pattern, url, headers=None):
+        """
+        Stream the response of a GET request to a file, line by line.
+
+        Non-empty lines are written to ``out_file`` followed by a newline. A
+        progress bar is advanced by one for every written line in which
+        ``regex_search_pattern`` is found.
+
+        :param out_file: path of the file to write (opened in write mode, UTF-8)
+        :param total_records: expected number of records, used as the progress bar total
+        :param regex_search_pattern: regular expression searched in each line to count a record
+        :param url: URL to request
+        :param headers: optional HTTP headers sent with the request
+        :raises httpx.HTTPStatusError: if the response has an error status
+        :return: None; exits the process with status 1 if the file cannot be written
+        """
+        # Compile once instead of resolving the pattern for every streamed line
+        record_pattern = re.compile(regex_search_pattern)
+        # Initialize the progress bar
+        with tqdm(total=total_records, unit_scale=True) as pbar:
+            # NOTE(review): no timeout is configured, so httpx's default applies;
+            # confirm it is adequate for long-running streams.
+            async with httpx.AsyncClient() as client:
+                # client.stream() yields the response without loading the whole body first
+                async with client.stream("GET", url, headers=headers) as response:
+                    # Check if the response is successful
+                    response.raise_for_status()
+                    try:
+                        # Explicit UTF-8: the platform default encoding may not be
+                        # able to represent every character of the streamed text
+                        with open(out_file, "w", encoding="utf-8") as cfile:
+                            # Iterate over the streaming content line by line
+                            async for line in response.aiter_lines():
+                                if not line:
+                                    # Skip empty lines (common with text/event-stream)
+                                    continue
+                                cfile.write(line + "\n")
+                                # Each line matching the pattern counts as one record
+                                if record_pattern.search(line):
+                                    pbar.update(1)
+                    except PermissionError:
+                        print("[ERROR] No permissions to write to:", out_file)
+                        sys.exit(1)
+
     @staticmethod
     @sleep_and_retry
     @limits(calls=1000, period=50)

diff --git a/requirements.txt b/requirements.txt
@@ -7,4 +7,5 @@ plotly
 boto3
 botocore
 tqdm
-urllib3
+urllib3
+httpx
diff --git a/setup.py b/setup.py
@@ -6,9 +6,9 @@
 
 setup(
     name="pridepy",
-    version="0.0.4",
+    version="0.0.5",
     author="PRIDE Team",
-    author_email="pride-report@ebi.ac.uk",
+    author_email="pride-support@ebi.ac.uk",
     description="Python Client library for PRIDE Rest API",
     long_description=long_description,
     long_description_content_type="text/markdown",