Skip to content

Commit

Permalink
Merge pull request #44 from PRIDE-Archive/stream_apis
Browse files Browse the repository at this point in the history
Stream apis
  • Loading branch information
ypriverol authored Oct 15, 2024
2 parents 7751f9e + 67fd0b1 commit b1c55b5
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 9 deletions.
22 changes: 18 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,23 +50,34 @@ $ pridepy download-all-public-raw-files -a PXD012353 -o /Users/yourname/Download
```

Download single file by name:

```bash
$ pridepy download-file-by-name -a PXD022105 -o /Users/yourname/Downloads/foldername/ -f checksum.txt -p globus
```

> **NOTE**: Currently we use Globus URLs (when `-p globus` is used) via HTTPS, not the Globus protocol. For more information about Globus, see [Globus documentation](https://www.globus.org/data-transfer).

Search projects with keywords and filters:

```bash
$ pridepy search-projects-by-keywords-and-filters --keyword accession:PXD012353
```
Search files with filters

Search files with filters
```bash
$ pridepy get-files-by-filter --filter fileCategory.value==RAW
```

Stream the metadata of all projects as JSON and write it to a file:
```bash
$ pridepy stream-projects-metadata -o all_pride_projects.json
```

Stream the metadata of all files as JSON and write it to a file. A project accession can optionally be given with `-a` to restrict the output to the files of that project:
```bash
$ pridepy stream-files-metadata -o all_pride_files.json
# OR, for the files of a single project:
$ pridepy stream-files-metadata -o PXD005011_files.json -a PXD005011
```

Use the command below to view the list of available commands:

```bash
Expand All @@ -83,7 +94,10 @@ Commands:
get-files-by-project-accession get files by project accession...
get-private-files Get private files by project...
get-projects get paged projects :return:
get-projects-by-accession get projects by accession...
get-projects-by-accession get projects by accession...
stream-files-metadata Stream all files metadata in...
stream-projects-metadata Stream all projects metadata...

```
# NOTE

Expand Down
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ dependencies:
- boto3
- botocore
- tqdm
- urllib3
- urllib3
- httpx
18 changes: 18 additions & 0 deletions pridepy/files/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class Files:
This class handles PRIDE API files endpoint.
"""

V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3"
API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2"
API_PRIVATE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2"
PRIDE_ARCHIVE_FTP = "ftp.pride.ebi.ac.uk"
Expand All @@ -62,6 +63,23 @@ class Files:
def __init__(self):
pass

async def stream_all_files_metadata(self, output_file, accession=None):
    """
    Stream the metadata of PRIDE files (JSON) from the v3 API into ``output_file``.

    When ``accession`` is given only the files of that project are streamed,
    otherwise the files of the whole archive are. The matching ``count``
    endpoint is queried first; its result is passed on as the total for the
    progress report of the download.
    """
    # Both the "count" and the "all" endpoint live under the same prefix,
    # which is either archive-wide or scoped to a single project.
    if accession is not None:
        files_scope = f"{self.V3_API_BASE_URL}/projects/{accession}/files"
    else:
        files_scope = f"{self.V3_API_BASE_URL}/files"

    json_headers = {"Accept": "application/JSON"}
    record_count = Util.get_api_call(f"{files_scope}/count", json_headers).json()

    # A line containing the "fileName" key is counted as one file record.
    await Util.stream_response_to_file(
        output_file, record_count, '"fileName"', f"{files_scope}/all", json_headers
    )

def get_all_paged_files(
self, query_filter, page_size, page, sort_direction, sort_conditions
):
Expand Down
39 changes: 38 additions & 1 deletion pridepy/pridepy.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3

import asyncio
import logging
import click
from pridepy.files.files import Files
Expand Down Expand Up @@ -254,6 +254,43 @@ def search_projects_by_keywords_and_filters(
)


@main.command()
@click.option("-o", "--output_file", required=True, help="output file to save all the projects metadata")
def stream_projects_metadata(output_file):
    """
    Stream all projects metadata in JSON format to a file
    :return:
    """
    # The docstring above is what click shows as this command's help text,
    # so it is user-facing output rather than internal documentation.
    # stream_all_projects is a coroutine; asyncio.run drives it to completion.
    asyncio.run(Project().stream_all_projects(output_file))


@main.command()
@click.option("-o", "--output_file", required=True, help="output file to save all the files metadata")
@click.option("-a", "--accession", required=False, help="project accession")
def stream_files_metadata(accession, output_file):
    """
    Stream all files metadata in JSON format and write it to a file
    :return:
    """
    # The docstring above is what click shows as this command's help text,
    # so it is user-facing output rather than internal documentation.
    # When -a/--accession is omitted the value is None, which
    # stream_all_files_metadata treats as "all files of the archive".
    asyncio.run(Files().stream_all_files_metadata(output_file, accession))

@main.command()
@click.option(
"-ps",
Expand Down
14 changes: 14 additions & 0 deletions pridepy/project/project.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python

from pridepy.authentication.authentication import Authentication
from pridepy.util.api_handling import Util

Expand All @@ -9,6 +10,7 @@ class Project:
"""

API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2/"
V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3/"
PRIVATE_API_BASE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2/"

def __init__(self):
Expand Down Expand Up @@ -39,6 +41,18 @@ def get_projects(self, page_size, page, sort_direction, sort_conditions):
response = Util.get_api_call(request_url, headers)
return response.json()

async def stream_all_projects(self, output_file):
    """
    Stream the metadata of all PRIDE projects (JSON) from the v3 API into ``output_file``.

    The ``projects/count`` endpoint is queried first; its result is passed on
    as the total for the progress report of the download.
    """
    json_headers = {"Accept": "application/JSON"}
    v3_base = self.V3_API_BASE_URL
    project_count = Util.get_api_call(v3_base + "projects/count", json_headers).json()

    # A line containing the "projectDescription" key is counted as one project record.
    await Util.stream_response_to_file(
        output_file, project_count, '"projectDescription"', v3_base + "projects/all", json_headers
    )

def get_reanalysis_projects_by_accession(self, accession):
"""
search PRIDE projects by reanalysis accession
Expand Down
28 changes: 28 additions & 0 deletions pridepy/util/api_handling.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
#!/usr/bin/env python
import re
import sys

import httpx
import requests
import logging
from ratelimit import limits, sleep_and_retry
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3.util.retry import Retry


Expand All @@ -30,6 +34,30 @@ def get_api_call(url, headers=None):
)
return response

@staticmethod
@sleep_and_retry
@limits(calls=1000, period=50)
async def stream_response_to_file(out_file, total_records, regex_search_pattern, url, headers=None):
    """
    Stream the body of a GET request into a file, line by line.

    Every non-empty line of the response is written to ``out_file`` followed
    by a newline. A tqdm progress bar with ``total_records`` as its total is
    advanced by one for each written line that matches ``regex_search_pattern``.

    :param out_file: path of the file to write; an existing file is overwritten
    :param total_records: expected number of records, used as the progress bar total
    :param regex_search_pattern: regular expression marking a line that counts as one record
    :param url: URL to request
    :param headers: optional HTTP headers sent with the request
    :raises httpx.HTTPStatusError: if the server answers with an error status

    If ``out_file`` cannot be opened because of missing permissions, an error
    message is printed and the process exits with status 1.
    """
    # Compile once up front instead of handing the raw pattern to re.search
    # for every streamed line.
    record_marker = re.compile(regex_search_pattern)

    with tqdm(total=total_records, unit_scale=True) as pbar:
        async with httpx.AsyncClient() as client:
            # client.stream() keeps the body on the wire until it is iterated,
            # so the whole response is never held in memory at once.
            async with client.stream("GET", url, headers=headers) as response:
                # Fail before touching the output file if the request was rejected.
                response.raise_for_status()
                try:
                    # Explicit UTF-8: the payload is JSON and may contain
                    # non-ASCII text that a locale-dependent default encoding
                    # cannot represent.
                    with open(out_file, "w", encoding="utf-8") as cfile:
                        async for line in response.aiter_lines():
                            if not line:
                                # Skip empty lines in the streamed body.
                                continue
                            cfile.write(line + "\n")
                            if record_marker.search(line):
                                pbar.update(1)
                except PermissionError:
                    print("[ERROR] No permissions to write to:", out_file)
                    sys.exit(1)

@staticmethod
@sleep_and_retry
@limits(calls=1000, period=50)
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ plotly
boto3
botocore
tqdm
urllib3
urllib3
httpx
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

setup(
name="pridepy",
version="0.0.4",
version="0.0.5",
author="PRIDE Team",
author_email="pride-report@ebi.ac.uk",
author_email="pride-support@ebi.ac.uk",
description="Python Client library for PRIDE Rest API",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down

0 comments on commit b1c55b5

Please sign in to comment.