Skip to content

Commit

Permalink
refactor(io): simplify resource interface (#93)
Browse files Browse the repository at this point in the history
* refactor(io): do not enforce buffering

* doc(io): better docstrings

* refactor(io): Resource.open always returns io.RawIOBase

* test(io): rm unneeded import, type checks

* test(io): rm useless check

* refactor(io): rm mode from LocalResource.open args

* fix(io): skip empty elements in IterStream

* refactor(io): simplify readinto logic
  • Loading branch information
cmdoret authored Oct 20, 2023
1 parent 4740ff7 commit 012475b
Showing 1 changed file with 51 additions and 47 deletions.
98 changes: 51 additions & 47 deletions gimie/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,102 +4,106 @@
import os
from pathlib import Path
import requests
from typing import Optional, Union
from typing import Iterator, Optional, Union


class Resource:
"""Abstract class for buffered read-only access to local or remote resources via
a file-like interface."""
"""Abstract class for read-only access to local or remote resources via
a file-like interface.
Parameters
----------
name:
The name of the resource, typically the filename.
"""

name: str

def open(self) -> io.BufferedReader:
def open(self) -> io.RawIOBase:
raise NotImplementedError


class LocalResource(Resource):
"""Providing buffered read-only access to local data.
Parameters
----------
name: the name of the resource, typically the filename.
url: the URL where the resource can be downloaded from.
headers: optional headers to pass to the request.
"""Provides read-only access to local data via a file-like interface.
Examples
--------
>>> from gimie.io import LocalResource
>>> resource = LocalResource("README.md")
"""

def __init__(self, path: Union[str, os.PathLike]):
self.path: Path = Path(path)

def open(self, mode="r") -> io.BufferedReader:
return io.BufferedReader(io.FileIO(self.path, mode))
def open(self) -> io.RawIOBase:
return io.FileIO(self.path, mode="r")

@property
def name(self) -> str:
return self.path.name


class RemoteResource(Resource):
"""Provides buffered read-only access to remote data.
"""Provides read-only access to remote data via a file-like interface.
Parameters
----------
name: the name of the resource, typically the filename.
url: the URL where the resource can be downloaded from.
headers: optional headers to pass to the request.
url:
The URL where the resource can be downloaded from.
headers:
Optional headers to pass to the request.
Examples
--------
>>> from gimie.io import RemoteResource
>>> url = "https://raw.githubusercontent.com/SDSC-ORD/gimie/main/README.md"
>>> resource = RemoteResource("README.md", url)
>>> content = RemoteResource("README.md", url).open().read()
>>> assert isinstance(content, bytes)
"""

def __init__(self, name: str, url: str, headers: Optional[dict] = None):
self.name = name
self.url = url
self.headers = headers or {}

def open(self) -> io.BufferedReader:
def open(self) -> io.RawIOBase:
resp = requests.get(
self.url, headers=self.headers, stream=True
).iter_content(chunk_size=128)
return iterable_to_stream(resp)
return IterStream(resp)


def iterable_to_stream(
iterable, buffer_size=io.DEFAULT_BUFFER_SIZE
) -> io.BufferedReader:
"""
Converts an iterable yielding bytestrings to a read-only input stream.
Lets you use an iterable (e.g. a generator) that yields bytestrings as a read-only
input stream.
class IterStream(io.RawIOBase):
"""Wraps an iterator under a file-like interface.
Empty elements in the iterator are ignored.
The stream implements Python 3's newer I/O API (available in Python 2's io module).
For efficiency, the stream is buffered.
Parameters
----------
iterator:
An iterator yielding bytes.
credits: https://stackoverflow.com/a/20260030/8440675
Examples
--------
>>> stream = IterStream(iter([b"Hello ", b"", b"World"]))
>>> stream.read()
b'Hello World'
"""

class IterStream(io.RawIOBase):
def __init__(self):
self.leftover = ""

def readable(self):
return True

def readinto(self, b):
try:
l = len(b) # We're supposed to return at most this much
chunk = self.leftover or next(iterable)
def __init__(self, iterator: Iterator[bytes]):
self.leftover = b""
self.iterator = iterator

def readable(self):
return True

def readinto(self, b):
try:
l = len(b) # We're supposed to return at most this much
while True:
chunk = self.leftover or next(self.iterator)
# skip empty elements
if not chunk:
continue
output, self.leftover = chunk[:l], chunk[l:]
b[: len(output)] = output
return len(output)
except StopIteration:
return 0 # indicate EOF

return io.BufferedReader(IterStream(), buffer_size=buffer_size)
except StopIteration:
return 0 # indicate EOF

0 comments on commit 012475b

Please sign in to comment.