Skip to content

Commit

Permalink
Merge pull request internetarchive#9386 from cdrini/9372/fix/standard…
Browse files Browse the repository at this point in the history
…-ebooks-auth

Add access key for standard ebooks OPDS feed
  • Loading branch information
mekarpeles authored Jun 4, 2024
2 parents caa28ed + a248925 commit ac58113
Show file tree
Hide file tree
Showing 2 changed files with 183 additions and 115 deletions.
148 changes: 33 additions & 115 deletions scripts/import_standard_ebooks.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
#!/usr/bin/env python
import json
import requests
from requests.auth import AuthBase, HTTPBasicAuth
import time
from typing import Any

from os import path

import feedparser

from openlibrary.core.imports import Batch
Expand All @@ -14,43 +13,47 @@
from infogami import config

FEED_URL = 'https://standardebooks.org/opds/all'
LAST_UPDATED_TIME = './standard_ebooks_last_updated.txt'
IMAGE_REL = 'http://opds-spec.org/image'
BASE_SE_URL = 'https://standardebooks.org'


def get_feed(auth: AuthBase):
    """Fetches and returns Standard Ebook's feed.

    :param auth: requests auth handler sent with the GET request to FEED_URL.
    :returns: whatever ``feedparser.parse`` produces for the response body.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    # Without a timeout a stalled connection would hang the import job forever.
    with requests.get(FEED_URL, auth=auth, stream=True, timeout=60) as r:
        r.raise_for_status()
        # r.raw yields the bytes as they came over the wire; have urllib3 undo
        # any Content-Encoding (gzip/deflate) so the parser receives plain XML.
        r.raw.decode_content = True
        return feedparser.parse(r.raw, response_headers=r.headers)


def map_data(entry: dict) -> dict[str, Any]:
    """Maps Standard Ebooks feed entry to an Open Library import object.

    :param entry: feed entry providing the keys ``id``, ``title``,
        ``published``, ``authors``, ``content``, ``tags``, ``links`` and
        (optionally) ``dcterms_language``.
    :returns: import record; ``cover`` is only present when the entry has a
        link whose ``rel`` equals IMAGE_REL.
    :raises ValueError: if the entry language is missing or does not start
        with ``en-``, or if the cover link is not an absolute https URL.
    """
    # Only strip the prefix when it really is a prefix of the id.
    std_ebooks_id = entry['id'].removeprefix('https://standardebooks.org/ebooks/')

    # Standard ebooks only has English works at this time ; because we don't have an
    # easy way to translate the language codes they store in the feed to the MARC
    # language codes, we're just gonna handle English for now, and have it error
    # if Standard Ebooks ever adds non-English works.
    lang = entry.get('dcterms_language')
    if not lang or not lang.startswith('en-'):
        raise ValueError(f'Feed entry language {lang} is not supported.')

    import_record = {
        "title": entry['title'],
        "source_records": [f"standard_ebooks:{std_ebooks_id}"],
        "publishers": ['Standard Ebooks'],
        # 'published' starts with the four-digit year.
        "publish_date": entry['published'][0:4],
        "authors": [{"name": author['name']} for author in entry['authors']],
        "description": entry['content'][0]['value'],
        "subjects": [tag['term'] for tag in entry['tags']],
        "identifiers": {"standard_ebooks": [std_ebooks_id]},
        "languages": ['eng'],
    }

    # First link tagged as the full-size image, if any; links without a rel
    # are simply not cover candidates.
    cover_url = next(
        (link['href'] for link in entry['links'] if link.get('rel') == IMAGE_REL),
        None,
    )
    if cover_url:
        # This used to be a relative URL; ensure the API doesn't change.
        # Raised explicitly (not asserted) so the check survives `python -O`.
        if not cover_url.startswith('https://'):
            raise ValueError(
                f'Unexpected cover URL {cover_url!r}; expected an absolute https:// URL.'
            )
        import_record['cover'] = cover_url

    return import_record

Expand All @@ -68,67 +71,6 @@ def create_batch(records: list[dict[str, str]]) -> None:
batch.add_items([{'ia_id': r['source_records'][0], 'data': r} for r in records])


def get_last_updated_time() -> str | None:
    """Gets date of last import job.

    Last updated dates are read from the first line of the local file named
    by LAST_UPDATED_TIME. The date is expected to be in HTTP-date format:
    https://httpwg.org/specs/rfc7231.html#http.date

    returns last updated date string, or None if the file does not exist or
    its first line is blank.
    """
    # Open directly instead of checking path.exists() first: the file could
    # disappear between the check and the open.
    try:
        with open(LAST_UPDATED_TIME) as f:
            # Drop the line terminator so the value can compare equal to a
            # Last-Modified header value; treat a blank line as "no date".
            return f.readline().strip() or None
    except FileNotFoundError:
        return None


def find_last_updated() -> str | None:
    """Fetches and returns Standard Ebooks most recent update date.

    Issues a HEAD request to FEED_URL.

    Returns None if the request was not successful or if the last modified
    date is not included in the response headers.
    """
    # Without a timeout a stalled connection would hang the import job forever.
    r = requests.head(FEED_URL, timeout=60)
    if not r.ok:
        return None
    # .get() keeps the documented contract: a missing header yields None
    # rather than a KeyError.
    return r.headers.get('last-modified')


def convert_date_string(date_string: str | None) -> time.struct_time:
"""Converts HTTP-date format string into a struct_time object.
The date_string will be formatted similarly to this:
Fri, 05 Nov 2021 03:50:24 GMT
returns struct_time representation of the given time, or the
epoch if no time given.
>>> str(convert_date_string(None)) # doctest: +NORMALIZE_WHITESPACE
'time.struct_time(tm_year=1970, tm_mon=1, tm_mday=1, tm_hour=0,
tm_min=0, tm_sec=0, tm_wday=3, tm_yday=1, tm_isdst=0)'
>>> convert_date_string("") # doctest: +ELLIPSIS
time.struct_time(tm_year=1970, tm_mon=1, tm_mday=1, tm_hour=0, ...
>>> convert_date_string(0) # doctest: +ELLIPSIS
time.struct_time(tm_year=1970, tm_mon=1, tm_mday=1, tm_hour=0, ...
>>> convert_date_string("Fri, 05 Nov 2021 03:50:24 GMT") # doctest: +ELLIPSIS
time.struct_time(tm_year=2021, tm_mon=11, tm_mday=5, tm_hour=3, tm_min=50, ...
"""
if not date_string:
return time.gmtime(0)
return time.strptime(date_string[5:-4], '%d %b %Y %H:%M:%S')


def filter_modified_since(
    entries, modified_since: time.struct_time
) -> list[dict[str, str]]:
    """Build import objects for the entries updated after ``modified_since``.

    An entry is kept only when its ``updated_parsed`` value compares strictly
    greater than ``modified_since``; each kept entry is passed through
    ``map_data``. Input order is preserved.
    """
    import_records = []
    for feed_entry in entries:
        if feed_entry.updated_parsed > modified_since:
            import_records.append(map_data(feed_entry))
    return import_records


def import_job(
ol_config: str,
dry_run: bool = False,
Expand All @@ -139,45 +81,21 @@ def import_job(
"""
load_config(ol_config)

# Make HEAD request to get last-modified time
last_modified = find_last_updated()

if not last_modified:
print(f'HEAD request to {FEED_URL} failed. Not attempting GET request.')
if not config.get('standard_ebooks_key'):
print('Standard Ebooks key not found in config. Exiting.')
return

print(f'Last-Modified date: {last_modified}')

updated_on = get_last_updated_time()
if last_modified == updated_on:
print(f'No new updates since {updated_on}. Processing completed.')
return

print(f'Last import job: {updated_on or "No date found"}')
# Get feed:
d = get_feed()

# Create datetime using updated_on:
modified_since = convert_date_string(updated_on)

# Map feed entries to list of import objects:
print(f'Importing all entries that have been updated since {modified_since}.')
modified_entries = filter_modified_since(d.entries, modified_since)
print(f'{len(modified_entries)} import objects created.')
auth = HTTPBasicAuth(config.get('standard_ebooks_key'), '')
feed = map(map_data, get_feed(auth).entries)

if not dry_run:
create_batch(modified_entries)
print(f'{len(modified_entries)} entries added to the batch import job.')
list_feed = list(feed)
create_batch(list_feed)
print(f'{len(list_feed)} entries added to the batch import job.')
else:
for record in modified_entries:
for record in feed:
print(json.dumps(record))

# Store timestamp for header
if not dry_run:
with open(LAST_UPDATED_TIME, 'w+') as f:
f.write(last_modified)
print(f'Last updated timestamp written to: {LAST_UPDATED_TIME}')


if __name__ == '__main__':
print("Start: Standard Ebooks import job")
Expand Down
150 changes: 150 additions & 0 deletions scripts/tests/test_import_standard_ebooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from scripts.import_standard_ebooks import map_data


SAMPLE_1 = {
'id': 'https://standardebooks.org/ebooks/t-e-lawrence/seven-pillars-of-wisdom',
'guidislink': True,
'link': 'https://standardebooks.org/ebooks/t-e-lawrence/seven-pillars-of-wisdom',
'dcterms_identifier': 'url:https://standardebooks.org/ebooks/t-e-lawrence/seven-pillars-of-wisdom',
'title': 'Seven Pillars of Wisdom',
'title_detail': {
'type': 'text/plain',
'language': None,
'base': '',
'value': 'Seven Pillars of Wisdom',
},
'authors': [
{
'name': 'T. E. Lawrence',
'href': 'https://standardebooks.org/ebooks/t-e-lawrence',
}
],
'author_detail': {
'name': 'T. E. Lawrence',
'href': 'https://standardebooks.org/ebooks/t-e-lawrence',
},
'href': 'https://standardebooks.org/ebooks/t-e-lawrence',
'author': 'T. E. Lawrence',
'schema_alternatename': 'Thomas Edward Lawrence',
'schema_sameas': 'http://id.loc.gov/authorities/names/n79097491',
'published': '2022-01-01T22:32:49Z',
'updated': '2024-06-03T21:26:42Z',
'dcterms_language': 'en-GB',
'dcterms_publisher': 'Standard Ebooks',
'rights': 'Public domain in the United States. Users located outside of the United States must check their local laws before using this ebook. Original content released to the public domain via the Creative Commons CC0 1.0 Universal Public Domain Dedication.', # noqa: E501
'rights_detail': {
'type': 'text/plain',
'language': None,
'base': '',
'value': 'Public domain in the United States. Users located outside of the United States must check their local laws before using this ebook. Original content released to the public domain via the Creative Commons CC0 1.0 Universal Public Domain Dedication.', # noqa: E501
},
'summary': 'T. E. Lawrence’s memoir of leading the Arab revolt against the Ottoman empire during World War I.',
'summary_detail': {
'type': 'text/plain',
'language': None,
'base': '',
'value': 'T. E. Lawrence’s memoir of leading the Arab revolt against the Ottoman empire during World War I.',
},
'content': [
{
'type': 'text/html',
'language': None,
'base': '',
'value': '<p><i>Seven Pillars of Wisdom</i> is <a href="https://standardebooks.org/ebooks/t-e-lawrence"><abbr>T. E.</abbr> Lawrence’s</a> memoir of his involvement in leading a portion of the Arab revolt against the Ottoman empire during World War I. The empire had joined the side of Germany and the Central Powers in the war, and Britain hoped that a successful revolt would take the empire out of the war effort. Britain had also promised the Arabs that, if they were successful, England would recognize a single Arab state.</p> <p>Lawrence convinced the Arab leaders, who had historically not shown a willingness to work together, to join forces in supporting Britain’s strategy in the area. His memoir is part travelogue, part philosophy treatise, and part action novel. It details his movements and actions during his two year involvement, his relationships with the various Arab leaders and men who fought with him, and his thoughts—and doubts—during that time. It’s a gripping tale made famous by the movie <i>Lawrence of Arabia</i>, and one that Winston Churchill called “unsurpassable” as a “narrative of war and adventure.”</p> <p>The manuscript of <i>Seven Pillars of Wisdom</i> has a rich history. Lawrence finished his first draft in 1919 from his notes during the war, but lost most of it when changing trains in England (it was never found). The next year, he started working on a new version from memory that ended up being sixty percent longer than the original. He then edited that version (although it was still a third longer than the original draft), finishing it in early 1922, and had eight copies of it printed to give to friends so they could review it and offer editing suggestions (and to prevent a repeat of losing his only copy). About this time he re-enlisted in the service, but friends convinced him to work on a version he could publish. 
In 1926, he had a first edition of approximately 200 copies published that included 125 black-and-white and color illustrations from sixteen different artists. The first edition lost money, and it was the only edition published during his lifetime. This edition uses the first edition text and includes all 125 of the original illustrations, including both endpapers.</p>', # noqa: E501
}
],
'tags': [
{
'term': 'Arab countries--History--Arab Revolt, 1916-1918',
'scheme': 'http://purl.org/dc/terms/LCSH',
'label': None,
},
{
'term': 'World War, 1914-1918',
'scheme': 'http://purl.org/dc/terms/LCSH',
'label': None,
},
{
'term': 'Adventure',
'scheme': 'https://standardebooks.org/vocab/subjects',
'label': None,
},
{
'term': 'Memoir',
'scheme': 'https://standardebooks.org/vocab/subjects',
'label': None,
},
{
'term': 'Nonfiction',
'scheme': 'https://standardebooks.org/vocab/subjects',
'label': None,
},
],
'links': [
{
'href': 'https://standardebooks.org/ebooks/t-e-lawrence/seven-pillars-of-wisdom/downloads/cover.jpg',
'rel': 'http://opds-spec.org/image',
'type': 'image/jpeg',
},
{
'href': 'https://standardebooks.org/ebooks/t-e-lawrence/seven-pillars-of-wisdom/downloads/cover-thumbnail.jpg',
'rel': 'http://opds-spec.org/image/thumbnail',
'type': 'image/jpeg',
},
{
'href': 'https://standardebooks.org/ebooks/t-e-lawrence/seven-pillars-of-wisdom',
'rel': 'alternate',
'title': 'This ebook’s page at Standard Ebooks',
'type': 'application/xhtml+xml',
},
{
'href': 'https://standardebooks.org/ebooks/t-e-lawrence/seven-pillars-of-wisdom/downloads/t-e-lawrence_seven-pillars-of-wisdom.epub',
'length': '62070075',
'rel': 'http://opds-spec.org/acquisition/open-access',
'title': 'Recommended compatible epub',
'type': 'application/epub+zip',
},
{
'href': 'https://standardebooks.org/ebooks/t-e-lawrence/seven-pillars-of-wisdom/downloads/t-e-lawrence_seven-pillars-of-wisdom_advanced.epub',
'length': '62221725',
'rel': 'http://opds-spec.org/acquisition/open-access',
'title': 'Advanced epub',
'type': 'application/epub+zip',
},
{
'href': 'https://standardebooks.org/ebooks/t-e-lawrence/seven-pillars-of-wisdom/downloads/t-e-lawrence_seven-pillars-of-wisdom.kepub.epub',
'length': '62135106',
'rel': 'http://opds-spec.org/acquisition/open-access',
'title': 'Kobo Kepub epub',
'type': 'application/kepub+zip',
},
{
'href': 'https://standardebooks.org/ebooks/t-e-lawrence/seven-pillars-of-wisdom/downloads/t-e-lawrence_seven-pillars-of-wisdom.azw3',
'length': '63108449',
'rel': 'http://opds-spec.org/acquisition/open-access',
'title': 'Amazon Kindle azw3',
'type': 'application/x-mobipocket-ebook',
},
],
}


def test_map_data():
    """map_data turns the SAMPLE_1 feed entry into the expected import record."""
    expected_subjects = [
        "Arab countries--History--Arab Revolt, 1916-1918",
        "World War, 1914-1918",
        "Adventure",
        "Memoir",
        "Nonfiction",
    ]
    expected_record = {
        "title": "Seven Pillars of Wisdom",
        "source_records": ["standard_ebooks:t-e-lawrence/seven-pillars-of-wisdom"],
        "publishers": ["Standard Ebooks"],
        "publish_date": "2022",
        "authors": [{"name": "T. E. Lawrence"}],
        # The description is carried over verbatim from the entry's content.
        "description": SAMPLE_1["content"][0]["value"],
        "subjects": expected_subjects,
        "identifiers": {"standard_ebooks": ["t-e-lawrence/seven-pillars-of-wisdom"]},
        "languages": ["eng"],
        "cover": "https://standardebooks.org/ebooks/t-e-lawrence/seven-pillars-of-wisdom/downloads/cover.jpg",
    }

    assert map_data(SAMPLE_1) == expected_record

0 comments on commit ac58113

Please sign in to comment.