diff --git a/CHANGELOG.md b/CHANGELOG.md index adbb483..5b693d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,105 @@ # Changelog +## 1.1.1 (2019-03-01) + +### Bugfixes +* SUSHI: Deal with missing institutional identifier for customer. [Geoffrey Spear] + + (see Issue #72. This should catch the exception involved in the + issue, but there may be a wider issue with the Gale report since + the "customer" object involved should not be None if there's + an actual report to read, since the report itself is in the + Customer XML element.) + + +### Code quality/CI + +* Add pre-commit black hook, format all code with black. [Geoffrey Spear] + +* Python 3.7 support. [Geoffrey Spear] + +* Replace arrow library with pendulum. [Geoffrey Spear] + +* Pyup: try updating all deps. [Geoffrey Spear] + + +## 1.1.0 (2018-08-03) + +### Other + +* SUSHI: Verify SSL certs by default. [Geoffrey Spear] + + (Bumps version to 1.1 because this could be a breaking change for + sites that rely on requests to broken servers working without a flag) + + Fixes #67 + +* Don't try to do string formatting on given output file name. [Geoffrey Spear] + + Fixes #58 + + +## 1.0.3 (2018-08-01) + +### Bugfixes + +* Negate bash regex match correctly. [Geoffrey Spear] + + Issue #63 + +* Retry SUSHI reports if "Report Queued" message is returned. [Geoffrey Spear] + + (This is kind of an ugly hack that looks for this string in the + raw XML. A nicer fix will be possible with a fix for #3 ) + +### Docs +* Help for sushiclient --nodelay option (which probably shouldn't actually be used, but was helpful for testing without making the test suite wait 60 seconds) [Geoffrey Spear] + + +### Code quality/CI + +* Exclude builds for flake8/lint/manifest? [Geoffrey Spear] + Issue #63 + +* Only use pylint version 1. [Geoffrey Spear] + + (version 2 drops py2 support) + + +## 1.0.2 (2018-05-11) + +### Bugfixes +* Fix incorrect first_date_col for DB1 reports. 
[James Fournie] + + The first date column in a DB1 report should actually have index 5 (6th column). See: https://www.projectcounter.org/code-of-practice-sections/usage-reports/#databases + +### Tests + +* Add failing test for PR #60. [Geoffrey Spear] + +* Add test for gaps in stats being output correctly. [Geoffrey Spear] + +### Code quality/CI + +* Create pyup.io config file. [pyup-bot] + +* Flake8: Fix whitespace after comma. [James Fournie] + +* Run 2.7 flake8 with 2.7; correct matrix syntax. [Geoffrey Spear] + +* Do linting through tox instead of directly in .travis.yml. [Geoffrey Spear] + +* Docs: pypi stuff. [Geoffrey Spear] + + fix pypi link + + remove some outdated advice on running setup.py install directly + + link to PyPA installing packages page + + +## 1.0.1 (2018-04-06) + +* Use universal wheel. [Geoffrey Spear] ## 1.0.0 diff --git a/MANIFEST.in b/MANIFEST.in index e4a957e..5614c7a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,6 +6,7 @@ recursive-include docs *.gitkeep recursive-include docs *.py recursive-include docs *.rst recursive-include pycounter *.csv +recursive-include pycounter *.json recursive-include pycounter *.tsv recursive-include pycounter *.xlsx recursive-include pycounter *.xml diff --git a/README.rst b/README.rst index f3a9147..98e337c 100644 --- a/README.rst +++ b/README.rst @@ -38,6 +38,8 @@ Licensed under the MIT license. See the file LICENSE for details. pycounter is tested on Python 2.7, 3.4, 3.5, 3.6, 3.7 and pypy2 (if you're still stuck on Python 2.6 or 3.3, please use version 0.16.1 of pycounter) +pycounter 2.x will be the last version with support for Python 2. + Documentation is on `Read the Docs `_. @@ -55,6 +57,19 @@ From inside the source distribution: Probably do all of this in a virtualenv. `The PyPA `_ has a good explanation of how to get started.) 
+ +COUNTER 5 Note +-------------- + +In this alpha release, reports are output in COUNTER 4 format with COUNTER 5 data, +which is wrong, and probably not a valid apples-to-apples comparison since, for example, +TR_J1 excludes Gold Open Access counts that would be included in JR1, and also has +HTML and PDF columns that will always be 0 because these are no longer reported. + +Before the final 2.0 release, it will be capable of producing actual COUNTER 5 reports, +probably with an API for getting COUNTER 4 style data compatible with scripts that +were making assumptions about the data received to pass it into another system. + Usage ----- @@ -100,3 +115,4 @@ Our code is automatically styled using black. To install the pre-commit hook: pip install pre-commit pre-commit install + diff --git a/pycounter/constants.py b/pycounter/constants.py index 9ea959c..4596350 100644 --- a/pycounter/constants.py +++ b/pycounter/constants.py @@ -90,6 +90,7 @@ u"and Page-Type (formatted for normal browsers/delivered " u"to mobile devices and for mobile devices/delivered to " u"mobile devices)", + u"TR_J1": u'Journal Requests (Excluding "OA_Gold")', } HEADER_FIELDS = { @@ -173,6 +174,19 @@ u"Access denied category", u"Reporting Period Total", ), + # FIXME: this is outputting counter 5 reports in 4 format for... reasons. 
+ "TR_J1": ( + u"Journal", + u"Publisher", + u"Platform", + u"Journal DOI", + u"Proprietary Identifier", + u"Print ISSN", + u"Online ISSN", + u"Reporting Period Total", + u"Reporting Period HTML", + u"Reporting Period PDF", + ), } TOTAL_TEXT = { diff --git a/pycounter/report.py b/pycounter/report.py index c53d149..1420af2 100644 --- a/pycounter/report.py +++ b/pycounter/report.py @@ -643,40 +643,63 @@ def parse_generic(report_reader): """ report = CounterReport() - report.report_type, report.report_version = _get_type_and_version( - six.next(report_reader)[0] - ) + first_line = six.next(report_reader) + if first_line[0] == "Report_Name": # COUNTER 5 report + second_line = six.next(report_reader) + third_line = six.next(report_reader) + report.report_type, report.report_version = _get_c5_type_and_version( + first_line, second_line, third_line + ) + else: + report.report_type, report.report_version = _get_type_and_version(first_line[0]) - # noinspection PyTypeChecker - report.metric = METRICS.get(report.report_type) + if report.report_version != 5: + # noinspection PyTypeChecker + report.metric = METRICS.get(report.report_type) - report.customer = six.next(report_reader)[0] + report.customer = six.next(report_reader)[1 if report.report_version == 5 else 0] - if report.report_version == 4: + if report.report_version >= 4: inst_id_line = six.next(report_reader) if inst_id_line: - report.institutional_identifier = inst_id_line[0] + report.institutional_identifier = inst_id_line[ + 1 if report.report_version == 5 else 0 + ] if report.report_type == "BR2": report.section_type = inst_id_line[1] six.next(report_reader) + if report.report_version == 5: + for _ in range(3): + six.next(report_reader) covered_line = six.next(report_reader) - report.period = convert_covered(covered_line[0]) + report.period = convert_covered( + covered_line[1 if report.report_version == 5 else 0] + ) - six.next(report_reader) + if report.report_version < 5: + six.next(report_reader) date_run_line 
= six.next(report_reader) - report.date_run = convert_date_run(date_run_line[0]) + report.date_run = convert_date_run( + date_run_line[1 if report.report_version == 5 else 0] + ) + + if report.report_version == 5: + for _ in range(2): + # Skip Created_By and blank line + six.next(report_reader) header = six.next(report_reader) - try: - report.year = _year_from_header(header, report) - except AttributeError: - warnings.warn("Could not determine year from malformed header") + if report.report_version < 5: + try: + report.year = _year_from_header(header, report) + except AttributeError: + warnings.warn("Could not determine year from malformed header") - if report.report_version == 4: + if report.report_version >= 4: countable_header = header[0:8] for col in header[8:]: if col: @@ -693,7 +716,7 @@ def parse_generic(report_reader): end_date = last_day(convert_date_column(header[last_col - 1])) report.period = (start_date, end_date) - if report.report_type != "DB1": + if report.report_type != "DB1" and report.report_version != 5: six.next(report_reader) if report.report_type == "DB2": @@ -723,8 +746,8 @@ def _parse_line(line, report, last_col): doi = "" prop_id = "" - if report.report_version == 4: - if report.report_type.startswith("JR1"): + if report.report_version >= 4: + if report.report_type.startswith("JR1") or report.report_type == "TR_J1": old_line = line line = line[0:3] + line[5:7] + line[10:last_col] doi = old_line[3] @@ -761,7 +784,7 @@ def _parse_line(line, report, last_col): for data in line[5:]: month_data.append((curr_month, format_stat(data))) curr_month = next_month(curr_month) - if report.report_type.startswith("JR"): + if report.report_type.startswith("JR") or report.report_type == "TR_J1": return CounterJournal( metric=report.metric, month_data=month_data, @@ -809,6 +832,10 @@ def _get_type_and_version(specifier): return report_type, report_version +def _get_c5_type_and_version(first_line, second_line, third_line): + return second_line[1], 
int(third_line[1]) + + def _year_from_header(header, report): """Get the year for the report from the header. diff --git a/pycounter/sushi.py b/pycounter/sushi.py index dd05349..cd7443a 100644 --- a/pycounter/sushi.py +++ b/pycounter/sushi.py @@ -14,11 +14,13 @@ import requests import six +from pycounter import sushi5 import pycounter.constants import pycounter.exceptions from pycounter.helpers import convert_date_run import pycounter.report + logger = logging.getLogger(__name__) NS = pycounter.constants.NS @@ -130,6 +132,9 @@ def get_report(*args, **kwargs): :param no_delay: don't delay in retrying Report Queued """ + if kwargs.get("release") == 5: + return sushi5.get_report(*args, **kwargs) + no_delay = kwargs.pop("no_delay", False) delay_amount = 0 if no_delay else 60 while True: diff --git a/pycounter/sushi5.py b/pycounter/sushi5.py new file mode 100644 index 0000000..ce5ca29 --- /dev/null +++ b/pycounter/sushi5.py @@ -0,0 +1,204 @@ +"""COUNTER 5 SUSHI support.""" + +import datetime +import logging +import time +import warnings + +import pendulum +import requests + +import pycounter.exceptions +from pycounter.helpers import convert_date_run +import pycounter.report + + +logger = logging.getLogger(__name__) + + +def _dates_from_filters(filters): + """Convert report filters to start and end date + + Args: + filters: a list of dicts containing SUSHI report filters + + Returns: tuple of start, end dates as datetime.date + + """ + + converted_filters = { + filter_["Name"]: datetime.datetime.strptime(filter_["Value"], "%Y-%m-%d").date() + for filter_ in filters + if filter_["Name"] in ("Begin_Date", "End_Date") + } + try: + return converted_filters["Begin_Date"], converted_filters["End_Date"] + except KeyError: + raise ValueError("filters must include a Begin_Date and End_Date") + + +def _raw_to_full(raw_report): + """Convert a raw report to CounterReport. 
+ + :param raw_report: raw report as dict decoded from JSON + :return: a :class:`pycounter.report.CounterReport` + """ + header = raw_report["Report_Header"] + start_date, end_date = _dates_from_filters(header["Report_Filters"]) + date_run = header.get("Created") + report_data = { + "period": (start_date, end_date), + "report_version": int(header["Release"]), + "report_type": header["Report_ID"], + "customer": header.get("Institution_Name", u""), + "institutional_identifier": header.get("Customer_ID", u""), + "date_run": pendulum.parse(date_run) if date_run else datetime.datetime.now(), + } + + report = pycounter.report.CounterReport(**report_data) + + for item in raw_report["Report_Items"]: + publisher_name = item.get("Publisher", u"") + platform = item.get("Platform", u"") + title = item["Title"] + eissn = issn = doi = prop_id = u"" + # isbn = u"" + + for identifier in item["Item_ID"]: + if identifier["Type"] == "Print_ISSN": + issn = identifier["Value"] + elif identifier["Type"] == "Online_ISSN": + eissn = identifier["Value"] + # elif identifier["Type"] == "ISBN": + # isbn = identifier["Value"] + elif identifier["Type"] == "DOI": + doi = identifier["Value"] + elif identifier["Type"] == "Proprietary_ID": + prop_id = identifier["Value"] + + month_data = [] + + for perform_item in item["Performance"]: + item_date = convert_date_run(perform_item["Period"]["Begin_Date"]) + usage = None + for inst in perform_item["Instance"]: + if inst["Metric_Type"] == u"Total_Item_Requests": + usage = inst["Count"] + if usage is not None: + month_data.append((item_date, int(usage))) + + if report.report_type.startswith("TR_J"): + report.pubs.append( + pycounter.report.CounterJournal( + title=title, + platform=platform, + publisher=publisher_name, + period=report.period, + metric=report.metric, + issn=issn, + eissn=eissn, + doi=doi, + proprietary_id=prop_id, + month_data=month_data, + ) + ) + + return report + + +def get_sushi_stats_raw( + wsdl_url=None, + start_date=None, + 
end_date=None, + requestor_id=None, + requestor_email=None, + requestor_name=None, + customer_reference=None, + customer_name=None, + report="TR_J1", + release=5, + sushi_dump=False, + verify=True, + url=None, +): + """Get SUSHI stats for a given site in dict (decoded from JSON) format. + + :param wsdl_url: (Deprecated; for backward compatibility with COUNTER 4 SUSHI + code. Use `url` instead.) URL to API endpoint for this provider + + :param start_date: start date for report (must be first day of a month) + + :param end_date: end date for report (must be last day of a month) + + :param requestor_id: requestor ID as defined by SUSHI protocol + + :param requestor_email: requestor email address, if required by provider + + :param requestor_name: Internationally recognized organization name + + :param customer_reference: customer reference number as defined by SUSHI + protocol + + :param customer_name: Internationally recognized organization name + + :param report: report type, values defined by SUSHI protocol + + :param release: report release number (should generally be `5`.) 
+ + :param sushi_dump: dumps the request and raw response to the DEBUG logger + + :param verify: bool: verify SSL certificates (NOTE: not yet passed to requests) + + :param url: str: URL to endpoint for this provider + + """ + if url is None and wsdl_url: + warnings.warn( + DeprecationWarning( + "wsdl_url argument to get_sushi_stats" + "_raw is deprecated; use url instead" + ) + ) + url = wsdl_url + url_params = {"url": url, "report": report} + req_params = { + "customer_id": customer_reference, + "begin_date": start_date, + "end_date": end_date, + "requestor_id": requestor_id, + } + + response = requests.get( + "{url}/reports/{report}".format(**url_params), + params=req_params, + headers={"User-Agent": "pycounter/%s" % pycounter.__version__}, + ) + + if sushi_dump: + logger.debug( + "SUSHI DUMP: request: %s \n\n response: %s", + vars(response.request), + response.content, + ) + + return response.json() + + +def get_report(*args, **kwargs): + """Get a usage report from a COUNTER 5 (RESTful) SUSHI server. + + returns a :class:`pycounter.report.CounterReport` object. 
+ + parameters: see get_sushi_stats_raw + + :param no_delay: don't delay in retrying Report Queued + """ + no_delay = kwargs.pop("no_delay", False) + delay_amount = 0 if no_delay else 60 + while True: + try: + raw_report = get_sushi_stats_raw(*args, **kwargs) + return _raw_to_full(raw_report) + except pycounter.exceptions.ServiceBusyError: + print("Service busy, retrying in %d seconds" % delay_amount) + time.sleep(delay_amount) diff --git a/pycounter/test/counter5/__init__.py b/pycounter/test/counter5/__init__.py new file mode 100644 index 0000000..03b35a9 --- /dev/null +++ b/pycounter/test/counter5/__init__.py @@ -0,0 +1 @@ +"""COUNTER 5 test suite""" diff --git a/pycounter/test/counter5/conftest.py b/pycounter/test/counter5/conftest.py new file mode 100644 index 0000000..2eba76f --- /dev/null +++ b/pycounter/test/counter5/conftest.py @@ -0,0 +1,34 @@ +import datetime +import io +import os + +from httmock import HTTMock, urlmatch +import pytest + +import pycounter + + +@urlmatch(netloc=r"(.*\.)?example\.com$") +def sushi_mock(url_unused, request_unused): + path = os.path.join(os.path.dirname(__file__), "data", "sushi_simple.json") + with io.open(path, "r", encoding="utf-8") as datafile: + return datafile.read() + + +@pytest.fixture +def trj1_report(): + return pycounter.report.parse( + os.path.join(os.path.dirname(__file__), "data", "tr_j1.tsv") + ) + + +@pytest.fixture +def sushi5_report(): + with HTTMock(sushi_mock): + return pycounter.sushi.get_report( + url="http://www.example.com/Sushi", + start_date=datetime.date(2019, 1, 1), + end_date=datetime.date(2019, 2, 28), + release=5, + report="TR_J1", + ) diff --git a/pycounter/test/counter5/data/sushi_simple.json b/pycounter/test/counter5/data/sushi_simple.json new file mode 100644 index 0000000..d5db884 --- /dev/null +++ b/pycounter/test/counter5/data/sushi_simple.json @@ -0,0 +1,78 @@ +{ + "Report_Header": { + "Created": "2019-03-28T11:25:11Z", + "Created_By": "Example Vendor", + "Customer_ID": "exampleLibrary", 
+ "Report_ID": "TR_J1", + "Release": "5", + "Institution_Name": "Example Library", + "Report_Name": "Journal Requests (Excluding \"OA_Gold\")", + "Report_Filters": [ + { + "Name": "Begin_Date", + "Value": "2019-01-01" + }, + { + "Name": "End_Date", + "Value": "2019-02-28" + } + ] + }, + "Report_Items": [ + { + "Title": "Journal of fake data", + "Item_ID": [ + { + "Type": "Print_ISSN", + "Value": "0737-1764" + }, + { + "Type": "Online_ISSN", + "Value": "1234-5678" + } + ], + "Platform": "ExamplePlatform", + "Publisher": "Example Publisher", + "Publisher_ID": [ + { + "Type": "Proprietary", + "Value": "Example: 42" + } + ], + "Performance": [ + { + "Period": { + "Begin_Date": "2019-01-01", + "End_Date": "2019-01-31" + }, + "Instance": [ + { + "Metric_Type": "Total_Item_Requests", + "Count": 14 + }, + { + "Metric_Type": "Unique_Item_Requests", + "Count": 9 + } + ] + }, + { + "Period": { + "Begin_Date": "2019-02-01", + "End_Date": "2019-02-28" + }, + "Instance": [ + { + "Metric_Type": "Total_Item_Requests", + "Count": 16 + }, + { + "Metric_Type": "Unique_Item_Requests", + "Count": 12 + } + ] + } + ] + } + ] +} diff --git a/pycounter/test/counter5/data/tr_j1.tsv b/pycounter/test/counter5/data/tr_j1.tsv new file mode 100644 index 0000000..c286c69 --- /dev/null +++ b/pycounter/test/counter5/data/tr_j1.tsv @@ -0,0 +1,18 @@ +Report_Name Journal Requests (Excluding OA_Gold) +Report_ID TR_J1 +Release 5 +Institution_Name Sample University +Institution_ID isni=1234567890 +Metric_Types Total_Item_Requests; Unique_Item_Requests +Report_Filters Data_Type=Journal; Access_Type=Controlled; Access_Method=Regular +Report_Attributes +Exceptions +Reporting_Period 2017-01-01 to 2017-06-30 +Created 2017-05-25 +Created_By Platform X + +Title Publisher Publisher_ID Platform DOI Proprietary_ID Print_ISSN Online_ISSN URI Metric_Type Reporting_ Period_Total Jan-2017 Feb-2017 Mar-2017 Apr-2017 May-2017 Jun-2017 +Journal A Publisher X isni=1234123412341234 PlatformX /12.1.0.1/1111.2.222 pubx:jnlA 
1111-22222 1111-1223 Total_Item_Requests +Journal A Publisher X isni=1234123412341234 PlatformX /12.1.0.1/1111.2.222 pubx:jnlA 1111-22222 1111-1223 Unique_Item_Requests +Journal B Publisher X isni=1234123412341234 PlatformX /12.1.0.1/1111.2.211 pubx:jnlB 1111-22211 1111-1213 Total_Item_Requests +Journal B Publisher X isni=1234123412341234 PlatformX /12.1.0.1/1111.2.211 pubx:jnlB 1111-22211 1111-1213 Unique_Item_Requests \ No newline at end of file diff --git a/pycounter/test/counter5/test_sushi.py b/pycounter/test/counter5/test_sushi.py new file mode 100644 index 0000000..bce0cf4 --- /dev/null +++ b/pycounter/test/counter5/test_sushi.py @@ -0,0 +1,19 @@ +"""Tests for COUNTER 5 SUSHI support.""" + + +def test_report_type(sushi5_report): + assert u"TR_J1" == sushi5_report.report_type + + +def test_report_version(sushi5_report): + assert 5 == sushi5_report.report_version + + +def test_report_customer(sushi5_report): + assert u"exampleLibrary" == sushi5_report.institutional_identifier + + +def test_data(sushi5_report): + publication = next(iter(sushi5_report)) + data = [month[2] for month in publication] + assert 14 == data[0] diff --git a/pycounter/test/counter5/test_trj1.py b/pycounter/test/counter5/test_trj1.py new file mode 100644 index 0000000..ffdba6b --- /dev/null +++ b/pycounter/test/counter5/test_trj1.py @@ -0,0 +1,32 @@ +"""Tests of title report for journals.""" +from datetime import date +import os + +import pycounter.report + + +def test_version(trj1_report): + assert trj1_report.report_version == 5 + + +def test_report_type(trj1_report): + assert trj1_report.report_type == "TR_J1" + + +def test_pubs_length(trj1_report): + assert len(trj1_report.pubs) == 4 + + +def test_customer(trj1_report): + assert trj1_report.customer == "Sample University" + + +def test_period(trj1_report): + assert trj1_report.period == (date(2017, 1, 1), date(2017, 6, 30)) + + +def test_parse(): + report = pycounter.report.parse( + os.path.join(os.path.dirname(__file__), "data", 
"tr_j1.tsv") + ) + assert report.report_version == 5 diff --git a/setup.py b/setup.py index fad6684..3c2bf3d 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ keywords="library COUNTER journals usage_statistics SUSHI", test_suite="pycounter.test", classifiers=[ - "Development Status :: 4 - Beta", + "Development Status :: 3 - Alpha", "License :: OSI Approved :: MIT License", "Intended Audience :: Developers", "Programming Language :: Python :: 2.7",