Skip to content

Commit

Permalink
fix(cff): doi structure parsing (#121)
Browse files Browse the repository at this point in the history
* fix: DOI from dict, not flat value

* fix: make CFF example correct

* docs: adapt docstring example to match real CFF structure

* fix: docstring still broken

* fix: typo in docstring

* fix:chatGPT's suggestion for docstring formatting

* fix:OK, no multiline, and double escape newlines

* feat: support multiple DOIs

* refactor(cff): reduce nesting

* chore(cff): use type hint for list from python standard collection

Co-authored-by: Cyril Matthey-Doret <cyril.matthey-doret@epfl.ch>

---------

Co-authored-by: Cyril Matthey-Doret <cyril.matthey-doret@epfl.ch>
Co-authored-by: cmdoret <cyril.mattheydoret@gmail.com>
  • Loading branch information
3 people authored Dec 17, 2024
1 parent 5ff13d2 commit 3867d7a
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 32 deletions.
63 changes: 36 additions & 27 deletions gimie/parsers/cff.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,22 @@ def __init__(self, subject: str):
super().__init__(subject)

def parse(self, data: bytes) -> Graph:
"""Extracts a DOI link and list of authors from a CFF file and returns a
graph with a single triple <subject> <schema:citation> <doi>
"""Extracts DOIs and list of authors from a CFF file and returns a
graph with triples <subject> <schema:citation> <doi>
and a number of author objects with <schema:name> and <md4i:orcid> values.
If no DOI is found, it will not be included in the graph.
If no authors are found, it will not be included in the graph.
If neither authors nor DOI are found, an empty graph is returned.
If no DOIs are found, they will not be included in the graph.
If no authors are found, they will not be included in the graph.
If neither authors nor DOIs are found, an empty graph is returned.
"""
extracted_cff_triples = Graph()
doi = get_cff_doi(data)
dois = get_cff_doi(data)
authors = get_cff_authors(data)

if doi:
extracted_cff_triples.add(
(self.subject, SDO.citation, URIRef(doi))
)
if dois:
for doi in dois:
extracted_cff_triples.add(
(self.subject, SDO.citation, URIRef(doi))
)
if not authors:
return extracted_cff_triples
for author in authors:
Expand Down Expand Up @@ -119,8 +120,8 @@ def doi_to_url(doi: str) -> str:
return f"https://doi.org/{doi_match}"


def get_cff_doi(data: bytes) -> Optional[str]:
"""Given a CFF file, returns the DOI, if any.
def get_cff_doi(data: bytes) -> Optional[list[str]]:
"""Given a CFF file, returns a list of DOIs, if any.
Parameters
----------
Expand All @@ -129,15 +130,16 @@ def get_cff_doi(data: bytes) -> Optional[str]:
Returns
-------
str, optional
doi formatted as a valid url
list of str, optional
DOIs formatted as valid URLs
Examples
--------
>>> get_cff_doi(bytes("doi: 10.5281/zenodo.1234", encoding="utf8"))
'https://doi.org/10.5281/zenodo.1234'
>>> get_cff_doi(bytes("identifiers:\\n - type: doi\\n value: 10.5281/zenodo.1234\\n - type: doi\\n value: 10.5281/zenodo.5678", encoding="utf8"))
['https://doi.org/10.5281/zenodo.1234', 'https://doi.org/10.5281/zenodo.5678']
>>> get_cff_doi(bytes("identifiers:\\n - type: doi\\n value: 10.5281/zenodo.9012", encoding="utf8"))
['https://doi.org/10.5281/zenodo.9012']
>>> get_cff_doi(bytes("abc: def", encoding="utf8"))
"""

try:
Expand All @@ -146,18 +148,25 @@ def get_cff_doi(data: bytes) -> Optional[str]:
logger.warning("cannot read CITATION.cff, skipped.")
return None

doi_urls = []

try:
doi_url = doi_to_url(cff["doi"])
# No doi in cff file
identifiers = cff["identifiers"]
except (KeyError, TypeError):
logger.warning("CITATION.cff does not contain a 'doi' key.")
doi_url = None
# doi is malformed
except ValueError as err:
logger.warning(err)
doi_url = None

return doi_url
logger.warning(
"CITATION.cff does not contain a valid 'identifiers' key."
)
return None

for identifier in identifiers:
if identifier.get("type") == "doi":
try:
doi_url = doi_to_url(identifier["value"])
doi_urls.append(doi_url)
except ValueError as err:
logger.warning(err)

return doi_urls or None


def get_cff_authors(data: bytes) -> Optional[List[dict[str, str]]]:
Expand Down
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 15 additions & 4 deletions tests/test_cff.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,23 @@ def test_broken_cff(cff_file):
def test_parse_doi():
cff_file = b"""
cff-version: 1.2.0
title: gimie
doi: 10.5281/zenodo.1234567
message: If you use this software, please cite it using these metadata.
title: 'napari: a multi-dimensional image viewer for Python'
identifiers:
- type: doi
value: 10.5281/zenodo.3555620
- type: doi
value: 10.21105/joss.01274
"""
obj = next(
parsed_dois = list(
CffParser(subject=URIRef("https://example.org/"))
.parse(data=cff_file)
.objects()
)
assert URIRef("https://doi.org/10.5281/zenodo.1234567") == obj
expected_dois = [
URIRef("https://doi.org/10.5281/zenodo.3555620"),
URIRef("https://doi.org/10.21105/joss.01274"),
]
# parsed_dois already contains all parsed DOI objects
for doi in expected_dois:
assert doi in parsed_dois

0 comments on commit 3867d7a

Please sign in to comment.