Skip to content

Commit

Permalink
👌 IMPROVE: Support for keeping abbreviations at enclosing values.
Browse files Browse the repository at this point in the history
  • Loading branch information
lartpang committed Oct 10, 2024
1 parent 214ef38 commit f3a9d14
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 10 deletions.
51 changes: 41 additions & 10 deletions bibtexparser/middlewares/enclosing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from typing import Tuple
from typing import Union

Expand Down Expand Up @@ -85,6 +86,7 @@ def __init__(
reuse_previous_enclosing: bool,
enclose_integers: bool,
default_enclosing: str,
keep_abbr_string: bool = False,
allow_inplace_modification: bool = True,
):
"""
Expand All @@ -95,6 +97,8 @@ def __init__(
(only if no previous enclosing was applied)
:param default_enclosing: The default enclosing character to use ('{', '"', or 'no-enclosing')
(only if no previous enclosing was applied, and - for ints - enclose_integers is False)
:param keep_abbr_string: Whether to keep abbreviations (e.g., 'IEEE_J_PAMI') without enclosing.
(only if no previous enclosing was applied)
:param allow_inplace_modification: Whether to allow inplace modification
(see BlockMiddleware docs).
"""
Expand All @@ -110,19 +114,31 @@ def __init__(
self._default_enclosing = default_enclosing
self._reuse_previous_enclosing = reuse_previous_enclosing
self._enclose_integers = enclose_integers
self._keep_abbr_string = keep_abbr_string

# docstr-coverage: inherited
@classmethod
def metadata_key(cls) -> str:
    # Identifier string of this middleware.
    # NOTE(review): transform_entry looks up the previously removed enclosing
    # under RemoveEnclosingMiddleware.metadata_key(), not under this key —
    # confirm the two keys are intentionally different.
    return "remove_enclosing"

def _enclose(self, value: str, metadata_enclosing: str, apply_int_rule: bool) -> str:
def _enclose(
self,
value: str,
metadata_enclosing: str,
apply_int_rule: bool,
replaced_abbr: bool,
) -> str:
enclosing = self._default_enclosing
if self._reuse_previous_enclosing and metadata_enclosing is not None:
enclosing = metadata_enclosing
elif apply_int_rule and not self._enclose_integers and value.isdigit():
return value
enclosing = "no-enclosing"
elif not replaced_abbr and self._keep_abbr_string:
if self._is_value_containing_abbr(value):
enclosing = "no-enclosing"
return self._enclose_value(value, enclosing)

def _enclose_value(self, value: str, enclosing: str) -> str:
if enclosing == "{":
return f"{{{value}}}"
if enclosing == '"':
Expand All @@ -133,18 +149,32 @@ def _enclose(self, value: str, metadata_enclosing: str, apply_int_rule: bool) ->
f"enclosing must be either '{{' or '\"' or 'no-enclosing', " f"not '{enclosing}'"
)

def _is_value_containing_abbr(self, value: str) -> bool:
is_invalid_abbr = False
for _s in value.split("#"):
_s = _s.strip()
# is not a valid string is enclosed in quotes,
if not (_s.startswith('"') and _s.endswith('"')):
# and is a invalid abbreviation starts with a letter and contains only letters, digits and underscores
if re.fullmatch(r"[A-Za-z][A-Za-z0-9_]*", _s) is None:
is_invalid_abbr = True
break
return not is_invalid_abbr

# docstr-coverage: inherited
def transform_entry(self, entry: Entry, *args, **kwargs) -> Entry:
field: Field
metadata_enclosing = entry.parser_metadata.pop(
RemoveEnclosingMiddleware.metadata_key(), None
)
metadata_enclosing = entry.parser_metadata.pop(RemoveEnclosingMiddleware.metadata_key(), {})
# NOTE: this is an ugly hack to check whether the string was resolved by the ResolveStringReferencesMiddleware.
# We can't import the class directly because of circular imports;
# maybe we should add a shared module containing all metadata keys.
metadata_resolving: list = entry.parser_metadata.get("ResolveStringReferences", [])
for field in entry.fields:
apply_int_rule = field.key in ENTRY_POTENTIALLY_INT_FIELDS
prev_encoding = (
metadata_enclosing.get(field.key, None) if metadata_enclosing is not None else None
field.value = self._enclose(
field.value,
metadata_enclosing=metadata_enclosing.get(field.key, None),
apply_int_rule=field.key in ENTRY_POTENTIALLY_INT_FIELDS,
replaced_abbr=field.key in metadata_resolving,
)
field.value = self._enclose(field.value, prev_encoding, apply_int_rule=apply_int_rule)
return entry

# docstr-coverage: inherited
Expand All @@ -154,5 +184,6 @@ def transform_string(self, string: String, *args, **kwargs) -> String:
string.value,
string.parser_metadata.get(metadata_key),
apply_int_rule=STRINGS_CAN_BE_UNESCAPED_INTS,
replaced_abbr=False,
)
return string
94 changes: 94 additions & 0 deletions tests/middleware_tests/test_enclosing.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,100 @@ def _figure_out_added_enclosing(changed_value, value):
return used_enclosing


@pytest.mark.parametrize("metadata_resolving", ["", "journal"])
@pytest.mark.parametrize("metadata_enclosing", ["{", '"', "no-enclosing", None])
@pytest.mark.parametrize("default_enclosing", ["{", '"'])
@pytest.mark.parametrize("enclose_ints", [True, False], ids=["enclose_ints", "no_enclose_ints"])
@pytest.mark.parametrize(
    "keep_abbr_string", [True, False], ids=["keep_abbr_string", "no_keep_abbr_string"]
)
@pytest.mark.parametrize("reuse_previous_enclosing", [True, False], ids=["reuse", "no_reuse"])
@pytest.mark.parametrize(
    "value",
    [
        # (field value, whether it is expected to count as an abbreviation expression)
        ("IEEE_T_PAMI", True),
        ('IEEE_T_PAMI # "ieee tpami"', True),
        ('IEEE_T_PAMI" # ieee tpami', False),
        ('IEEE_T-PAMI # "ieee tpami"', False),
        ('IEEE_T-PAMI # "ieee # tpami"', False),
        ('IEEE T-PAMI # "ieee tpami"', False),
    ],
)
@pytest.mark.parametrize("inplace", [True, False], ids=["inplace", "not_inplace"])
def test_addition_of_enclosing_on_entry_with_abbr(
    value: tuple,
    metadata_resolving: str,
    keep_abbr_string: bool,
    metadata_enclosing: str,
    default_enclosing: str,
    enclose_ints: bool,
    reuse_previous_enclosing: bool,
    inplace: bool,
):
    """Matrix test of AddEnclosingMiddleware's ``keep_abbr_string`` option on an Entry.

    A single ``journal`` field is run through the middleware for every
    combination of the parametrized settings, and the resulting value is
    compared against the enclosing this test expects for that combination."""
    # Unpack the parametrized pair; from here on ``value`` is the raw field value.
    value, is_abbr = value

    # The entry attributes are irrelevant for this test,
    # but they must survive the transformation unchanged.
    journal_field = Field(value=value, start_line=6, key="journal")
    input_entry = Entry(
        start_line=5,
        entry_type="article",
        raw="<--- does not matter for this unit test -->",
        key="someKey",
        fields=[journal_field],
    )

    # Populate the parser metadata the middleware reads.
    if metadata_resolving:
        input_entry.parser_metadata["ResolveStringReferences"] = [metadata_resolving]
    if metadata_enclosing is not None:
        input_entry.parser_metadata["removed_enclosing"] = {"journal": metadata_enclosing}

    middleware = AddEnclosingMiddleware(
        allow_inplace_modification=inplace,
        default_enclosing=default_enclosing,
        reuse_previous_enclosing=reuse_previous_enclosing,
        enclose_integers=enclose_ints,
        keep_abbr_string=keep_abbr_string,
    )

    transformed_library = middleware.transform(library=Library([input_entry]))

    # The library must still hold exactly the one entry.
    assert len(transformed_library.blocks) == 1
    assert len(transformed_library.entries) == 1
    transformed = transformed_library.entries[0]
    changed_value = transformed["journal"]

    # Work out which enclosing is expected, in the same priority order as the checks below.
    reuses_previous = reuse_previous_enclosing and metadata_enclosing is not None
    is_unenclosed_int = (isinstance(value, int) or value.isdigit()) and not enclose_ints
    keeps_abbreviation = not metadata_resolving and keep_abbr_string and is_abbr
    if reuses_previous:
        expected_enclosing = metadata_enclosing
    elif is_unenclosed_int or keeps_abbreviation:
        expected_enclosing = "no-enclosing"
    else:
        expected_enclosing = default_enclosing

    if expected_enclosing == "no-enclosing":
        _skip_pseudo_enclosing_value(value)

    assert changed_value == middleware._enclose_value(value, expected_enclosing)

    # Everything except the field values must be untouched.
    assert_nonfield_entry_attributes_unchanged(input_entry, transformed)

    # `allow_inplace_modification` must be respected.
    assert_inplace_is_respected(inplace, input_entry, transformed)


@pytest.mark.parametrize("metadata_enclosing", ["{", '"', None])
@pytest.mark.parametrize("default_enclosing", ["{", '"'])
@pytest.mark.parametrize("enclose_ints", [True, False], ids=["enclose_ints", "no_enclose_ints"])
Expand Down

0 comments on commit f3a9d14

Please sign in to comment.