diff --git a/bibtexparser/middlewares/enclosing.py b/bibtexparser/middlewares/enclosing.py index 08835f3..041eb48 100644 --- a/bibtexparser/middlewares/enclosing.py +++ b/bibtexparser/middlewares/enclosing.py @@ -1,3 +1,4 @@ +import re from typing import Tuple from typing import Union @@ -85,6 +86,7 @@ def __init__( reuse_previous_enclosing: bool, enclose_integers: bool, default_enclosing: str, + keep_abbr_string: bool = False, allow_inplace_modification: bool = True, ): """ @@ -95,6 +97,8 @@ def __init__( (only of no previous enclosing was applied) :param default_enclosing: The default enclosing character to use ('{', '"', or 'no-enclosing') (only of no previous enclosing was applied, and - for ints - enclose_integers is False) + :param keep_abbr_string: Whether to leave unresolved string abbreviations (e.g., 'IEEE_J_PAMI') without enclosing. + (only if no previous enclosing was applied) :param allow_inplace_modification: Whether to allow inplace modification (see BlockMiddleware docs). """ @@ -110,19 +114,31 @@ def __init__( self._default_enclosing = default_enclosing self._reuse_previous_enclosing = reuse_previous_enclosing self._enclose_integers = enclose_integers + self._keep_abbr_string = keep_abbr_string # docstr-coverage: inherited @classmethod def metadata_key(cls) -> str: return "remove_enclosing" - def _enclose(self, value: str, metadata_enclosing: str, apply_int_rule: bool) -> str: + def _enclose( + self, + value: str, + metadata_enclosing: str, + apply_int_rule: bool, + replaced_abbr: bool, + ) -> str: enclosing = self._default_enclosing if self._reuse_previous_enclosing and metadata_enclosing is not None: enclosing = metadata_enclosing elif apply_int_rule and not self._enclose_integers and value.isdigit(): - return value + enclosing = "no-enclosing" + elif not replaced_abbr and self._keep_abbr_string: + if self._is_value_containing_abbr(value): + enclosing = "no-enclosing" + return self._enclose_value(value, enclosing) + def _enclose_value(self, value: str, enclosing: str) -> str: if 
enclosing == "{": return f"{{{value}}}" if enclosing == '"': @@ -133,18 +149,32 @@ def _enclose(self, value: str, metadata_enclosing: str, apply_int_rule: bool) -> f"enclosing must be either '{{' or '\"' or 'no-enclosing', " f"not '{enclosing}'" ) + def _is_value_containing_abbr(self, value: str) -> bool: + is_invalid_abbr = False + for _s in value.split("#"): + _s = _s.strip() + # a part that is not a string enclosed in double quotes (note: a '#' inside quotes also splits the value) + if not (_s.startswith('"') and _s.endswith('"')): + # must be a valid abbreviation: a letter followed by only letters, digits and underscores + if re.fullmatch(r"[A-Za-z][A-Za-z0-9_]*", _s) is None: + is_invalid_abbr = True + break + return not is_invalid_abbr + # docstr-coverage: inherited def transform_entry(self, entry: Entry, *args, **kwargs) -> Entry: - field: Field - metadata_enclosing = entry.parser_metadata.pop( - RemoveEnclosingMiddleware.metadata_key(), None - ) + metadata_enclosing = entry.parser_metadata.pop(RemoveEnclosingMiddleware.metadata_key(), {}) + # NOTE: this is an ugly hack to check if the string was resolved by the ResolveStringReferencesMiddleware + # we can't import the class directly because of circular imports + # maybe we should add a shared module containing all metadata keys + metadata_resolving: list = entry.parser_metadata.get("ResolveStringReferences", []) for field in entry.fields: - apply_int_rule = field.key in ENTRY_POTENTIALLY_INT_FIELDS - prev_encoding = ( - metadata_enclosing.get(field.key, None) if metadata_enclosing is not None else None + field.value = self._enclose( + field.value, + metadata_enclosing=metadata_enclosing.get(field.key, None), + apply_int_rule=field.key in ENTRY_POTENTIALLY_INT_FIELDS, + replaced_abbr=field.key in metadata_resolving, ) - field.value = self._enclose(field.value, prev_encoding, apply_int_rule=apply_int_rule) return entry # docstr-coverage: inherited @@ -154,5 +184,6 @@ def transform_string(self, string: String, *args, **kwargs) -> String: string.value, 
string.parser_metadata.get(metadata_key), apply_int_rule=STRINGS_CAN_BE_UNESCAPED_INTS, + replaced_abbr=False, ) return string diff --git a/tests/middleware_tests/test_enclosing.py b/tests/middleware_tests/test_enclosing.py index b2f96b8..4852ee6 100644 --- a/tests/middleware_tests/test_enclosing.py +++ b/tests/middleware_tests/test_enclosing.py @@ -197,6 +197,100 @@ def _figure_out_added_enclosing(changed_value, value): return used_enclosing +@pytest.mark.parametrize("metadata_resolving", ["", "journal"]) +@pytest.mark.parametrize("metadata_enclosing", ["{", '"', "no-enclosing", None]) +@pytest.mark.parametrize("default_enclosing", ["{", '"']) +@pytest.mark.parametrize("enclose_ints", [True, False], ids=["enclose_ints", "no_enclose_ints"]) +@pytest.mark.parametrize( + "keep_abbr_string", [True, False], ids=["keep_abbr_string", "no_keep_abbr_string"] +) +@pytest.mark.parametrize("reuse_previous_enclosing", [True, False], ids=["reuse", "no_reuse"]) +@pytest.mark.parametrize( + "value", + [ + # value, is an abbreviation? + ("IEEE_T_PAMI", True), + ('IEEE_T_PAMI # "ieee tpami"', True), + ('IEEE_T_PAMI" # ieee tpami', False), + ('IEEE_T-PAMI # "ieee tpami"', False), + ('IEEE_T-PAMI # "ieee # tpami"', False), + ('IEEE T-PAMI # "ieee tpami"', False), + ], +) +@pytest.mark.parametrize("inplace", [True, False], ids=["inplace", "not_inplace"]) +def test_addition_of_enclosing_on_entry_with_abbr( + value: tuple, + metadata_resolving: str, + keep_abbr_string: bool, + metadata_enclosing: str, + default_enclosing: str, + enclose_ints: bool, + reuse_previous_enclosing: bool, + inplace: bool, +): + """Extensive Matrix-Testing of the AddEnclosingMiddleware on Entries. 
+ + Also covers the internals for other block types (i.e., String), + which thus can be tested more light-weight.""" + # These values do not matter for this unit test, + # but must not change during transformation + # (hence, they are created as variables, not directly in Entry constructor) + value, is_abbr = value + input_entry = Entry( + start_line=5, + entry_type="article", + raw="<--- does not matter for this unit test -->", + key="someKey", + fields=[Field(value=value, start_line=6, key="journal")], + ) + + if metadata_resolving: + input_entry.parser_metadata["ResolveStringReferences"] = [metadata_resolving] + if metadata_enclosing is not None: + input_entry.parser_metadata["removed_enclosing"] = {"journal": metadata_enclosing} + + middleware = AddEnclosingMiddleware( + allow_inplace_modification=inplace, + default_enclosing=default_enclosing, + reuse_previous_enclosing=reuse_previous_enclosing, + enclose_integers=enclose_ints, + keep_abbr_string=keep_abbr_string, + ) + + transformed_library = middleware.transform(library=Library([input_entry])) + + # Assert correct library state + assert len(transformed_library.blocks) == 1 + assert len(transformed_library.entries) == 1 + # Assert correct addition of enclosing + transformed = transformed_library.entries[0] + changed_value = transformed["journal"] + + # Assert correct enclosing was added + if reuse_previous_enclosing and metadata_enclosing is not None: + expected_enclosing = metadata_enclosing + elif (isinstance(value, int) or value.isdigit()) and not enclose_ints: + expected_enclosing = "no-enclosing" + elif not metadata_resolving and keep_abbr_string: + if is_abbr: + expected_enclosing = "no-enclosing" + else: + expected_enclosing = default_enclosing + else: + expected_enclosing = default_enclosing + + if expected_enclosing == "no-enclosing": + _skip_pseudo_enclosing_value(value) + + assert changed_value == middleware._enclose_value(value, expected_enclosing) + + # Assert remaining fields are unchanged + 
assert_nonfield_entry_attributes_unchanged(input_entry, transformed) + + # Assert `allow_inplace_modification` is respected + assert_inplace_is_respected(inplace, input_entry, transformed) + + @pytest.mark.parametrize("metadata_enclosing", ["{", '"', None]) @pytest.mark.parametrize("default_enclosing", ["{", '"']) @pytest.mark.parametrize("enclose_ints", [True, False], ids=["enclose_ints", "no_enclose_ints"])