Skip to content

Commit

Permalink
Fix repetition of content when lexicalising compounds
Browse files Browse the repository at this point in the history
Fixes #76
  • Loading branch information
albbas committed Feb 8, 2025
1 parent 8f20115 commit 776c63a
Showing 1 changed file with 12 additions and 3 deletions.
15 changes: 12 additions & 3 deletions scripts/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,10 +283,16 @@ def analyse_expressions(fst: Path, lines: Iterable[str]) -> list[str]:
]


def get_longest_cmp_stem(analyses: list[str]) -> str:
def get_longest_cmp_stem(suffix: str, analyses: list[str]) -> str:
"""Get the longest last compound stem from a list of analyses."""
for analysis in analyses:
logging.debug(f"{analysis=}")
return max(
[analysis.split("#")[-1].split("+")[0] for analysis in analyses],
[
analysis.split("#")[-1].split("+")[0]
for analysis in analyses
if analysis.split("#")[-1].split("+")[0].endswith(suffix)
],
key=len,
)

Expand All @@ -306,7 +312,9 @@ def lexicalise_compound(
Returns:
An iterator of lexicalised lexc entries.
"""
longest_last_stem = get_longest_cmp_stem(analyses)
longest_last_stem = get_longest_cmp_stem(
suffix=unlexicalised_compound_stem[-1], analyses=analyses
)

if longest_last_stem not in lexc_dict:
raise ValueError(f"Longest stem {longest_last_stem} not found in lexc")
Expand All @@ -316,6 +324,7 @@ def lexicalise_compound(
]
matching_lexc_entries = lexc_dict.get(longest_last_stem, [])

logging.debug(f"{prefix=} {unlexicalised_compound_stem=} {longest_last_stem=}")
return (
LexcEntry(
stem=f"{prefix}{longest_last_stem}",
Expand Down

0 comments on commit 776c63a

Please sign in to comment.