Skip to content

Commit

Permalink
[ngrams] Accept aff / dic file directly
Browse files: browse the repository at this point in the history
  • Loading branch information
behdad committed Nov 15, 2023
1 parent b331999 commit ed6d2df
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions ngrams.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -73,14 +73,19 @@ def extract_ngrams_from_file(filename, *kargs, **kwargs):
except FileNotFoundError:
try:
# Assume hunspell dictionary format;
afffile = open(filename + ".aff", "rb")
filename2 = filename
if filename.endswith(".aff"):
filename2 = filename[:-4]
elif filename.endswith(".dic"):
filename2 = filename[:-4]
afffile = open(filename2 + ".aff", "rb")
for line in afffile:
if line.startswith(b"SET"):
kwargs["encoding"] = (
line.replace(b"\t", b" ").split()[1].decode("ascii")
)
break
txtfile = open(filename + ".dic", "rb")
txtfile = open(filename2 + ".dic", "rb")
next(txtfile) # Skip over the num entries line
txtfile = (
s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in txtfile
Expand Down

0 comments on commit ed6d2df

Please sign in to comment.