Skip to content

Commit

Permalink
Tokenizer rules from the core spacy
Browse files Browse the repository at this point in the history
  • Loading branch information
aajanki committed Jan 25, 2020
1 parent b75b687 commit 4b56412
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 4 deletions.
4 changes: 1 addition & 3 deletions fi/fi.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
from .lemmatizer import FinnishLemmatizer
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from spacy.lang.fi import FinnishDefaults
from spacy.language import Language
from spacy.lookups import Lookups
from spacy.symbols import POS, PUNCT, SYM, ADJ, CCONJ, SCONJ, NUM, DET, ADV
from spacy.symbols import ADP, X, VERB, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX

# Punctuation stolen from Danish
from spacy.lang.da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES


TAG_MAP = {
'Adv': {POS: ADV},
Expand Down
33 changes: 33 additions & 0 deletions fi/punctuation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.char_classes import LIST_ELLIPSES, LIST_ICONS
from spacy.lang.char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.punctuation import TOKENIZER_SUFFIXES


# Quote characters used by the quote/bracket infix rule below.  The plain
# apostrophe is removed from the set, so that rule never matches on it.
_quotes = CONCAT_QUOTES.replace("'", "")

# Infix rules: the shared ellipsis and icon patterns first, then the
# context-sensitive rules.  Each of those is wrapped in a lookbehind and a
# lookahead, so the punctuation only matches between the given character
# classes.
_infixes = LIST_ELLIPSES + LIST_ICONS
_infixes += [
    # A period between an ALPHA_LOWER and an ALPHA_UPPER character.
    r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
    # Comma, exclamation mark or question mark between two letters.
    r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
    # Colon, angle bracket or equals sign between two letters.
    r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
    # Comma between two letters (already covered by the `[,!?]` rule above;
    # kept so the exported list stays unchanged).
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # A quote character or a round/square bracket between two letters.
    r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
    # Double hyphen between two letters.
    r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
    # Colon, angle bracket, equals sign or slash preceded by a letter or an
    # ASCII digit and followed by a letter.
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]

# Apostrophe-based entries that are dropped from the shared suffix rules.
_excluded_suffixes = ("'s", "'S", "’s", "’S", r"\'")

# The suffix rules imported from spacy.lang.punctuation, minus the entries
# listed above.
_suffixes = [
    suffix for suffix in TOKENIZER_SUFFIXES if suffix not in _excluded_suffixes
]


# Public names.  TOKENIZER_SUFFIXES deliberately rebinds the name imported
# from spacy.lang.punctuation to the filtered list.
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
2 changes: 1 addition & 1 deletion tools/package_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ MODEL_NAME=`echo $PACKAGE_DIR | sed 's/-[0-9.]\+$//'`

echo "Copying the lemmatizer sources to the package directory"
mkdir -p models/python-package/"$PACKAGE_DIR/$MODEL_NAME"
cp fi/fi.py fi/lemmatizer.py models/python-package/"$PACKAGE_DIR/$MODEL_NAME"/
cp fi/fi.py fi/lemmatizer.py fi/punctuation.py models/python-package/"$PACKAGE_DIR/$MODEL_NAME"/
cp -r fi/lookups/ models/python-package/"$PACKAGE_DIR/$MODEL_NAME"/

echo "Adding import to __init__.py"
Expand Down

0 comments on commit 4b56412

Please sign in to comment.