From f9e11b2173f30d85e90ca59bc79db3cbef0526a3 Mon Sep 17 00:00:00 2001 From: Houjun Liu Date: Thu, 25 Jan 2024 11:10:45 -0800 Subject: [PATCH] muting catalan warning --- batchalign/pipelines/morphosyntax/ud.py | 24 ++++++++++++------------ batchalign/version | 6 +++--- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/batchalign/pipelines/morphosyntax/ud.py b/batchalign/pipelines/morphosyntax/ud.py index d619304..9f97092 100644 --- a/batchalign/pipelines/morphosyntax/ud.py +++ b/batchalign/pipelines/morphosyntax/ud.py @@ -37,7 +37,7 @@ from batchalign.document import * from batchalign.pipelines.base import * from batchalign.formats.chat.parser import chat_parse_utterance - + from batchalign.utils.dp import * import logging @@ -60,7 +60,7 @@ def stringify_feats(*feats): # the following is a list of feature-extracting handlers # it is used to extract features from specific parts of -# speech. +# speech. def handler(word, lang=None): """The generic handler""" @@ -76,7 +76,7 @@ def handler(word, lang=None): # unknown flag unknown = False - + # if there is a 0 in front, the word is unkown # so we mark it as such if target[0] == '0': @@ -173,7 +173,7 @@ def handler__ADJ(word, lang=None): person = str(feats.get("Person", 1)) if person == "0": person = '4' - + return handler(word, lang)+stringify_feats(deg, case, number[:1]+person) def handler__NOUN(word, lang=None): @@ -213,7 +213,7 @@ def handler__VERB(word, lang=None): if person == "0": person = '4' number = feats.get("Number", "Sing") - + tense = feats.get("Tense", "") polarity = feats.get("Polarity", "") polite = feats.get("Polite", "") @@ -369,7 +369,7 @@ def parse_sentence(sentence, delimiter=".", special_forms=[], lang="$nospecial$" mor_word = handle(word, lang) # exception: if the word is 0, it is probably 0word # occationally Stanza screws up and makes forms like 0thing as 2 tokens: - # 0 and thing + # 0 and thing if word.text.strip() == "0": mor.append("$ZERO$") num_skipped+=1 # mark skipped if skipped @@ -381,7 +381,7 @@ def parse_sentence(sentence, delimiter=".", special_forms=[], lang="$nospecial$" mor_word = "cm|begin" elif word.text.strip() == '„': mor_word = "cm|end" - + # specivl forms: recall the special form marker is xbxxx if "xbxxx" in word.text.strip(): @@ -499,10 +499,10 @@ def parse_sentence(sentence, delimiter=".", special_forms=[], lang="$nospecial$" return (mor_str, gra_str) def clean_sentence(sent): - """clean a sentence + """clean a sentence Arguments: - sent (string): + sent (string): """ remove = ["+,", "++", "+\""] @@ -632,7 +632,7 @@ def morphoanalyze(doc: Document, status_hook:callable = None): # some languages don't have alpha 2 pass - + # pycountry.languages.get(alpha_3=i).alpha_2 for i in lang config = {"processors": {"tokenize": "default", @@ -647,7 +647,7 @@ def morphoanalyze(doc: Document, status_hook:callable = None): if "zh" in lang: lang.pop(lang.index("zh")) lang.append("zh-hans") - + elif "hr" not in lang and "zh" not in lang and "zh-hans" not in lang and "ja" not in lang and "ko" not in lang: if "en" in lang: config["processors"]["mwt"] = "gum" @@ -757,7 +757,7 @@ def morphoanalyze(doc: Document, status_hook:callable = None): mor, gra = parse_sentence(sents[0], ending, special_forms_cleaned, lang[0]) # breakpoint() - if mor.strip() == "": + if mor.strip() == "" or mor.strip() in ENDING_PUNCT: L.debug(f"Encountered an utterance that's likely devoid of morphological information; skipping... utterance='{doc.content[indx]}'") continue diff --git a/batchalign/version b/batchalign/version index 008d376..a06388a 100644 --- a/batchalign/version +++ b/batchalign/version @@ -1,3 +1,3 @@ -0.4.0-post.1 -Jan 23st, 2024 -Croatian models \ No newline at end of file +0.4.0-post.2 +Jan 25th, 2024 +Muting warning about Catalan \ No newline at end of file