diff --git a/batchalign/formats/chat/parser.py b/batchalign/formats/chat/parser.py index 85280c9..98f1bd6 100644 --- a/batchalign/formats/chat/parser.py +++ b/batchalign/formats/chat/parser.py @@ -222,7 +222,7 @@ def chat_parse_doc(lines, special_mor=False): continue # we split because there are multiple languages possible elif "@Languages" in line.strip(): - results["langs"] = [i.strip() for i in line.strip("@Languages:").strip().split(",")] + results["langs"] = [i.strip() for i in line.strip("@Languages:").strip().replace(" ", ",").strip().split(",") if i.strip() != ""] if len(results["langs"]) > 0 and results["langs"][0] == "eng" and special_mor: use_special_mor = True # parse participants; the number of | delinates the metedata field diff --git a/batchalign/pipelines/morphosyntax/ud.py b/batchalign/pipelines/morphosyntax/ud.py index 1f3b046..f6ffaa8 100644 --- a/batchalign/pipelines/morphosyntax/ud.py +++ b/batchalign/pipelines/morphosyntax/ud.py @@ -37,6 +37,7 @@ from batchalign.document import * +from batchalign.constants import * from batchalign.pipelines.base import * from batchalign.formats.chat.parser import chat_parse_utterance @@ -808,7 +809,52 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, * ut, end = chat_parse_utterance(" ".join([i.text for i in sents[0].tokens])+" "+ending, mor, gra, None, None) + # split the text up into previous chunks + chunks = list(enumerate(doc.content[indx].text.split(" "))) + # filter out everything that could not possibly align + chunks_align = [(i,j) for i,j in chunks + if len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"]) + and (len(j) <= 2 or (j[-2] not in "@")) + and j.strip() not in ENDING_PUNCT + MOR_PUNCT + CHAT_IGNORE + ["++"]] + # hollow out anything we are trying to align, and leave everything else + chunks_backplate = [[j] + if not (len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"]) + and (len(j) <= 2 or (j[-2] not in "@")) + and j.strip() not in ENDING_PUNCT + MOR_PUNCT + CHAT_IGNORE + ["++"]) + else + [] + for i,j in chunks] + # render each into a list + chunks_chars = [] + for i,j in chunks_align: + for k in j: + chunks_chars.append(PayloadTarget(k, payload=i)) + ud_chars = [] + for i,j in enumerate(ut): + for k in j.text: + ud_chars.append(ReferenceTarget(k, payload=i)) + # brrr + aligned = align(chunks_chars, ud_chars, tqdm=False) + for i in aligned: + if isinstance(i, Match): + if i.reference_payload not in chunks_backplate[i.payload]: + chunks_backplate[i.payload].append(i.reference_payload) + elif isinstance(i, Extra) and i.extra_type == ExtraType.PAYLOAD: + # just put it back + chunks_backplate[i.payload].append(i.key) + # resolve all the numbers and flatten + chunks_backplate = [j if isinstance(j, str) else ut[j].text + for i in chunks_backplate + for j in i] + + retokenized_ut = " ".join(i for i in chunks_backplate if i.strip() not in ["(", ")"]) + retokenized_ut = re.sub(r" +", " ", retokenized_ut) + # pray to everyone that it works---this will simply crash and ignore + # the utterance if it didn't work, so we are doing this as a sanity + # check rather than needing the parsed result + _1, _2 = chat_parse_utterance(retokenized_ut, mor, gra, None, None) doc.content[indx] = Utterance(content=ut, + text=retokenized_ut, tier=doc.content[indx].tier, time=doc.content[indx].time, custom_dependencies=doc.content[indx].custom_dependencies) diff --git a/batchalign/version b/batchalign/version index 02a30de..848c4bb 100644 --- a/batchalign/version +++ b/batchalign/version @@ -1,3 +1,3 @@ -0.7.1-beta.7 +0.7.1-beta.8 May 21st, 2024 -insert debug info to transcribe file +better retokenize algorithm diff --git a/scratchpad.py b/scratchpad.py index 43e1ec1..26c7900 100644 --- a/scratchpad.py +++ b/scratchpad.py @@ -60,7 +60,19 @@ # return leafs # from batchalign.models import BertUtteranceModel -# tmp = CHATFile(path="./extern/Untitled.cha").doc +# from batchalign.pipelines import BatchalignPipeline +# tmp = CHATFile(path="../talkbank-alignment/test_harness/input/10502.cha").doc +# pipe = BatchalignPipeline.new("morphosyntax", "jpn") +# tmp.langs = ["jpn"] +# # tmp[-1].content +# res = pipe(tmp, retokenize=True) +# print(str(CHATFile(doc=res))) + + +# tmp[-1].content +# tmp[-1] +# tmp[6] + # tmp # tmp1 = sue(tmp) # tmp1 @@ -97,28 +109,28 @@ ########### The Batchalign Individual Engine Harness ########### -text = "We should be friends! Yes we should." -# text = "ice ice cream ice ice cream ice ice cream" +# text = "We should be friends! Yes we should." +# # text = "ice ice cream ice ice cream ice ice cream" -# ice ice cream ice cream -# ice [/] [/] ice cream -# ice cream ice cream ice cream ice ice cream cream +# # ice ice cream ice cream +# # ice [/] [/] ice cream +# # ice cream ice cream ice cream ice ice cream cream -lang = "eng" +# lang = "eng" -# forms, delim = chat_parse_utterance(text, None, None, None, None) -# utterance = Utterance(content=forms, delim=delim) -# ut = Document(content=[utterance], langs=[lang]) +# # forms, delim = chat_parse_utterance(text, None, None, None, None) +# # utterance = Utterance(content=forms, delim=delim) +# # ut = Document(content=[utterance], langs=[lang]) -doc = Document.new(text, lang=lang) +# doc = Document.new(text, lang=lang) -retrace = StanzaEngine() -pipe = BatchalignPipeline(retrace) +# retrace = StanzaEngine() +# pipe = BatchalignPipeline(retrace) -doc = pipe(doc) -doc +# doc = pipe(doc) +# doc -# # doc[0].content +# # # doc[0].content # print(str(CHATFile(doc=doc)))