Skip to content

Commit

Permalink
better retokenize scheme
Browse files Browse the repository at this point in the history
  • Loading branch information
Jemoka committed May 23, 2024
1 parent 80b57c3 commit 39eee45
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 19 deletions.
2 changes: 1 addition & 1 deletion batchalign/formats/chat/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def chat_parse_doc(lines, special_mor=False):
continue
# we split because there are multiple languages possible
elif "@Languages" in line.strip():
results["langs"] = [i.strip() for i in line.strip("@Languages:").strip().split(",")]
results["langs"] = [i.strip() for i in line.strip("@Languages:").strip().replace(" ", ",").strip().split(",") if i.strip() != ""]
if len(results["langs"]) > 0 and results["langs"][0] == "eng" and special_mor:
use_special_mor = True
# parse participants; the number of | delineates the metadata field
Expand Down
46 changes: 46 additions & 0 deletions batchalign/pipelines/morphosyntax/ud.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@


from batchalign.document import *
from batchalign.constants import *
from batchalign.pipelines.base import *
from batchalign.formats.chat.parser import chat_parse_utterance

Expand Down Expand Up @@ -808,7 +809,52 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
ut, end = chat_parse_utterance(" ".join([i.text for i in sents[0].tokens])+" "+ending,
mor, gra,
None, None)
# split the text up into previous chunks
chunks = list(enumerate(doc.content[indx].text.split(" ")))
# filter out everything that could not possibly align
chunks_align = [(i,j) for i,j in chunks
if len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"])
and (len(j) <= 2 or (j[-2] not in "@"))
and j.strip() not in ENDING_PUNCT + MOR_PUNCT + CHAT_IGNORE + ["++"]]
# hollow out anything we are trying to align, and leave everything else
chunks_backplate = [[j]
if not (len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"])
and (len(j) <= 2 or (j[-2] not in "@"))
and j.strip() not in ENDING_PUNCT + MOR_PUNCT + CHAT_IGNORE + ["++"])
else
[]
for i,j in chunks]
# render each into a list
chunks_chars = []
for i,j in chunks_align:
for k in j:
chunks_chars.append(PayloadTarget(k, payload=i))
ud_chars = []
for i,j in enumerate(ut):
for k in j.text:
ud_chars.append(ReferenceTarget(k, payload=i))
# brrr
aligned = align(chunks_chars, ud_chars, tqdm=False)
for i in aligned:
if isinstance(i, Match):
if i.reference_payload not in chunks_backplate[i.payload]:
chunks_backplate[i.payload].append(i.reference_payload)
elif isinstance(i, Extra) and i.extra_type == ExtraType.PAYLOAD:
# just put it back
chunks_backplate[i.payload].append(i.key)
# resolve all the numbers and flatten
chunks_backplate = [j if isinstance(j, str) else ut[j].text
for i in chunks_backplate
for j in i]

retokenized_ut = " ".join(i for i in chunks_backplate if i.strip() not in ["(", ")"])
retokenized_ut = re.sub(r" +", " ", retokenized_ut)
# pray to everyone that it works---this will simply crash and ignore
# the utterance if it didn't work, so we are doing this as a sanity
# check rather than needing the parsed result
_1, _2 = chat_parse_utterance(retokenized_ut, mor, gra, None, None)
doc.content[indx] = Utterance(content=ut,
text=retokenized_ut,
tier=doc.content[indx].tier,
time=doc.content[indx].time,
custom_dependencies=doc.content[indx].custom_dependencies)
Expand Down
4 changes: 2 additions & 2 deletions batchalign/version
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
0.7.1-beta.7
0.7.1-beta.8
May 21st, 2024
insert debug info to transcribe file
better retokenize algorithm
44 changes: 28 additions & 16 deletions scratchpad.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,19 @@
# return leafs

# from batchalign.models import BertUtteranceModel
# tmp = CHATFile(path="./extern/Untitled.cha").doc
# from batchalign.pipelines import BatchalignPipeline
# tmp = CHATFile(path="../talkbank-alignment/test_harness/input/10502.cha").doc
# pipe = BatchalignPipeline.new("morphosyntax", "jpn")
# tmp.langs = ["jpn"]
# # tmp[-1].content
# res = pipe(tmp, retokenize=True)
# print(str(CHATFile(doc=res)))


# tmp[-1].content
# tmp[-1]
# tmp[6]

# tmp
# tmp1 = sue(tmp)
# tmp1
Expand Down Expand Up @@ -97,28 +109,28 @@

########### The Batchalign Individual Engine Harness ###########

text = "We should be friends! Yes we should."
# text = "ice ice cream ice ice cream ice ice cream"
# text = "We should be friends! Yes we should."
# # text = "ice ice cream ice ice cream ice ice cream"

# ice ice cream ice cream
# ice [/] <ice cream> [/] ice cream
# ice cream ice cream ice cream ice ice cream cream
# # ice ice cream ice cream
# # ice [/] <ice cream> [/] ice cream
# # ice cream ice cream ice cream ice ice cream cream

lang = "eng"
# lang = "eng"

# forms, delim = chat_parse_utterance(text, None, None, None, None)
# utterance = Utterance(content=forms, delim=delim)
# ut = Document(content=[utterance], langs=[lang])
# # forms, delim = chat_parse_utterance(text, None, None, None, None)
# # utterance = Utterance(content=forms, delim=delim)
# # ut = Document(content=[utterance], langs=[lang])

doc = Document.new(text, lang=lang)
# doc = Document.new(text, lang=lang)

retrace = StanzaEngine()
pipe = BatchalignPipeline(retrace)
# retrace = StanzaEngine()
# pipe = BatchalignPipeline(retrace)

doc = pipe(doc)
doc
# doc = pipe(doc)
# doc

# # doc[0].content
# # # doc[0].content

# print(str(CHATFile(doc=doc)))

Expand Down

0 comments on commit 39eee45

Please sign in to comment.