Skip to content

Commit

Permalink
better retokenize scheme
Browse files Browse the repository at this point in the history
  • Loading branch information
Jemoka committed May 23, 2024
1 parent 80b57c3 commit 39eee45
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 19 deletions.
2 changes: 1 addition & 1 deletion batchalign/formats/chat/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def chat_parse_doc(lines, special_mor=False):
continue
# we split because there are multiple languages possible
elif "@Languages" in line.strip():
results["langs"] = [i.strip() for i in line.strip("@Languages:").strip().split(",")]
results["langs"] = [i.strip() for i in line.strip("@Languages:").strip().replace(" ", ",").strip().split(",") if i.strip() != ""]
if len(results["langs"]) > 0 and results["langs"][0] == "eng" and special_mor:
use_special_mor = True
# parse participants; the number of | delineates the metadata field
Expand Down
46 changes: 46 additions & 0 deletions batchalign/pipelines/morphosyntax/ud.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@


from batchalign.document import *
from batchalign.constants import *
from batchalign.pipelines.base import *
from batchalign.formats.chat.parser import chat_parse_utterance

Expand Down Expand Up @@ -808,7 +809,52 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
ut, end = chat_parse_utterance(" ".join([i.text for i in sents[0].tokens])+" "+ending,
mor, gra,
None, None)
# split the text up into previous chunks
chunks = list(enumerate(doc.content[indx].text.split(" ")))
# filter out everything that could not possibly align
chunks_align = [(i,j) for i,j in chunks
if len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"])
and (len(j) <= 2 or (j[-2] not in "@"))
and j.strip() not in ENDING_PUNCT + MOR_PUNCT + CHAT_IGNORE + ["++"]]
# hollow out anything we are trying to align, and leave everything else
chunks_backplate = [[j]
if not (len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"])
and (len(j) <= 2 or (j[-2] not in "@"))
and j.strip() not in ENDING_PUNCT + MOR_PUNCT + CHAT_IGNORE + ["++"])
else
[]
for i,j in chunks]
# render each into a list
chunks_chars = []
for i,j in chunks_align:
for k in j:
chunks_chars.append(PayloadTarget(k, payload=i))
ud_chars = []
for i,j in enumerate(ut):
for k in j.text:
ud_chars.append(ReferenceTarget(k, payload=i))
# brrr
aligned = align(chunks_chars, ud_chars, tqdm=False)
for i in aligned:
if isinstance(i, Match):
if i.reference_payload not in chunks_backplate[i.payload]:
chunks_backplate[i.payload].append(i.reference_payload)
elif isinstance(i, Extra) and i.extra_type == ExtraType.PAYLOAD:
# just put it back
chunks_backplate[i.payload].append(i.key)
# resolve all the numbers and flatten
chunks_backplate = [j if isinstance(j, str) else ut[j].text
for i in chunks_backplate
for j in i]

retokenized_ut = " ".join(i for i in chunks_backplate if i.strip() not in ["(", ")"])
retokenized_ut = re.sub(r" +", " ", retokenized_ut)
# pray to everyone that it works---this will simply crash and ignore
# the utterance if it didn't work, so we are doing this as a sanity
# check rather than needing the parsed result
_1, _2 = chat_parse_utterance(retokenized_ut, mor, gra, None, None)
doc.content[indx] = Utterance(content=ut,
text=retokenized_ut,
tier=doc.content[indx].tier,
time=doc.content[indx].time,
custom_dependencies=doc.content[indx].custom_dependencies)
Expand Down
4 changes: 2 additions & 2 deletions batchalign/version
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
0.7.1-beta.7
0.7.1-beta.8
May 21st, 2024
insert debug info to transcribe file
better retokenize algorithm
44 changes: 28 additions & 16 deletions scratchpad.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,19 @@
# return leafs

# from batchalign.models import BertUtteranceModel
# tmp = CHATFile(path="./extern/Untitled.cha").doc
# from batchalign.pipelines import BatchalignPipeline
# tmp = CHATFile(path="../talkbank-alignment/test_harness/input/10502.cha").doc
# pipe = BatchalignPipeline.new("morphosyntax", "jpn")
# tmp.langs = ["jpn"]
# # tmp[-1].content
# res = pipe(tmp, retokenize=True)
# print(str(CHATFile(doc=res)))


# tmp[-1].content
# tmp[-1]
# tmp[6]

# tmp
# tmp1 = sue(tmp)
# tmp1
Expand Down Expand Up @@ -97,28 +109,28 @@

########### The Batchalign Individual Engine Harness ###########

text = "We should be friends! Yes we should."
# text = "ice ice cream ice ice cream ice ice cream"
# text = "We should be friends! Yes we should."
# # text = "ice ice cream ice ice cream ice ice cream"

# ice ice cream ice cream
# ice [/] <ice cream> [/] ice cream
# ice cream ice cream ice cream ice ice cream cream
# # ice ice cream ice cream
# # ice [/] <ice cream> [/] ice cream
# # ice cream ice cream ice cream ice ice cream cream

lang = "eng"
# lang = "eng"

# forms, delim = chat_parse_utterance(text, None, None, None, None)
# utterance = Utterance(content=forms, delim=delim)
# ut = Document(content=[utterance], langs=[lang])
# # forms, delim = chat_parse_utterance(text, None, None, None, None)
# # utterance = Utterance(content=forms, delim=delim)
# # ut = Document(content=[utterance], langs=[lang])

doc = Document.new(text, lang=lang)
# doc = Document.new(text, lang=lang)

retrace = StanzaEngine()
pipe = BatchalignPipeline(retrace)
# retrace = StanzaEngine()
# pipe = BatchalignPipeline(retrace)

doc = pipe(doc)
doc
# doc = pipe(doc)
# doc

# # doc[0].content
# # # doc[0].content

# print(str(CHATFile(doc=doc)))

Expand Down

0 comments on commit 39eee45

Please sign in to comment.