Skip to content

Commit

Permalink
Moved most declarations to constructor for efficiency
Browse files Browse the repository at this point in the history
andreihar committed May 11, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 0112aaf commit ac4cb22
Showing 1 changed file with 74 additions and 118 deletions.
192 changes: 74 additions & 118 deletions taibun/taibun.py
Original file line number Diff line number Diff line change
@@ -86,12 +86,68 @@ class Converter(object):

def __init__(self, system='Tailo', dialect='south', format='mark', delimiter=DEFAULT_DELIMITER, sandhi=DEFAULT_SANDHI, punctuation='format', convert_non_cjk=False):
    """Store the conversion options and precompute the lookup tables.

    system: transliteration system name, stored lower-cased; 'tailo', 'poj',
        'zhuyin', 'tlpa', 'pingyim', 'tongiong' and 'ipa' have a converter.
    dialect: dialect name, stored lower-cased; 'north' selects the
        alternative tables, any other value the default ones.
    format: output format, stored as given ('number' and 'strip' are
        handled specially by the conversion helpers).
    delimiter: string that replaces '-' between syllables; when left at
        DEFAULT_DELIMITER it is chosen by __set_default_delimiter.
    sandhi: tone sandhi mode; when left at DEFAULT_SANDHI it is chosen by
        __set_default_sandhi.
    punctuation: stored as given.
    convert_non_cjk: stored as given; checked when a word is not found in
        the dictionary.
    """
    # system must be set first: the default helpers below read it.
    self.system = system.lower()
    self.dialect = dialect.lower()
    self.format = format

    if delimiter != self.DEFAULT_DELIMITER:
        self.delimiter = delimiter
    else:
        self.delimiter = self.__set_default_delimiter()

    if sandhi != self.DEFAULT_SANDHI:
        self.sandhi = sandhi
    else:
        self.sandhi = self.__set_default_sandhi()

    self.punctuation = punctuation
    self.convert_non_cjk = convert_non_cjk

    # Build the per-instance tables last, once every option is in place.
    self.__declarations(self.dialect)


# Helper called from __init__ to build everything the conversion helpers look up:
# the conversion function for self.system, that system's replacement, tone
# placement and tone tables, the dialect-resolved dictionary and the tone
# sandhi tables.
# dialect is the lower-cased dialect name passed by __init__; 'north' selects
# the northern IPA values, the part after '/' in dictionary entries and the
# 5 -> 3 sandhi rule.
# NOTE(review): indentation is missing in this listing, so which statements sit
# inside which `if` is inferred from context - confirm against the real file.
def __declarations(self, dialect):
# Returns the given patterns followed by their capitalised copies
def placements(elements):
return elements + [s.capitalize() for s in elements]
# Returns the mapping plus a capitalised copy of every key/value pair
def converts(dictionary):
return {**dictionary, **{k.capitalize(): v.capitalize() for k, v in dictionary.items()}}

# Conversion
# Resolve the converter for self.system up front; an unrecognised system
# falls back to returning word[0] unchanged
self.conversion_func = {
'poj': self.__tailo_to_poj,
'zhuyin': self.__tailo_to_zhuyin,
'tlpa': self.__tailo_to_tlpa,
'pingyim': self.__tailo_to_pingyim,
'tongiong': self.__tailo_to_ti,
'ipa': self.__tailo_to_ipa,
'tailo': self.__tailo_to_tailo
}.get(self.system, lambda word: word[0])

# Per-system tables. Each branch sets only some of self.convert,
# self.placement and self.tones (e.g. 'tailo' sets no self.convert and
# 'tlpa' sets only self.convert).
# self.tones is indexed by tone number in the zhuyin and IPA helpers
# (self.tones[int(t)]); the other helpers pass it to __get_mark_tone.
if self.system == 'tailo':
self.placement = placements(['ia'+self.tt+'u', 'ua'+self.tt+'i', 'ua'+self.tt, 'ue'+self.tt, 'ui'+self.tt, 'a'+self.tt+'i', 'a'+self.tt+'u', 'o'+self.tt+'o','ia'+self.tt, 'iu'+self.tt, 'io'+self.tt, 'o'+self.tt+'o', 'a'+self.tt, 'o'+self.tt, 'e'+self.tt, 'i'+self.tt, 'u'+self.tt, 'n'+self.tt+'g', 'm'+self.tt])
self.tones = ["", "", "́", "̀", "", "̂", "̌", "̄", "̍", "̋"]
if self.system == 'poj':
self.convert = converts({'nng':'nng', 'nnh':'hⁿ', 'nn':'ⁿ', 'ts':'ch', 'ing':'eng', 'uai':'oai', 'uan':'oan', 'ik':'ek', 'ua':'oa', 'ue':'oe', 'oo':'o͘'})
self.placement = placements(['oa'+self.tt+'h', 'oa'+self.tt+'n', 'oa'+self.tt+'ng', 'oa'+self.tt+'ⁿ', 'oa'+self.tt+'t', 'ia'+self.tt+'u', 'oe'+self.tt+'h', 'o'+self.tt+'e', 'oa'+self.tt+'i', 'u'+self.tt+'i', 'o'+self.tt+'a', 'a'+self.tt+'i', 'a'+self.tt+'u', 'ia'+self.tt, 'iu'+self.tt, 'io'+self.tt, 'a'+self.tt, 'o'+self.tt, 'o͘'+self.tt, 'e'+self.tt, 'i'+self.tt, 'u'+self.tt, 'n'+self.tt+'g', 'm'+self.tt])
self.tones = ['', '', '́', '̀', '', '̂', '', '̄', '̍', '']
if self.system == 'zhuyin':
self.convert = {'p4':'ㆴ4', 'p8':'ㆴ8', 'k4':'ㆶ4', 'k8':'ㆶ8', 't4':'ㆵ4', 't8':'ㆵ8', 'h4':'ㆷ4', 'h8':'ㆷ8', 'h0': '0','tshing':'ㄑㄧㄥ', 'tshinn':'ㄑㆪ', 'phing':'ㄆㄧㄥ', 'phinn':'ㄆㆪ', 'tsing':'ㄐㄧㄥ', 'tsinn':'ㄐㆪ','ainn':'ㆮ', 'aunn':'ㆯ', 'giok':'ㆣㄧㄜㆶ', 'ngai':'ㄫㄞ', 'ngau':'ㄫㄠ', 'ngoo':'ㄫㆦ', 'ping':'ㄅㄧㄥ','pinn':'ㄅㆪ', 'senn':'ㄙㆥ', 'sing':'ㄒㄧㄥ', 'sinn':'ㄒㆪ', 'tshi':'ㄑㄧ','ang':'ㄤ', 'ann':'ㆩ', 'enn':'ㆥ', 'ing':'ㄧㄥ', 'inn':'ㆪ', 'mai':'ㄇㄞ', 'mau':'ㄇㄠ', 'mng':'ㄇㆭ','moo':'ㄇㆦ', 'mua':'ㄇㄨㄚ', 'mue':'ㄇㄨㆤ', 'mui':'ㄇㄨㄧ', 'nga':'ㄫㄚ', 'nge':'ㄫㆤ', 'ngi':'ㄫㄧ','ong':'ㆲ', 'onn':'ㆧ', 'tsh':'ㄘ', 'tsi':'ㄐㄧ', 'unn':'ㆫ','ai':'ㄞ', 'am':'ㆰ', 'an':'ㄢ', 'au':'ㄠ', 'ji':'ㆢㄧ', 'kh':'ㄎ', 'ma':'ㄇㄚ', 'me':'ㄇㆤ', 'mi':'ㄇㄧ','ng':'ㆭ', 'ok':'ㆦㆶ', 'om':'ㆱ', 'oo':'ㆦ', 'ph':'ㄆ', 'si':'ㄒㄧ', 'th':'ㄊ', 'ts':'ㄗ','a':'ㄚ', 'b':'ㆠ', 'e':'ㆤ', 'g':'ㆣ', 'h':'ㄏ', 'i':'ㄧ', 'j':'ㆡ', 'k':'ㄍ', 'l':'ㄌ', 'm':'ㆬ','n':'ㄋ', 'o':'ㄜ', 'p':'ㄅ', 's':'ㄙ', 't':'ㄉ', 'u':'ㄨ'}
self.tones = ['', '', 'ˋ', '˪', '', 'ˊ', '', '˫', '˙']
if self.system == 'tlpa':
self.convert = converts({'tsh':'ch', 'ts':'c'})
if self.system == 'pingyim':
self.convert = converts({'p4':'p4', 't4':'t4', 'k4':'k4', 'h4':'h4', 'p8':'p8', 't8':'t8', 'k8':'k8', 'h8':'h8','ainn':'nai', 'iunn':'niu', 'ann':'na', 'onn':'noo', 'enn':'ne','inn':'ni', 'unn':'nu', 'au':'ao', 'ph':'p', 'nng':'lng', 'tsh':'c','ng':'ggn', 'ts':'z', 'th':'t', 'kh':'k', 'ir':'i', 'p':'b', 'b':'bb','t':'d', 'k':'g', 'g':'gg', 'j':'zz', 'n':'ln', 'm':'bbn'})
self.placement = placements(['ua'+self.tt+'i', 'ia'+self.tt+'o', 'a'+self.tt+'i', 'a'+self.tt+'o', 'oo'+self.tt, 'ia'+self.tt, 'iu'+self.tt, 'io'+self.tt, 'ua'+self.tt, 'ue'+self.tt, 'ui'+self.tt,'a'+self.tt, 'o'+self.tt, 'e'+self.tt, 'i'+self.tt, 'u'+self.tt, 'n'+self.tt+'g', 'm'+self.tt, 'n'+self.tt])
self.tones = ['', '̄', '̌', '̀', '̄', '́', '', '̂', '́', '']
if self.system == 'tongiong':
self.convert = converts({'p4':'p4', 't4':'t4', 'k4':'k4', 'h4':'h4', 'p8':'p8', 't8':'t8', 'k8':'k8', 'h8':'h8','oo':'o', 'om':'om', 'ong':'ong', 'ir':'i', 'tsh':'c','ts':'z', 'nng':'nng', 'ng':'ng', 'g':'gh', 'kh':'k', 'k':'g','ph':'p', 'p':'b', 'b':'bh', 'th':'t', 't':'d', 'j':'r'})
self.placement = placements(['ua'+self.tt+'i', 'ia'+self.tt+'o', 'a'+self.tt+'i', 'a'+self.tt+'o', 'oo'+self.tt, 'ia'+self.tt, 'iu'+self.tt, 'io'+self.tt, 'ua'+self.tt, 'ue'+self.tt, 'ui'+self.tt,'a'+self.tt, 'o'+self.tt, 'e'+self.tt, 'i'+self.tt, 'u'+self.tt, 'n'+self.tt+'g', 'm'+self.tt])
self.tones = ["̊", "", "̀", "̂", "̄", "̆", "", "̄", "", "́"]
if self.system == 'ipa':
convert = {'tsing':'tɕiɪŋ','jiang':'dʑiaŋ','tshing':'tɕʰiɪŋ','tsik':'tɕiɪk','tshik':'tɕʰiɪk','jian':'dʑiɛn','jiat':'dʑiɛt','tshi':'tɕʰi','iann':'iã','ainn':'ãi','iang':'iaŋ','nng':'nŋ','mia':'miã','mui':'muĩ','mue':'muẽ','mua':'muã','ma':'mã','me':'mẽ','mi':'mĩ','moo':'mɔ̃','nia':'niã','nua':'nuã','na':'nã','ne':'nẽ','ni':'nĩ','noo':'nɔ̃','ngia':'ŋiã','ngiu':'ŋiũ','nga':'ŋã','nge':'ŋẽ','ngi':'ŋĩ','ngoo':'ŋɔ̃','ing':'iɪŋ','tsh':'tsʰ','tsi':'tɕi','ian':'iɛn','iat':'iɛt','onn':'ɔ̃','ong':'ɔŋ','ik':'iɪk','ji':'dʑi','kh':'kʰ','ng':'ŋ','oo':'ɔ','nn':'̃','hm':'hm̩','ph':'pʰ','th':'tʰ','ok':'ɔk','om':'ɔm','j':'dz','o':'ə'}
# The northern dialect keeps 'o' as 'o' instead of the default 'ə'
if dialect == 'north':
convert.update({'o':'o'})
self.convert = converts(convert)
# Second replacement table; __tailo_to_ipa applies it after self.convert
self.convert2 = converts({'p4':'p̚4','p8':'p̚8','k4':'k̚4','k8':'k̚8','t4':'t̚4','t8':'t̚8','h4':'ʔ4','h8':'ʔ8','si':'ɕi','h0':'0'})
# The tone strings differ between 'north' and every other dialect
self.tones = ['', '⁴⁴', '⁵³', '¹¹', '²¹', '²⁵', '', '²²', '⁵'] if dialect != 'north' else ['', '⁵⁵', '⁵¹', '²¹', '³²', '²⁴', '', '³³', '⁴']

# Dialect
# Dictionary values of the form 'a/b' are reduced to one alternative: the
# part after '/' for 'north', the part before it otherwise; values without
# '/' are kept as they are
self.word_dict = {k: (v.split('/')[1] if dialect == 'north' else v.split('/')[0]) if '/' in v else v for k, v in word_dict.items()}

# Sandhi
# Tables read by __tone_sandhi; they are only built when sandhi is 'auto',
# 'exc_last' or 'incl_last'
if self.sandhi in ['auto', 'exc_last', 'incl_last']:
self.sandhi_conversion = {'1':'7', '7':'3', '3':'2', '2':'1', '5':'7', 'p4':'p8', 't4':'t8', 'k4':'k8', 'h4':'2', 'p8':'p4', 't8':'t4', 'k8':'k4', 'h8':'3'}
if dialect == 'north':
self.sandhi_conversion.update({'5':'3'})
# Table __tone_sandhi applies to the second-to-last word when last == 'a suff'
self.a_sandhi = {'1':'7', '2':'1', '3':'1', '5':'7', 'p4':'p8', 't4':'t8', 'k4':'k8', 'h4':'1', 'p8':'p4', 't8':'t4', 'k8':'k4', 'h8':'7'}


### Interface functions
@@ -109,14 +165,11 @@ def get(self, input):

# Helper to convert separate words
def __convert_tokenised(self, word):
if word[0] in word_dict:
word = (word_dict[word[0]],) + word[1:]
if "/" in word[0]:
dialect_part = word[0].split("/")[1] if self.dialect == 'north' else word[0].split("/")[0]
word = (dialect_part,) + word[1:]
if word[0] in self.word_dict:
word = (self.word_dict[word[0]],) + word[1:]
elif not self.convert_non_cjk or word[0] in ".,!?\"#$%&()*+/:;<=>@[\\]^`{|}~\t。.,、!?;:()[]【】「」“”":
return word[0]
word = self.__system_conversion(word).replace('---', '--')
word = self.conversion_func(word).replace('---', '--')
if self.format == 'number' and self.system in ['tailo', 'poj']:
word = self.__mark_to_number(word)
if self.format == 'strip':
@@ -130,18 +183,6 @@ def __convert_tokenised(self, word):
return word.replace('--', self.suffix_token).replace('-', self.delimiter).replace(self.suffix_token, '--')


# Helper switch for converting 漢字 based on defined transliteration system
def __system_conversion(self, word):
    """Convert a tokenised word with the helper that matches self.system.

    Falls back to the Tai-lo helper when the system has no dedicated helper
    and tone sandhi is enabled ('auto', 'exc_last' or 'incl_last');
    otherwise returns word[0] unchanged.
    """
    dispatch = {
        'poj': self.__tailo_to_poj,
        'zhuyin': self.__tailo_to_zhuyin,
        'tlpa': self.__tailo_to_tlpa,
        'pingyim': self.__tailo_to_pingyim,
        'tongiong': self.__tailo_to_ti,
        'ipa': self.__tailo_to_ipa,
    }
    handler = dispatch.get(self.system)
    if handler is not None:
        return handler(word)
    # No dedicated helper: Tai-lo is only re-rendered when sandhi is applied.
    if self.sandhi in ['auto', 'exc_last', 'incl_last']:
        return self.__tailo_to_tailo(word)
    return word[0]


# Helper functions to set the delimiter according to the transliteration system if it wasn't explicitly defined by the user
def __set_default_delimiter(self):
if self.system == 'tlpa' or self.system == 'zhuyin' or self.system == 'ipa': return ' '
@@ -219,18 +260,14 @@ def __get_mark_tone(self, input, placement, tones):

# Helper to apply tone sandhi to a word
def __tone_sandhi(self, words, last):
    """Apply tone sandhi to a sequence of words and return the new list.

    Uses the tables built by __declarations (self.sandhi_conversion and
    self.a_sandhi) instead of rebuilding them on every call.

    words: sequence of words to process; each is passed to
        __replacement_tool together with a sandhi table.
    last: controls the tail of the sequence.
        - falsy: every word but the last is changed; the last is kept.
        - 'a suff' (with more than one word): every word but the last two
          is changed with self.sandhi_conversion, the second-to-last with
          self.a_sandhi, and the last is kept.
        - any other truthy value: every word is changed.
    """
    # Indices of the words that go through the regular sandhi table.
    indices = (
        range(len(words) - 2) if last == 'a suff' and len(words) > 1
        else range(len(words) - 1) if not last
        else range(len(words))
    )
    sandhi_words = [self.__replacement_tool(self.sandhi_conversion, words[i]) for i in indices]
    if last == 'a suff' and len(words) > 1:
        # The word right before the suffix has its own table.
        sandhi_words.append(self.__replacement_tool(self.a_sandhi, words[-2]))
    if not last or last == 'a suff':
        # The final word is carried over unchanged.
        sandhi_words.append(words[-1])
    return sandhi_words
@@ -264,93 +301,45 @@ def __tone_sandhi_position(self, input):
### Tai-lo to other transliteration systems converting

# Helper to convert syllable from Tai-lo to Tai-lo
# (selected through self.conversion_func whenever the system is 'tailo')
def __tailo_to_tailo(self, input):
    """Re-render a Tai-lo word from its number-tone syllables.

    input: the tokenised word as handed to self.conversion_func; it is
        passed unchanged to __get_number_tones.

    Each syllable returned by __get_number_tones is given its tone mark
    using the precomputed self.placement and self.tones, the syllables are
    joined with '-', and self.suffix_token is replaced by '--'.
    """
    syllables = self.__get_number_tones(input)
    marked = '-'.join(self.__get_mark_tone(nt, self.placement, self.tones) for nt in syllables)
    return marked.replace(self.suffix_token, '--')


# Helper to convert syllable from Tai-lo to POJ
def __tailo_to_poj(self, input):
    """Convert a Tai-lo word to POJ.

    input: the tokenised word as handed to self.conversion_func; it is
        passed unchanged to __get_number_tones.

    Each number-tone syllable is first rewritten with the precomputed
    self.convert table, then given its tone mark using self.placement and
    self.tones. Syllables are joined with '-', and self.suffix_token is
    replaced by '--'.
    """
    number_tones = self.__get_number_tones(input)
    input = '-'.join(
        self.__get_mark_tone(self.__replacement_tool(self.convert, nt), self.placement, self.tones)
        for nt in number_tones
    )
    return input.replace(self.suffix_token, '--')


# Helper to convert syllable from Tai-lo to 方音符號 (zhuyin)
def __tailo_to_zhuyin(self, input):
    """Convert a Tai-lo word to zhuyin symbols.

    input: indexable whose first item is the Tai-lo text (lower-cased here
        before conversion) and whose second item is passed through to
        __get_number_tones unchanged.

    Uses the precomputed self.convert and self.tones tables. Unless
    self.format is 'number', every numeric character of a converted
    syllable is replaced by self.tones[int(character)]. Syllables are
    joined with '-', and self.suffix_token is removed.
    """
    output = []
    for nt in self.__get_number_tones((input[0].lower(), input[1])):
        nt = self.__replacement_tool(self.convert, nt).replace(self.suffix_token, '')
        # A 'ㄋ' directly before the final character is written 'ㄣ'.
        if len(nt) > 2 and nt[-2] == 'ㄋ':
            nt = nt[:-2] + 'ㄣ' + nt[-1]
        if self.format != 'number':
            nt = ''.join(self.tones[int(t)] if t.isnumeric() else t for t in nt)
        output.append(nt)
    return '-'.join(output).replace(self.suffix_token, '')


# Helper to convert syllable from Tai-lo to TLPA
def __tailo_to_tlpa(self, input):
    """Convert a Tai-lo word to TLPA.

    input: the tokenised word as handed to self.conversion_func; it is
        passed unchanged to __get_number_tones.

    Each number-tone syllable is rewritten with the precomputed
    self.convert table. Syllables are joined with '-', and
    self.suffix_token is removed.
    """
    syllables = self.__get_number_tones(input)
    converted = '-'.join(self.__replacement_tool(self.convert, nt) for nt in syllables)
    return converted.replace(self.suffix_token, '')


# Helper to convert syllable from Tai-lo to Bbanlam pingyim
def __tailo_to_pingyim(self, input):
placement = [
'ua'+self.tt+'i', 'ia'+self.tt+'o', 'a'+self.tt+'i', 'a'+self.tt+'o',
'oo'+self.tt, 'ia'+self.tt, 'iu'+self.tt, 'io'+self.tt, 'ua'+self.tt, 'ue'+self.tt, 'ui'+self.tt,
'a'+self.tt, 'o'+self.tt, 'e'+self.tt, 'i'+self.tt, 'u'+self.tt, 'n'+self.tt+'g', 'm'+self.tt, 'n'+self.tt
]
# plosives don't change, ptkh 4/8 -> ptkh 4/8
convert = {
'p4':'p4', 't4':'t4', 'k4':'k4', 'h4':'h4', 'p8':'p8', 't8':'t8', 'k8':'k8', 'h8':'h8',
'ainn':'nai', 'iunn':'niu', 'ann':'na', 'onn':'noo', 'enn':'ne',
'inn':'ni', 'unn':'nu', 'au':'ao', 'ph':'p', 'nng':'lng', 'tsh':'c',
'ng':'ggn', 'ts':'z', 'th':'t', 'kh':'k', 'ir':'i', 'p':'b', 'b':'bb',
't':'d', 'k':'g', 'g':'gg', 'j':'zz', 'n':'ln', 'm':'bbn'}
tones = ['', '̄', '̌', '̀', '̄', '́', '', '̂', '́', '']
placement += [s.capitalize() for s in placement]
convert.update({k.capitalize(): v.capitalize() for k, v in convert.items()})
output = []
for nt in self.__get_number_tones(input):
replaced = self.__replacement_tool(convert, nt)
replaced = self.__replacement_tool(self.convert, nt)
if replaced[0] in ['i', 'I']: # Initial i
replaced = ('Y' if replaced[0] == 'I' else 'y') + (replaced[1:] if replaced[1] in ['a', 'u', 'o'] else replaced.lower())
if replaced[0] in ['u', 'U']: # Initial u
@@ -367,61 +356,29 @@ def __tailo_to_pingyim(self, input):
if replaced[-3:-1] == 'ln': # Final n
replaced = replaced[:-3] + 'n' + replaced[-1]
if self.format != 'number':
output.append(self.__get_mark_tone(replaced, placement, tones))
output.append(self.__get_mark_tone(replaced, self.placement, self.tones))
else:
output.append(replaced)
return '-'.join(output).replace(self.suffix_token, '')


# Helper to convert syllable from Tai-lo to Tong-iong ping-im
# Not enough information on tone mark placement
def __tailo_to_ti(self, input):
    """Convert a Tai-lo word to Tong-iong ping-im.

    input: the tokenised word as handed to self.conversion_func; it is
        passed unchanged to __get_number_tones.

    Each number-tone syllable is rewritten with the precomputed
    self.convert table. Unless self.format is 'number', it is then given
    its tone mark using self.placement and self.tones. Syllables are joined
    with '-', and self.suffix_token is replaced by '--'.
    """
    # A syllable whose character before the final one is 'o' gets 'or'.
    number_tones = [nt[:-2] + 'or' + nt[-1] if nt[-2] == 'o' else nt for nt in self.__get_number_tones(input)]
    input = '-'.join(
        self.__get_mark_tone(self.__replacement_tool(self.convert, nt), self.placement, self.tones)
        if self.format != 'number'
        else self.__replacement_tool(self.convert, nt)
        for nt in number_tones
    )
    return input.replace(self.suffix_token, '--')


# Helper to convert syllable from Tai-lo to International Phonetic Alphabet
def __tailo_to_ipa(self, input):
convert = {
'tsing':'tɕiɪŋ','jiang':'dʑiaŋ','tshing':'tɕʰiɪŋ','tsik':'tɕiɪk','tshik':'tɕʰiɪk',
'jian':'dʑiɛn','jiat':'dʑiɛt','tshi':'tɕʰi',
'iann':'iã','ainn':'ãi','iang':'iaŋ','nng':'nŋ',
'mia':'miã','mui':'muĩ','mue':'muẽ','mua':'muã','ma':'mã','me':'mẽ','mi':'mĩ','moo':'mɔ̃', # m nasalisation
'nia':'niã','nua':'nuã','na':'nã','ne':'nẽ','ni':'nĩ','noo':'nɔ̃', # n nasalisation
'ngia':'ŋiã','ngiu':'ŋiũ','nga':'ŋã','nge':'ŋẽ','ngi':'ŋĩ','ngoo':'ŋɔ̃', # ng nasalisation
'ing':'iɪŋ','tsh':'tsʰ','tsi':'tɕi','ian':'iɛn','iat':'iɛt','onn':'ɔ̃',
'ong':'ɔŋ','ik':'iɪk','ji':'dʑi','kh':'kʰ','ng':'ŋ','oo':'ɔ','nn':'̃',
'hm':'hm̩','ph':'pʰ','th':'tʰ','ok':'ɔk','om':'ɔm','j':'dz','o':'ə'}
if self.dialect == 'north':
convert.update({'o':'o'})
convert2 = {
'p4':'p̚4','p8':'p̚8','k4':'k̚4','k8':'k̚8','t4':'t̚4','t8':'t̚8','h4':'ʔ4','h8':'ʔ8','si':'ɕi','h0':'0'}
tones = ['', '⁴⁴', '⁵³', '¹¹', '²¹', '²⁵', '', '²²', '⁵'] if self.dialect != 'north' else ['', '⁵⁵', '⁵¹', '²¹', '³²', '²⁴', '', '³³', '⁴']
convert.update({k.capitalize(): v.capitalize() for k, v in convert.items()})
convert2.update({k.capitalize(): v.capitalize() for k, v in convert2.items()})
output = []
for nt in self.__get_number_tones((input[0], input[1])):
nt = self.__replacement_tool(convert, nt).replace(self.suffix_token, '')
nt = self.__replacement_tool(self.convert, nt).replace(self.suffix_token, '')
if 'ŋ' in nt:
if len(nt) > 2:
if all(c.lower() not in 'aeioɔu' for c in nt[:nt.index('ŋ')]) and nt.index('ŋ') != 0:
@@ -430,9 +387,9 @@ def __tailo_to_ipa(self, input):
nt = nt.replace('ŋ', 'ŋ̍')
if len(nt) == 2 and nt[0] == 'm':
nt = 'm̩' + nt[-1]
nt = self.__replacement_tool(convert2, nt)
nt = self.__replacement_tool(self.convert2, nt)
if self.format != 'number':
nt = ''.join(tones[int(t)] if t.isnumeric() else t for t in nt)
nt = ''.join(self.tones[int(t)] if t.isnumeric() else t for t in nt)
output.append(unicodedata.normalize('NFC', nt))
return '-'.join(output).replace(self.suffix_token, '')

@@ -467,7 +424,6 @@ def __format_punctuation_cjk(self, input):

# Helper to capitalise text according to punctuation
def __format_text(self, input):
# punc_filter = re.compile("([.!?]\s*)")
punc_filter = re.compile(r"([.!?]\s*)")
split_with_punc = punc_filter.split(input)
split_with_punc = [i[0].upper() + i[1:] if len(i) > 1 else i for i in split_with_punc]

0 comments on commit ac4cb22

Please sign in to comment.