Skip to content

Commit

Permalink
Added tests for new to_traditional, to_simplified, Tokeniser
Browse files Browse the repository at this point in the history
  • Loading branch information
andreihar committed May 10, 2024
1 parent 2326603 commit e1af27e
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 22 deletions.
38 changes: 19 additions & 19 deletions taibun/data/words.json
Original file line number Diff line number Diff line change
Expand Up @@ -13364,7 +13364,7 @@
"雞籠蜂": "ke-lang-phang/kue-lang-phang",
"加了工": "ke-liáu-kang",
"加囉嗦": "ke-lo-so",
"街裡": "ke-lí-lí/kue-lí-lí",
"街裡": "ke-lí/kue-lí",
"雞鱸": "ke-lôo/kue-lôo",
"家內": "ke-lāi",
"家內穡": "ke-lāi-sit",
Expand Down Expand Up @@ -13866,7 +13866,7 @@
"空講": "khang-kóng",
"空股": "khang-kóo",
"空拳": "khang-kûn",
"空裡": "khang-lí-lí",
"空裡": "khang-lí",
"空閬閬": "khang-lòng-lòng",
"空勞": "khang-lô",
"空囉嗦": "khang-lô-so",
Expand Down Expand Up @@ -23254,9 +23254,9 @@
"履歷": "lí-li̍k",
"履歷表": "lí-li̍k-pió",
"理落": "lí-lo̍h",
"裡": "lí-lí",
"裡": "lí",
"里里擴擴": "lí-lí-khok-khok",
"裡外": "lí-lí-lí",
"裡外": "lí-guā",
"李老君": "lí-ló-kun",
"女郎": "lí-lông/lú-lông",
"女奴": "lí-lôo/lú-lôo",
Expand Down Expand Up @@ -23937,7 +23937,7 @@
"內陸": "lāi-lio̍k",
"內亂": "lāi-luān",
"內里": "lāi-lí",
"內裡": "lāi-lí-lí",
"內裡": "lāi-lí",
"內籬仔": "lāi-lî-á",
"內門": "lāi-mn̂g",
"內媽": "lāi-má",
Expand Down Expand Up @@ -24728,7 +24728,7 @@
"麻稿": "muâ-kó",
"麻糊": "muâ-kôo",
"麻栗樹": "muâ-li̍k-tshiū",
"蔴潁": "muâ-muâ-muâ",
"蔴潁": "muâ-ínn",
"麻芳範": "muâ-phang-huān",
"麻芳斜": "muâ-phang-tshua̍h",
"瞞騙": "muâ-phiàn",
Expand Down Expand Up @@ -29079,7 +29079,7 @@
"婊囝": "piáu-kiánn",
"表哥": "piáu-ko",
"表決": "piáu-kuat",
"表裡": "piáu-lí-lí",
"表裡": "piáu-lí",
"表露": "piáu-lōo",
"表名": "piáu-miâ",
"表妹": "piáu-muē/piáu-bē",
Expand Down Expand Up @@ -30827,7 +30827,7 @@
"布料": "pòo-liāu",
"布攄仔": "pòo-lu-á",
"佈雷": "pòo-luî",
"布裡": "pòo-lí-lí",
"布裡": "pòo-lí",
"布里斯瑪": "pòo-lí-su-má",
"布里斯托": "pòo-lí-su-thuh",
"布籬仔": "pòo-lî-á",
Expand Down Expand Up @@ -35022,7 +35022,7 @@
"山欖": "suann-lám",
"山荖葉": "suann-láu-hio̍h",
"山藍": "suann-lâm",
"山裡花": "suann-lí-lí-lí",
"山裡花": "suann-lí-hue",
"山龍": "suann-lîng",
"山龍眼": "suann-lîng-gíng",
"山內": "suann-lāi",
Expand Down Expand Up @@ -39485,7 +39485,7 @@
"套口供": "thò-kháu-king",
"套供": "thò-king",
"套劇": "thò-kio̍k",
"套裡": "thò-lí-lí",
"套裡": "thò-lí",
"套牢": "thò-lô",
"套利": "thò-lī",
"套袍": "thò-phâu",
Expand Down Expand Up @@ -47562,8 +47562,8 @@
"妝閣": "tsng-koh",
"莊官": "tsng-kuann",
"裝奇做怪": "tsng-kî-tsò-kuài/tsng-kî-tsuè-kuài",
"莊裡": "tsng-lí-lí",
"莊裡人": "tsng-lí-lí-lí",
"莊裡": "tsng-lí",
"莊裡人": "tsng-lí-lâng",
"裝滿": "tsng-muá",
"裝配": "tsng-phuè/tsng-phè",
"裝盤": "tsng-puânn",
Expand Down Expand Up @@ -51678,7 +51678,7 @@
"底稿": "té-kó/tué-kó",
"短裙": "té-kûn",
"短理": "té-lí",
"底裡": "té-lí-lí/tué-lí-lí",
"底裡": "té-lí/tué-lí",
"短路": "té-lōo",
"短命": "té-miā",
"短波": "té-pho",
Expand Down Expand Up @@ -56000,8 +56000,8 @@
"南州鄉": "Lâm-tsiu-hiong",
"南莊": "Lâm-tsng",
"南莊鄉": "Lâm-tsng-hiong",
"楠梓": "Lâm-tsú-tsú",
"楠梓區": "Lâm-tsú-tsú-tsú",
"楠梓": "Lâm-tsú",
"楠梓區": "Lâm-tsú-khu",
"南靖": "Lâm-tsīng",
"南台": "Lâm-tâi",
"南投": "Lâm-tâu",
Expand Down Expand Up @@ -57472,7 +57472,7 @@
"水堀頭": "Tsuí-khut-thâu",
"水里": "Tsuí-lí",
"水里鄉": "Tsuí-lí-hiong",
"水裡坑": "Tsuí-lí--lí",
"水裡坑": "Tsuí-lí-khenn/Tsuí-lí-khinn",
"水利會": "Tsuí-lī-huē/Tsuí-lī-hē",
"水利局": "Tsuí-lī-kio̍k",
"水麵龜": "Tsuí-mī-ku",
Expand Down Expand Up @@ -57557,7 +57557,7 @@
"子貢": "Tsú-kòng",
"子路": "Tsú-lōo",
"子張": "Tsú-tiunn",
"梓官區": "Tsú-tsú-tsú",
"梓官區": "Tsú-kuann-khu",
"主後": "Tsú-āu",
"慈惠堂": "Tsû-huī-tn̂g",
"慈利": "Tsû-lī",
Expand Down Expand Up @@ -57862,8 +57862,8 @@
"溫嶺": "Un-niá",
"溫州": "Un-tsiu",
"尉遲": "Ut-tî",
"苑裡": "Uán-lí-lí",
"苑裡鎮": "Uán-lí-lí-lí",
"苑裡": "Uán-lí",
"苑裡鎮": "Uán-lí-tìn",
"遠哲": "Uán-tiat",
"遠東": "Uán-tong",
"遠東區": "Uán-tong-khu",
Expand Down
43 changes: 41 additions & 2 deletions tests/test_additional.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,49 @@ def test_convert_simplified():
assert(c.get('我爱学语言')) == 'Guá ài o̍h gí-giân'

def test_to_traditional():
    """Check that to_traditional converts simplified text to traditional text.

    A full sentence is checked first, followed by short words in which the
    same simplified character must come out differently depending on the
    word (for example 干 in 干休 / 乾杯 / 幹部).
    """
    sentence_simplified = '汉字是台湾用来写几若种现代佮古代语文个书写文字系统'
    sentence_traditional = '漢字是台灣用來寫幾若種現代佮古代語文个書寫文字系統'
    assert sentence_traditional == to_traditional(sentence_simplified)

    # Each pair is (simplified input, expected traditional output).
    word_cases = (
        ('干休', '干休'),
        ('干杯', '乾杯'),
        ('干部', '幹部'),
        ('周密', '周密'),
        ('周期', '週期'),
        ('天后', '天后'),
        ('大后日', '大後日'),
        ('不只', '不只'),
        ('船只', '船隻'),
        ('台语', '台語'),
        ('寝台车', '寢臺車'),
        ('台面', '檯面'),
        ('风台', '風颱'),
        ('两个', '兩个'),
        ('个人', '個人'),
    )
    for simplified, traditional in word_cases:
        assert to_traditional(simplified) == traditional

def test_vars():
    """Check Converter.get output for a fixed list of words.

    The converter is built with punctuation='none'. Judging by the test
    name, the inputs are presumably variant character forms -- confirm
    against the project's data before relying on that description.
    """
    converter = Converter(punctuation='none')

    # Maps each input word to the romanisation Converter.get must return.
    expected_by_word = {
        '木蝨': 'ba̍k-sat',
        '爲啥物': 'uī-siánn-mi̍h',
        '白癡': 'pe̍h-tshi',
        '牛肉麪': 'gû-bah-mī',
        '臺北人': 'Tâi-pak-lâng',
        '聲説': 'siann-sueh',
        '研鉢': 'gíng-puah',
        '踊躍': 'ióng-io̍k',
    }
    for word, romanised in expected_by_word.items():
        assert converter.get(word) == romanised

def test_to_simplified():
    """Check that to_simplified converts traditional text to simplified text.

    A full sentence is checked first, followed by short words in which
    several distinct traditional characters collapse to one simplified
    character (for example 乾 and 幹 both becoming 干).
    """
    sentence_traditional = '漢字是臺灣用來寫幾若種現代佮古代語文个書寫文字系統'
    sentence_simplified = '汉字是台湾用来写几若种现代佮古代语文个书写文字系统'
    assert sentence_simplified == to_simplified(sentence_traditional)

    # Each pair is (traditional input, expected simplified output).
    word_cases = (
        ('干休', '干休'),
        ('乾杯', '干杯'),
        ('幹部', '干部'),
        ('周密', '周密'),
        ('週期', '周期'),
        ('天后', '天后'),
        ('大後日', '大后日'),
        ('不只', '不只'),
        ('船隻', '船只'),
        ('台語', '台语'),
        ('寢臺車', '寝台车'),
        ('檯面', '台面'),
        ('風颱', '风台'),
        ('兩个', '两个'),
        ('個人', '个人'),
    )
    for traditional, simplified in word_cases:
        assert to_simplified(traditional) == simplified

def test_is_cjk():
assert is_cjk('漢') == True
Expand Down
7 changes: 6 additions & 1 deletion tests/test_tokenisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,9 @@ def test_suffix():
def test_simplified():
    """Check that a default Tokeniser splits simplified text into simplified tokens.

    Each case pairs the expected token list with the sentence passed to
    Tokeniser.tokenise. The second sentence is asserted once only; the
    source showed the identical assertion twice in a row.
    """
    t = Tokeniser()

    # Each pair is (expected tokens, input sentence).
    cases = (
        (
            ['汉字', '是', '用来', '写', '几若', '种', '现代', '佮', '古代', '语文', '个', '书写', '文字', '系统', '。'],
            '汉字是用来写几若种现代佮古代语文个书写文字系统。',
        ),
        (
            ['现代', '个', '中国', '、', '日本', '、', '韩国', '、', '台湾', '拢', '有', '使用', '汉字'],
            '现代个中国、日本、韩国、台湾拢有使用汉字',
        ),
    )
    for expected_tokens, sentence in cases:
        assert expected_tokens == t.tokenise(sentence)

def test_false():
    """Check tokenisation when the Tokeniser is constructed with False.

    The inputs are simplified sentences and the expected tokens are written
    in traditional characters, so with this argument the tokeniser is
    expected to return traditional-character tokens.
    """
    tokeniser = Tokeniser(False)

    # Each pair is (expected traditional tokens, simplified input sentence).
    cases = (
        (
            ['漢字', '是', '用來', '寫', '幾若', '種', '現代', '佮', '古代', '語文', '个', '書寫', '文字', '系統', '。'],
            '汉字是用来写几若种现代佮古代语文个书写文字系统。',
        ),
        (
            ['現代', '个', '中國', '、', '日本', '、', '韓國', '、', '台灣', '攏', '有', '使用', '漢字'],
            '现代个中国、日本、韩国、台湾拢有使用汉字',
        ),
    )
    for expected_tokens, sentence in cases:
        assert expected_tokens == tokeniser.tokenise(sentence)

0 comments on commit e1af27e

Please sign in to comment.