Skip to content
/ tha Public

📢 Tha (ថា) - A Khmer Text Normalization and Verbalization Toolkit

License

Notifications You must be signed in to change notification settings

seanghay/tha

Folders and files

Name
Last commit message
Last commit date

Latest commit

a54e414 · Jul 26, 2024

History

14 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Tha (ថា)

Khmer Text Normalization and Verbalization Toolkit.

Install

pip install tha
import tha.normalize
import tha.phone_numbers
import tha.urls
import tha.datetime
import tha.hashtags
import tha.ascii_lines
import tha.license_plate
import tha.cardinals
import tha.decimals
import tha.ordinals
import tha.currency
import tha.parenthesis
import tha.repeater

## Normalize
# In this example the zero-width space (U+200B) is dropped and
# ឲ (U+17B2) is rewritten as ឱ (U+17B1).
assert tha.normalize.processor("មិន\u200bឲ្យ") == "មិនឱ្យ"

## Phone Numbers
# "▁" (U+2581) separates the digit groups in the output; `chunk_size`
# changes how the digits are grouped, as the three cases below show.
assert tha.phone_numbers.processor("010123123", chunk_size=2) == "0▁10▁12▁31▁23"
assert tha.phone_numbers.processor("010123123", chunk_size=3) == "0▁10▁123▁123"
assert tha.phone_numbers.processor("0961231234", chunk_size=3) == "0▁96▁123▁1234"

## URLs and emails
# "@" and "." come back as the words "at" and "dot", and the scheme
# ("http://", "https://") is dropped. "▁" (U+2581) appears inside
# "g▁mail" and "k▁h" in the expected output.
assert tha.urls.processor("example@gmail.com") == "example at g▁mail dot com"
assert tha.urls.processor("https://google.com") == "google dot com"
assert tha.urls.processor("http://google.com") == "google dot com"
assert tha.urls.processor("google.com") == "google dot com"
assert tha.urls.processor("google.gov.kh") == "google dot gov dot k▁h"
assert tha.urls.processor("google.com.kh") == "google dot com dot k▁h"

## Time
# The ":" becomes a space, and the AM/PM suffix is split into letters
# joined to the minutes with "▁" (U+2581).
assert tha.datetime.time_processor("10:23AM") == "10 23▁A▁M"
assert tha.datetime.time_processor("10:23PM") == "10 23▁P▁M"
assert tha.datetime.time_processor("1:23PM") == "1 23▁P▁M"

## Date
# In both layouts shown here the "-" separators come back as single spaces.
for written, spoken in (
  ("2024-01-02", "2024 01 02"),
  ("01-02-2034", "01 02 2034"),
):
  assert tha.datetime.date_processor(written) == spoken

## Hashtags
# The hashtag is removed but the spaces on either side of it are kept,
# which is why the expected text has two consecutive spaces. Latin,
# Khmer and Khmer-plus-digit hashtags are all covered.
assert (
  tha.hashtags.processor("Hello world #this_will_remove hello") == "Hello world  hello"
)
assert tha.hashtags.processor("Hello world #លុប hello") == "Hello world  hello"
assert tha.hashtags.processor("Hello world #លុប1234 hello") == "Hello world  hello"

## ASCII Lines
# Runs of "-" or "#" are stripped; the surrounding spaces and newlines
# are left exactly as they were.
for noisy, cleaned in (
  ("Remove --- asdasd", "Remove  asdasd"),
  ("Remove\n###\nasdasd", "Remove\n\nasdasd"),
):
  assert tha.ascii_lines.processor(noisy) == cleaned

## Cambodia License Plate
# The series prefix is split into digit and letter. The four-digit
# number is read as two "▁"-joined pairs, except that four identical
# digits are read as "ការ៉េ" followed by the digit.
assert tha.license_plate.processor("1A 1234") == "1 A 12▁34"
assert tha.license_plate.processor("1A 4444") == "1 A ការ៉េ4"

## Number - Cardinals
# Digits are spelled out in Khmer with "▁" (U+2581) between groups.
# A leading "-" is read as "ដក". The "." and "," separators are left in
# place by this processor, with the digits on each side verbalized
# independently.
assert tha.cardinals.processor("1234") == "មួយពាន់▁ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("1") == "មួយ"
assert tha.cardinals.processor("1▁2") == "មួយ▁ពីរ"
assert tha.cardinals.processor("-1") == "ដក▁មួយ"
assert tha.cardinals.processor("10") == "ដប់"
assert tha.cardinals.processor("15") == "ដប់ប្រាំ"
assert tha.cardinals.processor("100") == "មួយរយ"
assert tha.cardinals.processor("10000") == "មួយម៉ឺន"
assert tha.cardinals.processor("10000.234") == "មួយម៉ឺន.ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("-10000.234") == "ដក▁មួយម៉ឺន.ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("-10000,234") == "ដក▁មួយម៉ឺន,ពីររយ▁សាមសិបបួន"

## Number - Decimals
# "." is read as "ចុច" and "," as "ក្បៀស". Leading zeros of the
# fractional part are read one by one as "សូន្យ", and the remaining
# digits are read as a single number.
assert tha.decimals.processor("123.324") == "មួយរយ▁ម្ភៃបី▁ចុច▁បីរយ▁ម្ភៃបួន"
assert tha.decimals.processor("123.001") == "មួយរយ▁ម្ភៃបី▁ចុច▁សូន្យ▁សូន្យ▁មួយ"
assert tha.decimals.processor("-123.0012") == "ដក▁មួយរយ▁ម្ភៃបី▁ចុច▁សូន្យ▁សូន្យ▁ដប់ពីរ"
assert tha.decimals.processor("-123,0012") == "ដក▁មួយរយ▁ម្ភៃបី▁ក្បៀស▁សូន្យ▁សូន្យ▁ដប់ពីរ"

## Number - Ordinals
# English ordinal suffixes (st / rd / th) turn into the prefix "ទី"
# joined to the Khmer number with "▁". A bare number without a suffix
# is returned unchanged.
assert tha.ordinals.processor("5th") == "ទី▁ប្រាំ"
assert tha.ordinals.processor("3rd") == "ទី▁បី"
assert tha.ordinals.processor("1st") == "ទី▁មួយ"
assert tha.ordinals.processor("10th") == "ទី▁ដប់"
assert tha.ordinals.processor("10") == "10"

## Number - Currency
# "$" is read as "ដុល្លារ" (with "សេន" for the fractional part in the
# first case) and "៛" (U+17DB) as "រៀល". The placement of "▁" differs
# between the prefix and suffix forms; the expected strings below keep
# that spacing as it was.
assert tha.currency.processor("$100.01") == "មួយរយដុល្លារ▁មួយសេន"
assert tha.currency.processor("$100") == "មួយរយ▁ដុល្លារ"
assert tha.currency.processor("100$") == "មួយរយដុល្លារ"
assert tha.currency.processor("100៛") == "មួយរយរៀល"
assert tha.currency.processor("100.32៛") == "មួយរយ▁ចុច▁សាមសិបពីររៀល"
assert tha.currency.processor("100.0032៛") == "មួយរយ▁ចុច▁សូន្យ▁សូន្យ▁សាមសិបពីររៀល"

## Parenthesis
# The parenthesised span is removed and a single space is left between
# the remaining words.
without_aside = tha.parenthesis.processor("Hello (this will be ignored) world")
assert without_aside == "Hello world"


## Iteration Mark
def fake_tokenizer(_):
  """Stand-in word tokenizer for the example below.

  Ignores its argument and always returns the same five Khmer tokens,
  which are the words that come before the repetition mark "ៗ" (U+17D7)
  in the sample sentence.
  """
  return ["គាត់", "បាន", "ទៅ", "បន្តិច", "ម្ដង"]


# The repetition mark "ៗ" is replaced by "▁" (U+2581) followed by the
# repeated words; here the last two tokens ("បន្តិច", "ម្ដង") are repeated.
assert (
  tha.repeater.processor("គាត់បានទៅបន្តិចម្ដងៗហើយ", tokenizer=fake_tokenizer)
  == "គាត់បានទៅបន្តិចម្ដង▁បន្តិចម្ដងហើយ"
)

About

📢 Tha (ថា) - A Khmer Text Normalization and Verbalization Toolkit

Topics

Resources

License

Stars

Watchers

Forks

Releases

No releases published