diff --git a/augly/tests/text_tests/functional_unit_test.py b/augly/tests/text_tests/functional_unit_test.py index 5dc220ba..c36ccb73 100644 --- a/augly/tests/text_tests/functional_unit_test.py +++ b/augly/tests/text_tests/functional_unit_test.py @@ -321,8 +321,8 @@ def test_replace_similar_chars(self) -> None: self.assertEqual( augmented_chars, [ - "T|-|e quick brown 'fox' could^'t jump over the green, grassy hill.", - "The quick 13rown 'fox' couldn't jump over the gr3en, grassy hill.", + "T/-/e quick brown 'fox' coul|)n't jump over the green, grassy hill.", + "T)-(e quick br0wn 'fox' couldn't jump over the green, g12assy hill.", ], ) augmented_chars_targetted = txtaugs.replace_similar_chars( @@ -335,8 +335,8 @@ def test_replace_similar_chars(self) -> None: self.assertEqual( augmented_chars_targetted, [ - "The quick brown 'fox' couldn't jump over the gI2een, 9rassy hil|_.", - "The quick brown 'fox' couldn't jump over t/-/e 9reen, gr4ssy h!ll.", + "The quic|{ brown 'fox' couldn't jump Dver the green, gr4ssy hill.", + "7he quick brown 'fox' couldn't jump over the green, gr4ssy hill.", ], ) @@ -347,8 +347,8 @@ def test_replace_similar_unicode_chars(self) -> None: self.assertEqual( augmented_unicode_chars, [ - "Tℌe ჹuick brown 'fox' coỦldή't jump oṼer the green, grassy hill.", - "The quick ␢rown 'ⓕox' couldn't jumρ over the ġreen, grassy hill.", + "TĦe ℚuick brown 'fox' coul₫n't jump over the green, grassy hill.", + "Ŧhe quick browŅ 'fox' couldn't jÙmp over the green, grassy hill.", ], ) augmented_unicode_chars_targetted = txtaugs.replace_similar_unicode_chars( @@ -361,8 +361,8 @@ def test_replace_similar_unicode_chars(self) -> None: self.assertEqual( augmented_unicode_chars_targetted, [ - "The quick brown 'fox' couldn't jump over †he ǥreen, gℝassy hiĽl.", - "The quick brown 'fox' couldn't jump over tҺe Ġreen, grⓐssy hilĻ.", + "⍡he quick brown 'fox' couldn't jump oveℛ the green, ġrassy hill.", + "The quick brown 'fox' couldn't jump over thė green, gℝassy hill.", ], ) diff --git a/augly/tests/text_tests/transforms_unit_test.py b/augly/tests/text_tests/transforms_unit_test.py index d6e0c826..ad3ed2b4 100644 --- a/augly/tests/text_tests/transforms_unit_test.py +++ b/augly/tests/text_tests/transforms_unit_test.py @@ -259,10 +259,9 @@ def test_ReplaceSimilarChars(self) -> None: aug_chars = txtaugs.ReplaceSimilarChars(aug_word_p=0.3, aug_char_p=0.3)( self.texts, metadata=self.metadata ) - self.assertTrue( aug_chars[0] - == "The quick brown 'fox' could^'t jump over the green, grassy hi7l." + == "The quick brown 'fox' coul|)n't jump 0ver the green, grassy hill." ) self.assertTrue( are_equal_metadata( @@ -277,7 +276,7 @@ def test_ReplaceSimilarUnicodeChars(self) -> None: self.assertTrue( aug_unicode_chars[0] - == "The ჹuick brown 'fox' coỦldή't jump oṼer the green, grassy hiļl." + == "The ℚuick brown 'fox' coul₫n't jump ov६r the green, grassy hill." ) self.assertTrue( are_equal_metadata( diff --git a/augly/text/augmenters/letter_replacement.py b/augly/text/augmenters/letter_replacement.py index d1599b9b..33ca4cfe 100644 --- a/augly/text/augmenters/letter_replacement.py +++ b/augly/text/augmenters/letter_replacement.py @@ -9,16 +9,15 @@ from typing import List, Optional from augly.text.augmenters.utils import ( - detokenize, get_aug_idxes, LETTER_CHAR_MAPPING, - tokenize, + split_words_on_whitespace, + rejoin_words_and_whitespace, validate_augmenter_params, ) from augly.utils import pathmgr from augly.utils.libsndfile import install_libsndfile - install_libsndfile() from nlpaug.augmenter.char import CharAugmenter # @manual from nlpaug.util import Action, Method # @manual @@ -90,7 +89,7 @@ def substitute(self, data: str) -> str: @param data: the text where the letter substitution will be applied on """ - tokens = tokenize(data) + tokens, whitespaces = split_words_on_whitespace(data) aug_word_cnt = self._generate_aug_cnt( len(tokens), self.aug_word_min, self.aug_word_max, self.aug_word_p ) @@ -126,4 +125,4 @@ def substitute(self, data: str) -> str: tokens[t_i] = "".join(chars) - return detokenize(tokens) + return rejoin_words_and_whitespace(tokens, whitespaces)