replacing tokenize by split_words_on_whitespace to keep text structure (#201)

Summary:
## Related Issue
Fixes #200

- [x] I have read CONTRIBUTING.md to understand how to contribute to this repository :)

Replaced the `tokenize` and `detokenize` functions with `split_words_on_whitespace` and `rejoin_words_and_whitespace`.
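
As a rough illustration of the idea (a minimal sketch only; the actual helpers live in `augly.text.augmenters.utils` and may be implemented differently), a whitespace-preserving split/rejoin pair can look like this:

```python
import re
from typing import List, Tuple


def split_words_on_whitespace(text: str) -> Tuple[List[str], List[str]]:
    # Split the text into words while recording the exact whitespace runs
    # between them, so the original spacing can be restored later.
    words = re.split(r"\s+", text)
    whitespaces = re.findall(r"\s+", text)
    return words, whitespaces


def rejoin_words_and_whitespace(words: List[str], whitespaces: List[str]) -> str:
    # Interleave the (possibly augmented) words with the recorded whitespace
    # runs; re.split always yields one more word than there are separators.
    pieces = []
    for word, space in zip(words, whitespaces):
        pieces.extend([word, space])
    pieces.append(words[-1])
    return "".join(pieces)


words, spaces = split_words_on_whitespace("The quick\tbrown  fox")
# words == ["The", "quick", "brown", "fox"], spaces == [" ", "\t", "  "]
assert rejoin_words_and_whitespace(words, spaces) == "The quick\tbrown  fox"
```

The point is that the augmenter only touches the word list, while the original whitespace runs (tabs, repeated spaces, newlines) are passed through unchanged, preserving the text structure that the old `tokenize`/`detokenize` pair did not keep (#200).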

## Unit Tests

### Text

Replaced the expected strings in the unit tests that use `ReplaceSimilarChars`.

All tests passed.

Pull Request resolved: #201

Reviewed By: jbitton

Differential Revision: D34819291

Pulled By: zpapakipos

fbshipit-source-id: 3174900eb7823c1b4df535cbf90edcaf3cadafdc
AghilesAzzoug authored and facebook-github-bot committed Mar 28, 2022
1 parent 15e7157 commit bc7c105
Showing 3 changed files with 14 additions and 16 deletions.
16 changes: 8 additions & 8 deletions augly/tests/text_tests/functional_unit_test.py
@@ -321,8 +321,8 @@ def test_replace_similar_chars(self) -> None:
self.assertEqual(
augmented_chars,
[
-"T|-|e quick brown 'fox' could^'t jump over the green, grassy hill.",
-"The quick 13rown 'fox' couldn't jump over the gr3en, grassy hill.",
+"T/-/e quick brown 'fox' coul|)n't jump over the green, grassy hill.",
+"T)-(e quick br0wn 'fox' couldn't jump over the green, g12assy hill.",
],
)
augmented_chars_targetted = txtaugs.replace_similar_chars(
@@ -335,8 +335,8 @@ def test_replace_similar_chars(self) -> None:
self.assertEqual(
augmented_chars_targetted,
[
-"The quick brown 'fox' couldn't jump over the gI2een, 9rassy hil|_.",
-"The quick brown 'fox' couldn't jump over t/-/e 9reen, gr4ssy h!ll.",
+"The quic|{ brown 'fox' couldn't jump Dver the green, gr4ssy hill.",
+"7he quick brown 'fox' couldn't jump over the green, gr4ssy hill.",
],
)

@@ -347,8 +347,8 @@ def test_replace_similar_unicode_chars(self) -> None:
self.assertEqual(
augmented_unicode_chars,
[
-"Tℌe ჹuick brown 'fox' coỦldή't jump oṼer the green, grassy hill.",
-"The quick ␢rown 'ⓕox' couldn't jumρ over the ġreen, grassy hill.",
+"TĦe ℚuick brown 'fox' coul₫n't jump over the green, grassy hill.",
+"Ŧhe quick browŅ 'fox' couldn't jÙmp over the green, grassy hill.",
],
)
augmented_unicode_chars_targetted = txtaugs.replace_similar_unicode_chars(
@@ -361,8 +361,8 @@ def test_replace_similar_unicode_chars(self) -> None:
self.assertEqual(
augmented_unicode_chars_targetted,
[
-"The quick brown 'fox' couldn't jump over †he ǥreen, gℝassy hiĽl.",
-"The quick brown 'fox' couldn't jump over tҺe Ġreen, grⓐssy hilĻ.",
+"⍡he quick brown 'fox' couldn't jump oveℛ the green, ġrassy hill.",
+"The quick brown 'fox' couldn't jump over thė green, gℝassy hill.",
],
)

5 changes: 2 additions & 3 deletions augly/tests/text_tests/transforms_unit_test.py
@@ -259,10 +259,9 @@ def test_ReplaceSimilarChars(self) -> None:
aug_chars = txtaugs.ReplaceSimilarChars(aug_word_p=0.3, aug_char_p=0.3)(
self.texts, metadata=self.metadata
)

self.assertTrue(
aug_chars[0]
-== "The quick brown 'fox' could^'t jump over the green, grassy hi7l."
+== "The quick brown 'fox' coul|)n't jump 0ver the green, grassy hill."
)
self.assertTrue(
are_equal_metadata(
@@ -277,7 +276,7 @@ def test_ReplaceSimilarUnicodeChars(self) -> None:

self.assertTrue(
aug_unicode_chars[0]
-== "The ჹuick brown 'fox' coỦldή't jump oṼer the green, grassy hiļl."
+== "The ℚuick brown 'fox' coul₫n't jump ov६r the green, grassy hill."
)
self.assertTrue(
are_equal_metadata(
9 changes: 4 additions & 5 deletions augly/text/augmenters/letter_replacement.py
@@ -9,16 +9,15 @@
from typing import List, Optional

from augly.text.augmenters.utils import (
-detokenize,
get_aug_idxes,
LETTER_CHAR_MAPPING,
-tokenize,
+split_words_on_whitespace,
+rejoin_words_and_whitespace,
validate_augmenter_params,
)
from augly.utils import pathmgr
from augly.utils.libsndfile import install_libsndfile


install_libsndfile()
from nlpaug.augmenter.char import CharAugmenter # @manual
from nlpaug.util import Action, Method # @manual
@@ -90,7 +89,7 @@ def substitute(self, data: str) -> str:
@param data: the text where the letter substitution will be applied on
"""
-tokens = tokenize(data)
+tokens, whitespaces = split_words_on_whitespace(data)
aug_word_cnt = self._generate_aug_cnt(
len(tokens), self.aug_word_min, self.aug_word_max, self.aug_word_p
)
@@ -126,4 +125,4 @@ def substitute(self, data: str) -> str:

tokens[t_i] = "".join(chars)

-return detokenize(tokens)
+return rejoin_words_and_whitespace(tokens, whitespaces)
