replacing tokenize by split_words_on_whitespace to keep text structure (#201)

Summary:
## Related Issue
Fixes #200

- [x] I have read CONTRIBUTING.md to understand how to contribute to this repository :)

Replaced the `tokenize` and `detokenize` functions with `split_words_on_whitespace` and `rejoin_words_and_whitespace`.
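
As a rough illustration of the idea (a minimal sketch only; the actual helpers live in `augly.text.augmenters.utils` and may be implemented differently), a whitespace-preserving split/rejoin pair can look like this:

```python
import re
from typing import List, Tuple


def split_words_on_whitespace(text: str) -> Tuple[List[str], List[str]]:
    # Split the text into words while recording the exact whitespace runs
    # between them, so the original spacing can be restored later.
    words = re.split(r"\s+", text)
    whitespaces = re.findall(r"\s+", text)
    return words, whitespaces


def rejoin_words_and_whitespace(words: List[str], whitespaces: List[str]) -> str:
    # Interleave the (possibly augmented) words with the recorded whitespace
    # runs; re.split always yields one more word than there are separators.
    pieces = []
    for word, space in zip(words, whitespaces):
        pieces.extend([word, space])
    pieces.append(words[-1])
    return "".join(pieces)


words, spaces = split_words_on_whitespace("The quick\tbrown  fox")
# words == ["The", "quick", "brown", "fox"], spaces == [" ", "\t", "  "]
assert rejoin_words_and_whitespace(words, spaces) == "The quick\tbrown  fox"
```

The point is that the augmenter only touches the word list, while the original whitespace runs (tabs, repeated spaces, newlines) are passed through unchanged, preserving the text structure that the old `tokenize`/`detokenize` pair did not keep (#200).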

## Unit Tests

### Text

Replaced the expected strings in the unit tests that use `ReplaceSimilarChars`.

All tests passed.

Pull Request resolved: #201

Reviewed By: jbitton

Differential Revision: D34819291

Pulled By: zpapakipos

fbshipit-source-id: 3174900eb7823c1b4df535cbf90edcaf3cadafdc
AghilesAzzoug authored and facebook-github-bot committed Mar 28, 2022
1 parent 15e7157 commit bc7c105
Showing 3 changed files with 14 additions and 16 deletions.
16 changes: 8 additions & 8 deletions augly/tests/text_tests/functional_unit_test.py
@@ -321,8 +321,8 @@ def test_replace_similar_chars(self) -> None:
self.assertEqual(
augmented_chars,
[
-"T|-|e quick brown 'fox' could^'t jump over the green, grassy hill.",
-"The quick 13rown 'fox' couldn't jump over the gr3en, grassy hill.",
+"T/-/e quick brown 'fox' coul|)n't jump over the green, grassy hill.",
+"T)-(e quick br0wn 'fox' couldn't jump over the green, g12assy hill.",
],
)
augmented_chars_targetted = txtaugs.replace_similar_chars(
@@ -335,8 +335,8 @@ def test_replace_similar_chars(self) -> None:
self.assertEqual(
augmented_chars_targetted,
[
-"The quick brown 'fox' couldn't jump over the gI2een, 9rassy hil|_.",
-"The quick brown 'fox' couldn't jump over t/-/e 9reen, gr4ssy h!ll.",
+"The quic|{ brown 'fox' couldn't jump Dver the green, gr4ssy hill.",
+"7he quick brown 'fox' couldn't jump over the green, gr4ssy hill.",
],
)

@@ -347,8 +347,8 @@ def test_replace_similar_unicode_chars(self) -> None:
self.assertEqual(
augmented_unicode_chars,
[
-"Tℌe ჹuick brown 'fox' coỦldή't jump oṼer the green, grassy hill.",
-"The quick ␢rown 'ⓕox' couldn't jumρ over the ġreen, grassy hill.",
+"TĦe ℚuick brown 'fox' coul₫n't jump over the green, grassy hill.",
+"Ŧhe quick browŅ 'fox' couldn't jÙmp over the green, grassy hill.",
],
)
augmented_unicode_chars_targetted = txtaugs.replace_similar_unicode_chars(
@@ -361,8 +361,8 @@ def test_replace_similar_unicode_chars(self) -> None:
self.assertEqual(
augmented_unicode_chars_targetted,
[
-"The quick brown 'fox' couldn't jump over †he ǥreen, gℝassy hiĽl.",
-"The quick brown 'fox' couldn't jump over tҺe Ġreen, grⓐssy hilĻ.",
+"⍡he quick brown 'fox' couldn't jump oveℛ the green, ġrassy hill.",
+"The quick brown 'fox' couldn't jump over thė green, gℝassy hill.",
],
)

5 changes: 2 additions & 3 deletions augly/tests/text_tests/transforms_unit_test.py
@@ -259,10 +259,9 @@ def test_ReplaceSimilarChars(self) -> None:
aug_chars = txtaugs.ReplaceSimilarChars(aug_word_p=0.3, aug_char_p=0.3)(
self.texts, metadata=self.metadata
)

self.assertTrue(
aug_chars[0]
-== "The quick brown 'fox' could^'t jump over the green, grassy hi7l."
+== "The quick brown 'fox' coul|)n't jump 0ver the green, grassy hill."
)
self.assertTrue(
are_equal_metadata(
@@ -277,7 +276,7 @@ def test_ReplaceSimilarUnicodeChars(self) -> None:

self.assertTrue(
aug_unicode_chars[0]
-== "The ჹuick brown 'fox' coỦldή't jump oṼer the green, grassy hiļl."
+== "The ℚuick brown 'fox' coul₫n't jump ov६r the green, grassy hill."
)
self.assertTrue(
are_equal_metadata(
9 changes: 4 additions & 5 deletions augly/text/augmenters/letter_replacement.py
@@ -9,16 +9,15 @@
from typing import List, Optional

from augly.text.augmenters.utils import (
-detokenize,
get_aug_idxes,
LETTER_CHAR_MAPPING,
-tokenize,
+split_words_on_whitespace,
+rejoin_words_and_whitespace,
validate_augmenter_params,
)
from augly.utils import pathmgr
from augly.utils.libsndfile import install_libsndfile


install_libsndfile()
from nlpaug.augmenter.char import CharAugmenter # @manual
from nlpaug.util import Action, Method # @manual
@@ -90,7 +89,7 @@ def substitute(self, data: str) -> str:
@param data: the text where the letter substitution will be applied on
"""
-tokens = tokenize(data)
+tokens, whitespaces = split_words_on_whitespace(data)
aug_word_cnt = self._generate_aug_cnt(
len(tokens), self.aug_word_min, self.aug_word_max, self.aug_word_p
)
@@ -126,4 +125,4 @@ def substitute(self, data: str) -> str:

tokens[t_i] = "".join(chars)

-return detokenize(tokens)
+return rejoin_words_and_whitespace(tokens, whitespaces)
