Skip to content

Commit

Permalink
Bug in trie
Browse files (browse the repository at this point in the history)
  • Loading branch information
mgraffg committed Aug 23, 2024
1 parent c89e88b commit 0911a04
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 4 deletions.
11 changes: 10 additions & 1 deletion dialectid/tests/test_text_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,13 @@ def test_SeqTM():
assert seq.compute_tokens('~dias~duros~') == _
assert seq.compute_tokens('~🤷~') == [['🤷']]
assert seq.compute_tokens('~🙇🏿~') == [['🙇']]
assert seq.tokenize('buenos dias 🙇🏿')[-1] == '🙇'
assert seq.tokenize('buenos dias 🙇🏿')[-1] == '🙇'


def test_SeqTM_bug():
    """Check that a different first word does not alter the later tokens.

    The two inputs differ only in their first word, so everything after
    the first element of each tokenization result must be identical.
    """

    tokenizer = SeqTM(language='es', subwords=True, voc_size_exponent=13)
    with_mira = tokenizer.tokenize('mira pinche a')
    with_a = tokenizer.tokenize('a pinche a')
    # Drop the first element of each result and compare the rest.
    tail_mira = with_mira[1:]
    tail_a = with_a[1:]
    assert tail_mira == tail_a
11 changes: 8 additions & 3 deletions dialectid/text_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,11 @@ def __vocabulary(self, counter):
key = value
if value[:2] == 'q:':
key = value[2:]
if key in self._map:
continue
self._map[key] = value
else:
key = f'~{key}~'
self._map[key] = value
tokens[key] = value
_ = join(dirname(__file__), 'data', 'emojis.json.gz')
Expand Down Expand Up @@ -241,7 +246,7 @@ def find_token(self, text):
:rtype: list
"""

blocks = list()
blocks = []
init = i = end = 0
head = self.data_structure
current = head
Expand All @@ -257,12 +262,12 @@ def find_token(self, text):
current = head
if end > init:
blocks.append([init, end])
if (end - init) > 2 and text[end - 1] == '~':
if (end - init) >= 2 and text[end - 1] == '~':
init = i = end = end - 1
else:
init = i = end
elif i > init:
if (i - init) > 2 and text[i - 1] == '~':
if (i - init) >= 2 and text[i - 1] == '~':
init = end = i = i - 1
else:
init = end = i
Expand Down

0 comments on commit 0911a04

Please sign in to comment.