Skip to content

Commit

Permalink
Proper support for ZWJ
Browse files Browse the repository at this point in the history
  • Loading branch information
mpcabd committed Nov 21, 2017
1 parent 135a6b8 commit 0839a95
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 48 deletions.
2 changes: 1 addition & 1 deletion arabic_reshaper/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
-__version__ = '2.0.10'
+__version__ = '2.0.11'
56 changes: 10 additions & 46 deletions arabic_reshaper/arabic_reshaper.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,10 +169,7 @@ def reshape(self, text):
         support_zwj = self.configuration.getboolean('support_zwj')
         positions_harakat = {}

-        arabic_word_start = -1
-        zwjs = []
-
-        for i, letter in enumerate(text):
+        for letter in text:
             if HARAKAT_RE.match(letter):
                 if not delete_harakat:
                     position = len(output) - 1
Expand All @@ -181,51 +178,15 @@ def reshape(self, text):
                     positions_harakat[position].append(letter)
             elif letter == TATWEEL and delete_tatweel:
                 pass
-            elif letter == ZWJ and support_zwj:
-                zwjs.append(i)
-
-                if arabic_word_start != -1:
-                    # Handle three consecutive ZWJs or more
-                    if (
-                        len(zwjs) > 2 and
-                        zwjs[-2] == i - 1 and
-                        zwjs[-3] == i - 2
-                    ):
-                        arabic_word_start = -1
-                    # Handle when previous letter is not ZWJ
-                    elif (
-                        output and
-                        len(zwjs) == 1 or (len(zwjs) > 1 and zwjs[-2] != i - 1)
-                    ):
-                        previous_letter = output[-1]
-                        if connects_with_letter_after(previous_letter[LETTER]):
-                            if previous_letter[FORM] == ISOLATED:
-                                output[-1] = (
-                                    previous_letter[LETTER],
-                                    INITIAL
-                                )
-                            else:
-                                output[-1] = (
-                                    previous_letter[LETTER],
-                                    MEDIAL
-                                )
+            elif letter == ZWJ and not support_zwj:
+                pass
             elif letter not in LETTERS:
-                arabic_word_start = -1
                 output.append((letter, NOT_SUPPORTED))
             elif not output: # first letter
-                arabic_word_start = i
                 output.append((letter, ISOLATED))
             else:
-                if arabic_word_start == -1:
-                    arabic_word_start = i
                 previous_letter = output[-1]
-                if (
-                    arabic_word_start != i and
-                    zwjs and
-                    connects_with_letter_before(letter)
-                ):
-                    output.append((letter, FINAL))
-                elif previous_letter[FORM] == NOT_SUPPORTED:
+                if previous_letter[FORM] == NOT_SUPPORTED:
                     output.append((letter, ISOLATED))
                 elif not connects_with_letter_before(letter):
                     output.append((letter, ISOLATED))
Expand Down Expand Up @@ -253,9 +214,12 @@ def reshape(self, text):
                     )
                     output.append((letter, FINAL))

-            # clear ZWJs
-            if zwjs and letter != ZWJ:
-                zwjs = []
+            # Remove ZWJ if it's the second to last item as it won't be useful
+            if support_zwj and len(output) > 1 and output[-2][LETTER] == ZWJ:
+                output.pop(len(output) - 2)
+
+        if support_zwj and output and output[-1][LETTER] == ZWJ:
+            output.pop()

         if self.configuration.getboolean('support_ligatures'):
             # Clean text from Harakat to be able to find ligatures
Expand Down
3 changes: 3 additions & 0 deletions arabic_reshaper/letters.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,9 @@
     '\u06D2': ('\uFBAE', '', '', '\uFBAF'),
     # ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
     '\u06D3': ('\uFBB0', '', '', '\uFBB1'),
+
+    # ZWJ
+    ZWJ: (ZWJ, ZWJ, ZWJ, ZWJ),
 }


Expand Down
14 changes: 13 additions & 1 deletion arabic_reshaper/tests/test_002_reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,19 @@ def setUp(self):
             ),
             (
                 letters.ZWJ + BEH + HAMZA,
-                BEH_ISOLATED + HAMZA_ISOLATED
+                BEH_FINAL + HAMZA_ISOLATED
+            ),
+            (
+                letters.ZWJ + BEH,
+                BEH_FINAL
+            ),
+            (
+                BEH + letters.ZWJ,
+                BEH_INITIAL
+            ),
+            (
+                letters.ZWJ + BEH + letters.ZWJ,
+                BEH_MEDIAL
             ),
             (
                 BEH + letters.ZWJ + HAMZA,
Expand Down

0 comments on commit 0839a95

Please sign in to comment.