-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathpreprocessing_lexicon.py
87 lines (70 loc) · 3.32 KB
/
preprocessing_lexicon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from __future__ import unicode_literals
import pandas as pd
from hazm import *
from lexiPersTags import addLexiPers
# Load the source workbook and narrow it to the contiguous span of columns
# running from the Persian translation through 'Negative'.
# NOTE(review): later steps address columns by position (1, 2, 3), which
# presumes this span is exactly [translation, Positive, Negative] -- confirm
# against the workbook.
lexicon = pd.read_excel('377901568617848.xlsx')
lexicon = lexicon.loc[:, 'Persian Translation (Google Translate)':'Negative']

# Keep only rows whose translation contains at least one character outside
# a-z / A-Z. Rows for which the test yields NaN count as non-matching.
# NOTE(review): a purely English phrase containing a space or hyphen also
# satisfies this pattern -- confirm that is acceptable.
translations = lexicon['Persian Translation (Google Translate)']
has_non_latin_char = translations.str.contains("[^a-zA-Z]").fillna(False)
lexicon = lexicon[has_non_latin_char]
lexicon = lexicon.dropna(
    axis=0,
    subset=['Persian Translation (Google Translate)'],
)

# Every entry starts with a placeholder score of 0; the real score is filled
# in by the scoring step that follows.
lexicon['sentiment'] = 0

# Extra entries supplied by the project-local `lexiPersTags` module.
lexiPers = addLexiPers()
def _score_lexicon_row(first_flag, second_flag):
    """Return the sentiment score for one row from its two flag columns.

    The flags come from columns 1 and 2 of ``lexicon`` (presumably
    'Positive' and 'Negative' -- confirm against the workbook layout).

    Returns 0 when both flags are equal, 1 when only the first is 1, -1 when
    only the second is 1, and 4 for any other combination.
    """
    if first_flag == second_flag:
        return 0
    if first_flag == 1:
        return 1
    if second_flag == 1:
        return -1
    return 4


# Fill column 3 ('sentiment') row by row from the two flag columns.
for row_number, (first_flag, second_flag) in enumerate(
    zip(lexicon.iloc[:, 1], lexicon.iloc[:, 2])
):
    lexicon.iloc[row_number, 3] = _score_lexicon_row(first_flag, second_flag)
word_column = 'Persian Translation (Google Translate)'

# Drop rows without a translation and order the lexicon by word.
lexicon = (
    lexicon
    .dropna(axis=0, subset=[word_column])
    .sort_values(word_column)
)
# Optional debug dump (disabled):
# lexicon.to_csv(r'lexicon_with_duplicated_values.csv')

# Every row whose word occurs more than once (all occurrences are kept),
# ordered by word so that equal words sit next to each other.
duplicates = (
    lexicon[lexicon.duplicated([word_column], keep=False)]
    .sort_values(word_column)
)
# Optional debug dump (disabled):
# duplicates.to_csv(r'duplicated_values.csv')
def _resolve_duplicate_group(word, rows):
    """Collapse all rows that share one word into a single record.

    ``rows`` holds every duplicate row for ``word``. Columns 1, 2 and 3 are
    summed and compared to decide whether the merged record is flagged
    positive, negative, or neither.

    NOTE(review): the third tally sums the signed ``sentiment`` column rather
    than counting neutral rows -- confirm this is intended.

    The 'sentiment' value stored here is only a placeholder; the script
    recomputes it from the Positive/Negative flags in a later step.
    """
    pos = rows.iloc[:, 1].sum()
    neg = rows.iloc[:, 2].sum()
    neutral = rows.iloc[:, 3].sum()

    if pos >= neutral and pos > neg:
        positive_flag, negative_flag, placeholder = 1, 0, 0
    elif neg >= neutral and neg > pos:
        positive_flag, negative_flag, placeholder = 0, 1, 0
    else:
        positive_flag, negative_flag, placeholder = 0, 0, 1

    return {
        'Persian Translation (Google Translate)': word,
        'Positive': positive_flag,
        'Negative': negative_flag,
        'sentiment': placeholder,
    }


# `duplicates` is sorted by word, so grouping on the first column without
# re-sorting visits each run of equal words once, in order of appearance.
handled_duplicates = pd.DataFrame.from_dict([
    _resolve_duplicate_group(word, rows)
    for word, rows in duplicates.groupby(duplicates.columns[0], sort=False)
])
word_column = 'Persian Translation (Google Translate)'

# Remove every word that occurs more than once; each such word is re-added
# below as the single consolidated row held in `handled_duplicates`.
lexicon = lexicon.drop_duplicates(subset=word_column, keep=False)

# Recompute the sentiment score of each consolidated row from its
# Positive (column 1) and Negative (column 2) flags:
#   equal flags -> 0, only Positive -> 1, only Negative -> -1, otherwise 4.
# The rows are materialised first so the frame is not read while it is being
# written; an empty frame (no duplicates found) simply yields no rows.
consolidated_rows = list(handled_duplicates.itertuples(index=False, name=None))
for row_number, record in enumerate(consolidated_rows):
    positive_flag, negative_flag = record[1], record[2]
    if positive_flag == negative_flag:
        score = 0
    elif positive_flag == 1:
        score = 1
    elif negative_flag == 1:
        score = -1
    else:
        score = 4
    handled_duplicates.iloc[row_number, 3] = score

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and behaves the same on older pandas versions.
lexicon = pd.concat([lexicon, handled_duplicates], ignore_index=True)
lexicon = lexicon[[word_column, 'sentiment']]

# Add the lexiPers entries after the main lexicon so that, on a clash,
# keep='first' below prefers the main lexicon's row.
# NOTE(review): assumes `addLexiPers()` returns a DataFrame with matching
# column names -- confirm against lexiPersTags.
lexicon = pd.concat([lexicon, lexiPers], ignore_index=True)
lexicon = (
    lexicon
    .drop_duplicates(subset=word_column, keep='first')
    .sort_values(word_column)
    .reset_index(drop=True)
)
word_column = 'Persian Translation (Google Translate)'

# Normalise every word with hazm's Normalizer.
normalizer = Normalizer()
lexicon[word_column] = lexicon[word_column].map(normalizer.normalize)

# Normalisation can map distinct raw spellings onto the same string, so
# de-duplicate again here; otherwise the "without_duplicated_values" output
# may still contain repeated words. keep='first' matches the policy used for
# the earlier de-duplication pass.
lexicon = lexicon.drop_duplicates(subset=word_column, keep='first')
# Optional debug dump (disabled):
# lexicon.to_csv(r'final_lexicon_without_duplicated_values.csv')

# Only words with a non-zero sentiment score are exported.
lexicon = lexicon.loc[lexicon['sentiment'] != 0]
lexicon.to_csv(r'final_lexicon_without_duplicated_values_and_zeros.csv')