# =============================================================================
# PREPROCESSING ENGLISH DATA (test set)
# =============================================================================
#%%
"Execute preprocessing functions"
runfile('C:/Users/shash/RP_functions.py', wdir='C:/Users/shash')
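#%%
# NOTE: the steps below assume RP_functions.py defines these helpers:
# preprocess, reduce, char_remove, standardize, standardize2, standardize3,
# standardize_senti, split_x, correct, stem_lemma, standardize_eng.
# An illustrative guard that fails early if any of them was not loaded:
for _fn in ('preprocess', 'reduce', 'char_remove', 'standardize',
            'standardize2', 'standardize3', 'standardize_senti',
            'split_x', 'correct', 'stem_lemma', 'standardize_eng'):
    assert _fn in globals(), "helper '" + _fn + "' not loaded from RP_functions.py"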
#%%
# LOAD the labelled English test set; force the text column to string
df_test = pd.read_csv("Eng_testdata(labelled).csv")
df_test['text'] = df_test['text'].astype('str')
print("\n")
print("sample of test data tweets")
print(df_test.text.head(8))
print("\n")
print("Summary of the basic information about this DataFrame and its data:")
print(df_test[['target','sentiment']].describe())
#%%
print("target & sentiment total")
print("\n")
print(df_test.groupby('target').size())
print("\n")
print(pd.crosstab(df_test.target,df_test.sentiment))
#%%
# Work on a copy so the raw test set stays untouched
df1 = df_test.copy()
#%%
"PREPROCESSING STEPS"
"NOTE: No stopwords removal, to prep data for sentiment analysis experiments"
"PREPROCESS text"
df1['textclean'] = df1['text'].apply(lambda x:preprocess(x))
"REDUCE elongated words"
df1['textclean'] = df1['textclean'].apply(lambda x:reduce(x))
"REMOVE chinese/other languages after preprocessing"
df1['textclean'] = df1['textclean'].apply(lambda x:char_remove(x))
"STANDARDIZE words that have similar meaning & correcting short form and slang words"
df1['textclean'] = df1['textclean'].apply(lambda x: standardize(x))
"STANDARDIZE key terms related to train"
df1['textclean'] = df1['textclean'].apply(lambda x: standardize2(x))
"STANDARDIZE laughter expression e.g. lol, hahaha, hihihi, hehehe"
df1['textclean'] = df1['textclean'].apply(lambda x: standardize3(x))
"STANDARDIZE additional sentiment expression words"
df1['textclean'] = df1['textclean'].apply(lambda x: standardize_senti(x))
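#%%
# For reference, `reduce` above is assumed to collapse elongated words
# ("sooooo" -> "soo"); a minimal regex sketch of that idea (illustrative
# only, not necessarily the project's actual implementation):
import re

def reduce_elongated_sketch(text):
    # Collapse any character repeated 3+ times down to two occurrences,
    # e.g. reduce_elongated_sketch("okayyyyy") -> "okayy"
    return re.sub(r'(.)\1{2,}', r'\1\1', text)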
#%%
"SPLIT malay words that starts with x e.g. xnak, xboleh, xnormal"
"LONG RUNTIME !!!"
for row in df1.itertuples():
%time df1.at[row.Index, 'textclean'] = split_x(row.textclean)
print('split done for index: ' + str(row.Index))
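#%%
# `split_x` above is assumed to separate the leading "x" from the rest of the
# word ("xnak" -> "x nak"); a minimal sketch of that idea, assuming a word set
# `malay_words` (hypothetical name) is available. Per-token dictionary lookups
# would also explain the long runtime noted above.
def split_x_sketch(text, malay_words):
    out = []
    for word in text.split():
        # Split off the leading x only when the remainder is a known word
        if word.startswith('x') and word[1:] in malay_words:
            out.append('x ' + word[1:])
        else:
            out.append(word)
    return ' '.join(out)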
#%%
# RE-RUN standardize to clean up tokens produced by the x-split
df1['textclean'] = df1['textclean'].apply(standardize)
#%%
"CORRECT spelling"
"LONG RUNTIME !!!"
count = 0
for row in df1.itertuples():
%time df1.at[row.Index, 'textspell'] = correct(row.textclean)
count +=1
print("\n")
print(count)
print('spelling done for index: ' + str(row.Index))
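#%%
# `correct` above is assumed to be a per-tweet spell corrector; one common
# choice is TextBlob's built-in corrector, sketched here for reference
# (illustrative only -- the project's actual `correct` may differ):
from textblob import TextBlob

def correct_sketch(text):
    # TextBlob.correct() returns a TextBlob; cast back to a plain string
    return str(TextBlob(text).correct())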
#%%
"STEM & LEMMATIZE words"
"LONG RUNTIME !!!"
count = 0
for row in df1.itertuples():
%time df1.at[row.Index, 'textfin'] = stem_lemma(row.textspell)
count +=1
print("\n")
print('stem & lemmatize done for index: ' + str(row.Index) + " count: " + str(count))
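#%%
# `stem_lemma` above is assumed to stem and lemmatize each token; a minimal
# NLTK-based sketch of that idea (illustrative only -- requires
# nltk.download('wordnet') and may differ from the project's helper):
from nltk.stem import PorterStemmer, WordNetLemmatizer

_stemmer = PorterStemmer()
_lemmatizer = WordNetLemmatizer()

def stem_lemma_sketch(text):
    # Lemmatize each whitespace token first, then stem the result
    return ' '.join(_stemmer.stem(_lemmatizer.lemmatize(w)) for w in text.split())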
#%%
# FINAL English-specific standardization pass on the stemmed/lemmatized text
df1['textfin'] = df1['textfin'].apply(standardize_eng)
#%%
# SAVE the cleaned test set
df1.to_csv("Eng_testdata(labelled_Cleaned).csv", index=False)
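#%%
# Quick sanity check (illustrative): reload the saved file and compare the raw
# tweets against the fully cleaned text
check = pd.read_csv("Eng_testdata(labelled_Cleaned).csv")
print(check[['text', 'textfin']].head(8))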