# Eng_clean_traindata(FINAL) 26 Nov.py
#%%
"Run preprocessing functions"
import pandas as pd  # used throughout; RP_functions.py provides the preprocessing helpers

runfile('C:/Users/shash/RP_functions.py', wdir='C:/Users/shash')
#%%
df = pd.read_csv('rapidkl_1_2018_clean(target).csv')
# Keep only tweets detected as English. NOTE: the original filtered on the
# 'text' column (the tweet itself); a language-label column is assumed here,
# and 'language' is a hypothetical name -- adjust to the dataset's actual column.
df = df[df['language'] == 'ENGLISH']
df.dtypes          # inspect column types
df.isna().sum()    # count missing values per column
df['text'] = df['text'].astype('str')
df.shape
#%%
"REMOVE admin tweets"
admin = ['ktm_berhad', 'aduanMOT', 'MyRapidKL', 'AskRapidKL',
         'AsianBuses', 'AstroRadioNews', 'MRTMalaysia', 'APADChannel',
         'Rapidpg', 'Salak_Selatan', 'KLIAtransit']
# the original referenced df1 before it was assigned; derive it from df here
df1 = df[~df.username.isin(admin)]
df1.shape
#%%
"PREPROCESS text"
df1['textclean'] = df1['text'].apply(preprocess)
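#%%
# For reference: `preprocess` is defined in RP_functions.py and is not shown in
# this file. Below is a minimal sketch of what a tweet preprocessor might do;
# the exact steps (lowercasing, stripping URLs/mentions/hashtags/punctuation)
# are assumptions, not the author's implementation, and the sketch is never
# called by the pipeline.
import re
import string

def preprocess_sketch(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|@\w+|#\w+', ' ', text)               # drop URLs, mentions, hashtags
    text = text.translate(str.maketrans('', '', string.punctuation))  # drop punctuation
    return re.sub(r'\s+', ' ', text).strip()                          # collapse whitespace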
#%%
"REMOVE duplicate text"
df1 = df1.drop_duplicates(['textclean'], keep='last')
df1.shape
#%%
"REDUCE elongated words"
df1['textclean'] = df1['textclean'].apply(lambda x:reduce(x))
"REMOVE chinese/other languages after preprocessing"
df1['textclean'] = df1['textclean'].apply(lambda x:char_remove(x))
"STANDARDIZE words that have similar meaning & correcting short form and slang words"
df1['textclean'] = df1['textclean'].apply(lambda x: standardize(x))
"STANDARDIZE key terms related to train"
df1['textclean'] = df1['textclean'].apply(lambda x: standardize2(x))
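#%%
# Minimal sketches of the helpers above (the real versions live in
# RP_functions.py). The regexes and the tiny slang table are assumptions made
# for illustration only; none of these sketches is called by the pipeline.
import re

def reduce_sketch(text):
    # collapse characters repeated 3+ times down to two: "sooooo" -> "soo"
    return re.sub(r'(\w)\1{2,}', r'\1\1', text)

def char_remove_sketch(text):
    # keep ASCII letters and whitespace only, dropping Chinese and other scripts
    return re.sub(r'[^a-zA-Z\s]', ' ', text)

SLANG_MAP = {'tmrw': 'tomorrow', 'pls': 'please', 'stn': 'station'}  # hypothetical entries

def standardize_sketch(text):
    # per-token dictionary lookup is one plausible way to normalise slang
    return ' '.join(SLANG_MAP.get(tok, tok) for tok in text.split())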
#%%
"FILTER to select train related tweets"
term = ["Lrt","lrt","Mrt","mrt","Monorail","monorail","station","train","carriage"]
term.extend(stationnames)
dftren = df1[df1.textclean.str.contains('|'.join(term))]
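#%%
# Note: '|'.join(term) does plain substring matching, so a short term such as
# "lrt" can also match inside longer words. If whole-word matching were wanted,
# a word-boundary pattern is one alternative (shown here but deliberately not
# applied, to preserve the original behaviour):
term_pattern = r'\b(?:' + '|'.join(term) + r')\b'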
#%%
"FILTER to EXCLUDE bus, ktm, erl and other non train related tweets"
bus_ktm_erl = ["bus","bas","brt","Brt","driver","drebar","drivers","ktm","ecrl","ecrls","erls","erl","ktm_berhad",
"ktm berhad","uitm", "puncak alam", "air selangor", "ukm", "dbkl", "upm",
"puncak perdana", "rapid penang", "rapidpenang","klia express","kliaexpress"]
dftren = dftren[~dftren.textclean.str.contains('|'.join(bus_ktm_erl))]
dftren.shape
#%%
"REMOVE stopwords"
dftren['textclean'] = dftren['textclean'].apply(lambda x: remove_stopwords(x))
"STANDARDIZE laughter expression e.g. lol, hahaha, hihihi, hehehe"
dftren['textclean'] = dftren['textclean'].apply(lambda x: standardize3(x))
"REMOVE stopwords"
dftren['textclean'] = dftren['textclean'].apply(lambda x: remove_stopwords(x))
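#%%
# Minimal sketches of `remove_stopwords` and `standardize3` (both defined in
# RP_functions.py). Using NLTK's English stopword list and this particular
# laughter regex are assumptions for illustration; neither sketch is called.
import re
from nltk.corpus import stopwords  # nltk.download('stopwords') may be required

STOPWORDS = set(stopwords.words('english'))

def remove_stopwords_sketch(text):
    return ' '.join(tok for tok in text.split() if tok not in STOPWORDS)

def standardize3_sketch(text):
    # map laughter variants (lol, hahaha, hihihi, hehehe, ...) to one token
    return re.sub(r'\b(?:lol+|(?:ha|he|hi){2,}h?)\b', 'haha', text)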
#%%
"SPLIT malay words that starts with x e.g. xnak, xboleh, xnormal"
"LONG RUNTIME!!!"
for row in dftren.itertuples():
    %time dftren.at[row.Index, 'textclean'] = split_x(row.textclean)
    print('split done for index: ' + str(row.Index))
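#%%
# A minimal sketch of `split_x` (real version in RP_functions.py). The reading
# assumed here is that a leading "x" is texting shorthand for the Malay
# negation "tak", split off when the remainder is a known word; the tiny
# MALAY_WORDS set is purely illustrative and the sketch is not called.
MALAY_WORDS = {'nak', 'boleh', 'normal', 'ada', 'jadi'}

def split_x_sketch(text):
    out = []
    for tok in text.split():
        if tok.startswith('x') and tok[1:] in MALAY_WORDS:
            out.extend(['tak', tok[1:]])  # e.g. "xnak" -> "tak nak"
        else:
            out.append(tok)
    return ' '.join(out)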
#%%
"STANDARDIZE and remove stopwords"
dftren['textclean'] = dftren['textclean'].apply(standardize)
dftren['textclean'] = dftren['textclean'].apply(remove_stopwords)
#%%
"CORRECT spelling"
"LONG RUNTIME !!!"
count = 0
for row in dftren.itertuples():
    %time dftren.at[row.Index, 'textspell'] = correct(row.textclean)
    count += 1
    print("\n")
    print(count)
    print('spelling done for index: ' + str(row.Index))
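#%%
# A minimal sketch in the spirit of `correct` (RP_functions.py), whose actual
# implementation is not shown. Matching each token against a vocabulary with
# difflib is just one simple approach; the VOCAB list is hypothetical and the
# sketch is not used by the pipeline.
import difflib

VOCAB = ['train', 'station', 'delay', 'crowded']  # hypothetical vocabulary

def correct_sketch(text):
    out = []
    for tok in text.split():
        match = difflib.get_close_matches(tok, VOCAB, n=1, cutoff=0.8)
        out.append(match[0] if match else tok)
    return ' '.join(out)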
#%%
"REMOVE stopwords"
dftren['textspell'] = dftren['textspell'].apply(lambda x: remove_stopwords(x))
#%%
"If required run clear cache"
#malaya.clear_cache('language-detection/multinomial')
#%%
"STEM & LEMMATIZE words"
"LONG RUNTIME !!!"
count = 0
for row in dftren.itertuples():
    %time dftren.at[row.Index, 'textfin'] = stem_lemma(row.textspell)
    count += 1
    print("\n")
    print('stem & lemmatize done for index: ' + str(row.Index) + " count: " + str(count))
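#%%
# A minimal sketch of `stem_lemma` (RP_functions.py). NLTK's WordNet lemmatizer
# is an assumption; the real pipeline may stem and lemmatize differently.
from nltk.stem import WordNetLemmatizer  # nltk.download('wordnet') may be required

_lemmatizer = WordNetLemmatizer()

def stem_lemma_sketch(text):
    return ' '.join(_lemmatizer.lemmatize(tok) for tok in text.split())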
#%%
dftren['textfin'] = dftren['textfin'].apply(standardize_eng)
#%%
"FILTER OUT tweets with less than 4 words"
%time dffin = dftren[dftren.textfin.apply(lambda x: word_count(x) >= 4)]
dffin.shape
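#%%
# `word_count` also comes from RP_functions.py; a whitespace token count is
# the obvious reading, sketched here for reference only:
def word_count_sketch(text):
    return len(str(text).split())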
#%%
dffin.to_csv("Eng_traindata.csv",index=False)