-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPreprocessing.py
60 lines (49 loc) · 2.06 KB
/
Preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import pandas as pd
from keras.layers import Dense, Input, LSTM, Bidirectional, Conv1D
from keras.layers import Dropout, Embedding
from keras.preprocessing import text, sequence
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
import re
from langdetect import detect
from google.cloud import translate_v2 as translate
from tqdm import tqdm
# Ordered (pattern, replacement) rules for expanding English contractions.
# Order matters: the specific rules ("what's", "'scuse", "can't") must run
# before the generic ones ("'s", "n't") that would otherwise consume part of
# their match and leave fragments such as "cuse" behind.
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
)


def clean_text(text):
    """Normalise a comment into lowercase words separated by single spaces.

    The text is lowercased, common English contractions are expanded
    (e.g. "what's" -> "what is", "can't" -> "cannot", "'ve" -> "have"),
    every non-word character is replaced by a space, runs of whitespace are
    collapsed to one space, and leading/trailing spaces are removed.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned string; empty if the input holds no word characters.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Raw strings: "\W" and "\s" are invalid escapes in a plain str literal.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ")
EMBEDDING_FILE = 'glove.840B.300d.txt'


def _translate_non_english(frame, client, column='comment_text'):
    """Translate, in place, every entry of ``column`` not detected as English.

    Translation is best-effort: a row whose language detection or translation
    request raises is left unchanged and counted as a failure.

    Args:
        frame: DataFrame holding the text column; modified in place.
        client: Translation client exposing
            ``translate(text, target_language=...)`` and returning a mapping
            with a ``'translatedText'`` entry.
        column: Name of the text column to process.

    Returns:
        The number of rows skipped because detection or translation failed.
    """
    failures = 0
    # Iterate over the frame's own index instead of a hard-coded row count so
    # that no row is skipped and no missing label is requested.
    for idx in tqdm(frame.index):
        comment = frame.at[idx, column]
        try:
            if detect(comment) == 'en':
                continue
            translated = client.translate(comment, target_language='en')['translatedText']
        except Exception:
            # Deliberately best-effort (presumably detection fails on empty or
            # featureless text, and the API call may fail transiently -- TODO
            # confirm). Unlike a bare ``except`` this lets KeyboardInterrupt
            # and SystemExit stop the run.
            failures += 1
            continue
        # ``.at`` writes into the frame itself; chained ``frame[column][idx] = ...``
        # may write to a temporary copy and is a no-op under Copy-on-Write.
        frame.at[idx, column] = translated
    return failures


train_x = pd.read_csv('train.csv').fillna(' ')
test_x = pd.read_csv('test.csv').fillna(' ')
train_x['comment_text'] = train_x['comment_text'].map(clean_text)
test_x['comment_text'] = test_x['comment_text'].map(clean_text)

translate_client = translate.Client()
# NOTE(review): the translated text is stored as returned by the API and is
# not passed through clean_text again -- confirm this is intended.
test_failures = _translate_non_english(test_x, translate_client)
print('test: %d rows left untranslated after errors' % test_failures)
train_failures = _translate_non_english(train_x, translate_client)
print('train: %d rows left untranslated after errors' % train_failures)

test_x.to_csv('test_translated.csv', index=False)
train_x.to_csv('train_translated.csv', index=False)