# bi-lstm: a bidirectional LSTM text classifier in Keras/TensorFlow
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, classification_report
# Load data (placeholder paths; point these at your separate train and test CSVs)
train = pd.read_csv('path/train.csv')
test = pd.read_csv('path/test.csv')
# Concatenate test onto train so the labels are one-hot encoded with a consistent column order
combine = pd.concat([train, test], ignore_index=True)
# One-hot encode the label column for the combined set
y_combine = pd.get_dummies(combine['label']).values
# Split the labels back into train and test portions
y_train = y_combine[:len(train)]
y_test = y_combine[len(train):]
# Train and test inputs: the description column
X_train = train.description
X_test = test.description
# Tokenization: fit on the training text only, so test vocabulary does not leak in
max_words = 50000
oov_tok = '<OOV>'  # OOV = Out of Vocabulary
tokenizer = Tokenizer(num_words=max_words, oov_token=oov_tok)
tokenizer.fit_on_texts(train['description'])
word_index = tokenizer.word_index
# Convert text to integer sequences
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
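# Optional sanity check (not in the original script): report the vocabulary size.
# word_index holds every token seen, while texts_to_sequences only emits indices
# below max_words; rarer words map to the <OOV> token instead.
print('Found %d unique tokens' % len(word_index))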
# Pad/truncate every sequence to a fixed length (same settings for train and test)
max_length = 200
trunc_type = 'post'
padding_type = 'post'
X_train = pad_sequences(X_train, maxlen=max_length, padding=padding_type, truncating=trunc_type)
X_test = pad_sequences(X_test, maxlen=max_length, padding=padding_type, truncating=trunc_type)
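# Optional shape check (illustrative): after padding, both inputs should be
# 2-D integer matrices of shape (num_samples, max_length)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)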
# Create embedding layer
# input_dim must cover every index the tokenizer can emit: with num_words=max_words,
# indices run from 0 (padding) to max_words - 1, so use max_words, not len(word_index)
embedding_dim = 100
embedding_layer = Embedding(max_words, embedding_dim, input_length=max_length)
# Build model
num_classes = y_combine.shape[1]  # one softmax unit per label class
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(num_classes, activation='softmax'))
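# Optional (illustrative): print the layer stack and parameter counts to verify
# the architecture before compiling
model.summary()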
# Compile model
opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.001, decay=1e-6)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
# Train model; early stopping halts training once validation loss stops improving
num_epochs = 100
batch_size = 128
early_stop = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
history = model.fit(X_train, y_train, epochs=num_epochs, batch_size=batch_size,
                    validation_split=0.15, callbacks=[early_stop])
# Test model
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)  # predicted class indices
y_test_labels = np.argmax(y_test, axis=1)  # true class indices
print('accuracy %s' % accuracy_score(y_test_labels, y_pred_labels))
print(classification_report(y_test_labels, y_pred_labels))