lstm.py
import numpy as np
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Dropout, LSTM, Activation, CuDNNLSTM, Embedding
from keras.models import Model
from config import (
DATA_FILE,
DENSE_UNITS,
CONTENT,
LABELS,
DROPOUT_RATE,
EPOCHS,
BATCH_SIZE,
LSTM_HIDDEN_STATE,
GLOVE_FILE,
TEST_SIZE,
MAX_WORDS,
MAX_SEQUENCE_LEN,
PREDICTION_THRESHOLD,
)
from data_loading import loadData
from typing import Dict, List, Tuple
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from nb_svm import calculateModelMetrics
import pandas as pd


def getSequenceModel(word2index: Dict, word2vec: Dict, num_categories: int) -> Model:
    """Create a graph model for multi-label classification using an LSTM.

    Args:
        word2index: dictionary mapping words to indices
        word2vec: dictionary mapping words to embedding vectors
        num_categories: number of classes

    Returns:
        model: graph model for an LSTM with pretrained word embeddings
    """
# Embedding layer
sentence_indices = Input(shape=(MAX_SEQUENCE_LEN,), dtype="int32")
embedding_layer = createEmbeddingLayer(word2index, word2vec)
embeddings = embedding_layer(sentence_indices)
# LSTM, Dropout and activation layers
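    # Note: CuDNNLSTM runs only on a CUDA-enabled GPU; the standard LSTM layer imported
    # above takes the same arguments and can be swapped in on CPU-only machines.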
X = CuDNNLSTM(LSTM_HIDDEN_STATE, return_sequences=True)(embeddings)
X = Dropout(rate=DROPOUT_RATE)(X)
X = CuDNNLSTM(LSTM_HIDDEN_STATE, return_sequences=False)(X)
X = Dropout(rate=DROPOUT_RATE)(X)
X = Dense(DENSE_UNITS, activation="relu")(X)
X = Dropout(rate=DROPOUT_RATE)(X)
X = Dense(num_categories, activation="sigmoid")(X)
model = Model(inputs=sentence_indices, outputs=X)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
return model


def getWordVectors(df: DataFrame) -> Tuple[np.ndarray, Dict]:
    """Convert each text sample into a padded sequence of word indices.

    Args:
        df: text data, one sample per row

    Returns:
        data: text data represented as padded index sequences
        word2index: dictionary mapping words to indices
    """
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(list(df))
word2index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(df)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LEN)
return data, word2index
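
# A quick shape illustration (hypothetical toy corpus, not from the repository): with
# MAX_SEQUENCE_LEN = 5, getWordVectors(pd.Series(["the cat sat", "the dog barked"]))
# would return a (2, 5) integer matrix, left-padded with zeros by pad_sequences, and a
# word2index in which the most frequent token "the" maps to index 1.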


def loadGloveEmbeddings(file: str) -> Dict:
    """Load GloVe word embeddings from a file.

    Args:
        file: file with GloVe embeddings, formatted as in https://nlp.stanford.edu/projects/glove/

    Returns:
        word2vec: dictionary mapping each word to its embedding vector
    """
    word2vec = {}
    with open(file, encoding="utf8") as f:  # use the file argument rather than the global GLOVE_FILE
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            word2vec[word] = coefs
    return word2vec


def createEmbeddingLayer(word2index: dict, word2vec: dict) -> Embedding:
    """Set up a pretrained Embedding layer.

    Credits to: https://keras.io/examples/pretrained_word_embeddings/

    Args:
        word2index: dictionary mapping words to indices
        word2vec: dictionary mapping words to embedding vectors

    Returns:
        embedding_layer: layer with pretrained GloVe word embeddings
    """
num_words = min(MAX_WORDS, len(word2index)) + 1
    emb_dim = next(iter(word2vec.values())).shape[0]  # infer the embedding dimension from any GloVe vector
emb_matrix = np.zeros((num_words, emb_dim))
for word, index in word2index.items():
        if index > MAX_WORDS:
            continue  # skip out-of-vocabulary indices rather than assuming the dict is ordered by index
embedding_vector = word2vec.get(word)
if embedding_vector is not None:
emb_matrix[index, :] = embedding_vector
embedding_layer = Embedding(
num_words, emb_dim, input_length=MAX_SEQUENCE_LEN, trainable=False
) # Do not update word embeddings
embedding_layer.build((None,))
embedding_layer.set_weights([emb_matrix])
return embedding_layer
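

# The imports above pull in loadData, train_test_split, calculateModelMetrics and several
# training-related config values, but no driver code appears in this file. The block below
# is only a minimal sketch of how the pieces could be wired together; the exact signatures
# of loadData(DATA_FILE) and calculateModelMetrics(y_true, y_pred) are assumptions, not the
# repository's actual interface.
if __name__ == "__main__":
    # Load the raw data and convert the text column into padded index sequences
    df = loadData(DATA_FILE)  # assumed to return a DataFrame with CONTENT and LABELS columns
    data, word2index = getWordVectors(df[CONTENT])
    labels = df[LABELS].values

    # Hold out a test set
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=TEST_SIZE)

    # Build the LSTM with pretrained GloVe embeddings and train it
    word2vec = loadGloveEmbeddings(GLOVE_FILE)
    model = getSequenceModel(word2index, word2vec, num_categories=labels.shape[1])
    model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.1)

    # Threshold the per-class sigmoid outputs and report metrics
    predictions = (model.predict(X_test) > PREDICTION_THRESHOLD).astype(int)
    calculateModelMetrics(y_test, predictions)  # assumed signature: (true labels, predicted labels)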