-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPoStagging.py
205 lines (149 loc) · 7.45 KB
/
PoStagging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# Import libraries
import conllu
import numpy as np
import tensorflow as tf
class PoStaggingModel:
    """Bidirectional-LSTM tagger that predicts UPOS tags for CoNLL-U sentences.

    Typical flow: ``GetSentences`` -> ``InputEncoding`` / ``TargetEncoding`` ->
    ``CreateModel`` -> ``Train`` -> ``Test`` / ``Predict``.

    Attributes are created lazily by the methods below:
        tokenizer: set by ``InputEncoding(..., trainSet=True)``.
        inputEncoded: set by ``InputEncoding(..., predictSet=True)``.
        model: set by ``CreateModel`` or ``Load``.
        trainedModel: value returned by ``model.fit``; set by ``Train``.
    """

    # Every sentence is padded or truncated to this many tokens.
    _MAX_LEN = 128

    # UPOS tag -> label id. 'X' and '_' are both treated as "other" and share
    # an id. Id 0 is deliberately unused here: pad_sequences pads with 0.
    _UPOS_TO_ID = {
        'ADJ': 1, 'ADP': 2, 'ADV': 3, 'AUX': 4, 'CCONJ': 5, 'DET': 6,
        'INTJ': 7, 'NOUN': 8, 'NUM': 9, 'PART': 10, 'PRON': 11, 'PROPN': 12,
        'PUNCT': 13, 'SCONJ': 14, 'SYM': 15, 'VERB': 16, 'X': 17, '_': 17,
    }

    # Label id -> UPOS tag. Id 0 (padding) decodes to a 'PAD' placeholder so
    # that an argmax of 0 cannot raise KeyError while decoding.
    _ID_TO_UPOS = {
        0: 'PAD', 1: 'ADJ', 2: 'ADP', 3: 'ADV', 4: 'AUX', 5: 'CCONJ',
        6: 'DET', 7: 'INTJ', 8: 'NOUN', 9: 'NUM', 10: 'PART', 11: 'PRON',
        12: 'PROPN', 13: 'PUNCT', 14: 'SCONJ', 15: 'SYM', 16: 'VERB',
        17: 'X',
    }

    # Sparse labels must lie in [0, number of output units), so the output
    # layer needs one unit per id 0..17 (padding id + 17 tag ids).
    _NUM_CLASSES = max(_UPOS_TO_ID.values()) + 1

    def GetSentences(self, path):
        """Read a CoNLL-U file and return its parsed sentences.

        Args:
            path: Location of a UTF-8 encoded CoNLL-U text file.

        Returns:
            The result of ``conllu.parse`` on the file contents.
        """
        with open(path, mode="r", encoding="utf-8") as data:
            annotations = data.read()
        return conllu.parse(annotations)

    def InputEncoding(self, sentences, trainSet=False, predictSet=False):
        """Encode the word forms of each sentence as padded integer sequences.

        Args:
            sentences: Iterable of sentences; each token is indexed by
                ``'form'``.
            trainSet: When true, a new tokenizer is created, fitted on these
                sentences and stored as ``self.tokenizer``. Otherwise the
                previously stored tokenizer is reused (it must already exist).
            predictSet: When true, the unpadded encoded sequences are stored
                as ``self.inputEncoded`` so that ``Predict`` can later prune
                padding from its output.

        Returns:
            The encoded sequences, post-padded / post-truncated to 128 tokens.
        """
        tokenForms = [[token['form'] for token in sentence]
                      for sentence in sentences]

        # Only the training set is allowed to define the vocabulary.
        if trainSet:
            self.tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='OOV')
            self.tokenizer.fit_on_texts(tokenForms)

        inputEncoded = self.tokenizer.texts_to_sequences(tokenForms)

        # Keep the pre-padding sequences: their lengths tell Predict how many
        # predictions per sentence correspond to real tokens.
        if predictSet:
            self.inputEncoded = inputEncoded

        return tf.keras.utils.pad_sequences(inputEncoded, maxlen=self._MAX_LEN,
                                            padding='post', truncating='post')

    def TargetEncoding(self, sentences):
        """Encode the UPOS tag of each token as padded integer label sequences.

        Args:
            sentences: Iterable of sentences; each token is indexed by
                ``'upos'``.

        Returns:
            Label ids (1..17, with 0 as padding), post-padded / post-truncated
            to 128 tokens.

        Raises:
            KeyError: If a token carries a tag missing from the UPOS table.
        """
        outputEncoded = [[self._UPOS_TO_ID[token['upos']] for token in sentence]
                         for sentence in sentences]
        return tf.keras.utils.pad_sequences(outputEncoded, maxlen=self._MAX_LEN,
                                            padding='post', truncating='post')

    def CreateModel(self, embeddingSize=64, lstmUnits=32, dropout=0.0, printSummary=False):
        """Build the tagging network and store it as ``self.model``.

        The network is Embedding -> Bidirectional(LSTM) -> TimeDistributed(Dense
        softmax). Requires ``self.tokenizer`` (see ``InputEncoding``).

        Args:
            embeddingSize: Size of the embedding vector per word.
            lstmUnits: Number of units of the LSTM layer.
            dropout: Dropout value passed to the LSTM layer.
            printSummary: When true, the model summary is printed.

        Returns:
            The created (uncompiled) model.
        """
        # +1 because index 0 is reserved for padding.
        vsize = len(self.tokenizer.word_index) + 1

        # Fixed sentence length; the batch dimension is left unspecified.
        inputs = tf.keras.Input(shape=(self._MAX_LEN,), dtype=tf.int32)
        x = tf.keras.layers.Embedding(vsize, embeddingSize, mask_zero=True)(inputs)
        lstm = tf.keras.layers.LSTM(lstmUnits, return_sequences=True, dropout=dropout)
        x = tf.keras.layers.Bidirectional(lstm)(x)

        # One unit per label id 0..17. With only 17 units the id 17 ('X'/'_')
        # would be outside the valid sparse-label range.
        dense = tf.keras.layers.Dense(self._NUM_CLASSES, activation='softmax')
        outputs = tf.keras.layers.TimeDistributed(dense)(x)

        self.model = tf.keras.Model(inputs=inputs, outputs=outputs)

        if printSummary:
            # summary() prints by itself; wrapping it in print() adds "None".
            self.model.summary()

        return self.model

    def Train(self, trainInputs, trainTargets, inputValidation=None, targetValidation=None, epochs=10,
              optimizer=None, metrics=('accuracy',), batchSize=None):
        """Compile ``self.model`` and fit it on the given data.

        Args:
            trainInputs: Encoded training inputs.
            trainTargets: Encoded training targets.
            inputValidation: Optional validation inputs.
            targetValidation: Optional validation targets. Validation is used
                only when both validation arguments are provided.
            epochs: Number of training epochs.
            optimizer: Optimizer passed to ``compile``. When ``None``, a new
                ``SGD(learning_rate=0.01)`` is created for this call, so one
                optimizer instance is never shared between models.
            metrics: Metrics passed to ``compile``.
            batchSize: Batch size passed to ``fit``.

        Returns:
            The value returned by ``model.fit`` (also stored as
            ``self.trainedModel``).
        """
        if optimizer is None:
            optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
        if isinstance(metrics, tuple):
            metrics = list(metrics)

        self.model.compile(optimizer=optimizer,
                           loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                           metrics=metrics)

        validationData = None
        if inputValidation is not None and targetValidation is not None:
            validationData = (inputValidation, targetValidation)

        self.trainedModel = self.model.fit(trainInputs, trainTargets, epochs=epochs,
                                           validation_data=validationData,
                                           batch_size=batchSize)
        return self.trainedModel

    def Test(self, inputTest, targetTest):
        """Evaluate ``self.model`` on a test set.

        Returns:
            The value returned by ``model.evaluate``.
        """
        return self.model.evaluate(inputTest, targetTest)

    def Predict(self, inputSample, prunePadding=False):
        """Predict UPOS tags for encoded sentences.

        Args:
            inputSample: Encoded (padded) sentences passed to
                ``model.predict``.
            prunePadding: When true, each predicted sentence is cut to the
                length of the matching entry of ``self.inputEncoded``, i.e.
                the sentences last encoded with
                ``InputEncoding(..., predictSet=True)``.

        Returns:
            A list with one list of tag strings per sentence. A predicted
            label id of 0 (the padding id) is decoded as ``'PAD'``.
        """
        predictionsRaw = self.model.predict(inputSample)

        # Most probable label id per token.
        predictionsKeys = np.argmax(predictionsRaw, axis=-1)

        predictionDecoded = [[self._ID_TO_UPOS[int(key)] for key in sentence]
                             for sentence in predictionsKeys]

        if prunePadding:
            predictionDecoded = [tags[:len(self.inputEncoded[i])]
                                 for i, tags in enumerate(predictionDecoded)]

        return predictionDecoded

    def Save(self, path):
        """Save ``self.model`` to ``path``; returns what ``model.save`` returns."""
        return self.model.save(path)

    def Load(self, path):
        """Load a model from ``path``, store it as ``self.model`` and return it."""
        self.model = tf.keras.models.load_model(path)
        return self.model