# LSTM+character-based_POSTagging.py
import nltk
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from nltk.corpus.reader.tagged import TaggedCorpusReader
corpusdir = '_DATA'  # directory containing the tagged corpus files
newcorpus = TaggedCorpusReader(corpusdir, '.*')
print(newcorpus.fileids())
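# By default, TaggedCorpusReader expects plain-text files of word/TAG tokens
# separated by whitespace (e.g. "dog/NN ran/VBD"), one sentence per line.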
dictlist = []
# Convert each word/tag pair in the raw text to a (word, tag) tuple,
# i.e. [(word1, tag1), (word2, tag2), ...].
for i in newcorpus.fileids():
    tagged_sent = newcorpus.raw(i)
    tagged = tagged_sent.split()
    for t in tagged:
        dictlist.append(nltk.tag.str2tuple(t))
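# For example, nltk.tag.str2tuple("fly/NN") returns ("fly", "NN"), so
# dictlist ends up holding one (word, tag) tuple per token in the corpus.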
tagged_sentence = newcorpus.tagged_sents()
tagged_words = [tup for sent in tagged_sentence for tup in sent]
print("Total number of tagged words:", len(tagged_words))
print("Number of tagged sentences:", len(tagged_sentence))

# Word vocabulary: the unique words of the corpus plus an "ENDPAD" entry.
words = list(set(word for word, tag in tagged_words))
words.append("ENDPAD")
n_words = len(words)
print("Vocabulary size:", n_words)

# Tag set, sorted so the tag ordering is deterministic across runs.
tags1 = sorted(set(tag for word, tag in tagged_words))
n_tags = len(tags1)
print("Number of tags in the corpus:", n_tags)
max_len = 75        # maximum sentence length, in words
max_len_char = 15   # maximum word length, in characters

# Word index: 0 is reserved for padding and 1 for unknown words; any
# out-of-vocabulary lookup falls back to the "UNK" index via the defaultdict.
w2idx = defaultdict(lambda: 1, {w: i + 2 for i, w in enumerate(words)})
w2idx["UNK"] = 1
w2idx["PAD"] = 0

# Tag index: 0 is reserved for padding, so real tags start at 1.
t2idx = {t: i + 1 for i, t in enumerate(tags1)}
t2idx["PAD"] = 0
idx2tag = {i: t for t, i in t2idx.items()}
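# Sanity check (illustrative): every tag now maps to a unique non-zero index
# and idx2tag inverts the mapping, e.g. idx2tag[t2idx["NN"]] == "NN"
# whenever "NN" is one of the corpus tags.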
from keras.preprocessing.sequence import pad_sequences

# Encode each sentence as a fixed-length sequence of word indices.
X_word = [[w2idx[w] for w, t in s] for s in tagged_sentence]
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=w2idx["PAD"],
                       padding='post', truncating='post')
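# X_word now has shape (number of sentences, max_len): shorter sentences are
# padded with the PAD index and longer ones are truncated at max_len words.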
# Character vocabulary, again reserving 0 for padding and 1 for unknowns;
# sorted so the character indices are deterministic across runs.
chars = sorted(set(c for w in words for c in w))
n_chars = len(chars)
print("Number of distinct characters:", n_chars)
c2idx = {c: i + 2 for i, c in enumerate(chars)}
c2idx["UNK"] = 1
c2idx["PAD"] = 0
# Encode each sentence as a (max_len, max_len_char) grid of character indices.
X_char = []
for sentence in tagged_sentence:
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            try:
                word_seq.append(c2idx.get(sentence[i][0][j], c2idx["UNK"]))
            except IndexError:
                # Past the end of the word or the sentence: pad.
                word_seq.append(c2idx["PAD"])
        sent_seq.append(word_seq)
    X_char.append(sent_seq)
X_char = np.array(X_char)
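# X_char has shape (number of sentences, max_len, max_len_char).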
# Encode and pad the tag sequences; padded positions get the PAD tag (0).
y = [[t2idx[tag] for word, tag in s] for s in tagged_sentence]
y = pad_sequences(maxlen=max_len, sequences=y, value=t2idx["PAD"],
                  padding='post', truncating='post')
X_word_tr, X_word_te, y_tr, y_te = train_test_split(X_word, y, test_size=0.2, random_state=2018)
X_char_tr, X_char_te, _, _ = train_test_split(X_char, y, test_size=0.2, random_state=2018)
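# Because both calls use the same random_state and test_size, the rows of
# X_word_tr and X_char_tr stay aligned: train_test_split shuffles identically
# for a fixed seed.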
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed
from keras.layers import Bidirectional, concatenate, SpatialDropout1D
# Input and embedding for words.
word_in = Input(shape=(max_len,))
emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                     input_length=max_len, mask_zero=True)(word_in)
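# mask_zero=True makes downstream layers skip the PAD (index 0) positions.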
# Input and embeddings for characters, applied per word position.
char_in = Input(shape=(max_len, max_len_char,))
emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
                                     input_length=max_len_char,
                                     mask_zero=True))(char_in)
# Character-level LSTM: encode each word from its character embeddings.
char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
                                recurrent_dropout=0.5))(emb_char)
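# char_enc has shape (batch, max_len, 20): the final LSTM state gives one
# character-level encoding per word position.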
# Main bidirectional LSTM over the concatenated word and character features.
x = concatenate([emb_word, char_enc])
x = SpatialDropout1D(0.3)(x)
main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.6))(x)
# Softmax over n_tags real tags plus the PAD class; sigmoid would not give a
# valid probability distribution for sparse_categorical_crossentropy.
out = TimeDistributed(Dense(n_tags + 1, activation="softmax"))(main_lstm)
model = Model([word_in, char_in], out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["acc"])
model.summary()
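
# ---------------------------------------------------------------------------
# Training and inference sketch (not in the original file). The
# hyperparameters (batch_size=32, epochs=10, validation_split=0.1) are
# illustrative assumptions. y is reshaped to (samples, max_len, 1) because
# some Keras versions expect that shape when pairing
# sparse_categorical_crossentropy with a TimeDistributed output.
history = model.fit([X_word_tr, X_char_tr],
                    y_tr.reshape(len(y_tr), max_len, 1),
                    batch_size=32, epochs=10,
                    validation_split=0.1, verbose=1)

# Predict on the held-out split and map indices back to word and tag strings.
y_pred = model.predict([X_word_te, X_char_te])
pred_tags = np.argmax(y_pred, axis=-1)
idx2word = {i: w for w, i in w2idx.items()}
for w_i, t_i in zip(X_word_te[0], pred_tags[0]):
    if w_i != w2idx["PAD"]:
        print(idx2word[w_i], "->", idx2tag[t_i])
# ---------------------------------------------------------------------------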