tagging_ner.py
"""Tag Portuguese sentences with a trained Flair NER model and write the predictions
in CoNLL ('token tag' per line) or plain tagged-text format."""
from nltk import tokenize
import sys
from flair.data import Sentence
from flair.models import SequenceTagger

print(" ")
def load_input_conll_file(filename):
    """Read a CoNLL-style file: one token per line, blank line between sentences."""
    tokens = []
    sentences = []
    with open(filename, "r", encoding="utf8") as file:
        for line in file:
            if line != "\n":
                line = line.strip()
                tokens.append(line)
            else:
                sentence = " ".join(tokens)
                sentences.append(sentence)
                tokens = []
    # Keep the last sentence if the file does not end with a blank line.
    if tokens:
        sentences.append(" ".join(tokens))
    print("Total Sentences: ", len(sentences))
    return sentences
def load_input_plain_file(filename):
    """Read a plain-text file, one sentence per line, and tokenize it with NLTK."""
    sentences = []
    with open(filename, "r", encoding="utf8") as file:
        for line in file:
            line = line.strip()
            tokens = tokenize.word_tokenize(line, language='portuguese')
            sentences.append(" ".join(tokens))
    return sentences
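
# Note: word_tokenize above relies on NLTK's 'punkt' tokenizer data. If it is not
# already installed, a one-time download is needed (a minimal sketch):
#   import nltk
#   nltk.download('punkt')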
def predict_sentence(sentences):
    """Run the trained Flair NER model over every sentence."""
    i = 1
    pred = []
    print(" ")
    print("-----------------LOADING NER MODEL-----------------")
    # load_from_file is the older Flair API; recent Flair releases expose SequenceTagger.load.
    tagger = SequenceTagger.load_from_file('best-model.pt')
    print("---------------------------------------------------")
    for sentence in sentences:
        sentence_to_predict = Sentence(sentence)
        tagger.predict(sentence_to_predict)
        pred.append(sentence_to_predict.to_tagged_string())
        print("Sentence predicted: ", str(i) + "/" + str(len(sentences)))
        i += 1
    return pred
def output_conll_format(output_filename, predicted):
    """Write the tagged sentences as CoNLL-style 'token tag' lines, one blank line between sentences."""
    print(" ")
    tag_list = ['<B-ORG>', '<I-ORG>', '<B-TMP>', '<I-TMP>', '<B-LOC>', '<I-LOC>',
                '<B-VAL>', '<I-VAL>', '<B-PER>', '<I-PER>']
    new_file = open(output_filename, "w+", encoding="utf8")
    new_list_tokens_tags, new_sentences_with_tags = [], []
    for sentence in predicted:
        splited = sentence.split(' ')
        for i in range(len(splited)):
            if splited[i] in tag_list:
                tag = splited[i]
                new_list_tokens_tags.append(tag)
            else:
                token = splited[i]
                new_list_tokens_tags.append(token)
                # Tokens not followed by an entity tag are outside any entity ('O').
                if i + 1 < len(splited):
                    if splited[i + 1] not in tag_list:
                        tag = 'O'
                        new_list_tokens_tags.append(tag)
        # If the sentence ends on an untagged token, close it with 'O'.
        if new_list_tokens_tags[-1] not in tag_list:
            new_list_tokens_tags.append('O')
        new_sentences_with_tags.append(new_list_tokens_tags)
        new_list_tokens_tags = []
    # Each sentence is now an alternating token/tag list; write one 'token tag' pair per line.
    for new_sentence in new_sentences_with_tags:
        for i in range(len(new_sentence)):
            if i % 2 == 0:
                token = new_sentence[i]
                tag = new_sentence[i + 1]
                tag = tag.replace("<", "")
                tag = tag.replace(">", "")
                new_file.write(token + " " + tag + "\n")
        new_file.write("\n")
    new_file.close()
    print(" ")
    print("Output file Done!")
def output_plain_format(output_filename, predicted):
    new_file = open(output_filename, "w+", encoding="utf8")
    for pred in predicted:
        new_file.write(pred + "\n")
    new_file.close()
    print(" ")
    print("Output file Done!")
def main():
    # Usage: python tagging_ner.py <input_file> <output_file> <conll|plain>
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    mode = sys.argv[3]
    print("Input File: ", input_file)
    print("Output File: ", output_file)
    print(" ")
    if str(mode) == 'conll':
        sentences = load_input_conll_file(str(input_file))
        predicted = predict_sentence(sentences)
        output_conll_format(str(output_file), predicted)
    if str(mode) == 'plain':
        sentences = load_input_plain_file(str(input_file))
        predicted = predict_sentence(sentences)
        output_plain_format(str(output_file), predicted)


if __name__ == "__main__":
    main()
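
# Example invocations (a sketch; the input/output file names are placeholders, and the
# trained Flair model 'best-model.pt' is expected in the working directory):
#   python tagging_ner.py input.conll output.conll conll
#   python tagging_ner.py input.txt tagged.txt plain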