-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtext_augmentation.py
211 lines (190 loc) · 7.25 KB
/
text_augmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import os
import re
import json
from random import shuffle, choice
from ltp import LTP
from tqdm import tqdm
# settings
# One LTP pipeline instance is shared by every twist_* function in this
# module for word segmentation and part-of-speech tagging.
ltp = LTP()

# The corpora are JSON files containing Chinese text.  The encoding is given
# explicitly so that loading does not depend on the platform default
# (which is not UTF-8 on every system).
# dict_typos: per-character look-alike / sound-alike candidates.
with open("corpus/corpus_handian/word_handian/corpus4typos.json", "r", encoding="utf-8") as f:
    dict_typos = json.load(f)
# dict_v / dict_adj / dict_adv / dict_n: per-word entries for verbs,
# adjectives, adverbs and nouns, read for their "synonym" / "antonym" lists.
with open("corpus/corpus_handian/word_handian/corpus4v.json", "r", encoding="utf-8") as f:
    dict_v = json.load(f)
with open("corpus/corpus_handian/word_handian/corpus4adj.json", "r", encoding="utf-8") as f:
    dict_adj = json.load(f)
with open("corpus/corpus_handian/word_handian/corpus4adv.json", "r", encoding="utf-8") as f:
    dict_adv = json.load(f)
with open("corpus/corpus_handian/word_handian/corpus4n.json", "r", encoding="utf-8") as f:
    dict_n = json.load(f)
def twist_num(q):
    """Swap the digits inside numeral tokens of ``q`` for random other digits.

    ``q`` is segmented and POS-tagged with the module-level ``ltp``
    pipeline.  In every token whose lower-cased tag is ``"m"``, each
    character from 一..九 or from 1..9 is replaced by a different, randomly
    chosen character of the same set.  Every other character (including
    ``0``) and every other token is kept as-is.  The tokens are then joined
    back together without a separator.

    Args:
        q: The sentence to perturb.

    Returns:
        A ``(q_new, q_tag)`` tuple: the rebuilt sentence and ``True`` when it
        differs from the input ``q``.
    """
    chinese_digits = ('一', '二', '三', '四', '五', '六', '七', '八', '九')
    arabic_digits = ('1', '2', '3', '4', '5', '6', '7', '8', '9')

    segments, hidden = ltp.seg([q])
    tags = ltp.pos(hidden)

    pieces = []
    for token, tag in zip(segments[0], tags[0]):
        if tag.lower() != "m":
            # Not a numeral: keep the token untouched.
            pieces.append(token)
            continue
        twisted = []
        for ch in token:
            for digits in (chinese_digits, arabic_digits):
                if ch in digits:
                    # Draw from the same digit set, excluding the digit itself.
                    ch = choice([d for d in digits if d != ch])
                    break
            twisted.append(ch)
        pieces.append("".join(twisted))

    q_new = "".join(pieces)
    return q_new, q_new != q
def update_typos(char):
    """Look up a look-alike or sound-alike replacement for ``char``.

    The entry for ``char`` in the module-level ``dict_typos`` is consulted.
    Its ``"similar_phone"`` list is tried first, then ``"similar_form"``; a
    list is only used when the entry also carries a ``"number"`` value.
    Within a list, the first candidate whose second element equals that
    ``"number"`` wins (presumably a stroke count -- confirm against the
    corpus); otherwise a random candidate of a non-empty list is taken.

    Args:
        char: A single character.

    Returns:
        ``(replacement, True)`` when a candidate was found, otherwise
        ``(char, False)``.
    """
    if char not in dict_typos:
        return char, False

    entry = dict_typos[char]
    for field in ("similar_phone", "similar_form"):
        if field not in entry or "number" not in entry:
            continue
        candidates = entry[field]
        for candi in candidates:
            if candi[1] == entry["number"]:
                return candi[0], True
        if len(candidates) > 0:
            # No candidate matched "number": fall back to a random one.
            return choice(candidates)[0], True
    return char, False
def twist_typos(q):
    """Replace one character of ``q`` with a similar-looking or -sounding one.

    ``q`` is segmented and POS-tagged with the module-level ``ltp``
    pipeline.  Tokens whose lower-cased tag contains ``"n"`` or equals
    ``"i"``, ``"j"`` or ``"r"`` are candidates.  They are tried from longest
    to shortest; within a token the characters are tried in random order,
    and the first character for which ``update_typos`` finds a replacement
    is substituted.

    The substitution is applied at the candidate token's own position in
    ``q``.  Replacing the first textual occurrence of the word instead could
    rewrite an earlier, unrelated token that merely contains the same
    characters.

    Args:
        q: The sentence to perturb.

    Returns:
        ``(q_new, True)`` when a character was replaced, otherwise
        ``(q, False)``.
    """
    segments, hidden = ltp.seg([q])
    tags = ltp.pos(hidden)

    # Collect candidate tokens together with their offset in q.  The offset
    # is found by scanning left to right, so a repeated word maps to the
    # occurrence that actually belongs to this token.  -1 marks a token that
    # could not be located in q.
    candidates = []
    cursor = 0
    for token, tag in zip(segments[0], tags[0]):
        start = q.find(token, cursor)
        if start >= 0:
            cursor = start + len(token)
        pos = tag.lower()
        if "n" in pos or pos in ("i", "j", "r"):
            candidates.append((token, start))
    if not candidates:
        return q, False

    # Longest tokens first; the sort is stable, so ties keep sentence order.
    candidates.sort(key=lambda item: len(item[0]), reverse=True)
    for token, start in candidates:
        chars = list(token)
        order = list(range(len(chars)))
        shuffle(order)
        for i in order:
            new_char, changed = update_typos(chars[i])
            if not changed:
                continue
            chars[i] = new_char
            twisted = "".join(chars)
            if start >= 0:
                return q[:start] + twisted + q[start + len(token):], True
            # Token could not be aligned with q: fall back to replacing the
            # first textual occurrence.
            return q.replace(token, twisted, 1), True
    return q, False
def _twist_by_relation(q, pos_tag, relation):
    """Replace one word of ``q`` with a related word from the lexicon.

    Shared implementation of ``twist_synonym`` and ``twist_antonym``.

    Args:
        q: The sentence to perturb.
        pos_tag: ``"v"``, ``"adj"``, ``"adv"`` or ``"n"``; selects both the
            lexicon and the POS filter.  Any other value leaves ``q`` as-is.
        relation: Key of the replacement list inside a lexicon entry
            (``"synonym"`` or ``"antonym"``).

    Returns:
        ``(q_new, True)`` when a word was replaced, otherwise ``(q, False)``.
    """
    # Pick the lexicon and the POS predicate.  Plain callables are used
    # instead of eval() on a condition string.
    if pos_tag == "v":
        lexicon, matches = dict_v, (lambda pos: "v" in pos)
    elif pos_tag == "adj":
        lexicon, matches = dict_adj, (lambda pos: pos in ("a", "b"))
    elif pos_tag == "adv":
        lexicon, matches = dict_adv, (lambda pos: pos == "d")
    elif pos_tag == "n":
        lexicon, matches = dict_n, (lambda pos: "n" in pos)
    else:
        return q, False

    segments, hidden = ltp.seg([q])
    tags = ltp.pos(hidden)

    # Collect candidate tokens together with their offset in q, scanning
    # left to right so that a repeated word maps to the occurrence that
    # belongs to this token.  -1 marks a token that could not be located.
    candidates = []
    cursor = 0
    for token, tag in zip(segments[0], tags[0]):
        start = q.find(token, cursor)
        if start >= 0:
            cursor = start + len(token)
        if matches(tag.lower()):
            candidates.append((token, start))
    if not candidates:
        return q, False

    shuffle(candidates)
    for token, start in candidates:
        if token not in lexicon:
            continue
        # An entry without the requested relation is treated like an empty
        # list instead of raising KeyError.
        options = lexicon[token].get(relation)
        if not options:
            continue
        replacement = choice(options)
        if start >= 0:
            # Replace at the token's own position.  Replacing the first
            # textual occurrence could rewrite an unrelated earlier token
            # that merely contains the same characters.
            return q[:start] + replacement + q[start + len(token):], True
        return q.replace(token, replacement, 1), True
    return q, False


def twist_synonym(q, pos_tag="v"):
    """Replace one verb/adjective/adverb/noun of ``q`` with a synonym.

    Args:
        q: The sentence to perturb.
        pos_tag: Word class to target: ``"v"``, ``"adj"``, ``"adv"`` or
            ``"n"``.  Any other value leaves ``q`` unchanged.

    Returns:
        ``(q_new, True)`` when a word was replaced, otherwise ``(q, False)``.
    """
    return _twist_by_relation(q, pos_tag, "synonym")


def twist_antonym(q, pos_tag="v"):
    """Replace one verb/adjective/adverb/noun of ``q`` with an antonym.

    Args:
        q: The sentence to perturb.
        pos_tag: Word class to target: ``"v"``, ``"adj"``, ``"adv"`` or
            ``"n"``.  Any other value leaves ``q`` unchanged.

    Returns:
        ``(q_new, True)`` when a word was replaced, otherwise ``(q, False)``.
    """
    return _twist_by_relation(q, pos_tag, "antonym")
def show_fix_input():
    """Print the result of every twist function on fixed demo sentences."""
    sentence = "他最后是快乐地接受了这个悲惨的结局吗"
    pos_tags = ("v", "adj", "adv", "n")

    print(twist_num("这是十二亿元的问题吗"))
    print(twist_num("42个苹果就是宇宙的终极答案吗"))
    print(twist_typos("命运之夜的结局是皆大欢喜的吗"))
    # All synonym variants first, then all antonym variants.
    for twist in (twist_synonym, twist_antonym):
        for tag in pos_tags:
            print(twist(sentence, tag))
    return None
def show_self_defined_input(q):
    """Print the result of every twist function applied to the sentence ``q``.

    ``twist_num`` is printed twice; because its replacement digits are drawn
    at random, the two lines may differ.
    """
    pos_tags = ("v", "adj", "adv", "n")

    for _ in range(2):
        print(twist_num(q))
    print(twist_typos(q))
    # All synonym variants first, then all antonym variants.
    for twist in (twist_synonym, twist_antonym):
        for tag in pos_tags:
            print(twist(q, tag))
    return None
if __name__ == "__main__":
    # Demo run: the built-in sample sentences first, then one custom query.
    show_fix_input()
    custom_query = "公积金提取记录单到哪儿打印"
    show_self_defined_input(custom_query)