-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
549 lines (484 loc) · 18.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
import os
import re
import time
import random
import readlogs
import utils
from youdao import YouDaoFanyi
from google_trans import GoogleTrans
class Counter:
    """Count translator calls and pause between them.

    Args:
        mode: When equal to ``'debug'`` :meth:`wait` returns immediately
            without sleeping.
    """

    def __init__(self, mode):
        self.mode = mode
        self.cnt = 0

    def wait(self):
        """Sleep for a duration chosen from the current call count."""
        if self.mode != 'debug':
            time.sleep(self._pick_delay())

    def _pick_delay(self):
        """Return the number of seconds to sleep for the current count."""
        calls = self.cnt
        # Checked from the rarest milestone to the most frequent one so the
        # longest pause wins when several divisors match.
        if calls % 30 == 0:
            return self._1D10() + 5
        if calls % 10 == 0:
            return self._1D6() + 3
        if calls % 5 == 0:
            return self._1D6()
        return 0.8

    def _1D10(self):
        """Return a random integer from 1 to 10 inclusive."""
        return random.randint(1, 10)

    def _1D6(self):
        """Return a random integer from 1 to 6 inclusive."""
        return random.randint(1, 6)

    def incr(self):
        """Increase the call count by one."""
        self.cnt = self.cnt + 1
class Solver:
    """Translate the ``~``-delimited text of input lines into Chinese.

    Glossary terms loaded from the ``dict/`` files are replaced by numeric
    ``[N]`` tokens before the text goes to the translator, and the tokens are
    replaced by the glossary translations afterwards.

    Args:
        mode: ``'debug'`` skips translator calls and rate-limit sleeps and
            enables extra diagnostic prints; any other value runs normally.
    """

    def __init__(self, mode=''):
        self.mode = mode
        # Delimiters wrapped around a numeric token id, e.g. "[12]".
        self.TOKEN_SIGNAL_L = '['
        self.TOKEN_SIGNAL_R = ']'
        # Glossary: lower-cased source word -> translation.
        self.word_dict = dict()
        # Lower-cased glossary key -> integer token id.
        self.token = dict()
        # Token id (as a string) -> lower-cased glossary key.
        self.token_r = dict()
        # Proper names -> translation.
        self.name_dict = dict()
        # Words removed from the text; only the keys are used.
        self.ignore_dict = dict()
        # Words that appear near manual_trans_word entries and need a
        # translation that differs from everyday usage.
        self.sp_word_dict = dict()
        # Multi-word ("compound") terms -> translation.
        self.comp_word_dict = dict()
        # Terms whose presence marks a line as directly translatable.
        self.manual_trans_word_dict = dict()
        # Voice-marker caches.  voice_cache must exist before the first
        # on_voice() call, which reads it unconditionally.
        self.voice_cache = ''
        self.voice_multi_cache = []
        # Fill the glossaries and the token tables.
        self._init_word_dict()
        # Translator backend chosen from the configuration file.
        self.translator = self.get_translator()
        # Rate limiter used between single translator calls.
        self.counter = Counter(mode)

    def no_need_trans(self, line):
        """Return True when *line* should be left untranslated.

        A line is skipped when it is blank, already contains a CJK
        ideograph, contains ``000000``, equals ``placeholder``, or consists
        only of a leading ``[...]`` voice marker.
        """
        if line.strip() == '':
            return True
        if self.has_zh(line):
            return True
        if '000000' in line:
            return True
        if line == 'placeholder':
            return True
        # Only the marker is present: nothing to translate.
        line = self.off_voice(line)
        if line.strip() == '':
            print('')
            # Called for its side effect of clearing the cached marker.
            line = self.on_voice(line)
            return True
        return False

    def text_pre_solve(self, line, pattern='single', index=0):
        """Prepare *line* for the translator.

        Directly translatable lines are returned with the cached voice marker
        prepended; other lines are returned with glossary terms replaced by
        tokens.

        NOTE(review): the "no API needed" flag from direct_translate() is not
        returned, so callers still pass directly translated text to the
        translator - confirm whether that is intended.
        """
        res, no_api_req = self.direct_translate(line)
        if no_api_req:
            print('[直译] ' + res)
            print('')
            return self.on_voice(res, pattern, index)
        if res.strip() == '':
            print('')
            return self.on_voice(res, pattern, index)
        res = self.token_replace(res)
        print('[替换token] ' + res)
        return res

    def text_after_solve(self, line, pattern='single', index=0):
        """Restore glossary tokens and the cached voice marker in *line*."""
        restored = self.set_token_back(line)
        return self.on_voice(restored, pattern, index)

    def batch_solve(self, line, batch_trans_lines, index):
        """Queue *line* for batch translation.

        Returns:
            True when the pre-processed text was appended to
            *batch_trans_lines*; False when the line needs no translation or
            the solver is in debug mode (pre-processing still runs).
        """
        if self.no_need_trans(line):
            return False
        res = self.text_pre_solve(line, pattern='batch', index=index)
        if self.mode == 'debug':
            # Debug mode never sends anything to the translator.
            return False
        batch_trans_lines.append(res)
        return True

    def single_solve(self, line):
        """Translate one piece of text and return the result.

        Lines that need no translation are returned unchanged.  In debug mode
        the pre-processed text is used in place of a translator result.
        """
        if self.no_need_trans(line):
            return line
        res = self.text_pre_solve(line)
        if self.mode == 'debug':
            zh = res
            print('[API结果] ' + zh)
        else:
            zh = self.translator.translate(res)
        self.counter.incr()
        rev_back = self.text_after_solve(zh)
        # Pause so the translator is not called too frequently.
        self.counter.wait()
        return rev_back

    def on_voice(self, line, pattern='single', index=0):
        """Prepend the cached voice marker to *line* and clear the cache.

        With ``pattern == 'single'`` the shared cache is used; otherwise slot
        *index* of the per-line cache is used.
        """
        if pattern == 'single':
            cached = self.voice_cache
            self.voice_cache = ''
        else:
            cached = self.voice_multi_cache[index]
            self.voice_multi_cache[index] = ''
        return cached + line

    def off_voice(self, line, pattern='single', index=0):
        """Strip a leading ``[...]`` marker from *line* and cache it.

        The cache is reset to ``''`` first, so a line without a marker leaves
        an empty cache.  An empty *line* is returned unchanged.
        """
        marker = ''
        # startswith() is safe on an empty string, unlike line[0].
        if line.startswith('['):
            close = line.find(']')
            if close != -1:
                marker = line[:close + 1]
                line = line[close + 1:]
        if pattern == 'single':
            self.voice_cache = marker
        else:
            self.voice_multi_cache[index] = marker
        return line

    def get_translator(self):
        """Build the translator named by ``config.use`` in ``appconf.ini``.

        ``'google'`` selects GoogleTrans; every other value, including
        ``'youdao'``, selects YouDaoFanyi.
        """
        use = utils.read_config('appconf.ini')['config']['use']
        if use == 'google':
            return GoogleTrans('auto', 'zh-CN')
        return YouDaoFanyi('en', 'zh-CHS')

    def _init_word_dict(self):
        """Load every glossary file and assign token ids.

        All glossaries share ``token``/``token_r``, so ids continue from one
        file to the next and keys are stored lower-cased.
        """
        sources = (
            ('dict/word_dict.txt', self.word_dict),
            ('dict/comp_word_dict.txt', self.comp_word_dict),
            ('dict/manual_trans_word_dict.txt', self.manual_trans_word_dict),
            ('dict/name_dict.txt', self.name_dict),
            ('dict/sp_word_dict.txt', self.sp_word_dict),
        )
        idx = 1
        for path, target in sources:
            idx = self._init_token(utils.read_dict(path), target, idx)
        # Words that are dropped from the text.
        for word in ('lbs.', 'lb.', 'ft.'):
            self.ignore_dict[word] = ''

    def _init_token(self, lines, w_dict, idx):
        """Load ``key#value`` entries into *w_dict* and register tokens.

        Keys are lower-cased; values are stored unchanged.  Entries without a
        ``#`` separator (for example blank lines) are skipped.

        Args:
            lines: Iterable of ``key#value`` strings.
            w_dict: Dictionary that receives ``key -> value``.
            idx: First token id to assign.

        Returns:
            The next unused token id.
        """
        next_id = idx
        for entry in lines:
            parts = entry.split('#')
            if len(parts) < 2:
                continue
            key = parts[0].lower()
            w_dict[key] = parts[1]
            self.token[key] = next_id
            self.token_r[str(next_id)] = key
            next_id += 1
        return next_id

    def _init_voice_cache(self, size):
        """Reset the per-line voice cache to *size* empty slots."""
        # Rebuilt rather than appended to, so repeated batch runs do not
        # make the cache grow without bound.
        self.voice_multi_cache = [''] * size

    def _apply_sp_words(self, line):
        """Replace every sp_word_dict key found in *line*."""
        for sp_word, translation in self.sp_word_dict.items():
            if sp_word in line:
                line = line.replace(sp_word, translation)
        return line

    def direct_translate(self, line):
        """Translate *line* from the glossaries alone where possible.

        The line is lower-cased first.  It is translated directly when it is
        exactly a compound term, contains a manual-translation term, or is
        shorter than ten characters and starts with a dash.

        Returns:
            ``(text, no_api_req)`` where *no_api_req* is True when the
            glossaries were applied directly.
        """
        line = line.lower()
        # The whole line is a single compound term.
        if line in self.comp_word_dict:
            return (self.comp_word_dict[line], True)
        no_api_req = False
        for key, value in self.manual_trans_word_dict.items():
            if key in line:
                no_api_req = True
                line = line.replace(key, value)
                # Context-specific words are only replaced near such a term.
                line = self._apply_sp_words(line)
        if len(line) == 0:
            return (line, no_api_req)
        # Hyphen-minus and en dash are two different characters.
        if line[0] in ('-', '–') and len(line) < 10:
            print('[no_api_req]' + str(no_api_req))
            no_api_req = True
            line = self._apply_sp_words(line)
        if no_api_req:
            # No translator pass will follow, so replace as much as possible.
            for key, value in self.comp_word_dict.items():
                if key in line:
                    line = line.replace(key, value)
            words = line.split(' ')
            for raw in words:
                cleaned = self.word_clear(raw)
                # The ignore keys end in '.', which word_clear() strips, so
                # the raw word has to be checked as well.
                if raw in self.ignore_dict:
                    line = line.replace(raw, '')
                    continue
                if cleaned in self.ignore_dict:
                    line = line.replace(cleaned, '')
                    continue
                if cleaned in self.word_dict:
                    line = line.replace(cleaned, self.word_dict[cleaned])
            for name, translation in self.name_dict.items():
                if name in words:
                    line = line.replace(name, translation)
        return (line, no_api_req)

    def token_replace(self, line):
        """Replace glossary terms in *line* with ``[N]`` tokens.

        The order of the steps matters: compound terms first, then ignored
        words (matched before punctuation is stripped), then single words and
        names matched against the cleaned words.

        NOTE(review): a term is detected as a whole word but replaced with
        str.replace(), which also rewrites it inside longer words.
        """
        for key in self.comp_word_dict:
            if key in line:
                line = line.replace(key, self.get_token_val(key))
        words = line.split(' ')
        # Must run before cleaning: word_clear() removes the trailing '.'
        # that the ignore keys end with.
        for key in self.ignore_dict:
            if key in words:
                line = line.replace(key, '')
        words = [self.word_clear(word) for word in words]
        # Iterating the dictionaries (not the words of the line) keeps the
        # replacement order under the control of the dictionary files.
        for glossary in (self.word_dict, self.name_dict):
            for key in glossary:
                if key in words:
                    line = line.replace(key, self.get_token_val(key))
        return line

    def word_clear(self, word):
        """Return *word* cut at the first listed separator it contains.

        Separators are tried in the order ``. , ! : ? 's``; the text before
        the first occurrence of the first separator found is returned.
        """
        for sep in ('.', ',', '!', ':', '?', '\'s'):
            if sep in word:
                return word.split(sep)[0]
        return word

    def get_token_val(self, k):
        """Return the ``[N]`` token text for glossary key *k*."""
        return self._make_token_val(self.token[k])

    def get_token(self):
        """Return the ``(token, token_r)`` lookup tables."""
        return (self.token, self.token_r)

    def _make_token_val(self, int_v):
        """Wrap *int_v* in the token delimiters."""
        return self.TOKEN_SIGNAL_L + str(int_v) + self.TOKEN_SIGNAL_R

    def set_token_back(self, line):
        """Replace every known ``[N]`` token in *line* with its translation.

        Full-width and CJK brackets are normalised to ``[``/``]`` first.
        Bracketed numbers that are not registered token ids are left as they
        are.
        """
        if line == '\n' or line == ' ':
            return line
        # The translator sometimes turns half-width brackets into full-width
        # ones, which would otherwise stop the tokens from matching.
        for wide, narrow in (('\uff3b', '['), ('\uff3d', ']'),
                             ('【', '['), ('】', ']')):
            line = line.replace(wide, narrow)
        for idx in re.findall(r'\[(\d+)\]', line):
            key = self.token_r.get(idx)
            if key is None:
                # A bracketed number from the text itself, not one of ours.
                continue
            if key in self.word_dict:
                value = self.word_dict[key]
            elif key in self.comp_word_dict:
                value = self.comp_word_dict[key]
            elif key in self.manual_trans_word_dict:
                value = self.manual_trans_word_dict[key]
            elif key in self.name_dict:
                value = self.name_dict[key]
            else:
                value = ''
            if self.mode == 'debug':
                print('[set_token_back] m=' + idx + ' k=' + key + ' v=' + value)
            line = line.replace(self._make_token_val(idx), value)
        if self.mode == 'debug':
            print('[还原后] ' + line)
        return line

    def has_zh(self, string):
        """Return True when *string* contains a character accepted by zh_signal()."""
        return any(self.zh_signal(ch) for ch in string)

    def zh_signal(self, ch):
        """Return True when *ch* lies in U+4E00..U+9FFF."""
        return '\u4e00' <= ch <= '\u9fff'

    def fill(self, line, format_text, flag, fill_lines, trans_flag_lines):
        """Record one output slot.

        Appends *flag* to *trans_flag_lines*, and appends *format_text* (a
        template containing ``{}``) to *fill_lines* when *flag* is true or
        the raw *line* otherwise.
        """
        trans_flag_lines.append(flag)
        fill_lines.append(format_text if flag else line)

    def batch_convert(self, lines, filename, output_encoding):
        """Translate all ``~``-delimited text in one batch and write the file.

        Only lines that belong to a ``~...~`` block are emitted; a block may
        span several lines.  Requires the ``google`` translator.

        Returns:
            The list of output lines, or ``[]`` when the configured
            translator is not ``google``.
        """
        if self.translator.name != 'google':
            print('必须使用google才能使用批量翻译')
            return []
        self._init_voice_cache(len(lines))
        # Output templates, one per emitted line.
        fill_lines = []
        # Whether the matching template needs a translation filled in.
        trans_flag_lines = []
        # Texts sent to the translator, in template order.
        batch_trans_lines = []
        total = len(lines)
        i = 0
        while i < total:
            line = lines[i]
            left = line.find('~')
            if left == -1:
                i += 1
                continue
            right = line.find('~', left + 1)
            if right != -1:
                # Opening and closing delimiter on the same line.
                flag = self.batch_solve(line[left + 1:right], batch_trans_lines, i)
                self.fill(line, line[:left + 1] + '{}' + line[right:],
                          flag, fill_lines, trans_flag_lines)
                i += 1
                continue
            # The block continues on the following lines.
            flag = self.batch_solve(line[left + 1:], batch_trans_lines, i)
            self.fill(line, line[:left + 1] + '{}',
                      flag, fill_lines, trans_flag_lines)
            j = i + 1
            while j < total and lines[j].find('~') == -1:
                flag = self.batch_solve(lines[j], batch_trans_lines, j)
                # Fall back to this line itself, not to the opening line.
                self.fill(lines[j], '{}', flag, fill_lines, trans_flag_lines)
                j += 1
            if j < total:
                right = lines[j].find('~')
                flag = self.batch_solve(lines[j][:right], batch_trans_lines, j)
                self.fill(lines[j], '{}' + lines[j][right:],
                          flag, fill_lines, trans_flag_lines)
            # When j == total the block was never closed; stop at end of input.
            i = j + 1
        # Nothing queued (e.g. debug mode): do not call the translator.
        batch_result = []
        if batch_trans_lines:
            batch_result = self.translator.batch_translate(batch_trans_lines)
        res = []
        result_pos = 0
        for i, template in enumerate(fill_lines):
            if trans_flag_lines[i]:
                text_rev = self.text_after_solve(batch_result[result_pos],
                                                 pattern='batch', index=i)
                # replace() instead of format(): the template holds raw file
                # text, which may contain literal braces.
                res.append(template.replace('{}', text_rev, 1))
                result_pos += 1
            else:
                res.append(template)
        utils.write_file('', filename, res, output_encoding)
        return res

    def convert(self, lines, filename, start_line_num=0, output_encoding='utf-8'):
        """Translate ``~``-delimited text line by line and write the file.

        Each finished block is appended to the output and logged right away,
        then the complete result is written at the end.  Only lines that
        belong to a ``~...~`` block are emitted.

        Args:
            lines: Input lines.
            filename: Output file name, also used as the log key.
            start_line_num: Index of the first line to process.
            output_encoding: Encoding passed to the file writers.

        Returns:
            The list of output lines.
        """
        log = readlogs.ReadLogs()
        res = []
        total = len(lines)
        i = max(start_line_num, 0)
        while i < total:
            line = lines[i]
            left = line.find('~')
            if left == -1:
                i += 1
                continue
            right = line.find('~', left + 1)
            if right != -1:
                # Opening and closing delimiter on the same line.
                result = [line[:left + 1]
                          + self.single_solve(line[left + 1:right])
                          + line[right:]]
                res.extend(result)
                self.do_write_append(log, '', filename, result, output_encoding, i + 1)
                i += 1
                continue
            # The block continues on the following lines.
            result = [line[:left + 1] + self.single_solve(line[left + 1:])]
            j = i + 1
            while j < total and lines[j].find('~') == -1:
                result.append(self.single_solve(lines[j]))
                j += 1
            if j < total:
                right = lines[j].find('~')
                result.append(self.single_solve(lines[j][:right]) + lines[j][right:])
            res.extend(result)
            # When the block was never closed j == total; log end of input.
            self.do_write_append(log, '', filename, result, output_encoding,
                                 min(j + 1, total))
            i = j + 1
        utils.write_file('', filename, res, output_encoding)
        return res

    def do_write_append(self, log, prefix, filename, lines, encoding, next_line_num):
        """Print *lines*, append them to the output and log the progress."""
        for line in lines:
            print('[翻译]' + line)
        utils.write_line_in_append(prefix, filename, lines, encoding)
        log.writelogs(filename, next_line_num)
def main():
    """Batch-translate every ``.tra`` file in ``tra/``, resuming after a stop.

    The file name recorded by the previous run is read from the log; files
    that come before it are skipped and processing restarts at that file.
    ``setup.tra`` is never processed.  ``Solver.convert`` is the line-by-line
    alternative to the batch call used here.
    """
    solver = Solver()
    log = readlogs.ReadLogs()
    # First element of the log record: the file the previous run stopped at.
    lastfile = log.readlogs()[0]
    resumed = False
    # os.listdir() returns names in arbitrary order; sorting makes the
    # "skip everything before lastfile" resume logic deterministic.
    for file in sorted(os.listdir('tra/')):
        print(file)
        if file.lower() == 'setup.tra':
            continue
        if not resumed and lastfile != '' and file != lastfile:
            print('pass ' + file)
            continue
        resumed = True
        if file.lower().endswith('.tra'):
            # Record the file before translating so a crash resumes here.
            log.writelogs(file)
            lines = utils.read_file('tra/' + file, 'utf-8')
            solver.batch_convert(lines, file, 'utf-8')
            print('-' * 30)
    log.done()


if __name__ == '__main__':
    main()