-
Notifications
You must be signed in to change notification settings - Fork 0
/
translate.py
75 lines (69 loc) · 3.15 KB
/
translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Bulk-translates Paradox-style localisation .yml files via the OpenAI chat API.
# NOTE(review): `os`, `openai` and `tiktoken` used below are presumably
# re-exported by the star import — confirm against utils/utils.py.
from utils.utils import *
from dotenv.main import load_dotenv
import argparse
import tqdm
# Load OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
# CLI: -i <input path> (required), -l <target language> (required),
# optional -o <output path>.  All flags are single-dash; argparse derives the
# dests `i`, `l`, `o` used below.
parser = argparse.ArgumentParser()
parser.add_argument('-i','-input', type=str, required=True, help='The input path')
parser.add_argument('-l','-language', type=str, required=True, default='english', help="Target language (EXCEPTIONALLY, use 'braz_por' for Brazilian/Portuguese and 'simp_chinese' for Simplified Chinese)")
parser.add_argument('-o','-output', type=str, help='The output path, if not provided, the output will be saved in the same directory.')
args = parser.parse_args()
inputPath,language,outputPath=args.i,args.l,args.o
# Collect every locale .yml file under the input path and resolve the prompt
# wording for the target language (both are project helpers from utils.utils).
localeFiles=getAllymlFiles(inputPath)
lang=getPromptLang(language)
# Default output: a sibling directory of the input, named after the language.
if outputPath is None:
    outputPath=os.path.join(os.path.dirname(inputPath), language.lower())
tokensInTotal = 0
# BUG FIX: failedFiles used to be re-initialized inside the per-file loop, so
# the final report only ever showed failures from the last file.  It now
# accumulates across all files.
failedFiles = []
for file in tqdm.tqdm(localeFiles):
    print(file.filePath)
    # Parse the raw file text into (keys, values) — project helper.
    file.content = getKey_Value(file.content)
    origin = file.content[1]
    errorPath = os.path.join(outputPath, file.filePath + "_error.yml")
    # From OpenAI's documentation:
    # very long conversations are more likely to receive incomplete replies.
    # For example, a gpt-3.5-turbo conversation that is 4090 tokens long will
    # have its reply cut off after just 6 tokens.
    # A tested configuration that works well on Victoria 3 is 1350 tokens per
    # request.  If you change this, REMEMBER TO CHANGE THE VALUE IN
    # utils.splitContents as well.
    get_text = lambda _t: "[[" + "".join(getSeparatedContent(_t)).__str__() + "]]"
    _text = get_text(origin)
    # Count tokens with the same encoding the model uses; accumulate for the
    # end-of-run total.  NOTE(review): the original marked this counter as
    # "erroneous" — it counts prompt payload only, not the full conversation.
    tokens = len(tiktoken.encoding_for_model('gpt-3.5-turbo-0301').encode(_text))
    tokensInTotal += tokens
    if tokens <= 1350:
        # Small enough to translate in a single request.
        translated = segmentTranslator(lang, _text, file.content[0], errorPath)
        if not translated:
            # BUG FIX: this failure used to be skipped silently, inconsistent
            # with the split-file path below.
            failedFiles.append(file.filePath)
            continue
    else:
        # Too large for one request: translate segment by segment at the
        # boundaries chosen by splitContents (project helper).
        splitLocation = splitContents(origin)
        translated = []
        start = 0
        flag = False  # set when any segment fails, so we can abort this file
        for i in range(len(splitLocation)):
            if i == len(splitLocation) - 1 and splitLocation[-1] == len(origin) and start != 0:
                # Last split point coincides with the end of the content:
                # leave the final line for the tail call below.
                # NOTE(review): intent assumed from the original — confirm.
                start = len(origin) - 1
            else:
                _text = get_text(origin[start:(splitLocation[i] - 1)])
                _translated = segmentTranslator(lang, _text, file.content[0][start:splitLocation[i] - 1], errorPath, i, translated)
                if not _translated:
                    flag = True
                    break
                translated += _translated
                start = splitLocation[i]
        if flag:
            # BUG FIX: was `failedFiles+=(file.filePath)`, which appends the
            # path one CHARACTER at a time (list += str iterates the string).
            failedFiles.append(file.filePath)
            continue
        # BUG FIX: build the tail-segment text BEFORE translating it.  The
        # original called segmentTranslator with the stale _text left over
        # from the inner loop and only computed get_text(origin[start:])
        # afterwards, so the tail was never actually translated.
        _text = get_text(origin[start:])
        _translated = segmentTranslator(lang, _text, file.content[0][start:], errorPath, i, translated)
        if not _translated:
            failedFiles.append(file.filePath)
            continue
        translated += _translated
    # Replace the parsed content with the translation and write it out.
    file.content = translated
    saveFile(language, outputPath, file)
# Final summary: overall token usage, then every file that failed to translate
# (one path per line; nothing extra is printed when the list is empty).
print("Done, total tokens used:" + str(tokensInTotal))
print("Failed files:")
for failed_path in failedFiles:
    print(failed_path)