tokenizer.py
import pickle
import re
import os

import readidadata


def tokenizer():
    # Load the token -> id vocabulary previously written by save_tokens().
    with open('token_ida.pkl', 'rb') as f:
        token_id = pickle.load(f)
    return token_id
def seq_to_token(token_id, seq, UNK):
    # Map a sequence of token strings to integer ids; unknown tokens get the
    # UNK id. cnt counts out-of-vocabulary tokens (currently not returned).
    ret = []
    cnt = 0
    for tok in seq:
        idx = token_id.get(tok)
        if idx is not None:
            ret.append(idx)
        else:
            cnt += 1
            ret.append(UNK)
    return ret
def normalize(opcode):
    # Normalize an operand string: fold subtraction into addition and replace
    # hex/decimal literals with the placeholder token CONST.
    opcode = opcode.replace(' - ', ' + ')
    opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode)
    opcode = re.sub(r'\*[0-9]', '*CONST', opcode)
    opcode = re.sub(r' [0-9]', ' CONST', opcode)
    return opcode
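# Example behaviour of normalize() (the operand strings below are a sketch of
# typical IDA-style memory operands; lower-case hex literals are assumed):
#   normalize('[rbp - 0x8]')    ->  '[rbp + CONST]'
#   normalize('[rax + rbx*4]')  ->  '[rax + rbx*CONST]'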
def save_tokens(prefixs):
    # Walk every project directory under ../largedata/ourclean, collect the
    # *.nod files whose names start with one of the given prefixes, and assign
    # an integer id (starting from 1) to every new operator/operand token.
    token_id = {}
    cnts = 1
    file_cnt = 0
    binlist = []
    nowdir = '../largedata/ourclean'
    docs = os.listdir(nowdir)
    for i in docs:
        pth = os.path.join(nowdir, i)
        for fi in os.listdir(pth):
            idx = fi.find('.')
            fd = False
            for pre in prefixs:
                if fi.startswith(pre):
                    fd = True
            if fd and fi.endswith('.nod'):
                fn = os.path.join(pth, fi[0:idx])
                binlist.append(fn + '.nod')
                print(fn)
    for fi in binlist:
        # Each .nod file is a pickled list of (bbid, addr, bb) basic blocks,
        # where bb is a list of (addr, instruction) pairs.
        try:
            with open(fi, 'rb') as fii:
                asm_seq = pickle.load(fii)
        except Exception:
            continue
        for bbid, addr, bb in asm_seq:
            for addr, instructions in bb:
                operator, operand1, operand2, operand3, annotation = readidadata.parse_asm(instructions)
                if operator is not None:
                    if token_id.get(operator) is None:
                        print(operator, cnts, " from ", hex(addr), instructions)
                        token_id[operator] = cnts
                        cnts += 1
                if operand1 is not None:
                    # Operands starting with 'hex' are excluded from the vocabulary.
                    if not operand1.startswith('hex') and token_id.get(operand1) is None:
                        print(operand1, cnts, " from ", hex(addr), instructions)
                        token_id[operand1] = cnts
                        cnts += 1
                if operand2 is not None:
                    if token_id.get(operand2) is None:
                        print(operand2, cnts, " from ", hex(addr), instructions)
                        token_id[operand2] = cnts
                        cnts += 1
                if operand3 is not None:
                    if token_id.get(operand3) is None:
                        print(operand3, cnts, " from ", hex(addr), instructions)
                        token_id[operand3] = cnts
                        cnts += 1
        file_cnt += 1
        print("finish ", fi, file_cnt / len(binlist))
    print("token_number: ", cnts)
    with open('token_ida.pkl', 'wb') as f:
        pickle.dump(token_id, f)
    return token_id
if __name__ == "__main__":
    tokens = save_tokens(['proxmark3', 'pythonqt', 'pizmidi', 'plasma', 'qbs', 'qcad', 'sc3', 'vice', 'virtualgl', 'vtk', 'onics', 'odr', 'opencolorio', 'owncloud', 'sagemath', 'usd', 'lua', 'lxc'])
    # Optionally dump the vocabulary to vocab.txt, one token per line, ordered
    # by id (ids start at 1).
    '''
    with open("vocab.txt", "w") as fi:
        output = [0] * (len(tokens) + 1)
        for tok in tokens:
            output[tokens[tok]] = tok
        for i in range(1, len(tokens) + 1):
            print(output[i], file=fi)
    '''
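# A minimal usage sketch (assumes token_ida.pkl has already been built by
# save_tokens() and that the input sequence contains token strings in the same
# form produced by readidadata.parse_asm; the UNK id chosen here is only an
# example):
#
#   token_id = tokenizer()
#   UNK = len(token_id) + 1
#   ids = seq_to_token(token_id, ['mov', 'eax', 'CONST'], UNK)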