-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathassembler.py
224 lines (186 loc) · 8.88 KB
/
assembler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import logging
import cStringIO
import networkx as nx
import dis
from basicblock import BasicBlock
from instruction import Instruction
logger = logging.getLogger(__name__)
class Assembler:
def __init__(self, bb_graph):
"""
:param bb_graph: The graph of basic blocks
:type bb_graph: nx.DiGraph
"""
self.bb_graph = bb_graph
self.bb_ordered = [None] * self.bb_graph.number_of_nodes()
self.idx = self.bb_graph.number_of_nodes() - 1
def assemble(self):
"""
Assembling consists of several sub-stages:
1. Depth first search the graph to fix the order of the nodes when they are laid out sequentially.
(This is done in a post-order fashion)
2.
"""
entryblock = nx.get_node_attributes(self.bb_graph, 'isEntry').keys()[0]
logger.debug('Performing a DFS on the graph to generate the layout of the blocks.')
self.dfs(entryblock)
logger.debug('Morphing some JUMP_ABSOLUTE instructions to make file decompilable.')
self.convert_abs_to_rel()
# self.remove_redundant_jumps()
# If basic block A has a relative control flow instruction to block B, then block B
# must be located after block A in the generated layout.
# This is because relative control flow instructions are USUALLY used to refer to
# addresses located after it.
# If the relative c.f. instruction is JUMP_FORWARD we can change to JUMP_ABSOLUTE without
# any further modifications.
# For other relative c.f instructions like SETUP_LOOP, SETUP_EXCEPT etc,
# we need to create an new forwarder block consisting of an absolute jump instruction
# to block B, and make the relative control flow instruction in block A to point to
# the forwarder block. This works since the forwarder block will naturally be after
# block A in the generated layout and relative instructions can be always used to point
# to blocks located after it, i.e. have a higher address.
logger.debug('Verifying generated layout...')
for idx in xrange(len(self.bb_ordered)):
block = self.bb_ordered[idx]
for ins in block.instruction_iter():
if ins.opcode in dis.hasjrel:
targetBlock = ins.argval
# Check if target block occurs before the current block
if self.bb_ordered.index(targetBlock) <= idx:
logger.info(
'Basic block {} uses a relative control transfer instruction to access block {} located before it.'.format(
hex(id(block)), hex(id(targetBlock))))
# Modify relative jump to absolute jump
if ins.mnemonic == 'JUMP_FORWARD':
ins.opcode = dis.opmap['JUMP_ABSOLUTE']
# If instruction is a relative control transfer instruction
# but is not JUMP_FORWARD (like SETUP_LOOP)
else:
# Create a new forwarder block
bb = self.create_forwarder_block(targetBlock)
# Make the original instruction point to the new block
ins.argval = bb
# Append new block at end
self.bb_ordered.append(bb)
logger.debug('Successfully verified layout.')
self.calculate_block_addresses()
self.calculate_ins_operands()
return self.emit()
def create_forwarder_block(self, target):
"""
Create a new basic block consisting of a `JUMP_ABSOLUTE`
instruction to target block
:param target: The target basic block to jump to
:type target: BasicBlock
:return: The new basic block
:rtype: BasicBlock
"""
bb = BasicBlock()
ins = Instruction(dis.opmap['JUMP_ABSOLUTE'], target, 3)
ins.argval = target
bb.add_instruction(ins)
return bb
def dfs(self, bb):
"""
Depth first search.
Ported from: https://github.com/python/cpython/blob/2.7/Python/compile.c#L3409
:param bb: The basic block
:type bb: basicblock.BasicBlock
"""
# Return if the current block has already been visited
if bb.b_seen:
return
# Mark this block as visited
bb.b_seen = True
# Recursively dfs on all out going explicit edges
for o_edge in self.bb_graph.out_edges(bb, data=True):
# o_edge is a tuple (edge src, edge dest, edge attrib dict)
if o_edge[2]['edge_type'] == 'explicit':
self.dfs(o_edge[1])
# Iterate over the instructions in the basic block
for ins in bb.instruction_iter():
# Recursively dfs if instruction have xreferences
if ins.has_xref():
self.dfs(ins.argval)
# Recursively dfs on all out going implicit edges
for o_edge in self.bb_graph.out_edges(bb, data=True):
# o_edge is a tuple (edge src, edge dest, edge attrib dict)
if o_edge[2]['edge_type'] == 'implicit':
self.dfs(o_edge[1])
# Add the basic block in a reversed order
self.bb_ordered[self.idx] = bb
self.idx -= 1
def calculate_block_addresses(self):
"""
Once the layout of the blocks are fixed, we need to calculate the address of each block.
"""
logger.debug('Calculating addresses of basic blocks.')
size = 0
for block in self.bb_ordered:
block.address = size
size += block.size()
def calculate_ins_operands(self):
"""
Instructions like JUMP_FORWARD & SETUP_LOOP uses the operand to refer to other instructions.
This reference is an integer denoting the offset/absolute address of the target. This function
calculates the values of these operand
"""
logger.debug('Calculating instruction operands.')
for block in self.bb_ordered:
addr = block.address
for ins in block.instruction_iter():
addr += ins.size
if ins.opcode in dis.hasjabs:
# ins.argval is a BasicBlock
ins.arg = ins.argval.address
# TODO
# We do not generate EXTENDED_ARG opcode at the moment,
# hence size of opcode argument can only be 2 bytes
assert ins.arg <= 0xFFFF
elif ins.opcode in dis.hasjrel:
ins.arg = ins.argval.address - addr
# relative jump can USUALLY go forward
assert ins.arg >= 0
assert ins.arg <= 0xFFFF
def emit(self):
logger.debug('Generating code...')
codestring = cStringIO.StringIO()
for block in self.bb_ordered:
for ins in block.instruction_iter():
codestring.write(ins.assemble())
return codestring.getvalue()
def convert_abs_to_rel(self):
"""
An JUMP_ABSOLUTE instruction from basic block A to block B can be replaced with a JUMP_FORWARD
if block B is located after block A.
This conversion is not really required, but some decompilers like uncompyle fails without it.
"""
for idx in xrange(len(self.bb_ordered)):
block = self.bb_ordered[idx]
# Fetch the last instruction
ins = block.instructions[-1]
# A JUMP_ABSOLUTE instruction whose target block is located after it
if ins.mnemonic == 'JUMP_ABSOLUTE' and self.bb_ordered.index(ins.argval) > idx:
ins.opcode = dis.opmap['JUMP_FORWARD']
ins.mnemonic = 'JUMP_FORWARD'
def remove_redundant_jumps(self):
"""
If basic block A has a jump instruction to block B, but block B is immediately located after A,
then the jump instruction can safely be removed.
This feature is experimental and may break decompilers. The advantage of this feature is it reduces
generated code size.
"""
logger.warning('Removing redundant jump instruction. This feature is EXPERIMENTAL.')
numRemoved = 0
for idx in xrange(len(self.bb_ordered)):
block = self.bb_ordered[idx]
# Fetch the ;ast instruction
ins = block.instructions[-1]
if ins.mnemonic == 'JUMP_ABSOLUTE' or ins.mnemonic == 'JUMP_FORWARD':
target = ins.argval
# If target block is immediately located after it
if self.bb_ordered.index(target) == idx + 1:
# Remove the instruction
del block.instructions[-1]
numRemoved += 1
logger.debug('Removed {} redundant jump instructions'.format(numRemoved))