Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pyvex/lifting/util/instr_helper.py: 88%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

146 statements  

1import abc 

2import string 

3from typing import Dict 

4 

5import bitstring 

6 

7from pyvex.expr import IRExpr, RdTmp 

8 

9from .lifter_helper import ParseError 

10from .syntax_wrapper import VexValue 

11from .vex_helper import IRSBCustomizer, JumpKind, vex_int_class 

12 

13 

14class Instruction(metaclass=abc.ABCMeta): 

15 """ 

16 Base class for an Instruction. 

17 

18 You should make a subclass of this for each instruction you want to lift. These classes will contain the "semantics" 

19 of the instruction, that is, what it _does_, in terms of the VEX IR. 

20 

21 You may want to subclass this for your architecture, and add arch-specific handling for parsing, argument 

22 resolution, etc., and have instructions subclass that instead. 

23 

24 The core parsing functionality is done via ``bin_format``. Each instruction should be a subclass of ``Instruction`` 

25 and will be parsed by comparing bits in the provided bitstream to symbols in the ``bin_format`` member of the class. 

26 "Bin formats" are strings of symbols, like those you'd find in an ISA document, such as "0010rrrrddddffmm" 

27 0 or 1 specify hard-coded bits that must match for an instruction to match. 

28 Any letters specify arguments, grouped by letter, which will be parsed and provided as bitstrings in the ``data`` 

29 member of the class as a dictionary. 

30 So, in our example, the bits ``0010110101101001``, applied to format string ``0010rrrrddddffmm`` 

31 will result in the following in ``self.data``: 

32 

33 {'r': '1101', 

34 'd': '0110', 

35 'f': '10', 

36 'm': '01'} 

37 

38 Implement compute_result to provide the "meat" of what your instruction does. 

39 You can also implement it in your arch-specific subclass of ``Instruction``, to handle things common to all 

40 instructions, and provide instruction implementations elsewhere. 

41 

42 We provide the ``VexValue`` syntax wrapper to make expressing instruction semantics easy. 

43 You first convert the bitstring arguments into ``VexValue``s using the provided convenience methods 

44 (``self.get/put/load/store/etc.``) 

45 This loads the register from the actual registers into a temporary value we can work with. 

46 You can then write it back to a register when you're done. 

47 For example, if you have the register in ``r``, as above, you can make a ``VexValue`` like this: 

48 

49 r = int(self.data['r'], 2) # we get bits corresponding to `r` bits and convert it to an int 

50 r_vv = self.get(r, Type.int_32) 

51 

52 If you then had an instruction to increment ``r``, you could simply: 

53 

54 r_vv += 1

55 

56 You could then write it back to the register like this: 

57 

58 self.put(r_vv, r) 

59 

60 Note that most architectures have special flags that get set differently for each instruction, make sure to 

61 implement those as well (override ``set_flags()`` ) 

62 

63 Override ``parse()`` to extend parsing. 

64 For example, in MSP430, this allows us to grab extra words from the bitstream 

65 when extra immediate words are present. 

66 

67 All architectures are different enough that there's no magic recipe for how to write a lifter. 

68 See the examples provided by gymrat for ideas of how to use this to build your own lifters quickly and easily. 

69 """ 

70 

71 data: Dict[str, str] 

72 irsb_c: IRSBCustomizer 

73 

def __init__(self, bitstrm, arch, addr):
    """
    Decode one instruction from the bitstream and remember where it lives.

    The instruction width in bits is the length of ``bin_format``; the argument fields returned by ``parse()``
    are kept in ``self.data``.

    :param bitstrm: The bitstream to decode the instruction from
    :param arch: The architecture object for the code being lifted
    :param addr: The address of the instruction to be lifted
    """
    self.addr, self.arch = addr, arch
    self.bitwidth = len(self.bin_format)
    # parse() reads self.arch and self.bitwidth, so it has to come last.
    self.data = self.parse(bitstrm)

86 

@property
@abc.abstractmethod
def bin_format(self) -> str:
    """
    The bit pattern used to recognise and decode this instruction.

    Hard-coded bits are written as ``0``/``1`` and argument bits as letters; see the class documentation for the
    full description of the format. Concrete instructions have to provide this.

    :return: The bin format string
    """

95 

@property
@abc.abstractmethod
def name(self) -> str:
    """
    The human-readable name of the instruction.

    Handy for naming the instruction in error messages that relate to it. Concrete instructions have to provide
    this.
    """

104 

def __call__(self, irsb_c, past_instructions, future_instructions):
    """
    Calling the instruction is shorthand for ``lift()`` with the same arguments; nothing is returned.
    """
    do_lift = self.lift
    do_lift(irsb_c, past_instructions, future_instructions)

107 

def mark_instruction_start(self):
    """
    Emit the instruction marker for this instruction.

    Calls ``imark`` on the current IRSBCustomizer with the instruction's address, its width in bytes, and 0 as the
    last argument.
    """
    emit_imark = self.irsb_c.imark
    emit_imark(self.addr, self.bytewidth, 0)

110 

def fetch_operands(self):  # pylint: disable=no-self-use
    """
    Read the instruction's operands out of memory or registers.

    The base implementation has no operands; override it to return a tuple of them.

    :return: A tuple of operands for the instruction
    """
    return tuple()

117 

def lift(self, irsb_c: IRSBCustomizer, past_instructions, future_instructions):  # pylint: disable=unused-argument
    """
    Main body of the "lifting" for the instruction.

    Override this to describe the general flow of instructions in your architecture. The default flow is:

    - remember ``irsb_c`` and mark the start of the instruction
    - fetch the operands
    - compute the result and, unless it is None, commit it
    - compute the flags from the operands followed by the result

    :param irsb_c: The IRSBCustomizer that receives the generated VEX statements
    :param past_instructions: Not used by the default implementation
    :param future_instructions: Not used by the default implementation
    """
    self.irsb_c = irsb_c
    # The instruction marker must always be emitted before anything else.
    self.mark_instruction_start()

    operands = self.fetch_operands()
    result = self.compute_result(*operands)  # pylint: disable=assignment-from-none
    if result is not None:
        self.commit_result(result)
    # Flags are computed even when there is no result; it is then passed along as None.
    self.compute_flags(*operands, result)

138 

def commit_result(self, res):
    """
    Write the result of the operation to its destination.

    ``lift()`` calls this only when compute_result did not return None, and always before compute_flags.
    Override it to say how the result is written out. The results of fetch_operands can help resolve the
    addressing mode of the write; a common pattern is to return a function from fetch_operands and call it here
    to perform the write.

    :param res: The value returned by compute_result
    """

149 

def compute_result(self, *args):  # pylint: disable=unused-argument,no-self-use
    """
    Perform the actual operation of the instruction, excluding the calculation of flags.

    Return the VexValue of the "result" of the instruction, which may be used to calculate the flags later.
    For a simple add with arguments src and dst, for example, the whole body can be:

        return src + dst

    The base implementation does nothing and returns None.

    :param args: The operands produced by fetch_operands
    :return: A VexValue containing the "result" of the operation, or None if there is none
    """
    return None

163 

def compute_flags(self, *args):
    """
    Compute the CPU "flags" affected by the instruction.

    Most architectures have flags that must be updated by many instructions; override this to specify how.
    One common pattern is to have this method call a specific method per flag, each of which can then be
    overridden in the classes of the individual instructions.

    :param args: The operands followed by the result, as passed by ``lift()``
    """

170 

def match_instruction(self, data, bitstrm):  # pylint: disable=unused-argument,no-self-use
    """
    Hook for additional matching on top of ``bin_format``.

    Override this when your architecture has instruction "formats" whose opcode has to match as well; raise if
    the decoded bits do not belong to this instruction. The base implementation accepts everything.

    :param data: The argument fields decoded so far
    :param bitstrm: The bitstream the instruction is being decoded from
    :return: data
    """
    return data

181 

def parse(self, bitstrm):
    """
    Decode this instruction's argument fields from the front of ``bitstrm``.

    The next ``self.bitwidth`` bits are peeked and compared, position by position, with ``bin_format``: ``0`` and
    ``1`` have to match exactly, and the bits under a letter are appended to that letter's field. The bits are
    only consumed from the stream once the pattern and ``match_instruction`` have accepted them; what was read is
    stored in ``self.rawbits``.

    :param bitstrm: The bitstream positioned at the start of the instruction
    :return: A dict mapping each letter of ``bin_format`` to its bits, as a string of ``0``/``1`` characters
    :raises ParseError: If a hard-coded bit of ``bin_format`` differs from the instruction bit
    :raises ValueError: If ``bin_format`` contains a character that is not ``0``, ``1`` or an ASCII letter
    """
    if self.arch.instruction_endness != "Iend_LE":
        instr_bits = bitstrm.peek("bin:%d" % self.bitwidth)
    else:
        # Instructions are stored in memory endian-flipped compared to the ISA manual on this arch. Lifters are
        # written the way the manual shows them, so the flip is corrected here.
        instr_bits = self._load_le_instr(bitstrm, self.bitwidth)

    pattern = self.bin_format
    letters = string.ascii_letters
    collected = {sym: [] for sym in pattern if sym in letters}
    for sym, bit in zip(pattern, instr_bits):
        if sym in "01":
            if bit != sym:
                raise ParseError("Mismatch between format bit %c and instruction bit %c" % (sym, bit))
            continue
        if sym not in letters:
            raise ValueError("Invalid bin_format character %c" % sym)
        collected[sym].append(bit)
    data = {sym: "".join(bits) for sym, bits in collected.items()}

    # Extra matching hook; it is expected to raise when the instruction is not the right one.
    if hasattr(self, "match_instruction"):
        self.match_instruction(data, bitstrm)

    # Everything matched, so the bits can now be used up.
    self.rawbits = bitstrm.read("hex:%d" % self.bitwidth)

    # Extra parsing hook (e.g., trailers following the instruction).
    if hasattr(self, "_extra_parsing"):
        data = self._extra_parsing(data, bitstrm)  # pylint: disable=no-member

    return data

214 

@property
def bytewidth(self):
    """
    Width of the instruction in bytes, where a byte is ``self.arch.byte_width`` bits wide.

    :return: ``self.bitwidth`` divided by the architecture's byte width
    :raises ValueError: If the instruction is not a whole number of bytes wide
    """
    whole_bytes, leftover_bits = divmod(self.bitwidth, self.arch.byte_width)
    if leftover_bits != 0:
        raise ValueError("Instruction is not a multiple of bytes wide!")
    return whole_bytes

220 

def disassemble(self):
    """
    Return the disassembly of this instruction.

    The base implementation does not know the mnemonic and reports ``"UNK"`` with the raw bits as the only
    operand; override this in subclasses.

    :return: The address (self.addr), the instruction's name, and a list of its operands, as strings
    """
    address = self.addr
    operands = [self.rawbits]
    return address, "UNK", operands

229 

230 # These methods should be called in subclasses to do register and memory operations 

231 

def load(self, addr, ty):
    """
    Load a value from memory into a VEX temporary register.

    :param addr: The VexValue containing the addr to load from.
    :param ty: The Type of the resulting data
    :return: a VexValue wrapping the loaded temporary
    """
    loaded = self.irsb_c.load(addr.rdt, ty)
    wrapped = VexValue(self.irsb_c, loaded)
    return wrapped

242 

def constant(self, val, ty):
    """
    Creates a constant as a VexValue

    :param val: The value, as an integer (never a VexValue or an IRExpr)
    :param ty: The type of the resulting VexValue
    :return: a VexValue
    :raises TypeError: If ``val`` is a VexValue or an IRExpr
    """
    # Reject both kinds of already-lifted values, as the error message promises. Checking for
    # "VexValue and not IRExpr" would let a bare IRExpr slip through to mkconst.
    if isinstance(val, (VexValue, IRExpr)):
        raise TypeError("Constant cannot be made from VexValue or IRExpr")
    rdt = self.irsb_c.mkconst(val, ty)
    return VexValue(self.irsb_c, rdt)

255 

@staticmethod
def _lookup_register(arch, reg):
    """
    Translate a register, given by name or by integer index, into what ``arch.get_register_offset`` returns.

    :param arch: The architecture object to look the register up in
    :param reg: Register number as an integer, or register string name
    :return: The result of ``arch.get_register_offset`` for the register
    """
    if not isinstance(reg, int):
        return arch.get_register_offset(reg)

    # TODO: This is a hack to make it work with archinfo where we use
    # register indices instead of names
    if hasattr(arch, "register_index"):
        reg_name = arch.register_index[reg]
    else:
        reg_name = arch.register_list[reg].name
    return arch.get_register_offset(reg_name)

266 

def get(self, reg, ty):
    """
    Load a value from a machine register into a VEX temporary register.

    Values have to be loaded out of registers before they can be used in operations, and stored back when the
    instruction is over (see ``put()``). Reading the register at the arch's ``ip_offset`` does not touch the
    register file: the address of this instruction is returned as a constant instead.

    :param reg: Register number as an integer, or register string name
    :param ty: The Type to use.
    :return: A VexValue of the gotten value.
    """
    arch = self.irsb_c.irsb.arch
    reg_offset = self._lookup_register(arch, reg)
    if reg_offset == arch.ip_offset:
        return self.constant(self.addr, ty)
    return VexValue(self.irsb_c, self.irsb_c.rdreg(reg_offset, ty))

282 

def put(self, val, reg):
    """
    Write a value from a VEX temporary register into a machine register.

    This is how the results of operations on registers are committed to the machine's state.

    :param val: The VexValue to store (to store a constant, build it with ``constant()`` first)
    :param reg: The integer register number to store into, or register name
    :return: None
    """
    reg_offset = self._lookup_register(self.irsb_c.irsb.arch, reg)
    write_reg = self.irsb_c.put
    write_reg(val.rdt, reg_offset)

294 

def put_conditional(self, cond, valiftrue, valiffalse, reg):
    """
    Like ``put()``, except that a condition decides which of two values ends up in the destination register.

    :param cond: The VexValue representing the logical expression for the condition
        (if your expression only has constants, don't use this method!)
    :param valiftrue: the VexValue to put in reg if cond evals as true
    :param valiffalse: the VexValue to put in reg if cond evals as false
    :param reg: The integer register number to store into, or register name
    :return: None
    """
    # Build the if-then-else first, then resolve the register, then write.
    chosen = self.irsb_c.ite(cond.rdt, valiftrue.rdt, valiffalse.rdt)
    reg_offset = self._lookup_register(self.irsb_c.irsb.arch, reg)
    self.irsb_c.put(chosen, reg_offset)

311 

def store(self, val, addr):
    """
    Store a VexValue in memory at the specified location.

    :param val: The VexValue of the value to store
    :param addr: The VexValue of the address to store into
    :return: None
    """
    write_mem = self.irsb_c.store
    # The customizer takes the address first and the value second.
    write_mem(addr.rdt, val.rdt)

321 

def jump(self, condition, to_addr, jumpkind=JumpKind.Boring, ip_offset=None):
    """
    Jump to a specified destination, under the specified condition.
    Used for branches, jumps, calls, returns, etc.

    For a conditional jump, an extra exit guarded by the negated condition is added; it leads to the address
    right after this instruction. The block's default exit then goes to ``to_addr``.

    :param condition: The VexValue representing the expression for the guard, or None for an unconditional jump
    :param to_addr: The address to jump to, as a VexValue, an int, or an RdTmp
    :param jumpkind: The JumpKind to use. See the VEX docs for what these are; you only need them for things
        that aren't normal jumps (e.g., calls, interrupts, program exits, etc etc)
    :param ip_offset: Passed to ``add_exit`` for the conditional exit; defaults to ``self.arch.ip_offset``
    :return: None
    :raises TypeError: If ``to_addr`` is neither a VexValue, an int, nor an RdTmp
    """
    if isinstance(to_addr, VexValue):
        # Unpack a VV
        to_addr_rdt = to_addr.rdt
        to_addr_ty = to_addr.ty
    elif isinstance(to_addr, int):
        # Direct jump to an int, make an RdT and Ty
        to_addr_ty = vex_int_class(self.irsb_c.irsb.arch.bits).type
        to_addr_rdt = self.constant(to_addr, to_addr_ty).rdt  # TODO archinfo may be changing
    elif isinstance(to_addr, RdTmp):
        # An RdT; just get the Ty of the arch's pointer type
        to_addr_ty = vex_int_class(self.irsb_c.irsb.arch.bits).type
        to_addr_rdt = to_addr
    else:
        raise TypeError("Jump destination has unknown type: " + repr(type(to_addr)))

    if condition:
        # Conditional jump: add another exit for the not-taken path.
        # EDG says: We should make sure folks set ArchXYZ.ip_offset like they're supposed to
        if ip_offset is None:
            ip_offset = self.arch.ip_offset
        assert ip_offset is not None

        negated_condition_rdt = self.ite(condition, self.constant(0, condition.ty), self.constant(1, condition.ty))
        # The fall-through address is this instruction's address plus its width in arch bytes, the same
        # unit `bytewidth` uses; a literal 8 would be wrong when arch.byte_width is not 8.
        fallthrough_addr = self.addr + (self.bitwidth // self.arch.byte_width)
        direct_exit_target = self.constant(fallthrough_addr, to_addr_ty)
        self.irsb_c.add_exit(negated_condition_rdt, direct_exit_target.rdt, jumpkind, ip_offset)

    # In both cases the requested destination is the block's default exit.
    self.irsb_c.irsb.jumpkind = jumpkind
    self.irsb_c.irsb.next = to_addr_rdt

365 

def ite(self, cond, t, f):
    """
    Build an if-then-else expression out of three VexValues.

    :param cond: The VexValue of the condition
    :param t: The VexValue passed as the "true" value
    :param f: The VexValue passed as the "false" value
    :return: Whatever the IRSBCustomizer's ``ite`` returns (it is not wrapped in a VexValue here)
    """
    select = self.irsb_c.ite
    return select(cond.rdt, t.rdt, f.rdt)

368 

def ccall(self, ret_type, func_name, args):
    """
    Creates a CCall operation.
    A CCall is a procedure that calculates a value at *runtime*, not at lift-time.
    You can use these for flags, unresolvable jump targets, etc.
    We caution you to avoid using them when at all possible though.

    :param ret_type: The return type of the CCall
    :param func_name: The name of the helper function to call. If you're using angr, this should be added (or
        monkeypatched) into ``angr.engines.vex.claripy.ccall``.
    :param args: List of arguments to the function
    :return: A VexValue of the result.
    """
    # VexValues are passed on as their underlying rdt; anything else goes through untouched.
    args = tuple(arg.rdt if isinstance(arg, VexValue) else arg for arg in list(args))

    call_expr = self.irsb_c.op_ccall(ret_type, func_name, args)
    return VexValue(self.irsb_c, call_expr)

394 

def dirty(self, ret_type, func_name, args) -> VexValue:
    """
    Creates a dirty call operation.

    These are like ccalls (clean calls) but their implementations are theoretically allowed to read or write to or
    from any part of the state, making them a nightmare for static analysis to reason about. Avoid their use at all
    costs.

    :param ret_type: The return type of the dirty call, or None if the dirty call doesn't return anything.
    :param func_name: The name of the helper function to call. If you're using angr, this should be added (or
        monkeypatched) into ``angr.engines.vex.heavy.dirty``.
    :param args: List of arguments to the function
    :return: A VexValue of the result.
    """
    # VexValues are passed on as their underlying rdt; anything else goes through untouched.
    args = tuple(arg.rdt if isinstance(arg, VexValue) else arg for arg in list(args))

    result_rdt = self.irsb_c.dirty(ret_type, func_name, args)
    return VexValue(self.irsb_c, result_rdt)

421 

def _load_le_instr(self, bitstream: bitstring.ConstBitStream, numbits: int) -> str:
    """
    Peek at the next ``numbits`` bits of a little-endian instruction and return them as a binary string.

    The bits are read as a little-endian unsigned integer and re-encoded as a ``numbits``-wide unsigned value;
    the stream position is not advanced.

    :param bitstream: The bitstream positioned at the start of the instruction
    :param numbits: The width of the instruction in bits
    :return: The re-encoded bits as a string of ``0``/``1`` characters
    """
    le_value = bitstream.peek("uintle:%d" % numbits)
    reordered = bitstring.Bits(uint=le_value, length=numbits)
    return reordered.bin