Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pyvex/lifting/util/instr_helper.py: 88%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

145 statements  

1import abc 

2import string 

3 

4import bitstring 

5 

6from pyvex.expr import IRExpr, RdTmp 

7 

8from .lifter_helper import ParseError 

9from .syntax_wrapper import VexValue 

10from .vex_helper import IRSBCustomizer, JumpKind, vex_int_class 

11 

12 

class Instruction(metaclass=abc.ABCMeta):
    """
    Base class for an Instruction.

    You should make a subclass of this for each instruction you want to lift. These classes will contain the
    "semantics" of the instruction, that is, what it _does_, in terms of the VEX IR.

    You may want to subclass this for your architecture, and add arch-specific handling for parsing, argument
    resolution, etc., and have instructions subclass that instead.

    The core parsing functionality is done via ``bin_format``. Each instruction should be a subclass of
    ``Instruction`` and will be parsed by comparing bits in the provided bitstream to symbols in the ``bin_format``
    member of the class.
    "Bin formats" are strings of symbols, like those you'd find in an ISA document, such as "0010rrrrddddffmm".
    0 or 1 specify hard-coded bits that must match for an instruction to match.
    Any letters specify arguments, grouped by letter, which will be parsed and provided as bitstrings in the
    ``data`` member of the class as a dictionary.
    So, in our example, the bits ``0010110101101001``, applied to format string ``0010rrrrddddffmm``
    will result in the following in ``self.data``:

        {'r': '1101',
         'd': '0110',
         'f': '10',
         'm': '01'}

    Implement ``compute_result`` to provide the "meat" of what your instruction does.
    You can also implement it in your arch-specific subclass of ``Instruction``, to handle things common to all
    instructions, and provide instruction implementations elsewhere.

    We provide the ``VexValue`` syntax wrapper to make expressing instruction semantics easy.
    You first convert the bitstring arguments into ``VexValue``s using the provided convenience methods
    (``self.get/put/load/store/etc.``).
    This loads the register from the actual registers into a temporary value we can work with.
    You can then write it back to a register when you're done.
    For example, if you have the register in ``r``, as above, you can make a ``VexValue`` like this:

        r = int(self.data['r'], 2)  # we get bits corresponding to `r` bits and convert it to an int
        r_vv = self.get(r, Type.int_32)

    If you then had an instruction to increment ``r``, you could simply:

        r_vv += 1

    You could then write it back to the register like this:

        self.put(r_vv, r)

    Note that most architectures have special flags that get set differently for each instruction, make sure to
    implement those as well (override ``compute_flags()``).

    Override ``parse()`` to extend parsing.
    For example, in MSP430, this allows us to grab extra words from the bitstream
    when extra immediate words are present.

    All architectures are different enough that there's no magic recipe for how to write a lifter.
    See the examples provided by gymrat for ideas of how to use this to build your own lifters quickly and easily.
    """

    # Argument bits captured from the instruction, keyed by the letter used in ``bin_format``.
    data: dict[str, str]
    # The customizer VEX statements are emitted into; assigned by ``lift()``.
    irsb_c: IRSBCustomizer

    def __init__(self, bitstrm, arch, addr):
        """
        Create an instance of the instruction by parsing it out of the bitstream.

        :param bitstrm: The bitstream to decode instructions from
        :param arch:    The architecture object; ``parse`` reads ``instruction_endness`` from it and ``bytewidth``
                        reads ``byte_width``
        :param addr:    The address of the instruction to be lifted
        """
        self.addr = addr
        self.arch = arch
        self.bitwidth = len(self.bin_format)
        # parse() raises if the bits do not match bin_format, so a constructed instance is always a match.
        self.data = self.parse(bitstrm)

    @property
    @abc.abstractmethod
    def bin_format(self) -> str:
        """
        Read the documentation of the class to understand what a bin format string is

        :return: str bin format string
        """

    @property
    @abc.abstractmethod
    def name(self) -> str:
        """
        Name of the instruction

        Can be useful to name the instruction when there's an error related to it
        """

    def __call__(self, irsb_c, past_instructions, future_instructions):
        """
        Lift this instruction; calling the instance is shorthand for ``lift()`` with the same arguments.
        """
        self.lift(irsb_c, past_instructions, future_instructions)

    def mark_instruction_start(self):
        """
        Emit the IMark for this instruction (address, width in bytes, and a delta of 0).
        """
        self.irsb_c.imark(self.addr, self.bytewidth, 0)

    def fetch_operands(self):  # pylint: disable=no-self-use
        """
        Get the operands out of memory or registers
        Return a tuple of operands for the instruction
        """
        return ()

    def lift(self, irsb_c: IRSBCustomizer, past_instructions, future_instructions):  # pylint: disable=unused-argument
        """
        This is the main body of the "lifting" for the instruction.
        This can/should be overridden to provide the general flow of how instructions in your arch work.
        For example, in MSP430, this is:

        - Figure out what your operands are by parsing the addressing, and load them into temporary registers
        - Do the actual operation, and commit the result, if needed.
        - Compute the flags

        :param irsb_c:              The IRSBCustomizer to put VEX statements into; stored as ``self.irsb_c``
        :param past_instructions:   Unused by this base implementation
        :param future_instructions: Unused by this base implementation
        """
        self.irsb_c = irsb_c
        # Always call this first!
        self.mark_instruction_start()
        # Then do the actual stuff.
        inputs = self.fetch_operands()
        retval = self.compute_result(*inputs)  # pylint: disable=assignment-from-none
        if retval is not None:
            self.commit_result(retval)
        # Flags are always computed, and receive the operands followed by the (possibly None) result.
        self.compute_flags(*inputs, retval)

    def commit_result(self, res):
        """
        This where the result of the operation is written to a destination.
        This happens only if compute_result does not return None, and happens before compute_flags is called.
        Override this to specify how to write out the result.
        The results of fetch_operands can be used to resolve various addressing modes for the write outward.
        A common pattern is to return a function from fetch_operands which will be called here to perform the write.

        :param res: The value returned by compute_result
        """

    def compute_result(self, *args):  # pylint: disable=unused-argument,no-self-use
        """
        This is where the actual operation performed by your instruction, excluding the calculation of flags, should
        be performed. Return the VexValue of the "result" of the instruction, which may
        be used to calculate the flags later.
        For example, for a simple add, with arguments src and dst, you can simply write:

            return src + dst

        :param args: The operands returned by fetch_operands
        :return: A VexValue containing the "result" of the operation, or None if there is nothing to commit.
        """
        return None

    def compute_flags(self, *args):
        """
        Most CPU architectures have "flags" that should be computed for many instructions.
        Override this to specify how that happens. One common pattern is to define this method to call specific
        methods to update each flag, which can then be overridden in the actual classes for each instruction.

        :param args: The operands returned by fetch_operands, followed by the result of compute_result
        """

    def match_instruction(self, data, bitstrm):  # pylint: disable=unused-argument,no-self-use
        """
        Override this to extend the parsing functionality.
        This is great for if your arch has instruction "formats" that have an opcode that has to match.
        Raise from here to reject the instruction; ``parse`` ignores the return value.

        :param data:    The argument bits parsed so far, keyed by format letter
        :param bitstrm: The bitstream being decoded (the instruction bits have not been consumed yet)
        :return: data
        """
        return data

    def parse(self, bitstrm):
        """
        Match the next ``self.bitwidth`` bits of the bitstream against ``bin_format`` and extract the arguments.

        The bits are only consumed from the stream once the fixed bits and ``match_instruction`` have accepted them.
        The consumed bits are stored, hex-encoded, in ``self.rawbits``.

        :param bitstrm: The bitstream to decode from
        :return: A dict mapping each letter of ``bin_format`` to the string of bits found at its positions
        :raises ParseError: if a hard-coded ``0``/``1`` in ``bin_format`` does not match the instruction bit
        :raises ValueError: if ``bin_format`` contains a character that is neither ``0``, ``1`` nor an ASCII letter
        """
        if self.arch.instruction_endness == "Iend_LE":
            # This arch stores its instructions in memory endian-flipped compared to the ISA.
            # To enable natural lifter-writing, we let the user write them like in the manual, and correct for
            # endness here.
            instr_bits = self._load_le_instr(bitstrm, self.bitwidth)
        else:
            instr_bits = bitstrm.peek(f"bin:{self.bitwidth}")

        data = {c: "" for c in self.bin_format if c in string.ascii_letters}
        for fmt_char, bit in zip(self.bin_format, instr_bits):
            if fmt_char in "01":
                if bit != fmt_char:
                    raise ParseError(f"Mismatch between format bit {fmt_char} and instruction bit {bit}")
            elif fmt_char in string.ascii_letters:
                data[fmt_char] += bit
            else:
                raise ValueError(f"Invalid bin_format character {fmt_char}")

        # Hook for extra matching functionality; it should raise if the instruction is not right.
        self.match_instruction(data, bitstrm)

        # Use up the bits once we're sure it's right
        self.rawbits = bitstrm.read(f"hex:{self.bitwidth}")

        # Optional hook for extra parsing functionality (e.g., trailers); only subclasses define it.
        if hasattr(self, "_extra_parsing"):
            data = self._extra_parsing(data, bitstrm)  # pylint: disable=no-member

        return data

    @property
    def bytewidth(self):
        """
        The width of the instruction in units of ``self.arch.byte_width`` bits.

        :raises ValueError: if the bit width is not a whole number of bytes
        """
        if self.bitwidth % self.arch.byte_width != 0:
            raise ValueError("Instruction is not a multiple of bytes wide!")
        return self.bitwidth // self.arch.byte_width

    def disassemble(self):
        """
        Return the disassembly of this instruction.
        Override this in subclasses; this base implementation reports the name "UNK" and the raw bits.

        :return: The address (self.addr), the instruction's name, and a list of its operands, as strings
        """
        return self.addr, "UNK", [self.rawbits]

    # These methods should be called in subclasses to do register and memory operations

    def load(self, addr, ty):
        """
        Load a value from memory into a VEX temporary register.

        :param addr: The VexValue containing the addr to load from.
        :param ty: The Type of the resulting data
        :return: a VexValue
        """
        rdt = self.irsb_c.load(addr.rdt, ty)
        return VexValue(self.irsb_c, rdt)

    def constant(self, val, ty):
        """
        Creates a constant as a VexValue

        :param val: The value, as an integer
        :param ty: The type of the resulting VexValue
        :return: a VexValue
        :raises TypeError: if ``val`` is already a VexValue or an IRExpr
        """
        # Both wrapper and raw expressions are rejected, as the error message has always stated.
        if isinstance(val, (VexValue, IRExpr)):
            raise TypeError("Constant cannot be made from VexValue or IRExpr")
        rdt = self.irsb_c.mkconst(val, ty)
        return VexValue(self.irsb_c, rdt)

    @staticmethod
    def _lookup_register(arch, reg):
        """
        Resolve a register, given by integer index or by name, to its offset as reported by ``arch``.
        """
        # TODO: This is a hack to make it work with archinfo where we use
        # register indices instead of names
        if isinstance(reg, int):
            if hasattr(arch, "register_index"):
                reg = arch.register_index[reg]
            else:
                reg = arch.register_list[reg].name
        return arch.get_register_offset(reg)

    def get(self, reg, ty):
        """
        Load a value from a machine register into a VEX temporary register.
        All values must be loaded out of registers before they can be used with operations, etc
        and stored back into them when the instruction is over. See put().

        Reading the instruction pointer does not emit a register read; it yields ``self.addr`` as a constant.

        :param reg: Register number as an integer, or register string name
        :param ty: The Type to use.
        :return: A VexValue of the gotten value.
        """
        arch = self.irsb_c.irsb.arch
        offset = self._lookup_register(arch, reg)
        if offset == arch.ip_offset:
            return self.constant(self.addr, ty)
        rdt = self.irsb_c.rdreg(offset, ty)
        return VexValue(self.irsb_c, rdt)

    def put(self, val, reg):
        """
        Puts a value from a VEX temporary register into a machine register.
        This is how the results of operations done to registers get committed to the machine's state.

        :param val: The VexValue to store (Want to store a constant? See constant() first)
        :param reg: The integer register number to store into, or register name
        :return: None
        """
        offset = self._lookup_register(self.irsb_c.irsb.arch, reg)
        self.irsb_c.put(val.rdt, offset)

    def put_conditional(self, cond, valiftrue, valiffalse, reg):
        """
        Like put, except it checks a condition
        to decide what to put in the destination register.

        :param cond: The VexValue representing the logical expression for the condition
            (if your expression only has constants, don't use this method!)
        :param valiftrue: the VexValue to put in reg if cond evals as true
        :param valiffalse: the VexValue to put in reg if cond evals as false
        :param reg: The integer register number to store into, or register name
        :return: None
        """
        val = self.irsb_c.ite(cond.rdt, valiftrue.rdt, valiffalse.rdt)
        offset = self._lookup_register(self.irsb_c.irsb.arch, reg)
        self.irsb_c.put(val, offset)

    def store(self, val, addr):
        """
        Store a VexValue in memory at the specified location.

        :param val: The VexValue of the value to store
        :param addr: The VexValue of the address to store into
        :return: None
        """
        self.irsb_c.store(addr.rdt, val.rdt)

    def jump(self, condition, to_addr, jumpkind=JumpKind.Boring, ip_offset=None):
        """
        Jump to a specified destination, under the specified condition.
        Used for branches, jumps, calls, returns, etc.

        For a conditional jump, a side exit guarded by the *negated* condition is added that goes to the address
        immediately following this instruction; the block's default exit then goes to ``to_addr``.

        :param condition: The VexValue representing the expression for the guard, or None for an unconditional jump
        :param to_addr:   The address to jump to, as a VexValue, an int, or an RdTmp
        :param jumpkind:  The JumpKind to use. See the VEX docs for what these are; you only need them for things
                          that aren't normal jumps (e.g., calls, interrupts, program exits, etc etc)
        :param ip_offset: The instruction pointer's register offset, passed to the side exit of a conditional jump;
                          defaults to ``self.arch.ip_offset``
        :return: None
        :raises TypeError: if ``to_addr`` is not a VexValue, int, or RdTmp
        """
        if isinstance(to_addr, VexValue):
            # Unpack a VV
            to_addr_rdt = to_addr.rdt
            to_addr_ty = to_addr.ty
        elif isinstance(to_addr, int):
            # Direct jump to an int, make an RdT and Ty
            to_addr_ty = vex_int_class(self.irsb_c.irsb.arch.bits).type
            to_addr_rdt = self.constant(to_addr, to_addr_ty).rdt  # TODO archinfo may be changing
        elif isinstance(to_addr, RdTmp):
            # An RdT; just get the Ty of the arch's pointer type
            to_addr_ty = vex_int_class(self.irsb_c.irsb.arch.bits).type
            to_addr_rdt = to_addr
        else:
            raise TypeError("Jump destination has unknown type: " + repr(type(to_addr)))

        if condition:
            # Conditional: add another exit for the not-taken path.
            # EDG says: We should make sure folks set ArchXYZ.ip_offset like they're supposed to
            if ip_offset is None:
                ip_offset = self.arch.ip_offset
            assert ip_offset is not None

            negated_condition_rdt = self.ite(condition, self.constant(0, condition.ty), self.constant(1, condition.ty))
            # The fall-through address is measured in the arch's own byte size, matching the length that
            # mark_instruction_start() reports in the IMark.
            direct_exit_target = self.constant(self.addr + self.bytewidth, to_addr_ty)
            self.irsb_c.add_exit(negated_condition_rdt, direct_exit_target.rdt, jumpkind, ip_offset)

        # The jump target is always the block's default exit.
        self.irsb_c.irsb.jumpkind = jumpkind
        self.irsb_c.irsb.next = to_addr_rdt

    def ite(self, cond, t, f):
        """
        Build an if-then-else expression from three VexValues.

        :param cond: The VexValue of the condition
        :param t:    The VexValue selected when cond is true
        :param f:    The VexValue selected when cond is false
        :return: Whatever ``irsb_c.ite`` returns for the unwrapped expressions (not wrapped in a VexValue)
        """
        return self.irsb_c.ite(cond.rdt, t.rdt, f.rdt)

    @staticmethod
    def _unwrap_args(args):
        """
        Return ``args`` as a tuple in which every VexValue is replaced by its underlying ``rdt`` expression.
        """
        return tuple(arg.rdt if isinstance(arg, VexValue) else arg for arg in args)

    def ccall(self, ret_type, func_name, args):
        """
        Creates a CCall operation.
        A CCall is a procedure that calculates a value at *runtime*, not at lift-time.
        You can use these for flags, unresolvable jump targets, etc.
        We caution you to avoid using them when at all possible though.

        :param ret_type:  The return type of the CCall
        :param func_name: The name of the helper function to call. If you're using angr, this should be added (or
                          monkeypatched) into ``angr.engines.vex.claripy.ccall``.
        :param args:      List of arguments to the function; VexValues are unwrapped, anything else is passed as is
        :return: A VexValue of the result.
        """
        cc = self.irsb_c.op_ccall(ret_type, func_name, self._unwrap_args(args))
        return VexValue(self.irsb_c, cc)

    def dirty(self, ret_type, func_name, args) -> VexValue:
        """
        Creates a dirty call operation.

        These are like ccalls (clean calls) but their implementations are theoretically allowed to read or write to
        or from any part of the state, making them a nightmare for static analysis to reason about. Avoid their use
        at all costs.

        :param ret_type:  The return type of the dirty call, or None if the dirty call doesn't return anything.
        :param func_name: The name of the helper function to call. If you're using angr, this should be added (or
                          monkeypatched) into ``angr.engines.vex.heavy.dirty``.
        :param args:      List of arguments to the function; VexValues are unwrapped, anything else is passed as is
        :return: A VexValue of the result.
        """
        rdt = self.irsb_c.dirty(ret_type, func_name, self._unwrap_args(args))
        return VexValue(self.irsb_c, rdt)

    def _load_le_instr(self, bitstream: bitstring.ConstBitStream, numbits: int) -> str:
        """
        Peek ``numbits`` bits as a little-endian unsigned integer and return them as a binary string.

        The stream position is not advanced.
        """
        return bitstring.Bits(uint=bitstream.peek(f"uintle:{numbits}"), length=numbits).bin