/src/wasmtime/cranelift/codegen/src/isa/x64/lower.rs
Line | Count | Source |
1 | | //! Lowering rules for X64. |
2 | | |
3 | | // ISLE integration glue. |
4 | | pub(super) mod isle; |
5 | | |
6 | | use crate::ir::pcc::{FactContext, PccResult}; |
7 | | use crate::ir::{ |
8 | | Endianness, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type, types, |
9 | | }; |
10 | | use crate::isa::x64::abi::*; |
11 | | use crate::isa::x64::inst::args::*; |
12 | | use crate::isa::x64::inst::*; |
13 | | use crate::isa::x64::pcc; |
14 | | use crate::isa::{CallConv, x64::X64Backend}; |
15 | | use crate::machinst::lower::*; |
16 | | use crate::machinst::*; |
17 | | use crate::result::CodegenResult; |
18 | | use crate::settings::Flags; |
19 | | use std::boxed::Box; |
20 | | use target_lexicon::Triple; |
21 | | |
22 | | /// Identifier for a particular input of an instruction. |
23 | | #[derive(Clone, Copy, Debug, PartialEq, Eq)] |
24 | | struct InsnInput { |
25 | | insn: IRInst, |
26 | | input: usize, |
27 | | } |
28 | | |
29 | | //============================================================================= |
30 | | // Helpers for instruction lowering. |
31 | | |
32 | | impl Lower<'_, Inst> { |
33 | | #[inline] |
34 | 52.1M | pub fn temp_writable_gpr(&mut self) -> WritableGpr { |
35 | 52.1M | WritableGpr::from_writable_reg(self.alloc_tmp(types::I64).only_reg().unwrap()).unwrap() |
36 | 52.1M | } |
37 | | |
38 | | #[inline] |
39 | 13.9M | pub fn temp_writable_xmm(&mut self) -> WritableXmm { |
40 | 13.9M | WritableXmm::from_writable_reg(self.alloc_tmp(types::F64).only_reg().unwrap()).unwrap() |
41 | 13.9M | } |
42 | | } |
43 | | |
44 | 72.8M | fn is_int_or_ref_ty(ty: Type) -> bool { |
45 | 72.8M | match ty { |
46 | 67.5M | types::I8 | types::I16 | types::I32 | types::I64 => true, |
47 | 5.31M | _ => false, |
48 | | } |
49 | 72.8M | } |
50 | | |
51 | | /// Returns whether the given `input` is a result produced by an instruction with Opcode
52 | | /// `op`. |
53 | | // TODO investigate failures with checking against the result index. |
54 | 2.73M | fn matches_input(ctx: &mut Lower<Inst>, input: InsnInput, op: Opcode) -> Option<IRInst> { |
55 | 2.73M | let inputs = ctx.get_input_as_source_or_const(input.insn, input.input); |
56 | 2.73M | inputs.inst.as_inst().and_then(|(src_inst, _)| { |
57 | 751k | let data = ctx.data(src_inst); |
58 | 751k | if data.opcode() == op { |
59 | 125k | return Some(src_inst); |
60 | 626k | } |
61 | 626k | None |
62 | 751k | }) |
63 | 2.73M | } |
64 | | |
65 | | /// Put the given input into possibly multiple registers, and mark it as used (side-effect). |
66 | 2.32M | fn put_input_in_regs(ctx: &mut Lower<Inst>, spec: InsnInput) -> ValueRegs<Reg> { |
67 | 2.32M | let ty = ctx.input_ty(spec.insn, spec.input); |
68 | 2.32M | let input = ctx.get_input_as_source_or_const(spec.insn, spec.input); |
69 | | |
70 | 2.32M | if let Some(c) = input.constant { |
71 | | // Generate constants fresh at each use to minimize long-range register pressure. |
72 | 830 | let size = if ty_bits(ty) < 64 { |
73 | 0 | OperandSize::Size32 |
74 | | } else { |
75 | 830 | OperandSize::Size64 |
76 | | }; |
77 | 830 | assert!(is_int_or_ref_ty(ty)); // Only used for addresses. |
78 | 830 | let cst_copy = ctx.alloc_tmp(ty); |
79 | 830 | ctx.emit(Inst::imm(size, c, cst_copy.only_reg().unwrap())); |
80 | 830 | non_writable_value_regs(cst_copy) |
81 | | } else { |
82 | 2.32M | ctx.put_input_in_regs(spec.insn, spec.input) |
83 | | } |
84 | 2.32M | } |
85 | | |
86 | | /// Put the given input into a register, and mark it as used (side-effect). |
87 | 2.32M | fn put_input_in_reg(ctx: &mut Lower<Inst>, spec: InsnInput) -> Reg { |
88 | 2.32M | put_input_in_regs(ctx, spec) |
89 | 2.32M | .only_reg() |
90 | 2.32M | .expect("Multi-register value not expected") |
91 | 2.32M | } |
92 | | |
93 | | enum MergeableLoadSize { |
94 | | /// The load size performed by a sinkable load merging operation is |
95 | | /// precisely the size necessary for the type in question. |
96 | | Exact, |
97 | | |
98 | | /// Narrower-than-32-bit values are handled by ALU insts that are at least |
99 | | /// 32 bits wide, which is normally OK as we ignore upper bits; but, if we
100 | | /// generate, e.g., a direct-from-memory 32-bit add for a byte value and |
101 | | /// the byte is the last byte in a page, the extra data that we load is |
102 | | /// incorrectly accessed. So we only allow loads to merge for |
103 | | /// 32-bit-and-above widths. |
104 | | Min32, |
105 | | } |
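// A minimal standalone sketch (not part of this file) of the hazard that
// `MergeableLoadSize::Min32` guards against. The 4 KiB page size and the
// helper names below are assumptions for illustration only: widening a byte
// load into a 32-bit memory operand can read past the end of the byte's page.
const ILLUSTRATIVE_PAGE_SIZE: u64 = 4096;

/// Returns true if an access of `width` bytes starting at `addr` stays within
/// the page containing `addr`.
fn stays_in_page(addr: u64, width: u64) -> bool {
    addr / ILLUSTRATIVE_PAGE_SIZE == (addr + width - 1) / ILLUSTRATIVE_PAGE_SIZE
}

#[test]
fn widened_byte_load_can_cross_a_page() {
    let last_byte_of_page = ILLUSTRATIVE_PAGE_SIZE - 1;
    // The 1-byte access itself is in bounds...
    assert!(stays_in_page(last_byte_of_page, 1));
    // ...but a direct-from-memory 32-bit ALU operand would also touch the
    // next page, which may be unmapped; hence only 32-bit-and-wider loads
    // are merged under `Min32`.
    assert!(!stays_in_page(last_byte_of_page, 4));
}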
106 | | |
107 | | /// Determines whether a load operation (indicated by `src_insn`) can be merged |
108 | | /// into the current lowering point. If so, returns the address-base source (as |
109 | | /// an `InsnInput`) and an offset from that address from which to perform the |
110 | | /// load. |
111 | 13.6M | fn is_mergeable_load( |
112 | 13.6M | ctx: &mut Lower<Inst>, |
113 | 13.6M | src_insn: IRInst, |
114 | 13.6M | size: MergeableLoadSize, |
115 | 13.6M | ) -> Option<(InsnInput, i32)> { |
116 | 13.6M | let insn_data = ctx.data(src_insn); |
117 | 13.6M | let inputs = ctx.num_inputs(src_insn); |
118 | 13.6M | if inputs != 1 { |
119 | 7.25M | return None; |
120 | 6.42M | } |
121 | | |
122 | | // If this type is too small to get a merged load, don't merge the load. |
123 | 6.42M | let load_ty = ctx.output_ty(src_insn, 0); |
124 | 6.42M | if ty_bits(load_ty) < 32 { |
125 | 691k | match size { |
126 | 347k | MergeableLoadSize::Exact => {} |
127 | 344k | MergeableLoadSize::Min32 => return None, |
128 | | } |
129 | 5.72M | } |
130 | | |
131 | | // If the load's flags specify big-endian, we can't merge. |
132 | 6.07M | if let Some(flags) = ctx.memflags(src_insn) { |
133 | 2.94M | if flags.explicit_endianness() == Some(Endianness::Big) { |
134 | 0 | return None; |
135 | 2.94M | } |
136 | 3.13M | } |
137 | | |
138 | | // Just testing the opcode is enough, because the width will always match if |
139 | | // the type does (and the type should match if the CLIF is properly |
140 | | // constructed). |
141 | | if let &InstructionData::Load { |
142 | | opcode: Opcode::Load, |
143 | 2.30M | offset, |
144 | | .. |
145 | 2.41M | } = insn_data |
146 | | { |
147 | 2.30M | Some(( |
148 | 2.30M | InsnInput { |
149 | 2.30M | insn: src_insn, |
150 | 2.30M | input: 0, |
151 | 2.30M | }, |
152 | 2.30M | offset.into(), |
153 | 2.30M | )) |
154 | | } else { |
155 | 3.76M | None |
156 | | } |
157 | 13.6M | } |
158 | | |
159 | 1 | fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> { |
160 | 1 | ctx.get_input_as_source_or_const(spec.insn, spec.input) |
161 | 1 | .constant |
162 | 1 | } |
163 | | |
164 | 56 | fn emit_vm_call( |
165 | 56 | ctx: &mut Lower<Inst>, |
166 | 56 | flags: &Flags, |
167 | 56 | triple: &Triple, |
168 | 56 | libcall: LibCall, |
169 | 56 | inputs: &[ValueRegs<Reg>], |
170 | 56 | ) -> CodegenResult<InstOutput> { |
171 | 56 | let extname = ExternalName::LibCall(libcall); |
172 | | |
173 | | // TODO avoid recreating signatures for every single Libcall function. |
174 | 56 | let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple)); |
175 | 56 | let sig = libcall.signature(call_conv, types::I64); |
176 | 56 | let outputs = ctx.gen_call_output(&sig); |
177 | | |
178 | 56 | if !ctx.sigs().have_abi_sig_for_signature(&sig) { |
179 | 12 | ctx.sigs_mut() |
180 | 12 | .make_abi_sig_from_ir_signature::<X64ABIMachineSpec>(sig.clone(), flags)?; |
181 | 44 | } |
182 | 56 | let sig = ctx.sigs().abi_sig_for_signature(&sig); |
183 | | |
184 | 56 | let uses = ctx.gen_call_args(sig, inputs); |
185 | 56 | let defs = ctx.gen_call_rets(sig, &outputs); |
186 | | |
187 | 56 | let stack_ret_space = ctx.sigs()[sig].sized_stack_ret_space(); |
188 | 56 | let stack_arg_space = ctx.sigs()[sig].sized_stack_arg_space(); |
189 | 56 | ctx.abi_mut() |
190 | 56 | .accumulate_outgoing_args_size(stack_ret_space + stack_arg_space); |
191 | | |
192 | 56 | if flags.use_colocated_libcalls() { |
193 | 0 | let call_info = ctx.gen_call_info(sig, extname, uses, defs, None); |
194 | 0 | ctx.emit(Inst::call_known(Box::new(call_info))); |
195 | 56 | } else { |
196 | 56 | let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap(); |
197 | 56 | ctx.emit(Inst::LoadExtName { |
198 | 56 | dst: tmp.map(Gpr::unwrap_new), |
199 | 56 | name: Box::new(extname), |
200 | 56 | offset: 0, |
201 | 56 | distance: RelocDistance::Far, |
202 | 56 | }); |
203 | 56 | let call_info = ctx.gen_call_info(sig, RegMem::reg(tmp.to_reg()), uses, defs, None); |
204 | 56 | ctx.emit(Inst::call_unknown(Box::new(call_info))); |
205 | 56 | } |
206 | 56 | Ok(outputs) |
207 | 56 | } |
208 | | |
209 | | /// Returns whether the given input is a shift by a constant value less than or equal to 3.
210 | | /// The goal is to embed it within an address mode. |
211 | 216k | fn matches_small_constant_shift(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<(InsnInput, u8)> { |
212 | 216k | matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| { |
213 | 1 | match input_to_imm( |
214 | 1 | ctx, |
215 | 1 | InsnInput { |
216 | 1 | insn: shift, |
217 | 1 | input: 1, |
218 | 1 | }, |
219 | | ) { |
220 | 1 | Some(shift_amt) if shift_amt <= 3 => Some(( |
221 | 1 | InsnInput { |
222 | 1 | insn: shift, |
223 | 1 | input: 0, |
224 | 1 | }, |
225 | 1 | shift_amt as u8, |
226 | 1 | )), |
227 | 0 | _ => None, |
228 | | } |
229 | 1 | }) |
230 | 216k | } |
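// Standalone sketch (helper names are illustrative, not part of the backend)
// of why only shifts by at most 3 qualify: the shift amount becomes the index
// scale of an x86 SIB byte, and hardware scales are limited to 1, 2, 4, and 8,
// i.e. `1 << shift` for `shift <= 3`.
fn shift_to_sib_scale(shift_amt: u64) -> Option<u8> {
    if shift_amt <= 3 {
        Some(1u8 << shift_amt)
    } else {
        None
    }
}

#[test]
fn only_small_shifts_become_scales() {
    assert_eq!(shift_to_sib_scale(0), Some(1));
    assert_eq!(shift_to_sib_scale(3), Some(8));
    // `x << 4` (scale 16) has no SIB encoding, so it stays a separate shift.
    assert_eq!(shift_to_sib_scale(4), None);
}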
231 | | |
232 | | /// Lowers an instruction to one of the x86 addressing modes. |
233 | | /// |
234 | | /// Note: the 32-bit offset in Cranelift has to be sign-extended, which matches x86's behavior.
235 | 2.30M | fn lower_to_amode(ctx: &mut Lower<Inst>, spec: InsnInput, offset: i32) -> Amode { |
236 | 2.30M | let flags = ctx |
237 | 2.30M | .memflags(spec.insn) |
238 | 2.30M | .expect("Instruction with amode should have memflags"); |
239 | | |
240 | | // We now have either an add that we must materialize or some other input, as well as the
241 | | // final offset.
242 | 2.30M | if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) { |
243 | 108k | let output_ty = ctx.output_ty(add, 0); |
244 | 108k | debug_assert_eq!( |
245 | | output_ty, |
246 | | types::I64, |
247 | 0 | "Address width of 64 expected, got {output_ty}" |
248 | | ); |
249 | 108k | let add_inputs = &[ |
250 | 108k | InsnInput { |
251 | 108k | insn: add, |
252 | 108k | input: 0, |
253 | 108k | }, |
254 | 108k | InsnInput { |
255 | 108k | insn: add, |
256 | 108k | input: 1, |
257 | 108k | }, |
258 | 108k | ]; |
259 | | |
260 | | // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations |
261 | | // aren't happening in the wasm case. We could do better, given some range analysis. |
262 | 17.1k | let (base, index, shift) = if let Some((shift_input, shift_amt)) = |
263 | 108k | matches_small_constant_shift(ctx, add_inputs[0]) |
264 | | { |
265 | 0 | ( |
266 | 0 | put_input_in_reg(ctx, add_inputs[1]), |
267 | 0 | put_input_in_reg(ctx, shift_input), |
268 | 0 | shift_amt, |
269 | 0 | ) |
270 | 1 | } else if let Some((shift_input, shift_amt)) = |
271 | 108k | matches_small_constant_shift(ctx, add_inputs[1]) |
272 | | { |
273 | 1 | ( |
274 | 1 | put_input_in_reg(ctx, add_inputs[0]), |
275 | 1 | put_input_in_reg(ctx, shift_input), |
276 | 1 | shift_amt, |
277 | 1 | ) |
278 | | } else { |
279 | 233k | for input in 0..=1 { |
280 | | // Try to pierce through uextend. |
281 | 216k | let (inst, inst_input) = if let Some(uextend) = |
282 | 216k | matches_input(ctx, InsnInput { insn: add, input }, Opcode::Uextend) |
283 | | { |
284 | 16.8k | (uextend, 0) |
285 | | } else { |
286 | 199k | (add, input) |
287 | | }; |
288 | | |
289 | | // If it's a constant, add it directly! |
290 | 216k | if let Some(cst) = ctx.get_input_as_source_or_const(inst, inst_input).constant { |
291 | 91.9k | let final_offset = (offset as i64).wrapping_add(cst as i64); |
292 | 91.9k | if let Ok(final_offset) = i32::try_from(final_offset) { |
293 | 91.0k | let base = put_input_in_reg(ctx, add_inputs[1 - input]); |
294 | 91.0k | return Amode::imm_reg(final_offset, base).with_flags(flags); |
295 | 842 | } |
296 | 124k | } |
297 | | } |
298 | | |
299 | 17.1k | ( |
300 | 17.1k | put_input_in_reg(ctx, add_inputs[0]), |
301 | 17.1k | put_input_in_reg(ctx, add_inputs[1]), |
302 | 17.1k | 0, |
303 | 17.1k | ) |
304 | | }; |
305 | | |
306 | 17.1k | return Amode::imm_reg_reg_shift( |
307 | 17.1k | offset, |
308 | 17.1k | Gpr::unwrap_new(base), |
309 | 17.1k | Gpr::unwrap_new(index), |
310 | 17.1k | shift, |
311 | 17.1k | ) |
312 | 17.1k | .with_flags(flags); |
313 | 2.19M | } |
314 | | |
315 | 2.19M | let input = put_input_in_reg(ctx, spec); |
316 | 2.19M | Amode::imm_reg(offset, input).with_flags(flags) |
317 | 2.30M | } |
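// Standalone sketch (the helper name is illustrative) of the constant-folding
// step in `lower_to_amode` above: a constant addend is folded into the amode's
// displacement only if the combined value still fits in an i32, since x86
// displacements are sign-extended 32-bit immediates; otherwise the add is
// materialized in a register instead.
fn fold_into_disp(offset: i32, cst: u64) -> Option<i32> {
    let final_offset = (offset as i64).wrapping_add(cst as i64);
    i32::try_from(final_offset).ok()
}

#[test]
fn displacement_folding_respects_i32_range() {
    assert_eq!(fold_into_disp(16, 32), Some(48));
    // A large u64 constant reinterprets as a negative i64 addend, matching
    // the `cst as i64` cast in the lowering.
    assert_eq!(fold_into_disp(1, u64::MAX), Some(0));
    // Out-of-range sums are not folded.
    assert_eq!(fold_into_disp(0, i64::MAX as u64), None);
}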
318 | | |
319 | | //============================================================================= |
320 | | // Lowering-backend trait implementation. |
321 | | |
322 | | impl LowerBackend for X64Backend { |
323 | | type MInst = Inst; |
324 | | |
325 | 90.8M | fn lower(&self, ctx: &mut Lower<Inst>, ir_inst: IRInst) -> Option<InstOutput> { |
326 | 90.8M | isle::lower(ctx, self, ir_inst) |
327 | 90.8M | } |
328 | | |
329 | 19.0M | fn lower_branch( |
330 | 19.0M | &self, |
331 | 19.0M | ctx: &mut Lower<Inst>, |
332 | 19.0M | ir_inst: IRInst, |
333 | 19.0M | targets: &[MachLabel], |
334 | 19.0M | ) -> Option<()> { |
335 | 19.0M | isle::lower_branch(ctx, self, ir_inst, targets) |
336 | 19.0M | } |
337 | | |
338 | 2.24M | fn maybe_pinned_reg(&self) -> Option<Reg> { |
339 | 2.24M | Some(regs::pinned_reg()) |
340 | 2.24M | } |
341 | | |
342 | 0 | fn check_fact( |
343 | 0 | &self, |
344 | 0 | ctx: &FactContext<'_>, |
345 | 0 | vcode: &mut VCode<Self::MInst>, |
346 | 0 | inst: InsnIndex, |
347 | 0 | state: &mut pcc::FactFlowState, |
348 | 0 | ) -> PccResult<()> { |
349 | 0 | pcc::check(ctx, vcode, inst, state) |
350 | 0 | } |
351 | | |
352 | | type FactFlowState = pcc::FactFlowState; |
353 | | } |