Coverage Report

Created: 2025-10-12 07:32

/src/wasmtime/cranelift/codegen/src/isa/x64/lower.rs
Line | Count | Source
   1 |       | //! Lowering rules for X64.
   2 |       |
   3 |       | // ISLE integration glue.
   4 |       | pub(super) mod isle;
   5 |       |
   6 |       | use crate::ir::pcc::{FactContext, PccResult};
   7 |       | use crate::ir::{
   8 |       |     Endianness, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type, types,
   9 |       | };
  10 |       | use crate::isa::x64::abi::*;
  11 |       | use crate::isa::x64::inst::args::*;
  12 |       | use crate::isa::x64::inst::*;
  13 |       | use crate::isa::x64::pcc;
  14 |       | use crate::isa::{CallConv, x64::X64Backend};
  15 |       | use crate::machinst::lower::*;
  16 |       | use crate::machinst::*;
  17 |       | use crate::result::CodegenResult;
  18 |       | use crate::settings::Flags;
  19 |       | use std::boxed::Box;
  20 |       | use target_lexicon::Triple;
  21 |       |
  22 |       | /// Identifier for a particular input of an instruction.
  23 |       | #[derive(Clone, Copy, Debug, PartialEq, Eq)]
  24 |       | struct InsnInput {
  25 |       |     insn: IRInst,
  26 |       |     input: usize,
  27 |       | }
  28 |       |
  29 |       | //=============================================================================
  30 |       | // Helpers for instruction lowering.
  31 |       |
  32 |       | impl Lower<'_, Inst> {
  33 |       |     #[inline]
  34 | 52.1M |     pub fn temp_writable_gpr(&mut self) -> WritableGpr {
  35 | 52.1M |         WritableGpr::from_writable_reg(self.alloc_tmp(types::I64).only_reg().unwrap()).unwrap()
  36 | 52.1M |     }
  37 |       |
  38 |       |     #[inline]
  39 | 13.9M |     pub fn temp_writable_xmm(&mut self) -> WritableXmm {
  40 | 13.9M |         WritableXmm::from_writable_reg(self.alloc_tmp(types::F64).only_reg().unwrap()).unwrap()
  41 | 13.9M |     }
  42 |       | }
  43 |       |
  44 | 72.8M | fn is_int_or_ref_ty(ty: Type) -> bool {
  45 | 72.8M |     match ty {
  46 | 67.5M |         types::I8 | types::I16 | types::I32 | types::I64 => true,
  47 | 5.31M |         _ => false,
  48 |       |     }
  49 | 72.8M | }
  50 |       |
  51 |       | /// Returns whether the specified `input` is a result produced by an instruction with Opcode
  52 |       | /// `op`.
  53 |       | // TODO investigate failures with checking against the result index.
  54 | 2.73M | fn matches_input(ctx: &mut Lower<Inst>, input: InsnInput, op: Opcode) -> Option<IRInst> {
  55 | 2.73M |     let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
  56 | 2.73M |     inputs.inst.as_inst().and_then(|(src_inst, _)| {
  57 |  751k |         let data = ctx.data(src_inst);
  58 |  751k |         if data.opcode() == op {
  59 |  125k |             return Some(src_inst);
  60 |  626k |         }
  61 |  626k |         None
  62 |  751k |     })
  63 | 2.73M | }
  64 |       |
  65 |       | /// Put the given input into possibly multiple registers, and mark it as used (side-effect).
  66 | 2.32M | fn put_input_in_regs(ctx: &mut Lower<Inst>, spec: InsnInput) -> ValueRegs<Reg> {
  67 | 2.32M |     let ty = ctx.input_ty(spec.insn, spec.input);
  68 | 2.32M |     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
  69 |       |
  70 | 2.32M |     if let Some(c) = input.constant {
  71 |       |         // Generate constants fresh at each use to minimize long-range register pressure.
  72 |   830 |         let size = if ty_bits(ty) < 64 {
  73 |     0 |             OperandSize::Size32
  74 |       |         } else {
  75 |   830 |             OperandSize::Size64
  76 |       |         };
  77 |   830 |         assert!(is_int_or_ref_ty(ty)); // Only used for addresses.
  78 |   830 |         let cst_copy = ctx.alloc_tmp(ty);
  79 |   830 |         ctx.emit(Inst::imm(size, c, cst_copy.only_reg().unwrap()));
  80 |   830 |         non_writable_value_regs(cst_copy)
  81 |       |     } else {
  82 | 2.32M |         ctx.put_input_in_regs(spec.insn, spec.input)
  83 |       |     }
  84 | 2.32M | }
  85 |       |
  86 |       | /// Put the given input into a register, and mark it as used (side-effect).
  87 | 2.32M | fn put_input_in_reg(ctx: &mut Lower<Inst>, spec: InsnInput) -> Reg {
  88 | 2.32M |     put_input_in_regs(ctx, spec)
  89 | 2.32M |         .only_reg()
  90 | 2.32M |         .expect("Multi-register value not expected")
  91 | 2.32M | }
  92 |       |
  93 |       | enum MergeableLoadSize {
  94 |       |     /// The load size performed by a sinkable load merging operation is
  95 |       |     /// precisely the size necessary for the type in question.
  96 |       |     Exact,
  97 |       |
  98 |       |     /// Narrower-than-32-bit values are handled by ALU insts that are at least
  99 |       |     /// 32 bits wide, which is normally OK as we ignore upper bits; but, if we
 100 |       |     /// generate, e.g., a direct-from-memory 32-bit add for a byte value and
 101 |       |     /// the byte is the last byte in a page, the extra data that we load is
 102 |       |     /// incorrectly accessed. So we only allow loads to merge for
 103 |       |     /// 32-bit-and-above widths.
 104 |       |     Min32,
 105 |       | }
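
The doc comment on `Min32` (lines 98-103 above) explains why narrow loads are not merged: widening, say, a byte load into a 32-bit memory operand reads three extra bytes, and if the byte sits at the very end of a page those extra bytes may fall on an unmapped page. The sketch below only illustrates that hazard; it is not code from lower.rs, and the 4 KiB page size is an assumption.

```rust
/// Returns true if widening an access at `addr` to `widened_bytes` bytes
/// would touch a second page (assumed 4 KiB here purely for illustration).
fn widened_access_crosses_page(addr: u64, widened_bytes: u64) -> bool {
    const PAGE_SIZE: u64 = 4096;
    addr / PAGE_SIZE != (addr + widened_bytes - 1) / PAGE_SIZE
}

fn main() {
    // A byte at the last offset of a page: a merged 32-bit read would spill
    // onto the next page, which may be unmapped; hence `Min32` rejects it.
    assert!(widened_access_crosses_page(4095, 4));
    // Reading the byte at its exact width stays within the page.
    assert!(!widened_access_crosses_page(4095, 1));
}
```
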
 106 |       |
 107 |       | /// Determines whether a load operation (indicated by `src_insn`) can be merged
 108 |       | /// into the current lowering point. If so, returns the address-base source (as
 109 |       | /// an `InsnInput`) and an offset from that address from which to perform the
 110 |       | /// load.
 111 | 13.6M | fn is_mergeable_load(
 112 | 13.6M |     ctx: &mut Lower<Inst>,
 113 | 13.6M |     src_insn: IRInst,
 114 | 13.6M |     size: MergeableLoadSize,
 115 | 13.6M | ) -> Option<(InsnInput, i32)> {
 116 | 13.6M |     let insn_data = ctx.data(src_insn);
 117 | 13.6M |     let inputs = ctx.num_inputs(src_insn);
 118 | 13.6M |     if inputs != 1 {
 119 | 7.25M |         return None;
 120 | 6.42M |     }
 121 |       |
 122 |       |     // If this type is too small to get a merged load, don't merge the load.
 123 | 6.42M |     let load_ty = ctx.output_ty(src_insn, 0);
 124 | 6.42M |     if ty_bits(load_ty) < 32 {
 125 |  691k |         match size {
 126 |  347k |             MergeableLoadSize::Exact => {}
 127 |  344k |             MergeableLoadSize::Min32 => return None,
 128 |       |         }
 129 | 5.72M |     }
 130 |       |
 131 |       |     // If the load's flags specify big-endian, we can't merge.
 132 | 6.07M |     if let Some(flags) = ctx.memflags(src_insn) {
 133 | 2.94M |         if flags.explicit_endianness() == Some(Endianness::Big) {
 134 |     0 |             return None;
 135 | 2.94M |         }
 136 | 3.13M |     }
 137 |       |
 138 |       |     // Just testing the opcode is enough, because the width will always match if
 139 |       |     // the type does (and the type should match if the CLIF is properly
 140 |       |     // constructed).
 141 |       |     if let &InstructionData::Load {
 142 |       |         opcode: Opcode::Load,
 143 | 2.30M |         offset,
 144 |       |         ..
 145 | 2.41M |     } = insn_data
 146 |       |     {
 147 | 2.30M |         Some((
 148 | 2.30M |             InsnInput {
 149 | 2.30M |                 insn: src_insn,
 150 | 2.30M |                 input: 0,
 151 | 2.30M |             },
 152 | 2.30M |             offset.into(),
 153 | 2.30M |         ))
 154 |       |     } else {
 155 | 3.76M |         None
 156 |       |     }
 157 | 13.6M | }
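
Read in isolation, the width gate at lines 124-129 says: a load narrower than 32 bits may be merged only when the exact access width will be used (`Exact`), never under `Min32`, while loads of 32 bits or more are always eligible at this point. Below is a standalone restatement of just that predicate, written for clarity rather than taken from the file.

```rust
#[derive(Clone, Copy)]
enum MergeableLoadSize {
    Exact,
    Min32,
}

/// Mirrors the width check in `is_mergeable_load`: sub-32-bit loads are
/// only mergeable when the exact width will be used by the consumer.
fn width_allows_merge(load_bits: u32, size: MergeableLoadSize) -> bool {
    load_bits >= 32 || matches!(size, MergeableLoadSize::Exact)
}

fn main() {
    assert!(width_allows_merge(8, MergeableLoadSize::Exact));
    assert!(!width_allows_merge(8, MergeableLoadSize::Min32));
    assert!(width_allows_merge(32, MergeableLoadSize::Min32));
}
```
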
 158 |       |
 159 |     1 | fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
 160 |     1 |     ctx.get_input_as_source_or_const(spec.insn, spec.input)
 161 |     1 |         .constant
 162 |     1 | }
 163 |       |
 164 |    56 | fn emit_vm_call(
 165 |    56 |     ctx: &mut Lower<Inst>,
 166 |    56 |     flags: &Flags,
 167 |    56 |     triple: &Triple,
 168 |    56 |     libcall: LibCall,
 169 |    56 |     inputs: &[ValueRegs<Reg>],
 170 |    56 | ) -> CodegenResult<InstOutput> {
 171 |    56 |     let extname = ExternalName::LibCall(libcall);
 172 |       |
 173 |       |     // TODO avoid recreating signatures for every single Libcall function.
 174 |    56 |     let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple));
 175 |    56 |     let sig = libcall.signature(call_conv, types::I64);
 176 |    56 |     let outputs = ctx.gen_call_output(&sig);
 177 |       |
 178 |    56 |     if !ctx.sigs().have_abi_sig_for_signature(&sig) {
 179 |    12 |         ctx.sigs_mut()
 180 |    12 |             .make_abi_sig_from_ir_signature::<X64ABIMachineSpec>(sig.clone(), flags)?;
 181 |    44 |     }
 182 |    56 |     let sig = ctx.sigs().abi_sig_for_signature(&sig);
 183 |       |
 184 |    56 |     let uses = ctx.gen_call_args(sig, inputs);
 185 |    56 |     let defs = ctx.gen_call_rets(sig, &outputs);
 186 |       |
 187 |    56 |     let stack_ret_space = ctx.sigs()[sig].sized_stack_ret_space();
 188 |    56 |     let stack_arg_space = ctx.sigs()[sig].sized_stack_arg_space();
 189 |    56 |     ctx.abi_mut()
 190 |    56 |         .accumulate_outgoing_args_size(stack_ret_space + stack_arg_space);
 191 |       |
 192 |    56 |     if flags.use_colocated_libcalls() {
 193 |     0 |         let call_info = ctx.gen_call_info(sig, extname, uses, defs, None);
 194 |     0 |         ctx.emit(Inst::call_known(Box::new(call_info)));
 195 |    56 |     } else {
 196 |    56 |         let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap();
 197 |    56 |         ctx.emit(Inst::LoadExtName {
 198 |    56 |             dst: tmp.map(Gpr::unwrap_new),
 199 |    56 |             name: Box::new(extname),
 200 |    56 |             offset: 0,
 201 |    56 |             distance: RelocDistance::Far,
 202 |    56 |         });
 203 |    56 |         let call_info = ctx.gen_call_info(sig, RegMem::reg(tmp.to_reg()), uses, defs, None);
 204 |    56 |         ctx.emit(Inst::call_unknown(Box::new(call_info)));
 205 |    56 |     }
 206 |    56 |     Ok(outputs)
 207 |    56 | }
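
The `have_abi_sig_for_signature` / `make_abi_sig_from_ir_signature` pair at lines 178-181 builds the ABI-level signature only the first time a given IR signature is seen and reuses it afterwards; the counts (12 builds for 56 calls) show the cache being hit most of the time. The sketch below shows the same memoization pattern in isolation; `IrSig` and `AbiSig` are hypothetical stand-ins, not Cranelift types.

```rust
use std::collections::HashMap;

// Hypothetical stand-ins for the IR-level and lowered ABI-level signatures.
#[derive(Clone, PartialEq, Eq, Hash)]
struct IrSig(String);
struct AbiSig;

struct SigCache {
    by_ir_sig: HashMap<IrSig, AbiSig>,
}

impl SigCache {
    /// Build the ABI signature on first use of an IR signature, then reuse it.
    fn abi_sig_for(&mut self, sig: &IrSig) -> &AbiSig {
        self.by_ir_sig.entry(sig.clone()).or_insert_with(|| AbiSig)
    }
}

fn main() {
    let mut cache = SigCache { by_ir_sig: HashMap::new() };
    let sig = IrSig("fn(i64) -> i64".to_string());
    let _built = cache.abi_sig_for(&sig); // computed and cached
    let _hit = cache.abi_sig_for(&sig);   // served from the cache
}
```
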
 208 |       |
 209 |       | /// Returns whether the given input is a shift by a constant value less than or equal to 3.
 210 |       | /// The goal is to embed it within an address mode.
 211 |  216k | fn matches_small_constant_shift(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<(InsnInput, u8)> {
 212 |  216k |     matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| {
 213 |     1 |         match input_to_imm(
 214 |     1 |             ctx,
 215 |     1 |             InsnInput {
 216 |     1 |                 insn: shift,
 217 |     1 |                 input: 1,
 218 |     1 |             },
 219 |       |         ) {
 220 |     1 |             Some(shift_amt) if shift_amt <= 3 => Some((
 221 |     1 |                 InsnInput {
 222 |     1 |                     insn: shift,
 223 |     1 |                     input: 0,
 224 |     1 |                 },
 225 |     1 |                 shift_amt as u8,
 226 |     1 |             )),
 227 |     0 |             _ => None,
 228 |       |         }
 229 |     1 |     })
 230 |  216k | }
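
The `shift_amt <= 3` guard above reflects x86 scaled-index (SIB) addressing, which only encodes scale factors of 1, 2, 4, and 8, i.e. left shifts by 0 through 3; anything larger cannot be folded into an address mode. A standalone illustration of that correspondence, not taken from the file:

```rust
/// Maps a constant left-shift amount to the x86 SIB scale it can be folded
/// into, if any; shifts larger than 3 have no scaled-index encoding.
fn sib_scale_for_shift(shift_amt: u64) -> Option<u8> {
    if shift_amt <= 3 { Some(1u8 << shift_amt) } else { None }
}

fn main() {
    assert_eq!(sib_scale_for_shift(0), Some(1)); // base + index*1
    assert_eq!(sib_scale_for_shift(3), Some(8)); // base + index*8
    assert_eq!(sib_scale_for_shift(4), None);    // too large to embed
}
```
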
 231 |       |
 232 |       | /// Lowers an instruction to one of the x86 addressing modes.
 233 |       | ///
 234 |       | /// Note: the 32-bit offset in Cranelift has to be sign-extended, which matches x86's behavior.
 235 | 2.30M | fn lower_to_amode(ctx: &mut Lower<Inst>, spec: InsnInput, offset: i32) -> Amode {
 236 | 2.30M |     let flags = ctx
 237 | 2.30M |         .memflags(spec.insn)
 238 | 2.30M |         .expect("Instruction with amode should have memflags");
 239 |       |
 240 |       |     // We now either have an add that we must materialize, or some other input; as well as the
 241 |       |     // final offset.
 242 | 2.30M |     if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) {
 243 |  108k |         let output_ty = ctx.output_ty(add, 0);
 244 |  108k |         debug_assert_eq!(
 245 |       |             output_ty,
 246 |       |             types::I64,
 247 |     0 |             "Address width of 64 expected, got {output_ty}"
 248 |       |         );
 249 |  108k |         let add_inputs = &[
 250 |  108k |             InsnInput {
 251 |  108k |                 insn: add,
 252 |  108k |                 input: 0,
 253 |  108k |             },
 254 |  108k |             InsnInput {
 255 |  108k |                 insn: add,
 256 |  108k |                 input: 1,
 257 |  108k |             },
 258 |  108k |         ];
 259 |       |
 260 |       |         // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations
 261 |       |         // aren't happening in the wasm case. We could do better, given some range analysis.
 262 | 17.1k |         let (base, index, shift) = if let Some((shift_input, shift_amt)) =
 263 |  108k |             matches_small_constant_shift(ctx, add_inputs[0])
 264 |       |         {
 265 |     0 |             (
 266 |     0 |                 put_input_in_reg(ctx, add_inputs[1]),
 267 |     0 |                 put_input_in_reg(ctx, shift_input),
 268 |     0 |                 shift_amt,
 269 |     0 |             )
 270 |     1 |         } else if let Some((shift_input, shift_amt)) =
 271 |  108k |             matches_small_constant_shift(ctx, add_inputs[1])
 272 |       |         {
 273 |     1 |             (
 274 |     1 |                 put_input_in_reg(ctx, add_inputs[0]),
 275 |     1 |                 put_input_in_reg(ctx, shift_input),
 276 |     1 |                 shift_amt,
 277 |     1 |             )
 278 |       |         } else {
 279 |  233k |             for input in 0..=1 {
 280 |       |                 // Try to pierce through uextend.
 281 |  216k |                 let (inst, inst_input) = if let Some(uextend) =
 282 |  216k |                     matches_input(ctx, InsnInput { insn: add, input }, Opcode::Uextend)
 283 |       |                 {
 284 | 16.8k |                     (uextend, 0)
 285 |       |                 } else {
 286 |  199k |                     (add, input)
 287 |       |                 };
 288 |       |
 289 |       |                 // If it's a constant, add it directly!
 290 |  216k |                 if let Some(cst) = ctx.get_input_as_source_or_const(inst, inst_input).constant {
 291 | 91.9k |                     let final_offset = (offset as i64).wrapping_add(cst as i64);
 292 | 91.9k |                     if let Ok(final_offset) = i32::try_from(final_offset) {
 293 | 91.0k |                         let base = put_input_in_reg(ctx, add_inputs[1 - input]);
 294 | 91.0k |                         return Amode::imm_reg(final_offset, base).with_flags(flags);
 295 |   842 |                     }
 296 |  124k |                 }
 297 |       |             }
 298 |       |
 299 | 17.1k |             (
 300 | 17.1k |                 put_input_in_reg(ctx, add_inputs[0]),
 301 | 17.1k |                 put_input_in_reg(ctx, add_inputs[1]),
 302 | 17.1k |                 0,
 303 | 17.1k |             )
 304 |       |         };
 305 |       |
 306 | 17.1k |         return Amode::imm_reg_reg_shift(
 307 | 17.1k |             offset,
 308 | 17.1k |             Gpr::unwrap_new(base),
 309 | 17.1k |             Gpr::unwrap_new(index),
 310 | 17.1k |             shift,
 311 | 17.1k |         )
 312 | 17.1k |         .with_flags(flags);
 313 | 2.19M |     }
 314 |       |
 315 | 2.19M |     let input = put_input_in_reg(ctx, spec);
 316 | 2.19M |     Amode::imm_reg(offset, input).with_flags(flags)
 317 | 2.30M | }
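
In the constant branch at lines 290-296, a known addend is folded into the 32-bit displacement only when the sum still fits in an `i32`; otherwise the code falls through and keeps the addend in a register (the counts show the fold succeeding for the vast majority of constants). A standalone sketch of just that arithmetic, for illustration only:

```rust
/// Tries to fold a constant addend into an existing 32-bit displacement,
/// mirroring the wrapping add plus range check used for the address mode.
fn fold_displacement(offset: i32, cst: u64) -> Option<i32> {
    let final_offset = (offset as i64).wrapping_add(cst as i64);
    i32::try_from(final_offset).ok()
}

fn main() {
    // A small constant folds straight into the displacement.
    assert_eq!(fold_displacement(16, 32), Some(48));
    // A sum that overflows i32 cannot be encoded as a displacement.
    assert_eq!(fold_displacement(i32::MAX, 1), None);
}
```
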
 318 |       |
 319 |       | //=============================================================================
 320 |       | // Lowering-backend trait implementation.
 321 |       |
 322 |       | impl LowerBackend for X64Backend {
 323 |       |     type MInst = Inst;
 324 |       |
 325 | 90.8M |     fn lower(&self, ctx: &mut Lower<Inst>, ir_inst: IRInst) -> Option<InstOutput> {
 326 | 90.8M |         isle::lower(ctx, self, ir_inst)
 327 | 90.8M |     }
 328 |       |
 329 | 19.0M |     fn lower_branch(
 330 | 19.0M |         &self,
 331 | 19.0M |         ctx: &mut Lower<Inst>,
 332 | 19.0M |         ir_inst: IRInst,
 333 | 19.0M |         targets: &[MachLabel],
 334 | 19.0M |     ) -> Option<()> {
 335 | 19.0M |         isle::lower_branch(ctx, self, ir_inst, targets)
 336 | 19.0M |     }
 337 |       |
 338 | 2.24M |     fn maybe_pinned_reg(&self) -> Option<Reg> {
 339 | 2.24M |         Some(regs::pinned_reg())
 340 | 2.24M |     }
 341 |       |
 342 |     0 |     fn check_fact(
 343 |     0 |         &self,
 344 |     0 |         ctx: &FactContext<'_>,
 345 |     0 |         vcode: &mut VCode<Self::MInst>,
 346 |     0 |         inst: InsnIndex,
 347 |     0 |         state: &mut pcc::FactFlowState,
 348 |     0 |     ) -> PccResult<()> {
 349 |     0 |         pcc::check(ctx, vcode, inst, state)
 350 |     0 |     }
 351 |       |
 352 |       |     type FactFlowState = pcc::FactFlowState;
 353 |       | }