Coverage Report

Created: 2024-01-17 10:31

/src/llvm-project/llvm/lib/Target/X86/X86CompressEVEX.cpp
 Line|  Count|Source
    1|       |//===- X86CompressEVEX.cpp ------------------------------------------------===//
    2|       |//
    3|       |// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    4|       |// See https://llvm.org/LICENSE.txt for license information.
    5|       |// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    6|       |//
    7|       |//===----------------------------------------------------------------------===//
    8|       |//
    9|       |// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
   10|       |// when possible in order to reduce code size or facilitate HW decoding.
   11|       |//
   12|       |// Possible compression:
   13|       |//   a. AVX512 instruction (EVEX) -> AVX instruction (VEX)
   14|       |//   b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
   15|       |//   c. NDD (EVEX) -> non-NDD (legacy)
   16|       |//   d. NF_ND (EVEX) -> NF (EVEX)
   17|       |//
   18|       |// Compression a, b and c can always reduce code size, with some exceptions
   19|       |// such as promoted 16-bit CRC32 which is as long as the legacy version.
   20|       |//
   21|       |// legacy:
   22|       |//   crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
   23|       |// promoted:
   24|       |//   crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
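     |       |// (Both encodings above are six bytes long, so this case gains nothing in
     |       |// size; the compression is still applied for the reason given below.)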
   25|       |//
   26|       |// From a performance perspective, these should be the same (same uops and
   27|       |// same EXE ports). From an FMV perspective, the older legacy encoding is
   28|       |// preferred because it can execute in more places (broader HW install base),
   29|       |// so we still do the compression.
   30|       |//
   31|       |// Compression d can help hardware decode (HW may skip reading the NDD
   32|       |// register) although the instruction length remains unchanged.
   33|       |//===----------------------------------------------------------------------===//
   34|       |
   35|       |#include "MCTargetDesc/X86BaseInfo.h"
   36|       |#include "MCTargetDesc/X86InstComments.h"
   37|       |#include "X86.h"
   38|       |#include "X86InstrInfo.h"
   39|       |#include "X86Subtarget.h"
   40|       |#include "llvm/ADT/StringRef.h"
   41|       |#include "llvm/CodeGen/MachineFunction.h"
   42|       |#include "llvm/CodeGen/MachineFunctionPass.h"
   43|       |#include "llvm/CodeGen/MachineInstr.h"
   44|       |#include "llvm/CodeGen/MachineOperand.h"
   45|       |#include "llvm/MC/MCInstrDesc.h"
   46|       |#include "llvm/Pass.h"
   47|       |#include <atomic>
   48|       |#include <cassert>
   49|       |#include <cstdint>
   50|       |
   51|       |using namespace llvm;
   52|       |
   53|       |// Including the generated EVEX compression tables.
   54|       |struct X86CompressEVEXTableEntry {
   55|       |  uint16_t OldOpc;
   56|       |  uint16_t NewOpc;
   57|       |
   58|  1.57k|  bool operator<(const X86CompressEVEXTableEntry &RHS) const {
   59|  1.57k|    return OldOpc < RHS.OldOpc;
   60|  1.57k|  }
   61|       |
   62|    326|  friend bool operator<(const X86CompressEVEXTableEntry &TE, unsigned Opc) {
   63|    326|    return TE.OldOpc < Opc;
   64|    326|  }
   65|       |};
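     |       |// The member operator< orders two table entries (used by llvm::is_sorted in
     |       |// runOnMachineFunction); the friend overload compares an entry against a raw
     |       |// opcode so that llvm::lower_bound can binary-search the table by OldOpc.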
   66|       |#include "X86GenCompressEVEXTables.inc"
   67|       |
   68|    662|#define COMP_EVEX_DESC "Compressing EVEX instrs when possible"
   69|       |#define COMP_EVEX_NAME "x86-compress-evex"
   70|       |
   71|       |#define DEBUG_TYPE COMP_EVEX_NAME
   72|       |
   73|       |namespace {
   74|       |
   75|       |class CompressEVEXPass : public MachineFunctionPass {
   76|       |public:
   77|       |  static char ID;
   78|    662|  CompressEVEXPass() : MachineFunctionPass(ID) {}
   79|    662|  StringRef getPassName() const override { return COMP_EVEX_DESC; }
   80|       |
   81|       |  bool runOnMachineFunction(MachineFunction &MF) override;
   82|       |
   83|       |  // This pass runs after regalloc and doesn't support VReg operands.
   84|    662|  MachineFunctionProperties getRequiredProperties() const override {
   85|    662|    return MachineFunctionProperties().set(
   86|    662|        MachineFunctionProperties::Property::NoVRegs);
   87|    662|  }
   88|       |};
   89|       |
   90|       |} // end anonymous namespace
   91|       |
   92|       |char CompressEVEXPass::ID = 0;
   93|       |
   94|     31|static bool usesExtendedRegister(const MachineInstr &MI) {
   95|    122|  auto isHiRegIdx = [](unsigned Reg) {
   96|       |    // Check for XMM register with indexes between 16 - 31.
   97|    122|    if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
   98|      0|      return true;
   99|       |    // Check for YMM register with indexes between 16 - 31.
  100|    122|    if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
  101|      0|      return true;
  102|       |    // Check for GPR with indexes between 16 - 31.
  103|    122|    if (X86II::isApxExtendedReg(Reg))
  104|      0|      return true;
  105|    122|    return false;
  106|    122|  };
  107|       |
  108|       |  // Check that operands are not ZMM regs or
  109|       |  // XMM/YMM regs with hi indexes between 16 - 31.
  110|    176|  for (const MachineOperand &MO : MI.explicit_operands()) {
  111|    176|    if (!MO.isReg())
  112|     54|      continue;
  113|       |
  114|    122|    Register Reg = MO.getReg();
  115|    122|    assert(!X86II::isZMMReg(Reg) &&
  116|    122|           "ZMM instructions should not be in the EVEX->VEX tables");
  117|    122|    if (isHiRegIdx(Reg))
  118|      0|      return true;
  119|    122|  }
  120|       |
  121|     31|  return false;
  122|     31|}
  123|       |
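     |       |// The VEX forms of the opcodes below belong to the AVX-NE-CONVERT, AVX-VNNI
     |       |// and AVX-IFMA extensions, which are not implied by AVX512, so the subtarget
     |       |// must be checked before compressing them to VEX.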
  124|     31|static bool checkVEXInstPredicate(unsigned OldOpc, const X86Subtarget &ST) {
  125|     31|  switch (OldOpc) {
  126|     31|  default:
  127|     31|    return true;
  128|      0|  case X86::VCVTNEPS2BF16Z128rm:
  129|      0|  case X86::VCVTNEPS2BF16Z128rr:
  130|      0|  case X86::VCVTNEPS2BF16Z256rm:
  131|      0|  case X86::VCVTNEPS2BF16Z256rr:
  132|      0|    return ST.hasAVXNECONVERT();
  133|      0|  case X86::VPDPBUSDSZ128m:
  134|      0|  case X86::VPDPBUSDSZ128r:
  135|      0|  case X86::VPDPBUSDSZ256m:
  136|      0|  case X86::VPDPBUSDSZ256r:
  137|      0|  case X86::VPDPBUSDZ128m:
  138|      0|  case X86::VPDPBUSDZ128r:
  139|      0|  case X86::VPDPBUSDZ256m:
  140|      0|  case X86::VPDPBUSDZ256r:
  141|      0|  case X86::VPDPWSSDSZ128m:
  142|      0|  case X86::VPDPWSSDSZ128r:
  143|      0|  case X86::VPDPWSSDSZ256m:
  144|      0|  case X86::VPDPWSSDSZ256r:
  145|      0|  case X86::VPDPWSSDZ128m:
  146|      0|  case X86::VPDPWSSDZ128r:
  147|      0|  case X86::VPDPWSSDZ256m:
  148|      0|  case X86::VPDPWSSDZ256r:
  149|      0|    return ST.hasAVXVNNI();
  150|      0|  case X86::VPMADD52HUQZ128m:
  151|      0|  case X86::VPMADD52HUQZ128r:
  152|      0|  case X86::VPMADD52HUQZ256m:
  153|      0|  case X86::VPMADD52HUQZ256r:
  154|      0|  case X86::VPMADD52LUQZ128m:
  155|      0|  case X86::VPMADD52LUQZ128r:
  156|      0|  case X86::VPMADD52LUQZ256m:
  157|      0|  case X86::VPMADD52LUQZ256r:
  158|      0|    return ST.hasAVXIFMA();
  159|     31|  }
  160|     31|}
  161|       |
  162|       |// Do any custom cleanup needed to finalize the conversion.
  163|     31|static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
  164|     31|  (void)NewOpc;
  165|     31|  unsigned Opc = MI.getOpcode();
  166|     31|  switch (Opc) {
  167|      0|  case X86::VALIGNDZ128rri:
  168|      0|  case X86::VALIGNDZ128rmi:
  169|      0|  case X86::VALIGNQZ128rri:
  170|      0|  case X86::VALIGNQZ128rmi: {
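     |       |    // VALIGND/VALIGNQ immediates count elements (dwords/qwords), while
     |       |    // VPALIGNR's immediate counts bytes, so scale by the element size.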
  171|      0|    assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
  172|      0|           "Unexpected new opcode!");
  173|      0|    unsigned Scale =
  174|      0|        (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
  175|      0|    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
  176|      0|    Imm.setImm(Imm.getImm() * Scale);
  177|      0|    break;
  178|      0|  }
  179|      0|  case X86::VSHUFF32X4Z256rmi:
  180|      0|  case X86::VSHUFF32X4Z256rri:
  181|      0|  case X86::VSHUFF64X2Z256rmi:
  182|      0|  case X86::VSHUFF64X2Z256rri:
  183|      0|  case X86::VSHUFI32X4Z256rmi:
  184|      0|  case X86::VSHUFI32X4Z256rri:
  185|      0|  case X86::VSHUFI64X2Z256rmi:
  186|      0|  case X86::VSHUFI64X2Z256rri: {
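     |       |    // The 256-bit VSHUF*X* immediate picks a 128-bit half of src1 with bit 0
     |       |    // and a half of src2 with bit 1; VPERM2F128/VPERM2I128 instead use 2-bit
     |       |    // source/lane selectors in bits 1:0 and 5:4, so the immediate is rebuilt.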
  187|      0|    assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
  188|      0|            NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
  189|      0|           "Unexpected new opcode!");
  190|      0|    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
  191|      0|    int64_t ImmVal = Imm.getImm();
  192|       |    // Set bit 5, move bit 1 to bit 4, copy bit 0.
  193|      0|    Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
  194|      0|    break;
  195|      0|  }
  196|      0|  case X86::VRNDSCALEPDZ128rri:
  197|      0|  case X86::VRNDSCALEPDZ128rmi:
  198|      0|  case X86::VRNDSCALEPSZ128rri:
  199|      0|  case X86::VRNDSCALEPSZ128rmi:
  200|      0|  case X86::VRNDSCALEPDZ256rri:
  201|      0|  case X86::VRNDSCALEPDZ256rmi:
  202|      0|  case X86::VRNDSCALEPSZ256rri:
  203|      0|  case X86::VRNDSCALEPSZ256rmi:
  204|      0|  case X86::VRNDSCALESDZr:
  205|      0|  case X86::VRNDSCALESDZm:
  206|      0|  case X86::VRNDSCALESSZr:
  207|      0|  case X86::VRNDSCALESSZm:
  208|      0|  case X86::VRNDSCALESDZr_Int:
  209|      0|  case X86::VRNDSCALESDZm_Int:
  210|      0|  case X86::VRNDSCALESSZr_Int:
  211|      0|  case X86::VRNDSCALESSZm_Int:
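     |       |    // VRNDSCALE keeps the VROUND-compatible rounding control in imm bits 3:0
     |       |    // and adds a scaling field in bits 7:4 that VROUND cannot encode, so the
     |       |    // compression is only legal when the upper bits are zero.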
  212|      0|    const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
  213|      0|    int64_t ImmVal = Imm.getImm();
  214|       |    // Ensure that only bits 3:0 of the immediate are used.
  215|      0|    if ((ImmVal & 0xf) != ImmVal)
  216|      0|      return false;
  217|      0|    break;
  218|     31|  }
  219|       |
  220|     31|  return true;
  221|     31|}
  222|       |
  223|      0|static bool isRedundantNewDataDest(MachineInstr &MI, const X86Subtarget &ST) {
  224|       |  // $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
  225|       |  //   ->
  226|       |  // $rbx = ADD64rr $rbx, $rax
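     |       |  // An ND (new data destination) instruction writes a destination separate
     |       |  // from its sources; when the destination equals one of the sources (possibly
     |       |  // after commuting), the shorter non-NDD form computes the same result.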
  227|      0|  const MCInstrDesc &Desc = MI.getDesc();
  228|      0|  Register Reg0 = MI.getOperand(0).getReg();
  229|      0|  const MachineOperand &Op1 = MI.getOperand(1);
  230|      0|  if (!Op1.isReg())
  231|      0|    return false;
  232|      0|  Register Reg1 = Op1.getReg();
  233|      0|  if (Reg1 == Reg0)
  234|      0|    return true;
  235|       |
  236|       |  // Op1 and Op2 may be commutable for ND instructions.
  237|      0|  if (!Desc.isCommutable() || Desc.getNumOperands() < 3 ||
  238|      0|      !MI.getOperand(2).isReg() || MI.getOperand(2).getReg() != Reg0)
  239|      0|    return false;
  240|       |  // Opcode may change after commute, e.g. SHRD -> SHLD
  241|       |  // TODO: Add test for this after ND SHRD/SHLD is supported
  242|      0|  ST.getInstrInfo()->commuteInstruction(MI, false, 1, 2);
  243|      0|  return true;
  244|      0|}
  245|       |
  246|  3.99k|static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
  247|  3.99k|  uint64_t TSFlags = MI.getDesc().TSFlags;
  248|       |
  249|       |  // Check for EVEX instructions only.
  250|  3.99k|  if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
  251|  3.96k|    return false;
  252|       |
  253|       |  // Instructions with mask or 512-bit vector can't be converted to VEX.
  254|     31|  if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
  255|      0|    return false;
  256|       |
  257|       |  // EVEX_B has several meanings.
  258|       |  // AVX512:
  259|       |  //  register form: rounding control or SAE
  260|       |  //  memory form: broadcast
  261|       |  //
  262|       |  // APX:
  263|       |  //  MAP4: NDD
  264|       |  //
  265|       |  // For the AVX512 cases, the EVEX prefix is needed in order to carry this
  266|       |  // information, thus preventing the transformation to VEX encoding.
  267|     31|  bool IsND = X86II::hasNewDataDest(TSFlags);
  268|     31|  if (TSFlags & X86II::EVEX_B)
  269|      0|    if (!IsND || !isRedundantNewDataDest(MI, ST))
  270|      0|      return false;
  271|       |
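     |       |  // The generated table is sorted by OldOpc (verified in runOnMachineFunction),
     |       |  // so the candidate replacement can be found by binary search on the opcode.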
  272|     31|  ArrayRef<X86CompressEVEXTableEntry> Table = ArrayRef(X86CompressEVEXTable);
  273|       |
  274|     31|  unsigned Opc = MI.getOpcode();
  275|     31|  const auto *I = llvm::lower_bound(Table, Opc);
  276|     31|  if (I == Table.end() || I->OldOpc != Opc) {
  277|      0|    assert(!IsND && "Missing entry for ND instruction");
  278|      0|    return false;
  279|      0|  }
  280|       |
  281|     31|  if (!IsND) {
  282|     31|    if (usesExtendedRegister(MI) || !checkVEXInstPredicate(Opc, ST) ||
  283|     31|        !performCustomAdjustments(MI, I->NewOpc))
  284|      0|      return false;
  285|     31|  }
  286|       |
  287|     31|  const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(I->NewOpc);
  288|     31|  MI.setDesc(NewDesc);
  289|     31|  unsigned AsmComment;
  290|     31|  switch (NewDesc.TSFlags & X86II::EncodingMask) {
  291|      0|  case X86II::LEGACY:
  292|      0|    AsmComment = X86::AC_EVEX_2_LEGACY;
  293|      0|    break;
  294|     31|  case X86II::VEX:
  295|     31|    AsmComment = X86::AC_EVEX_2_VEX;
  296|     31|    break;
  297|      0|  case X86II::EVEX:
  298|      0|    AsmComment = X86::AC_EVEX_2_EVEX;
  299|      0|    assert(IsND && (NewDesc.TSFlags & X86II::EVEX_NF) &&
  300|      0|           "Unknown EVEX2EVEX compression");
  301|      0|    break;
  302|      0|  default:
  303|      0|    llvm_unreachable("Unknown EVEX compression");
  304|     31|  }
  305|     31|  MI.setAsmPrinterFlag(AsmComment);
  306|     31|  if (IsND)
  307|      0|    MI.tieOperands(0, 1);
  308|       |
  309|     31|  return true;
  310|     31|}
  311|       |
  312|  22.2k|bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
  313|  22.2k|#ifndef NDEBUG
  314|       |  // Make sure the tables are sorted.
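     |       |  // (Done once per process; sortedness is what makes the llvm::lower_bound
     |       |  // lookup in CompressEVEXImpl valid.)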
  315|  22.2k|  static std::atomic<bool> TableChecked(false);
  316|  22.2k|  if (!TableChecked.load(std::memory_order_relaxed)) {
  317|      1|    assert(llvm::is_sorted(X86CompressEVEXTable) &&
  318|      1|           "X86CompressEVEXTable is not sorted!");
  319|      0|    TableChecked.store(true, std::memory_order_relaxed);
  320|      1|  }
  321|      0|#endif
  322|      0|  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  323|  22.2k|  if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD())
  324|  22.1k|    return false;
  325|       |
  326|     25|  bool Changed = false;
  327|       |
  328|    221|  for (MachineBasicBlock &MBB : MF) {
  329|       |    // Traverse the basic block.
  330|    221|    for (MachineInstr &MI : MBB)
  331|  3.99k|      Changed |= CompressEVEXImpl(MI, ST);
  332|    221|  }
  333|       |
  334|     25|  return Changed;
  335|  22.2k|}
  336|       |
  337|       |INITIALIZE_PASS(CompressEVEXPass, COMP_EVEX_NAME, COMP_EVEX_DESC, false, false)
  338|       |
  339|    662|FunctionPass *llvm::createX86CompressEVEXPass() {
  340|    662|  return new CompressEVEXPass();
  341|    662|}