Coverage Report

Created: 2024-01-17 10:31

/src/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
Line
Count
Source
1
//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
/// The pass tries to use the 32-bit encoding for instructions when possible.
8
//===----------------------------------------------------------------------===//
9
//
10
11
#include "AMDGPU.h"
12
#include "GCNSubtarget.h"
13
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
14
#include "Utils/AMDGPUBaseInfo.h"
15
#include "llvm/ADT/Statistic.h"
16
#include "llvm/CodeGen/MachineFunctionPass.h"
17
18
#define DEBUG_TYPE "si-shrink-instructions"
19
20
STATISTIC(NumInstructionsShrunk,
21
          "Number of 64-bit instruction reduced to 32-bit.");
22
STATISTIC(NumLiteralConstantsFolded,
23
          "Number of literal constants folded into 32-bit instructions.");
24
25
using namespace llvm;
26
27
namespace {
28
29
class SIShrinkInstructions : public MachineFunctionPass {
30
  MachineFunction *MF;
31
  MachineRegisterInfo *MRI;
32
  const GCNSubtarget *ST;
33
  const SIInstrInfo *TII;
34
  const SIRegisterInfo *TRI;
35
36
public:
37
  static char ID;
38
39
public:
40
0
  SIShrinkInstructions() : MachineFunctionPass(ID) {
41
0
  }
42
43
  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
44
  bool shouldShrinkTrue16(MachineInstr &MI) const;
45
  bool isKImmOperand(const MachineOperand &Src) const;
46
  bool isKUImmOperand(const MachineOperand &Src) const;
47
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
48
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
49
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
50
  void shrinkScalarCompare(MachineInstr &MI) const;
51
  void shrinkMIMG(MachineInstr &MI) const;
52
  void shrinkMadFma(MachineInstr &MI) const;
53
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
54
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
55
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
56
                     Register Reg, unsigned SubReg) const;
57
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
58
                    unsigned SubReg) const;
59
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
60
                       unsigned SubReg) const;
61
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
62
                                                   unsigned I) const;
63
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
64
  MachineInstr *matchSwap(MachineInstr &MovT) const;
65
66
  bool runOnMachineFunction(MachineFunction &MF) override;
67
68
0
  StringRef getPassName() const override { return "SI Shrink Instructions"; }
69
70
0
  void getAnalysisUsage(AnalysisUsage &AU) const override {
71
0
    AU.setPreservesCFG();
72
0
    MachineFunctionPass::getAnalysisUsage(AU);
73
0
  }
74
};
75
76
} // End anonymous namespace.
77
78
INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
79
                "SI Shrink Instructions", false, false)
80
81
char SIShrinkInstructions::ID = 0;
82
83
0
FunctionPass *llvm::createSIShrinkInstructionsPass() {
84
0
  return new SIShrinkInstructions();
85
0
}
86
87
/// This function checks \p MI for operands defined by a move immediate
88
/// instruction and then folds the literal constant into the instruction if it
89
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
90
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
91
0
                                          bool TryToCommute) const {
92
0
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
93
94
0
  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
95
96
  // Try to fold Src0
97
0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
98
0
  if (Src0.isReg()) {
99
0
    Register Reg = Src0.getReg();
100
0
    if (Reg.isVirtual()) {
101
0
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
102
0
      if (Def && Def->isMoveImmediate()) {
103
0
        MachineOperand &MovSrc = Def->getOperand(1);
104
0
        bool ConstantFolded = false;
105
106
0
        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
107
0
          if (MovSrc.isImm()) {
108
0
            Src0.ChangeToImmediate(MovSrc.getImm());
109
0
            ConstantFolded = true;
110
0
          } else if (MovSrc.isFI()) {
111
0
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
112
0
            ConstantFolded = true;
113
0
          } else if (MovSrc.isGlobal()) {
114
0
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
115
0
                            MovSrc.getTargetFlags());
116
0
            ConstantFolded = true;
117
0
          }
118
0
        }
119
120
0
        if (ConstantFolded) {
121
0
          if (MRI->use_nodbg_empty(Reg))
122
0
            Def->eraseFromParent();
123
0
          ++NumLiteralConstantsFolded;
124
0
          return true;
125
0
        }
126
0
      }
127
0
    }
128
0
  }
129
130
  // We have failed to fold src0, so commute the instruction and try again.
131
0
  if (TryToCommute && MI.isCommutable()) {
132
0
    if (TII->commuteInstruction(MI)) {
133
0
      if (foldImmediates(MI, false))
134
0
        return true;
135
136
      // Commute back.
137
0
      TII->commuteInstruction(MI);
138
0
    }
139
0
  }
140
141
0
  return false;
142
0
}
143
144
/// Do not shrink the instruction if its registers are not expressible in the
145
/// shrunk encoding.
146
0
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
147
0
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
148
0
    const MachineOperand &MO = MI.getOperand(I);
149
0
    if (MO.isReg()) {
150
0
      Register Reg = MO.getReg();
151
0
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
152
0
                                 "True16 Instructions post-RA");
153
0
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
154
0
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
155
0
        return false;
156
0
    }
157
0
  }
158
0
  return true;
159
0
}
160
161
0
bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
162
0
  return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
163
0
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
164
0
}
165
166
0
bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
167
0
  return isUInt<16>(Src.getImm()) &&
168
0
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
169
0
}
170
171
bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
172
0
                                                bool &IsUnsigned) const {
173
0
  if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
174
0
    IsUnsigned = false;
175
0
    return !TII->isInlineConstant(Src);
176
0
  }
177
178
0
  if (isUInt<16>(Src.getImm())) {
179
0
    IsUnsigned = true;
180
0
    return !TII->isInlineConstant(Src);
181
0
  }
182
183
0
  return false;
184
0
}
185
186
/// \returns true if the constant in \p Src should be replaced with a bitreverse
187
/// of an inline immediate.
188
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
189
0
                                              int32_t &ReverseImm) const {
190
0
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
191
0
    return false;
192
193
0
  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
194
0
  return ReverseImm >= -16 && ReverseImm <= 64;
195
0
}
196
197
/// Copy implicit register operands from specified instruction to this
198
/// instruction that are not part of the instruction definition.
199
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
200
0
                                                MachineInstr &MI) const {
201
0
  MachineFunction &MF = *MI.getMF();
202
0
  for (unsigned i = MI.getDesc().getNumOperands() +
203
0
                    MI.getDesc().implicit_uses().size() +
204
0
                    MI.getDesc().implicit_defs().size(),
205
0
                e = MI.getNumOperands();
206
0
       i != e; ++i) {
207
0
    const MachineOperand &MO = MI.getOperand(i);
208
0
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
209
0
      NewMI.addOperand(MF, MO);
210
0
  }
211
0
}
212
213
0
void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
214
0
  if (!ST->hasSCmpK())
215
0
    return;
216
217
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
218
  // get constants on the RHS.
219
0
  if (!MI.getOperand(0).isReg())
220
0
    TII->commuteInstruction(MI, false, 0, 1);
221
222
  // cmpk requires src0 to be a register
223
0
  const MachineOperand &Src0 = MI.getOperand(0);
224
0
  if (!Src0.isReg())
225
0
    return;
226
227
0
  MachineOperand &Src1 = MI.getOperand(1);
228
0
  if (!Src1.isImm())
229
0
    return;
230
231
0
  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
232
0
  if (SOPKOpc == -1)
233
0
    return;
234
235
  // eq/ne is special because the imm16 can be treated as signed or unsigned,
236
  // and initially selected to the unsigned versions.
237
0
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
238
0
    bool HasUImm;
239
0
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
240
0
      if (!HasUImm) {
241
0
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
242
0
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
243
0
        Src1.setImm(SignExtend32(Src1.getImm(), 32));
244
0
      }
245
246
0
      MI.setDesc(TII->get(SOPKOpc));
247
0
    }
248
249
0
    return;
250
0
  }
251
252
0
  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
253
254
0
  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
255
0
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
256
0
    if (!TII->sopkIsZext(SOPKOpc))
257
0
      Src1.setImm(SignExtend64(Src1.getImm(), 32));
258
0
    MI.setDesc(NewDesc);
259
0
  }
260
0
}
261
262
// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
263
0
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
264
0
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
265
0
  if (!Info)
266
0
    return;
267
268
0
  uint8_t NewEncoding;
269
0
  switch (Info->MIMGEncoding) {
270
0
  case AMDGPU::MIMGEncGfx10NSA:
271
0
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
272
0
    break;
273
0
  case AMDGPU::MIMGEncGfx11NSA:
274
0
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
275
0
    break;
276
0
  default:
277
0
    return;
278
0
  }
279
280
0
  int VAddr0Idx =
281
0
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
282
0
  unsigned NewAddrDwords = Info->VAddrDwords;
283
0
  const TargetRegisterClass *RC;
284
285
0
  if (Info->VAddrDwords == 2) {
286
0
    RC = &AMDGPU::VReg_64RegClass;
287
0
  } else if (Info->VAddrDwords == 3) {
288
0
    RC = &AMDGPU::VReg_96RegClass;
289
0
  } else if (Info->VAddrDwords == 4) {
290
0
    RC = &AMDGPU::VReg_128RegClass;
291
0
  } else if (Info->VAddrDwords == 5) {
292
0
    RC = &AMDGPU::VReg_160RegClass;
293
0
  } else if (Info->VAddrDwords == 6) {
294
0
    RC = &AMDGPU::VReg_192RegClass;
295
0
  } else if (Info->VAddrDwords == 7) {
296
0
    RC = &AMDGPU::VReg_224RegClass;
297
0
  } else if (Info->VAddrDwords == 8) {
298
0
    RC = &AMDGPU::VReg_256RegClass;
299
0
  } else if (Info->VAddrDwords == 9) {
300
0
    RC = &AMDGPU::VReg_288RegClass;
301
0
  } else if (Info->VAddrDwords == 10) {
302
0
    RC = &AMDGPU::VReg_320RegClass;
303
0
  } else if (Info->VAddrDwords == 11) {
304
0
    RC = &AMDGPU::VReg_352RegClass;
305
0
  } else if (Info->VAddrDwords == 12) {
306
0
    RC = &AMDGPU::VReg_384RegClass;
307
0
  } else {
308
0
    RC = &AMDGPU::VReg_512RegClass;
309
0
    NewAddrDwords = 16;
310
0
  }
311
312
0
  unsigned VgprBase = 0;
313
0
  unsigned NextVgpr = 0;
314
0
  bool IsUndef = true;
315
0
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
316
0
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
317
0
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
318
0
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
319
0
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
320
0
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
321
0
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
322
0
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
323
0
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");
324
325
0
    if (Idx == 0) {
326
0
      VgprBase = Vgpr;
327
0
      NextVgpr = Vgpr + Dwords;
328
0
    } else if (Vgpr == NextVgpr) {
329
0
      NextVgpr = Vgpr + Dwords;
330
0
    } else {
331
0
      return;
332
0
    }
333
334
0
    if (!Op.isUndef())
335
0
      IsUndef = false;
336
0
    if (!Op.isKill())
337
0
      IsKill = false;
338
0
  }
339
340
0
  if (VgprBase + NewAddrDwords > 256)
341
0
    return;
342
343
  // Further check for implicit tied operands - these may be present if TFE is
344
  // enabled
345
0
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
346
0
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
347
0
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
348
0
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
349
0
  int ToUntie = -1;
350
0
  if (TFEVal || LWEVal) {
351
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
352
0
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
353
0
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
354
0
          MI.getOperand(i).isImplicit()) {
355
        // This is the tied operand
356
0
        assert(
357
0
            ToUntie == -1 &&
358
0
            "found more than one tied implicit operand when expecting only 1");
359
0
        ToUntie = i;
360
0
        MI.untieRegOperand(ToUntie);
361
0
      }
362
0
    }
363
0
  }
364
365
0
  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
366
0
                                             Info->VDataDwords, NewAddrDwords);
367
0
  MI.setDesc(TII->get(NewOpcode));
368
0
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
369
0
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
370
0
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);
371
372
0
  for (unsigned i = 1; i < EndVAddr; ++i)
373
0
    MI.removeOperand(VAddr0Idx + 1);
374
375
0
  if (ToUntie >= 0) {
376
0
    MI.tieOperands(
377
0
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
378
0
        ToUntie - (EndVAddr - 1));
379
0
  }
380
0
}
381
382
// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
383
0
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
384
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
385
  // there is no reason to try to shrink them.
386
0
  if (!ST->hasVOP3Literal())
387
0
    return;
388
389
  // There is no advantage to doing this pre-RA.
390
0
  if (!MF->getProperties().hasProperty(
391
0
          MachineFunctionProperties::Property::NoVRegs))
392
0
    return;
393
394
0
  if (TII->hasAnyModifiersSet(MI))
395
0
    return;
396
397
0
  const unsigned Opcode = MI.getOpcode();
398
0
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
399
0
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
400
0
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
401
0
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;
402
403
0
  bool Swap;
404
405
  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
406
0
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
407
0
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
408
0
      Swap = false;
409
0
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
410
0
      Swap = true;
411
0
    else
412
0
      return;
413
414
0
    switch (Opcode) {
415
0
    default:
416
0
      llvm_unreachable("Unexpected mad/fma opcode!");
417
0
    case AMDGPU::V_MAD_F32_e64:
418
0
      NewOpcode = AMDGPU::V_MADAK_F32;
419
0
      break;
420
0
    case AMDGPU::V_FMA_F32_e64:
421
0
      NewOpcode = AMDGPU::V_FMAAK_F32;
422
0
      break;
423
0
    case AMDGPU::V_MAD_F16_e64:
424
0
      NewOpcode = AMDGPU::V_MADAK_F16;
425
0
      break;
426
0
    case AMDGPU::V_FMA_F16_e64:
427
0
    case AMDGPU::V_FMA_F16_gfx9_e64:
428
0
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
429
0
                                          : AMDGPU::V_FMAAK_F16;
430
0
      break;
431
0
    }
432
0
  }
433
434
  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
435
0
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
436
0
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
437
0
      Swap = false;
438
0
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
439
0
      Swap = true;
440
0
    else
441
0
      return;
442
443
0
    switch (Opcode) {
444
0
    default:
445
0
      llvm_unreachable("Unexpected mad/fma opcode!");
446
0
    case AMDGPU::V_MAD_F32_e64:
447
0
      NewOpcode = AMDGPU::V_MADMK_F32;
448
0
      break;
449
0
    case AMDGPU::V_FMA_F32_e64:
450
0
      NewOpcode = AMDGPU::V_FMAMK_F32;
451
0
      break;
452
0
    case AMDGPU::V_MAD_F16_e64:
453
0
      NewOpcode = AMDGPU::V_MADMK_F16;
454
0
      break;
455
0
    case AMDGPU::V_FMA_F16_e64:
456
0
    case AMDGPU::V_FMA_F16_gfx9_e64:
457
0
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
458
0
                                          : AMDGPU::V_FMAMK_F16;
459
0
      break;
460
0
    }
461
0
  }
462
463
0
  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
464
0
    return;
465
466
0
  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
467
0
    return;
468
469
0
  if (Swap) {
470
    // Swap Src0 and Src1 by building a new instruction.
471
0
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
472
0
            MI.getOperand(0).getReg())
473
0
        .add(Src1)
474
0
        .add(Src0)
475
0
        .add(Src2)
476
0
        .setMIFlags(MI.getFlags());
477
0
    MI.eraseFromParent();
478
0
  } else {
479
0
    TII->removeModOperands(MI);
480
0
    MI.setDesc(TII->get(NewOpcode));
481
0
  }
482
0
}
483
484
/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
485
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
486
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
487
/// XNOR (as a ^ b == ~(a ^ ~b)).
488
/// \returns true if the caller should continue the machine function iterator
489
0
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
490
0
  unsigned Opc = MI.getOpcode();
491
0
  const MachineOperand *Dest = &MI.getOperand(0);
492
0
  MachineOperand *Src0 = &MI.getOperand(1);
493
0
  MachineOperand *Src1 = &MI.getOperand(2);
494
0
  MachineOperand *SrcReg = Src0;
495
0
  MachineOperand *SrcImm = Src1;
496
497
0
  if (!SrcImm->isImm() ||
498
0
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
499
0
    return false;
500
501
0
  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
502
0
  uint32_t NewImm = 0;
503
504
0
  if (Opc == AMDGPU::S_AND_B32) {
505
0
    if (isPowerOf2_32(~Imm)) {
506
0
      NewImm = llvm::countr_one(Imm);
507
0
      Opc = AMDGPU::S_BITSET0_B32;
508
0
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
509
0
      NewImm = ~Imm;
510
0
      Opc = AMDGPU::S_ANDN2_B32;
511
0
    }
512
0
  } else if (Opc == AMDGPU::S_OR_B32) {
513
0
    if (isPowerOf2_32(Imm)) {
514
0
      NewImm = llvm::countr_zero(Imm);
515
0
      Opc = AMDGPU::S_BITSET1_B32;
516
0
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
517
0
      NewImm = ~Imm;
518
0
      Opc = AMDGPU::S_ORN2_B32;
519
0
    }
520
0
  } else if (Opc == AMDGPU::S_XOR_B32) {
521
0
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
522
0
      NewImm = ~Imm;
523
0
      Opc = AMDGPU::S_XNOR_B32;
524
0
    }
525
0
  } else {
526
0
    llvm_unreachable("unexpected opcode");
527
0
  }
528
529
0
  if (NewImm != 0) {
530
0
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
531
0
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
532
0
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
533
0
      return true;
534
0
    }
535
536
0
    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
537
0
      const bool IsUndef = SrcReg->isUndef();
538
0
      const bool IsKill = SrcReg->isKill();
539
0
      MI.setDesc(TII->get(Opc));
540
0
      if (Opc == AMDGPU::S_BITSET0_B32 ||
541
0
          Opc == AMDGPU::S_BITSET1_B32) {
542
0
        Src0->ChangeToImmediate(NewImm);
543
        // Remove the immediate and add the tied input.
544
0
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
545
0
                                          /*isImp*/ false, IsKill,
546
0
                                          /*isDead*/ false, IsUndef);
547
0
        MI.tieOperands(0, 2);
548
0
      } else {
549
0
        SrcImm->setImm(NewImm);
550
0
      }
551
0
    }
552
0
  }
553
554
0
  return false;
555
0
}
556
557
// This is the same as MachineInstr::readsRegister/modifiesRegister except
558
// it takes subregs into account.
559
bool SIShrinkInstructions::instAccessReg(
560
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
561
0
    unsigned SubReg) const {
562
0
  for (const MachineOperand &MO : R) {
563
0
    if (!MO.isReg())
564
0
      continue;
565
566
0
    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
567
0
      if (TRI->regsOverlap(Reg, MO.getReg()))
568
0
        return true;
569
0
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
570
0
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
571
0
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
572
0
      if (Overlap.any())
573
0
        return true;
574
0
    }
575
0
  }
576
0
  return false;
577
0
}
578
579
bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
580
0
                                        unsigned SubReg) const {
581
0
  return instAccessReg(MI->uses(), Reg, SubReg);
582
0
}
583
584
bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
585
0
                                           unsigned SubReg) const {
586
0
  return instAccessReg(MI->defs(), Reg, SubReg);
587
0
}
588
589
TargetInstrInfo::RegSubRegPair
590
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
591
0
                                        unsigned I) const {
592
0
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
593
0
    if (Reg.isPhysical()) {
594
0
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
595
0
    } else {
596
0
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
597
0
    }
598
0
  }
599
0
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
600
0
}
601
602
void SIShrinkInstructions::dropInstructionKeepingImpDefs(
603
0
    MachineInstr &MI) const {
604
0
  for (unsigned i = MI.getDesc().getNumOperands() +
605
0
                    MI.getDesc().implicit_uses().size() +
606
0
                    MI.getDesc().implicit_defs().size(),
607
0
                e = MI.getNumOperands();
608
0
       i != e; ++i) {
609
0
    const MachineOperand &Op = MI.getOperand(i);
610
0
    if (!Op.isDef())
611
0
      continue;
612
0
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
613
0
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
614
0
  }
615
616
0
  MI.eraseFromParent();
617
0
}
618
619
// Match:
620
// mov t, x
621
// mov x, y
622
// mov y, t
623
//
624
// =>
625
//
626
// mov t, x (t is potentially dead and move eliminated)
627
// v_swap_b32 x, y
628
//
629
// Returns next valid instruction pointer if was able to create v_swap_b32.
630
//
631
// This should not be done too early, so as not to prevent possible folding
632
// which may remove the matched moves. It is preferably done before RA to
633
// release saved registers, and possibly again after RA, which can insert
634
// copies too.
635
//
636
// This is really just a generic peephole that is not a canonical shrinking,
637
// although requirements match the pass placement and it reduces code size too.
638
0
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
639
0
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
640
0
         MovT.getOpcode() == AMDGPU::COPY);
641
642
0
  Register T = MovT.getOperand(0).getReg();
643
0
  unsigned Tsub = MovT.getOperand(0).getSubReg();
644
0
  MachineOperand &Xop = MovT.getOperand(1);
645
646
0
  if (!Xop.isReg())
647
0
    return nullptr;
648
0
  Register X = Xop.getReg();
649
0
  unsigned Xsub = Xop.getSubReg();
650
651
0
  unsigned Size = TII->getOpSize(MovT, 0) / 4;
652
653
0
  if (!TRI->isVGPR(*MRI, X))
654
0
    return nullptr;
655
656
0
  const unsigned SearchLimit = 16;
657
0
  unsigned Count = 0;
658
0
  bool KilledT = false;
659
0
  for (auto Iter = std::next(MovT.getIterator()),
660
0
            E = MovT.getParent()->instr_end();
661
0
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {
662
663
0
    MachineInstr *MovY = &*Iter;
664
0
    KilledT = MovY->killsRegister(T, TRI);
665
666
0
    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
667
0
         MovY->getOpcode() != AMDGPU::COPY) ||
668
0
        !MovY->getOperand(1).isReg()        ||
669
0
        MovY->getOperand(1).getReg() != T   ||
670
0
        MovY->getOperand(1).getSubReg() != Tsub)
671
0
      continue;
672
673
0
    Register Y = MovY->getOperand(0).getReg();
674
0
    unsigned Ysub = MovY->getOperand(0).getSubReg();
675
676
0
    if (!TRI->isVGPR(*MRI, Y))
677
0
      continue;
678
679
0
    MachineInstr *MovX = nullptr;
680
0
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
681
0
         I != IY; ++I) {
682
0
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
683
0
          instModifiesReg(&*I, T, Tsub) ||
684
0
          (MovX && instModifiesReg(&*I, X, Xsub))) {
685
0
        MovX = nullptr;
686
0
        break;
687
0
      }
688
0
      if (!instReadsReg(&*I, Y, Ysub)) {
689
0
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
690
0
          MovX = nullptr;
691
0
          break;
692
0
        }
693
0
        continue;
694
0
      }
695
0
      if (MovX ||
696
0
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
697
0
           I->getOpcode() != AMDGPU::COPY) ||
698
0
          I->getOperand(0).getReg() != X ||
699
0
          I->getOperand(0).getSubReg() != Xsub) {
700
0
        MovX = nullptr;
701
0
        break;
702
0
      }
703
704
0
      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
705
0
        continue;
706
707
0
      MovX = &*I;
708
0
    }
709
710
0
    if (!MovX)
711
0
      continue;
712
713
0
    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);
714
715
0
    for (unsigned I = 0; I < Size; ++I) {
716
0
      TargetInstrInfo::RegSubRegPair X1, Y1;
717
0
      X1 = getSubRegForIndex(X, Xsub, I);
718
0
      Y1 = getSubRegForIndex(Y, Ysub, I);
719
0
      MachineBasicBlock &MBB = *MovT.getParent();
720
0
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
721
0
                         TII->get(AMDGPU::V_SWAP_B32))
722
0
        .addDef(X1.Reg, 0, X1.SubReg)
723
0
        .addDef(Y1.Reg, 0, Y1.SubReg)
724
0
        .addReg(Y1.Reg, 0, Y1.SubReg)
725
0
        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
726
0
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
727
        // Drop implicit EXEC.
728
0
        MIB->removeOperand(MIB->getNumExplicitOperands());
729
0
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
730
0
      }
731
0
    }
732
0
    MovX->eraseFromParent();
733
0
    dropInstructionKeepingImpDefs(*MovY);
734
0
    MachineInstr *Next = &*std::next(MovT.getIterator());
735
736
0
    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
737
0
      dropInstructionKeepingImpDefs(MovT);
738
0
    } else {
739
0
      Xop.setIsKill(false);
740
0
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) {
741
0
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
742
0
        const MachineOperand &Op = MovT.getOperand(OpNo);
743
0
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
744
0
          MovT.removeOperand(OpNo);
745
0
      }
746
0
    }
747
748
0
    return Next;
749
0
  }
750
751
0
  return nullptr;
752
0
}
753
754
// If an instruction has dead sdst replace it with NULL register on gfx1030+
755
0
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
756
0
  if (!ST->hasGFX10_3Insts())
757
0
    return false;
758
759
0
  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
760
0
  if (!Op)
761
0
    return false;
762
0
  Register SDstReg = Op->getReg();
763
0
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
764
0
    return false;
765
766
0
  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
767
0
  return true;
768
0
}
769
770
0
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
771
0
  if (skipFunction(MF.getFunction()))
772
0
    return false;
773
774
0
  this->MF = &MF;
775
0
  MRI = &MF.getRegInfo();
776
0
  ST = &MF.getSubtarget<GCNSubtarget>();
777
0
  TII = ST->getInstrInfo();
778
0
  TRI = &TII->getRegisterInfo();
779
780
0
  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
781
782
0
  std::vector<unsigned> I1Defs;
783
784
0
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
785
0
                                                  BI != BE; ++BI) {
786
787
0
    MachineBasicBlock &MBB = *BI;
788
0
    MachineBasicBlock::iterator I, Next;
789
0
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
790
0
      Next = std::next(I);
791
0
      MachineInstr &MI = *I;
792
793
0
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
794
        // If this has a literal constant source that is the same as the
795
        // reversed bits of an inline immediate, replace with a bitreverse of
796
        // that constant. This saves 4 bytes in the common case of materializing
797
        // sign bits.
798
799
        // Test if we are after regalloc. We only want to do this after any
800
        // optimizations happen because this will confuse them.
801
        // XXX - not exactly a check for post-regalloc run.
802
0
        MachineOperand &Src = MI.getOperand(1);
803
0
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
804
0
          int32_t ReverseImm;
805
0
          if (isReverseInlineImm(Src, ReverseImm)) {
806
0
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
807
0
            Src.setImm(ReverseImm);
808
0
            continue;
809
0
          }
810
0
        }
811
0
      }
812
813
0
      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
814
0
                            MI.getOpcode() == AMDGPU::COPY)) {
815
0
        if (auto *NextMI = matchSwap(MI)) {
816
0
          Next = NextMI->getIterator();
817
0
          continue;
818
0
        }
819
0
      }
820
821
      // Try to use S_ADDK_I32 and S_MULK_I32.
822
0
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
823
0
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
824
0
        const MachineOperand *Dest = &MI.getOperand(0);
825
0
        MachineOperand *Src0 = &MI.getOperand(1);
826
0
        MachineOperand *Src1 = &MI.getOperand(2);
827
828
0
        if (!Src0->isReg() && Src1->isReg()) {
829
0
          if (TII->commuteInstruction(MI, false, 1, 2))
830
0
            std::swap(Src0, Src1);
831
0
        }
832
833
        // FIXME: This could work better if hints worked with subregisters. If
834
        // we have a vector add of a constant, we usually don't get the correct
835
        // allocation due to the subregister usage.
836
0
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
837
0
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
838
0
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
839
0
          continue;
840
0
        }
841
842
0
        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
843
0
          if (Src1->isImm() && isKImmOperand(*Src1)) {
844
0
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
845
0
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
846
847
0
            Src1->setImm(SignExtend64(Src1->getImm(), 32));
848
0
            MI.setDesc(TII->get(Opc));
849
0
            MI.tieOperands(0, 1);
850
0
          }
851
0
        }
852
0
      }
853
854
      // Try to use s_cmpk_*
855
0
      if (MI.isCompare() && TII->isSOPC(MI)) {
856
0
        shrinkScalarCompare(MI);
857
0
        continue;
858
0
      }
859
860
      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
861
0
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
862
0
        const MachineOperand &Dst = MI.getOperand(0);
863
0
        MachineOperand &Src = MI.getOperand(1);
864
865
0
        if (Src.isImm() && Dst.getReg().isPhysical()) {
866
0
          int32_t ReverseImm;
867
0
          if (isKImmOperand(Src)) {
868
0
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
869
0
            Src.setImm(SignExtend64(Src.getImm(), 32));
870
0
          } else if (isReverseInlineImm(Src, ReverseImm)) {
871
0
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
872
0
            Src.setImm(ReverseImm);
873
0
          }
874
0
        }
875
876
0
        continue;
877
0
      }
878
879
      // Shrink scalar logic operations.
880
0
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
881
0
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
882
0
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
883
0
        if (shrinkScalarLogicOp(MI))
884
0
          continue;
885
0
      }
886
887
0
      if (TII->isMIMG(MI.getOpcode()) &&
888
0
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
889
0
          MF.getProperties().hasProperty(
890
0
              MachineFunctionProperties::Property::NoVRegs)) {
891
0
        shrinkMIMG(MI);
892
0
        continue;
893
0
      }
894
895
0
      if (!TII->isVOP3(MI))
896
0
        continue;
897
898
0
      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
899
0
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
900
0
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
901
0
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
902
0
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
903
0
        shrinkMadFma(MI);
904
0
        continue;
905
0
      }
906
907
0
      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
908
        // If there is no chance we will shrink it and use VCC as sdst to get
909
        // a 32 bit form try to replace dead sdst with NULL.
910
0
        tryReplaceDeadSDST(MI);
911
0
        continue;
912
0
      }
913
914
0
      if (!TII->canShrink(MI, *MRI)) {
915
        // Try commuting the instruction and see if that enables us to shrink
916
        // it.
917
0
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
918
0
            !TII->canShrink(MI, *MRI)) {
919
0
          tryReplaceDeadSDST(MI);
920
0
          continue;
921
0
        }
922
0
      }
923
924
0
      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
925
926
0
      if (TII->isVOPC(Op32)) {
927
0
        MachineOperand &Op0 = MI.getOperand(0);
928
0
        if (Op0.isReg()) {
929
          // Exclude VOPCX instructions as these don't explicitly write a
930
          // dst.
931
0
          Register DstReg = Op0.getReg();
932
0
          if (DstReg.isVirtual()) {
933
            // VOPC instructions can only write to the VCC register. We can't
934
            // force them to use VCC here, because this is only one register and
935
            // cannot deal with sequences which would require multiple copies of
936
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
937
            //
938
            // So, instead of forcing the instruction to write to VCC, we
939
            // provide a hint to the register allocator to use VCC and then we
940
            // will run this pass again after RA and shrink it if it outputs to
941
            // VCC.
942
0
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
943
0
            continue;
944
0
          }
945
0
          if (DstReg != VCCReg)
946
0
            continue;
947
0
        }
948
0
      }
949
950
0
      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
951
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
952
        // instructions.
953
0
        const MachineOperand *Src2 =
954
0
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
955
0
        if (!Src2->isReg())
956
0
          continue;
957
0
        Register SReg = Src2->getReg();
958
0
        if (SReg.isVirtual()) {
959
0
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
960
0
          continue;
961
0
        }
962
0
        if (SReg != VCCReg)
963
0
          continue;
964
0
      }
965
966
      // Check for the bool flag output for instructions like V_ADD_I32_e64.
967
0
      const MachineOperand *SDst = TII->getNamedOperand(MI,
968
0
                                                        AMDGPU::OpName::sdst);
969
970
0
      if (SDst) {
971
0
        bool Next = false;
972
973
0
        if (SDst->getReg() != VCCReg) {
974
0
          if (SDst->getReg().isVirtual())
975
0
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
976
0
          Next = true;
977
0
        }
978
979
        // All of the instructions with carry outs also have an SGPR input in
980
        // src2.
981
0
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
982
0
                                                          AMDGPU::OpName::src2);
983
0
        if (Src2 && Src2->getReg() != VCCReg) {
984
0
          if (Src2->getReg().isVirtual())
985
0
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
986
0
          Next = true;
987
0
        }
988
989
0
        if (Next)
990
0
          continue;
991
0
      }
992
993
      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
994
      // fold an immediate into the shrunk instruction as a literal operand. In
995
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
996
      // no advantage to doing this.
997
0
      if (ST->hasVOP3Literal() &&
998
0
          !MF.getProperties().hasProperty(
999
0
              MachineFunctionProperties::Property::NoVRegs))
1000
0
        continue;
1001
1002
0
      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
1003
0
          !shouldShrinkTrue16(MI))
1004
0
        continue;
1005
1006
      // We can shrink this instruction
1007
0
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);
1008
1009
0
      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
1010
0
      ++NumInstructionsShrunk;
1011
1012
      // Copy extra operands not present in the instruction definition.
1013
0
      copyExtraImplicitOps(*Inst32, MI);
1014
1015
      // Copy deadness from the old explicit vcc def to the new implicit def.
1016
0
      if (SDst && SDst->isDead())
1017
0
        Inst32->findRegisterDefOperand(VCCReg)->setIsDead();
1018
1019
0
      MI.eraseFromParent();
1020
0
      foldImmediates(*Inst32);
1021
1022
0
      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
1023
0
    }
1024
0
  }
1025
0
  return false;
1026
0
}