//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUMCInstLower.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Shrink Instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
                   const MachineRegisterInfo &MRI) {
  if (!MO->isReg())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));

  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
}

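/// Check whether the VOP3 form of \p MI could use a 32-bit (VOP1, VOP2, or
/// VOPC) encoding instead. The 32-bit encodings have no room for source or
/// output modifiers, src1 (when present) must be a VGPR, and a third source
/// operand is only acceptable in the special cases handled below.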
static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
                      const SIRegisterInfo &TRI,
                      const MachineRegisterInfo &MRI) {

  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  // Can't shrink instructions with three operands.
  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
  // a special case for it.  It can only be shrunk if the third operand
  // is vcc.  We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then do the shrinking
  // post-regalloc.
  if (Src2) {
    switch (MI.getOpcode()) {
      default: return false;

      case AMDGPU::V_MAC_F32_e64:
        if (!isVGPR(Src2, TRI, MRI) ||
            TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
          return false;
        break;

      case AMDGPU::V_CNDMASK_B32_e64:
        break;
    }
  }

  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mod =
      TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

  if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
    return false;

  // We don't need to check src0 since all input types are legal for it, so
  // just make sure src0 isn't using any modifiers.
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    return false;

  // Check output modifiers
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}

/// \brief This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can.  This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
/// and will only fold literal constants if we are still in SSA.
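///
/// For example (illustrative, with made-up virtual register names), given
///   %a = V_MOV_B32_e32 0x12345678
///   %d = V_ADD_F32_e32 %a, %b     ; the only use of %a
/// the literal is folded to give
///   %d = V_ADD_F32_e32 0x12345678, %b
/// and the now-dead V_MOV_B32 is erased.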
static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {

  if (!MRI.isSSA())
    return;

  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  // Only one literal constant is allowed per instruction, so if src0 is a
  // literal constant then we can't do any folding.
  if (Src0.isImm() &&
      TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
    return;

  // Literal constants and SGPRs can only be used in Src0, so if Src0 is an
  // SGPR, we cannot commute the instruction, so we can't fold any literal
  // constants.
  if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI))
    return;

  // Try to fold Src0
  if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
    unsigned Reg = Src0.getReg();
    MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &MovSrc = Def->getOperand(1);
      bool ConstantFolded = false;

      if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
        Src0.ChangeToImmediate(MovSrc.getImm());
        ConstantFolded = true;
      }
      if (ConstantFolded) {
        if (MRI.use_empty(Reg))
          Def->eraseFromParent();
        ++NumLiteralConstantsFolded;
        return;
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
    foldImmediates(MI, TII, MRI, false);

}

// Copy MachineOperand with all flags except setting it as implicit.
static void copyFlagsToImplicitVCC(MachineInstr &MI,
                                   const MachineOperand &Orig) {

  for (MachineOperand &Use : MI.implicit_operands()) {
    if (Use.getReg() == AMDGPU::VCC) {
      Use.setIsUndef(Orig.isUndef());
      Use.setIsKill(Orig.isKill());
      return;
    }
  }
}

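/// A "KImm" operand is a signed 16-bit immediate that fits in the simm16
/// field of an SOPK instruction (e.g. S_MOVK_I32), so no extra 32-bit literal
/// dword is needed. Inline constants are excluded because the regular 32-bit
/// encoding already carries them for free.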
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.
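        //
        // For example, the sign-bit pattern 0x80000000 is not an inline
        // immediate, but its bit-reverse is 1, which is. So
        //   v_mov_b32_e32 v0, 0x80000000   ; 32-bit encoding + 32-bit literal
        // becomes
        //   v_bfrev_b32_e32 v0, 1          ; 32-bit encoding, no literal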

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int64_t Imm = Src.getImm();
          if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) {
            int32_t ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Imm));
            if (ReverseImm >= -16 && ReverseImm <= 64) {
              MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
              Src.setImm(ReverseImm);
              continue;
            }
          }
        }
      }

      // Combine adjacent s_nops to use the immediate operand encoding how long
      // to wait.
      //
      // s_nop N
      // s_nop M
      //  =>
      // s_nop (N + M)
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // after adding.
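        //
        // For example, s_nop 2 followed by s_nop 3 waits 3 + 4 = 7 cycles in
        // total, which re-encodes as a single s_nop 6. The merge is only done
        // when the combined wait stays within the 8-cycle bound checked below.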
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand &Dest = MI.getOperand(0);
        const MachineOperand &Src0 = MI.getOperand(1);
        const MachineOperand &Src1 = MI.getOperand(2);

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) &&
            Src0.isReg()) {
          MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg());
          continue;
        }

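        // If dst and src0 are already the same register and the other operand
        // fits in 16 bits, switch to the tied SOPK form, e.g.
        //   s_add_i32 s0, s0, 0x1234  ->  s_addk_i32 s0, 0x1234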
        if (Src0.isReg() && Src0.getReg() == Dest.getReg()) {
          if (Src1.isImm() && isKImmOperand(TII, Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && isKImmOperand(TII, Src))
          MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));

        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because there is only one VCC
          // register and it cannot accommodate sequences which would require
          // multiple copies of VCC, e.g.
          // S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // We can shrink this instruction
      DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
      // For VOPC instructions, this is replaced by an implicit def of vcc.
      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
      if (Op32DstIdx != -1) {
        // dst
        Inst32.addOperand(MI.getOperand(0));
      } else {
        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
               "Unexpected case");
      }

      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.addOperand(*Src1);

      const MachineOperand *Src2 =
        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      if (Src2) {
        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
        if (Op32Src2Idx != -1) {
          Inst32.addOperand(*Src2);
        } else {
          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
          // replaced with an implicit read of vcc. This was already added
          // during the initial BuildMI, so find it to preserve the flags.
          copyFlagsToImplicitVCC(*Inst32, *Src2);
        }
      }

      ++NumInstructionsShrunk;
      MI.eraseFromParent();

      foldImmediates(*Inst32, TII, MRI);
      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}