//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will be
/// cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "si-insert-skips"

static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);

namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void kill(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

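/// Check whether it is worth inserting a skip branch over the blocks from
/// \p From up to (but not including) \p To. This is the case when the region
/// reaches the skip threshold, or when it contains an instruction that must
/// not run with EXEC = 0: a VCC branch that could turn a uniform loop
/// infinite, or an instruction with unwanted side effects when the exec mask
/// is empty.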
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  if (From.succ_empty())
    return false;

  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

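/// For pixel shaders, insert an early-exit path after a kill: if all lanes
/// are dead (EXEC = 0), export to the null target and terminate the
/// wavefront instead of stepping through the rest of the shader.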
bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addImm(1)  // vm
    .addImm(0)  // compr
    .addImm(0); // en

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}

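/// Lower a SI_KILL_*_TERMINATOR pseudo into the EXEC mask update that
/// actually disables the killed lanes: a V_CMPX compare for the float
/// compare-with-immediate form, or an S_AND/S_ANDN2 of EXEC for the i1 form.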
void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

    assert(MI.getOperand(0).isReg());

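    // The shorter VOP2 (e32) encoding requires src1 to be a VGPR, so it can
    // only be used when the compared register is a VGPR; otherwise use the
    // VOP3 (e64) form, which takes explicit source modifier and omod operands.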
    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .addReg(AMDGPU::VCC, RegState::Define)
          .addImm(0)  // src0 modifiers
          .add(MI.getOperand(1))
          .addImm(0)  // src1 modifiers
          .add(MI.getOperand(0))
          .addImm(0); // omod
    }
    break;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal)
        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
          .addImm(0);
      break;
    }

    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC)
      .add(Op);
    break;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}

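/// Create an empty block immediately after \p MBB to hold the early-exit
/// code emitted by skipIfDead, and record it as a successor of \p MBB.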
MachineBasicBlock *SIInsertSkips::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);
  MBB.addSuccessor(SkipBB);

  return SkipBB;
}

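// Insert an S_CBRANCH_EXECZ after a SI_MASK_BRANCH so that, when no lanes
// are active, the wave jumps straight to the branch destination instead of
// stepping through the masked-off region, e.g.:
//
//   SI_MASK_BRANCH %bb.flow     ; pseudo, emits no instructions
//   S_CBRANCH_EXECZ %bb.flow    ; inserted here when shouldSkip() says so
//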
// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}

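// Lower SI_MASK_BRANCH and SI_KILL_* pseudos, drop unconditional branches
// that have become redundant, and make sure SI_RETURN_TO_EPILOG only occurs
// at the end of the function by branching to an empty trailing block
// otherwise.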
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // A skip block has been inserted after the current block; remove
          // the unconditional branch and let the non-zero exec mask case
          // skip over the two instructions performing the kill.
          MI.eraseFromParent();
        }
        break;

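      // Lower the kill pseudo in place. If we are not inside divergent
      // control flow, also try to add an early-exit skip block for the case
      // where every lane is now dead.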
      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;

      default:
        break;
      }
    }
  }

  return MadeChange;
}