//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass combines machine instructions at the generic MI level, after the
// legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

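  // Data for rewriting (G_SELECT (G_FCMP Pred, LHS, RHS), True, False) into a
  // legacy min/max; {True, False} must be {LHS, RHS} in some order.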
  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

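  // Only the ordering predicates (lt/le/gt/ge) map onto a single legacy
  // min/max; (un)ordered equality tests and the constant predicates do not.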
  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
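    // e.g. (select (fcmp ult x, y), x, y) -> fmin_legacy(y, x). The legacy
    // opcodes return their second source when the compare fails (as it does
    // for NaN), which reproduces the unordered select's NaN behavior.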
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior: the
    // hardware's legacy compare selects its second operand when the compare
    // fails (including on NaN), so order the operands so that the value the
    // select would produce for NaN ends up second.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
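  // If every bit above the low byte of the source is known zero, the
  // int-to-float conversion is equivalent to the hardware's CVT_F32_UBYTE0.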
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

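  // The byte-convert instruction produces an f32 result, so for an f16
  // destination, convert to f32 first and truncate the result.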
  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

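    // Fold the shift into the byte select when it lands on a byte boundary,
    // e.g. (cvt_f32_ubyte0 (lshr x, 16)) -> (cvt_f32_ubyte2 x).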
    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
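  // The G_AMDGPU_CVT_F32_UBYTE0..3 opcodes are consecutive, so the new opcode
  // can be computed by indexing with the matched byte offset.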
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, a 64-bit shift is a quarter-rate instruction. In the
    // common case, splitting it into a move and a 32-bit shift is faster and
    // the same code size.
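    // For example, (G_LSHR i64 x, 36) becomes an unmerge of x, a 32-bit shift
    // of the high half by 4, and a re-merge with zero for the high result.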
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm