//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPULegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

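// Target-specific match/apply helpers invoked by the TableGen-generated
// post-legalizer combine rules below.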
class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);
};

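// Match an s32 G_SELECT whose condition is a single-use G_FCMP comparing the
// two selected values, so the pair can be rewritten as
// G_AMDGPU_FMIN_LEGACY/G_AMDGPU_FMAX_LEGACY.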
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

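// Rewrite the matched select/fcmp pair as a single legacy min/max. The operand
// order is chosen per predicate so that the legacy instruction's NaN behavior
// reproduces the original select semantics.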
void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

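// Match an integer-to-FP conversion whose source is known (via known-bits) to
// fit in an unsigned byte, so it can be lowered to the byte-converting
// G_AMDGPU_CVT_F32_UBYTE0 instruction.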
bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

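// Fold a constant left/right shift of the source (possibly seen through a
// G_ZEXT) into the byte index of a G_AMDGPU_CVT_F32_UBYTEn instruction.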
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

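// State handed to the TableGen-generated combiner so its match/apply code can
// reach the CombinerHelper and the target-specific helpers above.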
class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

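// Per-instruction entry point: try the generated rules first, then fall back
// to the manually written shift-narrowing combine.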
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm