1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/CodeGen/TargetOpcodes.h"
26 #include "llvm/CodeGen/ValueTypes.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/Debug.h"
31 
32 #define DEBUG_TYPE "amdgpu-legalinfo"
33 
34 using namespace llvm;
35 using namespace LegalizeActions;
36 using namespace LegalizeMutations;
37 using namespace LegalityPredicates;
38 using namespace MIPatternMatch;
39 
40 // Hack until load/store selection patterns support any tuple of legal types.
41 static cl::opt<bool> EnableNewLegality(
42   "amdgpu-global-isel-new-legality",
43   cl::desc("Use GlobalISel desired legality, rather than try to use "
44            "rules compatible with selection patterns"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
48 static constexpr unsigned MaxRegisterSize = 1024;
49 
50 // Round the number of elements up to the next power of two.
51 static LLT getPow2VectorType(LLT Ty) {
52   unsigned NElts = Ty.getNumElements();
53   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
54   return Ty.changeNumElements(Pow2NElts);
55 }
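// For illustration, a few concrete mappings of this helper:
//   getPow2VectorType(LLT::vector(3, 16)) == LLT::vector(4, 16)
//   getPow2VectorType(LLT::vector(5, 32)) == LLT::vector(8, 32)
// Power-of-two element counts are returned unchanged.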
56 
57 // Round the scalar size up to the next power-of-two number of bits.
58 static LLT getPow2ScalarType(LLT Ty) {
59   unsigned Bits = Ty.getSizeInBits();
60   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
61   return LLT::scalar(Pow2Bits);
62 }
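// For illustration, this always produces a scalar, even for a vector input:
//   getPow2ScalarType(LLT::scalar(17))     == LLT::scalar(32)
//   getPow2ScalarType(LLT::vector(3, 16))  == LLT::scalar(64)  // 48 bits -> s64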
63 
64 /// \returns true if this is an odd sized vector which should widen by adding an
65 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
66 /// excludes s1 vectors, which should always be scalarized.
67 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     if (!Ty.isVector())
71       return false;
72 
73     const LLT EltTy = Ty.getElementType();
74     const unsigned EltSize = EltTy.getSizeInBits();
75     return Ty.getNumElements() % 2 != 0 &&
76            EltSize > 1 && EltSize < 32 &&
77            Ty.getSizeInBits() % 32 != 0;
78   };
79 }
80 
81 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     return Ty.getSizeInBits() % 32 == 0;
85   };
86 }
87 
88 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getScalarType();
92     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
93   };
94 }
95 
96 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99     const LLT EltTy = Ty.getElementType();
100     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
101   };
102 }
103 
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105   return [=](const LegalityQuery &Query) {
106     const LLT Ty = Query.Types[TypeIdx];
107     const LLT EltTy = Ty.getElementType();
108     unsigned Size = Ty.getSizeInBits();
109     unsigned Pieces = (Size + 63) / 64;
110     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
112   };
113 }
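// For illustration, the returned type is the per-piece type:
//   <4 x s32> (128 bits) -> 2 pieces -> <2 x s32>
//   <3 x s32> (96 bits)  -> 2 pieces -> <2 x s32>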
114 
115 // Increase the number of vector elements until the total size reaches the next
116 // multiple of 32 bits.
117 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
118   return [=](const LegalityQuery &Query) {
119     const LLT Ty = Query.Types[TypeIdx];
120 
121     const LLT EltTy = Ty.getElementType();
122     const int Size = Ty.getSizeInBits();
123     const int EltSize = EltTy.getSizeInBits();
124     const int NextMul32 = (Size + 31) / 32;
125 
126     assert(EltSize < 32);
127 
128     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
129     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
130   };
131 }
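// For illustration:
//   <3 x s8>  (24 bits) -> <4 x s8>  (32 bits)
//   <5 x s16> (80 bits) -> <6 x s16> (96 bits)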
132 
133 static LLT getBitcastRegisterType(const LLT Ty) {
134   const unsigned Size = Ty.getSizeInBits();
135 
136   LLT CoercedTy;
137   if (Size <= 32) {
138     // <2 x s8> -> s16
139     // <4 x s8> -> s32
140     return LLT::scalar(Size);
141   }
142 
143   return LLT::scalarOrVector(Size / 32, 32);
144 }
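// For illustration of the Size > 32 case:
//   <6 x s16> (96 bits) -> <3 x s32>
//   <8 x s8>  (64 bits) -> <2 x s32>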
145 
146 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
147   return [=](const LegalityQuery &Query) {
148     const LLT Ty = Query.Types[TypeIdx];
149     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
150   };
151 }
152 
153 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     unsigned Size = Ty.getSizeInBits();
157     assert(Size % 32 == 0);
158     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
159   };
160 }
161 
162 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
163   return [=](const LegalityQuery &Query) {
164     const LLT QueryTy = Query.Types[TypeIdx];
165     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
166   };
167 }
168 
169 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
170   return [=](const LegalityQuery &Query) {
171     const LLT QueryTy = Query.Types[TypeIdx];
172     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
173   };
174 }
175 
176 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
177   return [=](const LegalityQuery &Query) {
178     const LLT QueryTy = Query.Types[TypeIdx];
179     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
180   };
181 }
182 
183 static bool isRegisterSize(unsigned Size) {
184   return Size % 32 == 0 && Size <= MaxRegisterSize;
185 }
186 
187 static bool isRegisterVectorElementType(LLT EltTy) {
188   const int EltSize = EltTy.getSizeInBits();
189   return EltSize == 16 || EltSize % 32 == 0;
190 }
191 
192 static bool isRegisterVectorType(LLT Ty) {
193   const int EltSize = Ty.getElementType().getSizeInBits();
194   return EltSize == 32 || EltSize == 64 ||
195          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
196          EltSize == 128 || EltSize == 256;
197 }
198 
199 static bool isRegisterType(LLT Ty) {
200   if (!isRegisterSize(Ty.getSizeInBits()))
201     return false;
202 
203   if (Ty.isVector())
204     return isRegisterVectorType(Ty);
205 
206   return true;
207 }
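// For illustration: s96, v2s16, v4s16 and v2s64 all count as register types,
// while v3s16 (48 bits) and s40 do not, since their sizes are not multiples
// of 32 bits.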
208 
209 // Any combination of 32 or 64-bit elements up to the maximum register size, and
210 // multiples of v2s16.
211 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
212   return [=](const LegalityQuery &Query) {
213     return isRegisterType(Query.Types[TypeIdx]);
214   };
215 }
216 
217 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
218   return [=](const LegalityQuery &Query) {
219     const LLT QueryTy = Query.Types[TypeIdx];
220     if (!QueryTy.isVector())
221       return false;
222     const LLT EltTy = QueryTy.getElementType();
223     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
224   };
225 }
226 
227 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
228   return [=](const LegalityQuery &Query) {
229     const LLT Ty = Query.Types[TypeIdx];
230     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232   };
233 }
234 
235 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
236 // handle some operations by just promoting the register during
237 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
238 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239                                     bool IsLoad) {
240   switch (AS) {
241   case AMDGPUAS::PRIVATE_ADDRESS:
242     // FIXME: Private element size.
243     return 32;
244   case AMDGPUAS::LOCAL_ADDRESS:
245     return ST.useDS128() ? 128 : 64;
246   case AMDGPUAS::GLOBAL_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS:
248   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
249     // Treat constant and global as identical. SMRD loads are sometimes usable for
250     // global loads (ideally constant address space should be eliminated)
251     // depending on the context. Legality cannot be context dependent, but
252     // RegBankSelect can split the load as necessary depending on the pointer
253     // register bank/uniformity and if the memory is invariant or not written in a
254     // kernel.
255     return IsLoad ? 512 : 128;
256   default:
257     // Flat addresses may contextually need to be split to 32-bit parts if they
258     // may alias scratch depending on the subtarget.
259     return 128;
260   }
261 }
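// For illustration: a 256-bit access is within the 512-bit load limit for
// global or constant memory, but must be split for LDS, where the limit is
// 64 bits (128 with DS128) per access.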
262 
263 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264                                  const LegalityQuery &Query,
265                                  unsigned Opcode) {
266   const LLT Ty = Query.Types[0];
267 
268   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
269   const bool IsLoad = Opcode != AMDGPU::G_STORE;
270 
271   unsigned RegSize = Ty.getSizeInBits();
272   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
273   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
274   unsigned AS = Query.Types[1].getAddressSpace();
275 
276   // All of these need to be custom lowered to cast the pointer operand.
277   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
278     return false;
279 
280   // TODO: We should be able to widen loads if the alignment is high enough, but
281   // we also need to modify the memory access size.
282 #if 0
283   // Accept widening loads based on alignment.
284   if (IsLoad && MemSize < Size)
285     MemSize = std::max(MemSize, Align);
286 #endif
287 
288   // Only 1-byte and 2-byte to 32-bit extloads are valid.
289   if (MemSize != RegSize && RegSize != 32)
290     return false;
291 
292   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
293     return false;
294 
295   switch (MemSize) {
296   case 8:
297   case 16:
298   case 32:
299   case 64:
300   case 128:
301     break;
302   case 96:
303     if (!ST.hasDwordx3LoadStores())
304       return false;
305     break;
306   case 256:
307   case 512:
308     // These may contextually need to be broken down.
309     break;
310   default:
311     return false;
312   }
313 
314   assert(RegSize >= MemSize);
315 
316   if (AlignBits < MemSize) {
317     const SITargetLowering *TLI = ST.getTargetLowering();
318     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
319                                                  Align(AlignBits / 8)))
320       return false;
321   }
322 
323   return true;
324 }
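// For illustration (assuming sufficient alignment): an s32 extload from 8- or
// 16-bit global memory is accepted; a 96-bit access is only accepted when the
// subtarget has dwordx3 load/stores; an s64 extload from 16-bit memory is
// rejected, since only extloads into 32-bit registers are allowed here.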
325 
326 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
327 // work around this. Eventually it should ignore the type for loads and only care
328 // about the size. Return true in cases where we will work around this for now by
329 // bitcasting.
330 static bool loadStoreBitcastWorkaround(const LLT Ty) {
331   if (EnableNewLegality)
332     return false;
333 
334   const unsigned Size = Ty.getSizeInBits();
335   if (Size <= 64)
336     return false;
337   if (!Ty.isVector())
338     return true;
339 
340   LLT EltTy = Ty.getElementType();
341   if (EltTy.isPointer())
342     return true;
343 
344   unsigned EltSize = EltTy.getSizeInBits();
345   return EltSize != 32 && EltSize != 64;
346 }
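// For illustration (with the default amdgpu-global-isel-new-legality=false):
// s96, s128, <6 x s16> and <8 x s16> take the workaround, while <3 x s32>,
// <2 x s64> and anything of 64 bits or fewer do not.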
347 
348 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
349                              unsigned Opcode) {
350   const LLT Ty = Query.Types[0];
351   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
352          !loadStoreBitcastWorkaround(Ty);
353 }
354 
355 /// Return true if a load or store of the type should be lowered with a bitcast
356 /// to a different type.
357 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
358                                        const unsigned MemSizeInBits) {
359   const unsigned Size = Ty.getSizeInBits();
360   if (Size != MemSizeInBits)
361     return Size <= 32 && Ty.isVector();
362 
363   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
364     return true;
365   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
366          !isRegisterVectorElementType(Ty.getElementType());
367 }
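// For illustration: a <4 x s16> access matching 64-bit memory is left alone,
// while a <6 x s16> access matching 96-bit memory is bitcast (per the
// workaround above) to <3 x s32>.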
368 
369 /// Return true if we should legalize a load by widening an odd sized memory
370 /// access up to the alignment. Note that in this case the memory access itself
371 /// changes, not the size of the result register.
372 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
373                             unsigned AlignInBits, unsigned AddrSpace,
374                             unsigned Opcode) {
375   // We don't want to widen cases that are naturally legal.
376   if (isPowerOf2_32(SizeInBits))
377     return false;
378 
379   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
380   // end up widening these for a scalar load during RegBankSelect, since there
381   // aren't 96-bit scalar loads.
382   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
383     return false;
384 
385   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
386     return false;
387 
388   // A load is known dereferenceable up to the alignment, so it's legal to widen
389   // the access to that size.
390   //
391   // TODO: Could check dereferenceable for less aligned cases.
392   unsigned RoundedSize = NextPowerOf2(SizeInBits);
393   if (AlignInBits < RoundedSize)
394     return false;
395 
396   // Do not widen if it would introduce a slow unaligned load.
397   const SITargetLowering *TLI = ST.getTargetLowering();
398   bool Fast = false;
399   return TLI->allowsMisalignedMemoryAccessesImpl(
400              RoundedSize, AddrSpace, Align(AlignInBits / 8),
401              MachineMemOperand::MOLoad, &Fast) &&
402          Fast;
403 }
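// For illustration: a non-atomic 96-bit global load that is 128-bit aligned
// would be widened to 128 bits on a subtarget without 96-bit load/stores,
// provided allowsMisalignedMemoryAccessesImpl reports the rounded-up access
// as fast.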
404 
405 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
406                             unsigned Opcode) {
407   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
408     return false;
409 
410   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
411                          Query.MMODescrs[0].AlignInBits,
412                          Query.Types[1].getAddressSpace(), Opcode);
413 }
414 
415 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
416                                          const GCNTargetMachine &TM)
417   :  ST(ST_) {
418   using namespace TargetOpcode;
419 
420   auto GetAddrSpacePtr = [&TM](unsigned AS) {
421     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
422   };
423 
424   const LLT S1 = LLT::scalar(1);
425   const LLT S8 = LLT::scalar(8);
426   const LLT S16 = LLT::scalar(16);
427   const LLT S32 = LLT::scalar(32);
428   const LLT S64 = LLT::scalar(64);
429   const LLT S128 = LLT::scalar(128);
430   const LLT S256 = LLT::scalar(256);
431   const LLT S512 = LLT::scalar(512);
432   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
433 
434   const LLT V2S8 = LLT::vector(2, 8);
435   const LLT V2S16 = LLT::vector(2, 16);
436   const LLT V4S16 = LLT::vector(4, 16);
437 
438   const LLT V2S32 = LLT::vector(2, 32);
439   const LLT V3S32 = LLT::vector(3, 32);
440   const LLT V4S32 = LLT::vector(4, 32);
441   const LLT V5S32 = LLT::vector(5, 32);
442   const LLT V6S32 = LLT::vector(6, 32);
443   const LLT V7S32 = LLT::vector(7, 32);
444   const LLT V8S32 = LLT::vector(8, 32);
445   const LLT V9S32 = LLT::vector(9, 32);
446   const LLT V10S32 = LLT::vector(10, 32);
447   const LLT V11S32 = LLT::vector(11, 32);
448   const LLT V12S32 = LLT::vector(12, 32);
449   const LLT V13S32 = LLT::vector(13, 32);
450   const LLT V14S32 = LLT::vector(14, 32);
451   const LLT V15S32 = LLT::vector(15, 32);
452   const LLT V16S32 = LLT::vector(16, 32);
453   const LLT V32S32 = LLT::vector(32, 32);
454 
455   const LLT V2S64 = LLT::vector(2, 64);
456   const LLT V3S64 = LLT::vector(3, 64);
457   const LLT V4S64 = LLT::vector(4, 64);
458   const LLT V5S64 = LLT::vector(5, 64);
459   const LLT V6S64 = LLT::vector(6, 64);
460   const LLT V7S64 = LLT::vector(7, 64);
461   const LLT V8S64 = LLT::vector(8, 64);
462   const LLT V16S64 = LLT::vector(16, 64);
463 
464   std::initializer_list<LLT> AllS32Vectors =
465     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
466      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
467   std::initializer_list<LLT> AllS64Vectors =
468     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
469 
470   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
471   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
472   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
473   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
474   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
475   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
476   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
477 
478   const LLT CodePtr = FlatPtr;
479 
480   const std::initializer_list<LLT> AddrSpaces64 = {
481     GlobalPtr, ConstantPtr, FlatPtr
482   };
483 
484   const std::initializer_list<LLT> AddrSpaces32 = {
485     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
486   };
487 
488   const std::initializer_list<LLT> FPTypesBase = {
489     S32, S64
490   };
491 
492   const std::initializer_list<LLT> FPTypes16 = {
493     S32, S64, S16
494   };
495 
496   const std::initializer_list<LLT> FPTypesPK16 = {
497     S32, S64, S16, V2S16
498   };
499 
500   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
501 
502   setAction({G_BRCOND, S1}, Legal); // VCC branches
503   setAction({G_BRCOND, S32}, Legal); // SCC branches
504 
505   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
506   // elements for v3s16
507   getActionDefinitionsBuilder(G_PHI)
508     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
509     .legalFor(AllS32Vectors)
510     .legalFor(AllS64Vectors)
511     .legalFor(AddrSpaces64)
512     .legalFor(AddrSpaces32)
513     .legalIf(isPointer(0))
514     .clampScalar(0, S16, S256)
515     .widenScalarToNextPow2(0, 32)
516     .clampMaxNumElements(0, S32, 16)
517     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
518     .scalarize(0);
519 
520   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
521     // Full set of gfx9 features.
522     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
523       .legalFor({S32, S16, V2S16})
524       .clampScalar(0, S16, S32)
525       .clampMaxNumElements(0, S16, 2)
526       .scalarize(0)
527       .widenScalarToNextPow2(0, 32);
528 
529     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
530       .legalFor({S32, S16, V2S16}) // Clamp modifier
531       .minScalarOrElt(0, S16)
532       .clampMaxNumElements(0, S16, 2)
533       .scalarize(0)
534       .widenScalarToNextPow2(0, 32)
535       .lower();
536   } else if (ST.has16BitInsts()) {
537     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
538       .legalFor({S32, S16})
539       .clampScalar(0, S16, S32)
540       .scalarize(0)
541       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
542 
543     // Technically the saturating operations require clamp bit support, but this
544     // was introduced at the same time as 16-bit operations.
545     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
546       .legalFor({S32, S16}) // Clamp modifier
547       .minScalar(0, S16)
548       .scalarize(0)
549       .widenScalarToNextPow2(0, 16)
550       .lower();
551 
552     // We're just lowering this, but it helps get a better result to try to
553     // coerce to the desired type first.
554     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
555       .minScalar(0, S16)
556       .scalarize(0)
557       .lower();
558   } else {
559     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
560       .legalFor({S32})
561       .clampScalar(0, S32, S32)
562       .scalarize(0);
563 
564     if (ST.hasIntClamp()) {
565       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
566         .legalFor({S32}) // Clamp modifier.
567         .scalarize(0)
568         .minScalarOrElt(0, S32)
569         .lower();
570     } else {
571       // Clamp bit support was added in VI, along with 16-bit operations.
572       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
573         .minScalar(0, S32)
574         .scalarize(0)
575         .lower();
576     }
577 
578     // FIXME: DAG expansion gets better results. The widening uses the smaller
579     // range values and goes for the min/max lowering directly.
580     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
581       .minScalar(0, S32)
582       .scalarize(0)
583       .lower();
584   }
585 
586   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
587     .customFor({S32, S64})
588     .clampScalar(0, S32, S64)
589     .widenScalarToNextPow2(0, 32)
590     .scalarize(0);
591 
592   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
593                    .legalFor({S32})
594                    .maxScalarOrElt(0, S32);
595 
596   if (ST.hasVOP3PInsts()) {
597     Mulh
598       .clampMaxNumElements(0, S8, 2)
599       .lowerFor({V2S8});
600   }
601 
602   Mulh
603     .scalarize(0)
604     .lower();
605 
606   // Report legal for any types we can handle anywhere. For the cases only legal
607   // on the SALU, RegBankSelect will be able to re-legalize.
608   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
609     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
610     .clampScalar(0, S32, S64)
611     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
612     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
613     .widenScalarToNextPow2(0)
614     .scalarize(0);
615 
616   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
617                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
618     .legalFor({{S32, S1}, {S32, S32}})
619     .minScalar(0, S32)
620     // TODO: .scalarize(0)
621     .lower();
622 
623   getActionDefinitionsBuilder(G_BITCAST)
624     // Don't worry about the size constraint.
625     .legalIf(all(isRegisterType(0), isRegisterType(1)))
626     .lower();
627 
628 
629   getActionDefinitionsBuilder(G_CONSTANT)
630     .legalFor({S1, S32, S64, S16, GlobalPtr,
631                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
632     .legalIf(isPointer(0))
633     .clampScalar(0, S32, S64)
634     .widenScalarToNextPow2(0);
635 
636   getActionDefinitionsBuilder(G_FCONSTANT)
637     .legalFor({S32, S64, S16})
638     .clampScalar(0, S16, S64);
639 
640   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
641       .legalIf(isRegisterType(0))
642       // s1 and s16 are special cases because they have legal operations on
643       // them, but don't really occupy registers in the normal way.
644       .legalFor({S1, S16})
645       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
646       .clampScalarOrElt(0, S32, MaxScalar)
647       .widenScalarToNextPow2(0, 32)
648       .clampMaxNumElements(0, S32, 16);
649 
650   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
651 
652   // If the amount is divergent, we have to do a wave reduction to get the
653   // maximum value, so this is expanded during RegBankSelect.
654   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
655     .legalFor({{PrivatePtr, S32}});
656 
657   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
658     .customIf(typeIsNot(0, PrivatePtr));
659 
660   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
661 
662   auto &FPOpActions = getActionDefinitionsBuilder(
663     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
664     .legalFor({S32, S64});
665   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
666     .customFor({S32, S64});
667   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
668     .customFor({S32, S64});
669 
670   if (ST.has16BitInsts()) {
671     if (ST.hasVOP3PInsts())
672       FPOpActions.legalFor({S16, V2S16});
673     else
674       FPOpActions.legalFor({S16});
675 
676     TrigActions.customFor({S16});
677     FDIVActions.customFor({S16});
678   }
679 
680   auto &MinNumMaxNum = getActionDefinitionsBuilder({
681       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
682 
683   if (ST.hasVOP3PInsts()) {
684     MinNumMaxNum.customFor(FPTypesPK16)
685       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
686       .clampMaxNumElements(0, S16, 2)
687       .clampScalar(0, S16, S64)
688       .scalarize(0);
689   } else if (ST.has16BitInsts()) {
690     MinNumMaxNum.customFor(FPTypes16)
691       .clampScalar(0, S16, S64)
692       .scalarize(0);
693   } else {
694     MinNumMaxNum.customFor(FPTypesBase)
695       .clampScalar(0, S32, S64)
696       .scalarize(0);
697   }
698 
699   if (ST.hasVOP3PInsts())
700     FPOpActions.clampMaxNumElements(0, S16, 2);
701 
702   FPOpActions
703     .scalarize(0)
704     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
705 
706   TrigActions
707     .scalarize(0)
708     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709 
710   FDIVActions
711     .scalarize(0)
712     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
713 
714   getActionDefinitionsBuilder({G_FNEG, G_FABS})
715     .legalFor(FPTypesPK16)
716     .clampMaxNumElements(0, S16, 2)
717     .scalarize(0)
718     .clampScalar(0, S16, S64);
719 
720   if (ST.has16BitInsts()) {
721     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
722       .legalFor({S32, S64, S16})
723       .scalarize(0)
724       .clampScalar(0, S16, S64);
725   } else {
726     getActionDefinitionsBuilder(G_FSQRT)
727       .legalFor({S32, S64})
728       .scalarize(0)
729       .clampScalar(0, S32, S64);
730 
731     if (ST.hasFractBug()) {
732       getActionDefinitionsBuilder(G_FFLOOR)
733         .customFor({S64})
734         .legalFor({S32, S64})
735         .scalarize(0)
736         .clampScalar(0, S32, S64);
737     } else {
738       getActionDefinitionsBuilder(G_FFLOOR)
739         .legalFor({S32, S64})
740         .scalarize(0)
741         .clampScalar(0, S32, S64);
742     }
743   }
744 
745   getActionDefinitionsBuilder(G_FPTRUNC)
746     .legalFor({{S32, S64}, {S16, S32}})
747     .scalarize(0)
748     .lower();
749 
750   getActionDefinitionsBuilder(G_FPEXT)
751     .legalFor({{S64, S32}, {S32, S16}})
752     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
753     .scalarize(0);
754 
755   getActionDefinitionsBuilder(G_FSUB)
756       // Use actual fsub instruction
757       .legalFor({S32})
758       // Must use fadd + fneg
759       .lowerFor({S64, S16, V2S16})
760       .scalarize(0)
761       .clampScalar(0, S32, S64);
762 
763   // Whether this is legal depends on the floating point mode for the function.
764   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
765   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
766     FMad.customFor({S32, S16});
767   else if (ST.hasMadMacF32Insts())
768     FMad.customFor({S32});
769   else if (ST.hasMadF16())
770     FMad.customFor({S16});
771   FMad.scalarize(0)
772       .lower();
773 
774   auto &FRem = getActionDefinitionsBuilder(G_FREM);
775   if (ST.has16BitInsts()) {
776     FRem.customFor({S16, S32, S64});
777   } else {
778     FRem.minScalar(0, S32)
779         .customFor({S32, S64});
780   }
781   FRem.scalarize(0);
782 
783   // TODO: Do we need to clamp maximum bitwidth?
784   getActionDefinitionsBuilder(G_TRUNC)
785     .legalIf(isScalar(0))
786     .legalFor({{V2S16, V2S32}})
787     .clampMaxNumElements(0, S16, 2)
788     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
789     // situations (like an invalid implicit use), we don't want to infinite loop
790     // in the legalizer.
791     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
792     .alwaysLegal();
793 
794   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
795     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
796                {S32, S1}, {S64, S1}, {S16, S1}})
797     .scalarize(0)
798     .clampScalar(0, S32, S64)
799     .widenScalarToNextPow2(1, 32);
800 
801   // TODO: Split s1->s64 during regbankselect for VALU.
802   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
803     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
804     .lowerFor({{S32, S64}})
805     .lowerIf(typeIs(1, S1))
806     .customFor({{S64, S64}});
807   if (ST.has16BitInsts())
808     IToFP.legalFor({{S16, S16}});
809   IToFP.clampScalar(1, S32, S64)
810        .minScalar(0, S32)
811        .scalarize(0)
812        .widenScalarToNextPow2(1);
813 
814   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
815     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
816     .customFor({{S64, S64}})
817     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
818   if (ST.has16BitInsts())
819     FPToI.legalFor({{S16, S16}});
820   else
821     FPToI.minScalar(1, S32);
822 
823   FPToI.minScalar(0, S32)
824        .scalarize(0)
825        .lower();
826 
827   // Lower roundeven into G_FRINT
828   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
829     .scalarize(0)
830     .lower();
831 
832   if (ST.has16BitInsts()) {
833     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
834       .legalFor({S16, S32, S64})
835       .clampScalar(0, S16, S64)
836       .scalarize(0);
837   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
838     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
839       .legalFor({S32, S64})
840       .clampScalar(0, S32, S64)
841       .scalarize(0);
842   } else {
843     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
844       .legalFor({S32})
845       .customFor({S64})
846       .clampScalar(0, S32, S64)
847       .scalarize(0);
848   }
849 
850   getActionDefinitionsBuilder(G_PTR_ADD)
851     .legalIf(all(isPointer(0), sameSize(0, 1)))
852     .scalarize(0)
853     .scalarSameSizeAs(1, 0);
854 
855   getActionDefinitionsBuilder(G_PTRMASK)
856     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
857     .scalarSameSizeAs(1, 0)
858     .scalarize(0);
859 
860   auto &CmpBuilder =
861     getActionDefinitionsBuilder(G_ICMP)
862     // The compare output type differs based on the register bank of the output,
863     // so make both s1 and s32 legal.
864     //
865     // Scalar compares producing output in scc will be promoted to s32, as that
866     // is the allocatable register type that will be needed for the copy from
867     // scc. This will be promoted during RegBankSelect, and we assume something
868     // before that won't try to use s32 result types.
869     //
870     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
871     // bank.
872     .legalForCartesianProduct(
873       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
874     .legalForCartesianProduct(
875       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
876   if (ST.has16BitInsts()) {
877     CmpBuilder.legalFor({{S1, S16}});
878   }
879 
880   CmpBuilder
881     .widenScalarToNextPow2(1)
882     .clampScalar(1, S32, S64)
883     .scalarize(0)
884     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
885 
886   getActionDefinitionsBuilder(G_FCMP)
887     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
888     .widenScalarToNextPow2(1)
889     .clampScalar(1, S32, S64)
890     .scalarize(0);
891 
892   // FIXME: fpow has a selection pattern that should move to custom lowering.
893   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
894   if (ST.has16BitInsts())
895     Exp2Ops.legalFor({S32, S16});
896   else
897     Exp2Ops.legalFor({S32});
898   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
899   Exp2Ops.scalarize(0);
900 
901   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
902   if (ST.has16BitInsts())
903     ExpOps.customFor({{S32}, {S16}});
904   else
905     ExpOps.customFor({S32});
906   ExpOps.clampScalar(0, MinScalarFPTy, S32)
907         .scalarize(0);
908 
909   getActionDefinitionsBuilder(G_FPOWI)
910     .clampScalar(0, MinScalarFPTy, S32)
911     .lower();
912 
913   // The 64-bit versions produce 32-bit results, but only on the SALU.
914   getActionDefinitionsBuilder(G_CTPOP)
915     .legalFor({{S32, S32}, {S32, S64}})
916     .clampScalar(0, S32, S32)
917     .clampScalar(1, S32, S64)
918     .scalarize(0)
919     .widenScalarToNextPow2(0, 32)
920     .widenScalarToNextPow2(1, 32);
921 
922   // The hardware instructions return a different result on 0 than the generic
923   // instructions expect. The hardware produces -1, but these produce the
924   // bitwidth.
925   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
926     .scalarize(0)
927     .clampScalar(0, S32, S32)
928     .clampScalar(1, S32, S64)
929     .widenScalarToNextPow2(0, 32)
930     .widenScalarToNextPow2(1, 32)
931     .lower();
932 
933   // The 64-bit versions produce 32-bit results, but only on the SALU.
934   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
935     .legalFor({{S32, S32}, {S32, S64}})
936     .clampScalar(0, S32, S32)
937     .clampScalar(1, S32, S64)
938     .scalarize(0)
939     .widenScalarToNextPow2(0, 32)
940     .widenScalarToNextPow2(1, 32);
941 
942   getActionDefinitionsBuilder(G_BITREVERSE)
943     .legalFor({S32})
944     .clampScalar(0, S32, S32)
945     .scalarize(0);
946 
947   if (ST.has16BitInsts()) {
948     getActionDefinitionsBuilder(G_BSWAP)
949       .legalFor({S16, S32, V2S16})
950       .clampMaxNumElements(0, S16, 2)
951       // FIXME: Fixing non-power-of-2 before clamp is workaround for
952       // narrowScalar limitation.
953       .widenScalarToNextPow2(0)
954       .clampScalar(0, S16, S32)
955       .scalarize(0);
956 
957     if (ST.hasVOP3PInsts()) {
958       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
959         .legalFor({S32, S16, V2S16})
960         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
961         .clampMaxNumElements(0, S16, 2)
962         .minScalar(0, S16)
963         .widenScalarToNextPow2(0)
964         .scalarize(0)
965         .lower();
966     } else {
967       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
968         .legalFor({S32, S16})
969         .widenScalarToNextPow2(0)
970         .minScalar(0, S16)
971         .scalarize(0)
972         .lower();
973     }
974   } else {
975     // TODO: Should have same legality without v_perm_b32
976     getActionDefinitionsBuilder(G_BSWAP)
977       .legalFor({S32})
978       .lowerIf(scalarNarrowerThan(0, 32))
979       // FIXME: Fixing non-power-of-2 before clamp is workaround for
980       // narrowScalar limitation.
981       .widenScalarToNextPow2(0)
982       .maxScalar(0, S32)
983       .scalarize(0)
984       .lower();
985 
986     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
987       .legalFor({S32})
988       .minScalar(0, S32)
989       .widenScalarToNextPow2(0)
990       .scalarize(0)
991       .lower();
992   }
993 
994   getActionDefinitionsBuilder(G_INTTOPTR)
995     // List the common cases
996     .legalForCartesianProduct(AddrSpaces64, {S64})
997     .legalForCartesianProduct(AddrSpaces32, {S32})
998     .scalarize(0)
999     // Accept any address space as long as the size matches
1000     .legalIf(sameSize(0, 1))
1001     .widenScalarIf(smallerThan(1, 0),
1002       [](const LegalityQuery &Query) {
1003         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1004       })
1005     .narrowScalarIf(largerThan(1, 0),
1006       [](const LegalityQuery &Query) {
1007         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1008       });
1009 
1010   getActionDefinitionsBuilder(G_PTRTOINT)
1011     // List the common cases
1012     .legalForCartesianProduct(AddrSpaces64, {S64})
1013     .legalForCartesianProduct(AddrSpaces32, {S32})
1014     .scalarize(0)
1015     // Accept any address space as long as the size matches
1016     .legalIf(sameSize(0, 1))
1017     .widenScalarIf(smallerThan(0, 1),
1018       [](const LegalityQuery &Query) {
1019         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1020       })
1021     .narrowScalarIf(
1022       largerThan(0, 1),
1023       [](const LegalityQuery &Query) {
1024         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1025       });
1026 
1027   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1028     .scalarize(0)
1029     .custom();
1030 
1031   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1032                                     bool IsLoad) -> bool {
1033     const LLT DstTy = Query.Types[0];
1034 
1035     // Split vector extloads.
1036     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1037     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1038 
1039     if (MemSize < DstTy.getSizeInBits())
1040       MemSize = std::max(MemSize, AlignBits);
1041 
1042     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1043       return true;
1044 
1045     const LLT PtrTy = Query.Types[1];
1046     unsigned AS = PtrTy.getAddressSpace();
1047     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1048       return true;
1049 
1050     // Catch weird sized loads that don't evenly divide into the access sizes
1051     // TODO: May be able to widen depending on alignment etc.
1052     unsigned NumRegs = (MemSize + 31) / 32;
1053     if (NumRegs == 3) {
1054       if (!ST.hasDwordx3LoadStores())
1055         return true;
1056     } else {
1057       // If the alignment allows, these should have been widened.
1058       if (!isPowerOf2_32(NumRegs))
1059         return true;
1060     }
1061 
1062     if (AlignBits < MemSize) {
1063       const SITargetLowering *TLI = ST.getTargetLowering();
1064       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1065                                                       Align(AlignBits / 8));
1066     }
1067 
1068     return false;
1069   };
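  // For illustration: needToSplitMemOp fires for a <2 x s32> private access
  // (64 bits exceeds the 32-bit private limit) and for a 512-bit global store
  // (stores cap at 128 bits), but not for a suitably aligned 512-bit global
  // load.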
1070 
1071   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1072   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1073   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1074 
1075   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1076   // LDS
1077   // TODO: Unsupported flat for SI.
1078 
1079   for (unsigned Op : {G_LOAD, G_STORE}) {
1080     const bool IsStore = Op == G_STORE;
1081 
1082     auto &Actions = getActionDefinitionsBuilder(Op);
1083     // Explicitly list some common cases.
1084     // TODO: Does this help compile time at all?
1085     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1086                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1087                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1088                                       {S64, GlobalPtr, 64, GlobalAlign32},
1089                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1090                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1091                                       {S32, GlobalPtr, 8, GlobalAlign8},
1092                                       {S32, GlobalPtr, 16, GlobalAlign16},
1093 
1094                                       {S32, LocalPtr, 32, 32},
1095                                       {S64, LocalPtr, 64, 32},
1096                                       {V2S32, LocalPtr, 64, 32},
1097                                       {S32, LocalPtr, 8, 8},
1098                                       {S32, LocalPtr, 16, 16},
1099                                       {V2S16, LocalPtr, 32, 32},
1100 
1101                                       {S32, PrivatePtr, 32, 32},
1102                                       {S32, PrivatePtr, 8, 8},
1103                                       {S32, PrivatePtr, 16, 16},
1104                                       {V2S16, PrivatePtr, 32, 32},
1105 
1106                                       {S32, ConstantPtr, 32, GlobalAlign32},
1107                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1108                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1109                                       {S64, ConstantPtr, 64, GlobalAlign32},
1110                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1111     Actions.legalIf(
1112       [=](const LegalityQuery &Query) -> bool {
1113         return isLoadStoreLegal(ST, Query, Op);
1114       });
1115 
1116     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1117     // 64-bits.
1118     //
1119     // TODO: Should generalize bitcast action into coerce, which will also cover
1120     // inserting addrspacecasts.
1121     Actions.customIf(typeIs(1, Constant32Ptr));
1122 
1123     // Turn any illegal element vectors into something easier to deal
1124     // with. These will ultimately produce 32-bit scalar shifts to extract the
1125     // parts anyway.
1126     //
1127     // For odd 16-bit element vectors, prefer to split those into pieces with
1128     // 16-bit vector parts.
1129     Actions.bitcastIf(
1130       [=](const LegalityQuery &Query) -> bool {
1131         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1132                                           Query.MMODescrs[0].SizeInBits);
1133       }, bitcastToRegisterType(0));
1134 
1135     if (!IsStore) {
1136       // Widen suitably aligned loads by loading extra bytes. The standard
1137       // legalization actions can't properly express widening memory operands.
1138       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1139         return shouldWidenLoad(ST, Query, G_LOAD);
1140       });
1141     }
1142 
1143     // FIXME: load/store narrowing should be moved to lower action
1144     Actions
1145         .narrowScalarIf(
1146             [=](const LegalityQuery &Query) -> bool {
1147               return !Query.Types[0].isVector() &&
1148                      needToSplitMemOp(Query, Op == G_LOAD);
1149             },
1150             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1151               const LLT DstTy = Query.Types[0];
1152               const LLT PtrTy = Query.Types[1];
1153 
1154               const unsigned DstSize = DstTy.getSizeInBits();
1155               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1156 
1157               // Split extloads.
1158               if (DstSize > MemSize)
1159                 return std::make_pair(0, LLT::scalar(MemSize));
1160 
1161               if (!isPowerOf2_32(DstSize)) {
1162                 // We're probably decomposing an odd sized store. Try to split
1163                 // to the widest type. TODO: Account for alignment. As-is it
1164                 // should be OK, since the new parts will be further legalized.
1165                 unsigned FloorSize = PowerOf2Floor(DstSize);
1166                 return std::make_pair(0, LLT::scalar(FloorSize));
1167               }
1168 
1169               if (DstSize > 32 && (DstSize % 32 != 0)) {
1170                 // FIXME: Need a way to specify non-extload of larger size if
1171                 // suitably aligned.
1172                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1173               }
1174 
1175               unsigned MaxSize = maxSizeForAddrSpace(ST,
1176                                                      PtrTy.getAddressSpace(),
1177                                                      Op == G_LOAD);
1178               if (MemSize > MaxSize)
1179                 return std::make_pair(0, LLT::scalar(MaxSize));
1180 
1181               unsigned Align = Query.MMODescrs[0].AlignInBits;
1182               return std::make_pair(0, LLT::scalar(Align));
1183             })
1184         .fewerElementsIf(
1185             [=](const LegalityQuery &Query) -> bool {
1186               return Query.Types[0].isVector() &&
1187                      needToSplitMemOp(Query, Op == G_LOAD);
1188             },
1189             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1190               const LLT DstTy = Query.Types[0];
1191               const LLT PtrTy = Query.Types[1];
1192 
1193               LLT EltTy = DstTy.getElementType();
1194               unsigned MaxSize = maxSizeForAddrSpace(ST,
1195                                                      PtrTy.getAddressSpace(),
1196                                                      Op == G_LOAD);
1197 
1198               // FIXME: Handle widened to power of 2 results better. This ends
1199               // up scalarizing.
1200               // FIXME: 3 element stores scalarized on SI
1201 
1202               // Split if it's too large for the address space.
1203               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1204                 unsigned NumElts = DstTy.getNumElements();
1205                 unsigned EltSize = EltTy.getSizeInBits();
1206 
1207                 if (MaxSize % EltSize == 0) {
1208                   return std::make_pair(
1209                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1210                 }
1211 
1212                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1213 
1214                 // FIXME: Refine when odd breakdowns handled
1215                 // The scalars will need to be re-legalized.
1216                 if (NumPieces == 1 || NumPieces >= NumElts ||
1217                     NumElts % NumPieces != 0)
1218                   return std::make_pair(0, EltTy);
1219 
1220                 return std::make_pair(0,
1221                                       LLT::vector(NumElts / NumPieces, EltTy));
1222               }
1223 
1224               // FIXME: We could probably handle weird extending loads better.
1225               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1226               if (DstTy.getSizeInBits() > MemSize)
1227                 return std::make_pair(0, EltTy);
1228 
1229               unsigned EltSize = EltTy.getSizeInBits();
1230               unsigned DstSize = DstTy.getSizeInBits();
1231               if (!isPowerOf2_32(DstSize)) {
1232                 // We're probably decomposing an odd sized store. Try to split
1233                 // to the widest type. TODO: Account for alignment. As-is it
1234                 // should be OK, since the new parts will be further legalized.
1235                 unsigned FloorSize = PowerOf2Floor(DstSize);
1236                 return std::make_pair(
1237                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1238               }
1239 
1240               // Need to split because of alignment.
1241               unsigned Align = Query.MMODescrs[0].AlignInBits;
1242               if (EltSize > Align &&
1243                   (EltSize / Align < DstTy.getNumElements())) {
1244                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1245               }
1246 
1247               // May need relegalization for the scalars.
1248               return std::make_pair(0, EltTy);
1249             })
1250     .lowerIfMemSizeNotPow2()
1251     .minScalar(0, S32);
1252 
1253     if (IsStore)
1254       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1255 
1256     Actions
1257         .widenScalarToNextPow2(0)
1258         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1259         .lower();
1260   }
1261 
1262   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1263                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1264                                                   {S32, GlobalPtr, 16, 2 * 8},
1265                                                   {S32, LocalPtr, 8, 8},
1266                                                   {S32, LocalPtr, 16, 16},
1267                                                   {S32, PrivatePtr, 8, 8},
1268                                                   {S32, PrivatePtr, 16, 16},
1269                                                   {S32, ConstantPtr, 8, 8},
1270                                                   {S32, ConstantPtr, 16, 2 * 8}});
1271   if (ST.hasFlatAddressSpace()) {
1272     ExtLoads.legalForTypesWithMemDesc(
1273         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1274   }
1275 
1276   ExtLoads.clampScalar(0, S32, S32)
1277           .widenScalarToNextPow2(0)
1278           .unsupportedIfMemSizeNotPow2()
1279           .lower();
1280 
1281   auto &Atomics = getActionDefinitionsBuilder(
1282     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1283      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1284      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1285      G_ATOMICRMW_UMIN})
1286     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1287                {S64, GlobalPtr}, {S64, LocalPtr},
1288                {S32, RegionPtr}, {S64, RegionPtr}});
1289   if (ST.hasFlatAddressSpace()) {
1290     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1291   }
1292 
1293   if (ST.hasLDSFPAtomics()) {
1294     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1295       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1296   }
1297 
1298   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1299   // demarshalling
1300   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302                 {S32, FlatPtr}, {S64, FlatPtr}})
1303     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304                {S32, RegionPtr}, {S64, RegionPtr}});
1305   // TODO: Pointer types, any 32-bit or 64-bit vector
1306 
1307   // Condition should be s32 for scalar, s1 for vector.
1308   getActionDefinitionsBuilder(G_SELECT)
1309     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1310           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1311           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1312     .clampScalar(0, S16, S64)
1313     .scalarize(1)
1314     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1315     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1316     .clampMaxNumElements(0, S32, 2)
1317     .clampMaxNumElements(0, LocalPtr, 2)
1318     .clampMaxNumElements(0, PrivatePtr, 2)
1319     .scalarize(0)
1320     .widenScalarToNextPow2(0)
1321     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1322 
1323   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1324   // be more flexible with the shift amount type.
1325   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1326     .legalFor({{S32, S32}, {S64, S32}});
1327   if (ST.has16BitInsts()) {
1328     if (ST.hasVOP3PInsts()) {
1329       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1330             .clampMaxNumElements(0, S16, 2);
1331     } else
1332       Shifts.legalFor({{S16, S16}});
1333 
1334     // TODO: Support 16-bit shift amounts for all types
1335     Shifts.widenScalarIf(
1336       [=](const LegalityQuery &Query) {
1337         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1338         // 32-bit amount.
1339         const LLT ValTy = Query.Types[0];
1340         const LLT AmountTy = Query.Types[1];
1341         return ValTy.getSizeInBits() <= 16 &&
1342                AmountTy.getSizeInBits() < 16;
1343       }, changeTo(1, S16));
1344     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1345     Shifts.clampScalar(1, S32, S32);
1346     Shifts.clampScalar(0, S16, S64);
1347     Shifts.widenScalarToNextPow2(0, 16);
1348 
1349     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1350       .minScalar(0, S16)
1351       .scalarize(0)
1352       .lower();
1353   } else {
1354     // Make sure we legalize the shift amount type first, as the general
1355     // expansion for the shifted type will produce much worse code if it hasn't
1356     // been truncated already.
1357     Shifts.clampScalar(1, S32, S32);
1358     Shifts.clampScalar(0, S32, S64);
1359     Shifts.widenScalarToNextPow2(0, 32);
1360 
1361     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1362       .minScalar(0, S32)
1363       .scalarize(0)
1364       .lower();
1365   }
1366   Shifts.scalarize(0);
1367 
1368   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1369     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1370     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1371     unsigned IdxTypeIdx = 2;
1372 
1373     getActionDefinitionsBuilder(Op)
1374       .customIf([=](const LegalityQuery &Query) {
1375           const LLT EltTy = Query.Types[EltTypeIdx];
1376           const LLT VecTy = Query.Types[VecTypeIdx];
1377           const LLT IdxTy = Query.Types[IdxTypeIdx];
1378           const unsigned EltSize = EltTy.getSizeInBits();
1379           return (EltSize == 32 || EltSize == 64) &&
1380                   VecTy.getSizeInBits() % 32 == 0 &&
1381                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1382                   IdxTy.getSizeInBits() == 32;
1383         })
1384       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1385                  bitcastToVectorElement32(VecTypeIdx))
1386       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1387       .bitcastIf(
1388         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1389         [=](const LegalityQuery &Query) {
1390           // For > 64-bit element types, try to turn this into a 64-bit
1391           // element vector since we may be able to do better indexing
1392           // if this is scalar. If not, fall back to 32.
1393           const LLT EltTy = Query.Types[EltTypeIdx];
1394           const LLT VecTy = Query.Types[VecTypeIdx];
1395           const unsigned DstEltSize = EltTy.getSizeInBits();
1396           const unsigned VecSize = VecTy.getSizeInBits();
1397 
1398           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1399           return std::make_pair(
1400             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1401         })
1402       .clampScalar(EltTypeIdx, S32, S64)
1403       .clampScalar(VecTypeIdx, S32, S64)
1404       .clampScalar(IdxTypeIdx, S32, S32)
1405       .clampMaxNumElements(VecTypeIdx, S32, 32)
1406       // TODO: Clamp elements for 64-bit vectors?
1407       // It should only be necessary with variable indexes.
1408       // As a last resort, lower to the stack
1409       .lower();
1410   }
1411 
1412   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1413     .unsupportedIf([=](const LegalityQuery &Query) {
1414         const LLT &EltTy = Query.Types[1].getElementType();
1415         return Query.Types[0] != EltTy;
1416       });
1417 
1418   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1419     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1420     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1421 
1422     // FIXME: Doesn't handle extract of illegal sizes.
1423     getActionDefinitionsBuilder(Op)
1424       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1425       // FIXME: Multiples of 16 should not be legal.
1426       .legalIf([=](const LegalityQuery &Query) {
1427           const LLT BigTy = Query.Types[BigTyIdx];
1428           const LLT LitTy = Query.Types[LitTyIdx];
1429           return (BigTy.getSizeInBits() % 32 == 0) &&
1430                  (LitTy.getSizeInBits() % 16 == 0);
1431         })
1432       .widenScalarIf(
1433         [=](const LegalityQuery &Query) {
1434           const LLT BigTy = Query.Types[BigTyIdx];
1435           return (BigTy.getScalarSizeInBits() < 16);
1436         },
1437         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1438       .widenScalarIf(
1439         [=](const LegalityQuery &Query) {
1440           const LLT LitTy = Query.Types[LitTyIdx];
1441           return (LitTy.getScalarSizeInBits() < 16);
1442         },
1443         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1444       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1445       .widenScalarToNextPow2(BigTyIdx, 32);
1446 
1447   }
1448 
1449   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1450     .legalForCartesianProduct(AllS32Vectors, {S32})
1451     .legalForCartesianProduct(AllS64Vectors, {S64})
1452     .clampNumElements(0, V16S32, V32S32)
1453     .clampNumElements(0, V2S64, V16S64)
1454     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1455 
1456   if (ST.hasScalarPackInsts()) {
1457     BuildVector
1458       // FIXME: Should probably widen s1 vectors straight to s32
1459       .minScalarOrElt(0, S16)
1460       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1461       .minScalar(1, S32);
1462 
1463     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1464       .legalFor({V2S16, S32})
1465       .lower();
1466     BuildVector.minScalarOrElt(0, S32);
1467   } else {
1468     BuildVector.customFor({V2S16, S16});
1469     BuildVector.minScalarOrElt(0, S32);
1470 
1471     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1472       .customFor({V2S16, S32})
1473       .lower();
1474   }
1475 
1476   BuildVector.legalIf(isRegisterType(0));
1477 
1478   // FIXME: Clamp maximum size
1479   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1480     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1481     .clampMaxNumElements(0, S32, 32)
1482     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1483     .clampMaxNumElements(0, S16, 64);
1484 
1485   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1486   // pre-legalize.
1487   if (ST.hasVOP3PInsts()) {
1488     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1489       .customFor({V2S16, V2S16})
1490       .lower();
1491   } else
1492     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1493 
1494   // Merge/Unmerge
1495   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1496     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1497     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1498 
1499     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1500       const LLT Ty = Query.Types[TypeIdx];
1501       if (Ty.isVector()) {
1502         const LLT &EltTy = Ty.getElementType();
1503         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1504           return true;
1505         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1506           return true;
1507       }
1508       return false;
1509     };
1510 
1511     auto &Builder = getActionDefinitionsBuilder(Op)
1512       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513       .lowerFor({{S16, V2S16}})
1514       .lowerIf([=](const LegalityQuery &Query) {
1515           const LLT BigTy = Query.Types[BigTyIdx];
1516           return BigTy.getSizeInBits() == 32;
1517         })
1518       // Try to widen to s16 first for small types.
1519       // TODO: Only do this on targets with legal s16 shifts
1520       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1521       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1522       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1523       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1524                            elementTypeIs(1, S16)),
1525                        changeTo(1, V2S16))
1526       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1527       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1528       // valid.
1529       .clampScalar(LitTyIdx, S32, S512)
1530       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1531       // Break up vectors with weird elements into scalars
1532       .fewerElementsIf(
1533         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1534         scalarize(0))
1535       .fewerElementsIf(
1536         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1537         scalarize(1))
1538       .clampScalar(BigTyIdx, S32, MaxScalar);
1539 
1540     if (Op == G_MERGE_VALUES) {
1541       Builder.widenScalarIf(
1542         // TODO: Use 16-bit shifts if legal for 8-bit values?
1543         [=](const LegalityQuery &Query) {
1544           const LLT Ty = Query.Types[LitTyIdx];
1545           return Ty.getSizeInBits() < 32;
1546         },
1547         changeTo(LitTyIdx, S32));
1548     }
1549 
1550     Builder.widenScalarIf(
1551       [=](const LegalityQuery &Query) {
1552         const LLT Ty = Query.Types[BigTyIdx];
1553         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1554           Ty.getSizeInBits() % 16 != 0;
1555       },
1556       [=](const LegalityQuery &Query) {
1557         // Pick the next power of 2, or a multiple of 64 over 128,
1558         // whichever is smaller.
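        // e.g. a 56-bit scalar would widen to s64, while a 260-bit scalar
        // would widen to s320 rather than all the way to s512.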
1559         const LLT &Ty = Query.Types[BigTyIdx];
1560         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1561         if (NewSizeInBits >= 256) {
1562           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1563           if (RoundedTo < NewSizeInBits)
1564             NewSizeInBits = RoundedTo;
1565         }
1566         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1567       })
1568       // Any vectors left are the wrong size. Scalarize them.
1569       .scalarize(0)
1570       .scalarize(1);
1571   }
1572 
1573   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1574   // RegBankSelect.
1575   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1576     .legalFor({{S32}, {S64}});
1577 
1578   if (ST.hasVOP3PInsts()) {
1579     SextInReg.lowerFor({{V2S16}})
1580       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1581       // get more vector shift opportunities, since we'll get those when
1582       // expanded.
1583       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1584   } else if (ST.has16BitInsts()) {
1585     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1586   } else {
1587     // Prefer to promote to s32 before lowering if we don't have 16-bit
1588     // shifts. This avoids a lot of intermediate truncate and extend operations.
1589     SextInReg.lowerFor({{S32}, {S64}});
1590   }
1591 
1592   SextInReg
1593     .scalarize(0)
1594     .clampScalar(0, S32, S64)
1595     .lower();
1596 
1597   getActionDefinitionsBuilder(G_FSHR)
1598     .legalFor({{S32, S32}})
1599     .scalarize(0)
1600     .lower();
1601 
1602   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1603     .legalFor({S64});
1604 
1605   getActionDefinitionsBuilder(G_FENCE)
1606     .alwaysLegal();
1607 
1608   getActionDefinitionsBuilder({
1609       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1610       G_FCOPYSIGN,
1611 
1612       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1613       G_ATOMICRMW_NAND,
1614       G_ATOMICRMW_FSUB,
1615       G_READ_REGISTER,
1616       G_WRITE_REGISTER,
1617 
1618       G_SADDO, G_SSUBO,
1619 
1620        // TODO: Implement
1621       G_FMINIMUM, G_FMAXIMUM,
1622       G_FSHL
1623     }).lower();
1624 
1625   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1626         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1627         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1628     .unsupported();
1629 
1630   computeTables();
1631   verify(*ST.getInstrInfo());
1632 }
1633 
1634 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1635                                          MachineInstr &MI) const {
1636   MachineIRBuilder &B = Helper.MIRBuilder;
1637   MachineRegisterInfo &MRI = *B.getMRI();
1638 
1639   switch (MI.getOpcode()) {
1640   case TargetOpcode::G_ADDRSPACE_CAST:
1641     return legalizeAddrSpaceCast(MI, MRI, B);
1642   case TargetOpcode::G_FRINT:
1643     return legalizeFrint(MI, MRI, B);
1644   case TargetOpcode::G_FCEIL:
1645     return legalizeFceil(MI, MRI, B);
1646   case TargetOpcode::G_FREM:
1647     return legalizeFrem(MI, MRI, B);
1648   case TargetOpcode::G_INTRINSIC_TRUNC:
1649     return legalizeIntrinsicTrunc(MI, MRI, B);
1650   case TargetOpcode::G_SITOFP:
1651     return legalizeITOFP(MI, MRI, B, true);
1652   case TargetOpcode::G_UITOFP:
1653     return legalizeITOFP(MI, MRI, B, false);
1654   case TargetOpcode::G_FPTOSI:
1655     return legalizeFPTOI(MI, MRI, B, true);
1656   case TargetOpcode::G_FPTOUI:
1657     return legalizeFPTOI(MI, MRI, B, false);
1658   case TargetOpcode::G_FMINNUM:
1659   case TargetOpcode::G_FMAXNUM:
1660   case TargetOpcode::G_FMINNUM_IEEE:
1661   case TargetOpcode::G_FMAXNUM_IEEE:
1662     return legalizeMinNumMaxNum(Helper, MI);
1663   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1664     return legalizeExtractVectorElt(MI, MRI, B);
1665   case TargetOpcode::G_INSERT_VECTOR_ELT:
1666     return legalizeInsertVectorElt(MI, MRI, B);
1667   case TargetOpcode::G_SHUFFLE_VECTOR:
1668     return legalizeShuffleVector(MI, MRI, B);
1669   case TargetOpcode::G_FSIN:
1670   case TargetOpcode::G_FCOS:
1671     return legalizeSinCos(MI, MRI, B);
1672   case TargetOpcode::G_GLOBAL_VALUE:
1673     return legalizeGlobalValue(MI, MRI, B);
1674   case TargetOpcode::G_LOAD:
1675     return legalizeLoad(Helper, MI);
1676   case TargetOpcode::G_FMAD:
1677     return legalizeFMad(MI, MRI, B);
1678   case TargetOpcode::G_FDIV:
1679     return legalizeFDIV(MI, MRI, B);
1680   case TargetOpcode::G_UDIV:
1681   case TargetOpcode::G_UREM:
1682     return legalizeUDIV_UREM(MI, MRI, B);
1683   case TargetOpcode::G_SDIV:
1684   case TargetOpcode::G_SREM:
1685     return legalizeSDIV_SREM(MI, MRI, B);
1686   case TargetOpcode::G_ATOMIC_CMPXCHG:
1687     return legalizeAtomicCmpXChg(MI, MRI, B);
1688   case TargetOpcode::G_FLOG:
1689     return legalizeFlog(MI, B, numbers::ln2f);
1690   case TargetOpcode::G_FLOG10:
1691     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1692   case TargetOpcode::G_FEXP:
1693     return legalizeFExp(MI, B);
1694   case TargetOpcode::G_FPOW:
1695     return legalizeFPow(MI, B);
1696   case TargetOpcode::G_FFLOOR:
1697     return legalizeFFloor(MI, MRI, B);
1698   case TargetOpcode::G_BUILD_VECTOR:
1699     return legalizeBuildVector(MI, MRI, B);
1700   default:
1701     return false;
1702   }
1703 
1704   llvm_unreachable("expected switch to return");
1705 }
1706 
1707 Register AMDGPULegalizerInfo::getSegmentAperture(
1708   unsigned AS,
1709   MachineRegisterInfo &MRI,
1710   MachineIRBuilder &B) const {
1711   MachineFunction &MF = B.getMF();
1712   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1713   const LLT S32 = LLT::scalar(32);
1714 
1715   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1716 
1717   if (ST.hasApertureRegs()) {
1718     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1719     // getreg.
1720     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1721         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1722         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1723     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1724         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1725         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1726     unsigned Encoding =
1727         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1728         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1729         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1730 
1731     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1732 
1733     B.buildInstr(AMDGPU::S_GETREG_B32)
1734       .addDef(GetReg)
1735       .addImm(Encoding);
1736     MRI.setType(GetReg, S32);
1737 
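    // The getreg above reads a (WidthM1 + 1)-bit field; shifting it left by
    // that width appears to reconstruct the 32-bit value that the caller uses
    // as the high half of the 64-bit flat address.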
1738     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1739     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1740   }
1741 
1742   Register QueuePtr = MRI.createGenericVirtualRegister(
1743     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1744 
1745   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1746     return Register();
1747 
1748   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1749   // private_segment_aperture_base_hi.
1750   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1751 
1752   // TODO: can we be smarter about machine pointer info?
1753   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1754   MachineMemOperand *MMO = MF.getMachineMemOperand(
1755       PtrInfo,
1756       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1757           MachineMemOperand::MOInvariant,
1758       4, commonAlignment(Align(64), StructOffset));
1759 
1760   Register LoadAddr;
1761 
1762   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1763   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1764 }
1765 
1766 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1767   MachineInstr &MI, MachineRegisterInfo &MRI,
1768   MachineIRBuilder &B) const {
1769   MachineFunction &MF = B.getMF();
1770 
1771   const LLT S32 = LLT::scalar(32);
1772   Register Dst = MI.getOperand(0).getReg();
1773   Register Src = MI.getOperand(1).getReg();
1774 
1775   LLT DstTy = MRI.getType(Dst);
1776   LLT SrcTy = MRI.getType(Src);
1777   unsigned DestAS = DstTy.getAddressSpace();
1778   unsigned SrcAS = SrcTy.getAddressSpace();
1779 
1780   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1781   // vector element.
1782   assert(!DstTy.isVector());
1783 
1784   const AMDGPUTargetMachine &TM
1785     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1786 
1787   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1788     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1789     return true;
1790   }
1791 
1792   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1793     // Truncate.
1794     B.buildExtract(Dst, Src, 0);
1795     MI.eraseFromParent();
1796     return true;
1797   }
1798 
1799   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1800     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1801     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1802 
1803     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1804     // another. Merge operands are required to be the same type, but creating an
1805     // extra ptrtoint would be kind of pointless.
1806     auto HighAddr = B.buildConstant(
1807       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1808     B.buildMerge(Dst, {Src, HighAddr});
1809     MI.eraseFromParent();
1810     return true;
1811   }
1812 
1813   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1814     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1815            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1816     unsigned NullVal = TM.getNullPointerValue(DestAS);
1817 
1818     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1819     auto FlatNull = B.buildConstant(SrcTy, 0);
1820 
1821     // Extract low 32-bits of the pointer.
1822     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1823 
1824     auto CmpRes =
1825         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1826     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1827 
1828     MI.eraseFromParent();
1829     return true;
1830   }
1831 
1832   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1833     return false;
1834 
1835   if (!ST.hasFlatAddressSpace())
1836     return false;
1837 
1838   auto SegmentNull =
1839       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1840   auto FlatNull =
1841       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1842 
1843   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1844   if (!ApertureReg.isValid())
1845     return false;
1846 
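  // Build the flat pointer by placing the 32-bit segment offset in the low
  // half and the aperture base in the high half, and map a null segment
  // pointer to the flat null value instead.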
1847   auto CmpRes =
1848       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1849 
1850   // Coerce the type of the low half of the result so we can use merge_values.
1851   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1852 
1853   // TODO: Should we allow mismatched types but matching sizes in merges to
1854   // avoid the ptrtoint?
1855   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1856   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1857 
1858   MI.eraseFromParent();
1859   return true;
1860 }
1861 
1862 bool AMDGPULegalizerInfo::legalizeFrint(
1863   MachineInstr &MI, MachineRegisterInfo &MRI,
1864   MachineIRBuilder &B) const {
1865   Register Src = MI.getOperand(1).getReg();
1866   LLT Ty = MRI.getType(Src);
1867   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1868 
1869   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1870   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1871 
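  // Round to integer using the 2^52 trick: adding and subtracting 2^52 forces
  // rounding at integer granularity for |x| < 2^52. C1 is 2^52 with the sign
  // of the source copied onto it; C2 is the largest double below 2^52, so any
  // |x| > C2 is already an integer and the source is returned unchanged.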
1872   auto C1 = B.buildFConstant(Ty, C1Val);
1873   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1874 
1875   // TODO: Should this propagate fast-math-flags?
1876   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1877   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1878 
1879   auto C2 = B.buildFConstant(Ty, C2Val);
1880   auto Fabs = B.buildFAbs(Ty, Src);
1881 
1882   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1883   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1884   MI.eraseFromParent();
1885   return true;
1886 }
1887 
1888 bool AMDGPULegalizerInfo::legalizeFceil(
1889   MachineInstr &MI, MachineRegisterInfo &MRI,
1890   MachineIRBuilder &B) const {
1891 
1892   const LLT S1 = LLT::scalar(1);
1893   const LLT S64 = LLT::scalar(64);
1894 
1895   Register Src = MI.getOperand(1).getReg();
1896   assert(MRI.getType(Src) == S64);
1897 
1898   // result = trunc(src)
1899   // if (src > 0.0 && src != result)
1900   //   result += 1.0
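  //
  // e.g. src = 2.3: trunc = 2.0, condition holds, result = 3.0;
  //      src = -2.3: trunc = -2.0, src is not > 0.0, result stays -2.0.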
1901 
1902   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1903 
1904   const auto Zero = B.buildFConstant(S64, 0.0);
1905   const auto One = B.buildFConstant(S64, 1.0);
1906   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1907   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1908   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1909   auto Add = B.buildSelect(S64, And, One, Zero);
1910 
1911   // TODO: Should this propagate fast-math-flags?
1912   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1913   return true;
1914 }
1915 
1916 bool AMDGPULegalizerInfo::legalizeFrem(
1917   MachineInstr &MI, MachineRegisterInfo &MRI,
1918   MachineIRBuilder &B) const {
1919     Register DstReg = MI.getOperand(0).getReg();
1920     Register Src0Reg = MI.getOperand(1).getReg();
1921     Register Src1Reg = MI.getOperand(2).getReg();
1922     auto Flags = MI.getFlags();
1923     LLT Ty = MRI.getType(DstReg);
1924 
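    // frem(x, y) is expanded as x - trunc(x / y) * y, with the subtract
    // folded into fma(-trunc(x / y), y, x).
    // e.g. frem(5.5, 2.0): div = 2.75, trunc = 2.0, result = 5.5 - 4.0 = 1.5.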
1925     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1926     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1927     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1928     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1929     MI.eraseFromParent();
1930     return true;
1931 }
1932 
1933 static MachineInstrBuilder extractF64Exponent(Register Hi,
1934                                               MachineIRBuilder &B) {
1935   const unsigned FractBits = 52;
1936   const unsigned ExpBits = 11;
1937   LLT S32 = LLT::scalar(32);
1938 
1939   auto Const0 = B.buildConstant(S32, FractBits - 32);
1940   auto Const1 = B.buildConstant(S32, ExpBits);
1941 
1942   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1943     .addUse(Hi)
1944     .addUse(Const0.getReg(0))
1945     .addUse(Const1.getReg(0));
1946 
1947   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1948 }
1949 
1950 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1951   MachineInstr &MI, MachineRegisterInfo &MRI,
1952   MachineIRBuilder &B) const {
1953   const LLT S1 = LLT::scalar(1);
1954   const LLT S32 = LLT::scalar(32);
1955   const LLT S64 = LLT::scalar(64);
1956 
1957   Register Src = MI.getOperand(1).getReg();
1958   assert(MRI.getType(Src) == S64);
1959 
1960   // TODO: Should this use extract since the low half is unused?
1961   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1962   Register Hi = Unmerge.getReg(1);
1963 
1964   // Extract the upper half, since this is where we will find the sign and
1965   // exponent.
1966   auto Exp = extractF64Exponent(Hi, B);
1967 
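  // Roughly: Exp is the unbiased exponent. Build a mask of the fraction bits
  // that sit below the integer boundary and clear them (Tmp0). If Exp < 0 the
  // magnitude is below 1.0 and the result is just the sign bit (+/-0.0); if
  // Exp > 51 there are no fraction bits left, so the source is already
  // integral and is returned unchanged.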
1968   const unsigned FractBits = 52;
1969 
1970   // Extract the sign bit.
1971   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1972   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1973 
1974   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1975 
1976   const auto Zero32 = B.buildConstant(S32, 0);
1977 
1978   // Extend back to 64-bits.
1979   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1980 
1981   auto Shr = B.buildAShr(S64, FractMask, Exp);
1982   auto Not = B.buildNot(S64, Shr);
1983   auto Tmp0 = B.buildAnd(S64, Src, Not);
1984   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1985 
1986   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1987   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1988 
1989   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1990   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1991   MI.eraseFromParent();
1992   return true;
1993 }
1994 
1995 bool AMDGPULegalizerInfo::legalizeITOFP(
1996   MachineInstr &MI, MachineRegisterInfo &MRI,
1997   MachineIRBuilder &B, bool Signed) const {
1998 
1999   Register Dst = MI.getOperand(0).getReg();
2000   Register Src = MI.getOperand(1).getReg();
2001 
2002   const LLT S64 = LLT::scalar(64);
2003   const LLT S32 = LLT::scalar(32);
2004 
2005   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2006 
2007   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2008 
2009   auto CvtHi = Signed ?
2010     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2011     B.buildUITOFP(S64, Unmerge.getReg(1));
2012 
2013   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2014 
2015   auto ThirtyTwo = B.buildConstant(S32, 32);
2016   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2017     .addUse(CvtHi.getReg(0))
2018     .addUse(ThirtyTwo.getReg(0));
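  // The result is ldexp(fp(hi), 32) + fp(lo), i.e. fp(hi * 2^32 + lo).
  // e.g. src = 0x100000005: hi = 1, lo = 5 -> 2^32 + 5.0 = 4294967301.0.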
2019 
2020   // TODO: Should this propagate fast-math-flags?
2021   B.buildFAdd(Dst, LdExp, CvtLo);
2022   MI.eraseFromParent();
2023   return true;
2024 }
2025 
2026 // TODO: Copied from DAG implementation. Verify logic and document how this
2027 // actually works.
2028 bool AMDGPULegalizerInfo::legalizeFPTOI(
2029   MachineInstr &MI, MachineRegisterInfo &MRI,
2030   MachineIRBuilder &B, bool Signed) const {
2031 
2032   Register Dst = MI.getOperand(0).getReg();
2033   Register Src = MI.getOperand(1).getReg();
2034 
2035   const LLT S64 = LLT::scalar(64);
2036   const LLT S32 = LLT::scalar(32);
2037 
2038   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2039 
2040   unsigned Flags = MI.getFlags();
2041 
2042   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2043   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2044   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2045 
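  // K0 is 2^-32 and K1 is -2^32, so FloorMul below is the high 32 bits of the
  // truncated value (as a double) and the fma recovers the remaining low
  // 32 bits: Fma = Trunc - FloorMul * 2^32.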
2046   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2047   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2048   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2049 
2050   auto Hi = Signed ?
2051     B.buildFPTOSI(S32, FloorMul) :
2052     B.buildFPTOUI(S32, FloorMul);
2053   auto Lo = B.buildFPTOUI(S32, Fma);
2054 
2055   B.buildMerge(Dst, { Lo, Hi });
2056   MI.eraseFromParent();
2057 
2058   return true;
2059 }
2060 
2061 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2062                                                MachineInstr &MI) const {
2063   MachineFunction &MF = Helper.MIRBuilder.getMF();
2064   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2065 
2066   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2067                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2068 
2069   // With ieee_mode disabled, the instructions have the correct behavior
2070   // already for G_FMINNUM/G_FMAXNUM
2071   if (!MFI->getMode().IEEE)
2072     return !IsIEEEOp;
2073 
2074   if (IsIEEEOp)
2075     return true;
2076 
2077   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2078 }
2079 
2080 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2081   MachineInstr &MI, MachineRegisterInfo &MRI,
2082   MachineIRBuilder &B) const {
2083   // TODO: Should move some of this into LegalizerHelper.
2084 
2085   // TODO: Promote dynamic indexing of s16 to s32
2086 
2087   // FIXME: Artifact combiner probably should have replaced the truncated
2088   // constant before this, so we shouldn't need
2089   // getConstantVRegValWithLookThrough.
2090   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2091     MI.getOperand(2).getReg(), MRI);
2092   if (!IdxVal) // Dynamic case will be selected to register indexing.
2093     return true;
2094 
2095   Register Dst = MI.getOperand(0).getReg();
2096   Register Vec = MI.getOperand(1).getReg();
2097 
2098   LLT VecTy = MRI.getType(Vec);
2099   LLT EltTy = VecTy.getElementType();
2100   assert(EltTy == MRI.getType(Dst));
2101 
2102   if (IdxVal->Value < VecTy.getNumElements())
2103     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2104   else
2105     B.buildUndef(Dst);
2106 
2107   MI.eraseFromParent();
2108   return true;
2109 }
2110 
2111 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2112   MachineInstr &MI, MachineRegisterInfo &MRI,
2113   MachineIRBuilder &B) const {
2114   // TODO: Should move some of this into LegalizerHelper.
2115 
2116   // TODO: Promote dynamic indexing of s16 to s32
2117 
2118   // FIXME: Artifact combiner probably should have replaced the truncated
2119   // constant before this, so we shouldn't need
2120   // getConstantVRegValWithLookThrough.
2121   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2122     MI.getOperand(3).getReg(), MRI);
2123   if (!IdxVal) // Dynamic case will be selected to register indexing.
2124     return true;
2125 
2126   Register Dst = MI.getOperand(0).getReg();
2127   Register Vec = MI.getOperand(1).getReg();
2128   Register Ins = MI.getOperand(2).getReg();
2129 
2130   LLT VecTy = MRI.getType(Vec);
2131   LLT EltTy = VecTy.getElementType();
2132   assert(EltTy == MRI.getType(Ins));
2133 
2134   if (IdxVal->Value < VecTy.getNumElements())
2135     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2136   else
2137     B.buildUndef(Dst);
2138 
2139   MI.eraseFromParent();
2140   return true;
2141 }
2142 
2143 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2144   MachineInstr &MI, MachineRegisterInfo &MRI,
2145   MachineIRBuilder &B) const {
2146   const LLT V2S16 = LLT::vector(2, 16);
2147 
2148   Register Dst = MI.getOperand(0).getReg();
2149   Register Src0 = MI.getOperand(1).getReg();
2150   LLT DstTy = MRI.getType(Dst);
2151   LLT SrcTy = MRI.getType(Src0);
2152 
2153   if (SrcTy == V2S16 && DstTy == V2S16 &&
2154       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2155     return true;
2156 
2157   MachineIRBuilder HelperBuilder(MI);
2158   GISelObserverWrapper DummyObserver;
2159   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2160   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2161 }
2162 
2163 bool AMDGPULegalizerInfo::legalizeSinCos(
2164   MachineInstr &MI, MachineRegisterInfo &MRI,
2165   MachineIRBuilder &B) const {
2166 
2167   Register DstReg = MI.getOperand(0).getReg();
2168   Register SrcReg = MI.getOperand(1).getReg();
2169   LLT Ty = MRI.getType(DstReg);
2170   unsigned Flags = MI.getFlags();
2171 
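  // The hardware sin/cos take the angle pre-divided by 2*pi (i.e. in
  // revolutions), so scale by 1/(2*pi) first. Subtargets with a reduced trig
  // range additionally need the input wrapped into [0, 1) with amdgcn.fract.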
2172   Register TrigVal;
2173   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2174   if (ST.hasTrigReducedRange()) {
2175     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2176     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2177       .addUse(MulVal.getReg(0))
2178       .setMIFlags(Flags).getReg(0);
2179   } else
2180     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2181 
2182   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2183     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2184   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2185     .addUse(TrigVal)
2186     .setMIFlags(Flags);
2187   MI.eraseFromParent();
2188   return true;
2189 }
2190 
2191 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2192                                                   MachineIRBuilder &B,
2193                                                   const GlobalValue *GV,
2194                                                   int64_t Offset,
2195                                                   unsigned GAFlags) const {
2196   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2197   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2198   // to the following code sequence:
2199   //
2200   // For constant address space:
2201   //   s_getpc_b64 s[0:1]
2202   //   s_add_u32 s0, s0, $symbol
2203   //   s_addc_u32 s1, s1, 0
2204   //
2205   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2206   //   a fixup or relocation is emitted to replace $symbol with a literal
2207   //   constant, which is a pc-relative offset from the encoding of the $symbol
2208   //   operand to the global variable.
2209   //
2210   // For global address space:
2211   //   s_getpc_b64 s[0:1]
2212   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2213   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2214   //
2215   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2216   //   fixups or relocations are emitted to replace $symbol@*@lo and
2217   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2218   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2219   //   operand to the global variable.
2220   //
2221   // What we want here is an offset from the value returned by s_getpc
2222   // (which is the address of the s_add_u32 instruction) to the global
2223   // variable, but since the encoding of $symbol starts 4 bytes after the start
2224   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2225   // small. This requires us to add 4 to the global variable offset in order to
2226   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2227   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2228   // instruction.
2229 
2230   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2231 
2232   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2233     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2234 
2235   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2236     .addDef(PCReg);
2237 
2238   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2239   if (GAFlags == SIInstrInfo::MO_NONE)
2240     MIB.addImm(0);
2241   else
2242     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2243 
2244   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2245 
2246   if (PtrTy.getSizeInBits() == 32)
2247     B.buildExtract(DstReg, PCReg, 0);
2248   return true;
2249  }
2250 
2251 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2252   MachineInstr &MI, MachineRegisterInfo &MRI,
2253   MachineIRBuilder &B) const {
2254   Register DstReg = MI.getOperand(0).getReg();
2255   LLT Ty = MRI.getType(DstReg);
2256   unsigned AS = Ty.getAddressSpace();
2257 
2258   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2259   MachineFunction &MF = B.getMF();
2260   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2261 
2262   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2263     if (!MFI->isEntryFunction()) {
2264       const Function &Fn = MF.getFunction();
2265       DiagnosticInfoUnsupported BadLDSDecl(
2266         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2267         DS_Warning);
2268       Fn.getContext().diagnose(BadLDSDecl);
2269 
2270       // We currently don't have a way to correctly allocate LDS objects that
2271       // aren't directly associated with a kernel. We do force inlining of
2272       // functions that use local objects. However, if these dead functions are
2273       // not eliminated, we don't want a compile time error. Just emit a warning
2274       // and a trap, since there should be no callable path here.
2275       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2276       B.buildUndef(DstReg);
2277       MI.eraseFromParent();
2278       return true;
2279     }
2280 
2281     // TODO: We could emit code to handle the initialization somewhere.
2282     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2283       const SITargetLowering *TLI = ST.getTargetLowering();
2284       if (!TLI->shouldUseLDSConstAddress(GV)) {
2285         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2286         return true; // Leave in place;
2287       }
2288 
2289       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2290         Type *Ty = GV->getValueType();
2291         // HIP uses an unsized array `extern __shared__ T s[]` or similar
2292         // zero-sized type in other languages to declare the dynamic shared
2293         // memory whose size is not known at compile time. They will be
2294         // allocated by the runtime and placed directly after the statically
2295         // allocated ones. They all share the same offset.
2296         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2297           // Adjust alignment for that dynamic shared memory array.
2298           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2299           LLT S32 = LLT::scalar(32);
2300           auto Sz =
2301               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2302           B.buildIntToPtr(DstReg, Sz);
2303           MI.eraseFromParent();
2304           return true;
2305         }
2306       }
2307 
2308       B.buildConstant(
2309           DstReg,
2310           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2311       MI.eraseFromParent();
2312       return true;
2313     }
2314 
2315     const Function &Fn = MF.getFunction();
2316     DiagnosticInfoUnsupported BadInit(
2317       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2318     Fn.getContext().diagnose(BadInit);
2319     return true;
2320   }
2321 
2322   const SITargetLowering *TLI = ST.getTargetLowering();
2323 
2324   if (TLI->shouldEmitFixup(GV)) {
2325     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2326     MI.eraseFromParent();
2327     return true;
2328   }
2329 
2330   if (TLI->shouldEmitPCReloc(GV)) {
2331     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2332     MI.eraseFromParent();
2333     return true;
2334   }
2335 
2336   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2337   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2338 
2339   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2340       MachinePointerInfo::getGOT(MF),
2341       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2342           MachineMemOperand::MOInvariant,
2343       8 /*Size*/, Align(8));
2344 
2345   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2346 
2347   if (Ty.getSizeInBits() == 32) {
2348     // Truncate if this is a 32-bit constant address.
2349     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2350     B.buildExtract(DstReg, Load, 0);
2351   } else
2352     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2353 
2354   MI.eraseFromParent();
2355   return true;
2356 }
2357 
2358 static LLT widenToNextPowerOf2(LLT Ty) {
2359   if (Ty.isVector())
2360     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2361   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2362 }
2363 
2364 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2365                                        MachineInstr &MI) const {
2366   MachineIRBuilder &B = Helper.MIRBuilder;
2367   MachineRegisterInfo &MRI = *B.getMRI();
2368   GISelChangeObserver &Observer = Helper.Observer;
2369 
2370   Register PtrReg = MI.getOperand(1).getReg();
2371   LLT PtrTy = MRI.getType(PtrReg);
2372   unsigned AddrSpace = PtrTy.getAddressSpace();
2373 
2374   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2375     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2376     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2377     Observer.changingInstr(MI);
2378     MI.getOperand(1).setReg(Cast.getReg(0));
2379     Observer.changedInstr(MI);
2380     return true;
2381   }
2382 
2383   Register ValReg = MI.getOperand(0).getReg();
2384   LLT ValTy = MRI.getType(ValReg);
2385 
2386   MachineMemOperand *MMO = *MI.memoperands_begin();
2387   const unsigned ValSize = ValTy.getSizeInBits();
2388   const unsigned MemSize = 8 * MMO->getSize();
2389   const Align MemAlign = MMO->getAlign();
2390   const unsigned AlignInBits = 8 * MemAlign.value();
2391 
2392   // Widen non-power-of-2 loads to the alignment if needed
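  // e.g. a 96-bit (<3 x s32>) load with sufficient alignment may be widened
  // to a 128-bit load, with the extra element dropped again below.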
2393   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2394     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2395 
2396     // This was already the correct extending load result type, so just adjust
2397     // the memory type.
2398     if (WideMemSize == ValSize) {
2399       MachineFunction &MF = B.getMF();
2400 
2401       MachineMemOperand *WideMMO =
2402           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2403       Observer.changingInstr(MI);
2404       MI.setMemRefs(MF, {WideMMO});
2405       Observer.changedInstr(MI);
2406       return true;
2407     }
2408 
2409     // Don't bother handling an edge case that should probably never be produced.
2410     if (ValSize > WideMemSize)
2411       return false;
2412 
2413     LLT WideTy = widenToNextPowerOf2(ValTy);
2414 
2415     Register WideLoad;
2416     if (!WideTy.isVector()) {
2417       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2418       B.buildTrunc(ValReg, WideLoad).getReg(0);
2419     } else {
2420       // Extract the subvector.
2421 
2422       if (isRegisterType(ValTy)) {
2423         // If this a case where G_EXTRACT is legal, use it.
2424         // (e.g. <3 x s32> -> <4 x s32>)
2425         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2426         B.buildExtract(ValReg, WideLoad, 0);
2427       } else {
2428         // For cases where the widened type isn't a nice register value, unmerge
2429         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2430         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2431         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2432         B.setInsertPt(B.getMBB(), MI.getIterator());
2433         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2434       }
2435     }
2436 
2437     MI.eraseFromParent();
2438     return true;
2439   }
2440 
2441   return false;
2442 }
2443 
2444 bool AMDGPULegalizerInfo::legalizeFMad(
2445   MachineInstr &MI, MachineRegisterInfo &MRI,
2446   MachineIRBuilder &B) const {
2447   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2448   assert(Ty.isScalar());
2449 
2450   MachineFunction &MF = B.getMF();
2451   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2452 
2453   // TODO: Always legal with future ftz flag.
2454   // FIXME: Do we need just output?
2455   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2456     return true;
2457   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2458     return true;
2459 
2460   MachineIRBuilder HelperBuilder(MI);
2461   GISelObserverWrapper DummyObserver;
2462   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2463   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2464 }
2465 
2466 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2467   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2468   Register DstReg = MI.getOperand(0).getReg();
2469   Register PtrReg = MI.getOperand(1).getReg();
2470   Register CmpVal = MI.getOperand(2).getReg();
2471   Register NewVal = MI.getOperand(3).getReg();
2472 
2473   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2474          "this should not have been custom lowered");
2475 
2476   LLT ValTy = MRI.getType(CmpVal);
2477   LLT VecTy = LLT::vector(2, ValTy);
2478 
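  // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value as a
  // single packed 2-element vector operand, mirroring how the hardware
  // cmpswap instructions consume consecutive data registers.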
2479   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2480 
2481   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2482     .addDef(DstReg)
2483     .addUse(PtrReg)
2484     .addUse(PackedVal)
2485     .setMemRefs(MI.memoperands());
2486 
2487   MI.eraseFromParent();
2488   return true;
2489 }
2490 
2491 bool AMDGPULegalizerInfo::legalizeFlog(
2492   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2493   Register Dst = MI.getOperand(0).getReg();
2494   Register Src = MI.getOperand(1).getReg();
2495   LLT Ty = B.getMRI()->getType(Dst);
2496   unsigned Flags = MI.getFlags();
2497 
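  // log_b(x) = log2(x) * log_b(2); the caller passes log_b(2) as
  // Log2BaseInverted (ln(2) for G_FLOG, log10(2) for G_FLOG10).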
2498   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2499   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2500 
2501   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2502   MI.eraseFromParent();
2503   return true;
2504 }
2505 
2506 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2507                                        MachineIRBuilder &B) const {
2508   Register Dst = MI.getOperand(0).getReg();
2509   Register Src = MI.getOperand(1).getReg();
2510   unsigned Flags = MI.getFlags();
2511   LLT Ty = B.getMRI()->getType(Dst);
2512 
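  // exp(x) = exp2(x * log2(e)), so scale by log2(e) and emit G_FEXP2.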
2513   auto K = B.buildFConstant(Ty, numbers::log2e);
2514   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2515   B.buildFExp2(Dst, Mul, Flags);
2516   MI.eraseFromParent();
2517   return true;
2518 }
2519 
2520 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2521                                        MachineIRBuilder &B) const {
2522   Register Dst = MI.getOperand(0).getReg();
2523   Register Src0 = MI.getOperand(1).getReg();
2524   Register Src1 = MI.getOperand(2).getReg();
2525   unsigned Flags = MI.getFlags();
2526   LLT Ty = B.getMRI()->getType(Dst);
2527   const LLT S16 = LLT::scalar(16);
2528   const LLT S32 = LLT::scalar(32);
2529 
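  // pow(x, y) is expanded as exp2(y * log2(x)). The legacy multiply is used
  // presumably so that y == 0 yields 0 even when log2(x) is infinite, giving
  // pow(x, 0) == 1.0.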
2530   if (Ty == S32) {
2531     auto Log = B.buildFLog2(S32, Src0, Flags);
2532     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2533       .addUse(Log.getReg(0))
2534       .addUse(Src1)
2535       .setMIFlags(Flags);
2536     B.buildFExp2(Dst, Mul, Flags);
2537   } else if (Ty == S16) {
2538     // There's no f16 fmul_legacy, so we need to convert for it.
2539     auto Log = B.buildFLog2(S16, Src0, Flags);
2540     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2541     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2542     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2543       .addUse(Ext0.getReg(0))
2544       .addUse(Ext1.getReg(0))
2545       .setMIFlags(Flags);
2546 
2547     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2548   } else
2549     return false;
2550 
2551   MI.eraseFromParent();
2552   return true;
2553 }
2554 
2555 // Find a source register, ignoring any possible source modifiers.
2556 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2557   Register ModSrc = OrigSrc;
2558   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2559     ModSrc = SrcFNeg->getOperand(1).getReg();
2560     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2561       ModSrc = SrcFAbs->getOperand(1).getReg();
2562   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2563     ModSrc = SrcFAbs->getOperand(1).getReg();
2564   return ModSrc;
2565 }
2566 
2567 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2568                                          MachineRegisterInfo &MRI,
2569                                          MachineIRBuilder &B) const {
2570 
2571   const LLT S1 = LLT::scalar(1);
2572   const LLT S64 = LLT::scalar(64);
2573   Register Dst = MI.getOperand(0).getReg();
2574   Register OrigSrc = MI.getOperand(1).getReg();
2575   unsigned Flags = MI.getFlags();
2576   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2577          "this should not have been custom lowered");
2578 
2579   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2580   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2581   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2582   // V_FRACT bug is:
2583   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2584   //
2585   // Convert floor(x) to (x - fract(x))
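  //
  // e.g. x = -1.25: fract(x) = x - floor(x) = 0.75, so
  // x - fract(x) = -2.0 == floor(-1.25).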
2586 
2587   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2588     .addUse(OrigSrc)
2589     .setMIFlags(Flags);
2590 
2591   // Give source modifier matching some assistance before obscuring a foldable
2592   // pattern.
2593 
2594   // TODO: We can avoid the neg on the fract? The input sign to fract
2595   // shouldn't matter?
2596   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2597 
2598   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2599 
2600   Register Min = MRI.createGenericVirtualRegister(S64);
2601 
2602   // We don't need to concern ourselves with the snan handling difference, so
2603   // use the one which will directly select.
2604   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2605   if (MFI->getMode().IEEE)
2606     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2607   else
2608     B.buildFMinNum(Min, Fract, Const, Flags);
2609 
2610   Register CorrectedFract = Min;
2611   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2612     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2613     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2614   }
2615 
2616   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2617   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2618 
2619   MI.eraseFromParent();
2620   return true;
2621 }
2622 
2623 // Turn an illegal packed v2s16 build vector into bit operations.
2624 // TODO: This should probably be a bitcast action in LegalizerHelper.
2625 bool AMDGPULegalizerInfo::legalizeBuildVector(
2626   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2627   Register Dst = MI.getOperand(0).getReg();
2628   const LLT S32 = LLT::scalar(32);
2629   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2630 
2631   Register Src0 = MI.getOperand(1).getReg();
2632   Register Src1 = MI.getOperand(2).getReg();
2633   assert(MRI.getType(Src0) == LLT::scalar(16));
2634 
2635   auto Merge = B.buildMerge(S32, {Src0, Src1});
2636   B.buildBitcast(Dst, Merge);
2637 
2638   MI.eraseFromParent();
2639   return true;
2640 }
2641 
2642 // Check that this is a G_XOR x, -1
2643 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2644   if (MI.getOpcode() != TargetOpcode::G_XOR)
2645     return false;
2646   auto ConstVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
2647   return ConstVal && *ConstVal == -1;
2648 }
2649 
2650 // Return the use branch instruction, otherwise null if the usage is invalid.
2651 static MachineInstr *
2652 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2653                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2654   Register CondDef = MI.getOperand(0).getReg();
2655   if (!MRI.hasOneNonDBGUse(CondDef))
2656     return nullptr;
2657 
2658   MachineBasicBlock *Parent = MI.getParent();
2659   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2660 
2661   if (isNot(MRI, *UseMI)) {
2662     Register NegatedCond = UseMI->getOperand(0).getReg();
2663     if (!MRI.hasOneNonDBGUse(NegatedCond))
2664       return nullptr;
2665 
2666     // We're deleting the def of this value, so we need to remove it.
2667     UseMI->eraseFromParent();
2668 
2669     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2670     Negated = true;
2671   }
2672 
2673   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2674     return nullptr;
2675 
2676   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2677   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2678   if (Next == Parent->end()) {
2679     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2680     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2681       return nullptr;
2682     UncondBrTarget = &*NextMBB;
2683   } else {
2684     if (Next->getOpcode() != AMDGPU::G_BR)
2685       return nullptr;
2686     Br = &*Next;
2687     UncondBrTarget = Br->getOperand(0).getMBB();
2688   }
2689 
2690   return UseMI;
2691 }
2692 
2693 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2694                                          const ArgDescriptor *Arg,
2695                                          const TargetRegisterClass *ArgRC,
2696                                          LLT ArgTy) const {
2697   MCRegister SrcReg = Arg->getRegister();
2698   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2699   assert(DstReg.isVirtual() && "Virtual register expected");
2700 
2701   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2702                                              ArgTy);
2703   if (Arg->isMasked()) {
2704     // TODO: Should we try to emit this once in the entry block?
2705     const LLT S32 = LLT::scalar(32);
2706     const unsigned Mask = Arg->getMask();
2707     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2708 
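    // e.g. a (hypothetical) descriptor with Mask = 0x3ff0 gives Shift = 4:
    // shift the live-in right by 4, then mask with 0x3ff.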
2709     Register AndMaskSrc = LiveIn;
2710 
2711     if (Shift != 0) {
2712       auto ShiftAmt = B.buildConstant(S32, Shift);
2713       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2714     }
2715 
2716     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2717   } else {
2718     B.buildCopy(DstReg, LiveIn);
2719   }
2720 
2721   return true;
2722 }
2723 
2724 bool AMDGPULegalizerInfo::loadInputValue(
2725     Register DstReg, MachineIRBuilder &B,
2726     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2727   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2728   const ArgDescriptor *Arg;
2729   const TargetRegisterClass *ArgRC;
2730   LLT ArgTy;
2731   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2732 
2733   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2734     return false; // TODO: Handle these
2735   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2736 }
2737 
2738 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2739     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2740     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2741   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2742     return false;
2743 
2744   MI.eraseFromParent();
2745   return true;
2746 }
2747 
2748 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2749                                        MachineRegisterInfo &MRI,
2750                                        MachineIRBuilder &B) const {
2751   Register Dst = MI.getOperand(0).getReg();
2752   LLT DstTy = MRI.getType(Dst);
2753   LLT S16 = LLT::scalar(16);
2754   LLT S32 = LLT::scalar(32);
2755   LLT S64 = LLT::scalar(64);
2756 
2757   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2758     return true;
2759 
2760   if (DstTy == S16)
2761     return legalizeFDIV16(MI, MRI, B);
2762   if (DstTy == S32)
2763     return legalizeFDIV32(MI, MRI, B);
2764   if (DstTy == S64)
2765     return legalizeFDIV64(MI, MRI, B);
2766 
2767   return false;
2768 }
2769 
2770 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2771                                                   Register DstReg,
2772                                                   Register X,
2773                                                   Register Y,
2774                                                   bool IsDiv) const {
2775   const LLT S1 = LLT::scalar(1);
2776   const LLT S32 = LLT::scalar(32);
2777 
2778   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2779   // algorithm used here.
2780 
2781   // Initial estimate of inv(y).
2782   auto FloatY = B.buildUITOFP(S32, Y);
2783   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2784   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2785   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2786   auto Z = B.buildFPTOUI(S32, ScaledY);
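       // Note: Z now approximates (2^32 - 512) / Y; 0x4f7ffffe is an f32 value just
       // below 2^32, which keeps the fixed-point reciprocal estimate from
       // overshooting.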
2787 
2788   // One round of UNR (unsigned Newton-Raphson).
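       // The estimate's error is (-Y * Z) mod 2^32; adding the high half of
       // Z * error refines Z toward floor(2^32 / Y).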
2789   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2790   auto NegYZ = B.buildMul(S32, NegY, Z);
2791   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2792 
2793   // Quotient/remainder estimate.
2794   auto Q = B.buildUMulH(S32, X, Z);
2795   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2796 
2797   // First quotient/remainder refinement.
2798   auto One = B.buildConstant(S32, 1);
2799   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2800   if (IsDiv)
2801     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2802   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2803 
2804   // Second quotient/remainder refinement.
2805   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2806   if (IsDiv)
2807     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2808   else
2809     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2810 }
2811 
2812 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2813                                               MachineRegisterInfo &MRI,
2814                                               MachineIRBuilder &B) const {
2815   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2816   Register DstReg = MI.getOperand(0).getReg();
2817   Register Num = MI.getOperand(1).getReg();
2818   Register Den = MI.getOperand(2).getReg();
2819   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2820   MI.eraseFromParent();
2821   return true;
2822 }
2823 
2824 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2825 //
2826 // Return lo, hi of result
2827 //
2828 // %cvt.lo = G_UITOFP Val.lo
2829 // %cvt.hi = G_UITOFP Val.hi
2830 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2831 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2832 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2833 // %mul2 = G_FMUL %mul1, 2**(-32)
2834 // %trunc = G_INTRINSIC_TRUNC %mul2
2835 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2836 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2837 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2838                                                        Register Val) {
2839   const LLT S32 = LLT::scalar(32);
2840   auto Unmerge = B.buildUnmerge(S32, Val);
2841 
2842   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2843   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2844 
2845   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2846                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2847 
2848   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2849   auto Mul1 =
2850       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
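       // Note: 0x5f7ffffc is an f32 value just below 2^64 (2^64 - 2^42), scaling the
       // reciprocal into 64-bit fixed point.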
2851 
2852   // 2**(-32)
2853   auto Mul2 =
2854       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2855   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2856 
2857   // -(2**32)
2858   auto Mad2 = B.buildFMAD(S32, Trunc,
2859                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2860 
2861   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2862   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2863 
2864   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2865 }
2866 
2867 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2868                                                   Register DstReg,
2869                                                   Register Numer,
2870                                                   Register Denom,
2871                                                   bool IsDiv) const {
2872   const LLT S32 = LLT::scalar(32);
2873   const LLT S64 = LLT::scalar(64);
2874   const LLT S1 = LLT::scalar(1);
2875   Register RcpLo, RcpHi;
2876 
2877   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
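       // This mirrors the 32-bit expansion above: refine a fixed-point reciprocal
       // estimate, form quotient/remainder estimates, then apply up to two
       // correction steps, with the 64-bit arithmetic written as 32-bit halves and
       // explicit carry chains.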
2878 
2879   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2880 
2881   auto Zero64 = B.buildConstant(S64, 0);
2882   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2883 
2884   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2885   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2886 
2887   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2888   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2889   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2890 
2891   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2892   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2893   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2894   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2895 
2896   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2897   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2898   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2899   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2900   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2901 
2902   auto Zero32 = B.buildConstant(S32, 0);
2903   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2904   auto Add2_HiC =
2905       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2906   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2907   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2908 
2909   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2910   Register NumerLo = UnmergeNumer.getReg(0);
2911   Register NumerHi = UnmergeNumer.getReg(1);
2912 
2913   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2914   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2915   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2916   Register Mul3_Lo = UnmergeMul3.getReg(0);
2917   Register Mul3_Hi = UnmergeMul3.getReg(1);
2918   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2919   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2920   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2921   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2922 
2923   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2924   Register DenomLo = UnmergeDenom.getReg(0);
2925   Register DenomHi = UnmergeDenom.getReg(1);
2926 
2927   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2928   auto C1 = B.buildSExt(S32, CmpHi);
2929 
2930   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2931   auto C2 = B.buildSExt(S32, CmpLo);
2932 
2933   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2934   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2935 
2936   // TODO: Here and below portions of the code can be enclosed into if/endif.
2937   // Currently control flow is unconditional and we have 4 selects after
2938   // potential endif to substitute PHIs.
2939 
2940   // if C3 != 0 ...
2941   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2942   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2943   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2944   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2945 
2946   auto One64 = B.buildConstant(S64, 1);
2947   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2948 
2949   auto C4 =
2950       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2951   auto C5 =
2952       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2953   auto C6 = B.buildSelect(
2954       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2955 
2956   // if (C6 != 0)
2957   auto Add4 = B.buildAdd(S64, Add3, One64);
2958   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2959 
2960   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2961   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2962   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2963 
2964   // endif C6
2965   // endif C3
2966 
2967   if (IsDiv) {
2968     auto Sel1 = B.buildSelect(
2969         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2970     B.buildSelect(DstReg,
2971                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2972   } else {
2973     auto Sel2 = B.buildSelect(
2974         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2975     B.buildSelect(DstReg,
2976                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2977   }
2978 }
2979 
2980 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2981                                             MachineRegisterInfo &MRI,
2982                                             MachineIRBuilder &B) const {
2983   const LLT S64 = LLT::scalar(64);
2984   const LLT S32 = LLT::scalar(32);
2985   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2986   Register DstReg = MI.getOperand(0).getReg();
2987   Register Num = MI.getOperand(1).getReg();
2988   Register Den = MI.getOperand(2).getReg();
2989   LLT Ty = MRI.getType(DstReg);
2990 
2991   if (Ty == S32)
2992     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2993   else if (Ty == S64)
2994     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2995   else
2996     return false;
2997 
2998   MI.eraseFromParent();
2999   return true;
3001 }
3002 
3003 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3004                                             MachineRegisterInfo &MRI,
3005                                             MachineIRBuilder &B) const {
3006   const LLT S64 = LLT::scalar(64);
3007   const LLT S32 = LLT::scalar(32);
3008 
3009   Register DstReg = MI.getOperand(0).getReg();
3010   const LLT Ty = MRI.getType(DstReg);
3011   if (Ty != S32 && Ty != S64)
3012     return false;
3013 
3014   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3015 
3016   Register LHS = MI.getOperand(1).getReg();
3017   Register RHS = MI.getOperand(2).getReg();
3018 
3019   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3020   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3021   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
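       // LHSign/RHSign are 0 or -1. (x + sign) ^ sign is |x| in two's complement, so
       // the unsigned expansion below operates on absolute values.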
3022 
3023   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3024   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3025 
3026   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3027   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3028 
3029   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3030   if (Ty == S32)
3031     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3032   else
3033     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3034 
3035   Register Sign;
3036   if (IsDiv)
3037     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3038   else
3039     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3040 
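       // Reapply the sign: (r ^ Sign) - Sign negates the unsigned result when Sign
       // is all ones and is a no-op when Sign is zero.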
3041   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3042   B.buildSub(DstReg, UDivRem, Sign);
3043 
3044   MI.eraseFromParent();
3045   return true;
3046 }
3047 
3048 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3049                                                  MachineRegisterInfo &MRI,
3050                                                  MachineIRBuilder &B) const {
3051   Register Res = MI.getOperand(0).getReg();
3052   Register LHS = MI.getOperand(1).getReg();
3053   Register RHS = MI.getOperand(2).getReg();
3054 
3055   uint16_t Flags = MI.getFlags();
3056 
3057   LLT ResTy = MRI.getType(Res);
3058   LLT S32 = LLT::scalar(32);
3059   LLT S64 = LLT::scalar(64);
3060 
3061   const MachineFunction &MF = B.getMF();
3062   bool Unsafe =
3063     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
3064 
3065   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
3066     return false;
3067 
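       // With FP32 denormals enabled, the rcp-based shortcuts below are only taken
       // under unsafe/arcp math (presumably because the rcp approximation is not
       // denormal-accurate); otherwise fall through to the full expansion.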
3068   if (!Unsafe && ResTy == S32 &&
3069       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
3070     return false;
3071 
3072   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3073     // 1 / x -> RCP(x)
3074     if (CLHS->isExactlyValue(1.0)) {
3075       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3076         .addUse(RHS)
3077         .setMIFlags(Flags);
3078 
3079       MI.eraseFromParent();
3080       return true;
3081     }
3082 
3083     // -1 / x -> RCP( FNEG(x) )
3084     if (CLHS->isExactlyValue(-1.0)) {
3085       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3086       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3087         .addUse(FNeg.getReg(0))
3088         .setMIFlags(Flags);
3089 
3090       MI.eraseFromParent();
3091       return true;
3092     }
3093   }
3094 
3095   // x / y -> x * (1.0 / y)
3096   if (Unsafe) {
3097     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3098       .addUse(RHS)
3099       .setMIFlags(Flags);
3100     B.buildFMul(Res, LHS, RCP, Flags);
3101 
3102     MI.eraseFromParent();
3103     return true;
3104   }
3105 
3106   return false;
3107 }
3108 
3109 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3110                                          MachineRegisterInfo &MRI,
3111                                          MachineIRBuilder &B) const {
3112   Register Res = MI.getOperand(0).getReg();
3113   Register LHS = MI.getOperand(1).getReg();
3114   Register RHS = MI.getOperand(2).getReg();
3115 
3116   uint16_t Flags = MI.getFlags();
3117 
3118   LLT S16 = LLT::scalar(16);
3119   LLT S32 = LLT::scalar(32);
3120 
3121   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3122   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3123 
3124   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3125     .addUse(RHSExt.getReg(0))
3126     .setMIFlags(Flags);
3127 
3128   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3129   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3130 
3131   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3132     .addUse(RDst.getReg(0))
3133     .addUse(RHS)
3134     .addUse(LHS)
3135     .setMIFlags(Flags);
3136 
3137   MI.eraseFromParent();
3138   return true;
3139 }
3140 
3141 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3142 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3143 static void toggleSPDenormMode(bool Enable,
3144                                MachineIRBuilder &B,
3145                                const GCNSubtarget &ST,
3146                                AMDGPU::SIModeRegisterDefaults Mode) {
3147   // Set SP denorm mode to this value.
3148   unsigned SPDenormMode =
3149     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3150 
3151   if (ST.hasDenormModeInst()) {
3152     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3153     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3154 
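         // The S_DENORM_MODE immediate packs the FP32 control in bits [1:0] and the
         // FP64/FP16 control in bits [3:2].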
3155     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3156     B.buildInstr(AMDGPU::S_DENORM_MODE)
3157       .addImm(NewDenormModeValue);
3158 
3159   } else {
3160     // Select FP32 bit field in mode register.
3161     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3162                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3163                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
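         // i.e. hwreg(HW_REG_MODE, 4, 2): a two-bit field at offset 4 of the MODE
         // register holds the FP32 denorm controls (WIDTH_M1 encodes width - 1).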
3164 
3165     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3166       .addImm(SPDenormMode)
3167       .addImm(SPDenormModeBitField);
3168   }
3169 }
3170 
3171 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3172                                          MachineRegisterInfo &MRI,
3173                                          MachineIRBuilder &B) const {
3174   Register Res = MI.getOperand(0).getReg();
3175   Register LHS = MI.getOperand(1).getReg();
3176   Register RHS = MI.getOperand(2).getReg();
3177   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3178   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3179 
3180   uint16_t Flags = MI.getFlags();
3181 
3182   LLT S32 = LLT::scalar(32);
3183   LLT S1 = LLT::scalar(1);
3184 
3185   auto One = B.buildFConstant(S32, 1.0f);
3186 
3187   auto DenominatorScaled =
3188     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3189       .addUse(LHS)
3190       .addUse(RHS)
3191       .addImm(0)
3192       .setMIFlags(Flags);
3193   auto NumeratorScaled =
3194     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3195       .addUse(LHS)
3196       .addUse(RHS)
3197       .addImm(1)
3198       .setMIFlags(Flags);
3199 
3200   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3201     .addUse(DenominatorScaled.getReg(0))
3202     .setMIFlags(Flags);
3203   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3204 
3205   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3206   // aren't modeled as reading it.
3207   if (!Mode.allFP32Denormals())
3208     toggleSPDenormMode(true, B, ST, Mode);
3209 
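       // One Newton-Raphson refinement of the scaled reciprocal (Fma0/Fma1), a
       // quotient estimate and refinement (Mul/Fma2/Fma3), and a final residual
       // (Fma4) that feeds div_fmas below; div_fixup then handles the special cases.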
3210   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3211   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3212   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3213   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3214   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3215   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3216 
3217   if (!Mode.allFP32Denormals())
3218     toggleSPDenormMode(false, B, ST, Mode);
3219 
3220   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3221     .addUse(Fma4.getReg(0))
3222     .addUse(Fma1.getReg(0))
3223     .addUse(Fma3.getReg(0))
3224     .addUse(NumeratorScaled.getReg(1))
3225     .setMIFlags(Flags);
3226 
3227   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3228     .addUse(Fmas.getReg(0))
3229     .addUse(RHS)
3230     .addUse(LHS)
3231     .setMIFlags(Flags);
3232 
3233   MI.eraseFromParent();
3234   return true;
3235 }
3236 
3237 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3238                                          MachineRegisterInfo &MRI,
3239                                          MachineIRBuilder &B) const {
3240   Register Res = MI.getOperand(0).getReg();
3241   Register LHS = MI.getOperand(1).getReg();
3242   Register RHS = MI.getOperand(2).getReg();
3243 
3244   uint16_t Flags = MI.getFlags();
3245 
3246   LLT S64 = LLT::scalar(64);
3247   LLT S1 = LLT::scalar(1);
3248 
3249   auto One = B.buildFConstant(S64, 1.0);
3250 
3251   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3252     .addUse(LHS)
3253     .addUse(RHS)
3254     .addImm(0)
3255     .setMIFlags(Flags);
3256 
3257   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3258 
3259   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3260     .addUse(DivScale0.getReg(0))
3261     .setMIFlags(Flags);
3262 
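       // Two Newton-Raphson refinements of the reciprocal of the scaled denominator
       // (Fma0-Fma3), then a quotient estimate (Mul) and a final residual (Fma4) for
       // div_fmas; div_fixup handles the special cases.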
3263   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3264   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3265   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3266 
3267   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3268     .addUse(LHS)
3269     .addUse(RHS)
3270     .addImm(1)
3271     .setMIFlags(Flags);
3272 
3273   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3274   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3275   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3276 
3277   Register Scale;
3278   if (!ST.hasUsableDivScaleConditionOutput()) {
3279     // Workaround a hardware bug on SI where the condition output from div_scale
3280     // is not usable.
3281 
3282     LLT S32 = LLT::scalar(32);
3283 
3284     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3285     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3286     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3287     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3288 
3289     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3290                               Scale1Unmerge.getReg(1));
3291     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3292                               Scale0Unmerge.getReg(1));
3293     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3294   } else {
3295     Scale = DivScale1.getReg(1);
3296   }
3297 
3298   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3299     .addUse(Fma4.getReg(0))
3300     .addUse(Fma3.getReg(0))
3301     .addUse(Mul.getReg(0))
3302     .addUse(Scale)
3303     .setMIFlags(Flags);
3304 
3305   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3306     .addUse(Fmas.getReg(0))
3307     .addUse(RHS)
3308     .addUse(LHS)
3309     .setMIFlags(Flags);
3310 
3311   MI.eraseFromParent();
3312   return true;
3313 }
3314 
3315 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3316                                                  MachineRegisterInfo &MRI,
3317                                                  MachineIRBuilder &B) const {
3318   Register Res = MI.getOperand(0).getReg();
3319   Register LHS = MI.getOperand(2).getReg();
3320   Register RHS = MI.getOperand(3).getReg();
3321   uint16_t Flags = MI.getFlags();
3322 
3323   LLT S32 = LLT::scalar(32);
3324   LLT S1 = LLT::scalar(1);
3325 
3326   auto Abs = B.buildFAbs(S32, RHS, Flags);
3327   const APFloat C0Val(1.0f);
3328 
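       // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32: when |RHS| is very
       // large, the denominator is pre-scaled by 2^-32 before the reciprocal and the
       // same factor is applied again at the end to compensate (presumably to keep
       // the intermediate values in range).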
3329   auto C0 = B.buildConstant(S32, 0x6f800000);
3330   auto C1 = B.buildConstant(S32, 0x2f800000);
3331   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3332 
3333   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3334   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3335 
3336   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3337 
3338   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3339     .addUse(Mul0.getReg(0))
3340     .setMIFlags(Flags);
3341 
3342   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3343 
3344   B.buildFMul(Res, Sel, Mul1, Flags);
3345 
3346   MI.eraseFromParent();
3347   return true;
3348 }
3349 
3350 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3351 // FIXME: Why do we handle this one but not other removed instructions?
3352 //
3353 // Reciprocal square root.  The clamp prevents infinite results, clamping
3354 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3355 // +-max_float.
3356 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3357                                                     MachineRegisterInfo &MRI,
3358                                                     MachineIRBuilder &B) const {
3359   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3360     return true;
3361 
3362   Register Dst = MI.getOperand(0).getReg();
3363   Register Src = MI.getOperand(2).getReg();
3364   auto Flags = MI.getFlags();
3365 
3366   LLT Ty = MRI.getType(Dst);
3367 
3368   const fltSemantics *FltSemantics;
3369   if (Ty == LLT::scalar(32))
3370     FltSemantics = &APFloat::IEEEsingle();
3371   else if (Ty == LLT::scalar(64))
3372     FltSemantics = &APFloat::IEEEdouble();
3373   else
3374     return false;
3375 
3376   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3377     .addUse(Src)
3378     .setMIFlags(Flags);
3379 
3380   // We don't need to concern ourselves with the snan handling difference, since
3381   // the rsq has already quieted it (or not); use the variant that will directly select.
3382   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3383   const bool UseIEEE = MFI->getMode().IEEE;
3384 
3385   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3386   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3387                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3388 
3389   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3390 
3391   if (UseIEEE)
3392     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3393   else
3394     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3395   MI.eraseFromParent();
3396   return true;
3397 }
3398 
3399 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3400   switch (IID) {
3401   case Intrinsic::amdgcn_ds_fadd:
3402     return AMDGPU::G_ATOMICRMW_FADD;
3403   case Intrinsic::amdgcn_ds_fmin:
3404     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3405   case Intrinsic::amdgcn_ds_fmax:
3406     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3407   default:
3408     llvm_unreachable("not a DS FP intrinsic");
3409   }
3410 }
3411 
3412 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3413                                                       MachineInstr &MI,
3414                                                       Intrinsic::ID IID) const {
3415   GISelChangeObserver &Observer = Helper.Observer;
3416   Observer.changingInstr(MI);
3417 
3418   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3419 
3420   // The remaining operands were used to set fields in the MemOperand on
3421   // construction.
3422   for (int I = 6; I > 3; --I)
3423     MI.RemoveOperand(I);
3424 
3425   MI.RemoveOperand(1); // Remove the intrinsic ID.
3426   Observer.changedInstr(MI);
3427   return true;
3428 }
3429 
3430 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3431                                             MachineRegisterInfo &MRI,
3432                                             MachineIRBuilder &B) const {
3433   uint64_t Offset =
3434     ST.getTargetLowering()->getImplicitParameterOffset(
3435       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3436   LLT DstTy = MRI.getType(DstReg);
3437   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3438 
3439   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3440   if (!loadInputValue(KernargPtrReg, B,
3441                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3442     return false;
3443 
3444   // FIXME: This should be nuw
3445   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3446   return true;
3447 }
3448 
3449 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3450                                                  MachineRegisterInfo &MRI,
3451                                                  MachineIRBuilder &B) const {
3452   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3453   if (!MFI->isEntryFunction()) {
3454     return legalizePreloadedArgIntrin(MI, MRI, B,
3455                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3456   }
3457 
3458   Register DstReg = MI.getOperand(0).getReg();
3459   if (!getImplicitArgPtr(DstReg, MRI, B))
3460     return false;
3461 
3462   MI.eraseFromParent();
3463   return true;
3464 }
3465 
3466 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3467                                               MachineRegisterInfo &MRI,
3468                                               MachineIRBuilder &B,
3469                                               unsigned AddrSpace) const {
3470   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3471   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3472   Register Hi32 = Unmerge.getReg(1);
3473 
3474   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3475   MI.eraseFromParent();
3476   return true;
3477 }
3478 
3479 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3480 // offset (the offset that is included in bounds checking and swizzling, to be
3481 // split between the instruction's voffset and immoffset fields) and soffset
3482 // (the offset that is excluded from bounds checking and swizzling, to go in
3483 // the instruction's soffset field).  This function takes the first kind of
3484 // offset and figures out how to split it between voffset and immoffset.
3485 std::tuple<Register, unsigned, unsigned>
3486 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3487                                         Register OrigOffset) const {
3488   const unsigned MaxImm = 4095;
3489   Register BaseReg;
3490   unsigned TotalConstOffset;
3491   const LLT S32 = LLT::scalar(32);
3492 
3493   std::tie(BaseReg, TotalConstOffset) =
3494       AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3495 
3496   unsigned ImmOffset = TotalConstOffset;
3497 
3498   // If the immediate value is too big for the immoffset field, put the value
3499   // and -4096 into the immoffset field so that the value that is copied/added
3500   // for the voffset field is a multiple of 4096, and it stands more chance
3501   // of being CSEd with the copy/add for another similar load/store.
3502   // However, do not do that rounding down to a multiple of 4096 if that is a
3503   // negative number, as it appears to be illegal to have a negative offset
3504   // in the vgpr, even if adding the immediate offset makes it positive.
3505   unsigned Overflow = ImmOffset & ~MaxImm;
3506   ImmOffset -= Overflow;
3507   if ((int32_t)Overflow < 0) {
3508     Overflow += ImmOffset;
3509     ImmOffset = 0;
3510   }
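       // Illustrative example (not from the source): a constant offset of 8200
       // splits into ImmOffset = 8, with Overflow = 8192 folded into the voffset
       // register below.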
3511 
3512   if (Overflow != 0) {
3513     if (!BaseReg) {
3514       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3515     } else {
3516       auto OverflowVal = B.buildConstant(S32, Overflow);
3517       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3518     }
3519   }
3520 
3521   if (!BaseReg)
3522     BaseReg = B.buildConstant(S32, 0).getReg(0);
3523 
3524   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3525 }
3526 
3527 /// Handle the register layout difference for f16 images on some subtargets.
3528 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3529                                              MachineRegisterInfo &MRI,
3530                                              Register Reg,
3531                                              bool ImageStore) const {
3532   const LLT S16 = LLT::scalar(16);
3533   const LLT S32 = LLT::scalar(32);
3534   LLT StoreVT = MRI.getType(Reg);
3535   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3536 
3537   if (ST.hasUnpackedD16VMem()) {
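         // On unpacked subtargets each 16-bit element occupies the low half of a
         // 32-bit register, so any-extend every element and rebuild the value as
         // <N x s32>.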
3538     auto Unmerge = B.buildUnmerge(S16, Reg);
3539 
3540     SmallVector<Register, 4> WideRegs;
3541     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3542       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3543 
3544     int NumElts = StoreVT.getNumElements();
3545 
3546     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3547   }
3548 
3549   if (ImageStore && ST.hasImageStoreD16Bug()) {
3550     if (StoreVT.getNumElements() == 2) {
3551       SmallVector<Register, 4> PackedRegs;
3552       Reg = B.buildBitcast(S32, Reg).getReg(0);
3553       PackedRegs.push_back(Reg);
3554       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3555       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3556     }
3557 
3558     if (StoreVT.getNumElements() == 3) {
3559       SmallVector<Register, 4> PackedRegs;
3560       auto Unmerge = B.buildUnmerge(S16, Reg);
3561       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3562         PackedRegs.push_back(Unmerge.getReg(I));
3563       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3564       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3565       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3566     }
3567 
3568     if (StoreVT.getNumElements() == 4) {
3569       SmallVector<Register, 4> PackedRegs;
3570       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3571       auto Unmerge = B.buildUnmerge(S32, Reg);
3572       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3573         PackedRegs.push_back(Unmerge.getReg(I));
3574       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3575       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3576     }
3577 
3578     llvm_unreachable("invalid data type");
3579   }
3580 
3581   return Reg;
3582 }
3583 
3584 Register AMDGPULegalizerInfo::fixStoreSourceType(
3585   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3586   MachineRegisterInfo *MRI = B.getMRI();
3587   LLT Ty = MRI->getType(VData);
3588 
3589   const LLT S16 = LLT::scalar(16);
3590 
3591   // Fixup illegal register types for i8 stores.
3592   if (Ty == LLT::scalar(8) || Ty == S16) {
3593     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3594     return AnyExt;
3595   }
3596 
3597   if (Ty.isVector()) {
3598     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3599       if (IsFormat)
3600         return handleD16VData(B, *MRI, VData);
3601     }
3602   }
3603 
3604   return VData;
3605 }
3606 
3607 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3608                                               MachineRegisterInfo &MRI,
3609                                               MachineIRBuilder &B,
3610                                               bool IsTyped,
3611                                               bool IsFormat) const {
3612   Register VData = MI.getOperand(1).getReg();
3613   LLT Ty = MRI.getType(VData);
3614   LLT EltTy = Ty.getScalarType();
3615   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3616   const LLT S32 = LLT::scalar(32);
3617 
3618   VData = fixStoreSourceType(B, VData, IsFormat);
3619   Register RSrc = MI.getOperand(2).getReg();
3620 
3621   MachineMemOperand *MMO = *MI.memoperands_begin();
3622   const int MemSize = MMO->getSize();
3623 
3624   unsigned ImmOffset;
3625   unsigned TotalOffset;
3626 
3627   // The typed intrinsics add an immediate after the registers.
3628   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3629 
3630   // The struct intrinsic variants add one additional operand over raw.
3631   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3632   Register VIndex;
3633   int OpOffset = 0;
3634   if (HasVIndex) {
3635     VIndex = MI.getOperand(3).getReg();
3636     OpOffset = 1;
3637   }
3638 
3639   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3640   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3641 
3642   unsigned Format = 0;
3643   if (IsTyped) {
3644     Format = MI.getOperand(5 + OpOffset).getImm();
3645     ++OpOffset;
3646   }
3647 
3648   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3649 
3650   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3651   if (TotalOffset != 0)
3652     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3653 
3654   unsigned Opc;
3655   if (IsTyped) {
3656     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3657                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3658   } else if (IsFormat) {
3659     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3660                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3661   } else {
3662     switch (MemSize) {
3663     case 1:
3664       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3665       break;
3666     case 2:
3667       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3668       break;
3669     default:
3670       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3671       break;
3672     }
3673   }
3674 
3675   if (!VIndex)
3676     VIndex = B.buildConstant(S32, 0).getReg(0);
3677 
3678   auto MIB = B.buildInstr(Opc)
3679     .addUse(VData)              // vdata
3680     .addUse(RSrc)               // rsrc
3681     .addUse(VIndex)             // vindex
3682     .addUse(VOffset)            // voffset
3683     .addUse(SOffset)            // soffset
3684     .addImm(ImmOffset);         // offset(imm)
3685 
3686   if (IsTyped)
3687     MIB.addImm(Format);
3688 
3689   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3690      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3691      .addMemOperand(MMO);
3692 
3693   MI.eraseFromParent();
3694   return true;
3695 }
3696 
3697 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3698                                              MachineRegisterInfo &MRI,
3699                                              MachineIRBuilder &B,
3700                                              bool IsFormat,
3701                                              bool IsTyped) const {
3702   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3703   MachineMemOperand *MMO = *MI.memoperands_begin();
3704   const int MemSize = MMO->getSize();
3705   const LLT S32 = LLT::scalar(32);
3706 
3707   Register Dst = MI.getOperand(0).getReg();
3708   Register RSrc = MI.getOperand(2).getReg();
3709 
3710   // The typed intrinsics add an immediate after the registers.
3711   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3712 
3713   // The struct intrinsic variants add one additional operand over raw.
3714   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3715   Register VIndex;
3716   int OpOffset = 0;
3717   if (HasVIndex) {
3718     VIndex = MI.getOperand(3).getReg();
3719     OpOffset = 1;
3720   }
3721 
3722   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3723   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3724 
3725   unsigned Format = 0;
3726   if (IsTyped) {
3727     Format = MI.getOperand(5 + OpOffset).getImm();
3728     ++OpOffset;
3729   }
3730 
3731   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3732   unsigned ImmOffset;
3733   unsigned TotalOffset;
3734 
3735   LLT Ty = MRI.getType(Dst);
3736   LLT EltTy = Ty.getScalarType();
3737   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3738   const bool Unpacked = ST.hasUnpackedD16VMem();
3739 
3740   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3741   if (TotalOffset != 0)
3742     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3743 
3744   unsigned Opc;
3745 
3746   if (IsTyped) {
3747     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3748                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3749   } else if (IsFormat) {
3750     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3751                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3752   } else {
3753     switch (MemSize) {
3754     case 1:
3755       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3756       break;
3757     case 2:
3758       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3759       break;
3760     default:
3761       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3762       break;
3763     }
3764   }
3765 
3766   Register LoadDstReg;
3767 
3768   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3769   LLT UnpackedTy = Ty.changeElementSize(32);
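       // Sub-dword and scalar d16 results are loaded as s32 and truncated after the
       // load; packed d16 vectors on unpacked subtargets are loaded as <N x s32> and
       // repacked below.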
3770 
3771   if (IsExtLoad)
3772     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3773   else if (Unpacked && IsD16 && Ty.isVector())
3774     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3775   else
3776     LoadDstReg = Dst;
3777 
3778   if (!VIndex)
3779     VIndex = B.buildConstant(S32, 0).getReg(0);
3780 
3781   auto MIB = B.buildInstr(Opc)
3782     .addDef(LoadDstReg)         // vdata
3783     .addUse(RSrc)               // rsrc
3784     .addUse(VIndex)             // vindex
3785     .addUse(VOffset)            // voffset
3786     .addUse(SOffset)            // soffset
3787     .addImm(ImmOffset);         // offset(imm)
3788 
3789   if (IsTyped)
3790     MIB.addImm(Format);
3791 
3792   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3793      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3794      .addMemOperand(MMO);
3795 
3796   if (LoadDstReg != Dst) {
3797     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3798 
3799     // The result for extending loads was widened to s32; truncate it back.
3800     if (IsExtLoad)
3801       B.buildTrunc(Dst, LoadDstReg);
3802     else {
3803       // Repack to original 16-bit vector result
3804       // FIXME: G_TRUNC should work, but legalization currently fails
3805       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3806       SmallVector<Register, 4> Repack;
3807       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3808         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3809       B.buildMerge(Dst, Repack);
3810     }
3811   }
3812 
3813   MI.eraseFromParent();
3814   return true;
3815 }
3816 
3817 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3818                                                MachineIRBuilder &B,
3819                                                bool IsInc) const {
3820   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3821                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3822   B.buildInstr(Opc)
3823     .addDef(MI.getOperand(0).getReg())
3824     .addUse(MI.getOperand(2).getReg())
3825     .addUse(MI.getOperand(3).getReg())
3826     .cloneMemRefs(MI);
3827   MI.eraseFromParent();
3828   return true;
3829 }
3830 
3831 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3832   switch (IntrID) {
3833   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3834   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3835     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3836   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3837   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3838     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3839   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3840   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3841     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3842   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3843   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3844     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3845   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3846   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3847     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3848   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3849   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3850     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3851   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3852   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3853     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3854   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3855   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3856     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3857   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3858   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3859     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3860   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3861   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3862     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3863   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3864   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3865     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3866   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3867   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3868     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3869   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3870   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3871     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3872   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3873   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3874     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3875   default:
3876     llvm_unreachable("unhandled atomic opcode");
3877   }
3878 }
3879 
3880 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3881                                                MachineIRBuilder &B,
3882                                                Intrinsic::ID IID) const {
3883   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3884                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3885   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3886 
3887   Register Dst;
3888 
3889   int OpOffset = 0;
3890   if (HasReturn) {
3891     // A few FP atomics do not support return values.
3892     Dst = MI.getOperand(0).getReg();
3893   } else {
3894     OpOffset = -1;
3895   }
3896 
3897   Register VData = MI.getOperand(2 + OpOffset).getReg();
3898   Register CmpVal;
3899 
3900   if (IsCmpSwap) {
3901     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3902     ++OpOffset;
3903   }
3904 
3905   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3906   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3907 
3908   // The struct intrinsic variants add one additional operand over raw.
3909   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3910   Register VIndex;
3911   if (HasVIndex) {
3912     VIndex = MI.getOperand(4 + OpOffset).getReg();
3913     ++OpOffset;
3914   }
3915 
3916   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3917   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3918   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3919 
3920   MachineMemOperand *MMO = *MI.memoperands_begin();
3921 
3922   unsigned ImmOffset;
3923   unsigned TotalOffset;
3924   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3925   if (TotalOffset != 0)
3926     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3927 
3928   if (!VIndex)
3929     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3930 
3931   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3932 
3933   if (HasReturn)
3934     MIB.addDef(Dst);
3935 
3936   MIB.addUse(VData); // vdata
3937 
3938   if (IsCmpSwap)
3939     MIB.addReg(CmpVal);
3940 
3941   MIB.addUse(RSrc)               // rsrc
3942      .addUse(VIndex)             // vindex
3943      .addUse(VOffset)            // voffset
3944      .addUse(SOffset)            // soffset
3945      .addImm(ImmOffset)          // offset(imm)
3946      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3947      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3948      .addMemOperand(MMO);
3949 
3950   MI.eraseFromParent();
3951   return true;
3952 }
3953 
3954 /// Pack the s16 typed address operands of \p MI into dword sized registers
3955 /// (<2 x s16> values) collected in \p PackedAddrs.
3956 static void packImageA16AddressToDwords(
3957     MachineIRBuilder &B, MachineInstr &MI,
3958     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
3959     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
3960   const LLT S16 = LLT::scalar(16);
3961   const LLT V2S16 = LLT::vector(2, 16);
3962 
3963   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
3964     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
3965     if (!SrcOp.isReg())
3966       continue; // _L to _LZ may have eliminated this.
3967 
3968     Register AddrReg = SrcOp.getReg();
3969 
3970     if (I < Intr->GradientStart) {
3971       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3972       PackedAddrs.push_back(AddrReg);
3973     } else {
3974       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3975       // derivatives dx/dh and dx/dv are packed with undef.
3976       if (((I + 1) >= EndIdx) ||
3977           ((Intr->NumGradients / 2) % 2 == 1 &&
3978            (I == static_cast<unsigned>(Intr->GradientStart +
3979                                        (Intr->NumGradients / 2) - 1) ||
3980             I == static_cast<unsigned>(Intr->GradientStart +
3981                                        Intr->NumGradients - 1))) ||
3982           // Check for _L to _LZ optimization
3983           !MI.getOperand(ArgOffset + I + 1).isReg()) {
3984         PackedAddrs.push_back(
3985             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3986                 .getReg(0));
3987       } else {
3988         PackedAddrs.push_back(
3989             B.buildBuildVector(
3990                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
3991                 .getReg(0));
3992         ++I;
3993       }
3994     }
3995   }
3996 }
3997 
3998 /// Convert from separate vaddr components to a single vector address register,
3999 /// and replace the remaining operands with $noreg.
4000 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4001                                      int DimIdx, int NumVAddrs) {
4002   const LLT S32 = LLT::scalar(32);
4003 
4004   SmallVector<Register, 8> AddrRegs;
4005   for (int I = 0; I != NumVAddrs; ++I) {
4006     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4007     if (SrcOp.isReg()) {
4008       AddrRegs.push_back(SrcOp.getReg());
4009       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4010     }
4011   }
4012 
4013   int NumAddrRegs = AddrRegs.size();
4014   if (NumAddrRegs != 1) {
4015     // Round up to 8 elements for v5-v7
4016     // FIXME: Missing intermediate sized register classes and instructions.
4017     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4018       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4019       auto Undef = B.buildUndef(S32);
4020       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4021       NumAddrRegs = RoundedNumRegs;
4022     }
4023 
4024     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4025     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4026   }
4027 
4028   for (int I = 1; I != NumVAddrs; ++I) {
4029     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4030     if (SrcOp.isReg())
4031       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4032   }
4033 }
4034 
4035 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4036 ///
4037 /// Depending on the subtarget, load/store with 16-bit element data need to be
4038 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4039 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4040 /// registers.
4041 ///
4042 /// We don't want to directly select image instructions just yet, but also want
4043 /// to expose all register repacking to the legalizer/combiners. We also don't
4044 /// want a selected instruction entering RegBankSelect. In order to avoid
4045 /// defining a multitude of intermediate image instructions, directly hack on
4046 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4047 /// now unnecessary arguments with $noreg.
4048 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4049     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4050     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4051 
4052   const unsigned NumDefs = MI.getNumExplicitDefs();
4053   const unsigned ArgOffset = NumDefs + 1;
4054   bool IsTFE = NumDefs == 2;
4055   // We are only processing the operands of d16 image operations on subtargets
4056   // that use the unpacked register layout, or need to repack the TFE result.
4057 
4058   // TODO: Do we need to guard against already legalized intrinsics?
4059   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4060       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4061 
4062   MachineRegisterInfo *MRI = B.getMRI();
4063   const LLT S32 = LLT::scalar(32);
4064   const LLT S16 = LLT::scalar(16);
4065   const LLT V2S16 = LLT::vector(2, 16);
4066 
4067   unsigned DMask = 0;
4068 
4069   // Check for 16 bit addresses and pack if true.
4070   LLT GradTy =
4071       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4072   LLT AddrTy =
4073       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4074   const bool IsG16 = GradTy == S16;
4075   const bool IsA16 = AddrTy == S16;
4076 
4077   int DMaskLanes = 0;
4078   if (!BaseOpcode->Atomic) {
4079     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4080     if (BaseOpcode->Gather4) {
4081       DMaskLanes = 4;
4082     } else if (DMask != 0) {
4083       DMaskLanes = countPopulation(DMask);
4084     } else if (!IsTFE && !BaseOpcode->Store) {
4085       // If dmask is 0, this is a no-op load. This can be eliminated.
4086       B.buildUndef(MI.getOperand(0));
4087       MI.eraseFromParent();
4088       return true;
4089     }
4090   }
4091 
4092   Observer.changingInstr(MI);
4093   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4094 
4095   unsigned NewOpcode = NumDefs == 0 ?
4096     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4097 
4098   // Track that we legalized this
4099   MI.setDesc(B.getTII().get(NewOpcode));
4100 
4101   // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
4102   // to be at least 1, otherwise the instruction will fail.
4103   if (IsTFE && DMask == 0) {
4104     DMask = 0x1;
4105     DMaskLanes = 1;
4106     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4107   }
4108 
4109   if (BaseOpcode->Atomic) {
4110     Register VData0 = MI.getOperand(2).getReg();
4111     LLT Ty = MRI->getType(VData0);
4112 
4113     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4114     if (Ty.isVector())
4115       return false;
4116 
4117     if (BaseOpcode->AtomicX2) {
4118       Register VData1 = MI.getOperand(3).getReg();
4119       // The two values are packed in one register.
4120       LLT PackedTy = LLT::vector(2, Ty);
4121       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4122       MI.getOperand(2).setReg(Concat.getReg(0));
4123       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4124     }
4125   }
4126 
4127   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4128 
4129   // Optimize _L to _LZ when the LOD value is zero (or negative).
4130   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4131           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4132     const ConstantFP *ConstantLod;
4133 
4134     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4135                  m_GFCst(ConstantLod))) {
4136       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4137         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4138         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4139             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4140                                                       Intr->Dim);
4141 
4142         // The starting indexes should remain in the same place.
4143         --CorrectedNumVAddrs;
4144 
4145         MI.getOperand(MI.getNumExplicitDefs())
4146             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4147         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4148         Intr = NewImageDimIntr;
4149       }
4150     }
4151   }
4152 
4153   // Optimize _mip away when 'lod' is zero.
4154   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4155     int64_t ConstantLod;
4156     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4157                  m_ICst(ConstantLod))) {
4158       if (ConstantLod == 0) {
4159         // TODO: Change the intrinsic opcode and remove the operand instead
4160         // of replacing it with 0, as is done for the _L to _LZ handling above.
4161         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4162         --CorrectedNumVAddrs;
4163       }
4164     }
4165   }
4166 
4167   // Rewrite the addressing register layout before doing anything else.
4168   if (IsA16 || IsG16) {
4169     if (IsA16) {
4170       // The target must support the feature and gradients must be 16-bit too.
4171       if (!ST.hasA16() || !IsG16)
4172         return false;
4173     } else if (!ST.hasG16())
4174       return false;
4175 
4176     if (Intr->NumVAddrs > 1) {
4177       SmallVector<Register, 4> PackedRegs;
4178       // Don't compress addresses for G16
4179       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4180       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4181                                   PackEndIdx);
4182 
4183       if (!IsA16) {
4184         // Add uncompressed address
4185         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4186           Register AddrReg = MI.getOperand(ArgOffset + I).getReg();
4187           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4188           PackedRegs.push_back(AddrReg);
4189         }
4190       }
4191 
4192       // See also below in the non-a16 branch
4193       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4194 
4195       if (!UseNSA && PackedRegs.size() > 1) {
4196         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4197         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4198         PackedRegs[0] = Concat.getReg(0);
4199         PackedRegs.resize(1);
4200       }
4201 
4202       const unsigned NumPacked = PackedRegs.size();
4203       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4204         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4205         if (!SrcOp.isReg()) {
4206           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4207           continue;
4208         }
4209 
4210         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4211 
4212         if (I - Intr->VAddrStart < NumPacked)
4213           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4214         else
4215           SrcOp.setReg(AMDGPU::NoRegister);
4216       }
4217     }
4218   } else {
4219     // If the register allocator cannot place the address registers contiguously
4220     // without introducing moves, then using the non-sequential address encoding
4221     // is always preferable, since it saves VALU instructions and is usually a
4222     // wash in terms of code size or even better.
4223     //
4224     // However, we currently have no way of hinting to the register allocator
4225     // that MIMG addresses should be placed contiguously when it is possible to
4226     // do so, so force non-NSA for the common 2-address case as a heuristic.
4227     //
4228     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4229     // allocation when possible.
4230     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4231 
4232     if (!UseNSA && Intr->NumVAddrs > 1)
4233       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4234                                Intr->NumVAddrs);
4235   }
4236 
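  // Record the address-mode flags as a trailing immediate: bit 0 marks 16-bit
  // addresses (a16) and bit 1 marks 16-bit gradients (g16).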
4237   int Flags = 0;
4238   if (IsA16)
4239     Flags |= 1;
4240   if (IsG16)
4241     Flags |= 2;
4242   MI.addOperand(MachineOperand::CreateImm(Flags));
4243 
4244   if (BaseOpcode->Store) { // No TFE for stores?
4245     // TODO: Handle dmask trim
4246     Register VData = MI.getOperand(1).getReg();
4247     LLT Ty = MRI->getType(VData);
4248     if (!Ty.isVector() || Ty.getElementType() != S16)
4249       return true;
4250 
4251     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4252     if (RepackedReg != VData) {
4253       MI.getOperand(1).setReg(RepackedReg);
4254     }
4255 
4256     return true;
4257   }
4258 
4259   Register DstReg = MI.getOperand(0).getReg();
4260   LLT Ty = MRI->getType(DstReg);
4261   const LLT EltTy = Ty.getScalarType();
4262   const bool IsD16 = Ty.getScalarType() == S16;
4263   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4264 
4265   // Confirm that the return type is large enough for the dmask specified
4266   if (NumElts < DMaskLanes)
4267     return false;
4268 
4269   if (NumElts > 4 || DMaskLanes > 4)
4270     return false;
4271 
4272   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4273   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4274 
4275   // The raw dword-aligned data component of the load. The only legal cases
4276   // where this matters should be when using the packed D16 format, for
4277   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4278   LLT RoundedTy;
4279 
4280   // S32 vector to cover all data, plus the TFE result element.
4281   LLT TFETy;
4282 
4283   // Register type to use for each loaded component. Will be S32 or V2S16.
4284   LLT RegTy;
4285 
4286   if (IsD16 && ST.hasUnpackedD16VMem()) {
4287     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4288     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4289     RegTy = S32;
4290   } else {
4291     unsigned EltSize = EltTy.getSizeInBits();
4292     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4293     unsigned RoundedSize = 32 * RoundedElts;
4294     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4295     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4296     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4297   }
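  // Worked example (illustrative): a d16 load of <3 x s16> with TFE on a
  // packed-D16 subtarget gives AdjustedTy = <3 x s16>, RoundedTy = <4 x s16>,
  // TFETy = <3 x s32> (two data dwords plus the TFE dword), and RegTy = s32.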
4298 
4299   // The return type does not need adjustment.
4300   // TODO: Should we change s16 case to s32 or <2 x s16>?
4301   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4302     return true;
4303 
4304   Register Dst1Reg;
4305 
4306   // Insert after the instruction.
4307   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4308 
4309   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4310   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4311   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4312   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4313 
4314   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4315 
4316   MI.getOperand(0).setReg(NewResultReg);
4317 
4318   // In the IR, TFE is supposed to be used with a 2 element struct return
4319   // type. The instruction really returns these two values in one contiguous
4320   // register, with one additional dword beyond the loaded data. Rewrite the
4321   // return type to use a single register result.
4322 
4323   if (IsTFE) {
4324     Dst1Reg = MI.getOperand(1).getReg();
4325     if (MRI->getType(Dst1Reg) != S32)
4326       return false;
4327 
4328     // TODO: Make sure the TFE operand bit is set.
4329     MI.RemoveOperand(1);
4330 
4331     // Handle the easy case that requires no repack instructions.
4332     if (Ty == S32) {
4333       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4334       return true;
4335     }
4336   }
4337 
4338   // Now figure out how to copy the new result register back into the old
4339   // result.
4340   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4341 
4342   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4343 
4344   if (ResultNumRegs == 1) {
4345     assert(!IsTFE);
4346     ResultRegs[0] = NewResultReg;
4347   } else {
4348     // We have to repack into a new vector of some kind.
4349     for (int I = 0; I != NumDataRegs; ++I)
4350       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4351     B.buildUnmerge(ResultRegs, NewResultReg);
4352 
4353     // Drop the final TFE element to get the data part. The TFE result is
4354     // directly written to the right place already.
4355     if (IsTFE)
4356       ResultRegs.resize(NumDataRegs);
4357   }
4358 
4359   // For an s16 scalar result, we form an s32 result with a truncate regardless
4360   // of packed vs. unpacked.
4361   if (IsD16 && !Ty.isVector()) {
4362     B.buildTrunc(DstReg, ResultRegs[0]);
4363     return true;
4364   }
4365 
4366   // Avoid a build/concat_vector of 1 entry.
4367   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4368     B.buildBitcast(DstReg, ResultRegs[0]);
4369     return true;
4370   }
4371 
4372   assert(Ty.isVector());
4373 
4374   if (IsD16) {
4375     // For packed D16 results with TFE enabled, all the data components are
4376     // S32. Cast back to the expected type.
4377     //
4378     // TODO: We don't really need to load s32 elements. We would only need one
4379     // cast for the TFE result if a multiple of v2s16 was used.
4380     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4381       for (Register &Reg : ResultRegs)
4382         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4383     } else if (ST.hasUnpackedD16VMem()) {
4384       for (Register &Reg : ResultRegs)
4385         Reg = B.buildTrunc(S16, Reg).getReg(0);
4386     }
4387   }
4388 
4389   auto padWithUndef = [&](LLT Ty, int NumElts) {
4390     if (NumElts == 0)
4391       return;
4392     Register Undef = B.buildUndef(Ty).getReg(0);
4393     for (int I = 0; I != NumElts; ++I)
4394       ResultRegs.push_back(Undef);
4395   };
4396 
4397   // Pad out any elements eliminated due to the dmask.
4398   LLT ResTy = MRI->getType(ResultRegs[0]);
4399   if (!ResTy.isVector()) {
4400     padWithUndef(ResTy, NumElts - ResultRegs.size());
4401     B.buildBuildVector(DstReg, ResultRegs);
4402     return true;
4403   }
4404 
4405   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
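  // Number of 32-bit registers needed to cover the original result type.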
4406   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4407 
4408   // Deal with the one annoying legal case.
4409   const LLT V3S16 = LLT::vector(3, 16);
4410   if (Ty == V3S16) {
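    // Pad to three v2s16 registers, concatenate to <6 x s16>, then unmerge into
    // the <3 x s16> destination plus a dead <3 x s16> remainder.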
4411     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4412     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4413     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4414     return true;
4415   }
4416 
4417   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4418   B.buildConcatVectors(DstReg, ResultRegs);
4419   return true;
4420 }
4421 
4422 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4423   LegalizerHelper &Helper, MachineInstr &MI) const {
4424   MachineIRBuilder &B = Helper.MIRBuilder;
4425   GISelChangeObserver &Observer = Helper.Observer;
4426 
4427   Register Dst = MI.getOperand(0).getReg();
4428   LLT Ty = B.getMRI()->getType(Dst);
4429   unsigned Size = Ty.getSizeInBits();
4430   MachineFunction &MF = B.getMF();
4431 
4432   Observer.changingInstr(MI);
4433 
4434   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4435     Ty = getBitcastRegisterType(Ty);
4436     Helper.bitcastDst(MI, Ty, 0);
4437     Dst = MI.getOperand(0).getReg();
4438     B.setInsertPt(B.getMBB(), MI);
4439   }
4440 
4441   // FIXME: We don't really need this intermediate instruction. The intrinsic
4442   // should be fixed to have a memory operand. Since it's readnone, we're not
4443   // allowed to add one.
4444   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4445   MI.RemoveOperand(1); // Remove intrinsic ID
4446 
4447   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4448   // TODO: Should this use datalayout alignment?
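  // Round the result width up to a whole number of bytes for the memory operand.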
4449   const unsigned MemSize = (Size + 7) / 8;
4450   const Align MemAlign(4);
4451   MachineMemOperand *MMO = MF.getMachineMemOperand(
4452       MachinePointerInfo(),
4453       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4454           MachineMemOperand::MOInvariant,
4455       MemSize, MemAlign);
4456   MI.addMemOperand(MF, MMO);
4457 
4458   // There are no 96-bit result scalar loads, but widening to 128-bit should
4459   // always be legal. We may need to restore this to a 96-bit result if it turns
4460   // out this needs to be converted to a vector load during RegBankSelect.
4461   if (!isPowerOf2_32(Size)) {
4462     if (Ty.isVector())
4463       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4464     else
4465       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4466   }
4467 
4468   Observer.changedInstr(MI);
4469   return true;
4470 }
4471 
4472 // TODO: Move to selection
4473 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4474                                                 MachineRegisterInfo &MRI,
4475                                                 MachineIRBuilder &B) const {
4476   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4477   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4478       !ST.isTrapHandlerEnabled()) {
4479     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4480   } else {
4481     // Pass queue pointer to trap handler as input, and insert trap instruction
4482     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4483     MachineRegisterInfo &MRI = *B.getMRI();
4484 
4485     Register LiveIn =
4486       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4487     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4488       return false;
4489 
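    // Per the trap handler ABI referenced above, the queue pointer is passed in
    // SGPR0/SGPR1.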
4490     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4491     B.buildCopy(SGPR01, LiveIn);
4492     B.buildInstr(AMDGPU::S_TRAP)
4493         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4494         .addReg(SGPR01, RegState::Implicit);
4495   }
4496 
4497   MI.eraseFromParent();
4498   return true;
4499 }
4500 
4501 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4502     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4503   // If this is a non-HSA path or the trap handler is disabled, report a
4504   // warning accordingly.
4505   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4506       !ST.isTrapHandlerEnabled()) {
4507     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4508                                      "debugtrap handler not supported",
4509                                      MI.getDebugLoc(), DS_Warning);
4510     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4511     Ctx.diagnose(NoTrap);
4512   } else {
4513     // Insert debug-trap instruction
4514     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4515   }
4516 
4517   MI.eraseFromParent();
4518   return true;
4519 }
4520 
4521 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4522                                                MachineIRBuilder &B) const {
4523   MachineRegisterInfo &MRI = *B.getMRI();
4524   const LLT S16 = LLT::scalar(16);
4525   const LLT S32 = LLT::scalar(32);
4526 
4527   Register DstReg = MI.getOperand(0).getReg();
4528   Register NodePtr = MI.getOperand(2).getReg();
4529   Register RayExtent = MI.getOperand(3).getReg();
4530   Register RayOrigin = MI.getOperand(4).getReg();
4531   Register RayDir = MI.getOperand(5).getReg();
4532   Register RayInvDir = MI.getOperand(6).getReg();
4533   Register TDescr = MI.getOperand(7).getReg();
4534 
4535   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4536   bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
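  // Pick the NSA intersect_ray opcode based on the node pointer width and
  // whether the ray uses 16-bit components.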
4537   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4538                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4539                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4540                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4541 
4542   SmallVector<Register, 12> Ops;
4543   if (Is64) {
4544     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4545     Ops.push_back(Unmerge.getReg(0));
4546     Ops.push_back(Unmerge.getReg(1));
4547   } else {
4548     Ops.push_back(NodePtr);
4549   }
4550   Ops.push_back(RayExtent);
4551 
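  // Split the source vector into 32-bit lanes and push the first three
  // (x, y, z) onto the operand list; any fourth lane is unused.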
4552   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4553     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4554     Ops.push_back(Unmerge.getReg(0));
4555     Ops.push_back(Unmerge.getReg(1));
4556     Ops.push_back(Unmerge.getReg(2));
4557   };
4558 
4559   packLanes(RayOrigin);
4560   if (IsA16) {
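    // With 16-bit components, pack the ray direction and inverse direction into
    // three 32-bit registers: (dir.x, dir.y), (dir.z, inv.x), (inv.y, inv.z).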
4561     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4562     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4563     Register R1 = MRI.createGenericVirtualRegister(S32);
4564     Register R2 = MRI.createGenericVirtualRegister(S32);
4565     Register R3 = MRI.createGenericVirtualRegister(S32);
4566     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4567     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4568     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4569     Ops.push_back(R1);
4570     Ops.push_back(R2);
4571     Ops.push_back(R3);
4572   } else {
4573     packLanes(RayDir);
4574     packLanes(RayInvDir);
4575   }
4576 
4577   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4578     .addDef(DstReg)
4579     .addImm(Opcode);
4580 
4581   for (Register R : Ops) {
4582     MIB.addUse(R);
4583   }
4584 
4585   MIB.addUse(TDescr)
4586      .addImm(IsA16 ? 1 : 0)
4587      .cloneMemRefs(MI);
4588 
4589   MI.eraseFromParent();
4590   return true;
4591 }
4592 
4593 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4594                                             MachineInstr &MI) const {
4595   MachineIRBuilder &B = Helper.MIRBuilder;
4596   MachineRegisterInfo &MRI = *B.getMRI();
4597 
4598   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4599   auto IntrID = MI.getIntrinsicID();
4600   switch (IntrID) {
4601   case Intrinsic::amdgcn_if:
4602   case Intrinsic::amdgcn_else: {
4603     MachineInstr *Br = nullptr;
4604     MachineBasicBlock *UncondBrTarget = nullptr;
4605     bool Negated = false;
4606     if (MachineInstr *BrCond =
4607             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4608       const SIRegisterInfo *TRI
4609         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4610 
4611       Register Def = MI.getOperand(1).getReg();
4612       Register Use = MI.getOperand(3).getReg();
4613 
4614       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4615 
4616       if (Negated)
4617         std::swap(CondBrTarget, UncondBrTarget);
4618 
4619       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4620       if (IntrID == Intrinsic::amdgcn_if) {
4621         B.buildInstr(AMDGPU::SI_IF)
4622           .addDef(Def)
4623           .addUse(Use)
4624           .addMBB(UncondBrTarget);
4625       } else {
4626         B.buildInstr(AMDGPU::SI_ELSE)
4627             .addDef(Def)
4628             .addUse(Use)
4629             .addMBB(UncondBrTarget);
4630       }
4631 
4632       if (Br) {
4633         Br->getOperand(0).setMBB(CondBrTarget);
4634       } else {
4635         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4636         // since we're swapping branch targets it needs to be reinserted.
4637         // FIXME: IRTranslator should probably not do this
4638         B.buildBr(*CondBrTarget);
4639       }
4640 
4641       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4642       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4643       MI.eraseFromParent();
4644       BrCond->eraseFromParent();
4645       return true;
4646     }
4647 
4648     return false;
4649   }
4650   case Intrinsic::amdgcn_loop: {
4651     MachineInstr *Br = nullptr;
4652     MachineBasicBlock *UncondBrTarget = nullptr;
4653     bool Negated = false;
4654     if (MachineInstr *BrCond =
4655             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4656       const SIRegisterInfo *TRI
4657         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4658 
4659       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4660       Register Reg = MI.getOperand(2).getReg();
4661 
4662       if (Negated)
4663         std::swap(CondBrTarget, UncondBrTarget);
4664 
4665       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4666       B.buildInstr(AMDGPU::SI_LOOP)
4667         .addUse(Reg)
4668         .addMBB(UncondBrTarget);
4669 
4670       if (Br)
4671         Br->getOperand(0).setMBB(CondBrTarget);
4672       else
4673         B.buildBr(*CondBrTarget);
4674 
4675       MI.eraseFromParent();
4676       BrCond->eraseFromParent();
4677       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4678       return true;
4679     }
4680 
4681     return false;
4682   }
4683   case Intrinsic::amdgcn_kernarg_segment_ptr:
4684     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4685       // This only makes sense to call in a kernel, so just lower to null.
4686       B.buildConstant(MI.getOperand(0).getReg(), 0);
4687       MI.eraseFromParent();
4688       return true;
4689     }
4690 
4691     return legalizePreloadedArgIntrin(
4692       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4693   case Intrinsic::amdgcn_implicitarg_ptr:
4694     return legalizeImplicitArgPtr(MI, MRI, B);
4695   case Intrinsic::amdgcn_workitem_id_x:
4696     return legalizePreloadedArgIntrin(MI, MRI, B,
4697                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4698   case Intrinsic::amdgcn_workitem_id_y:
4699     return legalizePreloadedArgIntrin(MI, MRI, B,
4700                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4701   case Intrinsic::amdgcn_workitem_id_z:
4702     return legalizePreloadedArgIntrin(MI, MRI, B,
4703                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4704   case Intrinsic::amdgcn_workgroup_id_x:
4705     return legalizePreloadedArgIntrin(MI, MRI, B,
4706                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4707   case Intrinsic::amdgcn_workgroup_id_y:
4708     return legalizePreloadedArgIntrin(MI, MRI, B,
4709                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4710   case Intrinsic::amdgcn_workgroup_id_z:
4711     return legalizePreloadedArgIntrin(MI, MRI, B,
4712                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4713   case Intrinsic::amdgcn_dispatch_ptr:
4714     return legalizePreloadedArgIntrin(MI, MRI, B,
4715                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4716   case Intrinsic::amdgcn_queue_ptr:
4717     return legalizePreloadedArgIntrin(MI, MRI, B,
4718                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4719   case Intrinsic::amdgcn_implicit_buffer_ptr:
4720     return legalizePreloadedArgIntrin(
4721       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4722   case Intrinsic::amdgcn_dispatch_id:
4723     return legalizePreloadedArgIntrin(MI, MRI, B,
4724                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4725   case Intrinsic::amdgcn_fdiv_fast:
4726     return legalizeFDIVFastIntrin(MI, MRI, B);
4727   case Intrinsic::amdgcn_is_shared:
4728     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4729   case Intrinsic::amdgcn_is_private:
4730     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4731   case Intrinsic::amdgcn_wavefrontsize: {
4732     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4733     MI.eraseFromParent();
4734     return true;
4735   }
4736   case Intrinsic::amdgcn_s_buffer_load:
4737     return legalizeSBufferLoad(Helper, MI);
4738   case Intrinsic::amdgcn_raw_buffer_store:
4739   case Intrinsic::amdgcn_struct_buffer_store:
4740     return legalizeBufferStore(MI, MRI, B, false, false);
4741   case Intrinsic::amdgcn_raw_buffer_store_format:
4742   case Intrinsic::amdgcn_struct_buffer_store_format:
4743     return legalizeBufferStore(MI, MRI, B, false, true);
4744   case Intrinsic::amdgcn_raw_tbuffer_store:
4745   case Intrinsic::amdgcn_struct_tbuffer_store:
4746     return legalizeBufferStore(MI, MRI, B, true, true);
4747   case Intrinsic::amdgcn_raw_buffer_load:
4748   case Intrinsic::amdgcn_struct_buffer_load:
4749     return legalizeBufferLoad(MI, MRI, B, false, false);
4750   case Intrinsic::amdgcn_raw_buffer_load_format:
4751   case Intrinsic::amdgcn_struct_buffer_load_format:
4752     return legalizeBufferLoad(MI, MRI, B, true, false);
4753   case Intrinsic::amdgcn_raw_tbuffer_load:
4754   case Intrinsic::amdgcn_struct_tbuffer_load:
4755     return legalizeBufferLoad(MI, MRI, B, true, true);
4756   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4757   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4758   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4759   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4760   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4761   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4762   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4763   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4764   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4765   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4766   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4767   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4768   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4769   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4770   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4771   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4772   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4773   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4774   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4775   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4776   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4777   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4778   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4779   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4780   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4781   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4782   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4783   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4784     return legalizeBufferAtomic(MI, B, IntrID);
4785   case Intrinsic::amdgcn_atomic_inc:
4786     return legalizeAtomicIncDec(MI, B, true);
4787   case Intrinsic::amdgcn_atomic_dec:
4788     return legalizeAtomicIncDec(MI, B, false);
4789   case Intrinsic::trap:
4790     return legalizeTrapIntrinsic(MI, MRI, B);
4791   case Intrinsic::debugtrap:
4792     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4793   case Intrinsic::amdgcn_rsq_clamp:
4794     return legalizeRsqClampIntrinsic(MI, MRI, B);
4795   case Intrinsic::amdgcn_ds_fadd:
4796   case Intrinsic::amdgcn_ds_fmin:
4797   case Intrinsic::amdgcn_ds_fmax:
4798     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4799   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4800     return legalizeBVHIntrinsic(MI, B);
4801   default: {
4802     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4803             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4804       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4805     return true;
4806   }
4807   }
4808 
4809   return true;
4810 }
4811