1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/CodeGen/TargetOpcodes.h"
26 #include "llvm/CodeGen/ValueTypes.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/Debug.h"
31
32 #define DEBUG_TYPE "amdgpu-legalinfo"
33
34 using namespace llvm;
35 using namespace LegalizeActions;
36 using namespace LegalizeMutations;
37 using namespace LegalityPredicates;
38 using namespace MIPatternMatch;
39
40 // Hack until load/store selection patterns support any tuple of legal types.
41 static cl::opt<bool> EnableNewLegality(
42 "amdgpu-global-isel-new-legality",
43 cl::desc("Use GlobalISel desired legality, rather than try to use "
44 "rules compatible with selection patterns"),
45 cl::init(false),
46 cl::ReallyHidden);
47
48 static constexpr unsigned MaxRegisterSize = 1024;
49
50 // Round the number of elements up to the next power of two.
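// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.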
51 static LLT getPow2VectorType(LLT Ty) {
52 unsigned NElts = Ty.getNumElements();
53 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
54 return Ty.changeNumElements(Pow2NElts);
55 }
56
57 // Round the scalar size in bits up to the next power of two.
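// e.g. s20 -> s32, s48 -> s64.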
58 static LLT getPow2ScalarType(LLT Ty) {
59 unsigned Bits = Ty.getSizeInBits();
60 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
61 return LLT::scalar(Pow2Bits);
62 }
63
64 /// \returns true if this is an odd-sized vector which should be widened by
65 /// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
66 /// This excludes s1 vectors, which should always be scalarized.
67 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
68 return [=](const LegalityQuery &Query) {
69 const LLT Ty = Query.Types[TypeIdx];
70 if (!Ty.isVector())
71 return false;
72
73 const LLT EltTy = Ty.getElementType();
74 const unsigned EltSize = EltTy.getSizeInBits();
75 return Ty.getNumElements() % 2 != 0 &&
76 EltSize > 1 && EltSize < 32 &&
77 Ty.getSizeInBits() % 32 != 0;
78 };
79 }
80
81 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
82 return [=](const LegalityQuery &Query) {
83 const LLT Ty = Query.Types[TypeIdx];
84 return Ty.getSizeInBits() % 32 == 0;
85 };
86 }
87
88 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
89 return [=](const LegalityQuery &Query) {
90 const LLT Ty = Query.Types[TypeIdx];
91 const LLT EltTy = Ty.getScalarType();
92 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
93 };
94 }
95
96 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getElementType();
100 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
101 };
102 }
103
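// Break a wide vector type down into pieces of at most 64 bits each,
// e.g. <4 x s32> (128 bits) -> <2 x s32>, <3 x s64> (192 bits) -> s64.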
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 unsigned Size = Ty.getSizeInBits();
109 unsigned Pieces = (Size + 63) / 64;
110 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
112 };
113 }
114
115 // Increase the number of vector elements to reach the next multiple of 32-bit
116 // type.
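// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits), <5 x s16> (80 bits) -> <6 x s16> (96 bits).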
117 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
118 return [=](const LegalityQuery &Query) {
119 const LLT Ty = Query.Types[TypeIdx];
120
121 const LLT EltTy = Ty.getElementType();
122 const int Size = Ty.getSizeInBits();
123 const int EltSize = EltTy.getSizeInBits();
124 const int NextMul32 = (Size + 31) / 32;
125
126 assert(EltSize < 32);
127
128 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
129 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
130 };
131 }
132
133 static LLT getBitcastRegisterType(const LLT Ty) {
134 const unsigned Size = Ty.getSizeInBits();
135
136 LLT CoercedTy;
137 if (Size <= 32) {
138 // <2 x s8> -> s16
139 // <4 x s8> -> s32
140 return LLT::scalar(Size);
141 }
142
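// e.g. <4 x s16> (64 bits) -> <2 x s32>, <6 x s16> (96 bits) -> <3 x s32>.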
143 return LLT::scalarOrVector(Size / 32, 32);
144 }
145
146 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
147 return [=](const LegalityQuery &Query) {
148 const LLT Ty = Query.Types[TypeIdx];
149 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
150 };
151 }
152
153 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
154 return [=](const LegalityQuery &Query) {
155 const LLT Ty = Query.Types[TypeIdx];
156 unsigned Size = Ty.getSizeInBits();
157 assert(Size % 32 == 0);
158 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
159 };
160 }
161
162 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
163 return [=](const LegalityQuery &Query) {
164 const LLT QueryTy = Query.Types[TypeIdx];
165 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
166 };
167 }
168
169 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
170 return [=](const LegalityQuery &Query) {
171 const LLT QueryTy = Query.Types[TypeIdx];
172 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
173 };
174 }
175
176 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
177 return [=](const LegalityQuery &Query) {
178 const LLT QueryTy = Query.Types[TypeIdx];
179 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
180 };
181 }
182
183 static bool isRegisterSize(unsigned Size) {
184 return Size % 32 == 0 && Size <= MaxRegisterSize;
185 }
186
187 static bool isRegisterVectorElementType(LLT EltTy) {
188 const int EltSize = EltTy.getSizeInBits();
189 return EltSize == 16 || EltSize % 32 == 0;
190 }
191
192 static bool isRegisterVectorType(LLT Ty) {
193 const int EltSize = Ty.getElementType().getSizeInBits();
194 return EltSize == 32 || EltSize == 64 ||
195 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
196 EltSize == 128 || EltSize == 256;
197 }
198
199 static bool isRegisterType(LLT Ty) {
200 if (!isRegisterSize(Ty.getSizeInBits()))
201 return false;
202
203 if (Ty.isVector())
204 return isRegisterVectorType(Ty);
205
206 return true;
207 }
208
209 // Any combination of 32 or 64-bit elements up to the maximum register size, and
210 // multiples of v2s16.
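// e.g. s32, s64, v2s16, v4s16, v2s32, ... up to 1024 bits; v3s16 (48 bits) is
// not a register type.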
211 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
212 return [=](const LegalityQuery &Query) {
213 return isRegisterType(Query.Types[TypeIdx]);
214 };
215 }
216
217 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
218 return [=](const LegalityQuery &Query) {
219 const LLT QueryTy = Query.Types[TypeIdx];
220 if (!QueryTy.isVector())
221 return false;
222 const LLT EltTy = QueryTy.getElementType();
223 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
224 };
225 }
226
227 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
228 return [=](const LegalityQuery &Query) {
229 const LLT Ty = Query.Types[TypeIdx];
230 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232 };
233 }
234
235 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
236 // handle some operations by just promoting the register during
237 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
238 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239 bool IsLoad) {
240 switch (AS) {
241 case AMDGPUAS::PRIVATE_ADDRESS:
242 // FIXME: Private element size.
243 return 32;
244 case AMDGPUAS::LOCAL_ADDRESS:
245 return ST.useDS128() ? 128 : 64;
246 case AMDGPUAS::GLOBAL_ADDRESS:
247 case AMDGPUAS::CONSTANT_ADDRESS:
248 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
249 // Treat constant and global as identical. SMRD loads are sometimes usable for
250 // global loads (ideally constant address space should be eliminated)
251 // depending on the context. Legality cannot be context dependent, but
252 // RegBankSelect can split the load as necessary depending on the pointer
253 // register bank/uniformity and if the memory is invariant or not written in a
254 // kernel.
255 return IsLoad ? 512 : 128;
256 default:
257 // Flat addresses may contextually need to be split to 32-bit parts if they
258 // may alias scratch depending on the subtarget.
259 return 128;
260 }
261 }
262
263 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264 const LegalityQuery &Query,
265 unsigned Opcode) {
266 const LLT Ty = Query.Types[0];
267
268 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
269 const bool IsLoad = Opcode != AMDGPU::G_STORE;
270
271 unsigned RegSize = Ty.getSizeInBits();
272 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
273 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
274 unsigned AS = Query.Types[1].getAddressSpace();
275
276 // All of these need to be custom lowered to cast the pointer operand.
277 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
278 return false;
279
280 // TODO: We should be able to widen loads if the alignment is high enough, but
281 // we also need to modify the memory access size.
282 #if 0
283 // Accept widening loads based on alignment.
284 if (IsLoad && MemSize < Size)
285 MemSize = std::max(MemSize, Align);
286 #endif
287
288 // Only 1-byte and 2-byte to 32-bit extloads are valid.
289 if (MemSize != RegSize && RegSize != 32)
290 return false;
291
292 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
293 return false;
294
295 switch (MemSize) {
296 case 8:
297 case 16:
298 case 32:
299 case 64:
300 case 128:
301 break;
302 case 96:
303 if (!ST.hasDwordx3LoadStores())
304 return false;
305 break;
306 case 256:
307 case 512:
308 // These may contextually need to be broken down.
309 break;
310 default:
311 return false;
312 }
313
314 assert(RegSize >= MemSize);
315
316 if (AlignBits < MemSize) {
317 const SITargetLowering *TLI = ST.getTargetLowering();
318 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
319 Align(AlignBits / 8)))
320 return false;
321 }
322
323 return true;
324 }
325
326 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
327 // work around this. Eventually it should ignore the type for loads and only care
328 // about the size. Return true in cases where we will work around this for now by
329 // bitcasting.
330 static bool loadStoreBitcastWorkaround(const LLT Ty) {
331 if (EnableNewLegality)
332 return false;
333
334 const unsigned Size = Ty.getSizeInBits();
335 if (Size <= 64)
336 return false;
337 if (!Ty.isVector())
338 return true;
339
340 LLT EltTy = Ty.getElementType();
341 if (EltTy.isPointer())
342 return true;
343
344 unsigned EltSize = EltTy.getSizeInBits();
345 return EltSize != 32 && EltSize != 64;
346 }
347
348 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
349 unsigned Opcode) {
350 const LLT Ty = Query.Types[0];
351 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
352 !loadStoreBitcastWorkaround(Ty);
353 }
354
355 /// Return true if a load or store of the type should be lowered with a bitcast
356 /// to a different type.
357 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
358 const unsigned MemSizeInBits) {
359 const unsigned Size = Ty.getSizeInBits();
360 if (Size != MemSizeInBits)
361 return Size <= 32 && Ty.isVector();
362
363 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
364 return true;
365 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
366 !isRegisterVectorElementType(Ty.getElementType());
367 }
368
369 /// Return true if we should legalize a load by widening an odd-sized memory
370 /// access up to the alignment. Note that in this case the memory access itself
371 /// changes, not the size of the result register.
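/// For example, a 96-bit load known to be 128-bit aligned may be widened to a
/// 128-bit load, subject to the subtarget checks below.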
372 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
373 unsigned AlignInBits, unsigned AddrSpace,
374 unsigned Opcode) {
375 // We don't want to widen cases that are naturally legal.
376 if (isPowerOf2_32(SizeInBits))
377 return false;
378
379 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
380 // end up widening these for a scalar load during RegBankSelect, since there
381 // aren't 96-bit scalar loads.
382 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
383 return false;
384
385 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
386 return false;
387
388 // A load is known dereferenceable up to the alignment, so it's legal to widen
389 // to it.
390 //
391 // TODO: Could check dereferenceable for less aligned cases.
392 unsigned RoundedSize = NextPowerOf2(SizeInBits);
393 if (AlignInBits < RoundedSize)
394 return false;
395
396 // Do not widen if it would introduce a slow unaligned load.
397 const SITargetLowering *TLI = ST.getTargetLowering();
398 bool Fast = false;
399 return TLI->allowsMisalignedMemoryAccessesImpl(
400 RoundedSize, AddrSpace, Align(AlignInBits / 8),
401 MachineMemOperand::MOLoad, &Fast) &&
402 Fast;
403 }
404
405 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
406 unsigned Opcode) {
407 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
408 return false;
409
410 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
411 Query.MMODescrs[0].AlignInBits,
412 Query.Types[1].getAddressSpace(), Opcode);
413 }
414
415 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
416 const GCNTargetMachine &TM)
417 : ST(ST_) {
418 using namespace TargetOpcode;
419
420 auto GetAddrSpacePtr = [&TM](unsigned AS) {
421 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
422 };
423
424 const LLT S1 = LLT::scalar(1);
425 const LLT S8 = LLT::scalar(8);
426 const LLT S16 = LLT::scalar(16);
427 const LLT S32 = LLT::scalar(32);
428 const LLT S64 = LLT::scalar(64);
429 const LLT S128 = LLT::scalar(128);
430 const LLT S256 = LLT::scalar(256);
431 const LLT S512 = LLT::scalar(512);
432 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
433
434 const LLT V2S8 = LLT::vector(2, 8);
435 const LLT V2S16 = LLT::vector(2, 16);
436 const LLT V4S16 = LLT::vector(4, 16);
437
438 const LLT V2S32 = LLT::vector(2, 32);
439 const LLT V3S32 = LLT::vector(3, 32);
440 const LLT V4S32 = LLT::vector(4, 32);
441 const LLT V5S32 = LLT::vector(5, 32);
442 const LLT V6S32 = LLT::vector(6, 32);
443 const LLT V7S32 = LLT::vector(7, 32);
444 const LLT V8S32 = LLT::vector(8, 32);
445 const LLT V9S32 = LLT::vector(9, 32);
446 const LLT V10S32 = LLT::vector(10, 32);
447 const LLT V11S32 = LLT::vector(11, 32);
448 const LLT V12S32 = LLT::vector(12, 32);
449 const LLT V13S32 = LLT::vector(13, 32);
450 const LLT V14S32 = LLT::vector(14, 32);
451 const LLT V15S32 = LLT::vector(15, 32);
452 const LLT V16S32 = LLT::vector(16, 32);
453 const LLT V32S32 = LLT::vector(32, 32);
454
455 const LLT V2S64 = LLT::vector(2, 64);
456 const LLT V3S64 = LLT::vector(3, 64);
457 const LLT V4S64 = LLT::vector(4, 64);
458 const LLT V5S64 = LLT::vector(5, 64);
459 const LLT V6S64 = LLT::vector(6, 64);
460 const LLT V7S64 = LLT::vector(7, 64);
461 const LLT V8S64 = LLT::vector(8, 64);
462 const LLT V16S64 = LLT::vector(16, 64);
463
464 std::initializer_list<LLT> AllS32Vectors =
465 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
466 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
467 std::initializer_list<LLT> AllS64Vectors =
468 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
469
470 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
471 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
472 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
473 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
474 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
475 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
476 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
477
478 const LLT CodePtr = FlatPtr;
479
480 const std::initializer_list<LLT> AddrSpaces64 = {
481 GlobalPtr, ConstantPtr, FlatPtr
482 };
483
484 const std::initializer_list<LLT> AddrSpaces32 = {
485 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
486 };
487
488 const std::initializer_list<LLT> FPTypesBase = {
489 S32, S64
490 };
491
492 const std::initializer_list<LLT> FPTypes16 = {
493 S32, S64, S16
494 };
495
496 const std::initializer_list<LLT> FPTypesPK16 = {
497 S32, S64, S16, V2S16
498 };
499
500 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
501
502 setAction({G_BRCOND, S1}, Legal); // VCC branches
503 setAction({G_BRCOND, S32}, Legal); // SCC branches
504
505 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
506 // elements for v3s16
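// e.g. a G_PHI of <3 x s16> is widened to <4 x s16> by the moreElementsIf rule
// below, and an s8 G_PHI is widened to s16 by clampScalar.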
507 getActionDefinitionsBuilder(G_PHI)
508 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
509 .legalFor(AllS32Vectors)
510 .legalFor(AllS64Vectors)
511 .legalFor(AddrSpaces64)
512 .legalFor(AddrSpaces32)
513 .legalIf(isPointer(0))
514 .clampScalar(0, S16, S256)
515 .widenScalarToNextPow2(0, 32)
516 .clampMaxNumElements(0, S32, 16)
517 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
518 .scalarize(0);
519
520 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
521 // Full set of gfx9 features.
522 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
523 .legalFor({S32, S16, V2S16})
524 .clampScalar(0, S16, S32)
525 .clampMaxNumElements(0, S16, 2)
526 .scalarize(0)
527 .widenScalarToNextPow2(0, 32);
528
529 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
530 .legalFor({S32, S16, V2S16}) // Clamp modifier
531 .minScalarOrElt(0, S16)
532 .clampMaxNumElements(0, S16, 2)
533 .scalarize(0)
534 .widenScalarToNextPow2(0, 32)
535 .lower();
536 } else if (ST.has16BitInsts()) {
537 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
538 .legalFor({S32, S16})
539 .clampScalar(0, S16, S32)
540 .scalarize(0)
541 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
542
543 // Technically the saturating operations require clamp bit support, but this
544 // was introduced at the same time as 16-bit operations.
545 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
546 .legalFor({S32, S16}) // Clamp modifier
547 .minScalar(0, S16)
548 .scalarize(0)
549 .widenScalarToNextPow2(0, 16)
550 .lower();
551
552 // We're just lowering this, but it helps get a better result to try to
553 // coerce to the desired type first.
554 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
555 .minScalar(0, S16)
556 .scalarize(0)
557 .lower();
558 } else {
559 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
560 .legalFor({S32})
561 .clampScalar(0, S32, S32)
562 .scalarize(0);
563
564 if (ST.hasIntClamp()) {
565 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
566 .legalFor({S32}) // Clamp modifier.
567 .scalarize(0)
568 .minScalarOrElt(0, S32)
569 .lower();
570 } else {
571 // Clamp bit support was added in VI, along with 16-bit operations.
572 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
573 .minScalar(0, S32)
574 .scalarize(0)
575 .lower();
576 }
577
578 // FIXME: DAG expansion gets better results. The widening uses the smaller
579 // range values and goes for the min/max lowering directly.
580 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
581 .minScalar(0, S32)
582 .scalarize(0)
583 .lower();
584 }
585
586 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
587 .customFor({S32, S64})
588 .clampScalar(0, S32, S64)
589 .widenScalarToNextPow2(0, 32)
590 .scalarize(0);
591
592 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
593 .legalFor({S32})
594 .maxScalarOrElt(0, S32);
595
596 if (ST.hasVOP3PInsts()) {
597 Mulh
598 .clampMaxNumElements(0, S8, 2)
599 .lowerFor({V2S8});
600 }
601
602 Mulh
603 .scalarize(0)
604 .lower();
605
606 // Report legal for any types we can handle anywhere. For the cases only legal
607 // on the SALU, RegBankSelect will be able to re-legalize.
608 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
609 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
610 .clampScalar(0, S32, S64)
611 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
612 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
613 .widenScalarToNextPow2(0)
614 .scalarize(0);
615
616 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
617 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
618 .legalFor({{S32, S1}, {S32, S32}})
619 .minScalar(0, S32)
620 // TODO: .scalarize(0)
621 .lower();
622
623 getActionDefinitionsBuilder(G_BITCAST)
624 // Don't worry about the size constraint.
625 .legalIf(all(isRegisterType(0), isRegisterType(1)))
626 .lower();
627
628
629 getActionDefinitionsBuilder(G_CONSTANT)
630 .legalFor({S1, S32, S64, S16, GlobalPtr,
631 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
632 .legalIf(isPointer(0))
633 .clampScalar(0, S32, S64)
634 .widenScalarToNextPow2(0);
635
636 getActionDefinitionsBuilder(G_FCONSTANT)
637 .legalFor({S32, S64, S16})
638 .clampScalar(0, S16, S64);
639
640 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
641 .legalIf(isRegisterType(0))
642 // s1 and s16 are special cases because they have legal operations on
643 // them, but don't really occupy registers in the normal way.
644 .legalFor({S1, S16})
645 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
646 .clampScalarOrElt(0, S32, MaxScalar)
647 .widenScalarToNextPow2(0, 32)
648 .clampMaxNumElements(0, S32, 16);
649
650 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
651
652 // If the amount is divergent, we have to do a wave reduction to get the
653 // maximum value, so this is expanded during RegBankSelect.
654 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
655 .legalFor({{PrivatePtr, S32}});
656
657 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
658 .customIf(typeIsNot(0, PrivatePtr));
659
660 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
661
662 auto &FPOpActions = getActionDefinitionsBuilder(
663 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
664 .legalFor({S32, S64});
665 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
666 .customFor({S32, S64});
667 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
668 .customFor({S32, S64});
669
670 if (ST.has16BitInsts()) {
671 if (ST.hasVOP3PInsts())
672 FPOpActions.legalFor({S16, V2S16});
673 else
674 FPOpActions.legalFor({S16});
675
676 TrigActions.customFor({S16});
677 FDIVActions.customFor({S16});
678 }
679
680 auto &MinNumMaxNum = getActionDefinitionsBuilder({
681 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
682
683 if (ST.hasVOP3PInsts()) {
684 MinNumMaxNum.customFor(FPTypesPK16)
685 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
686 .clampMaxNumElements(0, S16, 2)
687 .clampScalar(0, S16, S64)
688 .scalarize(0);
689 } else if (ST.has16BitInsts()) {
690 MinNumMaxNum.customFor(FPTypes16)
691 .clampScalar(0, S16, S64)
692 .scalarize(0);
693 } else {
694 MinNumMaxNum.customFor(FPTypesBase)
695 .clampScalar(0, S32, S64)
696 .scalarize(0);
697 }
698
699 if (ST.hasVOP3PInsts())
700 FPOpActions.clampMaxNumElements(0, S16, 2);
701
702 FPOpActions
703 .scalarize(0)
704 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
705
706 TrigActions
707 .scalarize(0)
708 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709
710 FDIVActions
711 .scalarize(0)
712 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
713
714 getActionDefinitionsBuilder({G_FNEG, G_FABS})
715 .legalFor(FPTypesPK16)
716 .clampMaxNumElements(0, S16, 2)
717 .scalarize(0)
718 .clampScalar(0, S16, S64);
719
720 if (ST.has16BitInsts()) {
721 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
722 .legalFor({S32, S64, S16})
723 .scalarize(0)
724 .clampScalar(0, S16, S64);
725 } else {
726 getActionDefinitionsBuilder(G_FSQRT)
727 .legalFor({S32, S64})
728 .scalarize(0)
729 .clampScalar(0, S32, S64);
730
731 if (ST.hasFractBug()) {
732 getActionDefinitionsBuilder(G_FFLOOR)
733 .customFor({S64})
734 .legalFor({S32, S64})
735 .scalarize(0)
736 .clampScalar(0, S32, S64);
737 } else {
738 getActionDefinitionsBuilder(G_FFLOOR)
739 .legalFor({S32, S64})
740 .scalarize(0)
741 .clampScalar(0, S32, S64);
742 }
743 }
744
745 getActionDefinitionsBuilder(G_FPTRUNC)
746 .legalFor({{S32, S64}, {S16, S32}})
747 .scalarize(0)
748 .lower();
749
750 getActionDefinitionsBuilder(G_FPEXT)
751 .legalFor({{S64, S32}, {S32, S16}})
752 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
753 .scalarize(0);
754
755 getActionDefinitionsBuilder(G_FSUB)
756 // Use actual fsub instruction
757 .legalFor({S32})
758 // Must use fadd + fneg
759 .lowerFor({S64, S16, V2S16})
760 .scalarize(0)
761 .clampScalar(0, S32, S64);
762
763 // Whether this is legal depends on the floating point mode for the function.
764 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
765 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
766 FMad.customFor({S32, S16});
767 else if (ST.hasMadMacF32Insts())
768 FMad.customFor({S32});
769 else if (ST.hasMadF16())
770 FMad.customFor({S16});
771 FMad.scalarize(0)
772 .lower();
773
774 auto &FRem = getActionDefinitionsBuilder(G_FREM);
775 if (ST.has16BitInsts()) {
776 FRem.customFor({S16, S32, S64});
777 } else {
778 FRem.minScalar(0, S32)
779 .customFor({S32, S64});
780 }
781 FRem.scalarize(0);
782
783 // TODO: Do we need to clamp maximum bitwidth?
784 getActionDefinitionsBuilder(G_TRUNC)
785 .legalIf(isScalar(0))
786 .legalFor({{V2S16, V2S32}})
787 .clampMaxNumElements(0, S16, 2)
788 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
789 // situations (like an invalid implicit use), we don't want to loop forever
790 // in the legalizer.
791 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
792 .alwaysLegal();
793
794 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
795 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
796 {S32, S1}, {S64, S1}, {S16, S1}})
797 .scalarize(0)
798 .clampScalar(0, S32, S64)
799 .widenScalarToNextPow2(1, 32);
800
801 // TODO: Split s1->s64 during regbankselect for VALU.
802 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
803 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
804 .lowerFor({{S32, S64}})
805 .lowerIf(typeIs(1, S1))
806 .customFor({{S64, S64}});
807 if (ST.has16BitInsts())
808 IToFP.legalFor({{S16, S16}});
809 IToFP.clampScalar(1, S32, S64)
810 .minScalar(0, S32)
811 .scalarize(0)
812 .widenScalarToNextPow2(1);
813
814 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
815 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
816 .customFor({{S64, S64}})
817 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
818 if (ST.has16BitInsts())
819 FPToI.legalFor({{S16, S16}});
820 else
821 FPToI.minScalar(1, S32);
822
823 FPToI.minScalar(0, S32)
824 .scalarize(0)
825 .lower();
826
827 // Lower roundeven into G_FRINT
828 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
829 .scalarize(0)
830 .lower();
831
832 if (ST.has16BitInsts()) {
833 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
834 .legalFor({S16, S32, S64})
835 .clampScalar(0, S16, S64)
836 .scalarize(0);
837 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
838 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
839 .legalFor({S32, S64})
840 .clampScalar(0, S32, S64)
841 .scalarize(0);
842 } else {
843 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
844 .legalFor({S32})
845 .customFor({S64})
846 .clampScalar(0, S32, S64)
847 .scalarize(0);
848 }
849
850 getActionDefinitionsBuilder(G_PTR_ADD)
851 .legalIf(all(isPointer(0), sameSize(0, 1)))
852 .scalarize(0)
853 .scalarSameSizeAs(1, 0);
854
855 getActionDefinitionsBuilder(G_PTRMASK)
856 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
857 .scalarSameSizeAs(1, 0)
858 .scalarize(0);
859
860 auto &CmpBuilder =
861 getActionDefinitionsBuilder(G_ICMP)
862 // The compare output type differs based on the register bank of the output,
863 // so make both s1 and s32 legal.
864 //
865 // Scalar compares producing output in scc will be promoted to s32, as that
866 // is the allocatable register type that will be needed for the copy from
867 // scc. This will be promoted during RegBankSelect, and we assume something
868 // before that won't try to use s32 result types.
869 //
870 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
871 // bank.
872 .legalForCartesianProduct(
873 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
874 .legalForCartesianProduct(
875 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
876 if (ST.has16BitInsts()) {
877 CmpBuilder.legalFor({{S1, S16}});
878 }
879
880 CmpBuilder
881 .widenScalarToNextPow2(1)
882 .clampScalar(1, S32, S64)
883 .scalarize(0)
884 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
885
886 getActionDefinitionsBuilder(G_FCMP)
887 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
888 .widenScalarToNextPow2(1)
889 .clampScalar(1, S32, S64)
890 .scalarize(0);
891
892 // FIXME: fpow has a selection pattern that should move to custom lowering.
893 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
894 if (ST.has16BitInsts())
895 Exp2Ops.legalFor({S32, S16});
896 else
897 Exp2Ops.legalFor({S32});
898 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
899 Exp2Ops.scalarize(0);
900
901 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
902 if (ST.has16BitInsts())
903 ExpOps.customFor({{S32}, {S16}});
904 else
905 ExpOps.customFor({S32});
906 ExpOps.clampScalar(0, MinScalarFPTy, S32)
907 .scalarize(0);
908
909 getActionDefinitionsBuilder(G_FPOWI)
910 .clampScalar(0, MinScalarFPTy, S32)
911 .lower();
912
913 // The 64-bit versions produce 32-bit results, but only on the SALU.
914 getActionDefinitionsBuilder(G_CTPOP)
915 .legalFor({{S32, S32}, {S32, S64}})
916 .clampScalar(0, S32, S32)
917 .clampScalar(1, S32, S64)
918 .scalarize(0)
919 .widenScalarToNextPow2(0, 32)
920 .widenScalarToNextPow2(1, 32);
921
922 // The hardware instructions return a different result on 0 than the generic
923 // instructions expect. The hardware produces -1, but these produce the
924 // bitwidth.
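// e.g. an s32 G_CTLZ of zero is defined to produce 32, while the hardware
// instruction produces -1, so these are lowered rather than selected directly.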
925 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
926 .scalarize(0)
927 .clampScalar(0, S32, S32)
928 .clampScalar(1, S32, S64)
929 .widenScalarToNextPow2(0, 32)
930 .widenScalarToNextPow2(1, 32)
931 .lower();
932
933 // The 64-bit versions produce 32-bit results, but only on the SALU.
934 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
935 .legalFor({{S32, S32}, {S32, S64}})
936 .clampScalar(0, S32, S32)
937 .clampScalar(1, S32, S64)
938 .scalarize(0)
939 .widenScalarToNextPow2(0, 32)
940 .widenScalarToNextPow2(1, 32);
941
942 getActionDefinitionsBuilder(G_BITREVERSE)
943 .legalFor({S32})
944 .clampScalar(0, S32, S32)
945 .scalarize(0);
946
947 if (ST.has16BitInsts()) {
948 getActionDefinitionsBuilder(G_BSWAP)
949 .legalFor({S16, S32, V2S16})
950 .clampMaxNumElements(0, S16, 2)
951 // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
952 // narrowScalar limitation.
953 .widenScalarToNextPow2(0)
954 .clampScalar(0, S16, S32)
955 .scalarize(0);
956
957 if (ST.hasVOP3PInsts()) {
958 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
959 .legalFor({S32, S16, V2S16})
960 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
961 .clampMaxNumElements(0, S16, 2)
962 .minScalar(0, S16)
963 .widenScalarToNextPow2(0)
964 .scalarize(0)
965 .lower();
966 } else {
967 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
968 .legalFor({S32, S16})
969 .widenScalarToNextPow2(0)
970 .minScalar(0, S16)
971 .scalarize(0)
972 .lower();
973 }
974 } else {
975 // TODO: Should have same legality without v_perm_b32
976 getActionDefinitionsBuilder(G_BSWAP)
977 .legalFor({S32})
978 .lowerIf(scalarNarrowerThan(0, 32))
979 // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
980 // narrowScalar limitation.
981 .widenScalarToNextPow2(0)
982 .maxScalar(0, S32)
983 .scalarize(0)
984 .lower();
985
986 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
987 .legalFor({S32})
988 .minScalar(0, S32)
989 .widenScalarToNextPow2(0)
990 .scalarize(0)
991 .lower();
992 }
993
994 getActionDefinitionsBuilder(G_INTTOPTR)
995 // List the common cases
996 .legalForCartesianProduct(AddrSpaces64, {S64})
997 .legalForCartesianProduct(AddrSpaces32, {S32})
998 .scalarize(0)
999 // Accept any address space as long as the size matches
1000 .legalIf(sameSize(0, 1))
1001 .widenScalarIf(smallerThan(1, 0),
1002 [](const LegalityQuery &Query) {
1003 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1004 })
1005 .narrowScalarIf(largerThan(1, 0),
1006 [](const LegalityQuery &Query) {
1007 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1008 });
1009
1010 getActionDefinitionsBuilder(G_PTRTOINT)
1011 // List the common cases
1012 .legalForCartesianProduct(AddrSpaces64, {S64})
1013 .legalForCartesianProduct(AddrSpaces32, {S32})
1014 .scalarize(0)
1015 // Accept any address space as long as the size matches
1016 .legalIf(sameSize(0, 1))
1017 .widenScalarIf(smallerThan(0, 1),
1018 [](const LegalityQuery &Query) {
1019 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1020 })
1021 .narrowScalarIf(
1022 largerThan(0, 1),
1023 [](const LegalityQuery &Query) {
1024 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1025 });
1026
1027 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1028 .scalarize(0)
1029 .custom();
1030
1031 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1032 bool IsLoad) -> bool {
1033 const LLT DstTy = Query.Types[0];
1034
1035 // Split vector extloads.
1036 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1037 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1038
1039 if (MemSize < DstTy.getSizeInBits())
1040 MemSize = std::max(MemSize, AlignBits);
1041
1042 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1043 return true;
1044
1045 const LLT PtrTy = Query.Types[1];
1046 unsigned AS = PtrTy.getAddressSpace();
1047 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1048 return true;
1049
1050 // Catch weird sized loads that don't evenly divide into the access sizes
1051 // TODO: May be able to widen depending on alignment etc.
1052 unsigned NumRegs = (MemSize + 31) / 32;
1053 if (NumRegs == 3) {
1054 if (!ST.hasDwordx3LoadStores())
1055 return true;
1056 } else {
1057 // If the alignment allows, these should have been widened.
1058 if (!isPowerOf2_32(NumRegs))
1059 return true;
1060 }
1061
1062 if (AlignBits < MemSize) {
1063 const SITargetLowering *TLI = ST.getTargetLowering();
1064 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1065 Align(AlignBits / 8));
1066 }
1067
1068 return false;
1069 };
1070
1071 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1072 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1073 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1074
1075 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1076 // LDS
1077 // TODO: Unsupported flat for SI.
1078
1079 for (unsigned Op : {G_LOAD, G_STORE}) {
1080 const bool IsStore = Op == G_STORE;
1081
1082 auto &Actions = getActionDefinitionsBuilder(Op);
1083 // Explicitly list some common cases.
1084 // TODO: Does this help compile time at all?
1085 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1086 {V2S32, GlobalPtr, 64, GlobalAlign32},
1087 {V4S32, GlobalPtr, 128, GlobalAlign32},
1088 {S64, GlobalPtr, 64, GlobalAlign32},
1089 {V2S64, GlobalPtr, 128, GlobalAlign32},
1090 {V2S16, GlobalPtr, 32, GlobalAlign32},
1091 {S32, GlobalPtr, 8, GlobalAlign8},
1092 {S32, GlobalPtr, 16, GlobalAlign16},
1093
1094 {S32, LocalPtr, 32, 32},
1095 {S64, LocalPtr, 64, 32},
1096 {V2S32, LocalPtr, 64, 32},
1097 {S32, LocalPtr, 8, 8},
1098 {S32, LocalPtr, 16, 16},
1099 {V2S16, LocalPtr, 32, 32},
1100
1101 {S32, PrivatePtr, 32, 32},
1102 {S32, PrivatePtr, 8, 8},
1103 {S32, PrivatePtr, 16, 16},
1104 {V2S16, PrivatePtr, 32, 32},
1105
1106 {S32, ConstantPtr, 32, GlobalAlign32},
1107 {V2S32, ConstantPtr, 64, GlobalAlign32},
1108 {V4S32, ConstantPtr, 128, GlobalAlign32},
1109 {S64, ConstantPtr, 64, GlobalAlign32},
1110 {V2S32, ConstantPtr, 32, GlobalAlign32}});
1111 Actions.legalIf(
1112 [=](const LegalityQuery &Query) -> bool {
1113 return isLoadStoreLegal(ST, Query, Op);
1114 });
1115
1116 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1117 // 64-bits.
1118 //
1119 // TODO: Should generalize bitcast action into coerce, which will also cover
1120 // inserting addrspacecasts.
1121 Actions.customIf(typeIs(1, Constant32Ptr));
1122
1123 // Turn vectors with illegal element types into something easier to deal
1124 // with. These will ultimately produce 32-bit scalar shifts to extract the
1125 // parts anyway.
1126 //
1127 // For odd 16-bit element vectors, prefer to split those into pieces with
1128 // 16-bit vector parts.
1129 Actions.bitcastIf(
1130 [=](const LegalityQuery &Query) -> bool {
1131 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1132 Query.MMODescrs[0].SizeInBits);
1133 }, bitcastToRegisterType(0));
1134
1135 if (!IsStore) {
1136 // Widen suitably aligned loads by loading extra bytes. The standard
1137 // legalization actions can't properly express widening memory operands.
1138 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1139 return shouldWidenLoad(ST, Query, G_LOAD);
1140 });
1141 }
1142
1143 // FIXME: load/store narrowing should be moved to lower action
1144 Actions
1145 .narrowScalarIf(
1146 [=](const LegalityQuery &Query) -> bool {
1147 return !Query.Types[0].isVector() &&
1148 needToSplitMemOp(Query, Op == G_LOAD);
1149 },
1150 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1151 const LLT DstTy = Query.Types[0];
1152 const LLT PtrTy = Query.Types[1];
1153
1154 const unsigned DstSize = DstTy.getSizeInBits();
1155 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1156
1157 // Split extloads.
1158 if (DstSize > MemSize)
1159 return std::make_pair(0, LLT::scalar(MemSize));
1160
1161 if (!isPowerOf2_32(DstSize)) {
1162 // We're probably decomposing an odd sized store. Try to split
1163 // to the widest type. TODO: Account for alignment. As-is it
1164 // should be OK, since the new parts will be further legalized.
1165 unsigned FloorSize = PowerOf2Floor(DstSize);
1166 return std::make_pair(0, LLT::scalar(FloorSize));
1167 }
1168
1169 if (DstSize > 32 && (DstSize % 32 != 0)) {
1170 // FIXME: Need a way to specify non-extload of larger size if
1171 // suitably aligned.
1172 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1173 }
1174
1175 unsigned MaxSize = maxSizeForAddrSpace(ST,
1176 PtrTy.getAddressSpace(),
1177 Op == G_LOAD);
1178 if (MemSize > MaxSize)
1179 return std::make_pair(0, LLT::scalar(MaxSize));
1180
1181 unsigned Align = Query.MMODescrs[0].AlignInBits;
1182 return std::make_pair(0, LLT::scalar(Align));
1183 })
1184 .fewerElementsIf(
1185 [=](const LegalityQuery &Query) -> bool {
1186 return Query.Types[0].isVector() &&
1187 needToSplitMemOp(Query, Op == G_LOAD);
1188 },
1189 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1190 const LLT DstTy = Query.Types[0];
1191 const LLT PtrTy = Query.Types[1];
1192
1193 LLT EltTy = DstTy.getElementType();
1194 unsigned MaxSize = maxSizeForAddrSpace(ST,
1195 PtrTy.getAddressSpace(),
1196 Op == G_LOAD);
1197
1198 // FIXME: Handle widened to power of 2 results better. This ends
1199 // up scalarizing.
1200 // FIXME: 3 element stores scalarized on SI
1201
1202 // Split if it's too large for the address space.
1203 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1204 unsigned NumElts = DstTy.getNumElements();
1205 unsigned EltSize = EltTy.getSizeInBits();
1206
1207 if (MaxSize % EltSize == 0) {
1208 return std::make_pair(
1209 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1210 }
1211
1212 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1213
1214 // FIXME: Refine when odd breakdowns handled
1215 // The scalars will need to be re-legalized.
1216 if (NumPieces == 1 || NumPieces >= NumElts ||
1217 NumElts % NumPieces != 0)
1218 return std::make_pair(0, EltTy);
1219
1220 return std::make_pair(0,
1221 LLT::vector(NumElts / NumPieces, EltTy));
1222 }
1223
1224 // FIXME: We could probably handle weird extending loads better.
1225 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1226 if (DstTy.getSizeInBits() > MemSize)
1227 return std::make_pair(0, EltTy);
1228
1229 unsigned EltSize = EltTy.getSizeInBits();
1230 unsigned DstSize = DstTy.getSizeInBits();
1231 if (!isPowerOf2_32(DstSize)) {
1232 // We're probably decomposing an odd sized store. Try to split
1233 // to the widest type. TODO: Account for alignment. As-is it
1234 // should be OK, since the new parts will be further legalized.
1235 unsigned FloorSize = PowerOf2Floor(DstSize);
1236 return std::make_pair(
1237 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1238 }
1239
1240 // Need to split because of alignment.
1241 unsigned Align = Query.MMODescrs[0].AlignInBits;
1242 if (EltSize > Align &&
1243 (EltSize / Align < DstTy.getNumElements())) {
1244 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1245 }
1246
1247 // May need relegalization for the scalars.
1248 return std::make_pair(0, EltTy);
1249 })
1250 .lowerIfMemSizeNotPow2()
1251 .minScalar(0, S32);
1252
1253 if (IsStore)
1254 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1255
1256 Actions
1257 .widenScalarToNextPow2(0)
1258 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1259 .lower();
1260 }
1261
1262 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1263 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1264 {S32, GlobalPtr, 16, 2 * 8},
1265 {S32, LocalPtr, 8, 8},
1266 {S32, LocalPtr, 16, 16},
1267 {S32, PrivatePtr, 8, 8},
1268 {S32, PrivatePtr, 16, 16},
1269 {S32, ConstantPtr, 8, 8},
1270 {S32, ConstantPtr, 16, 2 * 8}});
1271 if (ST.hasFlatAddressSpace()) {
1272 ExtLoads.legalForTypesWithMemDesc(
1273 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1274 }
1275
1276 ExtLoads.clampScalar(0, S32, S32)
1277 .widenScalarToNextPow2(0)
1278 .unsupportedIfMemSizeNotPow2()
1279 .lower();
1280
1281 auto &Atomics = getActionDefinitionsBuilder(
1282 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1283 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1284 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1285 G_ATOMICRMW_UMIN})
1286 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1287 {S64, GlobalPtr}, {S64, LocalPtr},
1288 {S32, RegionPtr}, {S64, RegionPtr}});
1289 if (ST.hasFlatAddressSpace()) {
1290 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1291 }
1292
1293 if (ST.hasLDSFPAtomics()) {
1294 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1295 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1296 }
1297
1298 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1299 // demarshalling
1300 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302 {S32, FlatPtr}, {S64, FlatPtr}})
1303 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304 {S32, RegionPtr}, {S64, RegionPtr}});
1305 // TODO: Pointer types, any 32-bit or 64-bit vector
1306
1307 // Condition should be s32 for scalar, s1 for vector.
1308 getActionDefinitionsBuilder(G_SELECT)
1309 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1310 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1311 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1312 .clampScalar(0, S16, S64)
1313 .scalarize(1)
1314 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1315 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1316 .clampMaxNumElements(0, S32, 2)
1317 .clampMaxNumElements(0, LocalPtr, 2)
1318 .clampMaxNumElements(0, PrivatePtr, 2)
1319 .scalarize(0)
1320 .widenScalarToNextPow2(0)
1321 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1322
1323 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1324 // be more flexible with the shift amount type.
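// (4 bits for 16-bit shifts, 5 bits for 32-bit, 6 bits for 64-bit).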
1325 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1326 .legalFor({{S32, S32}, {S64, S32}});
1327 if (ST.has16BitInsts()) {
1328 if (ST.hasVOP3PInsts()) {
1329 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1330 .clampMaxNumElements(0, S16, 2);
1331 } else
1332 Shifts.legalFor({{S16, S16}});
1333
1334 // TODO: Support 16-bit shift amounts for all types
1335 Shifts.widenScalarIf(
1336 [=](const LegalityQuery &Query) {
1337 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1338 // 32-bit amount.
1339 const LLT ValTy = Query.Types[0];
1340 const LLT AmountTy = Query.Types[1];
1341 return ValTy.getSizeInBits() <= 16 &&
1342 AmountTy.getSizeInBits() < 16;
1343 }, changeTo(1, S16));
1344 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1345 Shifts.clampScalar(1, S32, S32);
1346 Shifts.clampScalar(0, S16, S64);
1347 Shifts.widenScalarToNextPow2(0, 16);
1348
1349 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1350 .minScalar(0, S16)
1351 .scalarize(0)
1352 .lower();
1353 } else {
1354 // Make sure we legalize the shift amount type first, as the general
1355 // expansion for the shifted type will produce much worse code if it hasn't
1356 // been truncated already.
1357 Shifts.clampScalar(1, S32, S32);
1358 Shifts.clampScalar(0, S32, S64);
1359 Shifts.widenScalarToNextPow2(0, 32);
1360
1361 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1362 .minScalar(0, S32)
1363 .scalarize(0)
1364 .lower();
1365 }
1366 Shifts.scalarize(0);
1367
1368 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1369 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1370 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1371 unsigned IdxTypeIdx = 2;
1372
1373 getActionDefinitionsBuilder(Op)
1374 .customIf([=](const LegalityQuery &Query) {
1375 const LLT EltTy = Query.Types[EltTypeIdx];
1376 const LLT VecTy = Query.Types[VecTypeIdx];
1377 const LLT IdxTy = Query.Types[IdxTypeIdx];
1378 const unsigned EltSize = EltTy.getSizeInBits();
1379 return (EltSize == 32 || EltSize == 64) &&
1380 VecTy.getSizeInBits() % 32 == 0 &&
1381 VecTy.getSizeInBits() <= MaxRegisterSize &&
1382 IdxTy.getSizeInBits() == 32;
1383 })
1384 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1385 bitcastToVectorElement32(VecTypeIdx))
1386 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1387 .bitcastIf(
1388 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1389 [=](const LegalityQuery &Query) {
1390 // For > 64-bit element types, try to turn this into a 64-bit
1391 // element vector since we may be able to do better indexing
1392 // if this is scalar. If not, fall back to 32.
1393 const LLT EltTy = Query.Types[EltTypeIdx];
1394 const LLT VecTy = Query.Types[VecTypeIdx];
1395 const unsigned DstEltSize = EltTy.getSizeInBits();
1396 const unsigned VecSize = VecTy.getSizeInBits();
1397
1398 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1399 return std::make_pair(
1400 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1401 })
1402 .clampScalar(EltTypeIdx, S32, S64)
1403 .clampScalar(VecTypeIdx, S32, S64)
1404 .clampScalar(IdxTypeIdx, S32, S32)
1405 .clampMaxNumElements(VecTypeIdx, S32, 32)
1406 // TODO: Clamp elements for 64-bit vectors?
1407 // It should only be necessary with variable indexes.
1408 // As a last resort, lower to the stack
1409 .lower();
1410 }
1411
1412 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1413 .unsupportedIf([=](const LegalityQuery &Query) {
1414 const LLT &EltTy = Query.Types[1].getElementType();
1415 return Query.Types[0] != EltTy;
1416 });
1417
1418 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1419 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1420 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1421
1422 // FIXME: Doesn't handle extract of illegal sizes.
1423 getActionDefinitionsBuilder(Op)
1424 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1425 // FIXME: Multiples of 16 should not be legal.
1426 .legalIf([=](const LegalityQuery &Query) {
1427 const LLT BigTy = Query.Types[BigTyIdx];
1428 const LLT LitTy = Query.Types[LitTyIdx];
1429 return (BigTy.getSizeInBits() % 32 == 0) &&
1430 (LitTy.getSizeInBits() % 16 == 0);
1431 })
1432 .widenScalarIf(
1433 [=](const LegalityQuery &Query) {
1434 const LLT BigTy = Query.Types[BigTyIdx];
1435 return (BigTy.getScalarSizeInBits() < 16);
1436 },
1437 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1438 .widenScalarIf(
1439 [=](const LegalityQuery &Query) {
1440 const LLT LitTy = Query.Types[LitTyIdx];
1441 return (LitTy.getScalarSizeInBits() < 16);
1442 },
1443 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1444 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1445 .widenScalarToNextPow2(BigTyIdx, 32);
1446
1447 }
1448
1449 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1450 .legalForCartesianProduct(AllS32Vectors, {S32})
1451 .legalForCartesianProduct(AllS64Vectors, {S64})
1452 .clampNumElements(0, V16S32, V32S32)
1453 .clampNumElements(0, V2S64, V16S64)
1454 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1455
1456 if (ST.hasScalarPackInsts()) {
1457 BuildVector
1458 // FIXME: Should probably widen s1 vectors straight to s32
1459 .minScalarOrElt(0, S16)
1460 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1461 .minScalar(1, S32);
1462
1463 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1464 .legalFor({V2S16, S32})
1465 .lower();
1466 BuildVector.minScalarOrElt(0, S32);
1467 } else {
1468 BuildVector.customFor({V2S16, S16});
1469 BuildVector.minScalarOrElt(0, S32);
1470
1471 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1472 .customFor({V2S16, S32})
1473 .lower();
1474 }
1475
1476 BuildVector.legalIf(isRegisterType(0));
1477
1478 // FIXME: Clamp maximum size
1479 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1480 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1481 .clampMaxNumElements(0, S32, 32)
1482 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1483 .clampMaxNumElements(0, S16, 64);
1484
1485 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1486 // pre-legalize.
1487 if (ST.hasVOP3PInsts()) {
1488 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1489 .customFor({V2S16, V2S16})
1490 .lower();
1491 } else
1492 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1493
1494 // Merge/Unmerge
1495 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1496 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1497 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1498
1499 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1500 const LLT Ty = Query.Types[TypeIdx];
1501 if (Ty.isVector()) {
1502 const LLT &EltTy = Ty.getElementType();
1503 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1504 return true;
1505 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1506 return true;
1507 }
1508 return false;
1509 };
1510
1511 auto &Builder = getActionDefinitionsBuilder(Op)
1512 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513 .lowerFor({{S16, V2S16}})
1514 .lowerIf([=](const LegalityQuery &Query) {
1515 const LLT BigTy = Query.Types[BigTyIdx];
1516 return BigTy.getSizeInBits() == 32;
1517 })
1518 // Try to widen to s16 first for small types.
1519 // TODO: Only do this on targets with legal s16 shifts
1520 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1521 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1522 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1523 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1524 elementTypeIs(1, S16)),
1525 changeTo(1, V2S16))
1526 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1527 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1528 // valid.
1529 .clampScalar(LitTyIdx, S32, S512)
1530 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1531 // Break up vectors with weird elements into scalars
1532 .fewerElementsIf(
1533 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1534 scalarize(0))
1535 .fewerElementsIf(
1536 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1537 scalarize(1))
1538 .clampScalar(BigTyIdx, S32, MaxScalar);
1539
1540 if (Op == G_MERGE_VALUES) {
1541 Builder.widenScalarIf(
1542 // TODO: Use 16-bit shifts if legal for 8-bit values?
1543 [=](const LegalityQuery &Query) {
1544 const LLT Ty = Query.Types[LitTyIdx];
1545 return Ty.getSizeInBits() < 32;
1546 },
1547 changeTo(LitTyIdx, S32));
1548 }
1549
1550 Builder.widenScalarIf(
1551 [=](const LegalityQuery &Query) {
1552 const LLT Ty = Query.Types[BigTyIdx];
1553 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1554 Ty.getSizeInBits() % 16 != 0;
1555 },
1556 [=](const LegalityQuery &Query) {
1557 // Pick the next power of 2, or a multiple of 64 over 128.
1558 // Whichever is smaller.
1559 const LLT &Ty = Query.Types[BigTyIdx];
1560 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1561 if (NewSizeInBits >= 256) {
1562 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1563 if (RoundedTo < NewSizeInBits)
1564 NewSizeInBits = RoundedTo;
1565 }
1566 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1567 })
1568 // Any vectors left are the wrong size. Scalarize them.
1569 .scalarize(0)
1570 .scalarize(1);
1571 }
1572
1573 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1574 // RegBankSelect.
1575 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1576 .legalFor({{S32}, {S64}});
1577
1578 if (ST.hasVOP3PInsts()) {
1579 SextInReg.lowerFor({{V2S16}})
1580 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1581 // get more vector shift opportunities, since we'll get those when
1582 // expanded.
1583 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1584 } else if (ST.has16BitInsts()) {
1585 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1586 } else {
1587 // Prefer to promote to s32 before lowering if we don't have 16-bit
1588 // shifts. This avoids a lot of intermediate truncate and extend operations.
1589 SextInReg.lowerFor({{S32}, {S64}});
1590 }
1591
1592 SextInReg
1593 .scalarize(0)
1594 .clampScalar(0, S32, S64)
1595 .lower();
1596
1597 getActionDefinitionsBuilder(G_FSHR)
1598 .legalFor({{S32, S32}})
1599 .scalarize(0)
1600 .lower();
1601
1602 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1603 .legalFor({S64});
1604
1605 getActionDefinitionsBuilder(G_FENCE)
1606 .alwaysLegal();
1607
1608 getActionDefinitionsBuilder({
1609 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1610 G_FCOPYSIGN,
1611
1612 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1613 G_ATOMICRMW_NAND,
1614 G_ATOMICRMW_FSUB,
1615 G_READ_REGISTER,
1616 G_WRITE_REGISTER,
1617
1618 G_SADDO, G_SSUBO,
1619
1620 // TODO: Implement
1621 G_FMINIMUM, G_FMAXIMUM,
1622 G_FSHL
1623 }).lower();
1624
1625 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1626 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1627 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1628 .unsupported();
1629
1630 computeTables();
1631 verify(*ST.getInstrInfo());
1632 }
1633
1634 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1635 MachineInstr &MI) const {
1636 MachineIRBuilder &B = Helper.MIRBuilder;
1637 MachineRegisterInfo &MRI = *B.getMRI();
1638
1639 switch (MI.getOpcode()) {
1640 case TargetOpcode::G_ADDRSPACE_CAST:
1641 return legalizeAddrSpaceCast(MI, MRI, B);
1642 case TargetOpcode::G_FRINT:
1643 return legalizeFrint(MI, MRI, B);
1644 case TargetOpcode::G_FCEIL:
1645 return legalizeFceil(MI, MRI, B);
1646 case TargetOpcode::G_FREM:
1647 return legalizeFrem(MI, MRI, B);
1648 case TargetOpcode::G_INTRINSIC_TRUNC:
1649 return legalizeIntrinsicTrunc(MI, MRI, B);
1650 case TargetOpcode::G_SITOFP:
1651 return legalizeITOFP(MI, MRI, B, true);
1652 case TargetOpcode::G_UITOFP:
1653 return legalizeITOFP(MI, MRI, B, false);
1654 case TargetOpcode::G_FPTOSI:
1655 return legalizeFPTOI(MI, MRI, B, true);
1656 case TargetOpcode::G_FPTOUI:
1657 return legalizeFPTOI(MI, MRI, B, false);
1658 case TargetOpcode::G_FMINNUM:
1659 case TargetOpcode::G_FMAXNUM:
1660 case TargetOpcode::G_FMINNUM_IEEE:
1661 case TargetOpcode::G_FMAXNUM_IEEE:
1662 return legalizeMinNumMaxNum(Helper, MI);
1663 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1664 return legalizeExtractVectorElt(MI, MRI, B);
1665 case TargetOpcode::G_INSERT_VECTOR_ELT:
1666 return legalizeInsertVectorElt(MI, MRI, B);
1667 case TargetOpcode::G_SHUFFLE_VECTOR:
1668 return legalizeShuffleVector(MI, MRI, B);
1669 case TargetOpcode::G_FSIN:
1670 case TargetOpcode::G_FCOS:
1671 return legalizeSinCos(MI, MRI, B);
1672 case TargetOpcode::G_GLOBAL_VALUE:
1673 return legalizeGlobalValue(MI, MRI, B);
1674 case TargetOpcode::G_LOAD:
1675 return legalizeLoad(Helper, MI);
1676 case TargetOpcode::G_FMAD:
1677 return legalizeFMad(MI, MRI, B);
1678 case TargetOpcode::G_FDIV:
1679 return legalizeFDIV(MI, MRI, B);
1680 case TargetOpcode::G_UDIV:
1681 case TargetOpcode::G_UREM:
1682 return legalizeUDIV_UREM(MI, MRI, B);
1683 case TargetOpcode::G_SDIV:
1684 case TargetOpcode::G_SREM:
1685 return legalizeSDIV_SREM(MI, MRI, B);
1686 case TargetOpcode::G_ATOMIC_CMPXCHG:
1687 return legalizeAtomicCmpXChg(MI, MRI, B);
1688 case TargetOpcode::G_FLOG:
1689 return legalizeFlog(MI, B, numbers::ln2f);
1690 case TargetOpcode::G_FLOG10:
1691 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1692 case TargetOpcode::G_FEXP:
1693 return legalizeFExp(MI, B);
1694 case TargetOpcode::G_FPOW:
1695 return legalizeFPow(MI, B);
1696 case TargetOpcode::G_FFLOOR:
1697 return legalizeFFloor(MI, MRI, B);
1698 case TargetOpcode::G_BUILD_VECTOR:
1699 return legalizeBuildVector(MI, MRI, B);
1700 default:
1701 return false;
1702 }
1703
1704 llvm_unreachable("expected switch to return");
1705 }
1706
1707 Register AMDGPULegalizerInfo::getSegmentAperture(
1708 unsigned AS,
1709 MachineRegisterInfo &MRI,
1710 MachineIRBuilder &B) const {
1711 MachineFunction &MF = B.getMF();
1712 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1713 const LLT S32 = LLT::scalar(32);
1714
1715 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1716
1717 if (ST.hasApertureRegs()) {
1718 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1719 // getreg.
1720 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1721 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1722 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1723 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1724 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1725 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1726 unsigned Encoding =
1727 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1728 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1729 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
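     // Roughly: the S_GETREG_B32 below reads a WidthM1+1 = 16 bit field of
     // the MEM_BASES hardware register holding the upper 16 bits of the
     // aperture base, and the left shift by 16 reconstructs the full 32-bit
     // base address.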
1730
1731 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1732
1733 B.buildInstr(AMDGPU::S_GETREG_B32)
1734 .addDef(GetReg)
1735 .addImm(Encoding);
1736 MRI.setType(GetReg, S32);
1737
1738 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1739 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1740 }
1741
1742 Register QueuePtr = MRI.createGenericVirtualRegister(
1743 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1744
1745 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1746 return Register();
1747
1748 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1749 // private_segment_aperture_base_hi.
1750 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1751
1752 // TODO: can we be smarter about machine pointer info?
1753 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1754 MachineMemOperand *MMO = MF.getMachineMemOperand(
1755 PtrInfo,
1756 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1757 MachineMemOperand::MOInvariant,
1758 4, commonAlignment(Align(64), StructOffset));
1759
1760 Register LoadAddr;
1761
1762 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1763 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1764 }
1765
1766 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1767 MachineInstr &MI, MachineRegisterInfo &MRI,
1768 MachineIRBuilder &B) const {
1769 MachineFunction &MF = B.getMF();
1770
1771 const LLT S32 = LLT::scalar(32);
1772 Register Dst = MI.getOperand(0).getReg();
1773 Register Src = MI.getOperand(1).getReg();
1774
1775 LLT DstTy = MRI.getType(Dst);
1776 LLT SrcTy = MRI.getType(Src);
1777 unsigned DestAS = DstTy.getAddressSpace();
1778 unsigned SrcAS = SrcTy.getAddressSpace();
1779
1780 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1781 // vector element.
1782 assert(!DstTy.isVector());
1783
1784 const AMDGPUTargetMachine &TM
1785 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1786
1787 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1788 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1789 return true;
1790 }
1791
1792 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1793 // Truncate.
1794 B.buildExtract(Dst, Src, 0);
1795 MI.eraseFromParent();
1796 return true;
1797 }
1798
1799 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1800 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1801 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1802
1803 // FIXME: This is a bit ugly: we merge two 32-bit pointer values into a
1804 // pointer of another address space. Merge operands are required to be the
1805 // same type, but creating an extra ptrtoint would be kind of pointless.
1806 auto HighAddr = B.buildConstant(
1807 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1808 B.buildMerge(Dst, {Src, HighAddr});
1809 MI.eraseFromParent();
1810 return true;
1811 }
1812
1813 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1814 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1815 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1816 unsigned NullVal = TM.getNullPointerValue(DestAS);
1817
1818 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1819 auto FlatNull = B.buildConstant(SrcTy, 0);
1820
1821 // Extract low 32-bits of the pointer.
1822 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1823
1824 auto CmpRes =
1825 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1826 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1827
1828 MI.eraseFromParent();
1829 return true;
1830 }
1831
1832 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1833 return false;
1834
1835 if (!ST.hasFlatAddressSpace())
1836 return false;
1837
1838 auto SegmentNull =
1839 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1840 auto FlatNull =
1841 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1842
1843 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1844 if (!ApertureReg.isValid())
1845 return false;
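  // In other words, a local/private pointer is extended to a flat pointer by
  // merging the 32-bit offset into the low half and the 32-bit aperture base
  // into the high half; the select below maps the segment null value to the
  // flat null value.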
1846
1847 auto CmpRes =
1848 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1849
1850 // Coerce the type of the low half of the result so we can use merge_values.
1851 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1852
1853 // TODO: Should we allow mismatched types but matching sizes in merges to
1854 // avoid the ptrtoint?
1855 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1856 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1857
1858 MI.eraseFromParent();
1859 return true;
1860 }
1861
1862 bool AMDGPULegalizerInfo::legalizeFrint(
1863 MachineInstr &MI, MachineRegisterInfo &MRI,
1864 MachineIRBuilder &B) const {
1865 Register Src = MI.getOperand(1).getReg();
1866 LLT Ty = MRI.getType(Src);
1867 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1868
1869 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1870 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
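  // Why this works: C1 is 2^52 and C2 is the largest double below 2^52.
  // Adding and then subtracting copysign(2^52, src) discards the fraction
  // bits at double precision, giving round-to-nearest-even; inputs with
  // |src| > C2 are already integral and are passed through by the select.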
1871
1872 auto C1 = B.buildFConstant(Ty, C1Val);
1873 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1874
1875 // TODO: Should this propagate fast-math-flags?
1876 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1877 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1878
1879 auto C2 = B.buildFConstant(Ty, C2Val);
1880 auto Fabs = B.buildFAbs(Ty, Src);
1881
1882 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1883 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1884 MI.eraseFromParent();
1885 return true;
1886 }
1887
1888 bool AMDGPULegalizerInfo::legalizeFceil(
1889 MachineInstr &MI, MachineRegisterInfo &MRI,
1890 MachineIRBuilder &B) const {
1891
1892 const LLT S1 = LLT::scalar(1);
1893 const LLT S64 = LLT::scalar(64);
1894
1895 Register Src = MI.getOperand(1).getReg();
1896 assert(MRI.getType(Src) == S64);
1897
1898 // result = trunc(src)
1899 // if (src > 0.0 && src != result)
1900 // result += 1.0
1901
1902 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1903
1904 const auto Zero = B.buildFConstant(S64, 0.0);
1905 const auto One = B.buildFConstant(S64, 1.0);
1906 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1907 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1908 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1909 auto Add = B.buildSelect(S64, And, One, Zero);
1910
1911 // TODO: Should this propagate fast-math-flags?
1912 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  // The result register is now defined by the G_FADD above, so remove the
  // original G_FCEIL, as the other custom legalizers in this file do.
  MI.eraseFromParent();
1913 return true;
1914 }
1915
1916 bool AMDGPULegalizerInfo::legalizeFrem(
1917 MachineInstr &MI, MachineRegisterInfo &MRI,
1918 MachineIRBuilder &B) const {
1919 Register DstReg = MI.getOperand(0).getReg();
1920 Register Src0Reg = MI.getOperand(1).getReg();
1921 Register Src1Reg = MI.getOperand(2).getReg();
1922 auto Flags = MI.getFlags();
1923 LLT Ty = MRI.getType(DstReg);
1924
1925 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1926 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1927 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1928 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1929 MI.eraseFromParent();
1930 return true;
1931 }
1932
1933 static MachineInstrBuilder extractF64Exponent(Register Hi,
1934 MachineIRBuilder &B) {
1935 const unsigned FractBits = 52;
1936 const unsigned ExpBits = 11;
1937 LLT S32 = LLT::scalar(32);
1938
1939 auto Const0 = B.buildConstant(S32, FractBits - 32);
1940 auto Const1 = B.buildConstant(S32, ExpBits);
1941
1942 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1943 .addUse(Hi)
1944 .addUse(Const0.getReg(0))
1945 .addUse(Const1.getReg(0));
1946
1947 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1948 }
1949
1950 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1951 MachineInstr &MI, MachineRegisterInfo &MRI,
1952 MachineIRBuilder &B) const {
1953 const LLT S1 = LLT::scalar(1);
1954 const LLT S32 = LLT::scalar(32);
1955 const LLT S64 = LLT::scalar(64);
1956
1957 Register Src = MI.getOperand(1).getReg();
1958 assert(MRI.getType(Src) == S64);
1959
1960 // TODO: Should this use extract since the low half is unused?
1961 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1962 Register Hi = Unmerge.getReg(1);
1963
1964 // Extract the upper half, since this is where we will find the sign and
1965 // exponent.
1966 auto Exp = extractF64Exponent(Hi, B);
1967
1968 const unsigned FractBits = 52;
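  // Sketch of the idea: for unbiased exponent e, clearing the low (52 - e)
  // fraction bits truncates the value toward zero. If e < 0 the magnitude is
  // below 1 and only the sign survives (+/-0); if e > 51 every fraction bit
  // is significant, so the input is already an integer and is returned
  // unchanged.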
1969
1970 // Extract the sign bit.
1971 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1972 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1973
1974 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1975
1976 const auto Zero32 = B.buildConstant(S32, 0);
1977
1978 // Extend back to 64-bits.
1979 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1980
1981 auto Shr = B.buildAShr(S64, FractMask, Exp);
1982 auto Not = B.buildNot(S64, Shr);
1983 auto Tmp0 = B.buildAnd(S64, Src, Not);
1984 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1985
1986 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1987 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1988
1989 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1990 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1991 MI.eraseFromParent();
1992 return true;
1993 }
1994
1995 bool AMDGPULegalizerInfo::legalizeITOFP(
1996 MachineInstr &MI, MachineRegisterInfo &MRI,
1997 MachineIRBuilder &B, bool Signed) const {
1998
1999 Register Dst = MI.getOperand(0).getReg();
2000 Register Src = MI.getOperand(1).getReg();
2001
2002 const LLT S64 = LLT::scalar(64);
2003 const LLT S32 = LLT::scalar(32);
2004
2005 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2006
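  // Sketch, in the style of the comments elsewhere in this file:
  //   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %src
  //   %cvt.hi = G_[SU]ITOFP %hi                  ; only the high half is signed
  //   %cvt.lo = G_UITOFP %lo
  //   %scaled = llvm.amdgcn.ldexp(%cvt.hi, 32)   ; %cvt.hi * 2^32
  //   %dst    = G_FADD %scaled, %cvt.lo
  // e.g. (illustrative) 0x1'0000'0005 -> 1.0 * 2^32 + 5.0.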
2007 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2008
2009 auto CvtHi = Signed ?
2010 B.buildSITOFP(S64, Unmerge.getReg(1)) :
2011 B.buildUITOFP(S64, Unmerge.getReg(1));
2012
2013 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2014
2015 auto ThirtyTwo = B.buildConstant(S32, 32);
2016 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2017 .addUse(CvtHi.getReg(0))
2018 .addUse(ThirtyTwo.getReg(0));
2019
2020 // TODO: Should this propagate fast-math-flags?
2021 B.buildFAdd(Dst, LdExp, CvtLo);
2022 MI.eraseFromParent();
2023 return true;
2024 }
2025
2026 // TODO: Copied from DAG implementation. Verify logic and document how this
2027 // actually works.
2028 bool AMDGPULegalizerInfo::legalizeFPTOI(
2029 MachineInstr &MI, MachineRegisterInfo &MRI,
2030 MachineIRBuilder &B, bool Signed) const {
2031
2032 Register Dst = MI.getOperand(0).getReg();
2033 Register Src = MI.getOperand(1).getReg();
2034
2035 const LLT S64 = LLT::scalar(64);
2036 const LLT S32 = LLT::scalar(32);
2037
2038 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2039
2040 unsigned Flags = MI.getFlags();
2041
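  // For reference: K0 below is 2^-32 and K1 is -2^32. The high 32 bits are
  // fptoi(floor(trunc(src) * 2^-32)) and the low 32 bits are
  // fptoui(fma(floor(trunc(src) * 2^-32), -2^32, trunc(src))), i.e. what
  // remains after subtracting the high half scaled back up.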
2042 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2043 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2044 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2045
2046 auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2047 auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2048 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2049
2050 auto Hi = Signed ?
2051 B.buildFPTOSI(S32, FloorMul) :
2052 B.buildFPTOUI(S32, FloorMul);
2053 auto Lo = B.buildFPTOUI(S32, Fma);
2054
2055 B.buildMerge(Dst, { Lo, Hi });
2056 MI.eraseFromParent();
2057
2058 return true;
2059 }
2060
2061 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2062 MachineInstr &MI) const {
2063 MachineFunction &MF = Helper.MIRBuilder.getMF();
2064 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2065
2066 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2067 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2068
2069 // With ieee_mode disabled, the instructions already have the correct
2070 // behavior for G_FMINNUM/G_FMAXNUM.
2071 if (!MFI->getMode().IEEE)
2072 return !IsIEEEOp;
2073
2074 if (IsIEEEOp)
2075 return true;
2076
2077 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2078 }
2079
2080 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2081 MachineInstr &MI, MachineRegisterInfo &MRI,
2082 MachineIRBuilder &B) const {
2083 // TODO: Should move some of this into LegalizerHelper.
2084
2085 // TODO: Promote dynamic indexing of s16 to s32
2086
2087 // FIXME: Artifact combiner probably should have replaced the truncated
2088 // constant before this, so we shouldn't need
2089 // getConstantVRegValWithLookThrough.
2090 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2091 MI.getOperand(2).getReg(), MRI);
2092 if (!IdxVal) // Dynamic case will be selected to register indexing.
2093 return true;
2094
2095 Register Dst = MI.getOperand(0).getReg();
2096 Register Vec = MI.getOperand(1).getReg();
2097
2098 LLT VecTy = MRI.getType(Vec);
2099 LLT EltTy = VecTy.getElementType();
2100 assert(EltTy == MRI.getType(Dst));
2101
2102 if (IdxVal->Value < VecTy.getNumElements())
2103 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2104 else
2105 B.buildUndef(Dst);
2106
2107 MI.eraseFromParent();
2108 return true;
2109 }
2110
2111 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2112 MachineInstr &MI, MachineRegisterInfo &MRI,
2113 MachineIRBuilder &B) const {
2114 // TODO: Should move some of this into LegalizerHelper.
2115
2116 // TODO: Promote dynamic indexing of s16 to s32
2117
2118 // FIXME: Artifact combiner probably should have replaced the truncated
2119 // constant before this, so we shouldn't need
2120 // getConstantVRegValWithLookThrough.
2121 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2122 MI.getOperand(3).getReg(), MRI);
2123 if (!IdxVal) // Dynamic case will be selected to register indexing.
2124 return true;
2125
2126 Register Dst = MI.getOperand(0).getReg();
2127 Register Vec = MI.getOperand(1).getReg();
2128 Register Ins = MI.getOperand(2).getReg();
2129
2130 LLT VecTy = MRI.getType(Vec);
2131 LLT EltTy = VecTy.getElementType();
2132 assert(EltTy == MRI.getType(Ins));
2133
2134 if (IdxVal->Value < VecTy.getNumElements())
2135 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2136 else
2137 B.buildUndef(Dst);
2138
2139 MI.eraseFromParent();
2140 return true;
2141 }
2142
2143 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2144 MachineInstr &MI, MachineRegisterInfo &MRI,
2145 MachineIRBuilder &B) const {
2146 const LLT V2S16 = LLT::vector(2, 16);
2147
2148 Register Dst = MI.getOperand(0).getReg();
2149 Register Src0 = MI.getOperand(1).getReg();
2150 LLT DstTy = MRI.getType(Dst);
2151 LLT SrcTy = MRI.getType(Src0);
2152
2153 if (SrcTy == V2S16 && DstTy == V2S16 &&
2154 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2155 return true;
2156
2157 MachineIRBuilder HelperBuilder(MI);
2158 GISelObserverWrapper DummyObserver;
2159 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2160 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2161 }
2162
2163 bool AMDGPULegalizerInfo::legalizeSinCos(
2164 MachineInstr &MI, MachineRegisterInfo &MRI,
2165 MachineIRBuilder &B) const {
2166
2167 Register DstReg = MI.getOperand(0).getReg();
2168 Register SrcReg = MI.getOperand(1).getReg();
2169 LLT Ty = MRI.getType(DstReg);
2170 unsigned Flags = MI.getFlags();
2171
2172 Register TrigVal;
2173 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2174 if (ST.hasTrigReducedRange()) {
2175 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2176 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2177 .addUse(MulVal.getReg(0))
2178 .setMIFlags(Flags).getReg(0);
2179 } else
2180 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2181
2182 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2183 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2184 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2185 .addUse(TrigVal)
2186 .setMIFlags(Flags);
2187 MI.eraseFromParent();
2188 return true;
2189 }
2190
2191 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2192 MachineIRBuilder &B,
2193 const GlobalValue *GV,
2194 int64_t Offset,
2195 unsigned GAFlags) const {
2196 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2197 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2198 // to the following code sequence:
2199 //
2200 // For constant address space:
2201 // s_getpc_b64 s[0:1]
2202 // s_add_u32 s0, s0, $symbol
2203 // s_addc_u32 s1, s1, 0
2204 //
2205 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2206 // a fixup or relocation is emitted to replace $symbol with a literal
2207 // constant, which is a pc-relative offset from the encoding of the $symbol
2208 // operand to the global variable.
2209 //
2210 // For global address space:
2211 // s_getpc_b64 s[0:1]
2212 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2213 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2214 //
2215 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2216 // fixups or relocations are emitted to replace $symbol@*@lo and
2217 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2218 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2219 // operand to the global variable.
2220 //
2221 // What we want here is an offset from the value returned by s_getpc
2222 // (which is the address of the s_add_u32 instruction) to the global
2223 // variable, but since the encoding of $symbol starts 4 bytes after the start
2224 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2225 // small. This requires us to add 4 to the global variable offset in order to
2226 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2227 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2228 // instruction.
2229
2230 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2231
2232 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2233 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2234
2235 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2236 .addDef(PCReg);
2237
2238 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2239 if (GAFlags == SIInstrInfo::MO_NONE)
2240 MIB.addImm(0);
2241 else
2242 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2243
2244 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2245
2246 if (PtrTy.getSizeInBits() == 32)
2247 B.buildExtract(DstReg, PCReg, 0);
2248 return true;
2249 }
2250
2251 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2252 MachineInstr &MI, MachineRegisterInfo &MRI,
2253 MachineIRBuilder &B) const {
2254 Register DstReg = MI.getOperand(0).getReg();
2255 LLT Ty = MRI.getType(DstReg);
2256 unsigned AS = Ty.getAddressSpace();
2257
2258 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2259 MachineFunction &MF = B.getMF();
2260 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2261
2262 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2263 if (!MFI->isEntryFunction()) {
2264 const Function &Fn = MF.getFunction();
2265 DiagnosticInfoUnsupported BadLDSDecl(
2266 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2267 DS_Warning);
2268 Fn.getContext().diagnose(BadLDSDecl);
2269
2270 // We currently don't have a way to correctly allocate LDS objects that
2271 // aren't directly associated with a kernel. We do force inlining of
2272 // functions that use local objects. However, if these dead functions are
2273 // not eliminated, we don't want a compile time error. Just emit a warning
2274 // and a trap, since there should be no callable path here.
2275 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2276 B.buildUndef(DstReg);
2277 MI.eraseFromParent();
2278 return true;
2279 }
2280
2281 // TODO: We could emit code to handle the initialization somewhere.
2282 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2283 const SITargetLowering *TLI = ST.getTargetLowering();
2284 if (!TLI->shouldUseLDSConstAddress(GV)) {
2285 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2286 return true; // Leave in place;
2287 }
2288
2289 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2290 Type *Ty = GV->getValueType();
2291 // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
2292 // zero-sized type in other languages, to declare dynamic shared memory
2293 // whose size is not known at compile time. It is allocated by the runtime
2294 // and placed directly after the statically allocated LDS, so all such
2295 // variables share the same offset.
2296 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2297 // Adjust alignment for that dynamic shared memory array.
2298 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2299 LLT S32 = LLT::scalar(32);
2300 auto Sz =
2301 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2302 B.buildIntToPtr(DstReg, Sz);
2303 MI.eraseFromParent();
2304 return true;
2305 }
2306 }
2307
2308 B.buildConstant(
2309 DstReg,
2310 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2311 MI.eraseFromParent();
2312 return true;
2313 }
2314
2315 const Function &Fn = MF.getFunction();
2316 DiagnosticInfoUnsupported BadInit(
2317 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2318 Fn.getContext().diagnose(BadInit);
2319 return true;
2320 }
2321
2322 const SITargetLowering *TLI = ST.getTargetLowering();
2323
2324 if (TLI->shouldEmitFixup(GV)) {
2325 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2326 MI.eraseFromParent();
2327 return true;
2328 }
2329
2330 if (TLI->shouldEmitPCReloc(GV)) {
2331 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2332 MI.eraseFromParent();
2333 return true;
2334 }
2335
2336 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2337 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2338
2339 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2340 MachinePointerInfo::getGOT(MF),
2341 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2342 MachineMemOperand::MOInvariant,
2343 8 /*Size*/, Align(8));
2344
2345 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2346
2347 if (Ty.getSizeInBits() == 32) {
2348 // Truncate if this is a 32-bit constant address.
2349 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2350 B.buildExtract(DstReg, Load, 0);
2351 } else
2352 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2353
2354 MI.eraseFromParent();
2355 return true;
2356 }
2357
2358 static LLT widenToNextPowerOf2(LLT Ty) {
2359 if (Ty.isVector())
2360 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2361 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2362 }
2363
2364 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2365 MachineInstr &MI) const {
2366 MachineIRBuilder &B = Helper.MIRBuilder;
2367 MachineRegisterInfo &MRI = *B.getMRI();
2368 GISelChangeObserver &Observer = Helper.Observer;
2369
2370 Register PtrReg = MI.getOperand(1).getReg();
2371 LLT PtrTy = MRI.getType(PtrReg);
2372 unsigned AddrSpace = PtrTy.getAddressSpace();
2373
2374 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2375 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2376 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2377 Observer.changingInstr(MI);
2378 MI.getOperand(1).setReg(Cast.getReg(0));
2379 Observer.changedInstr(MI);
2380 return true;
2381 }
2382
2383 Register ValReg = MI.getOperand(0).getReg();
2384 LLT ValTy = MRI.getType(ValReg);
2385
2386 MachineMemOperand *MMO = *MI.memoperands_begin();
2387 const unsigned ValSize = ValTy.getSizeInBits();
2388 const unsigned MemSize = 8 * MMO->getSize();
2389 const Align MemAlign = MMO->getAlign();
2390 const unsigned AlignInBits = 8 * MemAlign.value();
2391
2392 // Widen non-power-of-2 loads to the alignment if needed
2393 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2394 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
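    // For example (illustrative): a 96-bit <3 x s32> load with sufficient
    // alignment is widened to a 128-bit load; the extra element is then
    // dropped with G_EXTRACT below, or with an unmerge for types like
    // <3 x s16> that aren't nice register types.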
2395
2396 // This was already the correct extending load result type, so just adjust
2397 // the memory type.
2398 if (WideMemSize == ValSize) {
2399 MachineFunction &MF = B.getMF();
2400
2401 MachineMemOperand *WideMMO =
2402 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2403 Observer.changingInstr(MI);
2404 MI.setMemRefs(MF, {WideMMO});
2405 Observer.changedInstr(MI);
2406 return true;
2407 }
2408
2409 // Don't bother handling an edge case that should probably never be produced.
2410 if (ValSize > WideMemSize)
2411 return false;
2412
2413 LLT WideTy = widenToNextPowerOf2(ValTy);
2414
2415 Register WideLoad;
2416 if (!WideTy.isVector()) {
2417 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2418 B.buildTrunc(ValReg, WideLoad).getReg(0);
2419 } else {
2420 // Extract the subvector.
2421
2422 if (isRegisterType(ValTy)) {
2423 // If this a case where G_EXTRACT is legal, use it.
2424 // (e.g. <3 x s32> -> <4 x s32>)
2425 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2426 B.buildExtract(ValReg, WideLoad, 0);
2427 } else {
2428 // For cases where the widened type isn't a nice register value, unmerge
2429 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2430 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2431 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2432 B.setInsertPt(B.getMBB(), MI.getIterator());
2433 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2434 }
2435 }
2436
2437 MI.eraseFromParent();
2438 return true;
2439 }
2440
2441 return false;
2442 }
2443
2444 bool AMDGPULegalizerInfo::legalizeFMad(
2445 MachineInstr &MI, MachineRegisterInfo &MRI,
2446 MachineIRBuilder &B) const {
2447 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2448 assert(Ty.isScalar());
2449
2450 MachineFunction &MF = B.getMF();
2451 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2452
2453 // TODO: Always legal with future ftz flag.
2454 // FIXME: Do we need just output?
2455 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2456 return true;
2457 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2458 return true;
2459
2460 MachineIRBuilder HelperBuilder(MI);
2461 GISelObserverWrapper DummyObserver;
2462 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2463 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2464 }
2465
2466 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2467 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2468 Register DstReg = MI.getOperand(0).getReg();
2469 Register PtrReg = MI.getOperand(1).getReg();
2470 Register CmpVal = MI.getOperand(2).getReg();
2471 Register NewVal = MI.getOperand(3).getReg();
2472
2473 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2474 "this should not have been custom lowered");
2475
2476 LLT ValTy = MRI.getType(CmpVal);
2477 LLT VecTy = LLT::vector(2, ValTy);
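  // The target pseudo takes the two values packed into one vector operand,
  // with the new value in element 0 and the compare value in element 1,
  // which appears to match the data[0]/data[1] layout of the hardware
  // compare-swap instructions.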
2478
2479 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2480
2481 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2482 .addDef(DstReg)
2483 .addUse(PtrReg)
2484 .addUse(PackedVal)
2485 .setMemRefs(MI.memoperands());
2486
2487 MI.eraseFromParent();
2488 return true;
2489 }
2490
2491 bool AMDGPULegalizerInfo::legalizeFlog(
2492 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2493 Register Dst = MI.getOperand(0).getReg();
2494 Register Src = MI.getOperand(1).getReg();
2495 LLT Ty = B.getMRI()->getType(Dst);
2496 unsigned Flags = MI.getFlags();
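  // Recall log_b(x) = log2(x) * (1 / log2(b)), so the callers pass ln(2) for
  // G_FLOG and ln(2)/ln(10) for G_FLOG10 as Log2BaseInverted, and the
  // lowering is a single multiply of the hardware log2 result.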
2497
2498 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2499 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2500
2501 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2502 MI.eraseFromParent();
2503 return true;
2504 }
2505
2506 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2507 MachineIRBuilder &B) const {
2508 Register Dst = MI.getOperand(0).getReg();
2509 Register Src = MI.getOperand(1).getReg();
2510 unsigned Flags = MI.getFlags();
2511 LLT Ty = B.getMRI()->getType(Dst);
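  // Recall e^x = 2^(x * log2(e)), so this is a multiply by log2(e) followed
  // by the hardware exp2.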
2512
2513 auto K = B.buildFConstant(Ty, numbers::log2e);
2514 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2515 B.buildFExp2(Dst, Mul, Flags);
2516 MI.eraseFromParent();
2517 return true;
2518 }
2519
2520 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2521 MachineIRBuilder &B) const {
2522 Register Dst = MI.getOperand(0).getReg();
2523 Register Src0 = MI.getOperand(1).getReg();
2524 Register Src1 = MI.getOperand(2).getReg();
2525 unsigned Flags = MI.getFlags();
2526 LLT Ty = B.getMRI()->getType(Dst);
2527 const LLT S16 = LLT::scalar(16);
2528 const LLT S32 = LLT::scalar(32);
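  // pow(x, y) is expanded as exp2(y * log2(x)). The legacy multiply
  // (0 * anything == 0) is presumably used so that cases like pow(0, 0), or a
  // zero exponent against an infinite log2, still produce exp2(0) == 1
  // without extra checks.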
2529
2530 if (Ty == S32) {
2531 auto Log = B.buildFLog2(S32, Src0, Flags);
2532 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2533 .addUse(Log.getReg(0))
2534 .addUse(Src1)
2535 .setMIFlags(Flags);
2536 B.buildFExp2(Dst, Mul, Flags);
2537 } else if (Ty == S16) {
2538 // There's no f16 fmul_legacy, so we need to extend to f32 for it.
2539 auto Log = B.buildFLog2(S16, Src0, Flags);
2540 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2541 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2542 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2543 .addUse(Ext0.getReg(0))
2544 .addUse(Ext1.getReg(0))
2545 .setMIFlags(Flags);
2546
2547 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2548 } else
2549 return false;
2550
2551 MI.eraseFromParent();
2552 return true;
2553 }
2554
2555 // Find a source register, ignoring any possible source modifiers.
2556 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2557 Register ModSrc = OrigSrc;
2558 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2559 ModSrc = SrcFNeg->getOperand(1).getReg();
2560 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2561 ModSrc = SrcFAbs->getOperand(1).getReg();
2562 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2563 ModSrc = SrcFAbs->getOperand(1).getReg();
2564 return ModSrc;
2565 }
2566
2567 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2568 MachineRegisterInfo &MRI,
2569 MachineIRBuilder &B) const {
2570
2571 const LLT S1 = LLT::scalar(1);
2572 const LLT S64 = LLT::scalar(64);
2573 Register Dst = MI.getOperand(0).getReg();
2574 Register OrigSrc = MI.getOperand(1).getReg();
2575 unsigned Flags = MI.getFlags();
2576 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2577 "this should not have been custom lowered");
2578
2579 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2580 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2581 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2582 // V_FRACT bug is:
2583 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2584 //
2585 // Convert floor(x) to (x - fract(x))
2586
2587 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2588 .addUse(OrigSrc)
2589 .setMIFlags(Flags);
2590
2591 // Give source modifier matching some assistance before obscuring a foldable
2592 // pattern.
2593
2594 // TODO: We can avoid the neg on the fract? The input sign to fract
2595 // shouldn't matter?
2596 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2597
2598 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2599
2600 Register Min = MRI.createGenericVirtualRegister(S64);
2601
2602 // We don't need to concern ourselves with the snan handling difference, so
2603 // use the one which will directly select.
2604 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2605 if (MFI->getMode().IEEE)
2606 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2607 else
2608 B.buildFMinNum(Min, Fract, Const, Flags);
2609
2610 Register CorrectedFract = Min;
2611 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2612 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2613 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2614 }
2615
2616 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2617 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2618
2619 MI.eraseFromParent();
2620 return true;
2621 }
2622
2623 // Turn an illegal packed v2s16 build vector into bit operations.
2624 // TODO: This should probably be a bitcast action in LegalizerHelper.
2625 bool AMDGPULegalizerInfo::legalizeBuildVector(
2626 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2627 Register Dst = MI.getOperand(0).getReg();
2628 const LLT S32 = LLT::scalar(32);
2629 assert(MRI.getType(Dst) == LLT::vector(2, 16));
2630
2631 Register Src0 = MI.getOperand(1).getReg();
2632 Register Src1 = MI.getOperand(2).getReg();
2633 assert(MRI.getType(Src0) == LLT::scalar(16));
2634
2635 auto Merge = B.buildMerge(S32, {Src0, Src1});
2636 B.buildBitcast(Dst, Merge);
2637
2638 MI.eraseFromParent();
2639 return true;
2640 }
2641
2642 // Check that this is a G_XOR x, -1
2643 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2644 if (MI.getOpcode() != TargetOpcode::G_XOR)
2645 return false;
2646 auto ConstVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
2647 return ConstVal && *ConstVal == -1;
2648 }
2649
2650 // Return the use branch instruction, or null if the usage is invalid.
2651 static MachineInstr *
2652 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2653 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2654 Register CondDef = MI.getOperand(0).getReg();
2655 if (!MRI.hasOneNonDBGUse(CondDef))
2656 return nullptr;
2657
2658 MachineBasicBlock *Parent = MI.getParent();
2659 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2660
2661 if (isNot(MRI, *UseMI)) {
2662 Register NegatedCond = UseMI->getOperand(0).getReg();
2663 if (!MRI.hasOneNonDBGUse(NegatedCond))
2664 return nullptr;
2665
2666 // We're deleting the def of this value, so we need to remove it.
2667 UseMI->eraseFromParent();
2668
2669 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2670 Negated = true;
2671 }
2672
2673 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2674 return nullptr;
2675
2676 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2677 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2678 if (Next == Parent->end()) {
2679 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2680 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2681 return nullptr;
2682 UncondBrTarget = &*NextMBB;
2683 } else {
2684 if (Next->getOpcode() != AMDGPU::G_BR)
2685 return nullptr;
2686 Br = &*Next;
2687 UncondBrTarget = Br->getOperand(0).getMBB();
2688 }
2689
2690 return UseMI;
2691 }
2692
2693 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2694 const ArgDescriptor *Arg,
2695 const TargetRegisterClass *ArgRC,
2696 LLT ArgTy) const {
2697 MCRegister SrcReg = Arg->getRegister();
2698 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2699 assert(DstReg.isVirtual() && "Virtual register expected");
2700
2701 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2702 ArgTy);
2703 if (Arg->isMasked()) {
2704 // TODO: Should we try to emit this once in the entry block?
2705 const LLT S32 = LLT::scalar(32);
2706 const unsigned Mask = Arg->getMask();
2707 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
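    // For example (illustrative values): a packed ID with Mask == 0xffc00
    // occupies bits [10..19], so it is recovered as (LiveIn >> 10) & 0x3ff.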
2708
2709 Register AndMaskSrc = LiveIn;
2710
2711 if (Shift != 0) {
2712 auto ShiftAmt = B.buildConstant(S32, Shift);
2713 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2714 }
2715
2716 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2717 } else {
2718 B.buildCopy(DstReg, LiveIn);
2719 }
2720
2721 return true;
2722 }
2723
2724 bool AMDGPULegalizerInfo::loadInputValue(
2725 Register DstReg, MachineIRBuilder &B,
2726 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2727 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2728 const ArgDescriptor *Arg;
2729 const TargetRegisterClass *ArgRC;
2730 LLT ArgTy;
2731 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2732
2733 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2734 return false; // TODO: Handle these
2735 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2736 }
2737
2738 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2739 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2740 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2741 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2742 return false;
2743
2744 MI.eraseFromParent();
2745 return true;
2746 }
2747
2748 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2749 MachineRegisterInfo &MRI,
2750 MachineIRBuilder &B) const {
2751 Register Dst = MI.getOperand(0).getReg();
2752 LLT DstTy = MRI.getType(Dst);
2753 LLT S16 = LLT::scalar(16);
2754 LLT S32 = LLT::scalar(32);
2755 LLT S64 = LLT::scalar(64);
2756
2757 if (legalizeFastUnsafeFDIV(MI, MRI, B))
2758 return true;
2759
2760 if (DstTy == S16)
2761 return legalizeFDIV16(MI, MRI, B);
2762 if (DstTy == S32)
2763 return legalizeFDIV32(MI, MRI, B);
2764 if (DstTy == S64)
2765 return legalizeFDIV64(MI, MRI, B);
2766
2767 return false;
2768 }
2769
2770 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2771 Register DstReg,
2772 Register X,
2773 Register Y,
2774 bool IsDiv) const {
2775 const LLT S1 = LLT::scalar(1);
2776 const LLT S32 = LLT::scalar(32);
2777
2778 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2779 // algorithm used here.
2780
2781 // Initial estimate of inv(y).
2782 auto FloatY = B.buildUITOFP(S32, Y);
2783 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2784 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2785 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2786 auto Z = B.buildFPTOUI(S32, ScaledY);
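  // Z approximates 2^32 / Y. The Newton-Raphson style round below refines it
  // so that the initial quotient umulh(X, Z) appears to be within two of the
  // exact result, which the two conditional corrections afterwards account
  // for.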
2787
2788 // One round of UNR.
2789 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2790 auto NegYZ = B.buildMul(S32, NegY, Z);
2791 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2792
2793 // Quotient/remainder estimate.
2794 auto Q = B.buildUMulH(S32, X, Z);
2795 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2796
2797 // First quotient/remainder refinement.
2798 auto One = B.buildConstant(S32, 1);
2799 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2800 if (IsDiv)
2801 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2802 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2803
2804 // Second quotient/remainder refinement.
2805 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2806 if (IsDiv)
2807 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2808 else
2809 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2810 }
2811
2812 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2813 MachineRegisterInfo &MRI,
2814 MachineIRBuilder &B) const {
2815 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2816 Register DstReg = MI.getOperand(0).getReg();
2817 Register Num = MI.getOperand(1).getReg();
2818 Register Den = MI.getOperand(2).getReg();
2819 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2820 MI.eraseFromParent();
2821 return true;
2822 }
2823
2824 // Build an integer reciprocal sequence around V_RCP_IFLAG_F32
2825 //
2826 // Return lo, hi of result
2827 //
2828 // %cvt.lo = G_UITOFP Val.lo
2829 // %cvt.hi = G_UITOFP Val.hi
2830 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2831 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2832 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2833 // %mul2 = G_FMUL %mul1, 2**(-32)
2834 // %trunc = G_INTRINSIC_TRUNC %mul2
2835 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2836 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2837 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2838 Register Val) {
2839 const LLT S32 = LLT::scalar(32);
2840 auto Unmerge = B.buildUnmerge(S32, Val);
2841
2842 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2843 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2844
2845 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2846 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2847
2848 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2849 auto Mul1 =
2850 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2851
2852 // 2**(-32)
2853 auto Mul2 =
2854 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2855 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2856
2857 // -(2**32)
2858 auto Mad2 = B.buildFMAD(S32, Trunc,
2859 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2860
2861 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2862 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2863
2864 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2865 }
2866
2867 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2868 Register DstReg,
2869 Register Numer,
2870 Register Denom,
2871 bool IsDiv) const {
2872 const LLT S32 = LLT::scalar(32);
2873 const LLT S64 = LLT::scalar(64);
2874 const LLT S1 = LLT::scalar(1);
2875 Register RcpLo, RcpHi;
2876
2877 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2878
2879 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2880
2881 auto Zero64 = B.buildConstant(S64, 0);
2882 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2883
2884 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2885 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2886
2887 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2888 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2889 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2890
2891 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2892 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2893 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2894 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2895
2896 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2897 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2898 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2899 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2900 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2901
2902 auto Zero32 = B.buildConstant(S32, 0);
2903 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2904 auto Add2_HiC =
2905 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2906 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2907 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2908
2909 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2910 Register NumerLo = UnmergeNumer.getReg(0);
2911 Register NumerHi = UnmergeNumer.getReg(1);
2912
2913 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2914 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2915 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2916 Register Mul3_Lo = UnmergeMul3.getReg(0);
2917 Register Mul3_Hi = UnmergeMul3.getReg(1);
2918 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2919 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2920 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2921 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2922
2923 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2924 Register DenomLo = UnmergeDenom.getReg(0);
2925 Register DenomHi = UnmergeDenom.getReg(1);
2926
2927 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2928 auto C1 = B.buildSExt(S32, CmpHi);
2929
2930 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2931 auto C2 = B.buildSExt(S32, CmpLo);
2932
2933 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2934 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2935
2936 // TODO: Here and below, portions of the code could be enclosed in
2937 // if/endif blocks. Currently the control flow is unconditional and we use
2938 // 4 selects after the potential endif in place of PHIs.
2939
2940 // if C3 != 0 ...
2941 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2942 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2943 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2944 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2945
2946 auto One64 = B.buildConstant(S64, 1);
2947 auto Add3 = B.buildAdd(S64, MulHi3, One64);
2948
2949 auto C4 =
2950 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2951 auto C5 =
2952 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2953 auto C6 = B.buildSelect(
2954 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2955
2956 // if (C6 != 0)
2957 auto Add4 = B.buildAdd(S64, Add3, One64);
2958 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2959
2960 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2961 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2962 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2963
2964 // endif C6
2965 // endif C3
2966
2967 if (IsDiv) {
2968 auto Sel1 = B.buildSelect(
2969 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2970 B.buildSelect(DstReg,
2971 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2972 } else {
2973 auto Sel2 = B.buildSelect(
2974 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2975 B.buildSelect(DstReg,
2976 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2977 }
2978 }
2979
2980 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2981 MachineRegisterInfo &MRI,
2982 MachineIRBuilder &B) const {
2983 const LLT S64 = LLT::scalar(64);
2984 const LLT S32 = LLT::scalar(32);
2985 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2986 Register DstReg = MI.getOperand(0).getReg();
2987 Register Num = MI.getOperand(1).getReg();
2988 Register Den = MI.getOperand(2).getReg();
2989 LLT Ty = MRI.getType(DstReg);
2990
2991 if (Ty == S32)
2992 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2993 else if (Ty == S64)
2994 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2995 else
2996 return false;
2997
2998 MI.eraseFromParent();
2999 return true;
3000
3001 }
3002
3003 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3004 MachineRegisterInfo &MRI,
3005 MachineIRBuilder &B) const {
3006 const LLT S64 = LLT::scalar(64);
3007 const LLT S32 = LLT::scalar(32);
3008
3009 Register DstReg = MI.getOperand(0).getReg();
3010 const LLT Ty = MRI.getType(DstReg);
3011 if (Ty != S32 && Ty != S64)
3012 return false;
3013
3014 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3015
3016 Register LHS = MI.getOperand(1).getReg();
3017 Register RHS = MI.getOperand(2).getReg();
3018
3019 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3020 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3021 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3022
3023 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3024 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3025
3026 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3027 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
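  // With Sign = x >> (bits - 1) being 0 or -1, the (x + Sign) ^ Sign sequence
  // above computes |x|, so the unsigned helpers can be reused; the xor/sub at
  // the end restores the correct sign (quotient sign is LHSign ^ RHSign,
  // remainder sign follows the dividend).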
3028
3029 Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3030 if (Ty == S32)
3031 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3032 else
3033 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3034
3035 Register Sign;
3036 if (IsDiv)
3037 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3038 else
3039 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3040
3041 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3042 B.buildSub(DstReg, UDivRem, Sign);
3043
3044 MI.eraseFromParent();
3045 return true;
3046 }
3047
3048 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3049 MachineRegisterInfo &MRI,
3050 MachineIRBuilder &B) const {
3051 Register Res = MI.getOperand(0).getReg();
3052 Register LHS = MI.getOperand(1).getReg();
3053 Register RHS = MI.getOperand(2).getReg();
3054
3055 uint16_t Flags = MI.getFlags();
3056
3057 LLT ResTy = MRI.getType(Res);
3058 LLT S32 = LLT::scalar(32);
3059 LLT S64 = LLT::scalar(64);
3060
3061 const MachineFunction &MF = B.getMF();
3062 bool Unsafe =
3063 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
3064
3065 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
3066 return false;
3067
3068 if (!Unsafe && ResTy == S32 &&
3069 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
3070 return false;
3071
3072 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3073 // 1 / x -> RCP(x)
3074 if (CLHS->isExactlyValue(1.0)) {
3075 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3076 .addUse(RHS)
3077 .setMIFlags(Flags);
3078
3079 MI.eraseFromParent();
3080 return true;
3081 }
3082
3083 // -1 / x -> RCP( FNEG(x) )
3084 if (CLHS->isExactlyValue(-1.0)) {
3085 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3086 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3087 .addUse(FNeg.getReg(0))
3088 .setMIFlags(Flags);
3089
3090 MI.eraseFromParent();
3091 return true;
3092 }
3093 }
3094
3095 // x / y -> x * (1.0 / y)
3096 if (Unsafe) {
3097 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3098 .addUse(RHS)
3099 .setMIFlags(Flags);
3100 B.buildFMul(Res, LHS, RCP, Flags);
3101
3102 MI.eraseFromParent();
3103 return true;
3104 }
3105
3106 return false;
3107 }
3108
3109 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3110 MachineRegisterInfo &MRI,
3111 MachineIRBuilder &B) const {
3112 Register Res = MI.getOperand(0).getReg();
3113 Register LHS = MI.getOperand(1).getReg();
3114 Register RHS = MI.getOperand(2).getReg();
3115
3116 uint16_t Flags = MI.getFlags();
3117
3118 LLT S16 = LLT::scalar(16);
3119 LLT S32 = LLT::scalar(32);
3120
3121 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3122 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3123
3124 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3125 .addUse(RHSExt.getReg(0))
3126 .setMIFlags(Flags);
3127
3128 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3129 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3130
3131 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3132 .addUse(RDst.getReg(0))
3133 .addUse(RHS)
3134 .addUse(LHS)
3135 .setMIFlags(Flags);
3136
3137 MI.eraseFromParent();
3138 return true;
3139 }
3140
3141 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3142 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3143 static void toggleSPDenormMode(bool Enable,
3144 MachineIRBuilder &B,
3145 const GCNSubtarget &ST,
3146 AMDGPU::SIModeRegisterDefaults Mode) {
3147 // Set SP denorm mode to this value.
3148 unsigned SPDenormMode =
3149 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3150
3151 if (ST.hasDenormModeInst()) {
3152     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3153 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3154
3155 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
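    // Illustrative note (not part of the original source): assuming the usual
    // S_DENORM_MODE immediate layout, bits [1:0] hold the FP32 denorm mode and
    // bits [3:2] the shared FP64/FP16 mode, hence the shift above. E.g. if
    // FP_DENORM_FLUSH_NONE is 3 and the DP default is also 3, enabling FP32
    // denormals encodes 3 | (3 << 2) == 0xF.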
3156 B.buildInstr(AMDGPU::S_DENORM_MODE)
3157 .addImm(NewDenormModeValue);
3158
3159 } else {
3160 // Select FP32 bit field in mode register.
3161 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3162 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3163 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
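    // Illustrative note (not part of the original source): the hwreg operand
    // appears to pack {id, offset, width-1}; with ID_MODE == 1 and the usual
    // shift amounts this evaluates to roughly 0x901, i.e. it addresses MODE
    // register bits [5:4] (offset 4, width 2), the FP32 denorm controls, so
    // only the SP field is rewritten.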
3164
3165 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3166 .addImm(SPDenormMode)
3167 .addImm(SPDenormModeBitField);
3168 }
3169 }
3170
3171 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3172 MachineRegisterInfo &MRI,
3173 MachineIRBuilder &B) const {
3174 Register Res = MI.getOperand(0).getReg();
3175 Register LHS = MI.getOperand(1).getReg();
3176 Register RHS = MI.getOperand(2).getReg();
3177 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3178 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3179
3180 uint16_t Flags = MI.getFlags();
3181
3182 LLT S32 = LLT::scalar(32);
3183 LLT S1 = LLT::scalar(1);
3184
3185 auto One = B.buildFConstant(S32, 1.0f);
3186
3187 auto DenominatorScaled =
3188 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3189 .addUse(LHS)
3190 .addUse(RHS)
3191 .addImm(0)
3192 .setMIFlags(Flags);
3193 auto NumeratorScaled =
3194 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3195 .addUse(LHS)
3196 .addUse(RHS)
3197 .addImm(1)
3198 .setMIFlags(Flags);
3199
3200 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3201 .addUse(DenominatorScaled.getReg(0))
3202 .setMIFlags(Flags);
3203 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3204
3205 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3206 // aren't modeled as reading it.
3207 if (!Mode.allFP32Denormals())
3208 toggleSPDenormMode(true, B, ST, Mode);
3209
3210 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3211 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3212 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3213 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3214 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3215 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
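  // Illustrative note (not part of the original source): the chain above is,
  // roughly, one Newton-Raphson refinement of the reciprocal followed by two
  // quotient refinements on the scaled numerator n and denominator d:
  //   e0 = 1 - d * r0       (Fma0)
  //   r1 = r0 + r0 * e0     (Fma1, refined 1/d)
  //   q0 = n * r1           (Mul)
  //   e1 = n - d * q0       (Fma2)
  //   q1 = q0 + e1 * r1     (Fma3)
  //   e2 = n - d * q1       (Fma4)
  // div_fmas below applies the final q1 + e2 * r1 step together with the
  // scale bit, and div_fixup handles signs, infinities and NaNs.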
3216
3217 if (!Mode.allFP32Denormals())
3218 toggleSPDenormMode(false, B, ST, Mode);
3219
3220 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3221 .addUse(Fma4.getReg(0))
3222 .addUse(Fma1.getReg(0))
3223 .addUse(Fma3.getReg(0))
3224 .addUse(NumeratorScaled.getReg(1))
3225 .setMIFlags(Flags);
3226
3227 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3228 .addUse(Fmas.getReg(0))
3229 .addUse(RHS)
3230 .addUse(LHS)
3231 .setMIFlags(Flags);
3232
3233 MI.eraseFromParent();
3234 return true;
3235 }
3236
3237 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3238 MachineRegisterInfo &MRI,
3239 MachineIRBuilder &B) const {
3240 Register Res = MI.getOperand(0).getReg();
3241 Register LHS = MI.getOperand(1).getReg();
3242 Register RHS = MI.getOperand(2).getReg();
3243
3244 uint16_t Flags = MI.getFlags();
3245
3246 LLT S64 = LLT::scalar(64);
3247 LLT S1 = LLT::scalar(1);
3248
3249 auto One = B.buildFConstant(S64, 1.0);
3250
3251 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3252 .addUse(LHS)
3253 .addUse(RHS)
3254 .addImm(0)
3255 .setMIFlags(Flags);
3256
3257 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3258
3259 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3260 .addUse(DivScale0.getReg(0))
3261 .setMIFlags(Flags);
3262
3263 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3264 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3265 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3266
3267 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3268 .addUse(LHS)
3269 .addUse(RHS)
3270 .addImm(1)
3271 .setMIFlags(Flags);
3272
3273 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3274 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3275 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3276
3277 Register Scale;
3278 if (!ST.hasUsableDivScaleConditionOutput()) {
3279 // Workaround a hardware bug on SI where the condition output from div_scale
3280 // is not usable.
3281
3282 LLT S32 = LLT::scalar(32);
3283
3284 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3285 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3286 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3287 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3288
3289 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3290 Scale1Unmerge.getReg(1));
3291 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3292 Scale0Unmerge.getReg(1));
3293 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3294 } else {
3295 Scale = DivScale1.getReg(1);
3296 }
3297
3298 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3299 .addUse(Fma4.getReg(0))
3300 .addUse(Fma3.getReg(0))
3301 .addUse(Mul.getReg(0))
3302 .addUse(Scale)
3303 .setMIFlags(Flags);
3304
3305 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3306 .addUse(Fmas.getReg(0))
3307 .addUse(RHS)
3308 .addUse(LHS)
3309 .setMIFlags(Flags);
3310
3311 MI.eraseFromParent();
3312 return true;
3313 }
3314
3315 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3316 MachineRegisterInfo &MRI,
3317 MachineIRBuilder &B) const {
3318 Register Res = MI.getOperand(0).getReg();
3319 Register LHS = MI.getOperand(2).getReg();
3320 Register RHS = MI.getOperand(3).getReg();
3321 uint16_t Flags = MI.getFlags();
3322
3323 LLT S32 = LLT::scalar(32);
3324 LLT S1 = LLT::scalar(1);
3325
3326 auto Abs = B.buildFAbs(S32, RHS, Flags);
3327 const APFloat C0Val(1.0f);
3328
3329 auto C0 = B.buildConstant(S32, 0x6f800000);
3330 auto C1 = B.buildConstant(S32, 0x2f800000);
3331 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
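  // Illustrative note (not part of the original source): 0x6f800000 is the f32
  // bit pattern for 2^96 and 0x2f800000 is 2^-32. If |RHS| is larger than 2^96
  // the denominator is pre-scaled by 2^-32 so the rcp result does not flush,
  // and the same factor (Sel) rescales the quotient at the end:
  //   Sel * (LHS * rcp(RHS * Sel)) == LHS / RHS
  // For ordinary denominators Sel is 1.0 and the scaling is a no-op.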
3332
3333 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3334 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3335
3336 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3337
3338 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3339 .addUse(Mul0.getReg(0))
3340 .setMIFlags(Flags);
3341
3342 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3343
3344 B.buildFMul(Res, Sel, Mul1, Flags);
3345
3346 MI.eraseFromParent();
3347 return true;
3348 }
3349
3350 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3351 // FIXME: Why do we handle this one but not other removed instructions?
3352 //
3353 // Reciprocal square root. The clamp prevents infinite results, clamping
3354 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3355 // +-max_float.
3356 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3357 MachineRegisterInfo &MRI,
3358 MachineIRBuilder &B) const {
3359 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3360 return true;
3361
3362 Register Dst = MI.getOperand(0).getReg();
3363 Register Src = MI.getOperand(2).getReg();
3364 auto Flags = MI.getFlags();
3365
3366 LLT Ty = MRI.getType(Dst);
3367
3368 const fltSemantics *FltSemantics;
3369 if (Ty == LLT::scalar(32))
3370 FltSemantics = &APFloat::IEEEsingle();
3371 else if (Ty == LLT::scalar(64))
3372 FltSemantics = &APFloat::IEEEdouble();
3373 else
3374 return false;
3375
3376 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3377 .addUse(Src)
3378 .setMIFlags(Flags);
3379
3380   // We don't need to concern ourselves with the snan handling difference here;
3381   // whether or not rsq quiets it, use the variant which will directly select.
3382 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3383 const bool UseIEEE = MFI->getMode().IEEE;
3384
3385 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3386 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3387 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3388
3389 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3390
3391 if (UseIEEE)
3392 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3393 else
3394 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3395 MI.eraseFromParent();
3396 return true;
3397 }
3398
3399 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3400 switch (IID) {
3401 case Intrinsic::amdgcn_ds_fadd:
3402 return AMDGPU::G_ATOMICRMW_FADD;
3403 case Intrinsic::amdgcn_ds_fmin:
3404 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3405 case Intrinsic::amdgcn_ds_fmax:
3406 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3407 default:
3408 llvm_unreachable("not a DS FP intrinsic");
3409 }
3410 }
3411
3412 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3413 MachineInstr &MI,
3414 Intrinsic::ID IID) const {
3415 GISelChangeObserver &Observer = Helper.Observer;
3416 Observer.changingInstr(MI);
3417
3418 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3419
3420 // The remaining operands were used to set fields in the MemOperand on
3421 // construction.
3422 for (int I = 6; I > 3; --I)
3423 MI.RemoveOperand(I);
3424
3425 MI.RemoveOperand(1); // Remove the intrinsic ID.
3426 Observer.changedInstr(MI);
3427 return true;
3428 }
3429
3430 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3431 MachineRegisterInfo &MRI,
3432 MachineIRBuilder &B) const {
3433 uint64_t Offset =
3434 ST.getTargetLowering()->getImplicitParameterOffset(
3435 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3436 LLT DstTy = MRI.getType(DstReg);
3437 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3438
3439 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3440 if (!loadInputValue(KernargPtrReg, B,
3441 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3442 return false;
3443
3444 // FIXME: This should be nuw
3445 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3446 return true;
3447 }
3448
3449 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3450 MachineRegisterInfo &MRI,
3451 MachineIRBuilder &B) const {
3452 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3453 if (!MFI->isEntryFunction()) {
3454 return legalizePreloadedArgIntrin(MI, MRI, B,
3455 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3456 }
3457
3458 Register DstReg = MI.getOperand(0).getReg();
3459 if (!getImplicitArgPtr(DstReg, MRI, B))
3460 return false;
3461
3462 MI.eraseFromParent();
3463 return true;
3464 }
3465
3466 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3467 MachineRegisterInfo &MRI,
3468 MachineIRBuilder &B,
3469 unsigned AddrSpace) const {
3470 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3471 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3472 Register Hi32 = Unmerge.getReg(1);
3473
3474 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3475 MI.eraseFromParent();
3476 return true;
3477 }
3478
3479 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3480 // offset (the offset that is included in bounds checking and swizzling, to be
3481 // split between the instruction's voffset and immoffset fields) and soffset
3482 // (the offset that is excluded from bounds checking and swizzling, to go in
3483 // the instruction's soffset field). This function takes the first kind of
3484 // offset and figures out how to split it between voffset and immoffset.
3485 std::tuple<Register, unsigned, unsigned>
3486 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3487 Register OrigOffset) const {
3488 const unsigned MaxImm = 4095;
3489 Register BaseReg;
3490 unsigned TotalConstOffset;
3491 const LLT S32 = LLT::scalar(32);
3492
3493 std::tie(BaseReg, TotalConstOffset) =
3494 AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3495
3496 unsigned ImmOffset = TotalConstOffset;
3497
3498 // If the immediate value is too big for the immoffset field, put the value
3499 // and -4096 into the immoffset field so that the value that is copied/added
3500 // for the voffset field is a multiple of 4096, and it stands more chance
3501 // of being CSEd with the copy/add for another similar load/store.
3502 // However, do not do that rounding down to a multiple of 4096 if that is a
3503 // negative number, as it appears to be illegal to have a negative offset
3504 // in the vgpr, even if adding the immediate offset makes it positive.
3505 unsigned Overflow = ImmOffset & ~MaxImm;
3506 ImmOffset -= Overflow;
3507 if ((int32_t)Overflow < 0) {
3508 Overflow += ImmOffset;
3509 ImmOffset = 0;
3510 }
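  // Illustrative worked example (not part of the original source): for a
  // constant offset of 8200, Overflow = 8200 & ~4095 = 8192 and ImmOffset = 8;
  // the 8192 is folded into the voffset base below while 8 lands in the
  // immediate field, keeping the voffset add a multiple of 4096 so it can be
  // CSEd with similar accesses.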
3511
3512 if (Overflow != 0) {
3513 if (!BaseReg) {
3514 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3515 } else {
3516 auto OverflowVal = B.buildConstant(S32, Overflow);
3517 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3518 }
3519 }
3520
3521 if (!BaseReg)
3522 BaseReg = B.buildConstant(S32, 0).getReg(0);
3523
3524 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3525 }
3526
3527 /// Handle register layout difference for f16 images for some subtargets.
3528 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3529 MachineRegisterInfo &MRI,
3530 Register Reg,
3531 bool ImageStore) const {
3532 const LLT S16 = LLT::scalar(16);
3533 const LLT S32 = LLT::scalar(32);
3534 LLT StoreVT = MRI.getType(Reg);
3535 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3536
3537 if (ST.hasUnpackedD16VMem()) {
3538 auto Unmerge = B.buildUnmerge(S16, Reg);
3539
3540 SmallVector<Register, 4> WideRegs;
3541 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3542 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3543
3544 int NumElts = StoreVT.getNumElements();
3545
3546 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3547 }
3548
3549 if (ImageStore && ST.hasImageStoreD16Bug()) {
3550 if (StoreVT.getNumElements() == 2) {
3551 SmallVector<Register, 4> PackedRegs;
3552 Reg = B.buildBitcast(S32, Reg).getReg(0);
3553 PackedRegs.push_back(Reg);
3554 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3555 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3556 }
3557
3558 if (StoreVT.getNumElements() == 3) {
3559 SmallVector<Register, 4> PackedRegs;
3560 auto Unmerge = B.buildUnmerge(S16, Reg);
3561 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3562 PackedRegs.push_back(Unmerge.getReg(I));
3563 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3564 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3565 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3566 }
3567
3568 if (StoreVT.getNumElements() == 4) {
3569 SmallVector<Register, 4> PackedRegs;
3570 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3571 auto Unmerge = B.buildUnmerge(S32, Reg);
3572 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3573 PackedRegs.push_back(Unmerge.getReg(I));
3574 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3575 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3576 }
3577
3578 llvm_unreachable("invalid data type");
3579 }
3580
3581 return Reg;
3582 }
3583
3584 Register AMDGPULegalizerInfo::fixStoreSourceType(
3585 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3586 MachineRegisterInfo *MRI = B.getMRI();
3587 LLT Ty = MRI->getType(VData);
3588
3589 const LLT S16 = LLT::scalar(16);
3590
3591 // Fixup illegal register types for i8 stores.
3592 if (Ty == LLT::scalar(8) || Ty == S16) {
3593 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3594 return AnyExt;
3595 }
3596
3597 if (Ty.isVector()) {
3598 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3599 if (IsFormat)
3600 return handleD16VData(B, *MRI, VData);
3601 }
3602 }
3603
3604 return VData;
3605 }
3606
3607 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3608 MachineRegisterInfo &MRI,
3609 MachineIRBuilder &B,
3610 bool IsTyped,
3611 bool IsFormat) const {
3612 Register VData = MI.getOperand(1).getReg();
3613 LLT Ty = MRI.getType(VData);
3614 LLT EltTy = Ty.getScalarType();
3615 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3616 const LLT S32 = LLT::scalar(32);
3617
3618 VData = fixStoreSourceType(B, VData, IsFormat);
3619 Register RSrc = MI.getOperand(2).getReg();
3620
3621 MachineMemOperand *MMO = *MI.memoperands_begin();
3622 const int MemSize = MMO->getSize();
3623
3624 unsigned ImmOffset;
3625 unsigned TotalOffset;
3626
3627 // The typed intrinsics add an immediate after the registers.
3628 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3629
3630 // The struct intrinsic variants add one additional operand over raw.
3631 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3632 Register VIndex;
3633 int OpOffset = 0;
3634 if (HasVIndex) {
3635 VIndex = MI.getOperand(3).getReg();
3636 OpOffset = 1;
3637 }
3638
3639 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3640 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3641
3642 unsigned Format = 0;
3643 if (IsTyped) {
3644 Format = MI.getOperand(5 + OpOffset).getImm();
3645 ++OpOffset;
3646 }
3647
3648 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3649
3650 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3651 if (TotalOffset != 0)
3652 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3653
3654 unsigned Opc;
3655 if (IsTyped) {
3656 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3657 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3658 } else if (IsFormat) {
3659 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3660 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3661 } else {
3662 switch (MemSize) {
3663 case 1:
3664 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3665 break;
3666 case 2:
3667 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3668 break;
3669 default:
3670 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3671 break;
3672 }
3673 }
3674
3675 if (!VIndex)
3676 VIndex = B.buildConstant(S32, 0).getReg(0);
3677
3678 auto MIB = B.buildInstr(Opc)
3679 .addUse(VData) // vdata
3680 .addUse(RSrc) // rsrc
3681 .addUse(VIndex) // vindex
3682 .addUse(VOffset) // voffset
3683 .addUse(SOffset) // soffset
3684 .addImm(ImmOffset); // offset(imm)
3685
3686 if (IsTyped)
3687 MIB.addImm(Format);
3688
3689 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3690 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3691 .addMemOperand(MMO);
3692
3693 MI.eraseFromParent();
3694 return true;
3695 }
3696
3697 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3698 MachineRegisterInfo &MRI,
3699 MachineIRBuilder &B,
3700 bool IsFormat,
3701 bool IsTyped) const {
3702 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3703 MachineMemOperand *MMO = *MI.memoperands_begin();
3704 const int MemSize = MMO->getSize();
3705 const LLT S32 = LLT::scalar(32);
3706
3707 Register Dst = MI.getOperand(0).getReg();
3708 Register RSrc = MI.getOperand(2).getReg();
3709
3710 // The typed intrinsics add an immediate after the registers.
3711 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3712
3713 // The struct intrinsic variants add one additional operand over raw.
3714 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3715 Register VIndex;
3716 int OpOffset = 0;
3717 if (HasVIndex) {
3718 VIndex = MI.getOperand(3).getReg();
3719 OpOffset = 1;
3720 }
3721
3722 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3723 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3724
3725 unsigned Format = 0;
3726 if (IsTyped) {
3727 Format = MI.getOperand(5 + OpOffset).getImm();
3728 ++OpOffset;
3729 }
3730
3731 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3732 unsigned ImmOffset;
3733 unsigned TotalOffset;
3734
3735 LLT Ty = MRI.getType(Dst);
3736 LLT EltTy = Ty.getScalarType();
3737 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3738 const bool Unpacked = ST.hasUnpackedD16VMem();
3739
3740 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3741 if (TotalOffset != 0)
3742 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3743
3744 unsigned Opc;
3745
3746 if (IsTyped) {
3747 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3748 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3749 } else if (IsFormat) {
3750 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3751 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3752 } else {
3753 switch (MemSize) {
3754 case 1:
3755 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3756 break;
3757 case 2:
3758 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3759 break;
3760 default:
3761 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3762 break;
3763 }
3764 }
3765
3766 Register LoadDstReg;
3767
3768 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3769 LLT UnpackedTy = Ty.changeElementSize(32);
3770
3771 if (IsExtLoad)
3772 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3773 else if (Unpacked && IsD16 && Ty.isVector())
3774 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3775 else
3776 LoadDstReg = Dst;
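  // Illustrative note (not part of the original source): e.g. a plain
  // (non-format) raw buffer load of s16 has MemSize == 2, so it loads into a
  // temporary s32 and truncates back to s16 after the instruction, while a
  // packed-d16 format load of <4 x s16> on an unpacked subtarget loads
  // <4 x s32> and is repacked further below.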
3777
3778 if (!VIndex)
3779 VIndex = B.buildConstant(S32, 0).getReg(0);
3780
3781 auto MIB = B.buildInstr(Opc)
3782 .addDef(LoadDstReg) // vdata
3783 .addUse(RSrc) // rsrc
3784 .addUse(VIndex) // vindex
3785 .addUse(VOffset) // voffset
3786 .addUse(SOffset) // soffset
3787 .addImm(ImmOffset); // offset(imm)
3788
3789 if (IsTyped)
3790 MIB.addImm(Format);
3791
3792 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3793 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3794 .addMemOperand(MMO);
3795
3796 if (LoadDstReg != Dst) {
3797 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3798
3799     // The result for an extending load was widened above; truncate it back down.
3800 if (IsExtLoad)
3801 B.buildTrunc(Dst, LoadDstReg);
3802 else {
3803 // Repack to original 16-bit vector result
3804 // FIXME: G_TRUNC should work, but legalization currently fails
3805 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3806 SmallVector<Register, 4> Repack;
3807 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3808 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3809 B.buildMerge(Dst, Repack);
3810 }
3811 }
3812
3813 MI.eraseFromParent();
3814 return true;
3815 }
3816
3817 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3818 MachineIRBuilder &B,
3819 bool IsInc) const {
3820 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3821 AMDGPU::G_AMDGPU_ATOMIC_DEC;
3822 B.buildInstr(Opc)
3823 .addDef(MI.getOperand(0).getReg())
3824 .addUse(MI.getOperand(2).getReg())
3825 .addUse(MI.getOperand(3).getReg())
3826 .cloneMemRefs(MI);
3827 MI.eraseFromParent();
3828 return true;
3829 }
3830
3831 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3832 switch (IntrID) {
3833 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3834 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3835 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3836 case Intrinsic::amdgcn_raw_buffer_atomic_add:
3837 case Intrinsic::amdgcn_struct_buffer_atomic_add:
3838 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3839 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3840 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3841 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3842 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3843 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3844 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3845 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3846 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3847 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3848 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3849 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3850 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3851 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3852 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3853 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3854 case Intrinsic::amdgcn_raw_buffer_atomic_and:
3855 case Intrinsic::amdgcn_struct_buffer_atomic_and:
3856 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3857 case Intrinsic::amdgcn_raw_buffer_atomic_or:
3858 case Intrinsic::amdgcn_struct_buffer_atomic_or:
3859 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3860 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3861 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3862 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3863 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3864 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3865 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3866 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3867 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3868 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3869 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3870 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3871 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3872 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3873 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3874 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3875 default:
3876 llvm_unreachable("unhandled atomic opcode");
3877 }
3878 }
3879
3880 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3881 MachineIRBuilder &B,
3882 Intrinsic::ID IID) const {
3883 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3884 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3885 const bool HasReturn = MI.getNumExplicitDefs() != 0;
3886
3887 Register Dst;
3888
3889 int OpOffset = 0;
3890 if (HasReturn) {
3891 // A few FP atomics do not support return values.
3892 Dst = MI.getOperand(0).getReg();
3893 } else {
3894 OpOffset = -1;
3895 }
3896
3897 Register VData = MI.getOperand(2 + OpOffset).getReg();
3898 Register CmpVal;
3899
3900 if (IsCmpSwap) {
3901 CmpVal = MI.getOperand(3 + OpOffset).getReg();
3902 ++OpOffset;
3903 }
3904
3905 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3906 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3907
3908 // The struct intrinsic variants add one additional operand over raw.
3909 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3910 Register VIndex;
3911 if (HasVIndex) {
3912 VIndex = MI.getOperand(4 + OpOffset).getReg();
3913 ++OpOffset;
3914 }
3915
3916 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3917 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3918 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3919
3920 MachineMemOperand *MMO = *MI.memoperands_begin();
3921
3922 unsigned ImmOffset;
3923 unsigned TotalOffset;
3924 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3925 if (TotalOffset != 0)
3926 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3927
3928 if (!VIndex)
3929 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3930
3931 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3932
3933 if (HasReturn)
3934 MIB.addDef(Dst);
3935
3936 MIB.addUse(VData); // vdata
3937
3938 if (IsCmpSwap)
3939 MIB.addReg(CmpVal);
3940
3941 MIB.addUse(RSrc) // rsrc
3942 .addUse(VIndex) // vindex
3943 .addUse(VOffset) // voffset
3944 .addUse(SOffset) // soffset
3945 .addImm(ImmOffset) // offset(imm)
3946 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3947 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3948 .addMemOperand(MMO);
3949
3950 MI.eraseFromParent();
3951 return true;
3952 }
3953
3954 /// Pack the s16 typed address operands of \p MI into dword sized vectors with
3955 /// two s16 elements each, appending them to \p PackedAddrs.
3956 static void packImageA16AddressToDwords(
3957 MachineIRBuilder &B, MachineInstr &MI,
3958 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
3959 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
3960 const LLT S16 = LLT::scalar(16);
3961 const LLT V2S16 = LLT::vector(2, 16);
3962
3963 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
3964 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
3965 if (!SrcOp.isReg())
3966 continue; // _L to _LZ may have eliminated this.
3967
3968 Register AddrReg = SrcOp.getReg();
3969
3970 if (I < Intr->GradientStart) {
3971 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3972 PackedAddrs.push_back(AddrReg);
3973 } else {
3974 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3975 // derivatives dx/dh and dx/dv are packed with undef.
3976 if (((I + 1) >= EndIdx) ||
3977 ((Intr->NumGradients / 2) % 2 == 1 &&
3978 (I == static_cast<unsigned>(Intr->GradientStart +
3979 (Intr->NumGradients / 2) - 1) ||
3980 I == static_cast<unsigned>(Intr->GradientStart +
3981 Intr->NumGradients - 1))) ||
3982 // Check for _L to _LZ optimization
3983 !MI.getOperand(ArgOffset + I + 1).isReg()) {
3984 PackedAddrs.push_back(
3985 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3986 .getReg(0));
3987 } else {
3988 PackedAddrs.push_back(
3989 B.buildBuildVector(
3990 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
3991 .getReg(0));
3992 ++I;
3993 }
3994 }
3995 }
3996 }
3997
3998 /// Convert from separate vaddr components to a single vector address register,
3999 /// and replace the remaining operands with $noreg.
4000 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4001 int DimIdx, int NumVAddrs) {
4002 const LLT S32 = LLT::scalar(32);
4003
4004 SmallVector<Register, 8> AddrRegs;
4005 for (int I = 0; I != NumVAddrs; ++I) {
4006 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4007 if (SrcOp.isReg()) {
4008 AddrRegs.push_back(SrcOp.getReg());
4009 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4010 }
4011 }
4012
4013 int NumAddrRegs = AddrRegs.size();
4014 if (NumAddrRegs != 1) {
4015 // Round up to 8 elements for v5-v7
4016 // FIXME: Missing intermediate sized register classes and instructions.
4017 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4018 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4019 auto Undef = B.buildUndef(S32);
4020 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4021 NumAddrRegs = RoundedNumRegs;
4022 }
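    // Illustrative note (not part of the original source): e.g. 6 address
    // dwords are padded with two undefs up to NextPowerOf2(6) == 8 and built
    // into a single <8 x s32> operand, since the v5-v7 cases lack register
    // classes and instructions (see the FIXME above).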
4023
4024 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4025 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4026 }
4027
4028 for (int I = 1; I != NumVAddrs; ++I) {
4029 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4030 if (SrcOp.isReg())
4031 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4032 }
4033 }
4034
4035 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4036 ///
4037 /// Depending on the subtarget, load/store with 16-bit element data need to be
4038 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4039 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4040 /// registers.
4041 ///
4042 /// We don't want to directly select image instructions just yet, but also want
4043 /// to expose all register repacking to the legalizer/combiners. We also don't
4044 /// want a selected instruction entering RegBankSelect. In order to avoid
4045 /// defining a multitude of intermediate image instructions, directly hack on
4046 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4047 /// now unnecessary arguments with $noreg.
4048 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4049 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4050 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4051
4052 const unsigned NumDefs = MI.getNumExplicitDefs();
4053 const unsigned ArgOffset = NumDefs + 1;
4054 bool IsTFE = NumDefs == 2;
4055 // We are only processing the operands of d16 image operations on subtargets
4056 // that use the unpacked register layout, or need to repack the TFE result.
4057
4058 // TODO: Do we need to guard against already legalized intrinsics?
4059 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4060 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4061
4062 MachineRegisterInfo *MRI = B.getMRI();
4063 const LLT S32 = LLT::scalar(32);
4064 const LLT S16 = LLT::scalar(16);
4065 const LLT V2S16 = LLT::vector(2, 16);
4066
4067 unsigned DMask = 0;
4068
4069 // Check for 16 bit addresses and pack if true.
4070 LLT GradTy =
4071 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4072 LLT AddrTy =
4073 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4074 const bool IsG16 = GradTy == S16;
4075 const bool IsA16 = AddrTy == S16;
4076
4077 int DMaskLanes = 0;
4078 if (!BaseOpcode->Atomic) {
4079 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4080 if (BaseOpcode->Gather4) {
4081 DMaskLanes = 4;
4082 } else if (DMask != 0) {
4083 DMaskLanes = countPopulation(DMask);
4084 } else if (!IsTFE && !BaseOpcode->Store) {
4085 // If dmask is 0, this is a no-op load. This can be eliminated.
4086 B.buildUndef(MI.getOperand(0));
4087 MI.eraseFromParent();
4088 return true;
4089 }
4090 }
4091
4092 Observer.changingInstr(MI);
4093 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4094
4095 unsigned NewOpcode = NumDefs == 0 ?
4096 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4097
4098 // Track that we legalized this
4099 MI.setDesc(B.getTII().get(NewOpcode));
4100
4101   // Expecting to get an error flag since TFE is on and dmask is 0. Force
4102   // dmask to be at least 1, otherwise the instruction will fail.
4103 if (IsTFE && DMask == 0) {
4104 DMask = 0x1;
4105 DMaskLanes = 1;
4106 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4107 }
4108
4109 if (BaseOpcode->Atomic) {
4110 Register VData0 = MI.getOperand(2).getReg();
4111 LLT Ty = MRI->getType(VData0);
4112
4113 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4114 if (Ty.isVector())
4115 return false;
4116
4117 if (BaseOpcode->AtomicX2) {
4118 Register VData1 = MI.getOperand(3).getReg();
4119 // The two values are packed in one register.
4120 LLT PackedTy = LLT::vector(2, Ty);
4121 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4122 MI.getOperand(2).setReg(Concat.getReg(0));
4123 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4124 }
4125 }
4126
4127 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4128
4129 // Optimize _L to _LZ when _L is zero
4130 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4131 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4132 const ConstantFP *ConstantLod;
4133
4134 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4135 m_GFCst(ConstantLod))) {
4136 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4137 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4138 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4139 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4140 Intr->Dim);
4141
4142 // The starting indexes should remain in the same place.
4143 --CorrectedNumVAddrs;
4144
4145 MI.getOperand(MI.getNumExplicitDefs())
4146 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4147 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4148 Intr = NewImageDimIntr;
4149 }
4150 }
4151 }
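  // Illustrative note (not part of the original source): e.g. an
  // image.sample.l whose lod operand is the constant 0.0 (or negative) is
  // rewritten above to the corresponding image.sample.lz intrinsic with the
  // lod operand removed, so one fewer vaddr dword is needed downstream.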
4152
4153 // Optimize _mip away, when 'lod' is zero
4154 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4155 int64_t ConstantLod;
4156 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4157 m_ICst(ConstantLod))) {
4158 if (ConstantLod == 0) {
4159         // TODO: Change intrinsic opcode and remove operand instead of replacing
4160 // it with 0, as the _L to _LZ handling is done above.
4161 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4162 --CorrectedNumVAddrs;
4163 }
4164 }
4165 }
4166
4167 // Rewrite the addressing register layout before doing anything else.
4168 if (IsA16 || IsG16) {
4169 if (IsA16) {
4170 // Target must support the feature and gradients need to be 16 bit too
4171 if (!ST.hasA16() || !IsG16)
4172 return false;
4173 } else if (!ST.hasG16())
4174 return false;
4175
4176 if (Intr->NumVAddrs > 1) {
4177 SmallVector<Register, 4> PackedRegs;
4178 // Don't compress addresses for G16
4179 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4180 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4181 PackEndIdx);
4182
4183 if (!IsA16) {
4184 // Add uncompressed address
4185 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4186 int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4187 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4188 PackedRegs.push_back(AddrReg);
4189 }
4190 }
4191
4192 // See also below in the non-a16 branch
4193 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4194
4195 if (!UseNSA && PackedRegs.size() > 1) {
4196 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4197 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4198 PackedRegs[0] = Concat.getReg(0);
4199 PackedRegs.resize(1);
4200 }
4201
4202 const unsigned NumPacked = PackedRegs.size();
4203 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4204 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4205 if (!SrcOp.isReg()) {
4206 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4207 continue;
4208 }
4209
4210 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4211
4212 if (I - Intr->VAddrStart < NumPacked)
4213 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4214 else
4215 SrcOp.setReg(AMDGPU::NoRegister);
4216 }
4217 }
4218 } else {
4219 // If the register allocator cannot place the address registers contiguously
4220 // without introducing moves, then using the non-sequential address encoding
4221 // is always preferable, since it saves VALU instructions and is usually a
4222 // wash in terms of code size or even better.
4223 //
4224 // However, we currently have no way of hinting to the register allocator
4225 // that MIMG addresses should be placed contiguously when it is possible to
4226 // do so, so force non-NSA for the common 2-address case as a heuristic.
4227 //
4228 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4229 // allocation when possible.
4230 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4231
4232 if (!UseNSA && Intr->NumVAddrs > 1)
4233 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4234 Intr->NumVAddrs);
4235 }
4236
4237 int Flags = 0;
4238 if (IsA16)
4239 Flags |= 1;
4240 if (IsG16)
4241 Flags |= 2;
4242 MI.addOperand(MachineOperand::CreateImm(Flags));
4243
4244 if (BaseOpcode->Store) { // No TFE for stores?
4245 // TODO: Handle dmask trim
4246 Register VData = MI.getOperand(1).getReg();
4247 LLT Ty = MRI->getType(VData);
4248 if (!Ty.isVector() || Ty.getElementType() != S16)
4249 return true;
4250
4251 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4252 if (RepackedReg != VData) {
4253 MI.getOperand(1).setReg(RepackedReg);
4254 }
4255
4256 return true;
4257 }
4258
4259 Register DstReg = MI.getOperand(0).getReg();
4260 LLT Ty = MRI->getType(DstReg);
4261 const LLT EltTy = Ty.getScalarType();
4262 const bool IsD16 = Ty.getScalarType() == S16;
4263 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4264
4265 // Confirm that the return type is large enough for the dmask specified
4266 if (NumElts < DMaskLanes)
4267 return false;
4268
4269 if (NumElts > 4 || DMaskLanes > 4)
4270 return false;
4271
4272 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4273 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4274
4275 // The raw dword aligned data component of the load. The only legal cases
4276 // where this matters should be when using the packed D16 format, for
4277   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4278 LLT RoundedTy;
4279
4280   // S32 vector to cover all data, plus the TFE result element.
4281 LLT TFETy;
4282
4283 // Register type to use for each loaded component. Will be S32 or V2S16.
4284 LLT RegTy;
4285
4286 if (IsD16 && ST.hasUnpackedD16VMem()) {
4287 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4288 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4289 RegTy = S32;
4290 } else {
4291 unsigned EltSize = EltTy.getSizeInBits();
4292 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4293 unsigned RoundedSize = 32 * RoundedElts;
4294 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4295 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4296 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4297 }
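  // Illustrative worked example (not part of the original source): a packed-d16
  // load of <3 x s16> with 3 dmask lanes gives AdjustedTy = <3 x s16> (48 bits),
  // so RoundedElts = 2, RoundedTy = <4 x s16>, TFETy = <3 x s32>, and RegTy is
  // V2S16 (or S32 when TFE is enabled); the padding element and any TFE dword
  // are peeled off below.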
4298
4299 // The return type does not need adjustment.
4300 // TODO: Should we change s16 case to s32 or <2 x s16>?
4301 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4302 return true;
4303
4304 Register Dst1Reg;
4305
4306 // Insert after the instruction.
4307 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4308
4309 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4310 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4311 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4312 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4313
4314 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4315
4316 MI.getOperand(0).setReg(NewResultReg);
4317
4318 // In the IR, TFE is supposed to be used with a 2 element struct return
4319   // type. The instruction really returns these two values in one contiguous
4320 // register, with one additional dword beyond the loaded data. Rewrite the
4321 // return type to use a single register result.
4322
4323 if (IsTFE) {
4324 Dst1Reg = MI.getOperand(1).getReg();
4325 if (MRI->getType(Dst1Reg) != S32)
4326 return false;
4327
4328 // TODO: Make sure the TFE operand bit is set.
4329 MI.RemoveOperand(1);
4330
4331 // Handle the easy case that requires no repack instructions.
4332 if (Ty == S32) {
4333 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4334 return true;
4335 }
4336 }
4337
4338 // Now figure out how to copy the new result register back into the old
4339 // result.
4340 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4341
4342 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4343
4344 if (ResultNumRegs == 1) {
4345 assert(!IsTFE);
4346 ResultRegs[0] = NewResultReg;
4347 } else {
4348 // We have to repack into a new vector of some kind.
4349 for (int I = 0; I != NumDataRegs; ++I)
4350 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4351 B.buildUnmerge(ResultRegs, NewResultReg);
4352
4353 // Drop the final TFE element to get the data part. The TFE result is
4354 // directly written to the right place already.
4355 if (IsTFE)
4356 ResultRegs.resize(NumDataRegs);
4357 }
4358
4359 // For an s16 scalar result, we form an s32 result with a truncate regardless
4360 // of packed vs. unpacked.
4361 if (IsD16 && !Ty.isVector()) {
4362 B.buildTrunc(DstReg, ResultRegs[0]);
4363 return true;
4364 }
4365
4366 // Avoid a build/concat_vector of 1 entry.
4367 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4368 B.buildBitcast(DstReg, ResultRegs[0]);
4369 return true;
4370 }
4371
4372 assert(Ty.isVector());
4373
4374 if (IsD16) {
4375 // For packed D16 results with TFE enabled, all the data components are
4376 // S32. Cast back to the expected type.
4377 //
4378     // TODO: We don't really need to load s32 elements. We would only need one
4379 // cast for the TFE result if a multiple of v2s16 was used.
4380 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4381 for (Register &Reg : ResultRegs)
4382 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4383 } else if (ST.hasUnpackedD16VMem()) {
4384 for (Register &Reg : ResultRegs)
4385 Reg = B.buildTrunc(S16, Reg).getReg(0);
4386 }
4387 }
4388
4389 auto padWithUndef = [&](LLT Ty, int NumElts) {
4390 if (NumElts == 0)
4391 return;
4392 Register Undef = B.buildUndef(Ty).getReg(0);
4393 for (int I = 0; I != NumElts; ++I)
4394 ResultRegs.push_back(Undef);
4395 };
4396
4397 // Pad out any elements eliminated due to the dmask.
4398 LLT ResTy = MRI->getType(ResultRegs[0]);
4399 if (!ResTy.isVector()) {
4400 padWithUndef(ResTy, NumElts - ResultRegs.size());
4401 B.buildBuildVector(DstReg, ResultRegs);
4402 return true;
4403 }
4404
4405 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4406 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4407
4408 // Deal with the one annoying legal case.
4409 const LLT V3S16 = LLT::vector(3, 16);
4410 if (Ty == V3S16) {
4411 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4412 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4413 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4414 return true;
4415 }
4416
4417 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4418 B.buildConcatVectors(DstReg, ResultRegs);
4419 return true;
4420 }
4421
4422 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4423 LegalizerHelper &Helper, MachineInstr &MI) const {
4424 MachineIRBuilder &B = Helper.MIRBuilder;
4425 GISelChangeObserver &Observer = Helper.Observer;
4426
4427 Register Dst = MI.getOperand(0).getReg();
4428 LLT Ty = B.getMRI()->getType(Dst);
4429 unsigned Size = Ty.getSizeInBits();
4430 MachineFunction &MF = B.getMF();
4431
4432 Observer.changingInstr(MI);
4433
4434 if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4435 Ty = getBitcastRegisterType(Ty);
4436 Helper.bitcastDst(MI, Ty, 0);
4437 Dst = MI.getOperand(0).getReg();
4438 B.setInsertPt(B.getMBB(), MI);
4439 }
4440
4441 // FIXME: We don't really need this intermediate instruction. The intrinsic
4442 // should be fixed to have a memory operand. Since it's readnone, we're not
4443 // allowed to add one.
4444 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4445 MI.RemoveOperand(1); // Remove intrinsic ID
4446
4447 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4448 // TODO: Should this use datalayout alignment?
4449 const unsigned MemSize = (Size + 7) / 8;
4450 const Align MemAlign(4);
4451 MachineMemOperand *MMO = MF.getMachineMemOperand(
4452 MachinePointerInfo(),
4453 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4454 MachineMemOperand::MOInvariant,
4455 MemSize, MemAlign);
4456 MI.addMemOperand(MF, MMO);
4457
4458 // There are no 96-bit result scalar loads, but widening to 128-bit should
4459 // always be legal. We may need to restore this to a 96-bit result if it turns
4460 // out this needs to be converted to a vector load during RegBankSelect.
4461 if (!isPowerOf2_32(Size)) {
4462 if (Ty.isVector())
4463 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4464 else
4465 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4466 }
4467
4468 Observer.changedInstr(MI);
4469 return true;
4470 }
4471
4472 // TODO: Move to selection
4473 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4474 MachineRegisterInfo &MRI,
4475 MachineIRBuilder &B) const {
4476   // Non-HSA path, or trap handler disabled: insert an s_endpgm instruction.
4477 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4478 !ST.isTrapHandlerEnabled()) {
4479 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4480 } else {
4481 // Pass queue pointer to trap handler as input, and insert trap instruction
4482 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4483 MachineRegisterInfo &MRI = *B.getMRI();
4484
4485 Register LiveIn =
4486 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4487 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4488 return false;
4489
4490 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4491 B.buildCopy(SGPR01, LiveIn);
4492 B.buildInstr(AMDGPU::S_TRAP)
4493 .addImm(GCNSubtarget::TrapIDLLVMTrap)
4494 .addReg(SGPR01, RegState::Implicit);
4495 }
4496
4497 MI.eraseFromParent();
4498 return true;
4499 }
4500
4501 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4502 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4503   // Non-HSA path, or trap handler disabled: report a warning
4504   // accordingly.
4505 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4506 !ST.isTrapHandlerEnabled()) {
4507 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4508 "debugtrap handler not supported",
4509 MI.getDebugLoc(), DS_Warning);
4510 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4511 Ctx.diagnose(NoTrap);
4512 } else {
4513 // Insert debug-trap instruction
4514 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4515 }
4516
4517 MI.eraseFromParent();
4518 return true;
4519 }
4520
4521 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4522 MachineIRBuilder &B) const {
4523 MachineRegisterInfo &MRI = *B.getMRI();
4524 const LLT S16 = LLT::scalar(16);
4525 const LLT S32 = LLT::scalar(32);
4526
4527 Register DstReg = MI.getOperand(0).getReg();
4528 Register NodePtr = MI.getOperand(2).getReg();
4529 Register RayExtent = MI.getOperand(3).getReg();
4530 Register RayOrigin = MI.getOperand(4).getReg();
4531 Register RayDir = MI.getOperand(5).getReg();
4532 Register RayInvDir = MI.getOperand(6).getReg();
4533 Register TDescr = MI.getOperand(7).getReg();
4534
4535 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4536 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4537 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4538 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4539 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4540 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4541
4542 SmallVector<Register, 12> Ops;
4543 if (Is64) {
4544 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4545 Ops.push_back(Unmerge.getReg(0));
4546 Ops.push_back(Unmerge.getReg(1));
4547 } else {
4548 Ops.push_back(NodePtr);
4549 }
4550 Ops.push_back(RayExtent);
4551
4552 auto packLanes = [&Ops, &S32, &B] (Register Src) {
4553 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4554 Ops.push_back(Unmerge.getReg(0));
4555 Ops.push_back(Unmerge.getReg(1));
4556 Ops.push_back(Unmerge.getReg(2));
4557 };
4558
4559 packLanes(RayOrigin);
4560 if (IsA16) {
4561 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4562 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4563 Register R1 = MRI.createGenericVirtualRegister(S32);
4564 Register R2 = MRI.createGenericVirtualRegister(S32);
4565 Register R3 = MRI.createGenericVirtualRegister(S32);
4566 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4567 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4568 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4569 Ops.push_back(R1);
4570 Ops.push_back(R2);
4571 Ops.push_back(R3);
4572 } else {
4573 packLanes(RayDir);
4574 packLanes(RayInvDir);
4575 }
4576
4577 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4578 .addDef(DstReg)
4579 .addImm(Opcode);
4580
4581 for (Register R : Ops) {
4582 MIB.addUse(R);
4583 }
4584
4585 MIB.addUse(TDescr)
4586 .addImm(IsA16 ? 1 : 0)
4587 .cloneMemRefs(MI);
4588
4589 MI.eraseFromParent();
4590 return true;
4591 }
4592
4593 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4594 MachineInstr &MI) const {
4595 MachineIRBuilder &B = Helper.MIRBuilder;
4596 MachineRegisterInfo &MRI = *B.getMRI();
4597
4598 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4599 auto IntrID = MI.getIntrinsicID();
4600 switch (IntrID) {
4601 case Intrinsic::amdgcn_if:
4602 case Intrinsic::amdgcn_else: {
4603 MachineInstr *Br = nullptr;
4604 MachineBasicBlock *UncondBrTarget = nullptr;
4605 bool Negated = false;
4606 if (MachineInstr *BrCond =
4607 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4608 const SIRegisterInfo *TRI
4609 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4610
4611 Register Def = MI.getOperand(1).getReg();
4612 Register Use = MI.getOperand(3).getReg();
4613
4614 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4615
      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
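      // SI_LOOP updates the exec mask from the loop mask and keeps branching
      // while any lanes remain active.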
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
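    // The wavefront size is fixed for the subtarget, so fold the intrinsic to
    // a constant.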
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
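  // Buffer loads and stores share common legalization code; the boolean
  // arguments distinguish the format and typed (tbuffer) variants.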
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
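  // All buffer atomic intrinsics funnel into a single legalization routine
  // keyed on the intrinsic ID.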
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
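    // Image dimension intrinsics are described by tablegen-generated tables
    // and share a common legalization path.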
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
