1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40
41
isMultiple32(unsigned TypeIdx,unsigned MaxSize=1024)42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43 unsigned MaxSize = 1024) {
44 return [=](const LegalityQuery &Query) {
45 const LLT Ty = Query.Types[TypeIdx];
46 const LLT EltTy = Ty.getScalarType();
47 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48 };
49 }
50
sizeIs(unsigned TypeIdx,unsigned Size)51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52 return [=](const LegalityQuery &Query) {
53 return Query.Types[TypeIdx].getSizeInBits() == Size;
54 };
55 }
56
isSmallOddVector(unsigned TypeIdx)57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58 return [=](const LegalityQuery &Query) {
59 const LLT Ty = Query.Types[TypeIdx];
60 return Ty.isVector() &&
61 Ty.getNumElements() % 2 != 0 &&
62 Ty.getElementType().getSizeInBits() < 32 &&
63 Ty.getSizeInBits() % 32 != 0;
64 };
65 }
66
isWideVec16(unsigned TypeIdx)67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68 return [=](const LegalityQuery &Query) {
69 const LLT Ty = Query.Types[TypeIdx];
70 const LLT EltTy = Ty.getScalarType();
71 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72 };
73 }
74
oneMoreElement(unsigned TypeIdx)75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 const LLT EltTy = Ty.getElementType();
79 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80 };
81 }
82
fewerEltsToSize64Vector(unsigned TypeIdx)83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84 return [=](const LegalityQuery &Query) {
85 const LLT Ty = Query.Types[TypeIdx];
86 const LLT EltTy = Ty.getElementType();
87 unsigned Size = Ty.getSizeInBits();
88 unsigned Pieces = (Size + 63) / 64;
89 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91 };
92 }
93
94 // Increase the number of vector elements to reach the next multiple of 32-bit
95 // type.
moreEltsToNext32Bit(unsigned TypeIdx)96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99
100 const LLT EltTy = Ty.getElementType();
101 const int Size = Ty.getSizeInBits();
102 const int EltSize = EltTy.getSizeInBits();
103 const int NextMul32 = (Size + 31) / 32;
104
105 assert(EltSize < 32);
106
107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109 };
110 }
111
vectorSmallerThan(unsigned TypeIdx,unsigned Size)112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113 return [=](const LegalityQuery &Query) {
114 const LLT QueryTy = Query.Types[TypeIdx];
115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116 };
117 }
118
vectorWiderThan(unsigned TypeIdx,unsigned Size)119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120 return [=](const LegalityQuery &Query) {
121 const LLT QueryTy = Query.Types[TypeIdx];
122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123 };
124 }
125
numElementsNotEven(unsigned TypeIdx)126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127 return [=](const LegalityQuery &Query) {
128 const LLT QueryTy = Query.Types[TypeIdx];
129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130 };
131 }
132
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
isRegisterType(unsigned TypeIdx)135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136 return [=](const LegalityQuery &Query) {
137 const LLT Ty = Query.Types[TypeIdx];
138 if (Ty.isVector()) {
139 const int EltSize = Ty.getElementType().getSizeInBits();
140 return EltSize == 32 || EltSize == 64 ||
141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142 EltSize == 128 || EltSize == 256;
143 }
144
145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146 };
147 }
148
elementTypeIs(unsigned TypeIdx,LLT Type)149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150 return [=](const LegalityQuery &Query) {
151 return Query.Types[TypeIdx].getElementType() == Type;
152 };
153 }
154
isWideScalarTruncStore(unsigned TypeIdx)155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156 return [=](const LegalityQuery &Query) {
157 const LLT Ty = Query.Types[TypeIdx];
158 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160 };
161 }
162
AMDGPULegalizerInfo(const GCNSubtarget & ST_,const GCNTargetMachine & TM)163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164 const GCNTargetMachine &TM)
165 : ST(ST_) {
166 using namespace TargetOpcode;
167
168 auto GetAddrSpacePtr = [&TM](unsigned AS) {
169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170 };
171
172 const LLT S1 = LLT::scalar(1);
173 const LLT S8 = LLT::scalar(8);
174 const LLT S16 = LLT::scalar(16);
175 const LLT S32 = LLT::scalar(32);
176 const LLT S64 = LLT::scalar(64);
177 const LLT S96 = LLT::scalar(96);
178 const LLT S128 = LLT::scalar(128);
179 const LLT S256 = LLT::scalar(256);
180 const LLT S1024 = LLT::scalar(1024);
181
182 const LLT V2S16 = LLT::vector(2, 16);
183 const LLT V4S16 = LLT::vector(4, 16);
184
185 const LLT V2S32 = LLT::vector(2, 32);
186 const LLT V3S32 = LLT::vector(3, 32);
187 const LLT V4S32 = LLT::vector(4, 32);
188 const LLT V5S32 = LLT::vector(5, 32);
189 const LLT V6S32 = LLT::vector(6, 32);
190 const LLT V7S32 = LLT::vector(7, 32);
191 const LLT V8S32 = LLT::vector(8, 32);
192 const LLT V9S32 = LLT::vector(9, 32);
193 const LLT V10S32 = LLT::vector(10, 32);
194 const LLT V11S32 = LLT::vector(11, 32);
195 const LLT V12S32 = LLT::vector(12, 32);
196 const LLT V13S32 = LLT::vector(13, 32);
197 const LLT V14S32 = LLT::vector(14, 32);
198 const LLT V15S32 = LLT::vector(15, 32);
199 const LLT V16S32 = LLT::vector(16, 32);
200 const LLT V32S32 = LLT::vector(32, 32);
201
202 const LLT V2S64 = LLT::vector(2, 64);
203 const LLT V3S64 = LLT::vector(3, 64);
204 const LLT V4S64 = LLT::vector(4, 64);
205 const LLT V5S64 = LLT::vector(5, 64);
206 const LLT V6S64 = LLT::vector(6, 64);
207 const LLT V7S64 = LLT::vector(7, 64);
208 const LLT V8S64 = LLT::vector(8, 64);
209 const LLT V16S64 = LLT::vector(16, 64);
210
211 std::initializer_list<LLT> AllS32Vectors =
212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214 std::initializer_list<LLT> AllS64Vectors =
215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216
217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224
225 const LLT CodePtr = FlatPtr;
226
227 const std::initializer_list<LLT> AddrSpaces64 = {
228 GlobalPtr, ConstantPtr, FlatPtr
229 };
230
231 const std::initializer_list<LLT> AddrSpaces32 = {
232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233 };
234
235 const std::initializer_list<LLT> FPTypesBase = {
236 S32, S64
237 };
238
239 const std::initializer_list<LLT> FPTypes16 = {
240 S32, S64, S16
241 };
242
243 const std::initializer_list<LLT> FPTypesPK16 = {
244 S32, S64, S16, V2S16
245 };
246
247 setAction({G_BRCOND, S1}, Legal); // VCC branches
248 setAction({G_BRCOND, S32}, Legal); // SCC branches
249
250 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
251 // elements for v3s16
252 getActionDefinitionsBuilder(G_PHI)
253 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
254 .legalFor(AllS32Vectors)
255 .legalFor(AllS64Vectors)
256 .legalFor(AddrSpaces64)
257 .legalFor(AddrSpaces32)
258 .clampScalar(0, S32, S256)
259 .widenScalarToNextPow2(0, 32)
260 .clampMaxNumElements(0, S32, 16)
261 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
262 .legalIf(isPointer(0));
263
264 if (ST.has16BitInsts()) {
265 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
266 .legalFor({S32, S16})
267 .clampScalar(0, S16, S32)
268 .scalarize(0);
269 } else {
270 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
271 .legalFor({S32})
272 .clampScalar(0, S32, S32)
273 .scalarize(0);
274 }
275
276 // FIXME: Not really legal. Placeholder for custom lowering.
277 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
278 .legalFor({S32, S64})
279 .clampScalar(0, S32, S64)
280 .widenScalarToNextPow2(0, 32)
281 .scalarize(0);
282
283 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
284 .legalFor({S32})
285 .clampScalar(0, S32, S32)
286 .scalarize(0);
287
288 // Report legal for any types we can handle anywhere. For the cases only legal
289 // on the SALU, RegBankSelect will be able to re-legalize.
290 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
291 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
292 .clampScalar(0, S32, S64)
293 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
295 .widenScalarToNextPow2(0)
296 .scalarize(0);
297
298 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
299 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
300 .legalFor({{S32, S1}, {S32, S32}})
301 .clampScalar(0, S32, S32)
302 .scalarize(0); // TODO: Implement.
303
304 getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
305 .lower();
306
307 getActionDefinitionsBuilder(G_BITCAST)
308 // Don't worry about the size constraint.
309 .legalIf(all(isRegisterType(0), isRegisterType(1)))
310 // FIXME: Testing hack
311 .legalForCartesianProduct({S16, LLT::vector(2, 8), });
312
313 getActionDefinitionsBuilder(G_FCONSTANT)
314 .legalFor({S32, S64, S16})
315 .clampScalar(0, S16, S64);
316
317 getActionDefinitionsBuilder(G_IMPLICIT_DEF)
318 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
319 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
320 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
321 .clampScalarOrElt(0, S32, S1024)
322 .legalIf(isMultiple32(0))
323 .widenScalarToNextPow2(0, 32)
324 .clampMaxNumElements(0, S32, 16);
325
326
327 // FIXME: i1 operands to intrinsics should always be legal, but other i1
328 // values may not be legal. We need to figure out how to distinguish
329 // between these two scenarios.
330 getActionDefinitionsBuilder(G_CONSTANT)
331 .legalFor({S1, S32, S64, S16, GlobalPtr,
332 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
333 .clampScalar(0, S32, S64)
334 .widenScalarToNextPow2(0)
335 .legalIf(isPointer(0));
336
337 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
338 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
339 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
340
341
342 auto &FPOpActions = getActionDefinitionsBuilder(
343 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
344 .legalFor({S32, S64});
345 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
346 .customFor({S32, S64});
347 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
348 .customFor({S32, S64});
349
350 if (ST.has16BitInsts()) {
351 if (ST.hasVOP3PInsts())
352 FPOpActions.legalFor({S16, V2S16});
353 else
354 FPOpActions.legalFor({S16});
355
356 TrigActions.customFor({S16});
357 FDIVActions.customFor({S16});
358 }
359
360 auto &MinNumMaxNum = getActionDefinitionsBuilder({
361 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
362
363 if (ST.hasVOP3PInsts()) {
364 MinNumMaxNum.customFor(FPTypesPK16)
365 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366 .clampMaxNumElements(0, S16, 2)
367 .clampScalar(0, S16, S64)
368 .scalarize(0);
369 } else if (ST.has16BitInsts()) {
370 MinNumMaxNum.customFor(FPTypes16)
371 .clampScalar(0, S16, S64)
372 .scalarize(0);
373 } else {
374 MinNumMaxNum.customFor(FPTypesBase)
375 .clampScalar(0, S32, S64)
376 .scalarize(0);
377 }
378
379 if (ST.hasVOP3PInsts())
380 FPOpActions.clampMaxNumElements(0, S16, 2);
381
382 FPOpActions
383 .scalarize(0)
384 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385
386 TrigActions
387 .scalarize(0)
388 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
389
390 FDIVActions
391 .scalarize(0)
392 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
393
394 getActionDefinitionsBuilder({G_FNEG, G_FABS})
395 .legalFor(FPTypesPK16)
396 .clampMaxNumElements(0, S16, 2)
397 .scalarize(0)
398 .clampScalar(0, S16, S64);
399
400 // TODO: Implement
401 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
402
403 if (ST.has16BitInsts()) {
404 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
405 .legalFor({S32, S64, S16})
406 .scalarize(0)
407 .clampScalar(0, S16, S64);
408 } else {
409 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
410 .legalFor({S32, S64})
411 .scalarize(0)
412 .clampScalar(0, S32, S64);
413 }
414
415 getActionDefinitionsBuilder(G_FPTRUNC)
416 .legalFor({{S32, S64}, {S16, S32}})
417 .scalarize(0);
418
419 getActionDefinitionsBuilder(G_FPEXT)
420 .legalFor({{S64, S32}, {S32, S16}})
421 .lowerFor({{S64, S16}}) // FIXME: Implement
422 .scalarize(0);
423
424 // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
425 getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
426
427 getActionDefinitionsBuilder(G_FSUB)
428 // Use actual fsub instruction
429 .legalFor({S32})
430 // Must use fadd + fneg
431 .lowerFor({S64, S16, V2S16})
432 .scalarize(0)
433 .clampScalar(0, S32, S64);
434
435 // Whether this is legal depends on the floating point mode for the function.
436 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
437 if (ST.hasMadF16())
438 FMad.customFor({S32, S16});
439 else
440 FMad.customFor({S32});
441 FMad.scalarize(0)
442 .lower();
443
444 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
445 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
446 {S32, S1}, {S64, S1}, {S16, S1},
447 {S96, S32},
448 // FIXME: Hack
449 {S64, LLT::scalar(33)},
450 {S32, S8}, {S32, LLT::scalar(24)}})
451 .scalarize(0)
452 .clampScalar(0, S32, S64);
453
454 // TODO: Split s1->s64 during regbankselect for VALU.
455 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
456 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
457 .lowerFor({{S32, S64}})
458 .lowerIf(typeIs(1, S1))
459 .customFor({{S64, S64}});
460 if (ST.has16BitInsts())
461 IToFP.legalFor({{S16, S16}});
462 IToFP.clampScalar(1, S32, S64)
463 .scalarize(0);
464
465 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
466 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
467 if (ST.has16BitInsts())
468 FPToI.legalFor({{S16, S16}});
469 else
470 FPToI.minScalar(1, S32);
471
472 FPToI.minScalar(0, S32)
473 .scalarize(0);
474
475 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
476 .scalarize(0)
477 .lower();
478
479 if (ST.has16BitInsts()) {
480 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
481 .legalFor({S16, S32, S64})
482 .clampScalar(0, S16, S64)
483 .scalarize(0);
484 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
485 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
486 .legalFor({S32, S64})
487 .clampScalar(0, S32, S64)
488 .scalarize(0);
489 } else {
490 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
491 .legalFor({S32})
492 .customFor({S64})
493 .clampScalar(0, S32, S64)
494 .scalarize(0);
495 }
496
497 getActionDefinitionsBuilder(G_PTR_ADD)
498 .legalForCartesianProduct(AddrSpaces64, {S64})
499 .legalForCartesianProduct(AddrSpaces32, {S32})
500 .scalarize(0);
501
502 getActionDefinitionsBuilder(G_PTR_MASK)
503 .scalarize(0)
504 .alwaysLegal();
505
506 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
507
508 auto &CmpBuilder =
509 getActionDefinitionsBuilder(G_ICMP)
510 // The compare output type differs based on the register bank of the output,
511 // so make both s1 and s32 legal.
512 //
513 // Scalar compares producing output in scc will be promoted to s32, as that
514 // is the allocatable register type that will be needed for the copy from
515 // scc. This will be promoted during RegBankSelect, and we assume something
516 // before that won't try to use s32 result types.
517 //
518 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
519 // bank.
520 .legalForCartesianProduct(
521 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
522 .legalForCartesianProduct(
523 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
524 if (ST.has16BitInsts()) {
525 CmpBuilder.legalFor({{S1, S16}});
526 }
527
528 CmpBuilder
529 .widenScalarToNextPow2(1)
530 .clampScalar(1, S32, S64)
531 .scalarize(0)
532 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
533
534 getActionDefinitionsBuilder(G_FCMP)
535 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
536 .widenScalarToNextPow2(1)
537 .clampScalar(1, S32, S64)
538 .scalarize(0);
539
540 // FIXME: fexp, flog2, flog10 needs to be custom lowered.
541 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
542 G_FLOG, G_FLOG2, G_FLOG10})
543 .legalFor({S32})
544 .scalarize(0);
545
546 // The 64-bit versions produce 32-bit results, but only on the SALU.
547 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
548 G_CTTZ, G_CTTZ_ZERO_UNDEF,
549 G_CTPOP})
550 .legalFor({{S32, S32}, {S32, S64}})
551 .clampScalar(0, S32, S32)
552 .clampScalar(1, S32, S64)
553 .scalarize(0)
554 .widenScalarToNextPow2(0, 32)
555 .widenScalarToNextPow2(1, 32);
556
557 // TODO: Expand for > s32
558 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
559 .legalFor({S32})
560 .clampScalar(0, S32, S32)
561 .scalarize(0);
562
563 if (ST.has16BitInsts()) {
564 if (ST.hasVOP3PInsts()) {
565 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
566 .legalFor({S32, S16, V2S16})
567 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
568 .clampMaxNumElements(0, S16, 2)
569 .clampScalar(0, S16, S32)
570 .widenScalarToNextPow2(0)
571 .scalarize(0);
572 } else {
573 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
574 .legalFor({S32, S16})
575 .widenScalarToNextPow2(0)
576 .clampScalar(0, S16, S32)
577 .scalarize(0);
578 }
579 } else {
580 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
581 .legalFor({S32})
582 .clampScalar(0, S32, S32)
583 .widenScalarToNextPow2(0)
584 .scalarize(0);
585 }
586
587 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
588 return [=](const LegalityQuery &Query) {
589 return Query.Types[TypeIdx0].getSizeInBits() <
590 Query.Types[TypeIdx1].getSizeInBits();
591 };
592 };
593
594 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
595 return [=](const LegalityQuery &Query) {
596 return Query.Types[TypeIdx0].getSizeInBits() >
597 Query.Types[TypeIdx1].getSizeInBits();
598 };
599 };
600
601 getActionDefinitionsBuilder(G_INTTOPTR)
602 // List the common cases
603 .legalForCartesianProduct(AddrSpaces64, {S64})
604 .legalForCartesianProduct(AddrSpaces32, {S32})
605 .scalarize(0)
606 // Accept any address space as long as the size matches
607 .legalIf(sameSize(0, 1))
608 .widenScalarIf(smallerThan(1, 0),
609 [](const LegalityQuery &Query) {
610 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
611 })
612 .narrowScalarIf(greaterThan(1, 0),
613 [](const LegalityQuery &Query) {
614 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
615 });
616
617 getActionDefinitionsBuilder(G_PTRTOINT)
618 // List the common cases
619 .legalForCartesianProduct(AddrSpaces64, {S64})
620 .legalForCartesianProduct(AddrSpaces32, {S32})
621 .scalarize(0)
622 // Accept any address space as long as the size matches
623 .legalIf(sameSize(0, 1))
624 .widenScalarIf(smallerThan(0, 1),
625 [](const LegalityQuery &Query) {
626 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
627 })
628 .narrowScalarIf(
629 greaterThan(0, 1),
630 [](const LegalityQuery &Query) {
631 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
632 });
633
634 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
635 .scalarize(0)
636 .custom();
637
638 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
639 // handle some operations by just promoting the register during
640 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
641 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
642 switch (AS) {
643 // FIXME: Private element size.
644 case AMDGPUAS::PRIVATE_ADDRESS:
645 return 32;
646 // FIXME: Check subtarget
647 case AMDGPUAS::LOCAL_ADDRESS:
648 return ST.useDS128() ? 128 : 64;
649
650 // Treat constant and global as identical. SMRD loads are sometimes usable
651 // for global loads (ideally constant address space should be eliminated)
652 // depending on the context. Legality cannot be context dependent, but
653 // RegBankSelect can split the load as necessary depending on the pointer
654 // register bank/uniformity and if the memory is invariant or not written in
655 // a kernel.
656 case AMDGPUAS::CONSTANT_ADDRESS:
657 case AMDGPUAS::GLOBAL_ADDRESS:
658 return 512;
659 default:
660 return 128;
661 }
662 };
663
664 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
665 const LLT DstTy = Query.Types[0];
666
667 // Split vector extloads.
668 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
669 unsigned Align = Query.MMODescrs[0].AlignInBits;
670
671 if (MemSize < DstTy.getSizeInBits())
672 MemSize = std::max(MemSize, Align);
673
674 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
675 return true;
676
677 const LLT PtrTy = Query.Types[1];
678 unsigned AS = PtrTy.getAddressSpace();
679 if (MemSize > maxSizeForAddrSpace(AS))
680 return true;
681
682 // Catch weird sized loads that don't evenly divide into the access sizes
683 // TODO: May be able to widen depending on alignment etc.
684 unsigned NumRegs = MemSize / 32;
685 if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
686 return true;
687
688 if (Align < MemSize) {
689 const SITargetLowering *TLI = ST.getTargetLowering();
690 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
691 }
692
693 return false;
694 };
695
696 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
697 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
698 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
699
700 // TODO: Refine based on subtargets which support unaligned access or 128-bit
701 // LDS
702 // TODO: Unsupported flat for SI.
703
704 for (unsigned Op : {G_LOAD, G_STORE}) {
705 const bool IsStore = Op == G_STORE;
706
707 auto &Actions = getActionDefinitionsBuilder(Op);
708 // Whitelist the common cases.
709 // TODO: Pointer loads
710 // TODO: Wide constant loads
711 // TODO: Only CI+ has 3x loads
712 // TODO: Loads to s16 on gfx9
713 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
714 {V2S32, GlobalPtr, 64, GlobalAlign32},
715 {V3S32, GlobalPtr, 96, GlobalAlign32},
716 {S96, GlobalPtr, 96, GlobalAlign32},
717 {V4S32, GlobalPtr, 128, GlobalAlign32},
718 {S128, GlobalPtr, 128, GlobalAlign32},
719 {S64, GlobalPtr, 64, GlobalAlign32},
720 {V2S64, GlobalPtr, 128, GlobalAlign32},
721 {V2S16, GlobalPtr, 32, GlobalAlign32},
722 {S32, GlobalPtr, 8, GlobalAlign8},
723 {S32, GlobalPtr, 16, GlobalAlign16},
724
725 {S32, LocalPtr, 32, 32},
726 {S64, LocalPtr, 64, 32},
727 {V2S32, LocalPtr, 64, 32},
728 {S32, LocalPtr, 8, 8},
729 {S32, LocalPtr, 16, 16},
730 {V2S16, LocalPtr, 32, 32},
731
732 {S32, PrivatePtr, 32, 32},
733 {S32, PrivatePtr, 8, 8},
734 {S32, PrivatePtr, 16, 16},
735 {V2S16, PrivatePtr, 32, 32},
736
737 {S32, FlatPtr, 32, GlobalAlign32},
738 {S32, FlatPtr, 16, GlobalAlign16},
739 {S32, FlatPtr, 8, GlobalAlign8},
740 {V2S16, FlatPtr, 32, GlobalAlign32},
741
742 {S32, ConstantPtr, 32, GlobalAlign32},
743 {V2S32, ConstantPtr, 64, GlobalAlign32},
744 {V3S32, ConstantPtr, 96, GlobalAlign32},
745 {V4S32, ConstantPtr, 128, GlobalAlign32},
746 {S64, ConstantPtr, 64, GlobalAlign32},
747 {S128, ConstantPtr, 128, GlobalAlign32},
748 {V2S32, ConstantPtr, 32, GlobalAlign32}});
749 Actions
750 .customIf(typeIs(1, Constant32Ptr))
751 .narrowScalarIf(
752 [=](const LegalityQuery &Query) -> bool {
753 return !Query.Types[0].isVector() && needToSplitLoad(Query);
754 },
755 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
756 const LLT DstTy = Query.Types[0];
757 const LLT PtrTy = Query.Types[1];
758
759 const unsigned DstSize = DstTy.getSizeInBits();
760 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
761
762 // Split extloads.
763 if (DstSize > MemSize)
764 return std::make_pair(0, LLT::scalar(MemSize));
765
766 if (DstSize > 32 && (DstSize % 32 != 0)) {
767 // FIXME: Need a way to specify non-extload of larger size if
768 // suitably aligned.
769 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
770 }
771
772 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
773 if (MemSize > MaxSize)
774 return std::make_pair(0, LLT::scalar(MaxSize));
775
776 unsigned Align = Query.MMODescrs[0].AlignInBits;
777 return std::make_pair(0, LLT::scalar(Align));
778 })
779 .fewerElementsIf(
780 [=](const LegalityQuery &Query) -> bool {
781 return Query.Types[0].isVector() && needToSplitLoad(Query);
782 },
783 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
784 const LLT DstTy = Query.Types[0];
785 const LLT PtrTy = Query.Types[1];
786
787 LLT EltTy = DstTy.getElementType();
788 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
789
790 // Split if it's too large for the address space.
791 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
792 unsigned NumElts = DstTy.getNumElements();
793 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
794
795 // FIXME: Refine when odd breakdowns handled
796 // The scalars will need to be re-legalized.
797 if (NumPieces == 1 || NumPieces >= NumElts ||
798 NumElts % NumPieces != 0)
799 return std::make_pair(0, EltTy);
800
801 return std::make_pair(0,
802 LLT::vector(NumElts / NumPieces, EltTy));
803 }
804
805 // Need to split because of alignment.
806 unsigned Align = Query.MMODescrs[0].AlignInBits;
807 unsigned EltSize = EltTy.getSizeInBits();
808 if (EltSize > Align &&
809 (EltSize / Align < DstTy.getNumElements())) {
810 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
811 }
812
813 // May need relegalization for the scalars.
814 return std::make_pair(0, EltTy);
815 })
816 .minScalar(0, S32);
817
818 if (IsStore)
819 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
820
821 // TODO: Need a bitcast lower option?
822 Actions
823 .legalIf([=](const LegalityQuery &Query) {
824 const LLT Ty0 = Query.Types[0];
825 unsigned Size = Ty0.getSizeInBits();
826 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
827 unsigned Align = Query.MMODescrs[0].AlignInBits;
828
829 // FIXME: Widening store from alignment not valid.
830 if (MemSize < Size)
831 MemSize = std::max(MemSize, Align);
832
833 // No extending vector loads.
834 if (Size > MemSize && Ty0.isVector())
835 return false;
836
837 switch (MemSize) {
838 case 8:
839 case 16:
840 return Size == 32;
841 case 32:
842 case 64:
843 case 128:
844 return true;
845 case 96:
846 return ST.hasDwordx3LoadStores();
847 case 256:
848 case 512:
849 return true;
850 default:
851 return false;
852 }
853 })
854 .widenScalarToNextPow2(0)
855 // TODO: v3s32->v4s32 with alignment
856 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
857 }
858
859 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
860 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
861 {S32, GlobalPtr, 16, 2 * 8},
862 {S32, LocalPtr, 8, 8},
863 {S32, LocalPtr, 16, 16},
864 {S32, PrivatePtr, 8, 8},
865 {S32, PrivatePtr, 16, 16},
866 {S32, ConstantPtr, 8, 8},
867 {S32, ConstantPtr, 16, 2 * 8}});
868 if (ST.hasFlatAddressSpace()) {
869 ExtLoads.legalForTypesWithMemDesc(
870 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
871 }
872
873 ExtLoads.clampScalar(0, S32, S32)
874 .widenScalarToNextPow2(0)
875 .unsupportedIfMemSizeNotPow2()
876 .lower();
877
878 auto &Atomics = getActionDefinitionsBuilder(
879 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
880 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
881 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
882 G_ATOMICRMW_UMIN})
883 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
884 {S64, GlobalPtr}, {S64, LocalPtr}});
885 if (ST.hasFlatAddressSpace()) {
886 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
887 }
888
889 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
890 .legalFor({{S32, LocalPtr}});
891
892 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
893 // demarshalling
894 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
895 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
896 {S32, FlatPtr}, {S64, FlatPtr}})
897 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
898 {S32, RegionPtr}, {S64, RegionPtr}});
899
900 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
901 .lower();
902
903 // TODO: Pointer types, any 32-bit or 64-bit vector
904
905 // Condition should be s32 for scalar, s1 for vector.
906 getActionDefinitionsBuilder(G_SELECT)
907 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
908 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
909 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
910 .clampScalar(0, S16, S64)
911 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
912 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
913 .scalarize(1)
914 .clampMaxNumElements(0, S32, 2)
915 .clampMaxNumElements(0, LocalPtr, 2)
916 .clampMaxNumElements(0, PrivatePtr, 2)
917 .scalarize(0)
918 .widenScalarToNextPow2(0)
919 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
920
921 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
922 // be more flexible with the shift amount type.
923 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
924 .legalFor({{S32, S32}, {S64, S32}});
925 if (ST.has16BitInsts()) {
926 if (ST.hasVOP3PInsts()) {
927 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
928 .clampMaxNumElements(0, S16, 2);
929 } else
930 Shifts.legalFor({{S16, S32}, {S16, S16}});
931
932 // TODO: Support 16-bit shift amounts
933 Shifts.clampScalar(1, S32, S32);
934 Shifts.clampScalar(0, S16, S64);
935 Shifts.widenScalarToNextPow2(0, 16);
936 } else {
937 // Make sure we legalize the shift amount type first, as the general
938 // expansion for the shifted type will produce much worse code if it hasn't
939 // been truncated already.
940 Shifts.clampScalar(1, S32, S32);
941 Shifts.clampScalar(0, S32, S64);
942 Shifts.widenScalarToNextPow2(0, 32);
943 }
944 Shifts.scalarize(0);
945
946 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
947 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
948 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
949 unsigned IdxTypeIdx = 2;
950
951 getActionDefinitionsBuilder(Op)
952 .customIf([=](const LegalityQuery &Query) {
953 const LLT EltTy = Query.Types[EltTypeIdx];
954 const LLT VecTy = Query.Types[VecTypeIdx];
955 const LLT IdxTy = Query.Types[IdxTypeIdx];
956 return (EltTy.getSizeInBits() == 16 ||
957 EltTy.getSizeInBits() % 32 == 0) &&
958 VecTy.getSizeInBits() % 32 == 0 &&
959 VecTy.getSizeInBits() <= 1024 &&
960 IdxTy.getSizeInBits() == 32;
961 })
962 .clampScalar(EltTypeIdx, S32, S64)
963 .clampScalar(VecTypeIdx, S32, S64)
964 .clampScalar(IdxTypeIdx, S32, S32);
965 }
966
967 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
968 .unsupportedIf([=](const LegalityQuery &Query) {
969 const LLT &EltTy = Query.Types[1].getElementType();
970 return Query.Types[0] != EltTy;
971 });
972
973 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
974 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
975 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
976
977 // FIXME: Doesn't handle extract of illegal sizes.
978 getActionDefinitionsBuilder(Op)
979 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
980 // FIXME: Multiples of 16 should not be legal.
981 .legalIf([=](const LegalityQuery &Query) {
982 const LLT BigTy = Query.Types[BigTyIdx];
983 const LLT LitTy = Query.Types[LitTyIdx];
984 return (BigTy.getSizeInBits() % 32 == 0) &&
985 (LitTy.getSizeInBits() % 16 == 0);
986 })
987 .widenScalarIf(
988 [=](const LegalityQuery &Query) {
989 const LLT BigTy = Query.Types[BigTyIdx];
990 return (BigTy.getScalarSizeInBits() < 16);
991 },
992 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
993 .widenScalarIf(
994 [=](const LegalityQuery &Query) {
995 const LLT LitTy = Query.Types[LitTyIdx];
996 return (LitTy.getScalarSizeInBits() < 16);
997 },
998 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
999 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1000 .widenScalarToNextPow2(BigTyIdx, 32);
1001
1002 }
1003
1004 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1005 .legalForCartesianProduct(AllS32Vectors, {S32})
1006 .legalForCartesianProduct(AllS64Vectors, {S64})
1007 .clampNumElements(0, V16S32, V32S32)
1008 .clampNumElements(0, V2S64, V16S64)
1009 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1010
1011 if (ST.hasScalarPackInsts())
1012 BuildVector.legalFor({V2S16, S32});
1013
1014 BuildVector
1015 .minScalarSameAs(1, 0)
1016 .legalIf(isRegisterType(0))
1017 .minScalarOrElt(0, S32);
1018
1019 if (ST.hasScalarPackInsts()) {
1020 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1021 .legalFor({V2S16, S32})
1022 .lower();
1023 } else {
1024 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1025 .lower();
1026 }
1027
1028 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1029 .legalIf(isRegisterType(0));
1030
1031 // TODO: Don't fully scalarize v2s16 pieces
1032 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1033
1034 // Merge/Unmerge
1035 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1036 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1037 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1038
1039 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1040 const LLT &Ty = Query.Types[TypeIdx];
1041 if (Ty.isVector()) {
1042 const LLT &EltTy = Ty.getElementType();
1043 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1044 return true;
1045 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1046 return true;
1047 }
1048 return false;
1049 };
1050
1051 auto &Builder = getActionDefinitionsBuilder(Op)
1052 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1053 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1054 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1055 // valid.
1056 .clampScalar(LitTyIdx, S16, S256)
1057 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1058 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1059 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1060 elementTypeIs(1, S16)),
1061 changeTo(1, V2S16))
1062 // Break up vectors with weird elements into scalars
1063 .fewerElementsIf(
1064 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1065 scalarize(0))
1066 .fewerElementsIf(
1067 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1068 scalarize(1))
1069 .clampScalar(BigTyIdx, S32, S1024)
1070 .lowerFor({{S16, V2S16}});
1071
1072 if (Op == G_MERGE_VALUES) {
1073 Builder.widenScalarIf(
1074 // TODO: Use 16-bit shifts if legal for 8-bit values?
1075 [=](const LegalityQuery &Query) {
1076 const LLT Ty = Query.Types[LitTyIdx];
1077 return Ty.getSizeInBits() < 32;
1078 },
1079 changeTo(LitTyIdx, S32));
1080 }
1081
1082 Builder.widenScalarIf(
1083 [=](const LegalityQuery &Query) {
1084 const LLT Ty = Query.Types[BigTyIdx];
1085 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1086 Ty.getSizeInBits() % 16 != 0;
1087 },
1088 [=](const LegalityQuery &Query) {
1089 // Pick the next power of 2, or a multiple of 64 over 128.
1090 // Whichever is smaller.
1091 const LLT &Ty = Query.Types[BigTyIdx];
1092 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1093 if (NewSizeInBits >= 256) {
1094 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1095 if (RoundedTo < NewSizeInBits)
1096 NewSizeInBits = RoundedTo;
1097 }
1098 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1099 })
1100 .legalIf([=](const LegalityQuery &Query) {
1101 const LLT &BigTy = Query.Types[BigTyIdx];
1102 const LLT &LitTy = Query.Types[LitTyIdx];
1103
1104 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1105 return false;
1106 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1107 return false;
1108
1109 return BigTy.getSizeInBits() % 16 == 0 &&
1110 LitTy.getSizeInBits() % 16 == 0 &&
1111 BigTy.getSizeInBits() <= 1024;
1112 })
1113 // Any vectors left are the wrong size. Scalarize them.
1114 .scalarize(0)
1115 .scalarize(1);
1116 }
1117
1118 getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1119
1120 getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();
1121
1122 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1123 .legalFor({S64});
1124
1125 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1126 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1127 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1128 .unsupported();
1129
1130 computeTables();
1131 verify(*ST.getInstrInfo());
1132 }
1133
legalizeCustom(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,GISelChangeObserver & Observer) const1134 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1135 MachineRegisterInfo &MRI,
1136 MachineIRBuilder &B,
1137 GISelChangeObserver &Observer) const {
1138 switch (MI.getOpcode()) {
1139 case TargetOpcode::G_ADDRSPACE_CAST:
1140 return legalizeAddrSpaceCast(MI, MRI, B);
1141 case TargetOpcode::G_FRINT:
1142 return legalizeFrint(MI, MRI, B);
1143 case TargetOpcode::G_FCEIL:
1144 return legalizeFceil(MI, MRI, B);
1145 case TargetOpcode::G_INTRINSIC_TRUNC:
1146 return legalizeIntrinsicTrunc(MI, MRI, B);
1147 case TargetOpcode::G_SITOFP:
1148 return legalizeITOFP(MI, MRI, B, true);
1149 case TargetOpcode::G_UITOFP:
1150 return legalizeITOFP(MI, MRI, B, false);
1151 case TargetOpcode::G_FMINNUM:
1152 case TargetOpcode::G_FMAXNUM:
1153 case TargetOpcode::G_FMINNUM_IEEE:
1154 case TargetOpcode::G_FMAXNUM_IEEE:
1155 return legalizeMinNumMaxNum(MI, MRI, B);
1156 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1157 return legalizeExtractVectorElt(MI, MRI, B);
1158 case TargetOpcode::G_INSERT_VECTOR_ELT:
1159 return legalizeInsertVectorElt(MI, MRI, B);
1160 case TargetOpcode::G_FSIN:
1161 case TargetOpcode::G_FCOS:
1162 return legalizeSinCos(MI, MRI, B);
1163 case TargetOpcode::G_GLOBAL_VALUE:
1164 return legalizeGlobalValue(MI, MRI, B);
1165 case TargetOpcode::G_LOAD:
1166 return legalizeLoad(MI, MRI, B, Observer);
1167 case TargetOpcode::G_FMAD:
1168 return legalizeFMad(MI, MRI, B);
1169 case TargetOpcode::G_FDIV:
1170 return legalizeFDIV(MI, MRI, B);
1171 case TargetOpcode::G_ATOMIC_CMPXCHG:
1172 return legalizeAtomicCmpXChg(MI, MRI, B);
1173 default:
1174 return false;
1175 }
1176
1177 llvm_unreachable("expected switch to return");
1178 }
1179
getSegmentAperture(unsigned AS,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1180 Register AMDGPULegalizerInfo::getSegmentAperture(
1181 unsigned AS,
1182 MachineRegisterInfo &MRI,
1183 MachineIRBuilder &B) const {
1184 MachineFunction &MF = B.getMF();
1185 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1186 const LLT S32 = LLT::scalar(32);
1187
1188 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1189
1190 if (ST.hasApertureRegs()) {
1191 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1192 // getreg.
1193 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1194 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1195 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1196 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1197 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1198 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1199 unsigned Encoding =
1200 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1201 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1202 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1203
1204 Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1205 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1206
1207 B.buildInstr(AMDGPU::S_GETREG_B32)
1208 .addDef(GetReg)
1209 .addImm(Encoding);
1210 MRI.setType(GetReg, S32);
1211
1212 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1213 B.buildInstr(TargetOpcode::G_SHL)
1214 .addDef(ApertureReg)
1215 .addUse(GetReg)
1216 .addUse(ShiftAmt.getReg(0));
1217
1218 return ApertureReg;
1219 }
1220
1221 Register QueuePtr = MRI.createGenericVirtualRegister(
1222 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1223
1224 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1225 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1226 return Register();
1227
1228 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1229 // private_segment_aperture_base_hi.
1230 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1231
1232 // TODO: can we be smarter about machine pointer info?
1233 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1234 MachineMemOperand *MMO = MF.getMachineMemOperand(
1235 PtrInfo,
1236 MachineMemOperand::MOLoad |
1237 MachineMemOperand::MODereferenceable |
1238 MachineMemOperand::MOInvariant,
1239 4,
1240 MinAlign(64, StructOffset));
1241
1242 Register LoadResult = MRI.createGenericVirtualRegister(S32);
1243 Register LoadAddr;
1244
1245 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1246 B.buildLoad(LoadResult, LoadAddr, *MMO);
1247 return LoadResult;
1248 }
1249
legalizeAddrSpaceCast(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1250 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1251 MachineInstr &MI, MachineRegisterInfo &MRI,
1252 MachineIRBuilder &B) const {
1253 MachineFunction &MF = B.getMF();
1254
1255 B.setInstr(MI);
1256
1257 const LLT S32 = LLT::scalar(32);
1258 Register Dst = MI.getOperand(0).getReg();
1259 Register Src = MI.getOperand(1).getReg();
1260
1261 LLT DstTy = MRI.getType(Dst);
1262 LLT SrcTy = MRI.getType(Src);
1263 unsigned DestAS = DstTy.getAddressSpace();
1264 unsigned SrcAS = SrcTy.getAddressSpace();
1265
1266 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1267 // vector element.
1268 assert(!DstTy.isVector());
1269
1270 const AMDGPUTargetMachine &TM
1271 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1272
1273 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1274 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1275 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1276 return true;
1277 }
1278
1279 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1280 // Truncate.
1281 B.buildExtract(Dst, Src, 0);
1282 MI.eraseFromParent();
1283 return true;
1284 }
1285
1286 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1287 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1288 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1289
1290 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1291 // another. Merge operands are required to be the same type, but creating an
1292 // extra ptrtoint would be kind of pointless.
1293 auto HighAddr = B.buildConstant(
1294 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1295 B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1296 MI.eraseFromParent();
1297 return true;
1298 }
1299
1300 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1301 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1302 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1303 unsigned NullVal = TM.getNullPointerValue(DestAS);
1304
1305 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1306 auto FlatNull = B.buildConstant(SrcTy, 0);
1307
1308 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1309
1310 // Extract low 32-bits of the pointer.
1311 B.buildExtract(PtrLo32, Src, 0);
1312
1313 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1314 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1315 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1316
1317 MI.eraseFromParent();
1318 return true;
1319 }
1320
1321 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1322 return false;
1323
1324 if (!ST.hasFlatAddressSpace())
1325 return false;
1326
1327 auto SegmentNull =
1328 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1329 auto FlatNull =
1330 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1331
1332 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1333 if (!ApertureReg.isValid())
1334 return false;
1335
1336 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1337 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1338
1339 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1340
1341 // Coerce the type of the low half of the result so we can use merge_values.
1342 Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1343 B.buildInstr(TargetOpcode::G_PTRTOINT)
1344 .addDef(SrcAsInt)
1345 .addUse(Src);
1346
1347 // TODO: Should we allow mismatched types but matching sizes in merges to
1348 // avoid the ptrtoint?
1349 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1350 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1351
1352 MI.eraseFromParent();
1353 return true;
1354 }
1355
legalizeFrint(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1356 bool AMDGPULegalizerInfo::legalizeFrint(
1357 MachineInstr &MI, MachineRegisterInfo &MRI,
1358 MachineIRBuilder &B) const {
1359 B.setInstr(MI);
1360
1361 Register Src = MI.getOperand(1).getReg();
1362 LLT Ty = MRI.getType(Src);
1363 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1364
1365 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1366 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1367
1368 auto C1 = B.buildFConstant(Ty, C1Val);
1369 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1370
1371 // TODO: Should this propagate fast-math-flags?
1372 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1373 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1374
1375 auto C2 = B.buildFConstant(Ty, C2Val);
1376 auto Fabs = B.buildFAbs(Ty, Src);
1377
1378 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1379 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1380 return true;
1381 }
1382
legalizeFceil(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1383 bool AMDGPULegalizerInfo::legalizeFceil(
1384 MachineInstr &MI, MachineRegisterInfo &MRI,
1385 MachineIRBuilder &B) const {
1386 B.setInstr(MI);
1387
1388 const LLT S1 = LLT::scalar(1);
1389 const LLT S64 = LLT::scalar(64);
1390
1391 Register Src = MI.getOperand(1).getReg();
1392 assert(MRI.getType(Src) == S64);
1393
1394 // result = trunc(src)
1395 // if (src > 0.0 && src != result)
1396 // result += 1.0
1397
1398 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1399
1400 const auto Zero = B.buildFConstant(S64, 0.0);
1401 const auto One = B.buildFConstant(S64, 1.0);
1402 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1403 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1404 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1405 auto Add = B.buildSelect(S64, And, One, Zero);
1406
1407 // TODO: Should this propagate fast-math-flags?
1408 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1409 return true;
1410 }
1411
extractF64Exponent(unsigned Hi,MachineIRBuilder & B)1412 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1413 MachineIRBuilder &B) {
1414 const unsigned FractBits = 52;
1415 const unsigned ExpBits = 11;
1416 LLT S32 = LLT::scalar(32);
1417
1418 auto Const0 = B.buildConstant(S32, FractBits - 32);
1419 auto Const1 = B.buildConstant(S32, ExpBits);
1420
1421 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1422 .addUse(Const0.getReg(0))
1423 .addUse(Const1.getReg(0));
1424
1425 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1426 }
1427
legalizeIntrinsicTrunc(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1428 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1429 MachineInstr &MI, MachineRegisterInfo &MRI,
1430 MachineIRBuilder &B) const {
1431 B.setInstr(MI);
1432
1433 const LLT S1 = LLT::scalar(1);
1434 const LLT S32 = LLT::scalar(32);
1435 const LLT S64 = LLT::scalar(64);
1436
1437 Register Src = MI.getOperand(1).getReg();
1438 assert(MRI.getType(Src) == S64);
1439
1440 // TODO: Should this use extract since the low half is unused?
1441 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1442 Register Hi = Unmerge.getReg(1);
1443
1444 // Extract the upper half, since this is where we will find the sign and
1445 // exponent.
1446 auto Exp = extractF64Exponent(Hi, B);
1447
1448 const unsigned FractBits = 52;
1449
1450 // Extract the sign bit.
1451 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1452 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1453
1454 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1455
1456 const auto Zero32 = B.buildConstant(S32, 0);
1457
1458 // Extend back to 64-bits.
1459 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1460
1461 auto Shr = B.buildAShr(S64, FractMask, Exp);
1462 auto Not = B.buildNot(S64, Shr);
1463 auto Tmp0 = B.buildAnd(S64, Src, Not);
1464 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1465
1466 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1467 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1468
1469 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1470 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1471 return true;
1472 }
1473
legalizeITOFP(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,bool Signed) const1474 bool AMDGPULegalizerInfo::legalizeITOFP(
1475 MachineInstr &MI, MachineRegisterInfo &MRI,
1476 MachineIRBuilder &B, bool Signed) const {
1477 B.setInstr(MI);
1478
1479 Register Dst = MI.getOperand(0).getReg();
1480 Register Src = MI.getOperand(1).getReg();
1481
1482 const LLT S64 = LLT::scalar(64);
1483 const LLT S32 = LLT::scalar(32);
1484
1485 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1486
1487 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1488
1489 auto CvtHi = Signed ?
1490 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1491 B.buildUITOFP(S64, Unmerge.getReg(1));
1492
1493 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1494
1495 auto ThirtyTwo = B.buildConstant(S32, 32);
1496 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1497 .addUse(CvtHi.getReg(0))
1498 .addUse(ThirtyTwo.getReg(0));
1499
1500 // TODO: Should this propagate fast-math-flags?
1501 B.buildFAdd(Dst, LdExp, CvtLo);
1502 MI.eraseFromParent();
1503 return true;
1504 }
1505
legalizeMinNumMaxNum(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1506 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1507 MachineInstr &MI, MachineRegisterInfo &MRI,
1508 MachineIRBuilder &B) const {
1509 MachineFunction &MF = B.getMF();
1510 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1511
1512 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1513 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1514
1515 // With ieee_mode disabled, the instructions have the correct behavior
1516 // already for G_FMINNUM/G_FMAXNUM
1517 if (!MFI->getMode().IEEE)
1518 return !IsIEEEOp;
1519
1520 if (IsIEEEOp)
1521 return true;
1522
1523 MachineIRBuilder HelperBuilder(MI);
1524 GISelObserverWrapper DummyObserver;
1525 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1526 HelperBuilder.setInstr(MI);
1527 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1528 }
1529
legalizeExtractVectorElt(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1530 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1531 MachineInstr &MI, MachineRegisterInfo &MRI,
1532 MachineIRBuilder &B) const {
1533 // TODO: Should move some of this into LegalizerHelper.
1534
1535 // TODO: Promote dynamic indexing of s16 to s32
1536 // TODO: Dynamic s64 indexing is only legal for SGPR.
1537 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1538 if (!IdxVal) // Dynamic case will be selected to register indexing.
1539 return true;
1540
1541 Register Dst = MI.getOperand(0).getReg();
1542 Register Vec = MI.getOperand(1).getReg();
1543
1544 LLT VecTy = MRI.getType(Vec);
1545 LLT EltTy = VecTy.getElementType();
1546 assert(EltTy == MRI.getType(Dst));
1547
1548 B.setInstr(MI);
1549
1550 if (IdxVal.getValue() < VecTy.getNumElements())
1551 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1552 else
1553 B.buildUndef(Dst);
1554
1555 MI.eraseFromParent();
1556 return true;
1557 }
1558
legalizeInsertVectorElt(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1559 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1560 MachineInstr &MI, MachineRegisterInfo &MRI,
1561 MachineIRBuilder &B) const {
1562 // TODO: Should move some of this into LegalizerHelper.
1563
1564 // TODO: Promote dynamic indexing of s16 to s32
1565 // TODO: Dynamic s64 indexing is only legal for SGPR.
1566 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1567 if (!IdxVal) // Dynamic case will be selected to register indexing.
1568 return true;
1569
1570 Register Dst = MI.getOperand(0).getReg();
1571 Register Vec = MI.getOperand(1).getReg();
1572 Register Ins = MI.getOperand(2).getReg();
1573
1574 LLT VecTy = MRI.getType(Vec);
1575 LLT EltTy = VecTy.getElementType();
1576 assert(EltTy == MRI.getType(Ins));
1577
1578 B.setInstr(MI);
1579
1580 if (IdxVal.getValue() < VecTy.getNumElements())
1581 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1582 else
1583 B.buildUndef(Dst);
1584
1585 MI.eraseFromParent();
1586 return true;
1587 }
1588
legalizeSinCos(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const1589 bool AMDGPULegalizerInfo::legalizeSinCos(
1590 MachineInstr &MI, MachineRegisterInfo &MRI,
1591 MachineIRBuilder &B) const {
1592 B.setInstr(MI);
1593
1594 Register DstReg = MI.getOperand(0).getReg();
1595 Register SrcReg = MI.getOperand(1).getReg();
1596 LLT Ty = MRI.getType(DstReg);
1597 unsigned Flags = MI.getFlags();
1598
1599 Register TrigVal;
1600 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1601 if (ST.hasTrigReducedRange()) {
1602 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1603 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1604 .addUse(MulVal.getReg(0))
1605 .setMIFlags(Flags).getReg(0);
1606 } else
1607 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1608
1609 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1610 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1611 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1612 .addUse(TrigVal)
1613 .setMIFlags(Flags);
1614 MI.eraseFromParent();
1615 return true;
1616 }
1617
1618 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1619 Register DstReg, LLT PtrTy,
1620 MachineIRBuilder &B, const GlobalValue *GV,
1621 unsigned Offset, unsigned GAFlags) const {
1622 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1623 // to the following code sequence:
1624 //
1625 // For constant address space:
1626 // s_getpc_b64 s[0:1]
1627 // s_add_u32 s0, s0, $symbol
1628 // s_addc_u32 s1, s1, 0
1629 //
1630 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1631 // a fixup or relocation is emitted to replace $symbol with a literal
1632 // constant, which is a pc-relative offset from the encoding of the $symbol
1633 // operand to the global variable.
1634 //
1635 // For global address space:
1636 // s_getpc_b64 s[0:1]
1637 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1638 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1639 //
1640 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1641 // fixups or relocations are emitted to replace $symbol@*@lo and
1642 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1643 // which is a 64-bit pc-relative offset from the encoding of the $symbol
1644 // operand to the global variable.
1645 //
1646 // What we want here is an offset from the value returned by s_getpc
1647 // (which is the address of the s_add_u32 instruction) to the global
1648 // variable, but since the encoding of $symbol starts 4 bytes after the start
1649 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1650 // small. This requires us to add 4 to the global variable offset in order to
1651 // compute the correct address.
1652
1653 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1654
1655 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1656 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1657
1658 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1659 .addDef(PCReg);
1660
1661 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1662 if (GAFlags == SIInstrInfo::MO_NONE)
1663 MIB.addImm(0);
1664 else
1665 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1666
1667 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1668
1669 if (PtrTy.getSizeInBits() == 32)
1670 B.buildExtract(DstReg, PCReg, 0);
1671 return true;
1672 }
1673
1674 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1675 MachineInstr &MI, MachineRegisterInfo &MRI,
1676 MachineIRBuilder &B) const {
1677 Register DstReg = MI.getOperand(0).getReg();
1678 LLT Ty = MRI.getType(DstReg);
1679 unsigned AS = Ty.getAddressSpace();
1680
1681 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1682 MachineFunction &MF = B.getMF();
1683 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1684 B.setInstr(MI);
1685
1686 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1687 if (!MFI->isEntryFunction()) {
1688 const Function &Fn = MF.getFunction();
1689 DiagnosticInfoUnsupported BadLDSDecl(
1690 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1691 Fn.getContext().diagnose(BadLDSDecl);
1692 }
1693
1694 // TODO: We could emit code to handle the initialization somewhere.
1695 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1696 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1697 MI.eraseFromParent();
1698 return true;
1699 }
1700
1701 const Function &Fn = MF.getFunction();
1702 DiagnosticInfoUnsupported BadInit(
1703 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1704 Fn.getContext().diagnose(BadInit);
1705 return true;
1706 }
1707
1708 const SITargetLowering *TLI = ST.getTargetLowering();
1709
1710 if (TLI->shouldEmitFixup(GV)) {
1711 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1712 MI.eraseFromParent();
1713 return true;
1714 }
1715
1716 if (TLI->shouldEmitPCReloc(GV)) {
1717 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1718 MI.eraseFromParent();
1719 return true;
1720 }
1721
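  // Otherwise, materialize a pc-relative address of the global's GOT entry and
  // load the actual address from it.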
1722 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1723 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1724
1725 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1726 MachinePointerInfo::getGOT(MF),
1727 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1728 MachineMemOperand::MOInvariant,
1729 8 /*Size*/, 8 /*Align*/);
1730
1731 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1732
1733 if (Ty.getSizeInBits() == 32) {
1734     // Truncate if this is a 32-bit constant address.
1735 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1736 B.buildExtract(DstReg, Load, 0);
1737 } else
1738 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1739
1740 MI.eraseFromParent();
1741 return true;
1742 }
1743
1744 bool AMDGPULegalizerInfo::legalizeLoad(
1745 MachineInstr &MI, MachineRegisterInfo &MRI,
1746 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
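  // Rewrite the pointer operand as a 64-bit constant address space pointer via
  // an addrspacecast; the observer is notified of the operand change so the
  // updated load can be processed again.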
1747 B.setInstr(MI);
1748 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1749 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1750 Observer.changingInstr(MI);
1751 MI.getOperand(1).setReg(Cast.getReg(0));
1752 Observer.changedInstr(MI);
1753 return true;
1754 }
1755
1756 bool AMDGPULegalizerInfo::legalizeFMad(
1757 MachineInstr &MI, MachineRegisterInfo &MRI,
1758 MachineIRBuilder &B) const {
1759 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1760 assert(Ty.isScalar());
1761
1762 MachineFunction &MF = B.getMF();
1763 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1764
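  // G_FMAD is only kept legal when denormals are flushed for the result type
  // (the mad/mac instructions flush denormals); otherwise expand it with the
  // generic LegalizerHelper lowering.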
1765 // TODO: Always legal with future ftz flag.
1766 if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1767 return true;
1768 if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1769 return true;
1770
1771
1772 MachineIRBuilder HelperBuilder(MI);
1773 GISelObserverWrapper DummyObserver;
1774 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1775 HelperBuilder.setMBB(*MI.getParent());
1776 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1777 }
1778
1779 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1780 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1781 Register DstReg = MI.getOperand(0).getReg();
1782 Register PtrReg = MI.getOperand(1).getReg();
1783 Register CmpVal = MI.getOperand(2).getReg();
1784 Register NewVal = MI.getOperand(3).getReg();
1785
1786 assert(SITargetLowering::isFlatGlobalAddrSpace(
1787 MRI.getType(PtrReg).getAddressSpace()) &&
1788 "this should not have been custom lowered");
1789
1790 LLT ValTy = MRI.getType(CmpVal);
1791 LLT VecTy = LLT::vector(2, ValTy);
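  // The target cmpxchg pseudo takes the new value and the compare value packed
  // together as a single vector operand.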
1792
1793 B.setInstr(MI);
1794 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1795
1796 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1797 .addDef(DstReg)
1798 .addUse(PtrReg)
1799 .addUse(PackedVal)
1800 .setMemRefs(MI.memoperands());
1801
1802 MI.eraseFromParent();
1803 return true;
1804 }
1805
1806 // Return the G_BRCOND that consumes the condition, or null if the usage is
// invalid; an unconditional G_BR that follows is returned through \p Br.
1807 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1808 MachineRegisterInfo &MRI,
1809 MachineInstr *&Br) {
1810 Register CondDef = MI.getOperand(0).getReg();
1811 if (!MRI.hasOneNonDBGUse(CondDef))
1812 return nullptr;
1813
1814 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1815 if (UseMI.getParent() != MI.getParent() ||
1816 UseMI.getOpcode() != AMDGPU::G_BRCOND)
1817 return nullptr;
1818
1819 // Make sure the cond br is followed by a G_BR
1820 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1821 if (Next != MI.getParent()->end()) {
1822 if (Next->getOpcode() != AMDGPU::G_BR)
1823 return nullptr;
1824 Br = &*Next;
1825 }
1826
1827 return &UseMI;
1828 }
1829
1830 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1831 Register Reg, LLT Ty) const {
1832 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1833 if (LiveIn)
1834 return LiveIn;
1835
1836 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1837 MRI.addLiveIn(Reg, NewReg);
1838 return NewReg;
1839 }
1840
1841 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1842 const ArgDescriptor *Arg) const {
1843 if (!Arg->isRegister() || !Arg->getRegister().isValid())
1844 return false; // TODO: Handle these
1845
1846 assert(Arg->getRegister().isPhysical());
1847
1848 MachineRegisterInfo &MRI = *B.getMRI();
1849
1850 LLT Ty = MRI.getType(DstReg);
1851 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1852
1853 if (Arg->isMasked()) {
1854 // TODO: Should we try to emit this once in the entry block?
1855 const LLT S32 = LLT::scalar(32);
1856 const unsigned Mask = Arg->getMask();
1857 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1858
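    // The value is packed into a bit-field of a wider register; shift the
    // field down and mask off its width.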
1859 Register AndMaskSrc = LiveIn;
1860
1861 if (Shift != 0) {
1862 auto ShiftAmt = B.buildConstant(S32, Shift);
1863 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1864 }
1865
1866 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1867 } else
1868 B.buildCopy(DstReg, LiveIn);
1869
1870   // Insert the argument copy if it doesn't already exist.
1871 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1872 if (!MRI.getVRegDef(LiveIn)) {
1873 // FIXME: Should have scoped insert pt
1874 MachineBasicBlock &OrigInsBB = B.getMBB();
1875 auto OrigInsPt = B.getInsertPt();
1876
1877 MachineBasicBlock &EntryMBB = B.getMF().front();
1878 EntryMBB.addLiveIn(Arg->getRegister());
1879 B.setInsertPt(EntryMBB, EntryMBB.begin());
1880 B.buildCopy(LiveIn, Arg->getRegister());
1881
1882 B.setInsertPt(OrigInsBB, OrigInsPt);
1883 }
1884
1885 return true;
1886 }
1887
1888 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1889 MachineInstr &MI,
1890 MachineRegisterInfo &MRI,
1891 MachineIRBuilder &B,
1892 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1893 B.setInstr(MI);
1894
1895 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1896
1897 const ArgDescriptor *Arg;
1898 const TargetRegisterClass *RC;
1899 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1900 if (!Arg) {
1901 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1902 return false;
1903 }
1904
1905 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1906 MI.eraseFromParent();
1907 return true;
1908 }
1909
1910 return false;
1911 }
1912
1913 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1914 MachineRegisterInfo &MRI,
1915 MachineIRBuilder &B) const {
1916 B.setInstr(MI);
1917 Register Dst = MI.getOperand(0).getReg();
1918 LLT DstTy = MRI.getType(Dst);
1919 LLT S16 = LLT::scalar(16);
1920 LLT S32 = LLT::scalar(32);
1921 LLT S64 = LLT::scalar(64);
1922
1923 if (legalizeFastUnsafeFDIV(MI, MRI, B))
1924 return true;
1925
1926 if (DstTy == S16)
1927 return legalizeFDIV16(MI, MRI, B);
1928 if (DstTy == S32)
1929 return legalizeFDIV32(MI, MRI, B);
1930 if (DstTy == S64)
1931 return legalizeFDIV64(MI, MRI, B);
1932
1933 return false;
1934 }
1935
1936 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1937 MachineRegisterInfo &MRI,
1938 MachineIRBuilder &B) const {
1939 Register Res = MI.getOperand(0).getReg();
1940 Register LHS = MI.getOperand(1).getReg();
1941 Register RHS = MI.getOperand(2).getReg();
1942
1943 uint16_t Flags = MI.getFlags();
1944
1945 LLT ResTy = MRI.getType(Res);
1946 LLT S32 = LLT::scalar(32);
1947 LLT S64 = LLT::scalar(64);
1948
1949 const MachineFunction &MF = B.getMF();
1950 bool Unsafe =
1951 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1952
1953 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1954 return false;
1955
1956 if (!Unsafe && ResTy == S32 &&
1957 MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1958 return false;
1959
1960 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1961 // 1 / x -> RCP(x)
1962 if (CLHS->isExactlyValue(1.0)) {
1963 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1964 .addUse(RHS)
1965 .setMIFlags(Flags);
1966
1967 MI.eraseFromParent();
1968 return true;
1969 }
1970
1971 // -1 / x -> RCP( FNEG(x) )
1972 if (CLHS->isExactlyValue(-1.0)) {
1973 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1974 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1975 .addUse(FNeg.getReg(0))
1976 .setMIFlags(Flags);
1977
1978 MI.eraseFromParent();
1979 return true;
1980 }
1981 }
1982
1983 // x / y -> x * (1.0 / y)
1984 if (Unsafe) {
1985 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1986 .addUse(RHS)
1987 .setMIFlags(Flags);
1988 B.buildFMul(Res, LHS, RCP, Flags);
1989
1990 MI.eraseFromParent();
1991 return true;
1992 }
1993
1994 return false;
1995 }
1996
1997 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1998 MachineRegisterInfo &MRI,
1999 MachineIRBuilder &B) const {
2000 B.setInstr(MI);
2001 Register Res = MI.getOperand(0).getReg();
2002 Register LHS = MI.getOperand(1).getReg();
2003 Register RHS = MI.getOperand(2).getReg();
2004
2005 uint16_t Flags = MI.getFlags();
2006
2007 LLT S16 = LLT::scalar(16);
2008 LLT S32 = LLT::scalar(32);
2009
2010 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2011 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2012
2013 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2014 .addUse(RHSExt.getReg(0))
2015 .setMIFlags(Flags);
2016
2017 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2018 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2019
2020 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2021 .addUse(RDst.getReg(0))
2022 .addUse(RHS)
2023 .addUse(LHS)
2024 .setMIFlags(Flags);
2025
2026 MI.eraseFromParent();
2027 return true;
2028 }
2029
2030 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2031 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2032 static void toggleSPDenormMode(bool Enable,
2033 MachineIRBuilder &B,
2034 const GCNSubtarget &ST,
2035 AMDGPU::SIModeRegisterDefaults Mode) {
2036 // Set SP denorm mode to this value.
2037 unsigned SPDenormMode =
2038 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2039
2040 if (ST.hasDenormModeInst()) {
2041 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2042 unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2043 ? FP_DENORM_FLUSH_NONE
2044 : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2045
2046 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2047 B.buildInstr(AMDGPU::S_DENORM_MODE)
2048 .addImm(NewDenormModeValue);
2049
2050 } else {
2051 // Select FP32 bit field in mode register.
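    // This is hwreg(HW_REG_MODE, 4, 2): offset 4, width 2 covers the
    // single-precision denorm control bits of the MODE register.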
2052 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2053 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2054 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2055
2056 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2057 .addImm(SPDenormMode)
2058 .addImm(SPDenormModeBitField);
2059 }
2060 }
2061
2062 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2063 MachineRegisterInfo &MRI,
2064 MachineIRBuilder &B) const {
2065 B.setInstr(MI);
2066 Register Res = MI.getOperand(0).getReg();
2067 Register LHS = MI.getOperand(1).getReg();
2068 Register RHS = MI.getOperand(2).getReg();
2069 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2070 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2071
2072 uint16_t Flags = MI.getFlags();
2073
2074 LLT S32 = LLT::scalar(32);
2075 LLT S1 = LLT::scalar(1);
2076
2077 auto One = B.buildFConstant(S32, 1.0f);
2078
2079 auto DenominatorScaled =
2080 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2081 .addUse(RHS)
2082 .addUse(LHS)
2083 .addImm(1)
2084 .setMIFlags(Flags);
2085 auto NumeratorScaled =
2086 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2087 .addUse(LHS)
2088 .addUse(RHS)
2089 .addImm(0)
2090 .setMIFlags(Flags);
2091
2092 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2093 .addUse(DenominatorScaled.getReg(0))
2094 .setMIFlags(Flags);
2095 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2096
2097 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2098 // aren't modeled as reading it.
2099 if (!Mode.FP32Denormals)
2100 toggleSPDenormMode(true, B, ST, Mode);
2101
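  // Refine the reciprocal estimate with fma-based Newton-Raphson style steps;
  // div_fmas then applies the final scaling and div_fixup handles the special
  // cases (infinities, NaNs, and zero denominators).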
2102 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2103 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2104 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2105 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2106 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2107 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2108
2109 if (!Mode.FP32Denormals)
2110 toggleSPDenormMode(false, B, ST, Mode);
2111
2112 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2113 .addUse(Fma4.getReg(0))
2114 .addUse(Fma1.getReg(0))
2115 .addUse(Fma3.getReg(0))
2116 .addUse(NumeratorScaled.getReg(1))
2117 .setMIFlags(Flags);
2118
2119 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2120 .addUse(Fmas.getReg(0))
2121 .addUse(RHS)
2122 .addUse(LHS)
2123 .setMIFlags(Flags);
2124
2125 MI.eraseFromParent();
2126 return true;
2127 }
2128
2129 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2130 MachineRegisterInfo &MRI,
2131 MachineIRBuilder &B) const {
2132 B.setInstr(MI);
2133 Register Res = MI.getOperand(0).getReg();
2134 Register LHS = MI.getOperand(1).getReg();
2135 Register RHS = MI.getOperand(2).getReg();
2136
2137 uint16_t Flags = MI.getFlags();
2138
2139 LLT S64 = LLT::scalar(64);
2140 LLT S1 = LLT::scalar(1);
2141
2142 auto One = B.buildFConstant(S64, 1.0);
2143
2144 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2145 .addUse(LHS)
2146 .addUse(RHS)
2147 .addImm(1)
2148 .setMIFlags(Flags);
2149
2150 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2151
2152 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2153 .addUse(DivScale0.getReg(0))
2154 .setMIFlags(Flags);
2155
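  // Refine the 64-bit reciprocal estimate with the same fma-based
  // Newton-Raphson style sequence used for f32, then fix up the scaled result.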
2156 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2157 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2158 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2159
2160 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2161 .addUse(LHS)
2162 .addUse(RHS)
2163 .addImm(0)
2164 .setMIFlags(Flags);
2165
2166 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2167   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2168 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2169
2170 Register Scale;
2171 if (!ST.hasUsableDivScaleConditionOutput()) {
2172 // Workaround a hardware bug on SI where the condition output from div_scale
2173 // is not usable.
2174
2175 Scale = MRI.createGenericVirtualRegister(S1);
2176
2177 LLT S32 = LLT::scalar(32);
2178
2179 auto NumUnmerge = B.buildUnmerge(S32, LHS);
2180 auto DenUnmerge = B.buildUnmerge(S32, RHS);
2181 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2182 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2183
2184 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2185 Scale1Unmerge.getReg(1));
2186 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2187 Scale0Unmerge.getReg(1));
2188 B.buildXor(Scale, CmpNum, CmpDen);
2189 } else {
2190 Scale = DivScale1.getReg(1);
2191 }
2192
2193 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2194 .addUse(Fma4.getReg(0))
2195 .addUse(Fma3.getReg(0))
2196 .addUse(Mul.getReg(0))
2197 .addUse(Scale)
2198 .setMIFlags(Flags);
2199
2200 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2201 .addUse(Fmas.getReg(0))
2202 .addUse(RHS)
2203 .addUse(LHS)
2204 .setMIFlags(Flags);
2205
2206 MI.eraseFromParent();
2207 return true;
2208 }
2209
2210 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2211 MachineRegisterInfo &MRI,
2212 MachineIRBuilder &B) const {
2213 B.setInstr(MI);
2214 Register Res = MI.getOperand(0).getReg();
2215 Register LHS = MI.getOperand(2).getReg();
2216 Register RHS = MI.getOperand(3).getReg();
2217 uint16_t Flags = MI.getFlags();
2218
2219 LLT S32 = LLT::scalar(32);
2220 LLT S1 = LLT::scalar(1);
2221
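  // If |RHS| is very large (> 2^96), pre-scale the denominator by 2^-32 so its
  // reciprocal stays a normal number, then fold the same scale back into the
  // final product; otherwise the scale is 1.0.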
2222 auto Abs = B.buildFAbs(S32, RHS, Flags);
2223 const APFloat C0Val(1.0f);
2224
2225 auto C0 = B.buildConstant(S32, 0x6f800000);
2226 auto C1 = B.buildConstant(S32, 0x2f800000);
2227 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2228
2229 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2230 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2231
2232 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2233
2234 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2235 .addUse(Mul0.getReg(0))
2236 .setMIFlags(Flags);
2237
2238 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2239
2240 B.buildFMul(Res, Sel, Mul1, Flags);
2241
2242 MI.eraseFromParent();
2243 return true;
2244 }
2245
2246 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2247 MachineRegisterInfo &MRI,
2248 MachineIRBuilder &B) const {
2249 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2250 if (!MFI->isEntryFunction()) {
2251 return legalizePreloadedArgIntrin(MI, MRI, B,
2252 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2253 }
2254
2255 B.setInstr(MI);
2256
2257 uint64_t Offset =
2258 ST.getTargetLowering()->getImplicitParameterOffset(
2259 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2260 Register DstReg = MI.getOperand(0).getReg();
2261 LLT DstTy = MRI.getType(DstReg);
2262 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2263
2264 const ArgDescriptor *Arg;
2265 const TargetRegisterClass *RC;
2266 std::tie(Arg, RC)
2267 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2268 if (!Arg)
2269 return false;
2270
2271 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2272 if (!loadInputValue(KernargPtrReg, B, Arg))
2273 return false;
2274
2275 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2276 MI.eraseFromParent();
2277 return true;
2278 }
2279
2280 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2281 MachineRegisterInfo &MRI,
2282 MachineIRBuilder &B,
2283 unsigned AddrSpace) const {
2284 B.setInstr(MI);
2285 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2286 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2287 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2288 MI.eraseFromParent();
2289 return true;
2290 }
2291
2292 /// Handle register layout difference for f16 images for some subtargets.
2293 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2294 MachineRegisterInfo &MRI,
2295 Register Reg) const {
2296 if (!ST.hasUnpackedD16VMem())
2297 return Reg;
2298
2299 const LLT S16 = LLT::scalar(16);
2300 const LLT S32 = LLT::scalar(32);
2301 LLT StoreVT = MRI.getType(Reg);
2302 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2303
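  // With unpacked D16 memory instructions each 16-bit element occupies the low
  // half of its own 32-bit register, so any-extend every element and rebuild
  // the value as a vector of s32.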
2304 auto Unmerge = B.buildUnmerge(S16, Reg);
2305
2306 SmallVector<Register, 4> WideRegs;
2307 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2308 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2309
2310 int NumElts = StoreVT.getNumElements();
2311
2312 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2313 }
2314
2315 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2316 MachineRegisterInfo &MRI,
2317 MachineIRBuilder &B,
2318 bool IsFormat) const {
2319 // TODO: Reject f16 format on targets where unsupported.
2320 Register VData = MI.getOperand(1).getReg();
2321 LLT Ty = MRI.getType(VData);
2322
2323 B.setInstr(MI);
2324
2325 const LLT S32 = LLT::scalar(32);
2326 const LLT S16 = LLT::scalar(16);
2327
2328   // Fix up illegal register types for i8 and i16 stores.
2329 if (Ty == LLT::scalar(8) || Ty == S16) {
2330 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2331 MI.getOperand(1).setReg(AnyExt);
2332 return true;
2333 }
2334
2335 if (Ty.isVector()) {
2336 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2337 if (IsFormat)
2338 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2339 return true;
2340 }
2341
2342 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2343 }
2344
2345 return Ty == S32;
2346 }
2347
2348 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2349 MachineRegisterInfo &MRI,
2350 MachineIRBuilder &B) const {
2351   // Replace the consuming G_BRCOND with the exec-manipulating branch pseudos.
2352 auto IntrID = MI.getIntrinsicID();
2353 switch (IntrID) {
2354 case Intrinsic::amdgcn_if:
2355 case Intrinsic::amdgcn_else: {
2356 MachineInstr *Br = nullptr;
2357 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2358 const SIRegisterInfo *TRI
2359 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2360
2361 B.setInstr(*BrCond);
2362 Register Def = MI.getOperand(1).getReg();
2363 Register Use = MI.getOperand(3).getReg();
2364
2365 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
2366 if (Br)
2367 BrTarget = Br->getOperand(0).getMBB();
2368
2369 if (IntrID == Intrinsic::amdgcn_if) {
2370 B.buildInstr(AMDGPU::SI_IF)
2371 .addDef(Def)
2372 .addUse(Use)
2373 .addMBB(BrTarget);
2374 } else {
2375 B.buildInstr(AMDGPU::SI_ELSE)
2376 .addDef(Def)
2377 .addUse(Use)
2378 .addMBB(BrTarget)
2379 .addImm(0);
2380 }
2381
2382 if (Br)
2383 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
2384
2385 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2386 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2387 MI.eraseFromParent();
2388 BrCond->eraseFromParent();
2389 return true;
2390 }
2391
2392 return false;
2393 }
2394 case Intrinsic::amdgcn_loop: {
2395 MachineInstr *Br = nullptr;
2396 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2397 const SIRegisterInfo *TRI
2398 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2399
2400 B.setInstr(*BrCond);
2401
2402 // FIXME: Need to adjust branch targets based on unconditional branch.
2403 Register Reg = MI.getOperand(2).getReg();
2404 B.buildInstr(AMDGPU::SI_LOOP)
2405 .addUse(Reg)
2406 .addMBB(BrCond->getOperand(1).getMBB());
2407 MI.eraseFromParent();
2408 BrCond->eraseFromParent();
2409 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2410 return true;
2411 }
2412
2413 return false;
2414 }
2415 case Intrinsic::amdgcn_kernarg_segment_ptr:
2416 return legalizePreloadedArgIntrin(
2417 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2418 case Intrinsic::amdgcn_implicitarg_ptr:
2419 return legalizeImplicitArgPtr(MI, MRI, B);
2420 case Intrinsic::amdgcn_workitem_id_x:
2421 return legalizePreloadedArgIntrin(MI, MRI, B,
2422 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2423 case Intrinsic::amdgcn_workitem_id_y:
2424 return legalizePreloadedArgIntrin(MI, MRI, B,
2425 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2426 case Intrinsic::amdgcn_workitem_id_z:
2427 return legalizePreloadedArgIntrin(MI, MRI, B,
2428 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2429 case Intrinsic::amdgcn_workgroup_id_x:
2430 return legalizePreloadedArgIntrin(MI, MRI, B,
2431 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2432 case Intrinsic::amdgcn_workgroup_id_y:
2433 return legalizePreloadedArgIntrin(MI, MRI, B,
2434 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2435 case Intrinsic::amdgcn_workgroup_id_z:
2436 return legalizePreloadedArgIntrin(MI, MRI, B,
2437 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2438 case Intrinsic::amdgcn_dispatch_ptr:
2439 return legalizePreloadedArgIntrin(MI, MRI, B,
2440 AMDGPUFunctionArgInfo::DISPATCH_PTR);
2441 case Intrinsic::amdgcn_queue_ptr:
2442 return legalizePreloadedArgIntrin(MI, MRI, B,
2443 AMDGPUFunctionArgInfo::QUEUE_PTR);
2444 case Intrinsic::amdgcn_implicit_buffer_ptr:
2445 return legalizePreloadedArgIntrin(
2446 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2447 case Intrinsic::amdgcn_dispatch_id:
2448 return legalizePreloadedArgIntrin(MI, MRI, B,
2449 AMDGPUFunctionArgInfo::DISPATCH_ID);
2450 case Intrinsic::amdgcn_fdiv_fast:
2451 return legalizeFDIVFastIntrin(MI, MRI, B);
2452 case Intrinsic::amdgcn_is_shared:
2453 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2454 case Intrinsic::amdgcn_is_private:
2455 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2456 case Intrinsic::amdgcn_wavefrontsize: {
2457 B.setInstr(MI);
2458 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2459 MI.eraseFromParent();
2460 return true;
2461 }
2462 case Intrinsic::amdgcn_raw_buffer_store:
2463 return legalizeRawBufferStore(MI, MRI, B, false);
2464 case Intrinsic::amdgcn_raw_buffer_store_format:
2465 return legalizeRawBufferStore(MI, MRI, B, true);
2466 default:
2467 return true;
2468 }
2469
2470 return true;
2471 }
2472