1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //==-----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// AMDGPU specific subclass of TargetSubtarget. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 16 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 17 18 #include "AMDGPU.h" 19 #include "AMDGPUCallLowering.h" 20 #include "R600FrameLowering.h" 21 #include "R600ISelLowering.h" 22 #include "R600InstrInfo.h" 23 #include "SIFrameLowering.h" 24 #include "SIISelLowering.h" 25 #include "SIInstrInfo.h" 26 #include "Utils/AMDGPUBaseInfo.h" 27 #include "llvm/ADT/Triple.h" 28 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 29 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" 30 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" 31 #include "llvm/CodeGen/MachineFunction.h" 32 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 33 #include "llvm/MC/MCInstrItineraries.h" 34 #include "llvm/Support/MathExtras.h" 35 #include <cassert> 36 #include <cstdint> 37 #include <memory> 38 #include <utility> 39 40 #define GET_SUBTARGETINFO_HEADER 41 #include "AMDGPUGenSubtargetInfo.inc" 42 #define GET_SUBTARGETINFO_HEADER 43 #include "R600GenSubtargetInfo.inc" 44 45 namespace llvm { 46 47 class StringRef; 48 49 class AMDGPUSubtarget { 50 public: 51 enum Generation { 52 R600 = 0, 53 R700 = 1, 54 EVERGREEN = 2, 55 NORTHERN_ISLANDS = 3, 56 SOUTHERN_ISLANDS = 4, 57 SEA_ISLANDS = 5, 58 VOLCANIC_ISLANDS = 6, 59 GFX9 = 7 60 }; 61 62 private: 63 Triple TargetTriple; 64 65 protected: 66 const FeatureBitset &SubtargetFeatureBits; 67 bool Has16BitInsts; 68 bool HasMadMixInsts; 69 bool FP32Denormals; 70 bool FPExceptions; 71 bool HasSDWA; 72 bool HasVOP3PInsts; 73 bool HasMulI24; 74 bool HasMulU24; 75 bool HasFminFmaxLegacy; 76 bool EnablePromoteAlloca; 77 int LocalMemorySize; 78 unsigned WavefrontSize; 79 80 public: 81 AMDGPUSubtarget(const Triple &TT, const FeatureBitset &FeatureBits); 82 83 static const AMDGPUSubtarget &get(const MachineFunction &MF); 84 static const AMDGPUSubtarget &get(const TargetMachine &TM, 85 const Function &F); 86 87 /// \returns Default range flat work group size for a calling convention. 88 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; 89 90 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes 91 /// for function \p F, or minimum/maximum flat work group sizes explicitly 92 /// requested using "amdgpu-flat-work-group-size" attribute attached to 93 /// function \p F. 94 /// 95 /// \returns Subtarget's default values if explicitly requested values cannot 96 /// be converted to integer, or violate subtarget's specifications. 97 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; 98 99 /// \returns Subtarget's default pair of minimum/maximum number of waves per 100 /// execution unit for function \p F, or minimum/maximum number of waves per 101 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute 102 /// attached to function \p F. 103 /// 104 /// \returns Subtarget's default values if explicitly requested values cannot 105 /// be converted to integer, violate subtarget's specifications, or are not 106 /// compatible with minimum/maximum number of waves limited by flat work group 107 /// size, register usage, and/or lds usage. 108 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; 109 110 /// Return the amount of LDS that can be used that will not restrict the 111 /// occupancy lower than WaveCount. 112 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 113 const Function &) const; 114 115 /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if 116 /// the given LDS memory size is the only constraint. 117 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; 118 119 unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; 120 isAmdHsaOS()121 bool isAmdHsaOS() const { 122 return TargetTriple.getOS() == Triple::AMDHSA; 123 } 124 isAmdPalOS()125 bool isAmdPalOS() const { 126 return TargetTriple.getOS() == Triple::AMDPAL; 127 } 128 isMesa3DOS()129 bool isMesa3DOS() const { 130 return TargetTriple.getOS() == Triple::Mesa3D; 131 } 132 isMesaKernel(const Function & F)133 bool isMesaKernel(const Function &F) const { 134 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv()); 135 } 136 isAmdCodeObjectV2(const Function & F)137 bool isAmdCodeObjectV2(const Function &F) const { 138 return isAmdHsaOS() || isMesaKernel(F); 139 } 140 has16BitInsts()141 bool has16BitInsts() const { 142 return Has16BitInsts; 143 } 144 hasMadMixInsts()145 bool hasMadMixInsts() const { 146 return HasMadMixInsts; 147 } 148 hasFP32Denormals()149 bool hasFP32Denormals() const { 150 return FP32Denormals; 151 } 152 hasFPExceptions()153 bool hasFPExceptions() const { 154 return FPExceptions; 155 } 156 hasSDWA()157 bool hasSDWA() const { 158 return HasSDWA; 159 } 160 hasVOP3PInsts()161 bool hasVOP3PInsts() const { 162 return HasVOP3PInsts; 163 } 164 hasMulI24()165 bool hasMulI24() const { 166 return HasMulI24; 167 } 168 hasMulU24()169 bool hasMulU24() const { 170 return HasMulU24; 171 } 172 hasFminFmaxLegacy()173 bool hasFminFmaxLegacy() const { 174 return HasFminFmaxLegacy; 175 } 176 isPromoteAllocaEnabled()177 bool isPromoteAllocaEnabled() const { 178 return EnablePromoteAlloca; 179 } 180 getWavefrontSize()181 unsigned getWavefrontSize() const { 182 return WavefrontSize; 183 } 184 getLocalMemorySize()185 int getLocalMemorySize() const { 186 return LocalMemorySize; 187 } 188 getAlignmentForImplicitArgPtr()189 unsigned getAlignmentForImplicitArgPtr() const { 190 return isAmdHsaOS() ? 8 : 4; 191 } 192 193 /// Returns the offset in bytes from the start of the input buffer 194 /// of the first explicit kernel argument. getExplicitKernelArgOffset(const Function & F)195 unsigned getExplicitKernelArgOffset(const Function &F) const { 196 return isAmdCodeObjectV2(F) ? 0 : 36; 197 } 198 199 /// \returns Maximum number of work groups per compute unit supported by the 200 /// subtarget and limited by given \p FlatWorkGroupSize. getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)201 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { 202 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(SubtargetFeatureBits, 203 FlatWorkGroupSize); 204 } 205 206 /// \returns Minimum flat work group size supported by the subtarget. getMinFlatWorkGroupSize()207 unsigned getMinFlatWorkGroupSize() const { 208 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(SubtargetFeatureBits); 209 } 210 211 /// \returns Maximum flat work group size supported by the subtarget. getMaxFlatWorkGroupSize()212 unsigned getMaxFlatWorkGroupSize() const { 213 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(SubtargetFeatureBits); 214 } 215 216 /// \returns Maximum number of waves per execution unit supported by the 217 /// subtarget and limited by given \p FlatWorkGroupSize. getMaxWavesPerEU(unsigned FlatWorkGroupSize)218 unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { 219 return AMDGPU::IsaInfo::getMaxWavesPerEU(SubtargetFeatureBits, 220 FlatWorkGroupSize); 221 } 222 223 /// \returns Minimum number of waves per execution unit supported by the 224 /// subtarget. getMinWavesPerEU()225 unsigned getMinWavesPerEU() const { 226 return AMDGPU::IsaInfo::getMinWavesPerEU(SubtargetFeatureBits); 227 } 228 getMaxWavesPerEU()229 unsigned getMaxWavesPerEU() const { return 10; } 230 231 /// Creates value range metadata on an workitemid.* inrinsic call or load. 232 bool makeLIDRangeMetadata(Instruction *I) const; 233 234 /// \returns Number of bytes of arguments that are passed to a shader or 235 /// kernel in addition to the explicit ones declared for the function. getImplicitArgNumBytes(const Function & F)236 unsigned getImplicitArgNumBytes(const Function &F) const { 237 if (isMesaKernel(F)) 238 return 16; 239 return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); 240 } 241 uint64_t getExplicitKernArgSize(const Function &F, 242 unsigned &MaxAlign) const; 243 unsigned getKernArgSegmentSize(const Function &F, 244 unsigned &MaxAlign) const; 245 ~AMDGPUSubtarget()246 virtual ~AMDGPUSubtarget() {} 247 }; 248 249 class GCNSubtarget : public AMDGPUGenSubtargetInfo, 250 public AMDGPUSubtarget { 251 public: 252 enum { 253 ISAVersion0_0_0, 254 ISAVersion6_0_0, 255 ISAVersion6_0_1, 256 ISAVersion7_0_0, 257 ISAVersion7_0_1, 258 ISAVersion7_0_2, 259 ISAVersion7_0_3, 260 ISAVersion7_0_4, 261 ISAVersion8_0_1, 262 ISAVersion8_0_2, 263 ISAVersion8_0_3, 264 ISAVersion8_1_0, 265 ISAVersion9_0_0, 266 ISAVersion9_0_2, 267 ISAVersion9_0_4, 268 ISAVersion9_0_6, 269 }; 270 271 enum TrapHandlerAbi { 272 TrapHandlerAbiNone = 0, 273 TrapHandlerAbiHsa = 1 274 }; 275 276 enum TrapID { 277 TrapIDHardwareReserved = 0, 278 TrapIDHSADebugTrap = 1, 279 TrapIDLLVMTrap = 2, 280 TrapIDLLVMDebugTrap = 3, 281 TrapIDDebugBreakpoint = 7, 282 TrapIDDebugReserved8 = 8, 283 TrapIDDebugReservedFE = 0xfe, 284 TrapIDDebugReservedFF = 0xff 285 }; 286 287 enum TrapRegValues { 288 LLVMTrapHandlerRegValue = 1 289 }; 290 291 private: 292 /// GlobalISel related APIs. 293 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 294 std::unique_ptr<InstructionSelector> InstSelector; 295 std::unique_ptr<LegalizerInfo> Legalizer; 296 std::unique_ptr<RegisterBankInfo> RegBankInfo; 297 298 protected: 299 // Basic subtarget description. 300 Triple TargetTriple; 301 unsigned Gen; 302 unsigned IsaVersion; 303 int LDSBankCount; 304 unsigned MaxPrivateElementSize; 305 306 // Possibly statically set by tablegen, but may want to be overridden. 307 bool FastFMAF32; 308 bool HalfRate64Ops; 309 310 // Dynamially set bits that enable features. 311 bool FP64FP16Denormals; 312 bool DX10Clamp; 313 bool FlatForGlobal; 314 bool AutoWaitcntBeforeBarrier; 315 bool CodeObjectV3; 316 bool UnalignedScratchAccess; 317 bool UnalignedBufferAccess; 318 bool HasApertureRegs; 319 bool EnableXNACK; 320 bool TrapHandler; 321 bool DebuggerInsertNops; 322 bool DebuggerEmitPrologue; 323 324 // Used as options. 325 bool EnableHugePrivateBuffer; 326 bool EnableVGPRSpilling; 327 bool EnableLoadStoreOpt; 328 bool EnableUnsafeDSOffsetFolding; 329 bool EnableSIScheduler; 330 bool EnableDS128; 331 bool DumpCode; 332 333 // Subtarget statically properties set by tablegen 334 bool FP64; 335 bool FMA; 336 bool MIMG_R128; 337 bool IsGCN; 338 bool GCN3Encoding; 339 bool CIInsts; 340 bool GFX9Insts; 341 bool SGPRInitBug; 342 bool HasSMemRealTime; 343 bool HasIntClamp; 344 bool HasFmaMixInsts; 345 bool HasMovrel; 346 bool HasVGPRIndexMode; 347 bool HasScalarStores; 348 bool HasScalarAtomics; 349 bool HasInv2PiInlineImm; 350 bool HasSDWAOmod; 351 bool HasSDWAScalar; 352 bool HasSDWASdst; 353 bool HasSDWAMac; 354 bool HasSDWAOutModsVOPC; 355 bool HasDPP; 356 bool HasDLInsts; 357 bool D16PreservesUnusedBits; 358 bool FlatAddressSpace; 359 bool FlatInstOffsets; 360 bool FlatGlobalInsts; 361 bool FlatScratchInsts; 362 bool AddNoCarryInsts; 363 bool HasUnpackedD16VMem; 364 bool R600ALUInst; 365 bool CaymanISA; 366 bool CFALUBug; 367 bool HasVertexCache; 368 short TexVTXClauseSize; 369 bool ScalarizeGlobal; 370 371 // Dummy feature to use for assembler in tablegen. 372 bool FeatureDisable; 373 374 SelectionDAGTargetInfo TSInfo; 375 AMDGPUAS AS; 376 private: 377 SIInstrInfo InstrInfo; 378 SITargetLowering TLInfo; 379 SIFrameLowering FrameLowering; 380 381 public: 382 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 383 const GCNTargetMachine &TM); 384 ~GCNSubtarget() override; 385 386 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 387 StringRef GPU, StringRef FS); 388 getInstrInfo()389 const SIInstrInfo *getInstrInfo() const override { 390 return &InstrInfo; 391 } 392 getFrameLowering()393 const SIFrameLowering *getFrameLowering() const override { 394 return &FrameLowering; 395 } 396 getTargetLowering()397 const SITargetLowering *getTargetLowering() const override { 398 return &TLInfo; 399 } 400 getRegisterInfo()401 const SIRegisterInfo *getRegisterInfo() const override { 402 return &InstrInfo.getRegisterInfo(); 403 } 404 getCallLowering()405 const CallLowering *getCallLowering() const override { 406 return CallLoweringInfo.get(); 407 } 408 getInstructionSelector()409 const InstructionSelector *getInstructionSelector() const override { 410 return InstSelector.get(); 411 } 412 getLegalizerInfo()413 const LegalizerInfo *getLegalizerInfo() const override { 414 return Legalizer.get(); 415 } 416 getRegBankInfo()417 const RegisterBankInfo *getRegBankInfo() const override { 418 return RegBankInfo.get(); 419 } 420 421 // Nothing implemented, just prevent crashes on use. getSelectionDAGInfo()422 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 423 return &TSInfo; 424 } 425 426 void ParseSubtargetFeatures(StringRef CPU, StringRef FS); 427 getGeneration()428 Generation getGeneration() const { 429 return (Generation)Gen; 430 } 431 getWavefrontSizeLog2()432 unsigned getWavefrontSizeLog2() const { 433 return Log2_32(WavefrontSize); 434 } 435 getLDSBankCount()436 int getLDSBankCount() const { 437 return LDSBankCount; 438 } 439 getMaxPrivateElementSize()440 unsigned getMaxPrivateElementSize() const { 441 return MaxPrivateElementSize; 442 } 443 getAMDGPUAS()444 AMDGPUAS getAMDGPUAS() const { 445 return AS; 446 } 447 hasIntClamp()448 bool hasIntClamp() const { 449 return HasIntClamp; 450 } 451 hasFP64()452 bool hasFP64() const { 453 return FP64; 454 } 455 hasMIMG_R128()456 bool hasMIMG_R128() const { 457 return MIMG_R128; 458 } 459 hasHWFP64()460 bool hasHWFP64() const { 461 return FP64; 462 } 463 hasFastFMAF32()464 bool hasFastFMAF32() const { 465 return FastFMAF32; 466 } 467 hasHalfRate64Ops()468 bool hasHalfRate64Ops() const { 469 return HalfRate64Ops; 470 } 471 hasAddr64()472 bool hasAddr64() const { 473 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 474 } 475 hasBFE()476 bool hasBFE() const { 477 return true; 478 } 479 hasBFI()480 bool hasBFI() const { 481 return true; 482 } 483 hasBFM()484 bool hasBFM() const { 485 return hasBFE(); 486 } 487 hasBCNT(unsigned Size)488 bool hasBCNT(unsigned Size) const { 489 return true; 490 } 491 hasFFBL()492 bool hasFFBL() const { 493 return true; 494 } 495 hasFFBH()496 bool hasFFBH() const { 497 return true; 498 } 499 hasMed3_16()500 bool hasMed3_16() const { 501 return getGeneration() >= AMDGPUSubtarget::GFX9; 502 } 503 hasMin3Max3_16()504 bool hasMin3Max3_16() const { 505 return getGeneration() >= AMDGPUSubtarget::GFX9; 506 } 507 hasFmaMixInsts()508 bool hasFmaMixInsts() const { 509 return HasFmaMixInsts; 510 } 511 hasCARRY()512 bool hasCARRY() const { 513 return true; 514 } 515 hasFMA()516 bool hasFMA() const { 517 return FMA; 518 } 519 getTrapHandlerAbi()520 TrapHandlerAbi getTrapHandlerAbi() const { 521 return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; 522 } 523 enableHugePrivateBuffer()524 bool enableHugePrivateBuffer() const { 525 return EnableHugePrivateBuffer; 526 } 527 unsafeDSOffsetFoldingEnabled()528 bool unsafeDSOffsetFoldingEnabled() const { 529 return EnableUnsafeDSOffsetFolding; 530 } 531 dumpCode()532 bool dumpCode() const { 533 return DumpCode; 534 } 535 536 /// Return the amount of LDS that can be used that will not restrict the 537 /// occupancy lower than WaveCount. 538 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 539 const Function &) const; 540 hasFP16Denormals()541 bool hasFP16Denormals() const { 542 return FP64FP16Denormals; 543 } 544 hasFP64Denormals()545 bool hasFP64Denormals() const { 546 return FP64FP16Denormals; 547 } 548 supportsMinMaxDenormModes()549 bool supportsMinMaxDenormModes() const { 550 return getGeneration() >= AMDGPUSubtarget::GFX9; 551 } 552 enableDX10Clamp()553 bool enableDX10Clamp() const { 554 return DX10Clamp; 555 } 556 enableIEEEBit(const MachineFunction & MF)557 bool enableIEEEBit(const MachineFunction &MF) const { 558 return AMDGPU::isCompute(MF.getFunction().getCallingConv()); 559 } 560 useFlatForGlobal()561 bool useFlatForGlobal() const { 562 return FlatForGlobal; 563 } 564 565 /// \returns If target supports ds_read/write_b128 and user enables generation 566 /// of ds_read/write_b128. useDS128()567 bool useDS128() const { 568 return CIInsts && EnableDS128; 569 } 570 571 /// \returns If MUBUF instructions always perform range checking, even for 572 /// buffer resources used for private memory access. privateMemoryResourceIsRangeChecked()573 bool privateMemoryResourceIsRangeChecked() const { 574 return getGeneration() < AMDGPUSubtarget::GFX9; 575 } 576 hasAutoWaitcntBeforeBarrier()577 bool hasAutoWaitcntBeforeBarrier() const { 578 return AutoWaitcntBeforeBarrier; 579 } 580 hasCodeObjectV3()581 bool hasCodeObjectV3() const { 582 return CodeObjectV3; 583 } 584 hasUnalignedBufferAccess()585 bool hasUnalignedBufferAccess() const { 586 return UnalignedBufferAccess; 587 } 588 hasUnalignedScratchAccess()589 bool hasUnalignedScratchAccess() const { 590 return UnalignedScratchAccess; 591 } 592 hasApertureRegs()593 bool hasApertureRegs() const { 594 return HasApertureRegs; 595 } 596 isTrapHandlerEnabled()597 bool isTrapHandlerEnabled() const { 598 return TrapHandler; 599 } 600 isXNACKEnabled()601 bool isXNACKEnabled() const { 602 return EnableXNACK; 603 } 604 hasFlatAddressSpace()605 bool hasFlatAddressSpace() const { 606 return FlatAddressSpace; 607 } 608 hasFlatInstOffsets()609 bool hasFlatInstOffsets() const { 610 return FlatInstOffsets; 611 } 612 hasFlatGlobalInsts()613 bool hasFlatGlobalInsts() const { 614 return FlatGlobalInsts; 615 } 616 hasFlatScratchInsts()617 bool hasFlatScratchInsts() const { 618 return FlatScratchInsts; 619 } 620 hasFlatLgkmVMemCountInOrder()621 bool hasFlatLgkmVMemCountInOrder() const { 622 return getGeneration() > GFX9; 623 } 624 hasD16LoadStore()625 bool hasD16LoadStore() const { 626 return getGeneration() >= GFX9; 627 } 628 629 /// Return if most LDS instructions have an m0 use that require m0 to be 630 /// iniitalized. ldsRequiresM0Init()631 bool ldsRequiresM0Init() const { 632 return getGeneration() < GFX9; 633 } 634 hasAddNoCarry()635 bool hasAddNoCarry() const { 636 return AddNoCarryInsts; 637 } 638 hasUnpackedD16VMem()639 bool hasUnpackedD16VMem() const { 640 return HasUnpackedD16VMem; 641 } 642 643 // Covers VS/PS/CS graphics shaders isMesaGfxShader(const Function & F)644 bool isMesaGfxShader(const Function &F) const { 645 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 646 } 647 hasMad64_32()648 bool hasMad64_32() const { 649 return getGeneration() >= SEA_ISLANDS; 650 } 651 hasSDWAOmod()652 bool hasSDWAOmod() const { 653 return HasSDWAOmod; 654 } 655 hasSDWAScalar()656 bool hasSDWAScalar() const { 657 return HasSDWAScalar; 658 } 659 hasSDWASdst()660 bool hasSDWASdst() const { 661 return HasSDWASdst; 662 } 663 hasSDWAMac()664 bool hasSDWAMac() const { 665 return HasSDWAMac; 666 } 667 hasSDWAOutModsVOPC()668 bool hasSDWAOutModsVOPC() const { 669 return HasSDWAOutModsVOPC; 670 } 671 vmemWriteNeedsExpWaitcnt()672 bool vmemWriteNeedsExpWaitcnt() const { 673 return getGeneration() < SEA_ISLANDS; 674 } 675 hasDLInsts()676 bool hasDLInsts() const { 677 return HasDLInsts; 678 } 679 d16PreservesUnusedBits()680 bool d16PreservesUnusedBits() const { 681 return D16PreservesUnusedBits; 682 } 683 684 // Scratch is allocated in 256 dword per wave blocks for the entire 685 // wavefront. When viewed from the perspecive of an arbitrary workitem, this 686 // is 4-byte aligned. 687 // 688 // Only 4-byte alignment is really needed to access anything. Transformations 689 // on the pointer value itself may rely on the alignment / known low bits of 690 // the pointer. Set this to something above the minimum to avoid needing 691 // dynamic realignment in common cases. getStackAlignment()692 unsigned getStackAlignment() const { 693 return 16; 694 } 695 enableMachineScheduler()696 bool enableMachineScheduler() const override { 697 return true; 698 } 699 enableSubRegLiveness()700 bool enableSubRegLiveness() const override { 701 return true; 702 } 703 setScalarizeGlobalBehavior(bool b)704 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } getScalarizeGlobalBehavior()705 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 706 707 /// \returns Number of execution units per compute unit supported by the 708 /// subtarget. getEUsPerCU()709 unsigned getEUsPerCU() const { 710 return AMDGPU::IsaInfo::getEUsPerCU(MCSubtargetInfo::getFeatureBits()); 711 } 712 713 /// \returns Maximum number of waves per compute unit supported by the 714 /// subtarget without any kind of limitation. getMaxWavesPerCU()715 unsigned getMaxWavesPerCU() const { 716 return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits()); 717 } 718 719 /// \returns Maximum number of waves per compute unit supported by the 720 /// subtarget and limited by given \p FlatWorkGroupSize. getMaxWavesPerCU(unsigned FlatWorkGroupSize)721 unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { 722 return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits(), 723 FlatWorkGroupSize); 724 } 725 726 /// \returns Maximum number of waves per execution unit supported by the 727 /// subtarget without any kind of limitation. getMaxWavesPerEU()728 unsigned getMaxWavesPerEU() const { 729 return AMDGPU::IsaInfo::getMaxWavesPerEU(); 730 } 731 732 /// \returns Number of waves per work group supported by the subtarget and 733 /// limited by given \p FlatWorkGroupSize. getWavesPerWorkGroup(unsigned FlatWorkGroupSize)734 unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { 735 return AMDGPU::IsaInfo::getWavesPerWorkGroup( 736 MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize); 737 } 738 739 // static wrappers 740 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 741 742 // XXX - Why is this here if it isn't in the default pass set? enableEarlyIfConversion()743 bool enableEarlyIfConversion() const override { 744 return true; 745 } 746 747 void overrideSchedPolicy(MachineSchedPolicy &Policy, 748 unsigned NumRegionInstrs) const override; 749 750 bool isVGPRSpillingEnabled(const Function &F) const; 751 getMaxNumUserSGPRs()752 unsigned getMaxNumUserSGPRs() const { 753 return 16; 754 } 755 hasSMemRealTime()756 bool hasSMemRealTime() const { 757 return HasSMemRealTime; 758 } 759 hasMovrel()760 bool hasMovrel() const { 761 return HasMovrel; 762 } 763 hasVGPRIndexMode()764 bool hasVGPRIndexMode() const { 765 return HasVGPRIndexMode; 766 } 767 useVGPRIndexMode(bool UserEnable)768 bool useVGPRIndexMode(bool UserEnable) const { 769 return !hasMovrel() || (UserEnable && hasVGPRIndexMode()); 770 } 771 hasScalarCompareEq64()772 bool hasScalarCompareEq64() const { 773 return getGeneration() >= VOLCANIC_ISLANDS; 774 } 775 hasScalarStores()776 bool hasScalarStores() const { 777 return HasScalarStores; 778 } 779 hasScalarAtomics()780 bool hasScalarAtomics() const { 781 return HasScalarAtomics; 782 } 783 hasInv2PiInlineImm()784 bool hasInv2PiInlineImm() const { 785 return HasInv2PiInlineImm; 786 } 787 hasDPP()788 bool hasDPP() const { 789 return HasDPP; 790 } 791 enableSIScheduler()792 bool enableSIScheduler() const { 793 return EnableSIScheduler; 794 } 795 debuggerSupported()796 bool debuggerSupported() const { 797 return debuggerInsertNops() && debuggerEmitPrologue(); 798 } 799 debuggerInsertNops()800 bool debuggerInsertNops() const { 801 return DebuggerInsertNops; 802 } 803 debuggerEmitPrologue()804 bool debuggerEmitPrologue() const { 805 return DebuggerEmitPrologue; 806 } 807 loadStoreOptEnabled()808 bool loadStoreOptEnabled() const { 809 return EnableLoadStoreOpt; 810 } 811 hasSGPRInitBug()812 bool hasSGPRInitBug() const { 813 return SGPRInitBug; 814 } 815 has12DWordStoreHazard()816 bool has12DWordStoreHazard() const { 817 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 818 } 819 hasSMovFedHazard()820 bool hasSMovFedHazard() const { 821 return getGeneration() >= AMDGPUSubtarget::GFX9; 822 } 823 hasReadM0MovRelInterpHazard()824 bool hasReadM0MovRelInterpHazard() const { 825 return getGeneration() >= AMDGPUSubtarget::GFX9; 826 } 827 hasReadM0SendMsgHazard()828 bool hasReadM0SendMsgHazard() const { 829 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; 830 } 831 832 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 833 /// SGPRs 834 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 835 836 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 837 /// VGPRs 838 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 839 840 /// \returns true if the flat_scratch register should be initialized with the 841 /// pointer to the wave's scratch memory rather than a size and offset. flatScratchIsPointer()842 bool flatScratchIsPointer() const { 843 return getGeneration() >= AMDGPUSubtarget::GFX9; 844 } 845 846 /// \returns true if the machine has merged shaders in which s0-s7 are 847 /// reserved by the hardware and user SGPRs start at s8 hasMergedShaders()848 bool hasMergedShaders() const { 849 return getGeneration() >= GFX9; 850 } 851 852 /// \returns SGPR allocation granularity supported by the subtarget. getSGPRAllocGranule()853 unsigned getSGPRAllocGranule() const { 854 return AMDGPU::IsaInfo::getSGPRAllocGranule( 855 MCSubtargetInfo::getFeatureBits()); 856 } 857 858 /// \returns SGPR encoding granularity supported by the subtarget. getSGPREncodingGranule()859 unsigned getSGPREncodingGranule() const { 860 return AMDGPU::IsaInfo::getSGPREncodingGranule( 861 MCSubtargetInfo::getFeatureBits()); 862 } 863 864 /// \returns Total number of SGPRs supported by the subtarget. getTotalNumSGPRs()865 unsigned getTotalNumSGPRs() const { 866 return AMDGPU::IsaInfo::getTotalNumSGPRs(MCSubtargetInfo::getFeatureBits()); 867 } 868 869 /// \returns Addressable number of SGPRs supported by the subtarget. getAddressableNumSGPRs()870 unsigned getAddressableNumSGPRs() const { 871 return AMDGPU::IsaInfo::getAddressableNumSGPRs( 872 MCSubtargetInfo::getFeatureBits()); 873 } 874 875 /// \returns Minimum number of SGPRs that meets the given number of waves per 876 /// execution unit requirement supported by the subtarget. getMinNumSGPRs(unsigned WavesPerEU)877 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 878 return AMDGPU::IsaInfo::getMinNumSGPRs(MCSubtargetInfo::getFeatureBits(), 879 WavesPerEU); 880 } 881 882 /// \returns Maximum number of SGPRs that meets the given number of waves per 883 /// execution unit requirement supported by the subtarget. getMaxNumSGPRs(unsigned WavesPerEU,bool Addressable)884 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 885 return AMDGPU::IsaInfo::getMaxNumSGPRs(MCSubtargetInfo::getFeatureBits(), 886 WavesPerEU, Addressable); 887 } 888 889 /// \returns Reserved number of SGPRs for given function \p MF. 890 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 891 892 /// \returns Maximum number of SGPRs that meets number of waves per execution 893 /// unit requirement for function \p MF, or number of SGPRs explicitly 894 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 895 /// 896 /// \returns Value that meets number of waves per execution unit requirement 897 /// if explicitly requested value cannot be converted to integer, violates 898 /// subtarget's specifications, or does not meet number of waves per execution 899 /// unit requirement. 900 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 901 902 /// \returns VGPR allocation granularity supported by the subtarget. getVGPRAllocGranule()903 unsigned getVGPRAllocGranule() const { 904 return AMDGPU::IsaInfo::getVGPRAllocGranule( 905 MCSubtargetInfo::getFeatureBits()); 906 } 907 908 /// \returns VGPR encoding granularity supported by the subtarget. getVGPREncodingGranule()909 unsigned getVGPREncodingGranule() const { 910 return AMDGPU::IsaInfo::getVGPREncodingGranule( 911 MCSubtargetInfo::getFeatureBits()); 912 } 913 914 /// \returns Total number of VGPRs supported by the subtarget. getTotalNumVGPRs()915 unsigned getTotalNumVGPRs() const { 916 return AMDGPU::IsaInfo::getTotalNumVGPRs(MCSubtargetInfo::getFeatureBits()); 917 } 918 919 /// \returns Addressable number of VGPRs supported by the subtarget. getAddressableNumVGPRs()920 unsigned getAddressableNumVGPRs() const { 921 return AMDGPU::IsaInfo::getAddressableNumVGPRs( 922 MCSubtargetInfo::getFeatureBits()); 923 } 924 925 /// \returns Minimum number of VGPRs that meets given number of waves per 926 /// execution unit requirement supported by the subtarget. getMinNumVGPRs(unsigned WavesPerEU)927 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 928 return AMDGPU::IsaInfo::getMinNumVGPRs(MCSubtargetInfo::getFeatureBits(), 929 WavesPerEU); 930 } 931 932 /// \returns Maximum number of VGPRs that meets given number of waves per 933 /// execution unit requirement supported by the subtarget. getMaxNumVGPRs(unsigned WavesPerEU)934 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 935 return AMDGPU::IsaInfo::getMaxNumVGPRs(MCSubtargetInfo::getFeatureBits(), 936 WavesPerEU); 937 } 938 939 /// \returns Maximum number of VGPRs that meets number of waves per execution 940 /// unit requirement for function \p MF, or number of VGPRs explicitly 941 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 942 /// 943 /// \returns Value that meets number of waves per execution unit requirement 944 /// if explicitly requested value cannot be converted to integer, violates 945 /// subtarget's specifications, or does not meet number of waves per execution 946 /// unit requirement. 947 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 948 949 void getPostRAMutations( 950 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 951 const override; 952 }; 953 954 class R600Subtarget final : public R600GenSubtargetInfo, 955 public AMDGPUSubtarget { 956 private: 957 R600InstrInfo InstrInfo; 958 R600FrameLowering FrameLowering; 959 bool FMA; 960 bool CaymanISA; 961 bool CFALUBug; 962 bool DX10Clamp; 963 bool HasVertexCache; 964 bool R600ALUInst; 965 bool FP64; 966 short TexVTXClauseSize; 967 Generation Gen; 968 R600TargetLowering TLInfo; 969 InstrItineraryData InstrItins; 970 SelectionDAGTargetInfo TSInfo; 971 AMDGPUAS AS; 972 973 public: 974 R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, 975 const TargetMachine &TM); 976 getInstrInfo()977 const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; } 978 getFrameLowering()979 const R600FrameLowering *getFrameLowering() const override { 980 return &FrameLowering; 981 } 982 getTargetLowering()983 const R600TargetLowering *getTargetLowering() const override { 984 return &TLInfo; 985 } 986 getRegisterInfo()987 const R600RegisterInfo *getRegisterInfo() const override { 988 return &InstrInfo.getRegisterInfo(); 989 } 990 getInstrItineraryData()991 const InstrItineraryData *getInstrItineraryData() const override { 992 return &InstrItins; 993 } 994 995 // Nothing implemented, just prevent crashes on use. getSelectionDAGInfo()996 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 997 return &TSInfo; 998 } 999 1000 void ParseSubtargetFeatures(StringRef CPU, StringRef FS); 1001 getGeneration()1002 Generation getGeneration() const { 1003 return Gen; 1004 } 1005 getStackAlignment()1006 unsigned getStackAlignment() const { 1007 return 4; 1008 } 1009 1010 R600Subtarget &initializeSubtargetDependencies(const Triple &TT, 1011 StringRef GPU, StringRef FS); 1012 hasBFE()1013 bool hasBFE() const { 1014 return (getGeneration() >= EVERGREEN); 1015 } 1016 hasBFI()1017 bool hasBFI() const { 1018 return (getGeneration() >= EVERGREEN); 1019 } 1020 hasBCNT(unsigned Size)1021 bool hasBCNT(unsigned Size) const { 1022 if (Size == 32) 1023 return (getGeneration() >= EVERGREEN); 1024 1025 return false; 1026 } 1027 hasBORROW()1028 bool hasBORROW() const { 1029 return (getGeneration() >= EVERGREEN); 1030 } 1031 hasCARRY()1032 bool hasCARRY() const { 1033 return (getGeneration() >= EVERGREEN); 1034 } 1035 hasCaymanISA()1036 bool hasCaymanISA() const { 1037 return CaymanISA; 1038 } 1039 hasFFBL()1040 bool hasFFBL() const { 1041 return (getGeneration() >= EVERGREEN); 1042 } 1043 hasFFBH()1044 bool hasFFBH() const { 1045 return (getGeneration() >= EVERGREEN); 1046 } 1047 hasFMA()1048 bool hasFMA() const { return FMA; } 1049 hasCFAluBug()1050 bool hasCFAluBug() const { return CFALUBug; } 1051 hasVertexCache()1052 bool hasVertexCache() const { return HasVertexCache; } 1053 getTexVTXClauseSize()1054 short getTexVTXClauseSize() const { return TexVTXClauseSize; } 1055 getAMDGPUAS()1056 AMDGPUAS getAMDGPUAS() const { return AS; } 1057 enableMachineScheduler()1058 bool enableMachineScheduler() const override { 1059 return true; 1060 } 1061 enableSubRegLiveness()1062 bool enableSubRegLiveness() const override { 1063 return true; 1064 } 1065 }; 1066 1067 } // end namespace llvm 1068 1069 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 1070