1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMDGPU specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 16 17 #include "AMDGPU.h" 18 #include "AMDGPUCallLowering.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "R600FrameLowering.h" 21 #include "R600ISelLowering.h" 22 #include "R600InstrInfo.h" 23 #include "SIFrameLowering.h" 24 #include "SIISelLowering.h" 25 #include "SIInstrInfo.h" 26 #include "Utils/AMDGPUBaseInfo.h" 27 #include "llvm/ADT/Triple.h" 28 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" 29 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 30 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" 31 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" 32 #include "llvm/CodeGen/MachineFunction.h" 33 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 34 #include "llvm/MC/MCInstrItineraries.h" 35 #include "llvm/Support/MathExtras.h" 36 #include <cassert> 37 #include <cstdint> 38 #include <memory> 39 #include <utility> 40 41 #define GET_SUBTARGETINFO_HEADER 42 #include "AMDGPUGenSubtargetInfo.inc" 43 #define GET_SUBTARGETINFO_HEADER 44 #include "R600GenSubtargetInfo.inc" 45 46 namespace llvm { 47 48 class StringRef; 49 50 class AMDGPUSubtarget { 51 public: 52 enum Generation { 53 R600 = 0, 54 R700 = 1, 55 EVERGREEN = 2, 56 NORTHERN_ISLANDS = 3, 57 SOUTHERN_ISLANDS = 4, 58 SEA_ISLANDS = 5, 59 VOLCANIC_ISLANDS = 6, 60 GFX9 = 7, 61 GFX10 = 8 62 }; 63 64 private: 65 Triple TargetTriple; 66 67 protected: 68 bool Has16BitInsts; 69 bool HasMadMixInsts; 70 bool HasMadMacF32Insts; 71 bool HasDsSrc2Insts; 72 bool HasSDWA; 73 bool HasVOP3PInsts; 74 bool HasMulI24; 75 bool HasMulU24; 76 bool HasInv2PiInlineImm; 77 bool HasFminFmaxLegacy; 78 bool EnablePromoteAlloca; 79 bool HasTrigReducedRange; 80 unsigned MaxWavesPerEU; 81 unsigned LocalMemorySize; 82 char WavefrontSizeLog2; 83 84 public: 85 AMDGPUSubtarget(const Triple &TT); 86 87 static const AMDGPUSubtarget &get(const MachineFunction &MF); 88 static const AMDGPUSubtarget &get(const TargetMachine &TM, 89 const Function &F); 90 91 /// \returns Default range flat work group size for a calling convention. 92 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; 93 94 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes 95 /// for function \p F, or minimum/maximum flat work group sizes explicitly 96 /// requested using "amdgpu-flat-work-group-size" attribute attached to 97 /// function \p F. 98 /// 99 /// \returns Subtarget's default values if explicitly requested values cannot 100 /// be converted to integer, or violate subtarget's specifications. 101 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; 102 103 /// \returns Subtarget's default pair of minimum/maximum number of waves per 104 /// execution unit for function \p F, or minimum/maximum number of waves per 105 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute 106 /// attached to function \p F. 107 /// 108 /// \returns Subtarget's default values if explicitly requested values cannot 109 /// be converted to integer, violate subtarget's specifications, or are not 110 /// compatible with minimum/maximum number of waves limited by flat work group 111 /// size, register usage, and/or lds usage. 112 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; 113 114 /// Return the amount of LDS that can be used that will not restrict the 115 /// occupancy lower than WaveCount. 116 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 117 const Function &) const; 118 119 /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if 120 /// the given LDS memory size is the only constraint. 121 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; 122 123 unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; 124 isAmdHsaOS()125 bool isAmdHsaOS() const { 126 return TargetTriple.getOS() == Triple::AMDHSA; 127 } 128 isAmdPalOS()129 bool isAmdPalOS() const { 130 return TargetTriple.getOS() == Triple::AMDPAL; 131 } 132 isMesa3DOS()133 bool isMesa3DOS() const { 134 return TargetTriple.getOS() == Triple::Mesa3D; 135 } 136 isMesaKernel(const Function & F)137 bool isMesaKernel(const Function &F) const { 138 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv()); 139 } 140 isAmdHsaOrMesa(const Function & F)141 bool isAmdHsaOrMesa(const Function &F) const { 142 return isAmdHsaOS() || isMesaKernel(F); 143 } 144 isGCN()145 bool isGCN() const { 146 return TargetTriple.getArch() == Triple::amdgcn; 147 } 148 has16BitInsts()149 bool has16BitInsts() const { 150 return Has16BitInsts; 151 } 152 hasMadMixInsts()153 bool hasMadMixInsts() const { 154 return HasMadMixInsts; 155 } 156 hasMadMacF32Insts()157 bool hasMadMacF32Insts() const { 158 return HasMadMacF32Insts || !isGCN(); 159 } 160 hasDsSrc2Insts()161 bool hasDsSrc2Insts() const { 162 return HasDsSrc2Insts; 163 } 164 hasSDWA()165 bool hasSDWA() const { 166 return HasSDWA; 167 } 168 hasVOP3PInsts()169 bool hasVOP3PInsts() const { 170 return HasVOP3PInsts; 171 } 172 hasMulI24()173 bool hasMulI24() const { 174 return HasMulI24; 175 } 176 hasMulU24()177 bool hasMulU24() const { 178 return HasMulU24; 179 } 180 hasInv2PiInlineImm()181 bool hasInv2PiInlineImm() const { 182 return HasInv2PiInlineImm; 183 } 184 hasFminFmaxLegacy()185 bool hasFminFmaxLegacy() const { 186 return HasFminFmaxLegacy; 187 } 188 hasTrigReducedRange()189 bool hasTrigReducedRange() const { 190 return HasTrigReducedRange; 191 } 192 isPromoteAllocaEnabled()193 bool isPromoteAllocaEnabled() const { 194 return EnablePromoteAlloca; 195 } 196 getWavefrontSize()197 unsigned getWavefrontSize() const { 198 return 1 << WavefrontSizeLog2; 199 } 200 getWavefrontSizeLog2()201 unsigned getWavefrontSizeLog2() const { 202 return WavefrontSizeLog2; 203 } 204 getLocalMemorySize()205 unsigned getLocalMemorySize() const { 206 return LocalMemorySize; 207 } 208 getAlignmentForImplicitArgPtr()209 Align getAlignmentForImplicitArgPtr() const { 210 return isAmdHsaOS() ? Align(8) : Align(4); 211 } 212 213 /// Returns the offset in bytes from the start of the input buffer 214 /// of the first explicit kernel argument. getExplicitKernelArgOffset(const Function & F)215 unsigned getExplicitKernelArgOffset(const Function &F) const { 216 return isAmdHsaOrMesa(F) ? 0 : 36; 217 } 218 219 /// \returns Maximum number of work groups per compute unit supported by the 220 /// subtarget and limited by given \p FlatWorkGroupSize. 221 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; 222 223 /// \returns Minimum flat work group size supported by the subtarget. 224 virtual unsigned getMinFlatWorkGroupSize() const = 0; 225 226 /// \returns Maximum flat work group size supported by the subtarget. 227 virtual unsigned getMaxFlatWorkGroupSize() const = 0; 228 229 /// \returns Number of waves per execution unit required to support the given 230 /// \p FlatWorkGroupSize. 231 virtual unsigned 232 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0; 233 234 /// \returns Minimum number of waves per execution unit supported by the 235 /// subtarget. 236 virtual unsigned getMinWavesPerEU() const = 0; 237 238 /// \returns Maximum number of waves per execution unit supported by the 239 /// subtarget without any kind of limitation. getMaxWavesPerEU()240 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } 241 242 /// Return the maximum workitem ID value in the function, for the given (0, 1, 243 /// 2) dimension. 244 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const; 245 246 /// Creates value range metadata on an workitemid.* intrinsic call or load. 247 bool makeLIDRangeMetadata(Instruction *I) const; 248 249 /// \returns Number of bytes of arguments that are passed to a shader or 250 /// kernel in addition to the explicit ones declared for the function. getImplicitArgNumBytes(const Function & F)251 unsigned getImplicitArgNumBytes(const Function &F) const { 252 if (isMesaKernel(F)) 253 return 16; 254 return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); 255 } 256 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; 257 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; 258 259 /// \returns Corresponsing DWARF register number mapping flavour for the 260 /// \p WavefrontSize. getAMDGPUDwarfFlavour()261 AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const { 262 return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32 263 : AMDGPUDwarfFlavour::Wave64; 264 } 265 ~AMDGPUSubtarget()266 virtual ~AMDGPUSubtarget() {} 267 }; 268 269 class GCNSubtarget : public AMDGPUGenSubtargetInfo, 270 public AMDGPUSubtarget { 271 272 using AMDGPUSubtarget::getMaxWavesPerEU; 273 274 public: 275 enum TrapHandlerAbi { 276 TrapHandlerAbiNone = 0, 277 TrapHandlerAbiHsa = 1 278 }; 279 280 enum TrapID { 281 TrapIDHardwareReserved = 0, 282 TrapIDHSADebugTrap = 1, 283 TrapIDLLVMTrap = 2, 284 TrapIDLLVMDebugTrap = 3, 285 TrapIDDebugBreakpoint = 7, 286 TrapIDDebugReserved8 = 8, 287 TrapIDDebugReservedFE = 0xfe, 288 TrapIDDebugReservedFF = 0xff 289 }; 290 291 enum TrapRegValues { 292 LLVMTrapHandlerRegValue = 1 293 }; 294 295 private: 296 /// GlobalISel related APIs. 297 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 298 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 299 std::unique_ptr<InstructionSelector> InstSelector; 300 std::unique_ptr<LegalizerInfo> Legalizer; 301 std::unique_ptr<RegisterBankInfo> RegBankInfo; 302 303 protected: 304 // Basic subtarget description. 305 Triple TargetTriple; 306 unsigned Gen; 307 InstrItineraryData InstrItins; 308 int LDSBankCount; 309 unsigned MaxPrivateElementSize; 310 311 // Possibly statically set by tablegen, but may want to be overridden. 312 bool FastFMAF32; 313 bool FastDenormalF32; 314 bool HalfRate64Ops; 315 316 // Dynamically set bits that enable features. 317 bool FlatForGlobal; 318 bool AutoWaitcntBeforeBarrier; 319 bool UnalignedScratchAccess; 320 bool UnalignedAccessMode; 321 bool HasApertureRegs; 322 bool EnableXNACK; 323 bool DoesNotSupportXNACK; 324 bool EnableCuMode; 325 bool TrapHandler; 326 327 // Used as options. 328 bool EnableLoadStoreOpt; 329 bool EnableUnsafeDSOffsetFolding; 330 bool EnableSIScheduler; 331 bool EnableDS128; 332 bool EnablePRTStrictNull; 333 bool DumpCode; 334 335 // Subtarget statically properties set by tablegen 336 bool FP64; 337 bool FMA; 338 bool MIMG_R128; 339 bool IsGCN; 340 bool GCN3Encoding; 341 bool CIInsts; 342 bool GFX8Insts; 343 bool GFX9Insts; 344 bool GFX10Insts; 345 bool GFX10_3Insts; 346 bool GFX7GFX8GFX9Insts; 347 bool SGPRInitBug; 348 bool HasSMemRealTime; 349 bool HasIntClamp; 350 bool HasFmaMixInsts; 351 bool HasMovrel; 352 bool HasVGPRIndexMode; 353 bool HasScalarStores; 354 bool HasScalarAtomics; 355 bool HasSDWAOmod; 356 bool HasSDWAScalar; 357 bool HasSDWASdst; 358 bool HasSDWAMac; 359 bool HasSDWAOutModsVOPC; 360 bool HasDPP; 361 bool HasDPP8; 362 bool HasR128A16; 363 bool HasGFX10A16; 364 bool HasG16; 365 bool HasNSAEncoding; 366 bool GFX10_BEncoding; 367 bool HasDLInsts; 368 bool HasDot1Insts; 369 bool HasDot2Insts; 370 bool HasDot3Insts; 371 bool HasDot4Insts; 372 bool HasDot5Insts; 373 bool HasDot6Insts; 374 bool HasMAIInsts; 375 bool HasPkFmacF16Inst; 376 bool HasAtomicFaddInsts; 377 bool EnableSRAMECC; 378 bool DoesNotSupportSRAMECC; 379 bool HasNoSdstCMPX; 380 bool HasVscnt; 381 bool HasGetWaveIdInst; 382 bool HasSMemTimeInst; 383 bool HasRegisterBanking; 384 bool HasVOP3Literal; 385 bool HasNoDataDepHazard; 386 bool FlatAddressSpace; 387 bool FlatInstOffsets; 388 bool FlatGlobalInsts; 389 bool FlatScratchInsts; 390 bool ScalarFlatScratchInsts; 391 bool AddNoCarryInsts; 392 bool HasUnpackedD16VMem; 393 bool R600ALUInst; 394 bool CaymanISA; 395 bool CFALUBug; 396 bool LDSMisalignedBug; 397 bool HasMFMAInlineLiteralBug; 398 bool HasVertexCache; 399 short TexVTXClauseSize; 400 bool UnalignedBufferAccess; 401 bool UnalignedDSAccess; 402 bool ScalarizeGlobal; 403 404 bool HasVcmpxPermlaneHazard; 405 bool HasVMEMtoScalarWriteHazard; 406 bool HasSMEMtoVectorWriteHazard; 407 bool HasInstFwdPrefetchBug; 408 bool HasVcmpxExecWARHazard; 409 bool HasLdsBranchVmemWARHazard; 410 bool HasNSAtoVMEMBug; 411 bool HasOffset3fBug; 412 bool HasFlatSegmentOffsetBug; 413 bool HasImageStoreD16Bug; 414 bool HasImageGather4D16Bug; 415 416 // Dummy feature to use for assembler in tablegen. 417 bool FeatureDisable; 418 419 SelectionDAGTargetInfo TSInfo; 420 private: 421 SIInstrInfo InstrInfo; 422 SITargetLowering TLInfo; 423 SIFrameLowering FrameLowering; 424 425 public: 426 // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. 427 static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); 428 429 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 430 const GCNTargetMachine &TM); 431 ~GCNSubtarget() override; 432 433 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 434 StringRef GPU, StringRef FS); 435 getInstrInfo()436 const SIInstrInfo *getInstrInfo() const override { 437 return &InstrInfo; 438 } 439 getFrameLowering()440 const SIFrameLowering *getFrameLowering() const override { 441 return &FrameLowering; 442 } 443 getTargetLowering()444 const SITargetLowering *getTargetLowering() const override { 445 return &TLInfo; 446 } 447 getRegisterInfo()448 const SIRegisterInfo *getRegisterInfo() const override { 449 return &InstrInfo.getRegisterInfo(); 450 } 451 getCallLowering()452 const CallLowering *getCallLowering() const override { 453 return CallLoweringInfo.get(); 454 } 455 getInlineAsmLowering()456 const InlineAsmLowering *getInlineAsmLowering() const override { 457 return InlineAsmLoweringInfo.get(); 458 } 459 getInstructionSelector()460 InstructionSelector *getInstructionSelector() const override { 461 return InstSelector.get(); 462 } 463 getLegalizerInfo()464 const LegalizerInfo *getLegalizerInfo() const override { 465 return Legalizer.get(); 466 } 467 getRegBankInfo()468 const RegisterBankInfo *getRegBankInfo() const override { 469 return RegBankInfo.get(); 470 } 471 472 // Nothing implemented, just prevent crashes on use. getSelectionDAGInfo()473 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 474 return &TSInfo; 475 } 476 getInstrItineraryData()477 const InstrItineraryData *getInstrItineraryData() const override { 478 return &InstrItins; 479 } 480 481 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 482 getGeneration()483 Generation getGeneration() const { 484 return (Generation)Gen; 485 } 486 487 /// Return the number of high bits known to be zero fror a frame index. getKnownHighZeroBitsForFrameIndex()488 unsigned getKnownHighZeroBitsForFrameIndex() const { 489 return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); 490 } 491 getLDSBankCount()492 int getLDSBankCount() const { 493 return LDSBankCount; 494 } 495 496 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 497 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 498 } 499 500 unsigned getConstantBusLimit(unsigned Opcode) const; 501 hasIntClamp()502 bool hasIntClamp() const { 503 return HasIntClamp; 504 } 505 hasFP64()506 bool hasFP64() const { 507 return FP64; 508 } 509 hasMIMG_R128()510 bool hasMIMG_R128() const { 511 return MIMG_R128; 512 } 513 hasHWFP64()514 bool hasHWFP64() const { 515 return FP64; 516 } 517 hasFastFMAF32()518 bool hasFastFMAF32() const { 519 return FastFMAF32; 520 } 521 hasHalfRate64Ops()522 bool hasHalfRate64Ops() const { 523 return HalfRate64Ops; 524 } 525 hasAddr64()526 bool hasAddr64() const { 527 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 528 } 529 530 // Return true if the target only has the reverse operand versions of VALU 531 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). hasOnlyRevVALUShifts()532 bool hasOnlyRevVALUShifts() const { 533 return getGeneration() >= VOLCANIC_ISLANDS; 534 } 535 hasFractBug()536 bool hasFractBug() const { 537 return getGeneration() == SOUTHERN_ISLANDS; 538 } 539 hasBFE()540 bool hasBFE() const { 541 return true; 542 } 543 hasBFI()544 bool hasBFI() const { 545 return true; 546 } 547 hasBFM()548 bool hasBFM() const { 549 return hasBFE(); 550 } 551 hasBCNT(unsigned Size)552 bool hasBCNT(unsigned Size) const { 553 return true; 554 } 555 hasFFBL()556 bool hasFFBL() const { 557 return true; 558 } 559 hasFFBH()560 bool hasFFBH() const { 561 return true; 562 } 563 hasMed3_16()564 bool hasMed3_16() const { 565 return getGeneration() >= AMDGPUSubtarget::GFX9; 566 } 567 hasMin3Max3_16()568 bool hasMin3Max3_16() const { 569 return getGeneration() >= AMDGPUSubtarget::GFX9; 570 } 571 hasFmaMixInsts()572 bool hasFmaMixInsts() const { 573 return HasFmaMixInsts; 574 } 575 hasCARRY()576 bool hasCARRY() const { 577 return true; 578 } 579 hasFMA()580 bool hasFMA() const { 581 return FMA; 582 } 583 hasSwap()584 bool hasSwap() const { 585 return GFX9Insts; 586 } 587 hasScalarPackInsts()588 bool hasScalarPackInsts() const { 589 return GFX9Insts; 590 } 591 hasScalarMulHiInsts()592 bool hasScalarMulHiInsts() const { 593 return GFX9Insts; 594 } 595 getTrapHandlerAbi()596 TrapHandlerAbi getTrapHandlerAbi() const { 597 return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; 598 } 599 600 /// True if the offset field of DS instructions works as expected. On SI, the 601 /// offset uses a 16-bit adder and does not always wrap properly. hasUsableDSOffset()602 bool hasUsableDSOffset() const { 603 return getGeneration() >= SEA_ISLANDS; 604 } 605 unsafeDSOffsetFoldingEnabled()606 bool unsafeDSOffsetFoldingEnabled() const { 607 return EnableUnsafeDSOffsetFolding; 608 } 609 610 /// Condition output from div_scale is usable. hasUsableDivScaleConditionOutput()611 bool hasUsableDivScaleConditionOutput() const { 612 return getGeneration() != SOUTHERN_ISLANDS; 613 } 614 615 /// Extra wait hazard is needed in some cases before 616 /// s_cbranch_vccnz/s_cbranch_vccz. hasReadVCCZBug()617 bool hasReadVCCZBug() const { 618 return getGeneration() <= SEA_ISLANDS; 619 } 620 621 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. partialVCCWritesUpdateVCCZ()622 bool partialVCCWritesUpdateVCCZ() const { 623 return getGeneration() >= GFX10; 624 } 625 626 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 627 /// was written by a VALU instruction. hasSMRDReadVALUDefHazard()628 bool hasSMRDReadVALUDefHazard() const { 629 return getGeneration() == SOUTHERN_ISLANDS; 630 } 631 632 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 633 /// SGPR was written by a VALU Instruction. hasVMEMReadSGPRVALUDefHazard()634 bool hasVMEMReadSGPRVALUDefHazard() const { 635 return getGeneration() >= VOLCANIC_ISLANDS; 636 } 637 hasRFEHazards()638 bool hasRFEHazards() const { 639 return getGeneration() >= VOLCANIC_ISLANDS; 640 } 641 642 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. getSetRegWaitStates()643 unsigned getSetRegWaitStates() const { 644 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 645 } 646 dumpCode()647 bool dumpCode() const { 648 return DumpCode; 649 } 650 651 /// Return the amount of LDS that can be used that will not restrict the 652 /// occupancy lower than WaveCount. 653 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 654 const Function &) const; 655 supportsMinMaxDenormModes()656 bool supportsMinMaxDenormModes() const { 657 return getGeneration() >= AMDGPUSubtarget::GFX9; 658 } 659 660 /// \returns If target supports S_DENORM_MODE. hasDenormModeInst()661 bool hasDenormModeInst() const { 662 return getGeneration() >= AMDGPUSubtarget::GFX10; 663 } 664 useFlatForGlobal()665 bool useFlatForGlobal() const { 666 return FlatForGlobal; 667 } 668 669 /// \returns If target supports ds_read/write_b128 and user enables generation 670 /// of ds_read/write_b128. useDS128()671 bool useDS128() const { 672 return CIInsts && EnableDS128; 673 } 674 675 /// \return If target supports ds_read/write_b96/128. hasDS96AndDS128()676 bool hasDS96AndDS128() const { 677 return CIInsts; 678 } 679 680 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 haveRoundOpsF64()681 bool haveRoundOpsF64() const { 682 return CIInsts; 683 } 684 685 /// \returns If MUBUF instructions always perform range checking, even for 686 /// buffer resources used for private memory access. privateMemoryResourceIsRangeChecked()687 bool privateMemoryResourceIsRangeChecked() const { 688 return getGeneration() < AMDGPUSubtarget::GFX9; 689 } 690 691 /// \returns If target requires PRT Struct NULL support (zero result registers 692 /// for sparse texture support). usePRTStrictNull()693 bool usePRTStrictNull() const { 694 return EnablePRTStrictNull; 695 } 696 hasAutoWaitcntBeforeBarrier()697 bool hasAutoWaitcntBeforeBarrier() const { 698 return AutoWaitcntBeforeBarrier; 699 } 700 hasUnalignedBufferAccess()701 bool hasUnalignedBufferAccess() const { 702 return UnalignedBufferAccess; 703 } 704 hasUnalignedBufferAccessEnabled()705 bool hasUnalignedBufferAccessEnabled() const { 706 return UnalignedBufferAccess && UnalignedAccessMode; 707 } 708 hasUnalignedDSAccess()709 bool hasUnalignedDSAccess() const { 710 return UnalignedDSAccess; 711 } 712 hasUnalignedDSAccessEnabled()713 bool hasUnalignedDSAccessEnabled() const { 714 return UnalignedDSAccess && UnalignedAccessMode; 715 } 716 hasUnalignedScratchAccess()717 bool hasUnalignedScratchAccess() const { 718 return UnalignedScratchAccess; 719 } 720 hasUnalignedAccessMode()721 bool hasUnalignedAccessMode() const { 722 return UnalignedAccessMode; 723 } 724 hasApertureRegs()725 bool hasApertureRegs() const { 726 return HasApertureRegs; 727 } 728 isTrapHandlerEnabled()729 bool isTrapHandlerEnabled() const { 730 return TrapHandler; 731 } 732 isXNACKEnabled()733 bool isXNACKEnabled() const { 734 return EnableXNACK; 735 } 736 isCuModeEnabled()737 bool isCuModeEnabled() const { 738 return EnableCuMode; 739 } 740 hasFlatAddressSpace()741 bool hasFlatAddressSpace() const { 742 return FlatAddressSpace; 743 } 744 hasFlatScrRegister()745 bool hasFlatScrRegister() const { 746 return hasFlatAddressSpace(); 747 } 748 hasFlatInstOffsets()749 bool hasFlatInstOffsets() const { 750 return FlatInstOffsets; 751 } 752 hasFlatGlobalInsts()753 bool hasFlatGlobalInsts() const { 754 return FlatGlobalInsts; 755 } 756 hasFlatScratchInsts()757 bool hasFlatScratchInsts() const { 758 return FlatScratchInsts; 759 } 760 761 // Check if target supports ST addressing mode with FLAT scratch instructions. 762 // The ST addressing mode means no registers are used, either VGPR or SGPR, 763 // but only immediate offset is swizzled and added to the FLAT scratch base. hasFlatScratchSTMode()764 bool hasFlatScratchSTMode() const { 765 return hasFlatScratchInsts() && hasGFX10_3Insts(); 766 } 767 hasScalarFlatScratchInsts()768 bool hasScalarFlatScratchInsts() const { 769 return ScalarFlatScratchInsts; 770 } 771 hasGlobalAddTidInsts()772 bool hasGlobalAddTidInsts() const { 773 return GFX10_BEncoding; 774 } 775 hasAtomicCSub()776 bool hasAtomicCSub() const { 777 return GFX10_BEncoding; 778 } 779 hasMultiDwordFlatScratchAddressing()780 bool hasMultiDwordFlatScratchAddressing() const { 781 return getGeneration() >= GFX9; 782 } 783 hasFlatSegmentOffsetBug()784 bool hasFlatSegmentOffsetBug() const { 785 return HasFlatSegmentOffsetBug; 786 } 787 hasFlatLgkmVMemCountInOrder()788 bool hasFlatLgkmVMemCountInOrder() const { 789 return getGeneration() > GFX9; 790 } 791 hasD16LoadStore()792 bool hasD16LoadStore() const { 793 return getGeneration() >= GFX9; 794 } 795 d16PreservesUnusedBits()796 bool d16PreservesUnusedBits() const { 797 return hasD16LoadStore() && !isSRAMECCEnabled(); 798 } 799 hasD16Images()800 bool hasD16Images() const { 801 return getGeneration() >= VOLCANIC_ISLANDS; 802 } 803 804 /// Return if most LDS instructions have an m0 use that require m0 to be 805 /// iniitalized. ldsRequiresM0Init()806 bool ldsRequiresM0Init() const { 807 return getGeneration() < GFX9; 808 } 809 810 // True if the hardware rewinds and replays GWS operations if a wave is 811 // preempted. 812 // 813 // If this is false, a GWS operation requires testing if a nack set the 814 // MEM_VIOL bit, and repeating if so. hasGWSAutoReplay()815 bool hasGWSAutoReplay() const { 816 return getGeneration() >= GFX9; 817 } 818 819 /// \returns if target has ds_gws_sema_release_all instruction. hasGWSSemaReleaseAll()820 bool hasGWSSemaReleaseAll() const { 821 return CIInsts; 822 } 823 824 /// \returns true if the target has integer add/sub instructions that do not 825 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 826 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 827 /// for saturation. hasAddNoCarry()828 bool hasAddNoCarry() const { 829 return AddNoCarryInsts; 830 } 831 hasUnpackedD16VMem()832 bool hasUnpackedD16VMem() const { 833 return HasUnpackedD16VMem; 834 } 835 836 // Covers VS/PS/CS graphics shaders isMesaGfxShader(const Function & F)837 bool isMesaGfxShader(const Function &F) const { 838 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 839 } 840 hasMad64_32()841 bool hasMad64_32() const { 842 return getGeneration() >= SEA_ISLANDS; 843 } 844 hasSDWAOmod()845 bool hasSDWAOmod() const { 846 return HasSDWAOmod; 847 } 848 hasSDWAScalar()849 bool hasSDWAScalar() const { 850 return HasSDWAScalar; 851 } 852 hasSDWASdst()853 bool hasSDWASdst() const { 854 return HasSDWASdst; 855 } 856 hasSDWAMac()857 bool hasSDWAMac() const { 858 return HasSDWAMac; 859 } 860 hasSDWAOutModsVOPC()861 bool hasSDWAOutModsVOPC() const { 862 return HasSDWAOutModsVOPC; 863 } 864 hasDLInsts()865 bool hasDLInsts() const { 866 return HasDLInsts; 867 } 868 hasDot1Insts()869 bool hasDot1Insts() const { 870 return HasDot1Insts; 871 } 872 hasDot2Insts()873 bool hasDot2Insts() const { 874 return HasDot2Insts; 875 } 876 hasDot3Insts()877 bool hasDot3Insts() const { 878 return HasDot3Insts; 879 } 880 hasDot4Insts()881 bool hasDot4Insts() const { 882 return HasDot4Insts; 883 } 884 hasDot5Insts()885 bool hasDot5Insts() const { 886 return HasDot5Insts; 887 } 888 hasDot6Insts()889 bool hasDot6Insts() const { 890 return HasDot6Insts; 891 } 892 hasMAIInsts()893 bool hasMAIInsts() const { 894 return HasMAIInsts; 895 } 896 hasPkFmacF16Inst()897 bool hasPkFmacF16Inst() const { 898 return HasPkFmacF16Inst; 899 } 900 hasAtomicFaddInsts()901 bool hasAtomicFaddInsts() const { 902 return HasAtomicFaddInsts; 903 } 904 isSRAMECCEnabled()905 bool isSRAMECCEnabled() const { 906 return EnableSRAMECC; 907 } 908 hasNoSdstCMPX()909 bool hasNoSdstCMPX() const { 910 return HasNoSdstCMPX; 911 } 912 hasVscnt()913 bool hasVscnt() const { 914 return HasVscnt; 915 } 916 hasGetWaveIdInst()917 bool hasGetWaveIdInst() const { 918 return HasGetWaveIdInst; 919 } 920 hasSMemTimeInst()921 bool hasSMemTimeInst() const { 922 return HasSMemTimeInst; 923 } 924 hasRegisterBanking()925 bool hasRegisterBanking() const { 926 return HasRegisterBanking; 927 } 928 hasVOP3Literal()929 bool hasVOP3Literal() const { 930 return HasVOP3Literal; 931 } 932 hasNoDataDepHazard()933 bool hasNoDataDepHazard() const { 934 return HasNoDataDepHazard; 935 } 936 vmemWriteNeedsExpWaitcnt()937 bool vmemWriteNeedsExpWaitcnt() const { 938 return getGeneration() < SEA_ISLANDS; 939 } 940 941 // Scratch is allocated in 256 dword per wave blocks for the entire 942 // wavefront. When viewed from the perspecive of an arbitrary workitem, this 943 // is 4-byte aligned. 944 // 945 // Only 4-byte alignment is really needed to access anything. Transformations 946 // on the pointer value itself may rely on the alignment / known low bits of 947 // the pointer. Set this to something above the minimum to avoid needing 948 // dynamic realignment in common cases. getStackAlignment()949 Align getStackAlignment() const { return Align(16); } 950 enableMachineScheduler()951 bool enableMachineScheduler() const override { 952 return true; 953 } 954 955 bool useAA() const override; 956 enableSubRegLiveness()957 bool enableSubRegLiveness() const override { 958 return true; 959 } 960 setScalarizeGlobalBehavior(bool b)961 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } getScalarizeGlobalBehavior()962 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 963 964 // static wrappers 965 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 966 967 // XXX - Why is this here if it isn't in the default pass set? enableEarlyIfConversion()968 bool enableEarlyIfConversion() const override { 969 return true; 970 } 971 972 bool enableFlatScratch() const; 973 974 void overrideSchedPolicy(MachineSchedPolicy &Policy, 975 unsigned NumRegionInstrs) const override; 976 getMaxNumUserSGPRs()977 unsigned getMaxNumUserSGPRs() const { 978 return 16; 979 } 980 hasSMemRealTime()981 bool hasSMemRealTime() const { 982 return HasSMemRealTime; 983 } 984 hasMovrel()985 bool hasMovrel() const { 986 return HasMovrel; 987 } 988 hasVGPRIndexMode()989 bool hasVGPRIndexMode() const { 990 return HasVGPRIndexMode; 991 } 992 993 bool useVGPRIndexMode() const; 994 hasScalarCompareEq64()995 bool hasScalarCompareEq64() const { 996 return getGeneration() >= VOLCANIC_ISLANDS; 997 } 998 hasScalarStores()999 bool hasScalarStores() const { 1000 return HasScalarStores; 1001 } 1002 hasScalarAtomics()1003 bool hasScalarAtomics() const { 1004 return HasScalarAtomics; 1005 } 1006 hasLDSFPAtomics()1007 bool hasLDSFPAtomics() const { 1008 return GFX8Insts; 1009 } 1010 hasDPP()1011 bool hasDPP() const { 1012 return HasDPP; 1013 } 1014 hasDPPBroadcasts()1015 bool hasDPPBroadcasts() const { 1016 return HasDPP && getGeneration() < GFX10; 1017 } 1018 hasDPPWavefrontShifts()1019 bool hasDPPWavefrontShifts() const { 1020 return HasDPP && getGeneration() < GFX10; 1021 } 1022 hasDPP8()1023 bool hasDPP8() const { 1024 return HasDPP8; 1025 } 1026 hasR128A16()1027 bool hasR128A16() const { 1028 return HasR128A16; 1029 } 1030 hasGFX10A16()1031 bool hasGFX10A16() const { 1032 return HasGFX10A16; 1033 } 1034 hasA16()1035 bool hasA16() const { return hasR128A16() || hasGFX10A16(); } 1036 hasG16()1037 bool hasG16() const { return HasG16; } 1038 hasOffset3fBug()1039 bool hasOffset3fBug() const { 1040 return HasOffset3fBug; 1041 } 1042 hasImageStoreD16Bug()1043 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 1044 hasImageGather4D16Bug()1045 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 1046 hasNSAEncoding()1047 bool hasNSAEncoding() const { return HasNSAEncoding; } 1048 hasGFX10_BEncoding()1049 bool hasGFX10_BEncoding() const { 1050 return GFX10_BEncoding; 1051 } 1052 hasGFX10_3Insts()1053 bool hasGFX10_3Insts() const { 1054 return GFX10_3Insts; 1055 } 1056 1057 bool hasMadF16() const; 1058 enableSIScheduler()1059 bool enableSIScheduler() const { 1060 return EnableSIScheduler; 1061 } 1062 loadStoreOptEnabled()1063 bool loadStoreOptEnabled() const { 1064 return EnableLoadStoreOpt; 1065 } 1066 hasSGPRInitBug()1067 bool hasSGPRInitBug() const { 1068 return SGPRInitBug; 1069 } 1070 hasMFMAInlineLiteralBug()1071 bool hasMFMAInlineLiteralBug() const { 1072 return HasMFMAInlineLiteralBug; 1073 } 1074 has12DWordStoreHazard()1075 bool has12DWordStoreHazard() const { 1076 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 1077 } 1078 1079 // \returns true if the subtarget supports DWORDX3 load/store instructions. hasDwordx3LoadStores()1080 bool hasDwordx3LoadStores() const { 1081 return CIInsts; 1082 } 1083 hasReadM0MovRelInterpHazard()1084 bool hasReadM0MovRelInterpHazard() const { 1085 return getGeneration() == AMDGPUSubtarget::GFX9; 1086 } 1087 hasReadM0SendMsgHazard()1088 bool hasReadM0SendMsgHazard() const { 1089 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 1090 getGeneration() <= AMDGPUSubtarget::GFX9; 1091 } 1092 hasVcmpxPermlaneHazard()1093 bool hasVcmpxPermlaneHazard() const { 1094 return HasVcmpxPermlaneHazard; 1095 } 1096 hasVMEMtoScalarWriteHazard()1097 bool hasVMEMtoScalarWriteHazard() const { 1098 return HasVMEMtoScalarWriteHazard; 1099 } 1100 hasSMEMtoVectorWriteHazard()1101 bool hasSMEMtoVectorWriteHazard() const { 1102 return HasSMEMtoVectorWriteHazard; 1103 } 1104 hasLDSMisalignedBug()1105 bool hasLDSMisalignedBug() const { 1106 return LDSMisalignedBug && !EnableCuMode; 1107 } 1108 hasInstFwdPrefetchBug()1109 bool hasInstFwdPrefetchBug() const { 1110 return HasInstFwdPrefetchBug; 1111 } 1112 hasVcmpxExecWARHazard()1113 bool hasVcmpxExecWARHazard() const { 1114 return HasVcmpxExecWARHazard; 1115 } 1116 hasLdsBranchVmemWARHazard()1117 bool hasLdsBranchVmemWARHazard() const { 1118 return HasLdsBranchVmemWARHazard; 1119 } 1120 hasNSAtoVMEMBug()1121 bool hasNSAtoVMEMBug() const { 1122 return HasNSAtoVMEMBug; 1123 } 1124 hasHardClauses()1125 bool hasHardClauses() const { return getGeneration() >= GFX10; } 1126 1127 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1128 /// SGPRs 1129 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1130 1131 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1132 /// VGPRs 1133 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1134 1135 /// Return occupancy for the given function. Used LDS and a number of 1136 /// registers if provided. 1137 /// Note, occupancy can be affected by the scratch allocation as well, but 1138 /// we do not have enough information to compute it. 1139 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 1140 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 1141 1142 /// \returns true if the flat_scratch register should be initialized with the 1143 /// pointer to the wave's scratch memory rather than a size and offset. flatScratchIsPointer()1144 bool flatScratchIsPointer() const { 1145 return getGeneration() >= AMDGPUSubtarget::GFX9; 1146 } 1147 1148 /// \returns true if the machine has merged shaders in which s0-s7 are 1149 /// reserved by the hardware and user SGPRs start at s8 hasMergedShaders()1150 bool hasMergedShaders() const { 1151 return getGeneration() >= GFX9; 1152 } 1153 1154 /// \returns SGPR allocation granularity supported by the subtarget. getSGPRAllocGranule()1155 unsigned getSGPRAllocGranule() const { 1156 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1157 } 1158 1159 /// \returns SGPR encoding granularity supported by the subtarget. getSGPREncodingGranule()1160 unsigned getSGPREncodingGranule() const { 1161 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1162 } 1163 1164 /// \returns Total number of SGPRs supported by the subtarget. getTotalNumSGPRs()1165 unsigned getTotalNumSGPRs() const { 1166 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1167 } 1168 1169 /// \returns Addressable number of SGPRs supported by the subtarget. getAddressableNumSGPRs()1170 unsigned getAddressableNumSGPRs() const { 1171 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1172 } 1173 1174 /// \returns Minimum number of SGPRs that meets the given number of waves per 1175 /// execution unit requirement supported by the subtarget. getMinNumSGPRs(unsigned WavesPerEU)1176 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1177 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1178 } 1179 1180 /// \returns Maximum number of SGPRs that meets the given number of waves per 1181 /// execution unit requirement supported by the subtarget. getMaxNumSGPRs(unsigned WavesPerEU,bool Addressable)1182 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1183 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1184 } 1185 1186 /// \returns Reserved number of SGPRs for given function \p MF. 1187 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1188 1189 /// \returns Maximum number of SGPRs that meets number of waves per execution 1190 /// unit requirement for function \p MF, or number of SGPRs explicitly 1191 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1192 /// 1193 /// \returns Value that meets number of waves per execution unit requirement 1194 /// if explicitly requested value cannot be converted to integer, violates 1195 /// subtarget's specifications, or does not meet number of waves per execution 1196 /// unit requirement. 1197 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1198 1199 /// \returns VGPR allocation granularity supported by the subtarget. getVGPRAllocGranule()1200 unsigned getVGPRAllocGranule() const { 1201 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1202 } 1203 1204 /// \returns VGPR encoding granularity supported by the subtarget. getVGPREncodingGranule()1205 unsigned getVGPREncodingGranule() const { 1206 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1207 } 1208 1209 /// \returns Total number of VGPRs supported by the subtarget. getTotalNumVGPRs()1210 unsigned getTotalNumVGPRs() const { 1211 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1212 } 1213 1214 /// \returns Addressable number of VGPRs supported by the subtarget. getAddressableNumVGPRs()1215 unsigned getAddressableNumVGPRs() const { 1216 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1217 } 1218 1219 /// \returns Minimum number of VGPRs that meets given number of waves per 1220 /// execution unit requirement supported by the subtarget. getMinNumVGPRs(unsigned WavesPerEU)1221 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1222 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1223 } 1224 1225 /// \returns Maximum number of VGPRs that meets given number of waves per 1226 /// execution unit requirement supported by the subtarget. getMaxNumVGPRs(unsigned WavesPerEU)1227 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1228 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1229 } 1230 1231 /// \returns Maximum number of VGPRs that meets number of waves per execution 1232 /// unit requirement for function \p MF, or number of VGPRs explicitly 1233 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1234 /// 1235 /// \returns Value that meets number of waves per execution unit requirement 1236 /// if explicitly requested value cannot be converted to integer, violates 1237 /// subtarget's specifications, or does not meet number of waves per execution 1238 /// unit requirement. 1239 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1240 1241 void getPostRAMutations( 1242 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1243 const override; 1244 isWave32()1245 bool isWave32() const { 1246 return getWavefrontSize() == 32; 1247 } 1248 isWave64()1249 bool isWave64() const { 1250 return getWavefrontSize() == 64; 1251 } 1252 getBoolRC()1253 const TargetRegisterClass *getBoolRC() const { 1254 return getRegisterInfo()->getBoolRC(); 1255 } 1256 1257 /// \returns Maximum number of work groups per compute unit supported by the 1258 /// subtarget and limited by given \p FlatWorkGroupSize. getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)1259 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1260 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1261 } 1262 1263 /// \returns Minimum flat work group size supported by the subtarget. getMinFlatWorkGroupSize()1264 unsigned getMinFlatWorkGroupSize() const override { 1265 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1266 } 1267 1268 /// \returns Maximum flat work group size supported by the subtarget. getMaxFlatWorkGroupSize()1269 unsigned getMaxFlatWorkGroupSize() const override { 1270 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1271 } 1272 1273 /// \returns Number of waves per execution unit required to support the given 1274 /// \p FlatWorkGroupSize. 1275 unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize)1276 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1277 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1278 } 1279 1280 /// \returns Minimum number of waves per execution unit supported by the 1281 /// subtarget. getMinWavesPerEU()1282 unsigned getMinWavesPerEU() const override { 1283 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1284 } 1285 1286 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1287 SDep &Dep) const override; 1288 }; 1289 1290 class R600Subtarget final : public R600GenSubtargetInfo, 1291 public AMDGPUSubtarget { 1292 private: 1293 R600InstrInfo InstrInfo; 1294 R600FrameLowering FrameLowering; 1295 bool FMA; 1296 bool CaymanISA; 1297 bool CFALUBug; 1298 bool HasVertexCache; 1299 bool R600ALUInst; 1300 bool FP64; 1301 short TexVTXClauseSize; 1302 Generation Gen; 1303 R600TargetLowering TLInfo; 1304 InstrItineraryData InstrItins; 1305 SelectionDAGTargetInfo TSInfo; 1306 1307 public: 1308 R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, 1309 const TargetMachine &TM); 1310 getInstrInfo()1311 const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; } 1312 getFrameLowering()1313 const R600FrameLowering *getFrameLowering() const override { 1314 return &FrameLowering; 1315 } 1316 getTargetLowering()1317 const R600TargetLowering *getTargetLowering() const override { 1318 return &TLInfo; 1319 } 1320 getRegisterInfo()1321 const R600RegisterInfo *getRegisterInfo() const override { 1322 return &InstrInfo.getRegisterInfo(); 1323 } 1324 getInstrItineraryData()1325 const InstrItineraryData *getInstrItineraryData() const override { 1326 return &InstrItins; 1327 } 1328 1329 // Nothing implemented, just prevent crashes on use. getSelectionDAGInfo()1330 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 1331 return &TSInfo; 1332 } 1333 1334 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 1335 getGeneration()1336 Generation getGeneration() const { 1337 return Gen; 1338 } 1339 getStackAlignment()1340 Align getStackAlignment() const { return Align(4); } 1341 1342 R600Subtarget &initializeSubtargetDependencies(const Triple &TT, 1343 StringRef GPU, StringRef FS); 1344 hasBFE()1345 bool hasBFE() const { 1346 return (getGeneration() >= EVERGREEN); 1347 } 1348 hasBFI()1349 bool hasBFI() const { 1350 return (getGeneration() >= EVERGREEN); 1351 } 1352 hasBCNT(unsigned Size)1353 bool hasBCNT(unsigned Size) const { 1354 if (Size == 32) 1355 return (getGeneration() >= EVERGREEN); 1356 1357 return false; 1358 } 1359 hasBORROW()1360 bool hasBORROW() const { 1361 return (getGeneration() >= EVERGREEN); 1362 } 1363 hasCARRY()1364 bool hasCARRY() const { 1365 return (getGeneration() >= EVERGREEN); 1366 } 1367 hasCaymanISA()1368 bool hasCaymanISA() const { 1369 return CaymanISA; 1370 } 1371 hasFFBL()1372 bool hasFFBL() const { 1373 return (getGeneration() >= EVERGREEN); 1374 } 1375 hasFFBH()1376 bool hasFFBH() const { 1377 return (getGeneration() >= EVERGREEN); 1378 } 1379 hasFMA()1380 bool hasFMA() const { return FMA; } 1381 hasCFAluBug()1382 bool hasCFAluBug() const { return CFALUBug; } 1383 hasVertexCache()1384 bool hasVertexCache() const { return HasVertexCache; } 1385 getTexVTXClauseSize()1386 short getTexVTXClauseSize() const { return TexVTXClauseSize; } 1387 enableMachineScheduler()1388 bool enableMachineScheduler() const override { 1389 return true; 1390 } 1391 enableSubRegLiveness()1392 bool enableSubRegLiveness() const override { 1393 return true; 1394 } 1395 1396 /// \returns Maximum number of work groups per compute unit supported by the 1397 /// subtarget and limited by given \p FlatWorkGroupSize. getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)1398 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1399 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1400 } 1401 1402 /// \returns Minimum flat work group size supported by the subtarget. getMinFlatWorkGroupSize()1403 unsigned getMinFlatWorkGroupSize() const override { 1404 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1405 } 1406 1407 /// \returns Maximum flat work group size supported by the subtarget. getMaxFlatWorkGroupSize()1408 unsigned getMaxFlatWorkGroupSize() const override { 1409 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1410 } 1411 1412 /// \returns Number of waves per execution unit required to support the given 1413 /// \p FlatWorkGroupSize. 1414 unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize)1415 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1416 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1417 } 1418 1419 /// \returns Minimum number of waves per execution unit supported by the 1420 /// subtarget. getMinWavesPerEU()1421 unsigned getMinWavesPerEU() const override { 1422 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1423 } 1424 }; 1425 1426 } // end namespace llvm 1427 1428 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 1429