• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with close by immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows it to have a 13bit constant offset and then promotes the 13bit offset
25 // to the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 //   one pair, and recomputes live intervals and moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough to each other that their differences
//   fit in 8 bits, we can add to the base pointer and use the new reduced
//   offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
/// Coarse classification of the memory instructions this pass looks at.
/// UNKNOWN is the value getInstClass returns for opcodes it does not handle.
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,

  // The two values below are never used as the InstClass of any CombineInfo;
  // they are only ever returned by getCommonInstClass.
  GLOBAL_LOAD,
  GLOBAL_STORE
};
92 
/// Describes which address operands an opcode carries. getRegs fills it in
/// and CombineInfo::setMI turns each set flag into an operand index.
struct AddressRegs {
  unsigned char NumVAddrs{0}; // number of consecutive operands from vaddr0
  bool SBase{false};          // has an "sbase" operand
  bool SRsrc{false};          // has an "srsrc" operand
  bool SOffset{false};        // has an "soffset" operand
  bool SAddr{false};          // has an "saddr" operand
  bool VAddr{false};          // has a "vaddr" operand
  bool Addr{false};           // has an "addr" operand
  bool SSamp{false};          // has an "ssamp" operand
};
103 
// Upper bound on the address operands tracked per instruction. The largest
// case is a GFX10 image_sample instruction: 12 vaddrs + srsrc + ssamp.
constexpr unsigned MaxAddressRegs = /*vaddrs*/ 12 + /*srsrc*/ 1 + /*ssamp*/ 1;
106 
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108   struct CombineInfo {
109     MachineBasicBlock::iterator I;
110     unsigned EltSize;
111     unsigned Offset;
112     unsigned Width;
113     unsigned Format;
114     unsigned BaseOff;
115     unsigned DMask;
116     InstClassEnum InstClass;
117     unsigned CPol = 0;
118     bool IsAGPR;
119     bool UseST64;
120     int AddrIdx[MaxAddressRegs];
121     const MachineOperand *AddrReg[MaxAddressRegs];
122     unsigned NumAddresses;
123     unsigned Order;
124 
hasSameBaseAddress__anon7587433f0111::SILoadStoreOptimizer::CombineInfo125     bool hasSameBaseAddress(const CombineInfo &CI) {
126       if (NumAddresses != CI.NumAddresses)
127         return false;
128 
129       const MachineInstr &MI = *CI.I;
130       for (unsigned i = 0; i < NumAddresses; i++) {
131         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 
133         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136             return false;
137           }
138           continue;
139         }
140 
141         // Check same base pointer. Be careful of subregisters, which can occur
142         // with vectors of pointers.
143         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145          return false;
146         }
147       }
148       return true;
149     }
150 
hasMergeableAddress__anon7587433f0111::SILoadStoreOptimizer::CombineInfo151     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152       for (unsigned i = 0; i < NumAddresses; ++i) {
153         const MachineOperand *AddrOp = AddrReg[i];
154         // Immediates are always OK.
155         if (AddrOp->isImm())
156           continue;
157 
158         // Don't try to merge addresses that aren't either immediates or registers.
159         // TODO: Should be possible to merge FrameIndexes and maybe some other
160         // non-register
161         if (!AddrOp->isReg())
162           return false;
163 
164         // TODO: We should be able to merge physical reg addresses.
165         if (AddrOp->getReg().isPhysical())
166           return false;
167 
168         // If an address has only one use then there will be no other
169         // instructions with the same address, so we can't merge this one.
170         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
171           return false;
172       }
173       return true;
174     }
175 
176     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
177 
178     // Compare by pointer order.
operator <__anon7587433f0111::SILoadStoreOptimizer::CombineInfo179     bool operator<(const CombineInfo& Other) const {
180       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
181     }
182   };
183 
184   struct BaseRegisters {
185     Register LoReg;
186     Register HiReg;
187 
188     unsigned LoSubReg = 0;
189     unsigned HiSubReg = 0;
190   };
191 
192   struct MemAddress {
193     BaseRegisters Base;
194     int64_t Offset = 0;
195   };
196 
197   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
198 
199 private:
200   const GCNSubtarget *STM = nullptr;
201   const SIInstrInfo *TII = nullptr;
202   const SIRegisterInfo *TRI = nullptr;
203   MachineRegisterInfo *MRI = nullptr;
204   AliasAnalysis *AA = nullptr;
205   bool OptimizeAgain;
206 
207   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
208                            const DenseSet<Register> &ARegUses,
209                            const MachineInstr &A, const MachineInstr &B) const;
210   static bool dmasksCanBeCombined(const CombineInfo &CI,
211                                   const SIInstrInfo &TII,
212                                   const CombineInfo &Paired);
213   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
214                                    CombineInfo &Paired, bool Modify = false);
215   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
216                         const CombineInfo &Paired);
217   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
218   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
219                                                      const CombineInfo &Paired);
220   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
221                                                     const CombineInfo &Paired);
222   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
223 
224   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
225 
226   unsigned read2Opcode(unsigned EltSize) const;
227   unsigned read2ST64Opcode(unsigned EltSize) const;
228   MachineBasicBlock::iterator
229   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
230                  MachineBasicBlock::iterator InsertBefore);
231 
232   unsigned write2Opcode(unsigned EltSize) const;
233   unsigned write2ST64Opcode(unsigned EltSize) const;
234   MachineBasicBlock::iterator
235   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
236                   MachineBasicBlock::iterator InsertBefore);
237   MachineBasicBlock::iterator
238   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
239                  MachineBasicBlock::iterator InsertBefore);
240   MachineBasicBlock::iterator
241   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
242                        MachineBasicBlock::iterator InsertBefore);
243   MachineBasicBlock::iterator
244   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
245                       MachineBasicBlock::iterator InsertBefore);
246   MachineBasicBlock::iterator
247   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
248                        MachineBasicBlock::iterator InsertBefore);
249   MachineBasicBlock::iterator
250   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
251                        MachineBasicBlock::iterator InsertBefore);
252   MachineBasicBlock::iterator
253   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
254                         MachineBasicBlock::iterator InsertBefore);
255   MachineBasicBlock::iterator
256   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
257                     MachineBasicBlock::iterator InsertBefore);
258   MachineBasicBlock::iterator
259   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
260                      MachineBasicBlock::iterator InsertBefore);
261 
262   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
263                            int32_t NewOffset) const;
264   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
265   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
266   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
267   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
268   /// Promotes constant offset to the immediate by adjusting the base. It
269   /// tries to use a base from the nearby instructions that allows it to have
270   /// a 13bit constant offset which gets promoted to the immediate.
271   bool promoteConstantOffsetToImm(MachineInstr &CI,
272                                   MemInfoMap &Visited,
273                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
274   void addInstToMergeableList(const CombineInfo &CI,
275                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
276 
277   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
278       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
279       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
280       std::list<std::list<CombineInfo>> &MergeableInsts) const;
281 
282   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
283                                                      const CombineInfo &Paired);
284 
285   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
286                                           const CombineInfo &Paired);
287 
288 public:
289   static char ID;
290 
SILoadStoreOptimizer()291   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
292     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
293   }
294 
295   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
296                                      bool &OptimizeListAgain);
297   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
298 
299   bool runOnMachineFunction(MachineFunction &MF) override;
300 
getPassName() const301   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
302 
getAnalysisUsage(AnalysisUsage & AU) const303   void getAnalysisUsage(AnalysisUsage &AU) const override {
304     AU.setPreservesCFG();
305     AU.addRequired<AAResultsWrapperPass>();
306 
307     MachineFunctionPass::getAnalysisUsage(AU);
308   }
309 
getRequiredProperties() const310   MachineFunctionProperties getRequiredProperties() const override {
311     return MachineFunctionProperties()
312       .set(MachineFunctionProperties::Property::IsSSA);
313   }
314 };
315 
getOpcodeWidth(const MachineInstr & MI,const SIInstrInfo & TII)316 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
317   const unsigned Opc = MI.getOpcode();
318 
319   if (TII.isMUBUF(Opc)) {
320     // FIXME: Handle d16 correctly
321     return AMDGPU::getMUBUFElements(Opc);
322   }
323   if (TII.isMIMG(MI)) {
324     uint64_t DMaskImm =
325         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
326     return llvm::popcount(DMaskImm);
327   }
328   if (TII.isMTBUF(Opc)) {
329     return AMDGPU::getMTBUFElements(Opc);
330   }
331 
332   switch (Opc) {
333   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
334   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
335   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
336   case AMDGPU::S_LOAD_DWORD_IMM:
337   case AMDGPU::GLOBAL_LOAD_DWORD:
338   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
339   case AMDGPU::GLOBAL_STORE_DWORD:
340   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
341   case AMDGPU::FLAT_LOAD_DWORD:
342   case AMDGPU::FLAT_STORE_DWORD:
343     return 1;
344   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
345   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
346   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347   case AMDGPU::S_LOAD_DWORDX2_IMM:
348   case AMDGPU::GLOBAL_LOAD_DWORDX2:
349   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
350   case AMDGPU::GLOBAL_STORE_DWORDX2:
351   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
352   case AMDGPU::FLAT_LOAD_DWORDX2:
353   case AMDGPU::FLAT_STORE_DWORDX2:
354     return 2;
355   case AMDGPU::GLOBAL_LOAD_DWORDX3:
356   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
357   case AMDGPU::GLOBAL_STORE_DWORDX3:
358   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
359   case AMDGPU::FLAT_LOAD_DWORDX3:
360   case AMDGPU::FLAT_STORE_DWORDX3:
361     return 3;
362   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
363   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
364   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
365   case AMDGPU::S_LOAD_DWORDX4_IMM:
366   case AMDGPU::GLOBAL_LOAD_DWORDX4:
367   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
368   case AMDGPU::GLOBAL_STORE_DWORDX4:
369   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
370   case AMDGPU::FLAT_LOAD_DWORDX4:
371   case AMDGPU::FLAT_STORE_DWORDX4:
372     return 4;
373   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
374   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
375   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
376   case AMDGPU::S_LOAD_DWORDX8_IMM:
377     return 8;
378   case AMDGPU::DS_READ_B32:      [[fallthrough]];
379   case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
380   case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
381   case AMDGPU::DS_WRITE_B32_gfx9:
382     return 1;
383   case AMDGPU::DS_READ_B64:      [[fallthrough]];
384   case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
385   case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
386   case AMDGPU::DS_WRITE_B64_gfx9:
387     return 2;
388   default:
389     return 0;
390   }
391 }
392 
393 /// Maps instruction opcode to enum InstClassEnum.
getInstClass(unsigned Opc,const SIInstrInfo & TII)394 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
395   switch (Opc) {
396   default:
397     if (TII.isMUBUF(Opc)) {
398       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
399       default:
400         return UNKNOWN;
401       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
402       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
403       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
404       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
405         return BUFFER_LOAD;
406       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
407       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
408       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
409       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
410         return BUFFER_STORE;
411       }
412     }
413     if (TII.isMIMG(Opc)) {
414       // Ignore instructions encoded without vaddr.
415       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
416           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
417         return UNKNOWN;
418       // Ignore BVH instructions
419       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
420         return UNKNOWN;
421       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
422       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
423           TII.isGather4(Opc))
424         return UNKNOWN;
425       return MIMG;
426     }
427     if (TII.isMTBUF(Opc)) {
428       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
429       default:
430         return UNKNOWN;
431       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
432       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
433       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
434       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
435         return TBUFFER_LOAD;
436       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
437       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
438       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
439       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
440         return TBUFFER_STORE;
441       }
442     }
443     return UNKNOWN;
444   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
445   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
446   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
447   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
448     return S_BUFFER_LOAD_IMM;
449   // For the purposes of this optimization SGPR variants of buffer loads
450   // are considered to be zero-offsetted SGPR_IMM loads.
451   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
452   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
453   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
454   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
455   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
456   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
457   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
458   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
459     return S_BUFFER_LOAD_SGPR_IMM;
460   case AMDGPU::S_LOAD_DWORD_IMM:
461   case AMDGPU::S_LOAD_DWORDX2_IMM:
462   case AMDGPU::S_LOAD_DWORDX4_IMM:
463   case AMDGPU::S_LOAD_DWORDX8_IMM:
464     return S_LOAD_IMM;
465   case AMDGPU::DS_READ_B32:
466   case AMDGPU::DS_READ_B32_gfx9:
467   case AMDGPU::DS_READ_B64:
468   case AMDGPU::DS_READ_B64_gfx9:
469     return DS_READ;
470   case AMDGPU::DS_WRITE_B32:
471   case AMDGPU::DS_WRITE_B32_gfx9:
472   case AMDGPU::DS_WRITE_B64:
473   case AMDGPU::DS_WRITE_B64_gfx9:
474     return DS_WRITE;
475   case AMDGPU::GLOBAL_LOAD_DWORD:
476   case AMDGPU::GLOBAL_LOAD_DWORDX2:
477   case AMDGPU::GLOBAL_LOAD_DWORDX3:
478   case AMDGPU::GLOBAL_LOAD_DWORDX4:
479   case AMDGPU::FLAT_LOAD_DWORD:
480   case AMDGPU::FLAT_LOAD_DWORDX2:
481   case AMDGPU::FLAT_LOAD_DWORDX3:
482   case AMDGPU::FLAT_LOAD_DWORDX4:
483     return FLAT_LOAD;
484   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
485   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
486   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
487   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
488     return GLOBAL_LOAD_SADDR;
489   case AMDGPU::GLOBAL_STORE_DWORD:
490   case AMDGPU::GLOBAL_STORE_DWORDX2:
491   case AMDGPU::GLOBAL_STORE_DWORDX3:
492   case AMDGPU::GLOBAL_STORE_DWORDX4:
493   case AMDGPU::FLAT_STORE_DWORD:
494   case AMDGPU::FLAT_STORE_DWORDX2:
495   case AMDGPU::FLAT_STORE_DWORDX3:
496   case AMDGPU::FLAT_STORE_DWORDX4:
497     return FLAT_STORE;
498   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
499   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
500   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
501   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
502     return GLOBAL_STORE_SADDR;
503   }
504 }
505 
506 /// Determines instruction subclass from opcode. Only instructions
507 /// of the same subclass can be merged together. The merged instruction may have
508 /// a different subclass but must have the same class.
getInstSubclass(unsigned Opc,const SIInstrInfo & TII)509 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
510   switch (Opc) {
511   default:
512     if (TII.isMUBUF(Opc))
513       return AMDGPU::getMUBUFBaseOpcode(Opc);
514     if (TII.isMIMG(Opc)) {
515       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
516       assert(Info);
517       return Info->BaseOpcode;
518     }
519     if (TII.isMTBUF(Opc))
520       return AMDGPU::getMTBUFBaseOpcode(Opc);
521     return -1;
522   case AMDGPU::DS_READ_B32:
523   case AMDGPU::DS_READ_B32_gfx9:
524   case AMDGPU::DS_READ_B64:
525   case AMDGPU::DS_READ_B64_gfx9:
526   case AMDGPU::DS_WRITE_B32:
527   case AMDGPU::DS_WRITE_B32_gfx9:
528   case AMDGPU::DS_WRITE_B64:
529   case AMDGPU::DS_WRITE_B64_gfx9:
530     return Opc;
531   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
532   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
533   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
534   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
535     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
536   // For the purposes of this optimization SGPR variants of buffer loads
537   // are considered to be zero-offsetted SGPR_IMM loads.
538   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
539   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
540   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
541   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
542   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
543   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
544   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
545   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
546     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
547   case AMDGPU::S_LOAD_DWORD_IMM:
548   case AMDGPU::S_LOAD_DWORDX2_IMM:
549   case AMDGPU::S_LOAD_DWORDX4_IMM:
550   case AMDGPU::S_LOAD_DWORDX8_IMM:
551     return AMDGPU::S_LOAD_DWORD_IMM;
552   case AMDGPU::GLOBAL_LOAD_DWORD:
553   case AMDGPU::GLOBAL_LOAD_DWORDX2:
554   case AMDGPU::GLOBAL_LOAD_DWORDX3:
555   case AMDGPU::GLOBAL_LOAD_DWORDX4:
556   case AMDGPU::FLAT_LOAD_DWORD:
557   case AMDGPU::FLAT_LOAD_DWORDX2:
558   case AMDGPU::FLAT_LOAD_DWORDX3:
559   case AMDGPU::FLAT_LOAD_DWORDX4:
560     return AMDGPU::FLAT_LOAD_DWORD;
561   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
562   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
563   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
564   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
565     return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
566   case AMDGPU::GLOBAL_STORE_DWORD:
567   case AMDGPU::GLOBAL_STORE_DWORDX2:
568   case AMDGPU::GLOBAL_STORE_DWORDX3:
569   case AMDGPU::GLOBAL_STORE_DWORDX4:
570   case AMDGPU::FLAT_STORE_DWORD:
571   case AMDGPU::FLAT_STORE_DWORDX2:
572   case AMDGPU::FLAT_STORE_DWORDX3:
573   case AMDGPU::FLAT_STORE_DWORDX4:
574     return AMDGPU::FLAT_STORE_DWORD;
575   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
576   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
577   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
578   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
579     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
580   }
581 }
582 
583 // GLOBAL loads and stores are classified as FLAT initially. If both combined
584 // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
585 // If either or both instructions are non segment specific FLAT the resulting
586 // combined operation will be FLAT, potentially promoting one of the GLOBAL
587 // operations to FLAT.
588 // For other instructions return the original unmodified class.
589 InstClassEnum
getCommonInstClass(const CombineInfo & CI,const CombineInfo & Paired)590 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
591                                          const CombineInfo &Paired) {
592   assert(CI.InstClass == Paired.InstClass);
593 
594   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
595       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
596     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
597 
598   return CI.InstClass;
599 }
600 
getRegs(unsigned Opc,const SIInstrInfo & TII)601 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
602   AddressRegs Result;
603 
604   if (TII.isMUBUF(Opc)) {
605     if (AMDGPU::getMUBUFHasVAddr(Opc))
606       Result.VAddr = true;
607     if (AMDGPU::getMUBUFHasSrsrc(Opc))
608       Result.SRsrc = true;
609     if (AMDGPU::getMUBUFHasSoffset(Opc))
610       Result.SOffset = true;
611 
612     return Result;
613   }
614 
615   if (TII.isMIMG(Opc)) {
616     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
617     if (VAddr0Idx >= 0) {
618       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
619       Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
620     } else {
621       Result.VAddr = true;
622     }
623     Result.SRsrc = true;
624     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
625     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
626       Result.SSamp = true;
627 
628     return Result;
629   }
630   if (TII.isMTBUF(Opc)) {
631     if (AMDGPU::getMTBUFHasVAddr(Opc))
632       Result.VAddr = true;
633     if (AMDGPU::getMTBUFHasSrsrc(Opc))
634       Result.SRsrc = true;
635     if (AMDGPU::getMTBUFHasSoffset(Opc))
636       Result.SOffset = true;
637 
638     return Result;
639   }
640 
641   switch (Opc) {
642   default:
643     return Result;
644   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
645   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
646   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
647   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
648   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
649   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
650   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
651   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
652     Result.SOffset = true;
653     [[fallthrough]];
654   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
655   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
656   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
657   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
658   case AMDGPU::S_LOAD_DWORD_IMM:
659   case AMDGPU::S_LOAD_DWORDX2_IMM:
660   case AMDGPU::S_LOAD_DWORDX4_IMM:
661   case AMDGPU::S_LOAD_DWORDX8_IMM:
662     Result.SBase = true;
663     return Result;
664   case AMDGPU::DS_READ_B32:
665   case AMDGPU::DS_READ_B64:
666   case AMDGPU::DS_READ_B32_gfx9:
667   case AMDGPU::DS_READ_B64_gfx9:
668   case AMDGPU::DS_WRITE_B32:
669   case AMDGPU::DS_WRITE_B64:
670   case AMDGPU::DS_WRITE_B32_gfx9:
671   case AMDGPU::DS_WRITE_B64_gfx9:
672     Result.Addr = true;
673     return Result;
674   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
675   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
676   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
677   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
678   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
679   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
680   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
681   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
682     Result.SAddr = true;
683     [[fallthrough]];
684   case AMDGPU::GLOBAL_LOAD_DWORD:
685   case AMDGPU::GLOBAL_LOAD_DWORDX2:
686   case AMDGPU::GLOBAL_LOAD_DWORDX3:
687   case AMDGPU::GLOBAL_LOAD_DWORDX4:
688   case AMDGPU::GLOBAL_STORE_DWORD:
689   case AMDGPU::GLOBAL_STORE_DWORDX2:
690   case AMDGPU::GLOBAL_STORE_DWORDX3:
691   case AMDGPU::GLOBAL_STORE_DWORDX4:
692   case AMDGPU::FLAT_LOAD_DWORD:
693   case AMDGPU::FLAT_LOAD_DWORDX2:
694   case AMDGPU::FLAT_LOAD_DWORDX3:
695   case AMDGPU::FLAT_LOAD_DWORDX4:
696   case AMDGPU::FLAT_STORE_DWORD:
697   case AMDGPU::FLAT_STORE_DWORDX2:
698   case AMDGPU::FLAT_STORE_DWORDX3:
699   case AMDGPU::FLAT_STORE_DWORDX4:
700     Result.VAddr = true;
701     return Result;
702   }
703 }
704 
setMI(MachineBasicBlock::iterator MI,const SILoadStoreOptimizer & LSO)705 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
706                                               const SILoadStoreOptimizer &LSO) {
707   I = MI;
708   unsigned Opc = MI->getOpcode();
709   InstClass = getInstClass(Opc, *LSO.TII);
710 
711   if (InstClass == UNKNOWN)
712     return;
713 
714   IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
715 
716   switch (InstClass) {
717   case DS_READ:
718    EltSize =
719           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
720                                                                           : 4;
721    break;
722   case DS_WRITE:
723     EltSize =
724           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
725                                                                             : 4;
726     break;
727   case S_BUFFER_LOAD_IMM:
728   case S_BUFFER_LOAD_SGPR_IMM:
729   case S_LOAD_IMM:
730     EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
731     break;
732   default:
733     EltSize = 4;
734     break;
735   }
736 
737   if (InstClass == MIMG) {
738     DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
739     // Offset is not considered for MIMG instructions.
740     Offset = 0;
741   } else {
742     int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
743     Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm();
744   }
745 
746   if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
747     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
748 
749   Width = getOpcodeWidth(*I, *LSO.TII);
750 
751   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
752     Offset &= 0xffff;
753   } else if (InstClass != MIMG) {
754     CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
755   }
756 
757   AddressRegs Regs = getRegs(Opc, *LSO.TII);
758 
759   NumAddresses = 0;
760   for (unsigned J = 0; J < Regs.NumVAddrs; J++)
761     AddrIdx[NumAddresses++] =
762         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
763   if (Regs.Addr)
764     AddrIdx[NumAddresses++] =
765         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
766   if (Regs.SBase)
767     AddrIdx[NumAddresses++] =
768         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
769   if (Regs.SRsrc)
770     AddrIdx[NumAddresses++] =
771         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
772   if (Regs.SOffset)
773     AddrIdx[NumAddresses++] =
774         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
775   if (Regs.SAddr)
776     AddrIdx[NumAddresses++] =
777         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
778   if (Regs.VAddr)
779     AddrIdx[NumAddresses++] =
780         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
781   if (Regs.SSamp)
782     AddrIdx[NumAddresses++] =
783         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
784   assert(NumAddresses <= MaxAddressRegs);
785 
786   for (unsigned J = 0; J < NumAddresses; J++)
787     AddrReg[J] = &I->getOperand(AddrIdx[J]);
788 }
789 
790 } // end anonymous namespace.
791 
792 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
793                       "SI Load Store Optimizer", false, false)
794 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
795 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
796                     false, false)
797 
798 char SILoadStoreOptimizer::ID = 0;
799 
800 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
801 
createSILoadStoreOptimizerPass()802 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
803   return new SILoadStoreOptimizer();
804 }
805 
addDefsUsesToList(const MachineInstr & MI,DenseSet<Register> & RegDefs,DenseSet<Register> & RegUses)806 static void addDefsUsesToList(const MachineInstr &MI,
807                               DenseSet<Register> &RegDefs,
808                               DenseSet<Register> &RegUses) {
809   for (const auto &Op : MI.operands()) {
810     if (!Op.isReg())
811       continue;
812     if (Op.isDef())
813       RegDefs.insert(Op.getReg());
814     if (Op.readsReg())
815       RegUses.insert(Op.getReg());
816   }
817 }
818 
canSwapInstructions(const DenseSet<Register> & ARegDefs,const DenseSet<Register> & ARegUses,const MachineInstr & A,const MachineInstr & B) const819 bool SILoadStoreOptimizer::canSwapInstructions(
820     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
821     const MachineInstr &A, const MachineInstr &B) const {
822   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
823       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
824     return false;
825   for (const auto &BOp : B.operands()) {
826     if (!BOp.isReg())
827       continue;
828     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
829       return false;
830     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
831       return false;
832   }
833   return true;
834 }
835 
836 // Given that \p CI and \p Paired are adjacent memory operations produce a new
837 // MMO for the combined operation with a new access size.
838 MachineMemOperand *
combineKnownAdjacentMMOs(const CombineInfo & CI,const CombineInfo & Paired)839 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
840                                                const CombineInfo &Paired) {
841   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
842   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
843 
844   unsigned Size = MMOa->getSize() + MMOb->getSize();
845 
846   // A base pointer for the combined operation is the same as the leading
847   // operation's pointer.
848   if (Paired < CI)
849     std::swap(MMOa, MMOb);
850 
851   MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
852   // If merging FLAT and GLOBAL set address space to FLAT.
853   if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
854     PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
855 
856   MachineFunction *MF = CI.I->getMF();
857   return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
858 }
859 
dmasksCanBeCombined(const CombineInfo & CI,const SIInstrInfo & TII,const CombineInfo & Paired)860 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
861                                                const SIInstrInfo &TII,
862                                                const CombineInfo &Paired) {
863   assert(CI.InstClass == MIMG);
864 
865   // Ignore instructions with tfe/lwe set.
866   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
867   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
868 
869   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
870     return false;
871 
872   // Check other optional immediate operands for equality.
873   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
874                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
875                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
876 
877   for (auto op : OperandsToMatch) {
878     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
879     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
880       return false;
881     if (Idx != -1 &&
882         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
883       return false;
884   }
885 
886   // Check DMask for overlaps.
887   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
888   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
889 
890   unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
891   if ((1u << AllowedBitsForMin) <= MinMask)
892     return false;
893 
894   return true;
895 }
896 
getBufferFormatWithCompCount(unsigned OldFormat,unsigned ComponentCount,const GCNSubtarget & STI)897 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
898                                        unsigned ComponentCount,
899                                        const GCNSubtarget &STI) {
900   if (ComponentCount > 4)
901     return 0;
902 
903   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
904       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
905   if (!OldFormatInfo)
906     return 0;
907 
908   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
909       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
910                                            ComponentCount,
911                                            OldFormatInfo->NumFormat, STI);
912 
913   if (!NewFormatInfo)
914     return 0;
915 
916   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
917          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
918 
919   return NewFormatInfo->Format;
920 }
921 
922 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
923 // highest power of two. Note that the result is well defined for all inputs
924 // including corner cases like:
925 // - if Lo == Hi, return that value
926 // - if Lo == 0, return 0 (even though the "- 1" below underflows
927 // - if Lo > Hi, return 0 (as if the range wrapped around)
mostAlignedValueInRange(uint32_t Lo,uint32_t Hi)928 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
929   return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
930 }
931 
offsetsCanBeCombined(CombineInfo & CI,const GCNSubtarget & STI,CombineInfo & Paired,bool Modify)932 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
933                                                 const GCNSubtarget &STI,
934                                                 CombineInfo &Paired,
935                                                 bool Modify) {
936   assert(CI.InstClass != MIMG);
937 
938   // XXX - Would the same offset be OK? Is there any reason this would happen or
939   // be useful?
940   if (CI.Offset == Paired.Offset)
941     return false;
942 
943   // This won't be valid if the offset isn't aligned.
944   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
945     return false;
946 
947   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
948 
949     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
950         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
951     if (!Info0)
952       return false;
953     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
954         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
955     if (!Info1)
956       return false;
957 
958     if (Info0->BitsPerComp != Info1->BitsPerComp ||
959         Info0->NumFormat != Info1->NumFormat)
960       return false;
961 
962     // TODO: Should be possible to support more formats, but if format loads
963     // are not dword-aligned, the merged load might not be valid.
964     if (Info0->BitsPerComp != 32)
965       return false;
966 
967     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
968       return false;
969   }
970 
971   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
972   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
973   CI.UseST64 = false;
974   CI.BaseOff = 0;
975 
976   // Handle all non-DS instructions.
977   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
978     return (EltOffset0 + CI.Width == EltOffset1 ||
979             EltOffset1 + Paired.Width == EltOffset0) &&
980            CI.CPol == Paired.CPol;
981   }
982 
983   // If the offset in elements doesn't fit in 8-bits, we might be able to use
984   // the stride 64 versions.
985   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
986       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
987     if (Modify) {
988       CI.Offset = EltOffset0 / 64;
989       Paired.Offset = EltOffset1 / 64;
990       CI.UseST64 = true;
991     }
992     return true;
993   }
994 
995   // Check if the new offsets fit in the reduced 8-bit range.
996   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
997     if (Modify) {
998       CI.Offset = EltOffset0;
999       Paired.Offset = EltOffset1;
1000     }
1001     return true;
1002   }
1003 
1004   // Try to shift base address to decrease offsets.
1005   uint32_t Min = std::min(EltOffset0, EltOffset1);
1006   uint32_t Max = std::max(EltOffset0, EltOffset1);
1007 
1008   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1009   if (((Max - Min) & ~Mask) == 0) {
1010     if (Modify) {
1011       // From the range of values we could use for BaseOff, choose the one that
1012       // is aligned to the highest power of two, to maximise the chance that
1013       // the same offset can be reused for other load/store pairs.
1014       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1015       // Copy the low bits of the offsets, so that when we adjust them by
1016       // subtracting BaseOff they will be multiples of 64.
1017       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1018       CI.BaseOff = BaseOff * CI.EltSize;
1019       CI.Offset = (EltOffset0 - BaseOff) / 64;
1020       Paired.Offset = (EltOffset1 - BaseOff) / 64;
1021       CI.UseST64 = true;
1022     }
1023     return true;
1024   }
1025 
1026   if (isUInt<8>(Max - Min)) {
1027     if (Modify) {
1028       // From the range of values we could use for BaseOff, choose the one that
1029       // is aligned to the highest power of two, to maximise the chance that
1030       // the same offset can be reused for other load/store pairs.
1031       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1032       CI.BaseOff = BaseOff * CI.EltSize;
1033       CI.Offset = EltOffset0 - BaseOff;
1034       Paired.Offset = EltOffset1 - BaseOff;
1035     }
1036     return true;
1037   }
1038 
1039   return false;
1040 }
1041 
widthsFit(const GCNSubtarget & STM,const CombineInfo & CI,const CombineInfo & Paired)1042 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1043                                      const CombineInfo &CI,
1044                                      const CombineInfo &Paired) {
1045   const unsigned Width = (CI.Width + Paired.Width);
1046   switch (CI.InstClass) {
1047   default:
1048     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1049   case S_BUFFER_LOAD_IMM:
1050   case S_BUFFER_LOAD_SGPR_IMM:
1051   case S_LOAD_IMM:
1052     switch (Width) {
1053     default:
1054       return false;
1055     case 2:
1056     case 4:
1057     case 8:
1058       return true;
1059     }
1060   }
1061 }
1062 
1063 const TargetRegisterClass *
getDataRegClass(const MachineInstr & MI) const1064 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1065   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1066     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1067   }
1068   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1069     return TRI->getRegClassForReg(*MRI, Src->getReg());
1070   }
1071   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1072     return TRI->getRegClassForReg(*MRI, Src->getReg());
1073   }
1074   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1075     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1076   }
1077   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1078     return TRI->getRegClassForReg(*MRI, Src->getReg());
1079   }
1080   return nullptr;
1081 }
1082 
1083 /// This function assumes that CI comes before Paired in a basic block. Return
1084 /// an insertion point for the merged instruction or nullptr on failure.
1085 SILoadStoreOptimizer::CombineInfo *
checkAndPrepareMerge(CombineInfo & CI,CombineInfo & Paired)1086 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1087                                            CombineInfo &Paired) {
1088   // If another instruction has already been merged into CI, it may now be a
1089   // type that we can't do any further merging into.
1090   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1091     return nullptr;
1092   assert(CI.InstClass == Paired.InstClass);
1093 
1094   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1095       getInstSubclass(Paired.I->getOpcode(), *TII))
1096     return nullptr;
1097 
1098   // Check both offsets (or masks for MIMG) can be combined and fit in the
1099   // reduced range.
1100   if (CI.InstClass == MIMG) {
1101     if (!dmasksCanBeCombined(CI, *TII, Paired))
1102       return nullptr;
1103   } else {
1104     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1105       return nullptr;
1106   }
1107 
1108   DenseSet<Register> RegDefs;
1109   DenseSet<Register> RegUses;
1110   CombineInfo *Where;
1111   if (CI.I->mayLoad()) {
1112     // Try to hoist Paired up to CI.
1113     addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1114     for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1115       if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1116         return nullptr;
1117     }
1118     Where = &CI;
1119   } else {
1120     // Try to sink CI down to Paired.
1121     addDefsUsesToList(*CI.I, RegDefs, RegUses);
1122     for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1123       if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1124         return nullptr;
1125     }
1126     Where = &Paired;
1127   }
1128 
1129   // Call offsetsCanBeCombined with modify = true so that the offsets are
1130   // correct for the new instruction.  This should return true, because
1131   // this function should only be called on CombineInfo objects that
1132   // have already been confirmed to be mergeable.
1133   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1134     offsetsCanBeCombined(CI, *STM, Paired, true);
1135   return Where;
1136 }
1137 
read2Opcode(unsigned EltSize) const1138 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1139   if (STM->ldsRequiresM0Init())
1140     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1141   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1142 }
1143 
read2ST64Opcode(unsigned EltSize) const1144 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1145   if (STM->ldsRequiresM0Init())
1146     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1147 
1148   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1149                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1150 }
1151 
1152 MachineBasicBlock::iterator
mergeRead2Pair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1153 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1154                                      MachineBasicBlock::iterator InsertBefore) {
1155   MachineBasicBlock *MBB = CI.I->getParent();
1156 
1157   // Be careful, since the addresses could be subregisters themselves in weird
1158   // cases, like vectors of pointers.
1159   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1160 
1161   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1162   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1163 
1164   unsigned NewOffset0 = CI.Offset;
1165   unsigned NewOffset1 = Paired.Offset;
1166   unsigned Opc =
1167       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1168 
1169   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1170   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1171 
1172   if (NewOffset0 > NewOffset1) {
1173     // Canonicalize the merged instruction so the smaller offset comes first.
1174     std::swap(NewOffset0, NewOffset1);
1175     std::swap(SubRegIdx0, SubRegIdx1);
1176   }
1177 
1178   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1179          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1180 
1181   const MCInstrDesc &Read2Desc = TII->get(Opc);
1182 
1183   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1184   Register DestReg = MRI->createVirtualRegister(SuperRC);
1185 
1186   DebugLoc DL = CI.I->getDebugLoc();
1187 
1188   Register BaseReg = AddrReg->getReg();
1189   unsigned BaseSubReg = AddrReg->getSubReg();
1190   unsigned BaseRegFlags = 0;
1191   if (CI.BaseOff) {
1192     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1193     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1194         .addImm(CI.BaseOff);
1195 
1196     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1197     BaseRegFlags = RegState::Kill;
1198 
1199     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1200         .addReg(ImmReg)
1201         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1202         .addImm(0); // clamp bit
1203     BaseSubReg = 0;
1204   }
1205 
1206   MachineInstrBuilder Read2 =
1207       BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1208           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1209           .addImm(NewOffset0)                        // offset0
1210           .addImm(NewOffset1)                        // offset1
1211           .addImm(0)                                 // gds
1212           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1213 
1214   (void)Read2;
1215 
1216   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1217 
1218   // Copy to the old destination registers.
1219   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1220       .add(*Dest0) // Copy to same destination including flags and sub reg.
1221       .addReg(DestReg, 0, SubRegIdx0);
1222   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1223       .add(*Dest1)
1224       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1225 
1226   CI.I->eraseFromParent();
1227   Paired.I->eraseFromParent();
1228 
1229   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1230   return Read2;
1231 }
1232 
write2Opcode(unsigned EltSize) const1233 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1234   if (STM->ldsRequiresM0Init())
1235     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1236   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1237                         : AMDGPU::DS_WRITE2_B64_gfx9;
1238 }
1239 
write2ST64Opcode(unsigned EltSize) const1240 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1241   if (STM->ldsRequiresM0Init())
1242     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1243                           : AMDGPU::DS_WRITE2ST64_B64;
1244 
1245   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1246                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1247 }
1248 
mergeWrite2Pair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1249 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1250     CombineInfo &CI, CombineInfo &Paired,
1251     MachineBasicBlock::iterator InsertBefore) {
1252   MachineBasicBlock *MBB = CI.I->getParent();
1253 
1254   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1255   // sure we preserve the subregister index and any register flags set on them.
1256   const MachineOperand *AddrReg =
1257       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1258   const MachineOperand *Data0 =
1259       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1260   const MachineOperand *Data1 =
1261       TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1262 
1263   unsigned NewOffset0 = CI.Offset;
1264   unsigned NewOffset1 = Paired.Offset;
1265   unsigned Opc =
1266       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1267 
1268   if (NewOffset0 > NewOffset1) {
1269     // Canonicalize the merged instruction so the smaller offset comes first.
1270     std::swap(NewOffset0, NewOffset1);
1271     std::swap(Data0, Data1);
1272   }
1273 
1274   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1275          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1276 
1277   const MCInstrDesc &Write2Desc = TII->get(Opc);
1278   DebugLoc DL = CI.I->getDebugLoc();
1279 
1280   Register BaseReg = AddrReg->getReg();
1281   unsigned BaseSubReg = AddrReg->getSubReg();
1282   unsigned BaseRegFlags = 0;
1283   if (CI.BaseOff) {
1284     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1285     BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1286         .addImm(CI.BaseOff);
1287 
1288     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1289     BaseRegFlags = RegState::Kill;
1290 
1291     TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1292         .addReg(ImmReg)
1293         .addReg(AddrReg->getReg(), 0, BaseSubReg)
1294         .addImm(0); // clamp bit
1295     BaseSubReg = 0;
1296   }
1297 
1298   MachineInstrBuilder Write2 =
1299       BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1300           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1301           .add(*Data0)                               // data0
1302           .add(*Data1)                               // data1
1303           .addImm(NewOffset0)                        // offset0
1304           .addImm(NewOffset1)                        // offset1
1305           .addImm(0)                                 // gds
1306           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1307 
1308   CI.I->eraseFromParent();
1309   Paired.I->eraseFromParent();
1310 
1311   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1312   return Write2;
1313 }
1314 
1315 MachineBasicBlock::iterator
mergeImagePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1316 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1317                                      MachineBasicBlock::iterator InsertBefore) {
1318   MachineBasicBlock *MBB = CI.I->getParent();
1319   DebugLoc DL = CI.I->getDebugLoc();
1320   const unsigned Opcode = getNewOpcode(CI, Paired);
1321 
1322   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1323 
1324   Register DestReg = MRI->createVirtualRegister(SuperRC);
1325   unsigned MergedDMask = CI.DMask | Paired.DMask;
1326   unsigned DMaskIdx =
1327       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1328 
1329   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1330   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1331     if (I == DMaskIdx)
1332       MIB.addImm(MergedDMask);
1333     else
1334       MIB.add((*CI.I).getOperand(I));
1335   }
1336 
1337   // It shouldn't be possible to get this far if the two instructions
1338   // don't have a single memoperand, because MachineInstr::mayAlias()
1339   // will return true if this is the case.
1340   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1341 
1342   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1343 
1344   unsigned SubRegIdx0, SubRegIdx1;
1345   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1346 
1347   // Copy to the old destination registers.
1348   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1349   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1350   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1351 
1352   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1353       .add(*Dest0) // Copy to same destination including flags and sub reg.
1354       .addReg(DestReg, 0, SubRegIdx0);
1355   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1356       .add(*Dest1)
1357       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1358 
1359   CI.I->eraseFromParent();
1360   Paired.I->eraseFromParent();
1361   return New;
1362 }
1363 
mergeSMemLoadImmPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1364 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1365     CombineInfo &CI, CombineInfo &Paired,
1366     MachineBasicBlock::iterator InsertBefore) {
1367   MachineBasicBlock *MBB = CI.I->getParent();
1368   DebugLoc DL = CI.I->getDebugLoc();
1369   const unsigned Opcode = getNewOpcode(CI, Paired);
1370 
1371   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1372 
1373   Register DestReg = MRI->createVirtualRegister(SuperRC);
1374   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1375 
1376   // It shouldn't be possible to get this far if the two instructions
1377   // don't have a single memoperand, because MachineInstr::mayAlias()
1378   // will return true if this is the case.
1379   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1380 
1381   MachineInstrBuilder New =
1382       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1383           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1384   if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1385     New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1386   // For convenience, when SGPR_IMM buffer loads are merged into a
1387   // zero-offset load, we generate its SGPR variant.
1388   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::offset))
1389     New.addImm(MergedOffset);
1390   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1391 
1392   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1393   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1394   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1395 
1396   // Copy to the old destination registers.
1397   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1398   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1399   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1400 
1401   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1402       .add(*Dest0) // Copy to same destination including flags and sub reg.
1403       .addReg(DestReg, 0, SubRegIdx0);
1404   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1405       .add(*Dest1)
1406       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1407 
1408   CI.I->eraseFromParent();
1409   Paired.I->eraseFromParent();
1410   return New;
1411 }
1412 
mergeBufferLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1413 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1414     CombineInfo &CI, CombineInfo &Paired,
1415     MachineBasicBlock::iterator InsertBefore) {
1416   MachineBasicBlock *MBB = CI.I->getParent();
1417   DebugLoc DL = CI.I->getDebugLoc();
1418 
1419   const unsigned Opcode = getNewOpcode(CI, Paired);
1420 
1421   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1422 
1423   // Copy to the new source register.
1424   Register DestReg = MRI->createVirtualRegister(SuperRC);
1425   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1426 
1427   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1428 
1429   AddressRegs Regs = getRegs(Opcode, *TII);
1430 
1431   if (Regs.VAddr)
1432     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1433 
1434   // It shouldn't be possible to get this far if the two instructions
1435   // don't have a single memoperand, because MachineInstr::mayAlias()
1436   // will return true if this is the case.
1437   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1438 
1439   MachineInstr *New =
1440     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1441         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1442         .addImm(MergedOffset) // offset
1443         .addImm(CI.CPol)      // cpol
1444         .addImm(0)            // swz
1445         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1446 
1447   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1448   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1449   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1450 
1451   // Copy to the old destination registers.
1452   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1453   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1454   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1455 
1456   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1457       .add(*Dest0) // Copy to same destination including flags and sub reg.
1458       .addReg(DestReg, 0, SubRegIdx0);
1459   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1460       .add(*Dest1)
1461       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1462 
1463   CI.I->eraseFromParent();
1464   Paired.I->eraseFromParent();
1465   return New;
1466 }
1467 
mergeTBufferLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1468 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1469     CombineInfo &CI, CombineInfo &Paired,
1470     MachineBasicBlock::iterator InsertBefore) {
1471   MachineBasicBlock *MBB = CI.I->getParent();
1472   DebugLoc DL = CI.I->getDebugLoc();
1473 
1474   const unsigned Opcode = getNewOpcode(CI, Paired);
1475 
1476   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1477 
1478   // Copy to the new source register.
1479   Register DestReg = MRI->createVirtualRegister(SuperRC);
1480   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1481 
1482   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1483 
1484   AddressRegs Regs = getRegs(Opcode, *TII);
1485 
1486   if (Regs.VAddr)
1487     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1488 
1489   unsigned JoinedFormat =
1490       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1491 
1492   // It shouldn't be possible to get this far if the two instructions
1493   // don't have a single memoperand, because MachineInstr::mayAlias()
1494   // will return true if this is the case.
1495   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1496 
1497   MachineInstr *New =
1498       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1499           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1500           .addImm(MergedOffset) // offset
1501           .addImm(JoinedFormat) // format
1502           .addImm(CI.CPol)      // cpol
1503           .addImm(0)            // swz
1504           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1505 
1506   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1507   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1508   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1509 
1510   // Copy to the old destination registers.
1511   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1512   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1513   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1514 
1515   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1516       .add(*Dest0) // Copy to same destination including flags and sub reg.
1517       .addReg(DestReg, 0, SubRegIdx0);
1518   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1519       .add(*Dest1)
1520       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1521 
1522   CI.I->eraseFromParent();
1523   Paired.I->eraseFromParent();
1524   return New;
1525 }
1526 
mergeTBufferStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1527 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1528     CombineInfo &CI, CombineInfo &Paired,
1529     MachineBasicBlock::iterator InsertBefore) {
1530   MachineBasicBlock *MBB = CI.I->getParent();
1531   DebugLoc DL = CI.I->getDebugLoc();
1532 
1533   const unsigned Opcode = getNewOpcode(CI, Paired);
1534 
1535   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1536   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1537   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1538 
1539   // Copy to the new source register.
1540   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1541   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1542 
1543   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1544   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1545 
1546   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1547       .add(*Src0)
1548       .addImm(SubRegIdx0)
1549       .add(*Src1)
1550       .addImm(SubRegIdx1);
1551 
1552   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1553                  .addReg(SrcReg, RegState::Kill);
1554 
1555   AddressRegs Regs = getRegs(Opcode, *TII);
1556 
1557   if (Regs.VAddr)
1558     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1559 
1560   unsigned JoinedFormat =
1561       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1562 
1563   // It shouldn't be possible to get this far if the two instructions
1564   // don't have a single memoperand, because MachineInstr::mayAlias()
1565   // will return true if this is the case.
1566   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1567 
1568   MachineInstr *New =
1569       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1570           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1571           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1572           .addImm(JoinedFormat)                     // format
1573           .addImm(CI.CPol)                          // cpol
1574           .addImm(0)                                // swz
1575           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1576 
1577   CI.I->eraseFromParent();
1578   Paired.I->eraseFromParent();
1579   return New;
1580 }
1581 
mergeFlatLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1582 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1583     CombineInfo &CI, CombineInfo &Paired,
1584     MachineBasicBlock::iterator InsertBefore) {
1585   MachineBasicBlock *MBB = CI.I->getParent();
1586   DebugLoc DL = CI.I->getDebugLoc();
1587 
1588   const unsigned Opcode = getNewOpcode(CI, Paired);
1589 
1590   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1591   Register DestReg = MRI->createVirtualRegister(SuperRC);
1592 
1593   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1594 
1595   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1596     MIB.add(*SAddr);
1597 
1598   MachineInstr *New =
1599     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1600        .addImm(std::min(CI.Offset, Paired.Offset))
1601        .addImm(CI.CPol)
1602        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1603 
1604   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1605   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1606   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1607 
1608   // Copy to the old destination registers.
1609   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1610   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1611   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1612 
1613   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1614       .add(*Dest0) // Copy to same destination including flags and sub reg.
1615       .addReg(DestReg, 0, SubRegIdx0);
1616   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1617       .add(*Dest1)
1618       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1619 
1620   CI.I->eraseFromParent();
1621   Paired.I->eraseFromParent();
1622   return New;
1623 }
1624 
mergeFlatStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1625 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1626     CombineInfo &CI, CombineInfo &Paired,
1627     MachineBasicBlock::iterator InsertBefore) {
1628   MachineBasicBlock *MBB = CI.I->getParent();
1629   DebugLoc DL = CI.I->getDebugLoc();
1630 
1631   const unsigned Opcode = getNewOpcode(CI, Paired);
1632 
1633   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1634   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1635   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1636 
1637   // Copy to the new source register.
1638   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1639   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1640 
1641   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1642   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1643 
1644   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1645       .add(*Src0)
1646       .addImm(SubRegIdx0)
1647       .add(*Src1)
1648       .addImm(SubRegIdx1);
1649 
1650   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1651                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1652                  .addReg(SrcReg, RegState::Kill);
1653 
1654   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1655     MIB.add(*SAddr);
1656 
1657   MachineInstr *New =
1658     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1659        .addImm(CI.CPol)
1660        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1661 
1662   CI.I->eraseFromParent();
1663   Paired.I->eraseFromParent();
1664   return New;
1665 }
1666 
getNewOpcode(const CombineInfo & CI,const CombineInfo & Paired)1667 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1668                                             const CombineInfo &Paired) {
1669   const unsigned Width = CI.Width + Paired.Width;
1670 
1671   switch (getCommonInstClass(CI, Paired)) {
1672   default:
1673     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1674     // FIXME: Handle d16 correctly
1675     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1676                                   Width);
1677   case TBUFFER_LOAD:
1678   case TBUFFER_STORE:
1679     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1680                                   Width);
1681 
1682   case UNKNOWN:
1683     llvm_unreachable("Unknown instruction class");
1684   case S_BUFFER_LOAD_IMM:
1685     switch (Width) {
1686     default:
1687       return 0;
1688     case 2:
1689       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1690     case 4:
1691       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1692     case 8:
1693       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1694     }
1695   case S_BUFFER_LOAD_SGPR_IMM:
1696     switch (Width) {
1697     default:
1698       return 0;
1699     case 2:
1700       return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR
1701                             : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1702     case 4:
1703       return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR
1704                             : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1705     case 8:
1706       return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR
1707                             : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1708     }
1709   case S_LOAD_IMM:
1710     switch (Width) {
1711     default:
1712       return 0;
1713     case 2:
1714       return AMDGPU::S_LOAD_DWORDX2_IMM;
1715     case 4:
1716       return AMDGPU::S_LOAD_DWORDX4_IMM;
1717     case 8:
1718       return AMDGPU::S_LOAD_DWORDX8_IMM;
1719     }
1720   case GLOBAL_LOAD:
1721     switch (Width) {
1722     default:
1723       return 0;
1724     case 2:
1725       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1726     case 3:
1727       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1728     case 4:
1729       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1730     }
1731   case GLOBAL_LOAD_SADDR:
1732     switch (Width) {
1733     default:
1734       return 0;
1735     case 2:
1736       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1737     case 3:
1738       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1739     case 4:
1740       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1741     }
1742   case GLOBAL_STORE:
1743     switch (Width) {
1744     default:
1745       return 0;
1746     case 2:
1747       return AMDGPU::GLOBAL_STORE_DWORDX2;
1748     case 3:
1749       return AMDGPU::GLOBAL_STORE_DWORDX3;
1750     case 4:
1751       return AMDGPU::GLOBAL_STORE_DWORDX4;
1752     }
1753   case GLOBAL_STORE_SADDR:
1754     switch (Width) {
1755     default:
1756       return 0;
1757     case 2:
1758       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1759     case 3:
1760       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1761     case 4:
1762       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1763     }
1764   case FLAT_LOAD:
1765     switch (Width) {
1766     default:
1767       return 0;
1768     case 2:
1769       return AMDGPU::FLAT_LOAD_DWORDX2;
1770     case 3:
1771       return AMDGPU::FLAT_LOAD_DWORDX3;
1772     case 4:
1773       return AMDGPU::FLAT_LOAD_DWORDX4;
1774     }
1775   case FLAT_STORE:
1776     switch (Width) {
1777     default:
1778       return 0;
1779     case 2:
1780       return AMDGPU::FLAT_STORE_DWORDX2;
1781     case 3:
1782       return AMDGPU::FLAT_STORE_DWORDX3;
1783     case 4:
1784       return AMDGPU::FLAT_STORE_DWORDX4;
1785     }
1786   case MIMG:
1787     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1788            "No overlaps");
1789     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1790   }
1791 }
1792 
1793 std::pair<unsigned, unsigned>
getSubRegIdxs(const CombineInfo & CI,const CombineInfo & Paired)1794 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1795                                     const CombineInfo &Paired) {
1796   assert((CI.InstClass != MIMG ||
1797           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1798            CI.Width + Paired.Width)) &&
1799          "No overlaps");
1800 
1801   unsigned Idx0;
1802   unsigned Idx1;
1803 
1804   static const unsigned Idxs[5][4] = {
1805       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1806       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1807       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1808       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1809       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1810   };
1811 
1812   assert(CI.Width >= 1 && CI.Width <= 4);
1813   assert(Paired.Width >= 1 && Paired.Width <= 4);
1814 
1815   if (Paired < CI) {
1816     Idx1 = Idxs[0][Paired.Width - 1];
1817     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1818   } else {
1819     Idx0 = Idxs[0][CI.Width - 1];
1820     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1821   }
1822 
1823   return std::pair(Idx0, Idx1);
1824 }
1825 
1826 const TargetRegisterClass *
getTargetRegisterClass(const CombineInfo & CI,const CombineInfo & Paired)1827 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1828                                              const CombineInfo &Paired) {
1829   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1830       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1831     switch (CI.Width + Paired.Width) {
1832     default:
1833       return nullptr;
1834     case 2:
1835       return &AMDGPU::SReg_64_XEXECRegClass;
1836     case 4:
1837       return &AMDGPU::SGPR_128RegClass;
1838     case 8:
1839       return &AMDGPU::SGPR_256RegClass;
1840     case 16:
1841       return &AMDGPU::SGPR_512RegClass;
1842     }
1843   }
1844 
1845   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1846   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1847              ? TRI->getAGPRClassForBitWidth(BitWidth)
1848              : TRI->getVGPRClassForBitWidth(BitWidth);
1849 }
1850 
mergeBufferStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1851 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1852     CombineInfo &CI, CombineInfo &Paired,
1853     MachineBasicBlock::iterator InsertBefore) {
1854   MachineBasicBlock *MBB = CI.I->getParent();
1855   DebugLoc DL = CI.I->getDebugLoc();
1856 
1857   const unsigned Opcode = getNewOpcode(CI, Paired);
1858 
1859   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1860   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1861   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1862 
1863   // Copy to the new source register.
1864   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1865   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1866 
1867   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1868   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1869 
1870   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1871       .add(*Src0)
1872       .addImm(SubRegIdx0)
1873       .add(*Src1)
1874       .addImm(SubRegIdx1);
1875 
1876   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1877                  .addReg(SrcReg, RegState::Kill);
1878 
1879   AddressRegs Regs = getRegs(Opcode, *TII);
1880 
1881   if (Regs.VAddr)
1882     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1883 
1884 
1885   // It shouldn't be possible to get this far if the two instructions
1886   // don't have a single memoperand, because MachineInstr::mayAlias()
1887   // will return true if this is the case.
1888   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1889 
1890   MachineInstr *New =
1891     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1892         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1893         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1894         .addImm(CI.CPol)      // cpol
1895         .addImm(0)            // swz
1896         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1897 
1898   CI.I->eraseFromParent();
1899   Paired.I->eraseFromParent();
1900   return New;
1901 }
1902 
1903 MachineOperand
createRegOrImm(int32_t Val,MachineInstr & MI) const1904 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1905   APInt V(32, Val, true);
1906   if (TII->isInlineConstant(V))
1907     return MachineOperand::CreateImm(Val);
1908 
1909   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1910   MachineInstr *Mov =
1911   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1912           TII->get(AMDGPU::S_MOV_B32), Reg)
1913     .addImm(Val);
1914   (void)Mov;
1915   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1916   return MachineOperand::CreateReg(Reg, false);
1917 }
1918 
1919 // Compute base address using Addr and return the final register.
computeBase(MachineInstr & MI,const MemAddress & Addr) const1920 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1921                                            const MemAddress &Addr) const {
1922   MachineBasicBlock *MBB = MI.getParent();
1923   MachineBasicBlock::iterator MBBI = MI.getIterator();
1924   DebugLoc DL = MI.getDebugLoc();
1925 
1926   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1927           Addr.Base.LoSubReg) &&
1928          "Expected 32-bit Base-Register-Low!!");
1929 
1930   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1931           Addr.Base.HiSubReg) &&
1932          "Expected 32-bit Base-Register-Hi!!");
1933 
1934   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1935   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1936   MachineOperand OffsetHi =
1937     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1938 
1939   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1940   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1941   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1942 
1943   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1944   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1945   MachineInstr *LoHalf =
1946     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1947       .addReg(CarryReg, RegState::Define)
1948       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1949       .add(OffsetLo)
1950       .addImm(0); // clamp bit
1951   (void)LoHalf;
1952   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1953 
1954   MachineInstr *HiHalf =
1955   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1956     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1957     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1958     .add(OffsetHi)
1959     .addReg(CarryReg, RegState::Kill)
1960     .addImm(0); // clamp bit
1961   (void)HiHalf;
1962   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1963 
1964   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1965   MachineInstr *FullBase =
1966     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1967       .addReg(DestSub0)
1968       .addImm(AMDGPU::sub0)
1969       .addReg(DestSub1)
1970       .addImm(AMDGPU::sub1);
1971   (void)FullBase;
1972   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1973 
1974   return FullDestReg;
1975 }
1976 
1977 // Update base and offset with the NewBase and NewOffset in MI.
updateBaseAndOffset(MachineInstr & MI,Register NewBase,int32_t NewOffset) const1978 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1979                                                Register NewBase,
1980                                                int32_t NewOffset) const {
1981   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1982   Base->setReg(NewBase);
1983   Base->setIsKill(false);
1984   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1985 }
1986 
1987 std::optional<int32_t>
extractConstOffset(const MachineOperand & Op) const1988 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1989   if (Op.isImm())
1990     return Op.getImm();
1991 
1992   if (!Op.isReg())
1993     return std::nullopt;
1994 
1995   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1996   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1997       !Def->getOperand(1).isImm())
1998     return std::nullopt;
1999 
2000   return Def->getOperand(1).getImm();
2001 }
2002 
2003 // Analyze Base and extracts:
2004 //  - 32bit base registers, subregisters
2005 //  - 64bit constant offset
2006 // Expecting base computation as:
2007 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
2008 //   %LO:vgpr_32, %c:sreg_64_xexec =
2009 //       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2010 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2011 //   %Base:vreg_64 =
2012 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
processBaseWithConstOffset(const MachineOperand & Base,MemAddress & Addr) const2013 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2014                                                       MemAddress &Addr) const {
2015   if (!Base.isReg())
2016     return;
2017 
2018   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2019   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2020       || Def->getNumOperands() != 5)
2021     return;
2022 
2023   MachineOperand BaseLo = Def->getOperand(1);
2024   MachineOperand BaseHi = Def->getOperand(3);
2025   if (!BaseLo.isReg() || !BaseHi.isReg())
2026     return;
2027 
2028   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2029   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2030 
2031   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2032       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2033     return;
2034 
2035   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2036   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2037 
2038   auto Offset0P = extractConstOffset(*Src0);
2039   if (Offset0P)
2040     BaseLo = *Src1;
2041   else {
2042     if (!(Offset0P = extractConstOffset(*Src1)))
2043       return;
2044     BaseLo = *Src0;
2045   }
2046 
2047   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2048   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2049 
2050   if (Src0->isImm())
2051     std::swap(Src0, Src1);
2052 
2053   if (!Src1->isImm())
2054     return;
2055 
2056   uint64_t Offset1 = Src1->getImm();
2057   BaseHi = *Src0;
2058 
2059   Addr.Base.LoReg = BaseLo.getReg();
2060   Addr.Base.HiReg = BaseHi.getReg();
2061   Addr.Base.LoSubReg = BaseLo.getSubReg();
2062   Addr.Base.HiSubReg = BaseHi.getSubReg();
2063   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2064 }
2065 
promoteConstantOffsetToImm(MachineInstr & MI,MemInfoMap & Visited,SmallPtrSet<MachineInstr *,4> & AnchorList) const2066 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2067     MachineInstr &MI,
2068     MemInfoMap &Visited,
2069     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2070 
2071   if (!(MI.mayLoad() ^ MI.mayStore()))
2072     return false;
2073 
2074   // TODO: Support flat and scratch.
2075   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2076     return false;
2077 
2078   if (MI.mayLoad() &&
2079       TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2080     return false;
2081 
2082   if (AnchorList.count(&MI))
2083     return false;
2084 
2085   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2086 
2087   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2088     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2089     return false;
2090   }
2091 
2092   // Step1: Find the base-registers and a 64bit constant offset.
2093   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2094   MemAddress MAddr;
2095   if (Visited.find(&MI) == Visited.end()) {
2096     processBaseWithConstOffset(Base, MAddr);
2097     Visited[&MI] = MAddr;
2098   } else
2099     MAddr = Visited[&MI];
2100 
2101   if (MAddr.Offset == 0) {
2102     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2103                          " constant offsets that can be promoted.\n";);
2104     return false;
2105   }
2106 
2107   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2108              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2109 
2110   // Step2: Traverse through MI's basic block and find an anchor(that has the
2111   // same base-registers) with the highest 13bit distance from MI's offset.
2112   // E.g. (64bit loads)
2113   // bb:
2114   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2115   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2116   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2117   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2118   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2119   //
2120   // Starting from the first load, the optimization will try to find a new base
2121   // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
2122   // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
2123   // as the new-base(anchor) because of the maximum distance which can
2124   // accommodate more intermediate bases presumably.
2125   //
2126   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2127   // (&a + 8192) for load1, load2, load4.
2128   //   addr = &a + 8192
2129   //   load1 = load(addr,       -4096)
2130   //   load2 = load(addr,       -2048)
2131   //   load3 = load(addr,       0)
2132   //   load4 = load(addr,       2048)
2133   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2134   //
2135   MachineInstr *AnchorInst = nullptr;
2136   MemAddress AnchorAddr;
2137   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2138   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2139 
2140   MachineBasicBlock *MBB = MI.getParent();
2141   MachineBasicBlock::iterator E = MBB->end();
2142   MachineBasicBlock::iterator MBBI = MI.getIterator();
2143   ++MBBI;
2144   const SITargetLowering *TLI =
2145     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2146 
2147   for ( ; MBBI != E; ++MBBI) {
2148     MachineInstr &MINext = *MBBI;
2149     // TODO: Support finding an anchor(with same base) from store addresses or
2150     // any other load addresses where the opcodes are different.
2151     if (MINext.getOpcode() != MI.getOpcode() ||
2152         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2153       continue;
2154 
2155     const MachineOperand &BaseNext =
2156       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2157     MemAddress MAddrNext;
2158     if (Visited.find(&MINext) == Visited.end()) {
2159       processBaseWithConstOffset(BaseNext, MAddrNext);
2160       Visited[&MINext] = MAddrNext;
2161     } else
2162       MAddrNext = Visited[&MINext];
2163 
2164     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2165         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2166         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2167         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2168       continue;
2169 
2170     InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2171 
2172     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2173     TargetLoweringBase::AddrMode AM;
2174     AM.HasBaseReg = true;
2175     AM.BaseOffs = Dist;
2176     if (TLI->isLegalGlobalAddressingMode(AM) &&
2177         (uint32_t)std::abs(Dist) > MaxDist) {
2178       MaxDist = std::abs(Dist);
2179 
2180       AnchorAddr = MAddrNext;
2181       AnchorInst = &MINext;
2182     }
2183   }
2184 
2185   if (AnchorInst) {
2186     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2187                AnchorInst->dump());
2188     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2189                <<  AnchorAddr.Offset << "\n\n");
2190 
2191     // Instead of moving up, just re-compute anchor-instruction's base address.
2192     Register Base = computeBase(MI, AnchorAddr);
2193 
2194     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2195     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2196 
2197     for (auto P : InstsWCommonBase) {
2198       TargetLoweringBase::AddrMode AM;
2199       AM.HasBaseReg = true;
2200       AM.BaseOffs = P.second - AnchorAddr.Offset;
2201 
2202       if (TLI->isLegalGlobalAddressingMode(AM)) {
2203         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
2204                    dbgs() << ")"; P.first->dump());
2205         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2206         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
2207       }
2208     }
2209     AnchorList.insert(AnchorInst);
2210     return true;
2211   }
2212 
2213   return false;
2214 }
2215 
addInstToMergeableList(const CombineInfo & CI,std::list<std::list<CombineInfo>> & MergeableInsts) const2216 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2217                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2218   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2219     if (AddrList.front().InstClass == CI.InstClass &&
2220         AddrList.front().IsAGPR == CI.IsAGPR &&
2221         AddrList.front().hasSameBaseAddress(CI)) {
2222       AddrList.emplace_back(CI);
2223       return;
2224     }
2225   }
2226 
2227   // Base address not found, so add a new list.
2228   MergeableInsts.emplace_back(1, CI);
2229 }
2230 
2231 std::pair<MachineBasicBlock::iterator, bool>
collectMergeableInsts(MachineBasicBlock::iterator Begin,MachineBasicBlock::iterator End,MemInfoMap & Visited,SmallPtrSet<MachineInstr *,4> & AnchorList,std::list<std::list<CombineInfo>> & MergeableInsts) const2232 SILoadStoreOptimizer::collectMergeableInsts(
2233     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2234     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2235     std::list<std::list<CombineInfo>> &MergeableInsts) const {
2236   bool Modified = false;
2237 
2238   // Sort potential mergeable instructions into lists.  One list per base address.
2239   unsigned Order = 0;
2240   MachineBasicBlock::iterator BlockI = Begin;
2241   for (; BlockI != End; ++BlockI) {
2242     MachineInstr &MI = *BlockI;
2243 
2244     // We run this before checking if an address is mergeable, because it can produce
2245     // better code even if the instructions aren't mergeable.
2246     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2247       Modified = true;
2248 
2249     // Treat volatile accesses, ordered accesses and unmodeled side effects as
2250     // barriers. We can look after this barrier for separate merges.
2251     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2252       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2253 
2254       // Search will resume after this instruction in a separate merge list.
2255       ++BlockI;
2256       break;
2257     }
2258 
2259     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2260     if (InstClass == UNKNOWN)
2261       continue;
2262 
2263     // Do not merge VMEM buffer instructions with "swizzled" bit set.
2264     int Swizzled =
2265         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2266     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2267       continue;
2268 
2269     CombineInfo CI;
2270     CI.setMI(MI, *this);
2271     CI.Order = Order++;
2272 
2273     if (!CI.hasMergeableAddress(*MRI))
2274       continue;
2275 
2276     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2277       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2278       //        operands. However we are reporting that ds_write2 shall have
2279       //        only VGPR data so that machine copy propagation does not
2280       //        create an illegal instruction with a VGPR and AGPR sources.
2281       //        Consequenctially if we create such instruction the verifier
2282       //        will complain.
2283       continue;
2284     }
2285 
2286     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2287 
2288     addInstToMergeableList(CI, MergeableInsts);
2289   }
2290 
2291   // At this point we have lists of Mergeable instructions.
2292   //
2293   // Part 2: Sort lists by offset and then for each CombineInfo object in the
2294   // list try to find an instruction that can be merged with I.  If an instruction
2295   // is found, it is stored in the Paired field.  If no instructions are found, then
2296   // the CombineInfo object is deleted from the list.
2297 
2298   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2299                                                    E = MergeableInsts.end(); I != E;) {
2300 
2301     std::list<CombineInfo> &MergeList = *I;
2302     if (MergeList.size() <= 1) {
2303       // This means we have found only one instruction with a given address
2304       // that can be merged, and we need at least 2 instructions to do a merge,
2305       // so this list can be discarded.
2306       I = MergeableInsts.erase(I);
2307       continue;
2308     }
2309 
2310     // Sort the lists by offsets, this way mergeable instructions will be
2311     // adjacent to each other in the list, which will make it easier to find
2312     // matches.
2313     MergeList.sort(
2314         [] (const CombineInfo &A, const CombineInfo &B) {
2315           return A.Offset < B.Offset;
2316         });
2317     ++I;
2318   }
2319 
2320   return std::pair(BlockI, Modified);
2321 }
2322 
2323 // Scan through looking for adjacent LDS operations with constant offsets from
2324 // the same base register. We rely on the scheduler to do the hard work of
2325 // clustering nearby loads, and assume these are all adjacent.
optimizeBlock(std::list<std::list<CombineInfo>> & MergeableInsts)2326 bool SILoadStoreOptimizer::optimizeBlock(
2327                        std::list<std::list<CombineInfo> > &MergeableInsts) {
2328   bool Modified = false;
2329 
2330   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2331                                                    E = MergeableInsts.end(); I != E;) {
2332     std::list<CombineInfo> &MergeList = *I;
2333 
2334     bool OptimizeListAgain = false;
2335     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2336       // We weren't able to make any changes, so delete the list so we don't
2337       // process the same instructions the next time we try to optimize this
2338       // block.
2339       I = MergeableInsts.erase(I);
2340       continue;
2341     }
2342 
2343     Modified = true;
2344 
2345     // We made changes, but also determined that there were no more optimization
2346     // opportunities, so we don't need to reprocess the list
2347     if (!OptimizeListAgain) {
2348       I = MergeableInsts.erase(I);
2349       continue;
2350     }
2351     OptimizeAgain = true;
2352   }
2353   return Modified;
2354 }
2355 
2356 bool
optimizeInstsWithSameBaseAddr(std::list<CombineInfo> & MergeList,bool & OptimizeListAgain)2357 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2358                                           std::list<CombineInfo> &MergeList,
2359                                           bool &OptimizeListAgain) {
2360   if (MergeList.empty())
2361     return false;
2362 
2363   bool Modified = false;
2364 
2365   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2366        Next = std::next(I)) {
2367 
2368     auto First = I;
2369     auto Second = Next;
2370 
2371     if ((*First).Order > (*Second).Order)
2372       std::swap(First, Second);
2373     CombineInfo &CI = *First;
2374     CombineInfo &Paired = *Second;
2375 
2376     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2377     if (!Where) {
2378       ++I;
2379       continue;
2380     }
2381 
2382     Modified = true;
2383 
2384     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
2385 
2386     MachineBasicBlock::iterator NewMI;
2387     switch (CI.InstClass) {
2388     default:
2389       llvm_unreachable("unknown InstClass");
2390       break;
2391     case DS_READ:
2392       NewMI = mergeRead2Pair(CI, Paired, Where->I);
2393       break;
2394     case DS_WRITE:
2395       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2396       break;
2397     case S_BUFFER_LOAD_IMM:
2398     case S_BUFFER_LOAD_SGPR_IMM:
2399     case S_LOAD_IMM:
2400       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2401       OptimizeListAgain |= CI.Width + Paired.Width < 8;
2402       break;
2403     case BUFFER_LOAD:
2404       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2405       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2406       break;
2407     case BUFFER_STORE:
2408       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2409       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2410       break;
2411     case MIMG:
2412       NewMI = mergeImagePair(CI, Paired, Where->I);
2413       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2414       break;
2415     case TBUFFER_LOAD:
2416       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2417       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2418       break;
2419     case TBUFFER_STORE:
2420       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2421       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2422       break;
2423     case FLAT_LOAD:
2424     case GLOBAL_LOAD:
2425     case GLOBAL_LOAD_SADDR:
2426       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2427       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2428       break;
2429     case FLAT_STORE:
2430     case GLOBAL_STORE:
2431     case GLOBAL_STORE_SADDR:
2432       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2433       OptimizeListAgain |= CI.Width + Paired.Width < 4;
2434       break;
2435     }
2436     CI.setMI(NewMI, *this);
2437     CI.Order = Where->Order;
2438     if (I == Second)
2439       I = Next;
2440 
2441     MergeList.erase(Second);
2442   }
2443 
2444   return Modified;
2445 }
2446 
runOnMachineFunction(MachineFunction & MF)2447 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2448   if (skipFunction(MF.getFunction()))
2449     return false;
2450 
2451   STM = &MF.getSubtarget<GCNSubtarget>();
2452   if (!STM->loadStoreOptEnabled())
2453     return false;
2454 
2455   TII = STM->getInstrInfo();
2456   TRI = &TII->getRegisterInfo();
2457 
2458   MRI = &MF.getRegInfo();
2459   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2460 
2461   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2462 
2463   bool Modified = false;
2464 
2465   // Contains the list of instructions for which constant offsets are being
2466   // promoted to the IMM. This is tracked for an entire block at time.
2467   SmallPtrSet<MachineInstr *, 4> AnchorList;
2468   MemInfoMap Visited;
2469 
2470   for (MachineBasicBlock &MBB : MF) {
2471     MachineBasicBlock::iterator SectionEnd;
2472     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2473          I = SectionEnd) {
2474       bool CollectModified;
2475       std::list<std::list<CombineInfo>> MergeableInsts;
2476 
2477       // First pass: Collect list of all instructions we know how to merge in a
2478       // subset of the block.
2479       std::tie(SectionEnd, CollectModified) =
2480           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2481 
2482       Modified |= CollectModified;
2483 
2484       do {
2485         OptimizeAgain = false;
2486         Modified |= optimizeBlock(MergeableInsts);
2487       } while (OptimizeAgain);
2488     }
2489 
2490     Visited.clear();
2491     AnchorList.clear();
2492   }
2493 
2494   return Modified;
2495 }
2496