1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with close by immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows it to have a 13bit constant offset and then promotes the 13bit offset
25 // to the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 // the constant into the data register is placed between the stores, although
47 // this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 // one pair, and recomputes live intervals and moves on to the next pair. It
51 // would be better to compute a list of all merges that need to occur.
52 //
// - With a list of instructions to process, we can also merge more. If a
// cluster of loads have offsets that are too large to fit in the 8-bit
// offset fields, but are close enough to each other that their differences
// fit in 8 bits, we can add to the base pointer and use the new reduced
// offsets.
57 //
58 //===----------------------------------------------------------------------===//
59
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66
67 using namespace llvm;
68
69 #define DEBUG_TYPE "si-load-store-opt"
70
71 namespace {
/// Instruction classes distinguished by this pass. getInstClass() maps an
/// opcode to one of these values; UNKNOWN is returned for opcodes that are
/// not handled.
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,

  // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of any
  // CombineInfo, they are only ever returned by getCommonInstClass.
  GLOBAL_LOAD,
  GLOBAL_STORE
};
92
/// Records which address operands an opcode has. getRegs() fills this in and
/// CombineInfo::setMI() turns every set entry into an operand index.
/// Everything defaults to "absent".
struct AddressRegs {
  /// Number of consecutive operands starting at vaddr0.
  unsigned char NumVAddrs{0};
  /// Has an sbase operand.
  bool SBase{false};
  /// Has an srsrc operand.
  bool SRsrc{false};
  /// Has an soffset operand.
  bool SOffset{false};
  /// Has an saddr operand.
  bool SAddr{false};
  /// Has a vaddr operand.
  bool VAddr{false};
  /// Has an addr operand.
  bool Addr{false};
  /// Has an ssamp operand.
  bool SSamp{false};
};
103
// Upper bound on the number of address operands tracked per instruction.
// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 /* vaddrs */ + 1 /* srsrc */ + 1 /* ssamp */;
106
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108 struct CombineInfo {
109 MachineBasicBlock::iterator I;
110 unsigned EltSize;
111 unsigned Offset;
112 unsigned Width;
113 unsigned Format;
114 unsigned BaseOff;
115 unsigned DMask;
116 InstClassEnum InstClass;
117 unsigned CPol = 0;
118 bool IsAGPR;
119 bool UseST64;
120 int AddrIdx[MaxAddressRegs];
121 const MachineOperand *AddrReg[MaxAddressRegs];
122 unsigned NumAddresses;
123 unsigned Order;
124
hasSameBaseAddress__anon7587433f0111::SILoadStoreOptimizer::CombineInfo125 bool hasSameBaseAddress(const CombineInfo &CI) {
126 if (NumAddresses != CI.NumAddresses)
127 return false;
128
129 const MachineInstr &MI = *CI.I;
130 for (unsigned i = 0; i < NumAddresses; i++) {
131 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132
133 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136 return false;
137 }
138 continue;
139 }
140
141 // Check same base pointer. Be careful of subregisters, which can occur
142 // with vectors of pointers.
143 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145 return false;
146 }
147 }
148 return true;
149 }
150
hasMergeableAddress__anon7587433f0111::SILoadStoreOptimizer::CombineInfo151 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152 for (unsigned i = 0; i < NumAddresses; ++i) {
153 const MachineOperand *AddrOp = AddrReg[i];
154 // Immediates are always OK.
155 if (AddrOp->isImm())
156 continue;
157
158 // Don't try to merge addresses that aren't either immediates or registers.
159 // TODO: Should be possible to merge FrameIndexes and maybe some other
160 // non-register
161 if (!AddrOp->isReg())
162 return false;
163
164 // TODO: We should be able to merge physical reg addresses.
165 if (AddrOp->getReg().isPhysical())
166 return false;
167
168 // If an address has only one use then there will be no other
169 // instructions with the same address, so we can't merge this one.
170 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
171 return false;
172 }
173 return true;
174 }
175
176 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
177
178 // Compare by pointer order.
operator <__anon7587433f0111::SILoadStoreOptimizer::CombineInfo179 bool operator<(const CombineInfo& Other) const {
180 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
181 }
182 };
183
184 struct BaseRegisters {
185 Register LoReg;
186 Register HiReg;
187
188 unsigned LoSubReg = 0;
189 unsigned HiSubReg = 0;
190 };
191
192 struct MemAddress {
193 BaseRegisters Base;
194 int64_t Offset = 0;
195 };
196
197 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
198
199 private:
200 const GCNSubtarget *STM = nullptr;
201 const SIInstrInfo *TII = nullptr;
202 const SIRegisterInfo *TRI = nullptr;
203 MachineRegisterInfo *MRI = nullptr;
204 AliasAnalysis *AA = nullptr;
205 bool OptimizeAgain;
206
207 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
208 const DenseSet<Register> &ARegUses,
209 const MachineInstr &A, const MachineInstr &B) const;
210 static bool dmasksCanBeCombined(const CombineInfo &CI,
211 const SIInstrInfo &TII,
212 const CombineInfo &Paired);
213 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
214 CombineInfo &Paired, bool Modify = false);
215 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
216 const CombineInfo &Paired);
217 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
218 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
219 const CombineInfo &Paired);
220 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
221 const CombineInfo &Paired);
222 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
223
224 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
225
226 unsigned read2Opcode(unsigned EltSize) const;
227 unsigned read2ST64Opcode(unsigned EltSize) const;
228 MachineBasicBlock::iterator
229 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
230 MachineBasicBlock::iterator InsertBefore);
231
232 unsigned write2Opcode(unsigned EltSize) const;
233 unsigned write2ST64Opcode(unsigned EltSize) const;
234 MachineBasicBlock::iterator
235 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
236 MachineBasicBlock::iterator InsertBefore);
237 MachineBasicBlock::iterator
238 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
239 MachineBasicBlock::iterator InsertBefore);
240 MachineBasicBlock::iterator
241 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
242 MachineBasicBlock::iterator InsertBefore);
243 MachineBasicBlock::iterator
244 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
245 MachineBasicBlock::iterator InsertBefore);
246 MachineBasicBlock::iterator
247 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
248 MachineBasicBlock::iterator InsertBefore);
249 MachineBasicBlock::iterator
250 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
251 MachineBasicBlock::iterator InsertBefore);
252 MachineBasicBlock::iterator
253 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
254 MachineBasicBlock::iterator InsertBefore);
255 MachineBasicBlock::iterator
256 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
257 MachineBasicBlock::iterator InsertBefore);
258 MachineBasicBlock::iterator
259 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
260 MachineBasicBlock::iterator InsertBefore);
261
262 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
263 int32_t NewOffset) const;
264 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
265 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
266 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
267 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
268 /// Promotes constant offset to the immediate by adjusting the base. It
269 /// tries to use a base from the nearby instructions that allows it to have
270 /// a 13bit constant offset which gets promoted to the immediate.
271 bool promoteConstantOffsetToImm(MachineInstr &CI,
272 MemInfoMap &Visited,
273 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
274 void addInstToMergeableList(const CombineInfo &CI,
275 std::list<std::list<CombineInfo> > &MergeableInsts) const;
276
277 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
278 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
279 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
280 std::list<std::list<CombineInfo>> &MergeableInsts) const;
281
282 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
283 const CombineInfo &Paired);
284
285 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
286 const CombineInfo &Paired);
287
288 public:
289 static char ID;
290
SILoadStoreOptimizer()291 SILoadStoreOptimizer() : MachineFunctionPass(ID) {
292 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
293 }
294
295 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
296 bool &OptimizeListAgain);
297 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
298
299 bool runOnMachineFunction(MachineFunction &MF) override;
300
getPassName() const301 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
302
getAnalysisUsage(AnalysisUsage & AU) const303 void getAnalysisUsage(AnalysisUsage &AU) const override {
304 AU.setPreservesCFG();
305 AU.addRequired<AAResultsWrapperPass>();
306
307 MachineFunctionPass::getAnalysisUsage(AU);
308 }
309
getRequiredProperties() const310 MachineFunctionProperties getRequiredProperties() const override {
311 return MachineFunctionProperties()
312 .set(MachineFunctionProperties::Property::IsSSA);
313 }
314 };
315
getOpcodeWidth(const MachineInstr & MI,const SIInstrInfo & TII)316 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
317 const unsigned Opc = MI.getOpcode();
318
319 if (TII.isMUBUF(Opc)) {
320 // FIXME: Handle d16 correctly
321 return AMDGPU::getMUBUFElements(Opc);
322 }
323 if (TII.isMIMG(MI)) {
324 uint64_t DMaskImm =
325 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
326 return llvm::popcount(DMaskImm);
327 }
328 if (TII.isMTBUF(Opc)) {
329 return AMDGPU::getMTBUFElements(Opc);
330 }
331
332 switch (Opc) {
333 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
334 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
335 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
336 case AMDGPU::S_LOAD_DWORD_IMM:
337 case AMDGPU::GLOBAL_LOAD_DWORD:
338 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
339 case AMDGPU::GLOBAL_STORE_DWORD:
340 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
341 case AMDGPU::FLAT_LOAD_DWORD:
342 case AMDGPU::FLAT_STORE_DWORD:
343 return 1;
344 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
345 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
346 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347 case AMDGPU::S_LOAD_DWORDX2_IMM:
348 case AMDGPU::GLOBAL_LOAD_DWORDX2:
349 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
350 case AMDGPU::GLOBAL_STORE_DWORDX2:
351 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
352 case AMDGPU::FLAT_LOAD_DWORDX2:
353 case AMDGPU::FLAT_STORE_DWORDX2:
354 return 2;
355 case AMDGPU::GLOBAL_LOAD_DWORDX3:
356 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
357 case AMDGPU::GLOBAL_STORE_DWORDX3:
358 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
359 case AMDGPU::FLAT_LOAD_DWORDX3:
360 case AMDGPU::FLAT_STORE_DWORDX3:
361 return 3;
362 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
363 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
364 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
365 case AMDGPU::S_LOAD_DWORDX4_IMM:
366 case AMDGPU::GLOBAL_LOAD_DWORDX4:
367 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
368 case AMDGPU::GLOBAL_STORE_DWORDX4:
369 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
370 case AMDGPU::FLAT_LOAD_DWORDX4:
371 case AMDGPU::FLAT_STORE_DWORDX4:
372 return 4;
373 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
374 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
375 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
376 case AMDGPU::S_LOAD_DWORDX8_IMM:
377 return 8;
378 case AMDGPU::DS_READ_B32: [[fallthrough]];
379 case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
380 case AMDGPU::DS_WRITE_B32: [[fallthrough]];
381 case AMDGPU::DS_WRITE_B32_gfx9:
382 return 1;
383 case AMDGPU::DS_READ_B64: [[fallthrough]];
384 case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
385 case AMDGPU::DS_WRITE_B64: [[fallthrough]];
386 case AMDGPU::DS_WRITE_B64_gfx9:
387 return 2;
388 default:
389 return 0;
390 }
391 }
392
393 /// Maps instruction opcode to enum InstClassEnum.
getInstClass(unsigned Opc,const SIInstrInfo & TII)394 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
395 switch (Opc) {
396 default:
397 if (TII.isMUBUF(Opc)) {
398 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
399 default:
400 return UNKNOWN;
401 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
402 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
403 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
404 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
405 return BUFFER_LOAD;
406 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
407 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
408 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
409 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
410 return BUFFER_STORE;
411 }
412 }
413 if (TII.isMIMG(Opc)) {
414 // Ignore instructions encoded without vaddr.
415 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
416 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
417 return UNKNOWN;
418 // Ignore BVH instructions
419 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
420 return UNKNOWN;
421 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
422 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
423 TII.isGather4(Opc))
424 return UNKNOWN;
425 return MIMG;
426 }
427 if (TII.isMTBUF(Opc)) {
428 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
429 default:
430 return UNKNOWN;
431 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
432 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
433 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
434 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
435 return TBUFFER_LOAD;
436 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
437 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
438 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
439 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
440 return TBUFFER_STORE;
441 }
442 }
443 return UNKNOWN;
444 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
445 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
446 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
447 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
448 return S_BUFFER_LOAD_IMM;
449 // For the purposes of this optimization SGPR variants of buffer loads
450 // are considered to be zero-offsetted SGPR_IMM loads.
451 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
452 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
453 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
454 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
455 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
456 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
457 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
458 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
459 return S_BUFFER_LOAD_SGPR_IMM;
460 case AMDGPU::S_LOAD_DWORD_IMM:
461 case AMDGPU::S_LOAD_DWORDX2_IMM:
462 case AMDGPU::S_LOAD_DWORDX4_IMM:
463 case AMDGPU::S_LOAD_DWORDX8_IMM:
464 return S_LOAD_IMM;
465 case AMDGPU::DS_READ_B32:
466 case AMDGPU::DS_READ_B32_gfx9:
467 case AMDGPU::DS_READ_B64:
468 case AMDGPU::DS_READ_B64_gfx9:
469 return DS_READ;
470 case AMDGPU::DS_WRITE_B32:
471 case AMDGPU::DS_WRITE_B32_gfx9:
472 case AMDGPU::DS_WRITE_B64:
473 case AMDGPU::DS_WRITE_B64_gfx9:
474 return DS_WRITE;
475 case AMDGPU::GLOBAL_LOAD_DWORD:
476 case AMDGPU::GLOBAL_LOAD_DWORDX2:
477 case AMDGPU::GLOBAL_LOAD_DWORDX3:
478 case AMDGPU::GLOBAL_LOAD_DWORDX4:
479 case AMDGPU::FLAT_LOAD_DWORD:
480 case AMDGPU::FLAT_LOAD_DWORDX2:
481 case AMDGPU::FLAT_LOAD_DWORDX3:
482 case AMDGPU::FLAT_LOAD_DWORDX4:
483 return FLAT_LOAD;
484 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
485 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
486 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
487 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
488 return GLOBAL_LOAD_SADDR;
489 case AMDGPU::GLOBAL_STORE_DWORD:
490 case AMDGPU::GLOBAL_STORE_DWORDX2:
491 case AMDGPU::GLOBAL_STORE_DWORDX3:
492 case AMDGPU::GLOBAL_STORE_DWORDX4:
493 case AMDGPU::FLAT_STORE_DWORD:
494 case AMDGPU::FLAT_STORE_DWORDX2:
495 case AMDGPU::FLAT_STORE_DWORDX3:
496 case AMDGPU::FLAT_STORE_DWORDX4:
497 return FLAT_STORE;
498 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
499 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
500 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
501 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
502 return GLOBAL_STORE_SADDR;
503 }
504 }
505
506 /// Determines instruction subclass from opcode. Only instructions
507 /// of the same subclass can be merged together. The merged instruction may have
508 /// a different subclass but must have the same class.
getInstSubclass(unsigned Opc,const SIInstrInfo & TII)509 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
510 switch (Opc) {
511 default:
512 if (TII.isMUBUF(Opc))
513 return AMDGPU::getMUBUFBaseOpcode(Opc);
514 if (TII.isMIMG(Opc)) {
515 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
516 assert(Info);
517 return Info->BaseOpcode;
518 }
519 if (TII.isMTBUF(Opc))
520 return AMDGPU::getMTBUFBaseOpcode(Opc);
521 return -1;
522 case AMDGPU::DS_READ_B32:
523 case AMDGPU::DS_READ_B32_gfx9:
524 case AMDGPU::DS_READ_B64:
525 case AMDGPU::DS_READ_B64_gfx9:
526 case AMDGPU::DS_WRITE_B32:
527 case AMDGPU::DS_WRITE_B32_gfx9:
528 case AMDGPU::DS_WRITE_B64:
529 case AMDGPU::DS_WRITE_B64_gfx9:
530 return Opc;
531 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
532 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
533 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
534 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
535 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
536 // For the purposes of this optimization SGPR variants of buffer loads
537 // are considered to be zero-offsetted SGPR_IMM loads.
538 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
539 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
540 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
541 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
542 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
543 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
544 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
545 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
546 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
547 case AMDGPU::S_LOAD_DWORD_IMM:
548 case AMDGPU::S_LOAD_DWORDX2_IMM:
549 case AMDGPU::S_LOAD_DWORDX4_IMM:
550 case AMDGPU::S_LOAD_DWORDX8_IMM:
551 return AMDGPU::S_LOAD_DWORD_IMM;
552 case AMDGPU::GLOBAL_LOAD_DWORD:
553 case AMDGPU::GLOBAL_LOAD_DWORDX2:
554 case AMDGPU::GLOBAL_LOAD_DWORDX3:
555 case AMDGPU::GLOBAL_LOAD_DWORDX4:
556 case AMDGPU::FLAT_LOAD_DWORD:
557 case AMDGPU::FLAT_LOAD_DWORDX2:
558 case AMDGPU::FLAT_LOAD_DWORDX3:
559 case AMDGPU::FLAT_LOAD_DWORDX4:
560 return AMDGPU::FLAT_LOAD_DWORD;
561 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
562 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
563 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
564 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
565 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
566 case AMDGPU::GLOBAL_STORE_DWORD:
567 case AMDGPU::GLOBAL_STORE_DWORDX2:
568 case AMDGPU::GLOBAL_STORE_DWORDX3:
569 case AMDGPU::GLOBAL_STORE_DWORDX4:
570 case AMDGPU::FLAT_STORE_DWORD:
571 case AMDGPU::FLAT_STORE_DWORDX2:
572 case AMDGPU::FLAT_STORE_DWORDX3:
573 case AMDGPU::FLAT_STORE_DWORDX4:
574 return AMDGPU::FLAT_STORE_DWORD;
575 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
576 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
577 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
578 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
579 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
580 }
581 }
582
583 // GLOBAL loads and stores are classified as FLAT initially. If both combined
584 // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
585 // If either or both instructions are non segment specific FLAT the resulting
586 // combined operation will be FLAT, potentially promoting one of the GLOBAL
587 // operations to FLAT.
588 // For other instructions return the original unmodified class.
589 InstClassEnum
getCommonInstClass(const CombineInfo & CI,const CombineInfo & Paired)590 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
591 const CombineInfo &Paired) {
592 assert(CI.InstClass == Paired.InstClass);
593
594 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
595 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
596 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
597
598 return CI.InstClass;
599 }
600
getRegs(unsigned Opc,const SIInstrInfo & TII)601 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
602 AddressRegs Result;
603
604 if (TII.isMUBUF(Opc)) {
605 if (AMDGPU::getMUBUFHasVAddr(Opc))
606 Result.VAddr = true;
607 if (AMDGPU::getMUBUFHasSrsrc(Opc))
608 Result.SRsrc = true;
609 if (AMDGPU::getMUBUFHasSoffset(Opc))
610 Result.SOffset = true;
611
612 return Result;
613 }
614
615 if (TII.isMIMG(Opc)) {
616 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
617 if (VAddr0Idx >= 0) {
618 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
619 Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
620 } else {
621 Result.VAddr = true;
622 }
623 Result.SRsrc = true;
624 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
625 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
626 Result.SSamp = true;
627
628 return Result;
629 }
630 if (TII.isMTBUF(Opc)) {
631 if (AMDGPU::getMTBUFHasVAddr(Opc))
632 Result.VAddr = true;
633 if (AMDGPU::getMTBUFHasSrsrc(Opc))
634 Result.SRsrc = true;
635 if (AMDGPU::getMTBUFHasSoffset(Opc))
636 Result.SOffset = true;
637
638 return Result;
639 }
640
641 switch (Opc) {
642 default:
643 return Result;
644 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
645 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
646 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
647 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
648 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
649 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
650 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
651 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
652 Result.SOffset = true;
653 [[fallthrough]];
654 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
655 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
656 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
657 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
658 case AMDGPU::S_LOAD_DWORD_IMM:
659 case AMDGPU::S_LOAD_DWORDX2_IMM:
660 case AMDGPU::S_LOAD_DWORDX4_IMM:
661 case AMDGPU::S_LOAD_DWORDX8_IMM:
662 Result.SBase = true;
663 return Result;
664 case AMDGPU::DS_READ_B32:
665 case AMDGPU::DS_READ_B64:
666 case AMDGPU::DS_READ_B32_gfx9:
667 case AMDGPU::DS_READ_B64_gfx9:
668 case AMDGPU::DS_WRITE_B32:
669 case AMDGPU::DS_WRITE_B64:
670 case AMDGPU::DS_WRITE_B32_gfx9:
671 case AMDGPU::DS_WRITE_B64_gfx9:
672 Result.Addr = true;
673 return Result;
674 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
675 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
676 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
677 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
678 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
679 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
680 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
681 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
682 Result.SAddr = true;
683 [[fallthrough]];
684 case AMDGPU::GLOBAL_LOAD_DWORD:
685 case AMDGPU::GLOBAL_LOAD_DWORDX2:
686 case AMDGPU::GLOBAL_LOAD_DWORDX3:
687 case AMDGPU::GLOBAL_LOAD_DWORDX4:
688 case AMDGPU::GLOBAL_STORE_DWORD:
689 case AMDGPU::GLOBAL_STORE_DWORDX2:
690 case AMDGPU::GLOBAL_STORE_DWORDX3:
691 case AMDGPU::GLOBAL_STORE_DWORDX4:
692 case AMDGPU::FLAT_LOAD_DWORD:
693 case AMDGPU::FLAT_LOAD_DWORDX2:
694 case AMDGPU::FLAT_LOAD_DWORDX3:
695 case AMDGPU::FLAT_LOAD_DWORDX4:
696 case AMDGPU::FLAT_STORE_DWORD:
697 case AMDGPU::FLAT_STORE_DWORDX2:
698 case AMDGPU::FLAT_STORE_DWORDX3:
699 case AMDGPU::FLAT_STORE_DWORDX4:
700 Result.VAddr = true;
701 return Result;
702 }
703 }
704
setMI(MachineBasicBlock::iterator MI,const SILoadStoreOptimizer & LSO)705 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
706 const SILoadStoreOptimizer &LSO) {
707 I = MI;
708 unsigned Opc = MI->getOpcode();
709 InstClass = getInstClass(Opc, *LSO.TII);
710
711 if (InstClass == UNKNOWN)
712 return;
713
714 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
715
716 switch (InstClass) {
717 case DS_READ:
718 EltSize =
719 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
720 : 4;
721 break;
722 case DS_WRITE:
723 EltSize =
724 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
725 : 4;
726 break;
727 case S_BUFFER_LOAD_IMM:
728 case S_BUFFER_LOAD_SGPR_IMM:
729 case S_LOAD_IMM:
730 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
731 break;
732 default:
733 EltSize = 4;
734 break;
735 }
736
737 if (InstClass == MIMG) {
738 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
739 // Offset is not considered for MIMG instructions.
740 Offset = 0;
741 } else {
742 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
743 Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm();
744 }
745
746 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
747 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
748
749 Width = getOpcodeWidth(*I, *LSO.TII);
750
751 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
752 Offset &= 0xffff;
753 } else if (InstClass != MIMG) {
754 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
755 }
756
757 AddressRegs Regs = getRegs(Opc, *LSO.TII);
758
759 NumAddresses = 0;
760 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
761 AddrIdx[NumAddresses++] =
762 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
763 if (Regs.Addr)
764 AddrIdx[NumAddresses++] =
765 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
766 if (Regs.SBase)
767 AddrIdx[NumAddresses++] =
768 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
769 if (Regs.SRsrc)
770 AddrIdx[NumAddresses++] =
771 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
772 if (Regs.SOffset)
773 AddrIdx[NumAddresses++] =
774 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
775 if (Regs.SAddr)
776 AddrIdx[NumAddresses++] =
777 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
778 if (Regs.VAddr)
779 AddrIdx[NumAddresses++] =
780 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
781 if (Regs.SSamp)
782 AddrIdx[NumAddresses++] =
783 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
784 assert(NumAddresses <= MaxAddressRegs);
785
786 for (unsigned J = 0; J < NumAddresses; J++)
787 AddrReg[J] = &I->getOperand(AddrIdx[J]);
788 }
789
790 } // end anonymous namespace.
791
// Pass initialization boilerplate: defines the initialization of
// SILoadStoreOptimizer under the DEBUG_TYPE name and declares
// AAResultsWrapperPass as a dependency (the pass requires it in
// getAnalysisUsage).
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

// Definition of the static pass ID member declared in the class.
char SILoadStoreOptimizer::ID = 0;

// Reference to the pass ID that is visible in the llvm namespace.
char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
801
createSILoadStoreOptimizerPass()802 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
803 return new SILoadStoreOptimizer();
804 }
805
addDefsUsesToList(const MachineInstr & MI,DenseSet<Register> & RegDefs,DenseSet<Register> & RegUses)806 static void addDefsUsesToList(const MachineInstr &MI,
807 DenseSet<Register> &RegDefs,
808 DenseSet<Register> &RegUses) {
809 for (const auto &Op : MI.operands()) {
810 if (!Op.isReg())
811 continue;
812 if (Op.isDef())
813 RegDefs.insert(Op.getReg());
814 if (Op.readsReg())
815 RegUses.insert(Op.getReg());
816 }
817 }
818
canSwapInstructions(const DenseSet<Register> & ARegDefs,const DenseSet<Register> & ARegUses,const MachineInstr & A,const MachineInstr & B) const819 bool SILoadStoreOptimizer::canSwapInstructions(
820 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
821 const MachineInstr &A, const MachineInstr &B) const {
822 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
823 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
824 return false;
825 for (const auto &BOp : B.operands()) {
826 if (!BOp.isReg())
827 continue;
828 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
829 return false;
830 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
831 return false;
832 }
833 return true;
834 }
835
836 // Given that \p CI and \p Paired are adjacent memory operations produce a new
837 // MMO for the combined operation with a new access size.
838 MachineMemOperand *
combineKnownAdjacentMMOs(const CombineInfo & CI,const CombineInfo & Paired)839 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
840 const CombineInfo &Paired) {
841 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
842 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
843
844 unsigned Size = MMOa->getSize() + MMOb->getSize();
845
846 // A base pointer for the combined operation is the same as the leading
847 // operation's pointer.
848 if (Paired < CI)
849 std::swap(MMOa, MMOb);
850
851 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
852 // If merging FLAT and GLOBAL set address space to FLAT.
853 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
854 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
855
856 MachineFunction *MF = CI.I->getMF();
857 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
858 }
859
dmasksCanBeCombined(const CombineInfo & CI,const SIInstrInfo & TII,const CombineInfo & Paired)860 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
861 const SIInstrInfo &TII,
862 const CombineInfo &Paired) {
863 assert(CI.InstClass == MIMG);
864
865 // Ignore instructions with tfe/lwe set.
866 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
867 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
868
869 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
870 return false;
871
872 // Check other optional immediate operands for equality.
873 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
874 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
875 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
876
877 for (auto op : OperandsToMatch) {
878 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
879 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
880 return false;
881 if (Idx != -1 &&
882 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
883 return false;
884 }
885
886 // Check DMask for overlaps.
887 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
888 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
889
890 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
891 if ((1u << AllowedBitsForMin) <= MinMask)
892 return false;
893
894 return true;
895 }
896
getBufferFormatWithCompCount(unsigned OldFormat,unsigned ComponentCount,const GCNSubtarget & STI)897 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
898 unsigned ComponentCount,
899 const GCNSubtarget &STI) {
900 if (ComponentCount > 4)
901 return 0;
902
903 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
904 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
905 if (!OldFormatInfo)
906 return 0;
907
908 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
909 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
910 ComponentCount,
911 OldFormatInfo->NumFormat, STI);
912
913 if (!NewFormatInfo)
914 return 0;
915
916 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
917 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
918
919 return NewFormatInfo->Format;
920 }
921
922 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
923 // highest power of two. Note that the result is well defined for all inputs
924 // including corner cases like:
925 // - if Lo == Hi, return that value
926 // - if Lo == 0, return 0 (even though the "- 1" below underflows
927 // - if Lo > Hi, return 0 (as if the range wrapped around)
mostAlignedValueInRange(uint32_t Lo,uint32_t Hi)928 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
929 return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
930 }
931
offsetsCanBeCombined(CombineInfo & CI,const GCNSubtarget & STI,CombineInfo & Paired,bool Modify)932 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
933 const GCNSubtarget &STI,
934 CombineInfo &Paired,
935 bool Modify) {
936 assert(CI.InstClass != MIMG);
937
938 // XXX - Would the same offset be OK? Is there any reason this would happen or
939 // be useful?
940 if (CI.Offset == Paired.Offset)
941 return false;
942
943 // This won't be valid if the offset isn't aligned.
944 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
945 return false;
946
947 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
948
949 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
950 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
951 if (!Info0)
952 return false;
953 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
954 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
955 if (!Info1)
956 return false;
957
958 if (Info0->BitsPerComp != Info1->BitsPerComp ||
959 Info0->NumFormat != Info1->NumFormat)
960 return false;
961
962 // TODO: Should be possible to support more formats, but if format loads
963 // are not dword-aligned, the merged load might not be valid.
964 if (Info0->BitsPerComp != 32)
965 return false;
966
967 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
968 return false;
969 }
970
971 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
972 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
973 CI.UseST64 = false;
974 CI.BaseOff = 0;
975
976 // Handle all non-DS instructions.
977 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
978 return (EltOffset0 + CI.Width == EltOffset1 ||
979 EltOffset1 + Paired.Width == EltOffset0) &&
980 CI.CPol == Paired.CPol;
981 }
982
983 // If the offset in elements doesn't fit in 8-bits, we might be able to use
984 // the stride 64 versions.
985 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
986 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
987 if (Modify) {
988 CI.Offset = EltOffset0 / 64;
989 Paired.Offset = EltOffset1 / 64;
990 CI.UseST64 = true;
991 }
992 return true;
993 }
994
995 // Check if the new offsets fit in the reduced 8-bit range.
996 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
997 if (Modify) {
998 CI.Offset = EltOffset0;
999 Paired.Offset = EltOffset1;
1000 }
1001 return true;
1002 }
1003
1004 // Try to shift base address to decrease offsets.
1005 uint32_t Min = std::min(EltOffset0, EltOffset1);
1006 uint32_t Max = std::max(EltOffset0, EltOffset1);
1007
1008 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1009 if (((Max - Min) & ~Mask) == 0) {
1010 if (Modify) {
1011 // From the range of values we could use for BaseOff, choose the one that
1012 // is aligned to the highest power of two, to maximise the chance that
1013 // the same offset can be reused for other load/store pairs.
1014 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1015 // Copy the low bits of the offsets, so that when we adjust them by
1016 // subtracting BaseOff they will be multiples of 64.
1017 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1018 CI.BaseOff = BaseOff * CI.EltSize;
1019 CI.Offset = (EltOffset0 - BaseOff) / 64;
1020 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1021 CI.UseST64 = true;
1022 }
1023 return true;
1024 }
1025
1026 if (isUInt<8>(Max - Min)) {
1027 if (Modify) {
1028 // From the range of values we could use for BaseOff, choose the one that
1029 // is aligned to the highest power of two, to maximise the chance that
1030 // the same offset can be reused for other load/store pairs.
1031 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1032 CI.BaseOff = BaseOff * CI.EltSize;
1033 CI.Offset = EltOffset0 - BaseOff;
1034 Paired.Offset = EltOffset1 - BaseOff;
1035 }
1036 return true;
1037 }
1038
1039 return false;
1040 }
1041
widthsFit(const GCNSubtarget & STM,const CombineInfo & CI,const CombineInfo & Paired)1042 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1043 const CombineInfo &CI,
1044 const CombineInfo &Paired) {
1045 const unsigned Width = (CI.Width + Paired.Width);
1046 switch (CI.InstClass) {
1047 default:
1048 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1049 case S_BUFFER_LOAD_IMM:
1050 case S_BUFFER_LOAD_SGPR_IMM:
1051 case S_LOAD_IMM:
1052 switch (Width) {
1053 default:
1054 return false;
1055 case 2:
1056 case 4:
1057 case 8:
1058 return true;
1059 }
1060 }
1061 }
1062
1063 const TargetRegisterClass *
getDataRegClass(const MachineInstr & MI) const1064 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1065 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1066 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1067 }
1068 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1069 return TRI->getRegClassForReg(*MRI, Src->getReg());
1070 }
1071 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1072 return TRI->getRegClassForReg(*MRI, Src->getReg());
1073 }
1074 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1075 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1076 }
1077 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1078 return TRI->getRegClassForReg(*MRI, Src->getReg());
1079 }
1080 return nullptr;
1081 }
1082
1083 /// This function assumes that CI comes before Paired in a basic block. Return
1084 /// an insertion point for the merged instruction or nullptr on failure.
1085 SILoadStoreOptimizer::CombineInfo *
checkAndPrepareMerge(CombineInfo & CI,CombineInfo & Paired)1086 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1087 CombineInfo &Paired) {
1088 // If another instruction has already been merged into CI, it may now be a
1089 // type that we can't do any further merging into.
1090 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1091 return nullptr;
1092 assert(CI.InstClass == Paired.InstClass);
1093
1094 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1095 getInstSubclass(Paired.I->getOpcode(), *TII))
1096 return nullptr;
1097
1098 // Check both offsets (or masks for MIMG) can be combined and fit in the
1099 // reduced range.
1100 if (CI.InstClass == MIMG) {
1101 if (!dmasksCanBeCombined(CI, *TII, Paired))
1102 return nullptr;
1103 } else {
1104 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1105 return nullptr;
1106 }
1107
1108 DenseSet<Register> RegDefs;
1109 DenseSet<Register> RegUses;
1110 CombineInfo *Where;
1111 if (CI.I->mayLoad()) {
1112 // Try to hoist Paired up to CI.
1113 addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1114 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1115 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1116 return nullptr;
1117 }
1118 Where = &CI;
1119 } else {
1120 // Try to sink CI down to Paired.
1121 addDefsUsesToList(*CI.I, RegDefs, RegUses);
1122 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1123 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1124 return nullptr;
1125 }
1126 Where = &Paired;
1127 }
1128
1129 // Call offsetsCanBeCombined with modify = true so that the offsets are
1130 // correct for the new instruction. This should return true, because
1131 // this function should only be called on CombineInfo objects that
1132 // have already been confirmed to be mergeable.
1133 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1134 offsetsCanBeCombined(CI, *STM, Paired, true);
1135 return Where;
1136 }
1137
read2Opcode(unsigned EltSize) const1138 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1139 if (STM->ldsRequiresM0Init())
1140 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1141 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1142 }
1143
read2ST64Opcode(unsigned EltSize) const1144 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1145 if (STM->ldsRequiresM0Init())
1146 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1147
1148 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1149 : AMDGPU::DS_READ2ST64_B64_gfx9;
1150 }
1151
1152 MachineBasicBlock::iterator
mergeRead2Pair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1153 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1154 MachineBasicBlock::iterator InsertBefore) {
1155 MachineBasicBlock *MBB = CI.I->getParent();
1156
1157 // Be careful, since the addresses could be subregisters themselves in weird
1158 // cases, like vectors of pointers.
1159 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1160
1161 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1162 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1163
1164 unsigned NewOffset0 = CI.Offset;
1165 unsigned NewOffset1 = Paired.Offset;
1166 unsigned Opc =
1167 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1168
1169 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1170 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1171
1172 if (NewOffset0 > NewOffset1) {
1173 // Canonicalize the merged instruction so the smaller offset comes first.
1174 std::swap(NewOffset0, NewOffset1);
1175 std::swap(SubRegIdx0, SubRegIdx1);
1176 }
1177
1178 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1179 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1180
1181 const MCInstrDesc &Read2Desc = TII->get(Opc);
1182
1183 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1184 Register DestReg = MRI->createVirtualRegister(SuperRC);
1185
1186 DebugLoc DL = CI.I->getDebugLoc();
1187
1188 Register BaseReg = AddrReg->getReg();
1189 unsigned BaseSubReg = AddrReg->getSubReg();
1190 unsigned BaseRegFlags = 0;
1191 if (CI.BaseOff) {
1192 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1193 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1194 .addImm(CI.BaseOff);
1195
1196 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1197 BaseRegFlags = RegState::Kill;
1198
1199 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1200 .addReg(ImmReg)
1201 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1202 .addImm(0); // clamp bit
1203 BaseSubReg = 0;
1204 }
1205
1206 MachineInstrBuilder Read2 =
1207 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1208 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1209 .addImm(NewOffset0) // offset0
1210 .addImm(NewOffset1) // offset1
1211 .addImm(0) // gds
1212 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1213
1214 (void)Read2;
1215
1216 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1217
1218 // Copy to the old destination registers.
1219 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1220 .add(*Dest0) // Copy to same destination including flags and sub reg.
1221 .addReg(DestReg, 0, SubRegIdx0);
1222 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1223 .add(*Dest1)
1224 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1225
1226 CI.I->eraseFromParent();
1227 Paired.I->eraseFromParent();
1228
1229 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1230 return Read2;
1231 }
1232
write2Opcode(unsigned EltSize) const1233 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1234 if (STM->ldsRequiresM0Init())
1235 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1236 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1237 : AMDGPU::DS_WRITE2_B64_gfx9;
1238 }
1239
write2ST64Opcode(unsigned EltSize) const1240 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1241 if (STM->ldsRequiresM0Init())
1242 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1243 : AMDGPU::DS_WRITE2ST64_B64;
1244
1245 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1246 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1247 }
1248
mergeWrite2Pair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1249 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1250 CombineInfo &CI, CombineInfo &Paired,
1251 MachineBasicBlock::iterator InsertBefore) {
1252 MachineBasicBlock *MBB = CI.I->getParent();
1253
1254 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1255 // sure we preserve the subregister index and any register flags set on them.
1256 const MachineOperand *AddrReg =
1257 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1258 const MachineOperand *Data0 =
1259 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1260 const MachineOperand *Data1 =
1261 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1262
1263 unsigned NewOffset0 = CI.Offset;
1264 unsigned NewOffset1 = Paired.Offset;
1265 unsigned Opc =
1266 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1267
1268 if (NewOffset0 > NewOffset1) {
1269 // Canonicalize the merged instruction so the smaller offset comes first.
1270 std::swap(NewOffset0, NewOffset1);
1271 std::swap(Data0, Data1);
1272 }
1273
1274 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1275 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1276
1277 const MCInstrDesc &Write2Desc = TII->get(Opc);
1278 DebugLoc DL = CI.I->getDebugLoc();
1279
1280 Register BaseReg = AddrReg->getReg();
1281 unsigned BaseSubReg = AddrReg->getSubReg();
1282 unsigned BaseRegFlags = 0;
1283 if (CI.BaseOff) {
1284 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1285 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1286 .addImm(CI.BaseOff);
1287
1288 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1289 BaseRegFlags = RegState::Kill;
1290
1291 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1292 .addReg(ImmReg)
1293 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1294 .addImm(0); // clamp bit
1295 BaseSubReg = 0;
1296 }
1297
1298 MachineInstrBuilder Write2 =
1299 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1300 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1301 .add(*Data0) // data0
1302 .add(*Data1) // data1
1303 .addImm(NewOffset0) // offset0
1304 .addImm(NewOffset1) // offset1
1305 .addImm(0) // gds
1306 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1307
1308 CI.I->eraseFromParent();
1309 Paired.I->eraseFromParent();
1310
1311 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1312 return Write2;
1313 }
1314
1315 MachineBasicBlock::iterator
mergeImagePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1316 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1317 MachineBasicBlock::iterator InsertBefore) {
1318 MachineBasicBlock *MBB = CI.I->getParent();
1319 DebugLoc DL = CI.I->getDebugLoc();
1320 const unsigned Opcode = getNewOpcode(CI, Paired);
1321
1322 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1323
1324 Register DestReg = MRI->createVirtualRegister(SuperRC);
1325 unsigned MergedDMask = CI.DMask | Paired.DMask;
1326 unsigned DMaskIdx =
1327 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1328
1329 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1330 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1331 if (I == DMaskIdx)
1332 MIB.addImm(MergedDMask);
1333 else
1334 MIB.add((*CI.I).getOperand(I));
1335 }
1336
1337 // It shouldn't be possible to get this far if the two instructions
1338 // don't have a single memoperand, because MachineInstr::mayAlias()
1339 // will return true if this is the case.
1340 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1341
1342 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1343
1344 unsigned SubRegIdx0, SubRegIdx1;
1345 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1346
1347 // Copy to the old destination registers.
1348 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1349 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1350 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1351
1352 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1353 .add(*Dest0) // Copy to same destination including flags and sub reg.
1354 .addReg(DestReg, 0, SubRegIdx0);
1355 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1356 .add(*Dest1)
1357 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1358
1359 CI.I->eraseFromParent();
1360 Paired.I->eraseFromParent();
1361 return New;
1362 }
1363
mergeSMemLoadImmPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1364 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1365 CombineInfo &CI, CombineInfo &Paired,
1366 MachineBasicBlock::iterator InsertBefore) {
1367 MachineBasicBlock *MBB = CI.I->getParent();
1368 DebugLoc DL = CI.I->getDebugLoc();
1369 const unsigned Opcode = getNewOpcode(CI, Paired);
1370
1371 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1372
1373 Register DestReg = MRI->createVirtualRegister(SuperRC);
1374 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1375
1376 // It shouldn't be possible to get this far if the two instructions
1377 // don't have a single memoperand, because MachineInstr::mayAlias()
1378 // will return true if this is the case.
1379 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1380
1381 MachineInstrBuilder New =
1382 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1383 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1384 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1385 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1386 // For convenience, when SGPR_IMM buffer loads are merged into a
1387 // zero-offset load, we generate its SGPR variant.
1388 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::offset))
1389 New.addImm(MergedOffset);
1390 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1391
1392 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1393 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1394 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1395
1396 // Copy to the old destination registers.
1397 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1398 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1399 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1400
1401 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1402 .add(*Dest0) // Copy to same destination including flags and sub reg.
1403 .addReg(DestReg, 0, SubRegIdx0);
1404 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1405 .add(*Dest1)
1406 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1407
1408 CI.I->eraseFromParent();
1409 Paired.I->eraseFromParent();
1410 return New;
1411 }
1412
mergeBufferLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1413 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1414 CombineInfo &CI, CombineInfo &Paired,
1415 MachineBasicBlock::iterator InsertBefore) {
1416 MachineBasicBlock *MBB = CI.I->getParent();
1417 DebugLoc DL = CI.I->getDebugLoc();
1418
1419 const unsigned Opcode = getNewOpcode(CI, Paired);
1420
1421 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1422
1423 // Copy to the new source register.
1424 Register DestReg = MRI->createVirtualRegister(SuperRC);
1425 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1426
1427 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1428
1429 AddressRegs Regs = getRegs(Opcode, *TII);
1430
1431 if (Regs.VAddr)
1432 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1433
1434 // It shouldn't be possible to get this far if the two instructions
1435 // don't have a single memoperand, because MachineInstr::mayAlias()
1436 // will return true if this is the case.
1437 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1438
1439 MachineInstr *New =
1440 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1441 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1442 .addImm(MergedOffset) // offset
1443 .addImm(CI.CPol) // cpol
1444 .addImm(0) // swz
1445 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1446
1447 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1448 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1449 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1450
1451 // Copy to the old destination registers.
1452 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1453 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1454 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1455
1456 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1457 .add(*Dest0) // Copy to same destination including flags and sub reg.
1458 .addReg(DestReg, 0, SubRegIdx0);
1459 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1460 .add(*Dest1)
1461 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1462
1463 CI.I->eraseFromParent();
1464 Paired.I->eraseFromParent();
1465 return New;
1466 }
1467
mergeTBufferLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1468 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1469 CombineInfo &CI, CombineInfo &Paired,
1470 MachineBasicBlock::iterator InsertBefore) {
1471 MachineBasicBlock *MBB = CI.I->getParent();
1472 DebugLoc DL = CI.I->getDebugLoc();
1473
1474 const unsigned Opcode = getNewOpcode(CI, Paired);
1475
1476 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1477
1478 // Copy to the new source register.
1479 Register DestReg = MRI->createVirtualRegister(SuperRC);
1480 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1481
1482 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1483
1484 AddressRegs Regs = getRegs(Opcode, *TII);
1485
1486 if (Regs.VAddr)
1487 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1488
1489 unsigned JoinedFormat =
1490 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1491
1492 // It shouldn't be possible to get this far if the two instructions
1493 // don't have a single memoperand, because MachineInstr::mayAlias()
1494 // will return true if this is the case.
1495 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1496
1497 MachineInstr *New =
1498 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1499 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1500 .addImm(MergedOffset) // offset
1501 .addImm(JoinedFormat) // format
1502 .addImm(CI.CPol) // cpol
1503 .addImm(0) // swz
1504 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1505
1506 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1507 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1508 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1509
1510 // Copy to the old destination registers.
1511 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1512 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1513 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1514
1515 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1516 .add(*Dest0) // Copy to same destination including flags and sub reg.
1517 .addReg(DestReg, 0, SubRegIdx0);
1518 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1519 .add(*Dest1)
1520 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1521
1522 CI.I->eraseFromParent();
1523 Paired.I->eraseFromParent();
1524 return New;
1525 }
1526
mergeTBufferStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1527 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1528 CombineInfo &CI, CombineInfo &Paired,
1529 MachineBasicBlock::iterator InsertBefore) {
1530 MachineBasicBlock *MBB = CI.I->getParent();
1531 DebugLoc DL = CI.I->getDebugLoc();
1532
1533 const unsigned Opcode = getNewOpcode(CI, Paired);
1534
1535 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1536 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1537 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1538
1539 // Copy to the new source register.
1540 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1541 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1542
1543 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1544 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1545
1546 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1547 .add(*Src0)
1548 .addImm(SubRegIdx0)
1549 .add(*Src1)
1550 .addImm(SubRegIdx1);
1551
1552 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1553 .addReg(SrcReg, RegState::Kill);
1554
1555 AddressRegs Regs = getRegs(Opcode, *TII);
1556
1557 if (Regs.VAddr)
1558 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1559
1560 unsigned JoinedFormat =
1561 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1562
1563 // It shouldn't be possible to get this far if the two instructions
1564 // don't have a single memoperand, because MachineInstr::mayAlias()
1565 // will return true if this is the case.
1566 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1567
1568 MachineInstr *New =
1569 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1570 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1571 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1572 .addImm(JoinedFormat) // format
1573 .addImm(CI.CPol) // cpol
1574 .addImm(0) // swz
1575 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1576
1577 CI.I->eraseFromParent();
1578 Paired.I->eraseFromParent();
1579 return New;
1580 }
1581
mergeFlatLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1582 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1583 CombineInfo &CI, CombineInfo &Paired,
1584 MachineBasicBlock::iterator InsertBefore) {
1585 MachineBasicBlock *MBB = CI.I->getParent();
1586 DebugLoc DL = CI.I->getDebugLoc();
1587
1588 const unsigned Opcode = getNewOpcode(CI, Paired);
1589
1590 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1591 Register DestReg = MRI->createVirtualRegister(SuperRC);
1592
1593 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1594
1595 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1596 MIB.add(*SAddr);
1597
1598 MachineInstr *New =
1599 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1600 .addImm(std::min(CI.Offset, Paired.Offset))
1601 .addImm(CI.CPol)
1602 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1603
1604 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1605 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1606 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1607
1608 // Copy to the old destination registers.
1609 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1610 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1611 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1612
1613 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1614 .add(*Dest0) // Copy to same destination including flags and sub reg.
1615 .addReg(DestReg, 0, SubRegIdx0);
1616 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1617 .add(*Dest1)
1618 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1619
1620 CI.I->eraseFromParent();
1621 Paired.I->eraseFromParent();
1622 return New;
1623 }
1624
mergeFlatStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1625 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1626 CombineInfo &CI, CombineInfo &Paired,
1627 MachineBasicBlock::iterator InsertBefore) {
1628 MachineBasicBlock *MBB = CI.I->getParent();
1629 DebugLoc DL = CI.I->getDebugLoc();
1630
1631 const unsigned Opcode = getNewOpcode(CI, Paired);
1632
1633 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1634 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1635 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1636
1637 // Copy to the new source register.
1638 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1639 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1640
1641 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1642 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1643
1644 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1645 .add(*Src0)
1646 .addImm(SubRegIdx0)
1647 .add(*Src1)
1648 .addImm(SubRegIdx1);
1649
1650 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1651 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1652 .addReg(SrcReg, RegState::Kill);
1653
1654 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1655 MIB.add(*SAddr);
1656
1657 MachineInstr *New =
1658 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1659 .addImm(CI.CPol)
1660 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1661
1662 CI.I->eraseFromParent();
1663 Paired.I->eraseFromParent();
1664 return New;
1665 }
1666
getNewOpcode(const CombineInfo & CI,const CombineInfo & Paired)1667 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1668 const CombineInfo &Paired) {
1669 const unsigned Width = CI.Width + Paired.Width;
1670
1671 switch (getCommonInstClass(CI, Paired)) {
1672 default:
1673 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1674 // FIXME: Handle d16 correctly
1675 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1676 Width);
1677 case TBUFFER_LOAD:
1678 case TBUFFER_STORE:
1679 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1680 Width);
1681
1682 case UNKNOWN:
1683 llvm_unreachable("Unknown instruction class");
1684 case S_BUFFER_LOAD_IMM:
1685 switch (Width) {
1686 default:
1687 return 0;
1688 case 2:
1689 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1690 case 4:
1691 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1692 case 8:
1693 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1694 }
1695 case S_BUFFER_LOAD_SGPR_IMM:
1696 switch (Width) {
1697 default:
1698 return 0;
1699 case 2:
1700 return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR
1701 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1702 case 4:
1703 return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR
1704 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1705 case 8:
1706 return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR
1707 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1708 }
1709 case S_LOAD_IMM:
1710 switch (Width) {
1711 default:
1712 return 0;
1713 case 2:
1714 return AMDGPU::S_LOAD_DWORDX2_IMM;
1715 case 4:
1716 return AMDGPU::S_LOAD_DWORDX4_IMM;
1717 case 8:
1718 return AMDGPU::S_LOAD_DWORDX8_IMM;
1719 }
1720 case GLOBAL_LOAD:
1721 switch (Width) {
1722 default:
1723 return 0;
1724 case 2:
1725 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1726 case 3:
1727 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1728 case 4:
1729 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1730 }
1731 case GLOBAL_LOAD_SADDR:
1732 switch (Width) {
1733 default:
1734 return 0;
1735 case 2:
1736 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1737 case 3:
1738 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1739 case 4:
1740 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1741 }
1742 case GLOBAL_STORE:
1743 switch (Width) {
1744 default:
1745 return 0;
1746 case 2:
1747 return AMDGPU::GLOBAL_STORE_DWORDX2;
1748 case 3:
1749 return AMDGPU::GLOBAL_STORE_DWORDX3;
1750 case 4:
1751 return AMDGPU::GLOBAL_STORE_DWORDX4;
1752 }
1753 case GLOBAL_STORE_SADDR:
1754 switch (Width) {
1755 default:
1756 return 0;
1757 case 2:
1758 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1759 case 3:
1760 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1761 case 4:
1762 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1763 }
1764 case FLAT_LOAD:
1765 switch (Width) {
1766 default:
1767 return 0;
1768 case 2:
1769 return AMDGPU::FLAT_LOAD_DWORDX2;
1770 case 3:
1771 return AMDGPU::FLAT_LOAD_DWORDX3;
1772 case 4:
1773 return AMDGPU::FLAT_LOAD_DWORDX4;
1774 }
1775 case FLAT_STORE:
1776 switch (Width) {
1777 default:
1778 return 0;
1779 case 2:
1780 return AMDGPU::FLAT_STORE_DWORDX2;
1781 case 3:
1782 return AMDGPU::FLAT_STORE_DWORDX3;
1783 case 4:
1784 return AMDGPU::FLAT_STORE_DWORDX4;
1785 }
1786 case MIMG:
1787 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1788 "No overlaps");
1789 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1790 }
1791 }
1792
1793 std::pair<unsigned, unsigned>
getSubRegIdxs(const CombineInfo & CI,const CombineInfo & Paired)1794 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1795 const CombineInfo &Paired) {
1796 assert((CI.InstClass != MIMG ||
1797 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1798 CI.Width + Paired.Width)) &&
1799 "No overlaps");
1800
1801 unsigned Idx0;
1802 unsigned Idx1;
1803
1804 static const unsigned Idxs[5][4] = {
1805 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1806 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1807 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1808 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1809 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1810 };
1811
1812 assert(CI.Width >= 1 && CI.Width <= 4);
1813 assert(Paired.Width >= 1 && Paired.Width <= 4);
1814
1815 if (Paired < CI) {
1816 Idx1 = Idxs[0][Paired.Width - 1];
1817 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1818 } else {
1819 Idx0 = Idxs[0][CI.Width - 1];
1820 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1821 }
1822
1823 return std::pair(Idx0, Idx1);
1824 }
1825
1826 const TargetRegisterClass *
getTargetRegisterClass(const CombineInfo & CI,const CombineInfo & Paired)1827 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1828 const CombineInfo &Paired) {
1829 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1830 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1831 switch (CI.Width + Paired.Width) {
1832 default:
1833 return nullptr;
1834 case 2:
1835 return &AMDGPU::SReg_64_XEXECRegClass;
1836 case 4:
1837 return &AMDGPU::SGPR_128RegClass;
1838 case 8:
1839 return &AMDGPU::SGPR_256RegClass;
1840 case 16:
1841 return &AMDGPU::SGPR_512RegClass;
1842 }
1843 }
1844
1845 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1846 return TRI->isAGPRClass(getDataRegClass(*CI.I))
1847 ? TRI->getAGPRClassForBitWidth(BitWidth)
1848 : TRI->getVGPRClassForBitWidth(BitWidth);
1849 }
1850
mergeBufferStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1851 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1852 CombineInfo &CI, CombineInfo &Paired,
1853 MachineBasicBlock::iterator InsertBefore) {
1854 MachineBasicBlock *MBB = CI.I->getParent();
1855 DebugLoc DL = CI.I->getDebugLoc();
1856
1857 const unsigned Opcode = getNewOpcode(CI, Paired);
1858
1859 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1860 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1861 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1862
1863 // Copy to the new source register.
1864 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1865 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1866
1867 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1868 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1869
1870 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1871 .add(*Src0)
1872 .addImm(SubRegIdx0)
1873 .add(*Src1)
1874 .addImm(SubRegIdx1);
1875
1876 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1877 .addReg(SrcReg, RegState::Kill);
1878
1879 AddressRegs Regs = getRegs(Opcode, *TII);
1880
1881 if (Regs.VAddr)
1882 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1883
1884
1885 // It shouldn't be possible to get this far if the two instructions
1886 // don't have a single memoperand, because MachineInstr::mayAlias()
1887 // will return true if this is the case.
1888 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1889
1890 MachineInstr *New =
1891 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1892 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1893 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1894 .addImm(CI.CPol) // cpol
1895 .addImm(0) // swz
1896 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1897
1898 CI.I->eraseFromParent();
1899 Paired.I->eraseFromParent();
1900 return New;
1901 }
1902
1903 MachineOperand
createRegOrImm(int32_t Val,MachineInstr & MI) const1904 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1905 APInt V(32, Val, true);
1906 if (TII->isInlineConstant(V))
1907 return MachineOperand::CreateImm(Val);
1908
1909 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1910 MachineInstr *Mov =
1911 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1912 TII->get(AMDGPU::S_MOV_B32), Reg)
1913 .addImm(Val);
1914 (void)Mov;
1915 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1916 return MachineOperand::CreateReg(Reg, false);
1917 }
1918
1919 // Compute base address using Addr and return the final register.
computeBase(MachineInstr & MI,const MemAddress & Addr) const1920 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1921 const MemAddress &Addr) const {
1922 MachineBasicBlock *MBB = MI.getParent();
1923 MachineBasicBlock::iterator MBBI = MI.getIterator();
1924 DebugLoc DL = MI.getDebugLoc();
1925
1926 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1927 Addr.Base.LoSubReg) &&
1928 "Expected 32-bit Base-Register-Low!!");
1929
1930 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1931 Addr.Base.HiSubReg) &&
1932 "Expected 32-bit Base-Register-Hi!!");
1933
1934 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1935 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1936 MachineOperand OffsetHi =
1937 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1938
1939 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1940 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1941 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1942
1943 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1944 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1945 MachineInstr *LoHalf =
1946 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1947 .addReg(CarryReg, RegState::Define)
1948 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1949 .add(OffsetLo)
1950 .addImm(0); // clamp bit
1951 (void)LoHalf;
1952 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1953
1954 MachineInstr *HiHalf =
1955 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1956 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1957 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1958 .add(OffsetHi)
1959 .addReg(CarryReg, RegState::Kill)
1960 .addImm(0); // clamp bit
1961 (void)HiHalf;
1962 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1963
1964 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1965 MachineInstr *FullBase =
1966 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1967 .addReg(DestSub0)
1968 .addImm(AMDGPU::sub0)
1969 .addReg(DestSub1)
1970 .addImm(AMDGPU::sub1);
1971 (void)FullBase;
1972 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1973
1974 return FullDestReg;
1975 }
1976
1977 // Update base and offset with the NewBase and NewOffset in MI.
updateBaseAndOffset(MachineInstr & MI,Register NewBase,int32_t NewOffset) const1978 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1979 Register NewBase,
1980 int32_t NewOffset) const {
1981 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1982 Base->setReg(NewBase);
1983 Base->setIsKill(false);
1984 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1985 }
1986
1987 std::optional<int32_t>
extractConstOffset(const MachineOperand & Op) const1988 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1989 if (Op.isImm())
1990 return Op.getImm();
1991
1992 if (!Op.isReg())
1993 return std::nullopt;
1994
1995 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1996 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1997 !Def->getOperand(1).isImm())
1998 return std::nullopt;
1999
2000 return Def->getOperand(1).getImm();
2001 }
2002
2003 // Analyze Base and extracts:
2004 // - 32bit base registers, subregisters
2005 // - 64bit constant offset
2006 // Expecting base computation as:
2007 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
2008 // %LO:vgpr_32, %c:sreg_64_xexec =
2009 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2010 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2011 // %Base:vreg_64 =
2012 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
processBaseWithConstOffset(const MachineOperand & Base,MemAddress & Addr) const2013 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2014 MemAddress &Addr) const {
2015 if (!Base.isReg())
2016 return;
2017
2018 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2019 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2020 || Def->getNumOperands() != 5)
2021 return;
2022
2023 MachineOperand BaseLo = Def->getOperand(1);
2024 MachineOperand BaseHi = Def->getOperand(3);
2025 if (!BaseLo.isReg() || !BaseHi.isReg())
2026 return;
2027
2028 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2029 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2030
2031 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2032 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2033 return;
2034
2035 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2036 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2037
2038 auto Offset0P = extractConstOffset(*Src0);
2039 if (Offset0P)
2040 BaseLo = *Src1;
2041 else {
2042 if (!(Offset0P = extractConstOffset(*Src1)))
2043 return;
2044 BaseLo = *Src0;
2045 }
2046
2047 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2048 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2049
2050 if (Src0->isImm())
2051 std::swap(Src0, Src1);
2052
2053 if (!Src1->isImm())
2054 return;
2055
2056 uint64_t Offset1 = Src1->getImm();
2057 BaseHi = *Src0;
2058
2059 Addr.Base.LoReg = BaseLo.getReg();
2060 Addr.Base.HiReg = BaseHi.getReg();
2061 Addr.Base.LoSubReg = BaseLo.getSubReg();
2062 Addr.Base.HiSubReg = BaseHi.getSubReg();
2063 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2064 }
2065
promoteConstantOffsetToImm(MachineInstr & MI,MemInfoMap & Visited,SmallPtrSet<MachineInstr *,4> & AnchorList) const2066 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2067 MachineInstr &MI,
2068 MemInfoMap &Visited,
2069 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2070
2071 if (!(MI.mayLoad() ^ MI.mayStore()))
2072 return false;
2073
2074 // TODO: Support flat and scratch.
2075 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2076 return false;
2077
2078 if (MI.mayLoad() &&
2079 TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2080 return false;
2081
2082 if (AnchorList.count(&MI))
2083 return false;
2084
2085 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2086
2087 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2088 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2089 return false;
2090 }
2091
2092 // Step1: Find the base-registers and a 64bit constant offset.
2093 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2094 MemAddress MAddr;
2095 if (Visited.find(&MI) == Visited.end()) {
2096 processBaseWithConstOffset(Base, MAddr);
2097 Visited[&MI] = MAddr;
2098 } else
2099 MAddr = Visited[&MI];
2100
2101 if (MAddr.Offset == 0) {
2102 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2103 " constant offsets that can be promoted.\n";);
2104 return false;
2105 }
2106
2107 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2108 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2109
2110 // Step2: Traverse through MI's basic block and find an anchor(that has the
2111 // same base-registers) with the highest 13bit distance from MI's offset.
2112 // E.g. (64bit loads)
2113 // bb:
2114 // addr1 = &a + 4096; load1 = load(addr1, 0)
2115 // addr2 = &a + 6144; load2 = load(addr2, 0)
2116 // addr3 = &a + 8192; load3 = load(addr3, 0)
2117 // addr4 = &a + 10240; load4 = load(addr4, 0)
2118 // addr5 = &a + 12288; load5 = load(addr5, 0)
2119 //
2120 // Starting from the first load, the optimization will try to find a new base
2121 // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
2122 // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
2123 // as the new-base(anchor) because of the maximum distance which can
2124 // accommodate more intermediate bases presumably.
2125 //
2126 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2127 // (&a + 8192) for load1, load2, load4.
2128 // addr = &a + 8192
2129 // load1 = load(addr, -4096)
2130 // load2 = load(addr, -2048)
2131 // load3 = load(addr, 0)
2132 // load4 = load(addr, 2048)
2133 // addr5 = &a + 12288; load5 = load(addr5, 0)
2134 //
2135 MachineInstr *AnchorInst = nullptr;
2136 MemAddress AnchorAddr;
2137 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2138 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2139
2140 MachineBasicBlock *MBB = MI.getParent();
2141 MachineBasicBlock::iterator E = MBB->end();
2142 MachineBasicBlock::iterator MBBI = MI.getIterator();
2143 ++MBBI;
2144 const SITargetLowering *TLI =
2145 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2146
2147 for ( ; MBBI != E; ++MBBI) {
2148 MachineInstr &MINext = *MBBI;
2149 // TODO: Support finding an anchor(with same base) from store addresses or
2150 // any other load addresses where the opcodes are different.
2151 if (MINext.getOpcode() != MI.getOpcode() ||
2152 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2153 continue;
2154
2155 const MachineOperand &BaseNext =
2156 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2157 MemAddress MAddrNext;
2158 if (Visited.find(&MINext) == Visited.end()) {
2159 processBaseWithConstOffset(BaseNext, MAddrNext);
2160 Visited[&MINext] = MAddrNext;
2161 } else
2162 MAddrNext = Visited[&MINext];
2163
2164 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2165 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2166 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2167 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2168 continue;
2169
2170 InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2171
2172 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2173 TargetLoweringBase::AddrMode AM;
2174 AM.HasBaseReg = true;
2175 AM.BaseOffs = Dist;
2176 if (TLI->isLegalGlobalAddressingMode(AM) &&
2177 (uint32_t)std::abs(Dist) > MaxDist) {
2178 MaxDist = std::abs(Dist);
2179
2180 AnchorAddr = MAddrNext;
2181 AnchorInst = &MINext;
2182 }
2183 }
2184
2185 if (AnchorInst) {
2186 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2187 AnchorInst->dump());
2188 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2189 << AnchorAddr.Offset << "\n\n");
2190
2191 // Instead of moving up, just re-compute anchor-instruction's base address.
2192 Register Base = computeBase(MI, AnchorAddr);
2193
2194 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2195 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2196
2197 for (auto P : InstsWCommonBase) {
2198 TargetLoweringBase::AddrMode AM;
2199 AM.HasBaseReg = true;
2200 AM.BaseOffs = P.second - AnchorAddr.Offset;
2201
2202 if (TLI->isLegalGlobalAddressingMode(AM)) {
2203 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
2204 dbgs() << ")"; P.first->dump());
2205 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2206 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
2207 }
2208 }
2209 AnchorList.insert(AnchorInst);
2210 return true;
2211 }
2212
2213 return false;
2214 }
2215
addInstToMergeableList(const CombineInfo & CI,std::list<std::list<CombineInfo>> & MergeableInsts) const2216 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2217 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2218 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2219 if (AddrList.front().InstClass == CI.InstClass &&
2220 AddrList.front().IsAGPR == CI.IsAGPR &&
2221 AddrList.front().hasSameBaseAddress(CI)) {
2222 AddrList.emplace_back(CI);
2223 return;
2224 }
2225 }
2226
2227 // Base address not found, so add a new list.
2228 MergeableInsts.emplace_back(1, CI);
2229 }
2230
2231 std::pair<MachineBasicBlock::iterator, bool>
collectMergeableInsts(MachineBasicBlock::iterator Begin,MachineBasicBlock::iterator End,MemInfoMap & Visited,SmallPtrSet<MachineInstr *,4> & AnchorList,std::list<std::list<CombineInfo>> & MergeableInsts) const2232 SILoadStoreOptimizer::collectMergeableInsts(
2233 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2234 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2235 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2236 bool Modified = false;
2237
2238 // Sort potential mergeable instructions into lists. One list per base address.
2239 unsigned Order = 0;
2240 MachineBasicBlock::iterator BlockI = Begin;
2241 for (; BlockI != End; ++BlockI) {
2242 MachineInstr &MI = *BlockI;
2243
2244 // We run this before checking if an address is mergeable, because it can produce
2245 // better code even if the instructions aren't mergeable.
2246 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2247 Modified = true;
2248
2249 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2250 // barriers. We can look after this barrier for separate merges.
2251 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2252 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2253
2254 // Search will resume after this instruction in a separate merge list.
2255 ++BlockI;
2256 break;
2257 }
2258
2259 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2260 if (InstClass == UNKNOWN)
2261 continue;
2262
2263 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2264 int Swizzled =
2265 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2266 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2267 continue;
2268
2269 CombineInfo CI;
2270 CI.setMI(MI, *this);
2271 CI.Order = Order++;
2272
2273 if (!CI.hasMergeableAddress(*MRI))
2274 continue;
2275
2276 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2277 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2278 // operands. However we are reporting that ds_write2 shall have
2279 // only VGPR data so that machine copy propagation does not
2280 // create an illegal instruction with a VGPR and AGPR sources.
2281 // Consequenctially if we create such instruction the verifier
2282 // will complain.
2283 continue;
2284 }
2285
2286 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2287
2288 addInstToMergeableList(CI, MergeableInsts);
2289 }
2290
2291 // At this point we have lists of Mergeable instructions.
2292 //
2293 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2294 // list try to find an instruction that can be merged with I. If an instruction
2295 // is found, it is stored in the Paired field. If no instructions are found, then
2296 // the CombineInfo object is deleted from the list.
2297
2298 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2299 E = MergeableInsts.end(); I != E;) {
2300
2301 std::list<CombineInfo> &MergeList = *I;
2302 if (MergeList.size() <= 1) {
2303 // This means we have found only one instruction with a given address
2304 // that can be merged, and we need at least 2 instructions to do a merge,
2305 // so this list can be discarded.
2306 I = MergeableInsts.erase(I);
2307 continue;
2308 }
2309
2310 // Sort the lists by offsets, this way mergeable instructions will be
2311 // adjacent to each other in the list, which will make it easier to find
2312 // matches.
2313 MergeList.sort(
2314 [] (const CombineInfo &A, const CombineInfo &B) {
2315 return A.Offset < B.Offset;
2316 });
2317 ++I;
2318 }
2319
2320 return std::pair(BlockI, Modified);
2321 }
2322
2323 // Scan through looking for adjacent LDS operations with constant offsets from
2324 // the same base register. We rely on the scheduler to do the hard work of
2325 // clustering nearby loads, and assume these are all adjacent.
optimizeBlock(std::list<std::list<CombineInfo>> & MergeableInsts)2326 bool SILoadStoreOptimizer::optimizeBlock(
2327 std::list<std::list<CombineInfo> > &MergeableInsts) {
2328 bool Modified = false;
2329
2330 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2331 E = MergeableInsts.end(); I != E;) {
2332 std::list<CombineInfo> &MergeList = *I;
2333
2334 bool OptimizeListAgain = false;
2335 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2336 // We weren't able to make any changes, so delete the list so we don't
2337 // process the same instructions the next time we try to optimize this
2338 // block.
2339 I = MergeableInsts.erase(I);
2340 continue;
2341 }
2342
2343 Modified = true;
2344
2345 // We made changes, but also determined that there were no more optimization
2346 // opportunities, so we don't need to reprocess the list
2347 if (!OptimizeListAgain) {
2348 I = MergeableInsts.erase(I);
2349 continue;
2350 }
2351 OptimizeAgain = true;
2352 }
2353 return Modified;
2354 }
2355
2356 bool
optimizeInstsWithSameBaseAddr(std::list<CombineInfo> & MergeList,bool & OptimizeListAgain)2357 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2358 std::list<CombineInfo> &MergeList,
2359 bool &OptimizeListAgain) {
2360 if (MergeList.empty())
2361 return false;
2362
2363 bool Modified = false;
2364
2365 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2366 Next = std::next(I)) {
2367
2368 auto First = I;
2369 auto Second = Next;
2370
2371 if ((*First).Order > (*Second).Order)
2372 std::swap(First, Second);
2373 CombineInfo &CI = *First;
2374 CombineInfo &Paired = *Second;
2375
2376 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2377 if (!Where) {
2378 ++I;
2379 continue;
2380 }
2381
2382 Modified = true;
2383
2384 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2385
2386 MachineBasicBlock::iterator NewMI;
2387 switch (CI.InstClass) {
2388 default:
2389 llvm_unreachable("unknown InstClass");
2390 break;
2391 case DS_READ:
2392 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2393 break;
2394 case DS_WRITE:
2395 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2396 break;
2397 case S_BUFFER_LOAD_IMM:
2398 case S_BUFFER_LOAD_SGPR_IMM:
2399 case S_LOAD_IMM:
2400 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2401 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2402 break;
2403 case BUFFER_LOAD:
2404 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2405 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2406 break;
2407 case BUFFER_STORE:
2408 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2409 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2410 break;
2411 case MIMG:
2412 NewMI = mergeImagePair(CI, Paired, Where->I);
2413 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2414 break;
2415 case TBUFFER_LOAD:
2416 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2417 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2418 break;
2419 case TBUFFER_STORE:
2420 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2421 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2422 break;
2423 case FLAT_LOAD:
2424 case GLOBAL_LOAD:
2425 case GLOBAL_LOAD_SADDR:
2426 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2427 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2428 break;
2429 case FLAT_STORE:
2430 case GLOBAL_STORE:
2431 case GLOBAL_STORE_SADDR:
2432 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2433 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2434 break;
2435 }
2436 CI.setMI(NewMI, *this);
2437 CI.Order = Where->Order;
2438 if (I == Second)
2439 I = Next;
2440
2441 MergeList.erase(Second);
2442 }
2443
2444 return Modified;
2445 }
2446
runOnMachineFunction(MachineFunction & MF)2447 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2448 if (skipFunction(MF.getFunction()))
2449 return false;
2450
2451 STM = &MF.getSubtarget<GCNSubtarget>();
2452 if (!STM->loadStoreOptEnabled())
2453 return false;
2454
2455 TII = STM->getInstrInfo();
2456 TRI = &TII->getRegisterInfo();
2457
2458 MRI = &MF.getRegInfo();
2459 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2460
2461 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2462
2463 bool Modified = false;
2464
2465 // Contains the list of instructions for which constant offsets are being
2466 // promoted to the IMM. This is tracked for an entire block at time.
2467 SmallPtrSet<MachineInstr *, 4> AnchorList;
2468 MemInfoMap Visited;
2469
2470 for (MachineBasicBlock &MBB : MF) {
2471 MachineBasicBlock::iterator SectionEnd;
2472 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2473 I = SectionEnd) {
2474 bool CollectModified;
2475 std::list<std::list<CombineInfo>> MergeableInsts;
2476
2477 // First pass: Collect list of all instructions we know how to merge in a
2478 // subset of the block.
2479 std::tie(SectionEnd, CollectModified) =
2480 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2481
2482 Modified |= CollectModified;
2483
2484 do {
2485 OptimizeAgain = false;
2486 Modified |= optimizeBlock(MergeableInsts);
2487 } while (OptimizeAgain);
2488 }
2489
2490 Visited.clear();
2491 AnchorList.clear();
2492 }
2493
2494 return Modified;
2495 }
2496