//===-- lib/CodeGen/GlobalISel/CombinerHelper.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
9 #include "llvm/CodeGen/GlobalISel/Combiner.h"
10 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
11 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
12 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
13 #include "llvm/CodeGen/GlobalISel/Utils.h"
14 #include "llvm/CodeGen/MachineDominators.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/MachineInstr.h"
17 #include "llvm/CodeGen/MachineRegisterInfo.h"
18 #include "llvm/CodeGen/TargetInstrInfo.h"
19 #include "llvm/CodeGen/TargetLowering.h"
20 #include "llvm/Target/TargetMachine.h"
21
22 #define DEBUG_TYPE "gi-combiner"
23
24 using namespace llvm;
25
26 // Option to allow testing of the combiner while no targets know about indexed
27 // addressing.
28 static cl::opt<bool>
29 ForceLegalIndexing("force-legal-indexing", cl::Hidden, cl::init(false),
30 cl::desc("Force all indexed operations to be "
31 "legal for the GlobalISel combiner"));
32
33
CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
                               MachineIRBuilder &B, GISelKnownBits *KB,
                               MachineDominatorTree *MDT)
    : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
      KB(KB), MDT(MDT) {
  (void)this->KB;
}
41
void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
                                    Register ToReg) const {
44 Observer.changingAllUsesOfReg(MRI, FromReg);
45
46 if (MRI.constrainRegAttrs(ToReg, FromReg))
47 MRI.replaceRegWith(FromReg, ToReg);
48 else
49 Builder.buildCopy(ToReg, FromReg);
50
51 Observer.finishedChangingAllUsesOfReg();
52 }
53
void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
                                      MachineOperand &FromRegOp,
                                      Register ToReg) const {
57 assert(FromRegOp.getParent() && "Expected an operand in an MI");
58 Observer.changingInstr(*FromRegOp.getParent());
59
60 FromRegOp.setReg(ToReg);
61
62 Observer.changedInstr(*FromRegOp.getParent());
63 }
64
bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
66 if (matchCombineCopy(MI)) {
67 applyCombineCopy(MI);
68 return true;
69 }
70 return false;
71 }
bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
73 if (MI.getOpcode() != TargetOpcode::COPY)
74 return false;
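  // What follows implements plain copy propagation: for a vreg-to-vreg COPY
  // whose constraints are compatible, e.g. (illustrative MIR, names invented)
  //   %b:_(s32) = COPY %a:_(s32)
  // every use of %b can be rewritten to use %a and the COPY erased.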
75 Register DstReg = MI.getOperand(0).getReg();
76 Register SrcReg = MI.getOperand(1).getReg();
77
78 // Give up if either DstReg or SrcReg is a physical register.
79 if (Register::isPhysicalRegister(DstReg) ||
80 Register::isPhysicalRegister(SrcReg))
81 return false;
82
  // Check that the register types are compatible.
84 LLT DstTy = MRI.getType(DstReg);
85 LLT SrcTy = MRI.getType(SrcReg);
86 // Give up if one has a valid LLT, but the other doesn't.
87 if (DstTy.isValid() != SrcTy.isValid())
88 return false;
89 // Give up if the types don't match.
90 if (DstTy.isValid() && SrcTy.isValid() && DstTy != SrcTy)
91 return false;
92
93 // Get the register banks and classes.
94 const RegisterBank *DstBank = MRI.getRegBankOrNull(DstReg);
95 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
96 const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(DstReg);
97 const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(SrcReg);
98
99 // Replace if the register constraints match.
100 if ((SrcRC == DstRC) && (SrcBank == DstBank))
101 return true;
102 // Replace if DstReg has no constraints.
103 if (!DstBank && !DstRC)
104 return true;
105
106 return false;
107 }
void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
109 Register DstReg = MI.getOperand(0).getReg();
110 Register SrcReg = MI.getOperand(1).getReg();
111 MI.eraseFromParent();
112 replaceRegWith(MRI, DstReg, SrcReg);
113 }
114
bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) {
116 bool IsUndef = false;
117 SmallVector<Register, 4> Ops;
118 if (matchCombineConcatVectors(MI, IsUndef, Ops)) {
119 applyCombineConcatVectors(MI, IsUndef, Ops);
120 return true;
121 }
122 return false;
123 }
124
bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
                                               SmallVectorImpl<Register> &Ops) {
127 assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
128 "Invalid instruction");
129 IsUndef = true;
130 MachineInstr *Undef = nullptr;
131
132 // Walk over all the operands of concat vectors and check if they are
133 // build_vector themselves or undef.
134 // Then collect their operands in Ops.
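  // As an illustrative sketch (register names invented for the example):
  //   %x:_(<2 x s32>) = G_BUILD_VECTOR %a(s32), %b(s32)
  //   %y:_(<2 x s32>) = G_IMPLICIT_DEF
  //   %z:_(<4 x s32>) = G_CONCAT_VECTORS %x(<2 x s32>), %y(<2 x s32>)
  // collects Ops = {%a, %b, undef, undef}, so the apply step can emit a single
  //   %z:_(<4 x s32>) = G_BUILD_VECTOR %a, %b, undef, undef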
135 for (const MachineOperand &MO : MI.uses()) {
136 Register Reg = MO.getReg();
137 MachineInstr *Def = MRI.getVRegDef(Reg);
138 assert(Def && "Operand not defined");
139 switch (Def->getOpcode()) {
140 case TargetOpcode::G_BUILD_VECTOR:
141 IsUndef = false;
142 // Remember the operands of the build_vector to fold
143 // them into the yet-to-build flattened concat vectors.
144 for (const MachineOperand &BuildVecMO : Def->uses())
145 Ops.push_back(BuildVecMO.getReg());
146 break;
147 case TargetOpcode::G_IMPLICIT_DEF: {
148 LLT OpType = MRI.getType(Reg);
149 // Keep one undef value for all the undef operands.
150 if (!Undef) {
151 Builder.setInsertPt(*MI.getParent(), MI);
152 Undef = Builder.buildUndef(OpType.getScalarType());
153 }
154 assert(MRI.getType(Undef->getOperand(0).getReg()) ==
155 OpType.getScalarType() &&
156 "All undefs should have the same type");
157 // Break the undef vector in as many scalar elements as needed
158 // for the flattening.
159 for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
160 EltIdx != EltEnd; ++EltIdx)
161 Ops.push_back(Undef->getOperand(0).getReg());
162 break;
163 }
164 default:
165 return false;
166 }
167 }
168 return true;
169 }
void CombinerHelper::applyCombineConcatVectors(
    MachineInstr &MI, bool IsUndef, const ArrayRef<Register> Ops) {
  // We determined that the concat_vectors can be flattened.
173 // Generate the flattened build_vector.
174 Register DstReg = MI.getOperand(0).getReg();
175 Builder.setInsertPt(*MI.getParent(), MI);
176 Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
177
  // Note: IsUndef is sort of redundant. We could have determined it by
  // checking that all Ops are undef. Alternatively, we could have
  // generated a build_vector of undefs and relied on another combine to
  // clean that up. For now, given we already gather this information
  // in matchCombineConcatVectors, just save compile time and issue the
  // right thing.
184 if (IsUndef)
185 Builder.buildUndef(NewDstReg);
186 else
187 Builder.buildBuildVector(NewDstReg, Ops);
188 MI.eraseFromParent();
189 replaceRegWith(MRI, DstReg, NewDstReg);
190 }
191
bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
193 SmallVector<Register, 4> Ops;
194 if (matchCombineShuffleVector(MI, Ops)) {
195 applyCombineShuffleVector(MI, Ops);
196 return true;
197 }
198 return false;
199 }
200
bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
                                               SmallVectorImpl<Register> &Ops) {
203 assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
204 "Invalid instruction kind");
205 LLT DstType = MRI.getType(MI.getOperand(0).getReg());
206 Register Src1 = MI.getOperand(1).getReg();
207 LLT SrcType = MRI.getType(Src1);
208 // As bizarre as it may look, shuffle vector can actually produce
209 // scalar! This is because at the IR level a <1 x ty> shuffle
210 // vector is perfectly valid.
211 unsigned DstNumElts = DstType.isVector() ? DstType.getNumElements() : 1;
212 unsigned SrcNumElts = SrcType.isVector() ? SrcType.getNumElements() : 1;
213
214 // If the resulting vector is smaller than the size of the source
215 // vectors being concatenated, we won't be able to replace the
216 // shuffle vector into a concat_vectors.
217 //
218 // Note: We may still be able to produce a concat_vectors fed by
219 // extract_vector_elt and so on. It is less clear that would
220 // be better though, so don't bother for now.
221 //
  // If the destination is a scalar, the size of the sources doesn't
  // matter. We will lower the shuffle to a plain copy. This only
  // works if the source and destination have the same size, but
  // that's covered by the next condition.
  //
  // TODO: If the sizes of the source and destination don't match,
  // we could still emit an extract_vector_elt in that case.
229 if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1)
230 return false;
231
232 // Check that the shuffle mask can be broken evenly between the
233 // different sources.
234 if (DstNumElts % SrcNumElts != 0)
235 return false;
236
237 // Mask length is a multiple of the source vector length.
238 // Check if the shuffle is some kind of concatenation of the input
239 // vectors.
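  // As an illustrative sketch with <2 x s32> sources, a mask of (0,1,2,3)
  // selects Src1 followed by Src2, a mask of (0,1,0,1) selects Src1 twice,
  // and a mask of (2,3,-1,-1) selects Src2 followed by an undef vector.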
240 unsigned NumConcat = DstNumElts / SrcNumElts;
241 SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
242 ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
243 for (unsigned i = 0; i != DstNumElts; ++i) {
244 int Idx = Mask[i];
245 // Undef value.
246 if (Idx < 0)
247 continue;
248 // Ensure the indices in each SrcType sized piece are sequential and that
249 // the same source is used for the whole piece.
250 if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
251 (ConcatSrcs[i / SrcNumElts] >= 0 &&
252 ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
253 return false;
254 // Remember which source this index came from.
255 ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
256 }
257
258 // The shuffle is concatenating multiple vectors together.
259 // Collect the different operands for that.
260 Register UndefReg;
261 Register Src2 = MI.getOperand(2).getReg();
262 for (auto Src : ConcatSrcs) {
263 if (Src < 0) {
264 if (!UndefReg) {
265 Builder.setInsertPt(*MI.getParent(), MI);
266 UndefReg = Builder.buildUndef(SrcType).getReg(0);
267 }
268 Ops.push_back(UndefReg);
269 } else if (Src == 0)
270 Ops.push_back(Src1);
271 else
272 Ops.push_back(Src2);
273 }
274 return true;
275 }
276
void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
                                               const ArrayRef<Register> Ops) {
279 Register DstReg = MI.getOperand(0).getReg();
280 Builder.setInsertPt(*MI.getParent(), MI);
281 Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
282
283 if (Ops.size() == 1)
284 Builder.buildCopy(NewDstReg, Ops[0]);
285 else
286 Builder.buildMerge(NewDstReg, Ops);
287
288 MI.eraseFromParent();
289 replaceRegWith(MRI, DstReg, NewDstReg);
290 }
291
292 namespace {
293
294 /// Select a preference between two uses. CurrentUse is the current preference
295 /// while *ForCandidate is attributes of the candidate under consideration.
PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse,
                                  const LLT &TyForCandidate,
                                  unsigned OpcodeForCandidate,
                                  MachineInstr *MIForCandidate) {
300 if (!CurrentUse.Ty.isValid()) {
301 if (CurrentUse.ExtendOpcode == OpcodeForCandidate ||
302 CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT)
303 return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
304 return CurrentUse;
305 }
306
307 // We permit the extend to hoist through basic blocks but this is only
308 // sensible if the target has extending loads. If you end up lowering back
309 // into a load and extend during the legalizer then the end result is
310 // hoisting the extend up to the load.
311
312 // Prefer defined extensions to undefined extensions as these are more
313 // likely to reduce the number of instructions.
314 if (OpcodeForCandidate == TargetOpcode::G_ANYEXT &&
315 CurrentUse.ExtendOpcode != TargetOpcode::G_ANYEXT)
316 return CurrentUse;
317 else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
318 OpcodeForCandidate != TargetOpcode::G_ANYEXT)
319 return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
320
  // Prefer sign extensions to zero extensions as sign-extensions tend to be
  // more expensive, so it pays to fold the more expensive extend into the
  // load and leave the cheaper one as a separate instruction if needed.
323 if (CurrentUse.Ty == TyForCandidate) {
324 if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT &&
325 OpcodeForCandidate == TargetOpcode::G_ZEXT)
326 return CurrentUse;
327 else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ZEXT &&
328 OpcodeForCandidate == TargetOpcode::G_SEXT)
329 return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
330 }
331
332 // This is potentially target specific. We've chosen the largest type
333 // because G_TRUNC is usually free. One potential catch with this is that
334 // some targets have a reduced number of larger registers than smaller
335 // registers and this choice potentially increases the live-range for the
336 // larger value.
337 if (TyForCandidate.getSizeInBits() > CurrentUse.Ty.getSizeInBits()) {
338 return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
339 }
340 return CurrentUse;
341 }
342
343 /// Find a suitable place to insert some instructions and insert them. This
344 /// function accounts for special cases like inserting before a PHI node.
345 /// The current strategy for inserting before PHI's is to duplicate the
346 /// instructions for each predecessor. However, while that's ok for G_TRUNC
347 /// on most targets since it generally requires no code, other targets/cases may
348 /// want to try harder to find a dominating block.
static void InsertInsnsWithoutSideEffectsBeforeUse(
    MachineIRBuilder &Builder, MachineInstr &DefMI, MachineOperand &UseMO,
    std::function<void(MachineBasicBlock *, MachineBasicBlock::iterator,
                       MachineOperand &UseMO)>
        Inserter) {
354 MachineInstr &UseMI = *UseMO.getParent();
355
356 MachineBasicBlock *InsertBB = UseMI.getParent();
357
358 // If the use is a PHI then we want the predecessor block instead.
359 if (UseMI.isPHI()) {
360 MachineOperand *PredBB = std::next(&UseMO);
361 InsertBB = PredBB->getMBB();
362 }
363
364 // If the block is the same block as the def then we want to insert just after
365 // the def instead of at the start of the block.
366 if (InsertBB == DefMI.getParent()) {
367 MachineBasicBlock::iterator InsertPt = &DefMI;
368 Inserter(InsertBB, std::next(InsertPt), UseMO);
369 return;
370 }
371
372 // Otherwise we want the start of the BB
373 Inserter(InsertBB, InsertBB->getFirstNonPHI(), UseMO);
374 }
375 } // end anonymous namespace
376
bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
378 PreferredTuple Preferred;
379 if (matchCombineExtendingLoads(MI, Preferred)) {
380 applyCombineExtendingLoads(MI, Preferred);
381 return true;
382 }
383 return false;
384 }
385
bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
                                                PreferredTuple &Preferred) {
388 // We match the loads and follow the uses to the extend instead of matching
389 // the extends and following the def to the load. This is because the load
390 // must remain in the same position for correctness (unless we also add code
391 // to find a safe place to sink it) whereas the extend is freely movable.
392 // It also prevents us from duplicating the load for the volatile case or just
393 // for performance.
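  // As an illustrative sketch, the overall effect on a well-formed input is:
  //   %v:_(s16) = G_LOAD %ptr(p0)
  //   %e:_(s32) = G_SEXT %v(s16)
  // becomes:
  //   %e:_(s32) = G_SEXTLOAD %ptr(p0)
  // with any remaining users of %v rewritten to consume a G_TRUNC of %e.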
394
395 if (MI.getOpcode() != TargetOpcode::G_LOAD &&
396 MI.getOpcode() != TargetOpcode::G_SEXTLOAD &&
397 MI.getOpcode() != TargetOpcode::G_ZEXTLOAD)
398 return false;
399
400 auto &LoadValue = MI.getOperand(0);
401 assert(LoadValue.isReg() && "Result wasn't a register?");
402
403 LLT LoadValueTy = MRI.getType(LoadValue.getReg());
404 if (!LoadValueTy.isScalar())
405 return false;
406
407 // Most architectures are going to legalize <s8 loads into at least a 1 byte
408 // load, and the MMOs can only describe memory accesses in multiples of bytes.
409 // If we try to perform extload combining on those, we can end up with
410 // %a(s8) = extload %ptr (load 1 byte from %ptr)
411 // ... which is an illegal extload instruction.
412 if (LoadValueTy.getSizeInBits() < 8)
413 return false;
414
  // Non-power-of-2 types will very likely be legalized into multiple loads,
  // so don't bother trying to match them into extending loads.
417 if (!isPowerOf2_32(LoadValueTy.getSizeInBits()))
418 return false;
419
420 // Find the preferred type aside from the any-extends (unless it's the only
  // one) and non-extending ops. We'll emit an extending load to that type and
  // emit a variant of (extend (trunc X)) for the others according to the
423 // relative type sizes. At the same time, pick an extend to use based on the
424 // extend involved in the chosen type.
425 unsigned PreferredOpcode = MI.getOpcode() == TargetOpcode::G_LOAD
426 ? TargetOpcode::G_ANYEXT
427 : MI.getOpcode() == TargetOpcode::G_SEXTLOAD
428 ? TargetOpcode::G_SEXT
429 : TargetOpcode::G_ZEXT;
430 Preferred = {LLT(), PreferredOpcode, nullptr};
431 for (auto &UseMI : MRI.use_instructions(LoadValue.getReg())) {
432 if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
433 UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
434 UseMI.getOpcode() == TargetOpcode::G_ANYEXT) {
435 Preferred = ChoosePreferredUse(Preferred,
436 MRI.getType(UseMI.getOperand(0).getReg()),
437 UseMI.getOpcode(), &UseMI);
438 }
439 }
440
441 // There were no extends
442 if (!Preferred.MI)
443 return false;
  // It should be impossible to choose an extend without selecting a different
  // type since by definition the result of an extend is larger.
446 assert(Preferred.Ty != LoadValueTy && "Extending to same type?");
447
448 LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI);
449 return true;
450 }
451
void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
                                                PreferredTuple &Preferred) {
454 // Rewrite the load to the chosen extending load.
455 Register ChosenDstReg = Preferred.MI->getOperand(0).getReg();
456
457 // Inserter to insert a truncate back to the original type at a given point
458 // with some basic CSE to limit truncate duplication to one per BB.
459 DenseMap<MachineBasicBlock *, MachineInstr *> EmittedInsns;
460 auto InsertTruncAt = [&](MachineBasicBlock *InsertIntoBB,
461 MachineBasicBlock::iterator InsertBefore,
462 MachineOperand &UseMO) {
463 MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB);
464 if (PreviouslyEmitted) {
465 Observer.changingInstr(*UseMO.getParent());
466 UseMO.setReg(PreviouslyEmitted->getOperand(0).getReg());
467 Observer.changedInstr(*UseMO.getParent());
468 return;
469 }
470
471 Builder.setInsertPt(*InsertIntoBB, InsertBefore);
472 Register NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
473 MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg);
474 EmittedInsns[InsertIntoBB] = NewMI;
475 replaceRegOpWith(MRI, UseMO, NewDstReg);
476 };
477
478 Observer.changingInstr(MI);
479 MI.setDesc(
480 Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT
481 ? TargetOpcode::G_SEXTLOAD
482 : Preferred.ExtendOpcode == TargetOpcode::G_ZEXT
483 ? TargetOpcode::G_ZEXTLOAD
484 : TargetOpcode::G_LOAD));
485
486 // Rewrite all the uses to fix up the types.
487 auto &LoadValue = MI.getOperand(0);
488 SmallVector<MachineOperand *, 4> Uses;
489 for (auto &UseMO : MRI.use_operands(LoadValue.getReg()))
490 Uses.push_back(&UseMO);
491
492 for (auto *UseMO : Uses) {
493 MachineInstr *UseMI = UseMO->getParent();
494
495 // If the extend is compatible with the preferred extend then we should fix
496 // up the type and extend so that it uses the preferred use.
497 if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
498 UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
499 Register UseDstReg = UseMI->getOperand(0).getReg();
500 MachineOperand &UseSrcMO = UseMI->getOperand(1);
501 const LLT &UseDstTy = MRI.getType(UseDstReg);
502 if (UseDstReg != ChosenDstReg) {
503 if (Preferred.Ty == UseDstTy) {
504 // If the use has the same type as the preferred use, then merge
505 // the vregs and erase the extend. For example:
506 // %1:_(s8) = G_LOAD ...
507 // %2:_(s32) = G_SEXT %1(s8)
508 // %3:_(s32) = G_ANYEXT %1(s8)
509 // ... = ... %3(s32)
510 // rewrites to:
511 // %2:_(s32) = G_SEXTLOAD ...
512 // ... = ... %2(s32)
513 replaceRegWith(MRI, UseDstReg, ChosenDstReg);
514 Observer.erasingInstr(*UseMO->getParent());
515 UseMO->getParent()->eraseFromParent();
516 } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
517 // If the preferred size is smaller, then keep the extend but extend
518 // from the result of the extending load. For example:
519 // %1:_(s8) = G_LOAD ...
520 // %2:_(s32) = G_SEXT %1(s8)
521 // %3:_(s64) = G_ANYEXT %1(s8)
522 // ... = ... %3(s64)
          // rewrites to:
524 // %2:_(s32) = G_SEXTLOAD ...
525 // %3:_(s64) = G_ANYEXT %2:_(s32)
526 // ... = ... %3(s64)
527 replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg);
528 } else {
          // If the preferred size is larger, then insert a truncate. For
          // example:
          //   %1:_(s8) = G_LOAD ...
          //   %2:_(s64) = G_SEXT %1(s8)
          //   %3:_(s32) = G_ZEXT %1(s8)
          //   ... = ... %3(s32)
          // rewrites to:
          //   %2:_(s64) = G_SEXTLOAD ...
          //   %4:_(s8) = G_TRUNC %2:_(s64)
          //   %3:_(s32) = G_ZEXT %4:_(s8)
          //   ... = ... %3(s32)
540 InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO,
541 InsertTruncAt);
542 }
543 continue;
544 }
545 // The use is (one of) the uses of the preferred use we chose earlier.
546 // We're going to update the load to def this value later so just erase
547 // the old extend.
548 Observer.erasingInstr(*UseMO->getParent());
549 UseMO->getParent()->eraseFromParent();
550 continue;
551 }
552
553 // The use isn't an extend. Truncate back to the type we originally loaded.
554 // This is free on many targets.
555 InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO, InsertTruncAt);
556 }
557
558 MI.getOperand(0).setReg(ChosenDstReg);
559 Observer.changedInstr(MI);
560 }
561
bool CombinerHelper::isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI) {
563 assert(DefMI.getParent() == UseMI.getParent());
564 if (&DefMI == &UseMI)
565 return false;
566
  // Loop through the basic block until we find one of the instructions.
  // Whichever of DefMI and UseMI we encounter first comes earlier in the
  // block, so DefMI is a predecessor exactly when we hit it first.
  MachineBasicBlock::const_iterator I = DefMI.getParent()->begin();
  for (; I != DefMI.getParent()->end(); ++I) {
    if (&*I == &DefMI)
      return true;
    if (&*I == &UseMI)
      return false;
  }

  llvm_unreachable("Block must contain both DefMI and UseMI");
573 }
574
bool CombinerHelper::dominates(MachineInstr &DefMI, MachineInstr &UseMI) {
576 if (MDT)
577 return MDT->dominates(&DefMI, &UseMI);
578 else if (DefMI.getParent() != UseMI.getParent())
579 return false;
580
581 return isPredecessor(DefMI, UseMI);
582 }
583
bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
                                            Register &Base, Register &Offset) {
586 auto &MF = *MI.getParent()->getParent();
587 const auto &TLI = *MF.getSubtarget().getTargetLowering();
588
589 #ifndef NDEBUG
590 unsigned Opcode = MI.getOpcode();
591 assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
592 Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
593 #endif
594
595 Base = MI.getOperand(1).getReg();
596 MachineInstr *BaseDef = MRI.getUniqueVRegDef(Base);
597 if (BaseDef && BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
598 return false;
599
600 LLVM_DEBUG(dbgs() << "Searching for post-indexing opportunity for: " << MI);
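  // In the post-indexed form we are looking for, the memory operation keeps
  // using Base unmodified and the G_PTR_ADD result becomes the written-back
  // address, roughly (sketch):
  //   %addr:_(p0) = G_PTR_ADD %base, %offset
  //   G_STORE %val(s32), %base(p0)
  // becomes a single G_INDEXED_STORE that also defines %addr, with the
  // pre-indexed flag set to 0.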
601
602 for (auto &Use : MRI.use_instructions(Base)) {
603 if (Use.getOpcode() != TargetOpcode::G_PTR_ADD)
604 continue;
605
606 Offset = Use.getOperand(2).getReg();
607 if (!ForceLegalIndexing &&
608 !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ false, MRI)) {
609 LLVM_DEBUG(dbgs() << " Ignoring candidate with illegal addrmode: "
610 << Use);
611 continue;
612 }
613
614 // Make sure the offset calculation is before the potentially indexed op.
615 // FIXME: we really care about dependency here. The offset calculation might
616 // be movable.
617 MachineInstr *OffsetDef = MRI.getUniqueVRegDef(Offset);
618 if (!OffsetDef || !dominates(*OffsetDef, MI)) {
619 LLVM_DEBUG(dbgs() << " Ignoring candidate with offset after mem-op: "
620 << Use);
621 continue;
622 }
623
624 // FIXME: check whether all uses of Base are load/store with foldable
625 // addressing modes. If so, using the normal addr-modes is better than
626 // forming an indexed one.
627
628 bool MemOpDominatesAddrUses = true;
629 for (auto &PtrAddUse : MRI.use_instructions(Use.getOperand(0).getReg())) {
630 if (!dominates(MI, PtrAddUse)) {
631 MemOpDominatesAddrUses = false;
632 break;
633 }
634 }
635
636 if (!MemOpDominatesAddrUses) {
637 LLVM_DEBUG(
638 dbgs() << " Ignoring candidate as memop does not dominate uses: "
639 << Use);
640 continue;
641 }
642
643 LLVM_DEBUG(dbgs() << " Found match: " << Use);
644 Addr = Use.getOperand(0).getReg();
645 return true;
646 }
647
648 return false;
649 }
650
bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr,
                                           Register &Base, Register &Offset) {
653 auto &MF = *MI.getParent()->getParent();
654 const auto &TLI = *MF.getSubtarget().getTargetLowering();
655
656 #ifndef NDEBUG
657 unsigned Opcode = MI.getOpcode();
658 assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
659 Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
660 #endif
661
662 Addr = MI.getOperand(1).getReg();
663 MachineInstr *AddrDef = getOpcodeDef(TargetOpcode::G_PTR_ADD, Addr, MRI);
664 if (!AddrDef || MRI.hasOneUse(Addr))
665 return false;
666
667 Base = AddrDef->getOperand(1).getReg();
668 Offset = AddrDef->getOperand(2).getReg();
669
670 LLVM_DEBUG(dbgs() << "Found potential pre-indexed load_store: " << MI);
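  // Pre-indexing folds the pointer arithmetic into the access itself, roughly
  // (sketch):
  //   %addr:_(p0) = G_PTR_ADD %base, %offset
  //   %val:_(s32) = G_LOAD %addr(p0)
  // becomes a G_INDEXED_LOAD producing both %val and the updated address, with
  // the pre-indexed flag set to 1.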
671
672 if (!ForceLegalIndexing &&
673 !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ true, MRI)) {
674 LLVM_DEBUG(dbgs() << " Skipping, not legal for target");
675 return false;
676 }
677
678 MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI);
679 if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
680 LLVM_DEBUG(dbgs() << " Skipping, frame index would need copy anyway.");
681 return false;
682 }
683
684 if (MI.getOpcode() == TargetOpcode::G_STORE) {
685 // Would require a copy.
686 if (Base == MI.getOperand(0).getReg()) {
687 LLVM_DEBUG(dbgs() << " Skipping, storing base so need copy anyway.");
688 return false;
689 }
690
691 // We're expecting one use of Addr in MI, but it could also be the
692 // value stored, which isn't actually dominated by the instruction.
693 if (MI.getOperand(0).getReg() == Addr) {
694 LLVM_DEBUG(dbgs() << " Skipping, does not dominate all addr uses");
695 return false;
696 }
697 }
698
699 // FIXME: check whether all uses of the base pointer are constant PtrAdds.
700 // That might allow us to end base's liveness here by adjusting the constant.
701
702 for (auto &UseMI : MRI.use_instructions(Addr)) {
703 if (!dominates(MI, UseMI)) {
704 LLVM_DEBUG(dbgs() << " Skipping, does not dominate all addr uses.");
705 return false;
706 }
707 }
708
709 return true;
710 }
711
bool CombinerHelper::tryCombineIndexedLoadStore(MachineInstr &MI) {
713 IndexedLoadStoreMatchInfo MatchInfo;
714 if (matchCombineIndexedLoadStore(MI, MatchInfo)) {
715 applyCombineIndexedLoadStore(MI, MatchInfo);
716 return true;
717 }
718 return false;
719 }
720
bool CombinerHelper::matchCombineIndexedLoadStore(
    MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
722 unsigned Opcode = MI.getOpcode();
723 if (Opcode != TargetOpcode::G_LOAD && Opcode != TargetOpcode::G_SEXTLOAD &&
724 Opcode != TargetOpcode::G_ZEXTLOAD && Opcode != TargetOpcode::G_STORE)
725 return false;
726
727 MatchInfo.IsPre = findPreIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
728 MatchInfo.Offset);
729 if (!MatchInfo.IsPre &&
730 !findPostIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
731 MatchInfo.Offset))
732 return false;
733
734 return true;
735 }
736
void CombinerHelper::applyCombineIndexedLoadStore(
    MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
739 MachineInstr &AddrDef = *MRI.getUniqueVRegDef(MatchInfo.Addr);
740 MachineIRBuilder MIRBuilder(MI);
741 unsigned Opcode = MI.getOpcode();
742 bool IsStore = Opcode == TargetOpcode::G_STORE;
743 unsigned NewOpcode;
744 switch (Opcode) {
745 case TargetOpcode::G_LOAD:
746 NewOpcode = TargetOpcode::G_INDEXED_LOAD;
747 break;
748 case TargetOpcode::G_SEXTLOAD:
749 NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD;
750 break;
751 case TargetOpcode::G_ZEXTLOAD:
752 NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD;
753 break;
754 case TargetOpcode::G_STORE:
755 NewOpcode = TargetOpcode::G_INDEXED_STORE;
756 break;
757 default:
758 llvm_unreachable("Unknown load/store opcode");
759 }
760
761 auto MIB = MIRBuilder.buildInstr(NewOpcode);
762 if (IsStore) {
763 MIB.addDef(MatchInfo.Addr);
764 MIB.addUse(MI.getOperand(0).getReg());
765 } else {
766 MIB.addDef(MI.getOperand(0).getReg());
767 MIB.addDef(MatchInfo.Addr);
768 }
769
770 MIB.addUse(MatchInfo.Base);
771 MIB.addUse(MatchInfo.Offset);
772 MIB.addImm(MatchInfo.IsPre);
773 MI.eraseFromParent();
774 AddrDef.eraseFromParent();
775
  LLVM_DEBUG(dbgs() << "    Combined to indexed operation");
777 }
778
bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
780 if (MI.getOpcode() != TargetOpcode::G_BR)
781 return false;
782
783 // Try to match the following:
784 // bb1:
785 // %c(s32) = G_ICMP pred, %a, %b
786 // %c1(s1) = G_TRUNC %c(s32)
787 // G_BRCOND %c1, %bb2
788 // G_BR %bb3
789 // bb2:
790 // ...
791 // bb3:
792
  // The above pattern does not have a fall through to the successor bb2, always
  // resulting in a branch no matter which path is taken. Here we try to find
  // and replace that pattern with a conditional branch to bb3 and a
  // fallthrough to bb2.
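  // With the condition inverted, the rewritten form is roughly:
  // bb1:
  //   %c(s32) = G_ICMP <inverted pred>, %a, %b
  //   %c1(s1) = G_TRUNC %c(s32)
  //   G_BRCOND %c1, %bb3
  //   (fallthrough to bb2)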
797
798 MachineBasicBlock *MBB = MI.getParent();
799 MachineBasicBlock::iterator BrIt(MI);
800 if (BrIt == MBB->begin())
801 return false;
802 assert(std::next(BrIt) == MBB->end() && "expected G_BR to be a terminator");
803
804 MachineInstr *BrCond = &*std::prev(BrIt);
805 if (BrCond->getOpcode() != TargetOpcode::G_BRCOND)
806 return false;
807
808 // Check that the next block is the conditional branch target.
809 if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
810 return false;
811
812 MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
813 if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
814 !MRI.hasOneUse(CmpMI->getOperand(0).getReg()))
815 return false;
816 return true;
817 }
818
bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) {
820 if (!matchElideBrByInvertingCond(MI))
821 return false;
822 applyElideBrByInvertingCond(MI);
823 return true;
824 }
825
void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) {
827 MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
828 MachineBasicBlock::iterator BrIt(MI);
829 MachineInstr *BrCond = &*std::prev(BrIt);
830 MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
831
832 CmpInst::Predicate InversePred = CmpInst::getInversePredicate(
833 (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate());
834
835 // Invert the G_ICMP condition.
836 Observer.changingInstr(*CmpMI);
837 CmpMI->getOperand(1).setPredicate(InversePred);
838 Observer.changedInstr(*CmpMI);
839
840 // Change the conditional branch target.
841 Observer.changingInstr(*BrCond);
842 BrCond->getOperand(1).setMBB(BrTarget);
843 Observer.changedInstr(*BrCond);
844 MI.eraseFromParent();
845 }
846
static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
848 // On Darwin, -Os means optimize for size without hurting performance, so
849 // only really optimize for size when -Oz (MinSize) is used.
850 if (MF.getTarget().getTargetTriple().isOSDarwin())
851 return MF.getFunction().hasMinSize();
852 return MF.getFunction().hasOptSize();
853 }
854
855 // Returns a list of types to use for memory op lowering in MemOps. A partial
856 // port of findOptimalMemOpLowering in TargetLowering.
static bool findGISelOptimalMemOpLowering(
    std::vector<LLT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign,
    unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
    bool AllowOverlap, unsigned DstAS, unsigned SrcAS,
    const AttributeList &FuncAttributes, const TargetLowering &TLI) {
862 // If 'SrcAlign' is zero, that means the memory operation does not need to
863 // load the value, i.e. memset or memcpy from constant string. Otherwise,
864 // it's the inferred alignment of the source. 'DstAlign', on the other hand,
865 // is the specified alignment of the memory operation. If it is zero, that
866 // means it's possible to change the alignment of the destination.
867 // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
868 // not need to be loaded.
869 if (SrcAlign != 0 && SrcAlign < DstAlign)
870 return false;
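  // As an illustrative sketch, a 13-byte operation on a target whose preferred
  // type is s64 typically yields MemOps = {s64, s32, s8} (8 + 4 + 1 bytes), or
  // {s64, s64} with the second access overlapping the first when unaligned,
  // overlapping accesses are allowed and fast.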
871
872 LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset,
873 ZeroMemset, MemcpyStrSrc, FuncAttributes);
874
875 if (Ty == LLT()) {
876 // Use the largest scalar type whose alignment constraints are satisfied.
877 // We only need to check DstAlign here as SrcAlign is always greater or
878 // equal to DstAlign (or zero).
879 Ty = LLT::scalar(64);
880 while (DstAlign && DstAlign < Ty.getSizeInBytes() &&
881 !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign))
882 Ty = LLT::scalar(Ty.getSizeInBytes());
883 assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
884 // FIXME: check for the largest legal type we can load/store to.
885 }
886
887 unsigned NumMemOps = 0;
888 while (Size != 0) {
889 unsigned TySize = Ty.getSizeInBytes();
890 while (TySize > Size) {
891 // For now, only use non-vector load / store's for the left-over pieces.
892 LLT NewTy = Ty;
893 // FIXME: check for mem op safety and legality of the types. Not all of
894 // SDAGisms map cleanly to GISel concepts.
895 if (NewTy.isVector())
896 NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
897 NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
898 unsigned NewTySize = NewTy.getSizeInBytes();
899 assert(NewTySize > 0 && "Could not find appropriate type");
900
901 // If the new LLT cannot cover all of the remaining bits, then consider
902 // issuing a (or a pair of) unaligned and overlapping load / store.
903 bool Fast;
904 // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
905 MVT VT = getMVTForLLT(Ty);
906 if (NumMemOps && AllowOverlap && NewTySize < Size &&
907 TLI.allowsMisalignedMemoryAccesses(
908 VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) &&
909 Fast)
910 TySize = Size;
911 else {
912 Ty = NewTy;
913 TySize = NewTySize;
914 }
915 }
916
917 if (++NumMemOps > Limit)
918 return false;
919
920 MemOps.push_back(Ty);
921 Size -= TySize;
922 }
923
924 return true;
925 }
926
static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
928 if (Ty.isVector())
929 return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
930 Ty.getNumElements());
931 return IntegerType::get(C, Ty.getSizeInBits());
932 }
933
934 // Get a vectorized representation of the memset value operand, GISel edition.
static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
936 MachineRegisterInfo &MRI = *MIB.getMRI();
937 unsigned NumBits = Ty.getScalarSizeInBits();
938 auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
939 if (!Ty.isVector() && ValVRegAndVal) {
940 unsigned KnownVal = ValVRegAndVal->Value;
941 APInt Scalar = APInt(8, KnownVal);
942 APInt SplatVal = APInt::getSplat(NumBits, Scalar);
943 return MIB.buildConstant(Ty, SplatVal).getReg(0);
944 }
945 // FIXME: for vector types create a G_BUILD_VECTOR.
946 if (Ty.isVector())
947 return Register();
948
949 // Extend the byte value to the larger type, and then multiply by a magic
950 // value 0x010101... in order to replicate it across every byte.
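  // For example, a byte value of 0xAB widened to s32 becomes
  // 0xAB * 0x01010101 = 0xABABABAB.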
951 LLT ExtType = Ty.getScalarType();
952 auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
953 if (NumBits > 8) {
954 APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
955 auto MagicMI = MIB.buildConstant(ExtType, Magic);
956 Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
957 }
958
959 assert(ExtType == Ty && "Vector memset value type not supported yet");
960 return Val;
961 }
962
bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst, Register Val,
                                    unsigned KnownLen, unsigned Align,
                                    bool IsVolatile) {
966 auto &MF = *MI.getParent()->getParent();
967 const auto &TLI = *MF.getSubtarget().getTargetLowering();
968 auto &DL = MF.getDataLayout();
969 LLVMContext &C = MF.getFunction().getContext();
970
971 assert(KnownLen != 0 && "Have a zero length memset length!");
972
973 bool DstAlignCanChange = false;
974 MachineFrameInfo &MFI = MF.getFrameInfo();
975 bool OptSize = shouldLowerMemFuncForSize(MF);
976
977 MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
978 if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
979 DstAlignCanChange = true;
980
981 unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
982 std::vector<LLT> MemOps;
983
984 const auto &DstMMO = **MI.memoperands_begin();
985 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
986
987 auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
988 bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
989
990 if (!findGISelOptimalMemOpLowering(
991 MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), 0,
992 /*IsMemset=*/true,
993 /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
994 /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u,
995 MF.getFunction().getAttributes(), TLI))
996 return false;
997
998 if (DstAlignCanChange) {
999 // Get an estimate of the type from the LLT.
1000 Type *IRTy = getTypeForLLT(MemOps[0], C);
1001 unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
1002 if (NewAlign > Align) {
1003 Align = NewAlign;
1004 unsigned FI = FIDef->getOperand(1).getIndex();
1005 // Give the stack frame object a larger alignment if needed.
1006 if (MFI.getObjectAlignment(FI) < Align)
1007 MFI.setObjectAlignment(FI, Align);
1008 }
1009 }
1010
1011 MachineIRBuilder MIB(MI);
1012 // Find the largest store and generate the bit pattern for it.
1013 LLT LargestTy = MemOps[0];
1014 for (unsigned i = 1; i < MemOps.size(); i++)
1015 if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
1016 LargestTy = MemOps[i];
1017
1018 // The memset stored value is always defined as an s8, so in order to make it
1019 // work with larger store types we need to repeat the bit pattern across the
1020 // wider type.
1021 Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
1022
1023 if (!MemSetValue)
1024 return false;
1025
1026 // Generate the stores. For each store type in the list, we generate the
1027 // matching store of that type to the destination address.
1028 LLT PtrTy = MRI.getType(Dst);
1029 unsigned DstOff = 0;
1030 unsigned Size = KnownLen;
1031 for (unsigned I = 0; I < MemOps.size(); I++) {
1032 LLT Ty = MemOps[I];
1033 unsigned TySize = Ty.getSizeInBytes();
1034 if (TySize > Size) {
1035 // Issuing an unaligned load / store pair that overlaps with the previous
1036 // pair. Adjust the offset accordingly.
1037 assert(I == MemOps.size() - 1 && I != 0);
1038 DstOff -= TySize - Size;
1039 }
1040
1041 // If this store is smaller than the largest store see whether we can get
1042 // the smaller value for free with a truncate.
1043 Register Value = MemSetValue;
1044 if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
1045 MVT VT = getMVTForLLT(Ty);
1046 MVT LargestVT = getMVTForLLT(LargestTy);
1047 if (!LargestTy.isVector() && !Ty.isVector() &&
1048 TLI.isTruncateFree(LargestVT, VT))
1049 Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
1050 else
1051 Value = getMemsetValue(Val, Ty, MIB);
1052 if (!Value)
1053 return false;
1054 }
1055
1056 auto *StoreMMO =
1057 MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes());
1058
1059 Register Ptr = Dst;
1060 if (DstOff != 0) {
1061 auto Offset =
1062 MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
1063 Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1064 }
1065
1066 MIB.buildStore(Value, Ptr, *StoreMMO);
1067 DstOff += Ty.getSizeInBytes();
1068 Size -= TySize;
1069 }
1070
1071 MI.eraseFromParent();
1072 return true;
1073 }
1074
1075
bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
                                    Register Src, unsigned KnownLen,
                                    unsigned DstAlign, unsigned SrcAlign,
                                    bool IsVolatile) {
1080 auto &MF = *MI.getParent()->getParent();
1081 const auto &TLI = *MF.getSubtarget().getTargetLowering();
1082 auto &DL = MF.getDataLayout();
1083 LLVMContext &C = MF.getFunction().getContext();
1084
1085 assert(KnownLen != 0 && "Have a zero length memcpy length!");
1086
1087 bool DstAlignCanChange = false;
1088 MachineFrameInfo &MFI = MF.getFrameInfo();
1089 bool OptSize = shouldLowerMemFuncForSize(MF);
1090 unsigned Alignment = MinAlign(DstAlign, SrcAlign);
1091
1092 MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1093 if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1094 DstAlignCanChange = true;
1095
1096 // FIXME: infer better src pointer alignment like SelectionDAG does here.
1097 // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
1098 // if the memcpy is in a tail call position.
1099
1100 unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize);
1101 std::vector<LLT> MemOps;
1102
1103 const auto &DstMMO = **MI.memoperands_begin();
1104 const auto &SrcMMO = **std::next(MI.memoperands_begin());
1105 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1106 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1107
1108 if (!findGISelOptimalMemOpLowering(
1109 MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
1110 SrcAlign,
1111 /*IsMemset=*/false,
1112 /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
1113 /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(),
1114 SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
1115 return false;
1116
1117 if (DstAlignCanChange) {
1118 // Get an estimate of the type from the LLT.
1119 Type *IRTy = getTypeForLLT(MemOps[0], C);
1120 unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
1121
1122 // Don't promote to an alignment that would require dynamic stack
1123 // realignment.
1124 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1125 if (!TRI->needsStackRealignment(MF))
1126 while (NewAlign > Alignment &&
1127 DL.exceedsNaturalStackAlignment(Align(NewAlign)))
1128 NewAlign /= 2;
1129
1130 if (NewAlign > Alignment) {
1131 Alignment = NewAlign;
1132 unsigned FI = FIDef->getOperand(1).getIndex();
1133 // Give the stack frame object a larger alignment if needed.
1134 if (MFI.getObjectAlignment(FI) < Alignment)
1135 MFI.setObjectAlignment(FI, Alignment);
1136 }
1137 }
1138
1139 LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
1140
1141 MachineIRBuilder MIB(MI);
1142 // Now we need to emit a pair of load and stores for each of the types we've
1143 // collected. I.e. for each type, generate a load from the source pointer of
1144 // that type width, and then generate a corresponding store to the dest buffer
  // of that value loaded. This can result in a sequence of loads and stores of
  // mixed types, depending on what the target specifies as good types to use.
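  // As an illustrative sketch, a 15-byte copy on a target that prefers s64
  // chunks may become an s64 load/store at offset 0 followed by an overlapping
  // s64 load/store at offset 7, rather than a chain of progressively smaller
  // accesses.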
1147 unsigned CurrOffset = 0;
1148 LLT PtrTy = MRI.getType(Src);
1149 unsigned Size = KnownLen;
1150 for (auto CopyTy : MemOps) {
1151 // Issuing an unaligned load / store pair that overlaps with the previous
1152 // pair. Adjust the offset accordingly.
1153 if (CopyTy.getSizeInBytes() > Size)
1154 CurrOffset -= CopyTy.getSizeInBytes() - Size;
1155
1156 // Construct MMOs for the accesses.
1157 auto *LoadMMO =
1158 MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1159 auto *StoreMMO =
1160 MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1161
1162 // Create the load.
1163 Register LoadPtr = Src;
1164 Register Offset;
1165 if (CurrOffset != 0) {
1166 Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
1167 .getReg(0);
1168 LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1169 }
1170 auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
1171
1172 // Create the store.
1173 Register StorePtr =
1174 CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1175 MIB.buildStore(LdVal, StorePtr, *StoreMMO);
1176 CurrOffset += CopyTy.getSizeInBytes();
1177 Size -= CopyTy.getSizeInBytes();
1178 }
1179
1180 MI.eraseFromParent();
1181 return true;
1182 }
1183
bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
                                     Register Src, unsigned KnownLen,
                                     unsigned DstAlign, unsigned SrcAlign,
                                     bool IsVolatile) {
1188 auto &MF = *MI.getParent()->getParent();
1189 const auto &TLI = *MF.getSubtarget().getTargetLowering();
1190 auto &DL = MF.getDataLayout();
1191 LLVMContext &C = MF.getFunction().getContext();
1192
1193 assert(KnownLen != 0 && "Have a zero length memmove length!");
1194
1195 bool DstAlignCanChange = false;
1196 MachineFrameInfo &MFI = MF.getFrameInfo();
1197 bool OptSize = shouldLowerMemFuncForSize(MF);
1198 unsigned Alignment = MinAlign(DstAlign, SrcAlign);
1199
1200 MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1201 if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1202 DstAlignCanChange = true;
1203
1204 unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
1205 std::vector<LLT> MemOps;
1206
1207 const auto &DstMMO = **MI.memoperands_begin();
1208 const auto &SrcMMO = **std::next(MI.memoperands_begin());
1209 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1210 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1211
1212 // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
1214 // same thing here.
1215 if (!findGISelOptimalMemOpLowering(
1216 MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
1217 SrcAlign,
1218 /*IsMemset=*/false,
1219 /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
1220 /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(),
1221 SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
1222 return false;
1223
1224 if (DstAlignCanChange) {
1225 // Get an estimate of the type from the LLT.
1226 Type *IRTy = getTypeForLLT(MemOps[0], C);
1227 unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
1228
1229 // Don't promote to an alignment that would require dynamic stack
1230 // realignment.
1231 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1232 if (!TRI->needsStackRealignment(MF))
1233 while (NewAlign > Alignment &&
1234 DL.exceedsNaturalStackAlignment(Align(NewAlign)))
1235 NewAlign /= 2;
1236
1237 if (NewAlign > Alignment) {
1238 Alignment = NewAlign;
1239 unsigned FI = FIDef->getOperand(1).getIndex();
1240 // Give the stack frame object a larger alignment if needed.
1241 if (MFI.getObjectAlignment(FI) < Alignment)
1242 MFI.setObjectAlignment(FI, Alignment);
1243 }
1244 }
1245
1246 LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
1247
1248 MachineIRBuilder MIB(MI);
1249 // Memmove requires that we perform the loads first before issuing the stores.
1250 // Apart from that, this loop is pretty much doing the same thing as the
1251 // memcpy codegen function.
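  // Performing all the loads up front is what makes this safe for overlapping
  // ranges: e.g. when copying [p, p+16) to [p+4, p+20), no store may clobber
  // bytes that a later load still needs to read.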
1252 unsigned CurrOffset = 0;
1253 LLT PtrTy = MRI.getType(Src);
1254 SmallVector<Register, 16> LoadVals;
1255 for (auto CopyTy : MemOps) {
1256 // Construct MMO for the load.
1257 auto *LoadMMO =
1258 MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1259
1260 // Create the load.
1261 Register LoadPtr = Src;
1262 if (CurrOffset != 0) {
1263 auto Offset =
1264 MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1265 LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1266 }
1267 LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
1268 CurrOffset += CopyTy.getSizeInBytes();
1269 }
1270
1271 CurrOffset = 0;
1272 for (unsigned I = 0; I < MemOps.size(); ++I) {
1273 LLT CopyTy = MemOps[I];
1274 // Now store the values loaded.
1275 auto *StoreMMO =
1276 MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1277
1278 Register StorePtr = Dst;
1279 if (CurrOffset != 0) {
1280 auto Offset =
1281 MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1282 StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1283 }
1284 MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
1285 CurrOffset += CopyTy.getSizeInBytes();
1286 }
1287 MI.eraseFromParent();
1288 return true;
1289 }
1290
bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
1292 // This combine is fairly complex so it's not written with a separate
1293 // matcher function.
1294 assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1295 Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID();
1296 assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove ||
1297 ID == Intrinsic::memset) &&
1298 "Expected a memcpy like intrinsic");
1299
1300 auto MMOIt = MI.memoperands_begin();
1301 const MachineMemOperand *MemOp = *MMOIt;
1302 bool IsVolatile = MemOp->isVolatile();
1303 // Don't try to optimize volatile.
1304 if (IsVolatile)
1305 return false;
1306
1307 unsigned DstAlign = MemOp->getBaseAlignment();
1308 unsigned SrcAlign = 0;
1309 Register Dst = MI.getOperand(1).getReg();
1310 Register Src = MI.getOperand(2).getReg();
1311 Register Len = MI.getOperand(3).getReg();
1312
1313 if (ID != Intrinsic::memset) {
1314 assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
1315 MemOp = *(++MMOIt);
1316 SrcAlign = MemOp->getBaseAlignment();
1317 }
1318
1319 // See if this is a constant length copy
1320 auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
1321 if (!LenVRegAndVal)
1322 return false; // Leave it to the legalizer to lower it to a libcall.
1323 unsigned KnownLen = LenVRegAndVal->Value;
1324
1325 if (KnownLen == 0) {
1326 MI.eraseFromParent();
1327 return true;
1328 }
1329
1330 if (MaxLen && KnownLen > MaxLen)
1331 return false;
1332
1333 if (ID == Intrinsic::memcpy)
1334 return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
1335 if (ID == Intrinsic::memmove)
1336 return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
1337 if (ID == Intrinsic::memset)
1338 return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
1339 return false;
1340 }
1341
bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI,
                                           PtrAddChain &MatchInfo) {
1344 // We're trying to match the following pattern:
1345 // %t1 = G_PTR_ADD %base, G_CONSTANT imm1
1346 // %root = G_PTR_ADD %t1, G_CONSTANT imm2
1347 // -->
1348 // %root = G_PTR_ADD %base, G_CONSTANT (imm1 + imm2)
1349
1350 if (MI.getOpcode() != TargetOpcode::G_PTR_ADD)
1351 return false;
1352
1353 Register Add2 = MI.getOperand(1).getReg();
1354 Register Imm1 = MI.getOperand(2).getReg();
1355 auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm1, MRI);
1356 if (!MaybeImmVal)
1357 return false;
1358
1359 MachineInstr *Add2Def = MRI.getUniqueVRegDef(Add2);
1360 if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD)
1361 return false;
1362
1363 Register Base = Add2Def->getOperand(1).getReg();
1364 Register Imm2 = Add2Def->getOperand(2).getReg();
1365 auto MaybeImm2Val = getConstantVRegValWithLookThrough(Imm2, MRI);
1366 if (!MaybeImm2Val)
1367 return false;
1368
1369 // Pass the combined immediate to the apply function.
1370 MatchInfo.Imm = MaybeImmVal->Value + MaybeImm2Val->Value;
1371 MatchInfo.Base = Base;
1372 return true;
1373 }
1374
bool CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI,
                                           PtrAddChain &MatchInfo) {
1377 assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
1378 MachineIRBuilder MIB(MI);
1379 LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg());
1380 auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm);
1381 Observer.changingInstr(MI);
1382 MI.getOperand(1).setReg(MatchInfo.Base);
1383 MI.getOperand(2).setReg(NewOffset.getReg(0));
1384 Observer.changedInstr(MI);
1385 return true;
1386 }
1387
bool CombinerHelper::tryCombine(MachineInstr &MI) {
1389 if (tryCombineCopy(MI))
1390 return true;
1391 if (tryCombineExtendingLoads(MI))
1392 return true;
1393 if (tryCombineIndexedLoadStore(MI))
1394 return true;
1395 return false;
1396 }
1397