1 //===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the visit functions for load, store and alloca.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "InstCombineInternal.h"
14 #include "llvm/ADT/MapVector.h"
15 #include "llvm/ADT/SmallString.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/Analysis/AliasAnalysis.h"
18 #include "llvm/Analysis/Loads.h"
19 #include "llvm/IR/ConstantRange.h"
20 #include "llvm/IR/DataLayout.h"
21 #include "llvm/IR/DebugInfoMetadata.h"
22 #include "llvm/IR/IntrinsicInst.h"
23 #include "llvm/IR/LLVMContext.h"
24 #include "llvm/IR/MDBuilder.h"
25 #include "llvm/IR/PatternMatch.h"
26 #include "llvm/Transforms/InstCombine/InstCombiner.h"
27 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
28 #include "llvm/Transforms/Utils/Local.h"
29 using namespace llvm;
30 using namespace PatternMatch;
31
32 #define DEBUG_TYPE "instcombine"
33
34 STATISTIC(NumDeadStore, "Number of dead stores eliminated");
35 STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
36
37 /// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived)
38 /// pointer to an alloca. Ignore any reads of the pointer; return false if we
39 /// see any stores or other unknown uses. If we see pointer arithmetic, keep
40 /// track of whether it moves the pointer (with IsOffset) but otherwise traverse
41 /// the uses. If we see a memcpy/memmove that targets an unoffset pointer to
42 /// the alloca, and if the source pointer points to constant memory, we can
43 /// optimize this.
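///
/// For illustration only (hypothetical IR, not from the test suite):
///   %buf = alloca [32 x i8]
///   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %buf, i8* @cst, i64 32, i1 false)
///   ...only simple loads of %buf...
/// Here TheCopy is set to the memcpy, and the caller may forward readers of
/// %buf to @cst directly.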
44 static bool
45 isOnlyCopiedFromConstantMemory(AAResults *AA,
46 Value *V, MemTransferInst *&TheCopy,
47 SmallVectorImpl<Instruction *> &ToDelete) {
48 // We track lifetime intrinsics as we encounter them. If we decide to go
49 // ahead and replace the value with the global, this lets the caller quickly
50 // eliminate the markers.
51
52 SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect;
53 ValuesToInspect.emplace_back(V, false);
54 while (!ValuesToInspect.empty()) {
55 auto ValuePair = ValuesToInspect.pop_back_val();
56 const bool IsOffset = ValuePair.second;
57 for (auto &U : ValuePair.first->uses()) {
58 auto *I = cast<Instruction>(U.getUser());
59
60 if (auto *LI = dyn_cast<LoadInst>(I)) {
61 // Ignore simple (non-volatile, non-atomic) loads; anything else is rejected.
62 if (!LI->isSimple()) return false;
63 continue;
64 }
65
66 if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
67 // If uses of the bitcast are ok, we are ok.
68 ValuesToInspect.emplace_back(I, IsOffset);
69 continue;
70 }
71 if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
72 // If the GEP has all zero indices, it doesn't offset the pointer. If it
73 // doesn't, it does.
74 ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
75 continue;
76 }
77
78 if (auto *Call = dyn_cast<CallBase>(I)) {
79 // If this is the function being called then we treat it like a load and
80 // ignore it.
81 if (Call->isCallee(&U))
82 continue;
83
84 unsigned DataOpNo = Call->getDataOperandNo(&U);
85 bool IsArgOperand = Call->isArgOperand(&U);
86
87 // Inalloca arguments are clobbered by the call.
88 if (IsArgOperand && Call->isInAllocaArgument(DataOpNo))
89 return false;
90
91 // If this is a readonly/readnone call site, then we know it is just a
92 // load (but one that potentially returns the value itself), so we can
93 // ignore it if we know that the value isn't captured.
94 if (Call->onlyReadsMemory() &&
95 (Call->use_empty() || Call->doesNotCapture(DataOpNo)))
96 continue;
97
98 // If this is being passed as a byval argument, the caller is making a
99 // copy, so it is only a read of the alloca.
100 if (IsArgOperand && Call->isByValArgument(DataOpNo))
101 continue;
102 }
103
104 // Lifetime intrinsics can be handled by the caller.
105 if (I->isLifetimeStartOrEnd()) {
106 assert(I->use_empty() && "Lifetime markers have no result to use!");
107 ToDelete.push_back(I);
108 continue;
109 }
110
111 // If this isn't our memcpy/memmove, reject it as something we can't
112 // handle.
113 MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
114 if (!MI)
115 return false;
116
117 // If the transfer is using the alloca as its source, then ignore it since
118 // it is a load (unless the transfer is volatile).
119 if (U.getOperandNo() == 1) {
120 if (MI->isVolatile()) return false;
121 continue;
122 }
123
124 // If we have already seen a copy, reject the second one.
125 if (TheCopy) return false;
126
127 // If the pointer has been offset from the start of the alloca, we can't
128 // safely handle this.
129 if (IsOffset) return false;
130
131 // If the memintrinsic isn't using the alloca as the dest, reject it.
132 if (U.getOperandNo() != 0) return false;
133
134 // If the source of the memcpy/move is not a constant global, reject it.
135 if (!AA->pointsToConstantMemory(MI->getSource()))
136 return false;
137
138 // Otherwise, the transform is safe. Remember the copy instruction.
139 TheCopy = MI;
140 }
141 }
142 return true;
143 }
144
145 /// isOnlyCopiedFromConstantMemory - Return the memcpy/memmove if the specified
146 /// alloca is only modified by a copy from constant memory (null otherwise). In
147 /// that case any use of the alloca can be replaced with a use of the source.
148 static MemTransferInst *
149 isOnlyCopiedFromConstantMemory(AAResults *AA,
150 AllocaInst *AI,
151 SmallVectorImpl<Instruction *> &ToDelete) {
152 MemTransferInst *TheCopy = nullptr;
153 if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
154 return TheCopy;
155 return nullptr;
156 }
157
158 /// Returns true if V is dereferenceable and sufficiently aligned for the size of the alloca.
159 static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
160 const DataLayout &DL) {
161 if (AI->isArrayAllocation())
162 return false;
163 uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
164 if (!AllocaSize)
165 return false;
166 return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()),
167 APInt(64, AllocaSize), DL);
168 }
169
170 static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC,
171 AllocaInst &AI) {
172 // Check for array size of 1 (scalar allocation).
173 if (!AI.isArrayAllocation()) {
174 // i32 1 is the canonical array size for scalar allocations.
175 if (AI.getArraySize()->getType()->isIntegerTy(32))
176 return nullptr;
177
178 // Canonicalize it.
179 return IC.replaceOperand(AI, 0, IC.Builder.getInt32(1));
180 }
181
182 // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
183 if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
184 if (C->getValue().getActiveBits() <= 64) {
185 Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
186 AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
187 New->setAlignment(AI.getAlign());
188
189 // Scan to the end of the allocation instructions, to skip over a block of
190 // allocas if possible; also skip interleaved debug info.
191 //
192 BasicBlock::iterator It(New);
193 while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
194 ++It;
195
196 // Now that It is pointing to the first non-allocation-inst in the block,
197 // insert our getelementptr instruction...
198 //
199 Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
200 Value *NullIdx = Constant::getNullValue(IdxTy);
201 Value *Idx[2] = {NullIdx, NullIdx};
202 Instruction *GEP = GetElementPtrInst::CreateInBounds(
203 NewTy, New, Idx, New->getName() + ".sub");
204 IC.InsertNewInstBefore(GEP, *It);
205
206 // Now make everything use the getelementptr instead of the original
207 // allocation.
208 return IC.replaceInstUsesWith(AI, GEP);
209 }
210 }
211
212 if (isa<UndefValue>(AI.getArraySize()))
213 return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
214
215 // Ensure that the alloca array size argument has type intptr_t, so that
216 // any casting is exposed early.
217 Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
218 if (AI.getArraySize()->getType() != IntPtrTy) {
219 Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
220 return IC.replaceOperand(AI, 0, V);
221 }
222
223 return nullptr;
224 }
225
226 namespace {
227 // If I and V are pointers in different address spaces, it is not allowed to
228 // use replaceAllUsesWith since I and V have different types. A
229 // non-target-specific transformation should not use addrspacecast on V since
230 // the two address spaces may be disjoint depending on the target.
231 //
232 // This class chases down uses of the old pointer until reaching the load
233 // instructions, then replaces the old pointer in the load instructions with
234 // the new pointer. If during the chasing it sees a bitcast or GEP, it will
235 // create a new bitcast or GEP with the new pointer and use it in the load
236 // instruction.
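//
// Sketch of the intended use (hypothetical): an alloca whose only writer is a
// memcpy from a constant global in a different address space. Instead of
// RAUW'ing the alloca (illegal across address spaces), the replacer rebuilds
// each bitcast/GEP on top of the new pointer and rewrites the loads and memory
// transfers to use the rebuilt chain.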
237 class PointerReplacer {
238 public:
239 PointerReplacer(InstCombinerImpl &IC) : IC(IC) {}
240
241 bool collectUsers(Instruction &I);
242 void replacePointer(Instruction &I, Value *V);
243
244 private:
245 void replace(Instruction *I);
246 Value *getReplacement(Value *I);
247
248 SmallSetVector<Instruction *, 4> Worklist;
249 MapVector<Value *, Value *> WorkMap;
250 InstCombinerImpl &IC;
251 };
252 } // end anonymous namespace
253
254 bool PointerReplacer::collectUsers(Instruction &I) {
255 for (auto U : I.users()) {
256 Instruction *Inst = cast<Instruction>(&*U);
257 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
258 if (Load->isVolatile())
259 return false;
260 Worklist.insert(Load);
261 } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
262 Worklist.insert(Inst);
263 if (!collectUsers(*Inst))
264 return false;
265 } else if (isa<MemTransferInst>(Inst)) {
266 Worklist.insert(Inst);
267 } else {
268 LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
269 return false;
270 }
271 }
272
273 return true;
274 }
275
276 Value *PointerReplacer::getReplacement(Value *V) {
277 auto Loc = WorkMap.find(V);
278 if (Loc != WorkMap.end())
279 return Loc->second;
280 return nullptr;
281 }
282
283 void PointerReplacer::replace(Instruction *I) {
284 if (getReplacement(I))
285 return;
286
287 if (auto *LT = dyn_cast<LoadInst>(I)) {
288 auto *V = getReplacement(LT->getPointerOperand());
289 assert(V && "Operand not replaced");
290 auto *NewI = new LoadInst(LT->getType(), V, "", LT->isVolatile(),
291 LT->getAlign(), LT->getOrdering(),
292 LT->getSyncScopeID());
293 NewI->takeName(LT);
294 copyMetadataForLoad(*NewI, *LT);
295
296 IC.InsertNewInstWith(NewI, *LT);
297 IC.replaceInstUsesWith(*LT, NewI);
298 WorkMap[LT] = NewI;
299 } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
300 auto *V = getReplacement(GEP->getPointerOperand());
301 assert(V && "Operand not replaced");
302 SmallVector<Value *, 8> Indices;
303 Indices.append(GEP->idx_begin(), GEP->idx_end());
304 auto *NewI = GetElementPtrInst::Create(
305 V->getType()->getPointerElementType(), V, Indices);
306 IC.InsertNewInstWith(NewI, *GEP);
307 NewI->takeName(GEP);
308 WorkMap[GEP] = NewI;
309 } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
310 auto *V = getReplacement(BC->getOperand(0));
311 assert(V && "Operand not replaced");
312 auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
313 V->getType()->getPointerAddressSpace());
314 auto *NewI = new BitCastInst(V, NewT);
315 IC.InsertNewInstWith(NewI, *BC);
316 NewI->takeName(BC);
317 WorkMap[BC] = NewI;
318 } else if (auto *MemCpy = dyn_cast<MemTransferInst>(I)) {
319 auto *SrcV = getReplacement(MemCpy->getRawSource());
320 // The pointer may appear in the destination of a copy, but we don't want to
321 // replace it.
322 if (!SrcV) {
323 assert(getReplacement(MemCpy->getRawDest()) &&
324 "destination not in replace list");
325 return;
326 }
327
328 IC.Builder.SetInsertPoint(MemCpy);
329 auto *NewI = IC.Builder.CreateMemTransferInst(
330 MemCpy->getIntrinsicID(), MemCpy->getRawDest(), MemCpy->getDestAlign(),
331 SrcV, MemCpy->getSourceAlign(), MemCpy->getLength(),
332 MemCpy->isVolatile());
333 AAMDNodes AAMD;
334 MemCpy->getAAMetadata(AAMD);
335 if (AAMD)
336 NewI->setAAMetadata(AAMD);
337
338 IC.eraseInstFromFunction(*MemCpy);
339 WorkMap[MemCpy] = NewI;
340 } else {
341 llvm_unreachable("should never reach here");
342 }
343 }
344
345 void PointerReplacer::replacePointer(Instruction &I, Value *V) {
346 #ifndef NDEBUG
347 auto *PT = cast<PointerType>(I.getType());
348 auto *NT = cast<PointerType>(V->getType());
349 assert(PT != NT && PT->getElementType() == NT->getElementType() &&
350 "Invalid usage");
351 #endif
352 WorkMap[&I] = V;
353
354 for (Instruction *Workitem : Worklist)
355 replace(Workitem);
356 }
357
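// Visit an alloca: canonicalize its array-size operand, move zero-size allocas
// to the start of the entry block, and, when the alloca is only initialized by
// a copy from constant memory, forward its users to the copy source (see
// isOnlyCopiedFromConstantMemory and PointerReplacer above).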
358 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
359 if (auto *I = simplifyAllocaArraySize(*this, AI))
360 return I;
361
362 if (AI.getAllocatedType()->isSized()) {
363 // Move all allocas of zero-byte objects to the entry block and merge them
364 // together. Note that we only do this for allocas, because malloc should
365 // allocate and return a unique pointer, even for a zero-byte allocation.
366 if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinSize() == 0) {
367 // For a zero sized alloca there is no point in doing an array allocation.
368 // This is helpful if the array size is a complicated expression not used
369 // elsewhere.
370 if (AI.isArrayAllocation())
371 return replaceOperand(AI, 0,
372 ConstantInt::get(AI.getArraySize()->getType(), 1));
373
374 // Get the first instruction in the entry block.
375 BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
376 Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
377 if (FirstInst != &AI) {
378 // If the entry block doesn't start with a zero-size alloca then move
379 // this one to the start of the entry block. There is no problem with
380 // dominance as the array size was forced to a constant earlier already.
381 AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
382 if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
383 DL.getTypeAllocSize(EntryAI->getAllocatedType())
384 .getKnownMinSize() != 0) {
385 AI.moveBefore(FirstInst);
386 return &AI;
387 }
388
389 // Replace this zero-sized alloca with the one at the start of the entry
390 // block after ensuring that the address will be aligned enough for both
391 // types.
392 const Align MaxAlign = std::max(EntryAI->getAlign(), AI.getAlign());
393 EntryAI->setAlignment(MaxAlign);
394 if (AI.getType() != EntryAI->getType())
395 return new BitCastInst(EntryAI, AI.getType());
396 return replaceInstUsesWith(AI, EntryAI);
397 }
398 }
399 }
400
401 // Check to see if this allocation is only modified by a memcpy/memmove from
402 // a constant whose alignment is equal to or exceeds that of the allocation.
403 // If this is the case, we can change all users to use the constant global
404 // instead. This is commonly produced by the CFE by constructs like "void
405 // foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' is only subsequently
406 // read.
407 SmallVector<Instruction *, 4> ToDelete;
408 if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
409 Value *TheSrc = Copy->getSource();
410 Align AllocaAlign = AI.getAlign();
411 Align SourceAlign = getOrEnforceKnownAlignment(
412 TheSrc, AllocaAlign, DL, &AI, &AC, &DT);
413 if (AllocaAlign <= SourceAlign &&
414 isDereferenceableForAllocaSize(TheSrc, &AI, DL)) {
415 LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
416 LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
417 unsigned SrcAddrSpace = TheSrc->getType()->getPointerAddressSpace();
418 auto *DestTy = PointerType::get(AI.getAllocatedType(), SrcAddrSpace);
419 if (AI.getType()->getAddressSpace() == SrcAddrSpace) {
420 for (Instruction *Delete : ToDelete)
421 eraseInstFromFunction(*Delete);
422
423 Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
424 Instruction *NewI = replaceInstUsesWith(AI, Cast);
425 eraseInstFromFunction(*Copy);
426 ++NumGlobalCopies;
427 return NewI;
428 }
429
430 PointerReplacer PtrReplacer(*this);
431 if (PtrReplacer.collectUsers(AI)) {
432 for (Instruction *Delete : ToDelete)
433 eraseInstFromFunction(*Delete);
434
435 Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
436 PtrReplacer.replacePointer(AI, Cast);
437 ++NumGlobalCopies;
438 }
439 }
440 }
441
442 // At last, use the generic allocation site handler to aggressively remove
443 // unused allocas.
444 return visitAllocSite(AI);
445 }
446
447 // Are we allowed to form an atomic load or store of this type?
448 static bool isSupportedAtomicType(Type *Ty) {
449 return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
450 }
451
452 /// Helper to combine a load to a new type.
453 ///
454 /// This just does the work of combining a load to a new type. It handles
455 /// metadata, etc., and returns the new instruction. The \c NewTy should be the
456 /// loaded *value* type. This will convert it to a pointer, cast the operand to
457 /// that pointer type, load it, etc.
458 ///
459 /// Note that this will create all of the instructions with whatever insert
460 /// point the \c InstCombinerImpl currently is using.
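///
/// Illustrative (hypothetical) example: requesting NewTy == float for
///   %v = load i32, i32* %p
/// yields roughly
///   %p.cast = bitcast i32* %p to float*
///   %v.f    = load float, float* %p.cast
/// with the original load's alignment, ordering and metadata carried over.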
461 LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy,
462 const Twine &Suffix) {
463 assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
464 "can't fold an atomic load to requested type");
465
466 Value *Ptr = LI.getPointerOperand();
467 unsigned AS = LI.getPointerAddressSpace();
468 Value *NewPtr = nullptr;
469 if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
470 NewPtr->getType()->getPointerElementType() == NewTy &&
471 NewPtr->getType()->getPointerAddressSpace() == AS))
472 NewPtr = Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));
473
474 LoadInst *NewLoad = Builder.CreateAlignedLoad(
475 NewTy, NewPtr, LI.getAlign(), LI.isVolatile(), LI.getName() + Suffix);
476 NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
477 copyMetadataForLoad(*NewLoad, LI);
478 return NewLoad;
479 }
480
481 /// Combine a store to a new type.
482 ///
483 /// Returns the newly created store instruction.
484 static StoreInst *combineStoreToNewValue(InstCombinerImpl &IC, StoreInst &SI,
485 Value *V) {
486 assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
487 "can't fold an atomic store of requested type");
488
489 Value *Ptr = SI.getPointerOperand();
490 unsigned AS = SI.getPointerAddressSpace();
491 SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
492 SI.getAllMetadata(MD);
493
494 StoreInst *NewStore = IC.Builder.CreateAlignedStore(
495 V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
496 SI.getAlign(), SI.isVolatile());
497 NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
498 for (const auto &MDPair : MD) {
499 unsigned ID = MDPair.first;
500 MDNode *N = MDPair.second;
501 // Note, essentially every kind of metadata should be preserved here! This
502 // routine is supposed to clone a store instruction changing *only its
503 // type*. The only metadata it makes sense to drop is metadata which is
504 // invalidated when the pointer type changes. This should essentially
505 // never be the case in LLVM, but we explicitly switch over only known
506 // metadata to be conservatively correct. If you are adding metadata to
507 // LLVM which pertains to stores, you almost certainly want to add it
508 // here.
509 switch (ID) {
510 case LLVMContext::MD_dbg:
511 case LLVMContext::MD_tbaa:
512 case LLVMContext::MD_prof:
513 case LLVMContext::MD_fpmath:
514 case LLVMContext::MD_tbaa_struct:
515 case LLVMContext::MD_alias_scope:
516 case LLVMContext::MD_noalias:
517 case LLVMContext::MD_nontemporal:
518 case LLVMContext::MD_mem_parallel_loop_access:
519 case LLVMContext::MD_access_group:
520 // All of these directly apply.
521 NewStore->setMetadata(ID, N);
522 break;
523 case LLVMContext::MD_invariant_load:
524 case LLVMContext::MD_nonnull:
525 case LLVMContext::MD_noundef:
526 case LLVMContext::MD_range:
527 case LLVMContext::MD_align:
528 case LLVMContext::MD_dereferenceable:
529 case LLVMContext::MD_dereferenceable_or_null:
530 // These don't apply for stores.
531 break;
532 }
533 }
534
535 return NewStore;
536 }
537
538 /// Returns true if the instruction represents a minmax pattern like:
539 /// select ((cmp load V1, load V2), V1, V2).
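///
/// Concretely (an illustrative, hypothetical snippet):
///   %v1 = load i32, i32* %p1
///   %v2 = load i32, i32* %p2
///   %c  = icmp slt i32 %v1, %v2
///   %m  = select i1 %c, i32* %p1, i32* %p2
/// matches, with LoadTy set to i32.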
540 static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) {
541 assert(V->getType()->isPointerTy() && "Expected pointer type.");
542 // Ignore possible ty* to ixx* bitcast.
543 V = InstCombiner::peekThroughBitcast(V);
544 // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax
545 // pattern.
546 CmpInst::Predicate Pred;
547 Instruction *L1;
548 Instruction *L2;
549 Value *LHS;
550 Value *RHS;
551 if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)),
552 m_Value(LHS), m_Value(RHS))))
553 return false;
554 LoadTy = L1->getType();
555 return (match(L1, m_Load(m_Specific(LHS))) &&
556 match(L2, m_Load(m_Specific(RHS)))) ||
557 (match(L1, m_Load(m_Specific(RHS))) &&
558 match(L2, m_Load(m_Specific(LHS))));
559 }
560
561 /// Combine loads to match the type of their uses' value after looking
562 /// through intervening bitcasts.
563 ///
564 /// The core idea here is that if the result of a load is used in an operation,
565 /// we should load the type most conducive to that operation. For example, when
566 /// loading an integer and converting that immediately to a pointer, we should
567 /// instead directly load a pointer.
568 ///
569 /// However, this routine must never change the width of a load or the number of
570 /// loads as that would introduce a semantic change. This combine is expected to
571 /// be a semantic no-op which just allows loads to more closely model the types
572 /// of their consuming operations.
573 ///
574 /// Currently, we also refuse to change the precise type used for an atomic load
575 /// or a volatile load. This is debatable, and might be reasonable to change
576 /// later. However, it is risky in case some backend or other part of LLVM is
577 /// relying on the exact type loaded to select appropriate atomic operations.
578 static Instruction *combineLoadToOperationType(InstCombinerImpl &IC,
579 LoadInst &LI) {
580 // FIXME: We could probably with some care handle both volatile and ordered
581 // atomic loads here but it isn't clear that this is important.
582 if (!LI.isUnordered())
583 return nullptr;
584
585 if (LI.use_empty())
586 return nullptr;
587
588 // swifterror values can't be bitcasted.
589 if (LI.getPointerOperand()->isSwiftError())
590 return nullptr;
591
592 const DataLayout &DL = IC.getDataLayout();
593
594 // Fold away bit casts of the loaded value by loading the desired type.
595 // Note that we should not do this for pointer<->integer casts,
596 // because that would result in type punning.
597 if (LI.hasOneUse())
598 if (auto* CI = dyn_cast<CastInst>(LI.user_back()))
599 if (CI->isNoopCast(DL) && LI.getType()->isPtrOrPtrVectorTy() ==
600 CI->getDestTy()->isPtrOrPtrVectorTy())
601 if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
602 LoadInst *NewLoad = IC.combineLoadToNewType(LI, CI->getDestTy());
603 CI->replaceAllUsesWith(NewLoad);
604 IC.eraseInstFromFunction(*CI);
605 return &LI;
606 }
607
608 // FIXME: We should also canonicalize loads of vectors when their elements are
609 // cast to other types.
610 return nullptr;
611 }
612
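// Split a load of an aggregate into loads of its elements. For example
// (illustrative), a load of {i32, i64} becomes an i32 load and an i64 load
// whose results are recombined with insertvalue. Single-element structs and
// arrays are simply re-typed; padded structs and very large arrays are left
// alone.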
613 static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
614 // FIXME: With some care we could probably handle both volatile and atomic
615 // loads here, but it isn't clear that this is important.
616 if (!LI.isSimple())
617 return nullptr;
618
619 Type *T = LI.getType();
620 if (!T->isAggregateType())
621 return nullptr;
622
623 StringRef Name = LI.getName();
624 assert(LI.getAlignment() && "Alignment must be set at this point");
625
626 if (auto *ST = dyn_cast<StructType>(T)) {
627 // If the struct has only one element, we unpack it.
628 auto NumElements = ST->getNumElements();
629 if (NumElements == 1) {
630 LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
631 ".unpack");
632 AAMDNodes AAMD;
633 LI.getAAMetadata(AAMD);
634 NewLoad->setAAMetadata(AAMD);
635 return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
636 UndefValue::get(T), NewLoad, 0, Name));
637 }
638
639 // We don't want to break loads with padding here as we'd lose
640 // the knowledge that padding exists for the rest of the pipeline.
641 const DataLayout &DL = IC.getDataLayout();
642 auto *SL = DL.getStructLayout(ST);
643 if (SL->hasPadding())
644 return nullptr;
645
646 const auto Align = LI.getAlign();
647 auto *Addr = LI.getPointerOperand();
648 auto *IdxType = Type::getInt32Ty(T->getContext());
649 auto *Zero = ConstantInt::get(IdxType, 0);
650
651 Value *V = UndefValue::get(T);
652 for (unsigned i = 0; i < NumElements; i++) {
653 Value *Indices[2] = {
654 Zero,
655 ConstantInt::get(IdxType, i),
656 };
657 auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
658 Name + ".elt");
659 auto *L = IC.Builder.CreateAlignedLoad(
660 ST->getElementType(i), Ptr,
661 commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
662 // Propagate AA metadata. It'll still be valid on the narrowed load.
663 AAMDNodes AAMD;
664 LI.getAAMetadata(AAMD);
665 L->setAAMetadata(AAMD);
666 V = IC.Builder.CreateInsertValue(V, L, i);
667 }
668
669 V->setName(Name);
670 return IC.replaceInstUsesWith(LI, V);
671 }
672
673 if (auto *AT = dyn_cast<ArrayType>(T)) {
674 auto *ET = AT->getElementType();
675 auto NumElements = AT->getNumElements();
676 if (NumElements == 1) {
677 LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack");
678 AAMDNodes AAMD;
679 LI.getAAMetadata(AAMD);
680 NewLoad->setAAMetadata(AAMD);
681 return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
682 UndefValue::get(T), NewLoad, 0, Name));
683 }
684
685 // Bail out if the array is too large. Ideally we would like to optimize
686 // arrays of arbitrary size but this has a terrible impact on compile time.
687 // The threshold here is chosen arbitrarily and may need a little bit of
688 // tuning.
689 if (NumElements > IC.MaxArraySizeForCombine)
690 return nullptr;
691
692 const DataLayout &DL = IC.getDataLayout();
693 auto EltSize = DL.getTypeAllocSize(ET);
694 const auto Align = LI.getAlign();
695
696 auto *Addr = LI.getPointerOperand();
697 auto *IdxType = Type::getInt64Ty(T->getContext());
698 auto *Zero = ConstantInt::get(IdxType, 0);
699
700 Value *V = UndefValue::get(T);
701 uint64_t Offset = 0;
702 for (uint64_t i = 0; i < NumElements; i++) {
703 Value *Indices[2] = {
704 Zero,
705 ConstantInt::get(IdxType, i),
706 };
707 auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
708 Name + ".elt");
709 auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
710 commonAlignment(Align, Offset),
711 Name + ".unpack");
712 AAMDNodes AAMD;
713 LI.getAAMetadata(AAMD);
714 L->setAAMetadata(AAMD);
715 V = IC.Builder.CreateInsertValue(V, L, i);
716 Offset += EltSize;
717 }
718
719 V->setName(Name);
720 return IC.replaceInstUsesWith(LI, V);
721 }
722
723 return nullptr;
724 }
725
726 // If we can determine that all possible objects pointed to by the provided
727 // pointer value are not only dereferenceable but also definitively less than
728 // or equal to the provided maximum size, then return true; otherwise return
729 // false. Constant global values and allocas fall into the former category.
730 //
731 // FIXME: This should probably live in ValueTracking (or similar).
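//
// For example (hypothetical): an alloca of [4 x i32] or a constant global with
// a 16-byte initializer satisfies MaxSize >= 16, whereas a pointer that may
// originate from an argument or an unknown call does not, so we return false.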
732 static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
733 const DataLayout &DL) {
734 SmallPtrSet<Value *, 4> Visited;
735 SmallVector<Value *, 4> Worklist(1, V);
736
737 do {
738 Value *P = Worklist.pop_back_val();
739 P = P->stripPointerCasts();
740
741 if (!Visited.insert(P).second)
742 continue;
743
744 if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
745 Worklist.push_back(SI->getTrueValue());
746 Worklist.push_back(SI->getFalseValue());
747 continue;
748 }
749
750 if (PHINode *PN = dyn_cast<PHINode>(P)) {
751 for (Value *IncValue : PN->incoming_values())
752 Worklist.push_back(IncValue);
753 continue;
754 }
755
756 if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
757 if (GA->isInterposable())
758 return false;
759 Worklist.push_back(GA->getAliasee());
760 continue;
761 }
762
763 // If we know how big this object is, and it is less than MaxSize, continue
764 // searching. Otherwise, return false.
765 if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
766 if (!AI->getAllocatedType()->isSized())
767 return false;
768
769 ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
770 if (!CS)
771 return false;
772
773 uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
774 // Make sure that, even if the multiplication below would wrap as a
775 // uint64_t, we still do the right thing.
776 if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize))
777 return false;
778 continue;
779 }
780
781 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
782 if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
783 return false;
784
785 uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
786 if (InitSize > MaxSize)
787 return false;
788 continue;
789 }
790
791 return false;
792 } while (!Worklist.empty());
793
794 return true;
795 }
796
797 // If we're indexing into an object of a known size, and the outer index is
798 // not a constant, but having any value but zero would lead to undefined
799 // behavior, replace it with zero.
800 //
801 // For example, if we have:
802 // @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
803 // ...
804 // %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
805 // ... = load i32* %arrayidx, align 4
806 // Then we know that we can replace %x in the GEP with i64 0.
807 //
808 // FIXME: We could fold any GEP index to zero that would cause UB if it were
809 // not zero. Currently, we only handle the first such index. We could also
810 // search through non-zero constant indices if we kept track of the
811 // offsets those indices implied.
812 static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC,
813 GetElementPtrInst *GEPI, Instruction *MemI,
814 unsigned &Idx) {
815 if (GEPI->getNumOperands() < 2)
816 return false;
817
818 // Find the first non-zero index of a GEP. If all indices are zero, return
819 // one past the last index.
820 auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
821 unsigned I = 1;
822 for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
823 Value *V = GEPI->getOperand(I);
824 if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
825 if (CI->isZero())
826 continue;
827
828 break;
829 }
830
831 return I;
832 };
833
834 // Skip through initial 'zero' indices, and find the corresponding pointer
835 // type. See if the next index is not a constant.
836 Idx = FirstNZIdx(GEPI);
837 if (Idx == GEPI->getNumOperands())
838 return false;
839 if (isa<Constant>(GEPI->getOperand(Idx)))
840 return false;
841
842 SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
843 Type *SourceElementType = GEPI->getSourceElementType();
844 // Size information about scalable vectors is not available, so we cannot
845 // deduce whether indexing at n is undefined behaviour or not. Bail out.
846 if (isa<ScalableVectorType>(SourceElementType))
847 return false;
848
849 Type *AllocTy = GetElementPtrInst::getIndexedType(SourceElementType, Ops);
850 if (!AllocTy || !AllocTy->isSized())
851 return false;
852 const DataLayout &DL = IC.getDataLayout();
853 uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy).getFixedSize();
854
855 // If there are more indices after the one we might replace with a zero, make
856 // sure they're all non-negative. If any of them are negative, the overall
857 // address being computed might be before the base address determined by the
858 // first non-zero index.
859 auto IsAllNonNegative = [&]() {
860 for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
861 KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
862 if (Known.isNonNegative())
863 continue;
864 return false;
865 }
866
867 return true;
868 };
869
870 // FIXME: If the GEP is not inbounds, and there are extra indices after the
871 // one we'll replace, those could cause the address computation to wrap
872 // (rendering the IsAllNonNegative() check below insufficient). We can do
873 // better, ignoring zero indices (and other indices we can prove small
874 // enough not to wrap).
875 if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
876 return false;
877
878 // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
879 // also known to be dereferenceable.
880 return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
881 IsAllNonNegative();
882 }
883
884 // If we're indexing into an object with a variable index for the memory
885 // access, but the object has only one element, we can assume that the index
886 // will always be zero. If we replace the GEP, return it.
887 template <typename T>
888 static Instruction *replaceGEPIdxWithZero(InstCombinerImpl &IC, Value *Ptr,
889 T &MemI) {
890 if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
891 unsigned Idx;
892 if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
893 Instruction *NewGEPI = GEPI->clone();
894 NewGEPI->setOperand(Idx,
895 ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
896 NewGEPI->insertBefore(GEPI);
897 MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
898 return NewGEPI;
899 }
900 }
901
902 return nullptr;
903 }
904
905 static bool canSimplifyNullStoreOrGEP(StoreInst &SI) {
906 if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
907 return false;
908
909 auto *Ptr = SI.getPointerOperand();
910 if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
911 Ptr = GEPI->getOperand(0);
912 return (isa<ConstantPointerNull>(Ptr) &&
913 !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
914 }
915
916 static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
917 if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
918 const Value *GEPI0 = GEPI->getOperand(0);
919 if (isa<ConstantPointerNull>(GEPI0) &&
920 !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
921 return true;
922 }
923 if (isa<UndefValue>(Op) ||
924 (isa<ConstantPointerNull>(Op) &&
925 !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
926 return true;
927 return false;
928 }
929
930 Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
931 Value *Op = LI.getOperand(0);
932
933 // Try to canonicalize the loaded type.
934 if (Instruction *Res = combineLoadToOperationType(*this, LI))
935 return Res;
936
937 // Attempt to improve the alignment.
938 Align KnownAlign = getOrEnforceKnownAlignment(
939 Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT);
940 if (KnownAlign > LI.getAlign())
941 LI.setAlignment(KnownAlign);
942
943 // Replace GEP indices if possible.
944 if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
945 Worklist.push(NewGEPI);
946 return &LI;
947 }
948
949 if (Instruction *Res = unpackLoadToAggregate(*this, LI))
950 return Res;
951
952 // Do really simple store-to-load forwarding and load CSE, to catch cases
953 // where there are several consecutive memory accesses to the same location,
954 // separated by a few arithmetic operations.
955 BasicBlock::iterator BBI(LI);
956 bool IsLoadCSE = false;
957 if (Value *AvailableVal = FindAvailableLoadedValue(
958 &LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) {
959 if (IsLoadCSE)
960 combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
961
962 return replaceInstUsesWith(
963 LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
964 LI.getName() + ".cast"));
965 }
966
967 // None of the following transforms are legal for volatile/ordered atomic
968 // loads. Most of them do apply for unordered atomics.
969 if (!LI.isUnordered()) return nullptr;
970
971 // load(gep null, ...) -> unreachable
972 // load null/undef -> unreachable
973 // TODO: Consider a target hook for valid address spaces for these transforms.
974 if (canSimplifyNullLoadOrGEP(LI, Op)) {
975 // Insert a new store to null instruction before the load to indicate
976 // that this code is not reachable. We do this instead of inserting
977 // an unreachable instruction directly because we cannot modify the
978 // CFG.
979 StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()),
980 Constant::getNullValue(Op->getType()), &LI);
981 SI->setDebugLoc(LI.getDebugLoc());
982 return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
983 }
984
985 if (Op->hasOneUse()) {
986 // Change select and PHI nodes to select values instead of addresses: this
987 // helps alias analysis out a lot, allows many other simplifications, and
988 // exposes redundancy in the code.
989 //
990 // Note that we cannot do the transformation unless we know that the
991 // introduced loads cannot trap! Something like this is valid as long as
992 // the condition is always false: load (select bool %C, int* null, int* %G),
993 // but it would not be valid if we transformed it to load from null
994 // unconditionally.
995 //
996 if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
997 // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
998 Align Alignment = LI.getAlign();
999 if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(),
1000 Alignment, DL, SI) &&
1001 isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(),
1002 Alignment, DL, SI)) {
1003 LoadInst *V1 =
1004 Builder.CreateLoad(LI.getType(), SI->getOperand(1),
1005 SI->getOperand(1)->getName() + ".val");
1006 LoadInst *V2 =
1007 Builder.CreateLoad(LI.getType(), SI->getOperand(2),
1008 SI->getOperand(2)->getName() + ".val");
1009 assert(LI.isUnordered() && "implied by above");
1010 V1->setAlignment(Alignment);
1011 V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
1012 V2->setAlignment(Alignment);
1013 V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
1014 return SelectInst::Create(SI->getCondition(), V1, V2);
1015 }
1016
1017 // load (select (cond, null, P)) -> load P
1018 if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
1019 !NullPointerIsDefined(SI->getFunction(),
1020 LI.getPointerAddressSpace()))
1021 return replaceOperand(LI, 0, SI->getOperand(2));
1022
1023 // load (select (cond, P, null)) -> load P
1024 if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
1025 !NullPointerIsDefined(SI->getFunction(),
1026 LI.getPointerAddressSpace()))
1027 return replaceOperand(LI, 0, SI->getOperand(1));
1028 }
1029 }
1030 return nullptr;
1031 }
1032
1033 /// Look for extractelement/insertvalue sequence that acts like a bitcast.
1034 ///
1035 /// \returns underlying value that was "cast", or nullptr otherwise.
1036 ///
1037 /// For example, if we have:
1038 ///
1039 /// %E0 = extractelement <2 x double> %U, i32 0
1040 /// %V0 = insertvalue [2 x double] undef, double %E0, 0
1041 /// %E1 = extractelement <2 x double> %U, i32 1
1042 /// %V1 = insertvalue [2 x double] %V0, double %E1, 1
1043 ///
1044 /// and the layout of a <2 x double> is isomorphic to a [2 x double],
1045 /// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
1046 /// Note that %U may contain non-undef values where %V1 has undef.
1047 static Value *likeBitCastFromVector(InstCombinerImpl &IC, Value *V) {
1048 Value *U = nullptr;
1049 while (auto *IV = dyn_cast<InsertValueInst>(V)) {
1050 auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
1051 if (!E)
1052 return nullptr;
1053 auto *W = E->getVectorOperand();
1054 if (!U)
1055 U = W;
1056 else if (U != W)
1057 return nullptr;
1058 auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
1059 if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
1060 return nullptr;
1061 V = IV->getAggregateOperand();
1062 }
1063 if (!isa<UndefValue>(V) || !U)
1064 return nullptr;
1065
1066 auto *UT = cast<VectorType>(U->getType());
1067 auto *VT = V->getType();
1068 // Check that types UT and VT are bitwise isomorphic.
1069 const auto &DL = IC.getDataLayout();
1070 if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
1071 return nullptr;
1072 }
1073 if (auto *AT = dyn_cast<ArrayType>(VT)) {
1074 if (AT->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
1075 return nullptr;
1076 } else {
1077 auto *ST = cast<StructType>(VT);
1078 if (ST->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
1079 return nullptr;
1080 for (const auto *EltT : ST->elements()) {
1081 if (EltT != UT->getElementType())
1082 return nullptr;
1083 }
1084 }
1085 return U;
1086 }
1087
1088 /// Combine stores to match the type of value being stored.
1089 ///
1090 /// The core idea here is that the memory does not have any intrinsic type and
1091 /// where we can we should match the type of a store to the type of value being
1092 /// stored.
1093 ///
1094 /// However, this routine must never change the width of a store or the number of
1095 /// stores as that would introduce a semantic change. This combine is expected to
1096 /// be a semantic no-op which just allows stores to more closely model the types
1097 /// of their incoming values.
1098 ///
1099 /// Currently, we also refuse to change the precise type used for an atomic or
1100 /// volatile store. This is debatable, and might be reasonable to change later.
1101 /// However, it is risky in case some backend or other part of LLVM is relying
1102 /// on the exact type stored to select appropriate atomic operations.
1103 ///
1104 /// \returns true if the store was successfully combined away. This indicates
1105 /// the caller must erase the store instruction. We have to let the caller erase
1106 /// the store instruction as otherwise there is no way to signal whether it was
1107 /// combined or not: IC.EraseInstFromFunction returns a null pointer.
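///
/// Illustrative (hypothetical) example of the bitcast case:
///   %i = bitcast float %f to i32
///   store i32 %i, i32* %p
/// becomes
///   %p.cast = bitcast i32* %p to float*
///   store float %f, float* %p.cast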
1108 static bool combineStoreToValueType(InstCombinerImpl &IC, StoreInst &SI) {
1109 // FIXME: We could probably with some care handle both volatile and ordered
1110 // atomic stores here but it isn't clear that this is important.
1111 if (!SI.isUnordered())
1112 return false;
1113
1114 // swifterror values can't be bitcasted.
1115 if (SI.getPointerOperand()->isSwiftError())
1116 return false;
1117
1118 Value *V = SI.getValueOperand();
1119
1120 // Fold away bit casts of the stored value by storing the original type.
1121 if (auto *BC = dyn_cast<BitCastInst>(V)) {
1122 V = BC->getOperand(0);
1123 if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
1124 combineStoreToNewValue(IC, SI, V);
1125 return true;
1126 }
1127 }
1128
1129 if (Value *U = likeBitCastFromVector(IC, V))
1130 if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
1131 combineStoreToNewValue(IC, SI, U);
1132 return true;
1133 }
1134
1135 // FIXME: We should also canonicalize stores of vectors when their elements
1136 // are cast to other types.
1137 return false;
1138 }
1139
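// Split a store of an aggregate into stores of its elements, mirroring
// unpackLoadToAggregate above. For example (illustrative), storing a
// {i32, i64} value becomes an extractvalue plus a store per field, with
// per-element alignment derived from the struct layout.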
1140 static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
1141 // FIXME: We could probably with some care handle both volatile and atomic
1142 // stores here but it isn't clear that this is important.
1143 if (!SI.isSimple())
1144 return false;
1145
1146 Value *V = SI.getValueOperand();
1147 Type *T = V->getType();
1148
1149 if (!T->isAggregateType())
1150 return false;
1151
1152 if (auto *ST = dyn_cast<StructType>(T)) {
1153 // If the struct has only one element, we unpack it.
1154 unsigned Count = ST->getNumElements();
1155 if (Count == 1) {
1156 V = IC.Builder.CreateExtractValue(V, 0);
1157 combineStoreToNewValue(IC, SI, V);
1158 return true;
1159 }
1160
1161 // We don't want to break stores with padding here as we'd lose
1162 // the knowledge that padding exists for the rest of the pipeline.
1163 const DataLayout &DL = IC.getDataLayout();
1164 auto *SL = DL.getStructLayout(ST);
1165 if (SL->hasPadding())
1166 return false;
1167
1168 const auto Align = SI.getAlign();
1169
1170 SmallString<16> EltName = V->getName();
1171 EltName += ".elt";
1172 auto *Addr = SI.getPointerOperand();
1173 SmallString<16> AddrName = Addr->getName();
1174 AddrName += ".repack";
1175
1176 auto *IdxType = Type::getInt32Ty(ST->getContext());
1177 auto *Zero = ConstantInt::get(IdxType, 0);
1178 for (unsigned i = 0; i < Count; i++) {
1179 Value *Indices[2] = {
1180 Zero,
1181 ConstantInt::get(IdxType, i),
1182 };
1183 auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
1184 AddrName);
1185 auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
1186 auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
1187 llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
1188 AAMDNodes AAMD;
1189 SI.getAAMetadata(AAMD);
1190 NS->setAAMetadata(AAMD);
1191 }
1192
1193 return true;
1194 }
1195
1196 if (auto *AT = dyn_cast<ArrayType>(T)) {
1197 // If the array has only one element, we unpack it.
1198 auto NumElements = AT->getNumElements();
1199 if (NumElements == 1) {
1200 V = IC.Builder.CreateExtractValue(V, 0);
1201 combineStoreToNewValue(IC, SI, V);
1202 return true;
1203 }
1204
1205 // Bail out if the array is too large. Ideally we would like to optimize
1206 // arrays of arbitrary size but this has a terrible impact on compile time.
1207 // The threshold here is chosen arbitrarily and may need a little bit of
1208 // tuning.
1209 if (NumElements > IC.MaxArraySizeForCombine)
1210 return false;
1211
1212 const DataLayout &DL = IC.getDataLayout();
1213 auto EltSize = DL.getTypeAllocSize(AT->getElementType());
1214 const auto Align = SI.getAlign();
1215
1216 SmallString<16> EltName = V->getName();
1217 EltName += ".elt";
1218 auto *Addr = SI.getPointerOperand();
1219 SmallString<16> AddrName = Addr->getName();
1220 AddrName += ".repack";
1221
1222 auto *IdxType = Type::getInt64Ty(T->getContext());
1223 auto *Zero = ConstantInt::get(IdxType, 0);
1224
1225 uint64_t Offset = 0;
1226 for (uint64_t i = 0; i < NumElements; i++) {
1227 Value *Indices[2] = {
1228 Zero,
1229 ConstantInt::get(IdxType, i),
1230 };
1231 auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
1232 AddrName);
1233 auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
1234 auto EltAlign = commonAlignment(Align, Offset);
1235 Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
1236 AAMDNodes AAMD;
1237 SI.getAAMetadata(AAMD);
1238 NS->setAAMetadata(AAMD);
1239 Offset += EltSize;
1240 }
1241
1242 return true;
1243 }
1244
1245 return false;
1246 }
1247
1248 /// equivalentAddressValues - Test if A and B will obviously have the same
1249 /// value. This includes recognizing that %t0 and %t1 will have the same
1250 /// value in code like this:
1251 /// %t0 = getelementptr \@a, 0, 3
1252 /// store i32 0, i32* %t0
1253 /// %t1 = getelementptr \@a, 0, 3
1254 /// %t2 = load i32* %t1
1255 ///
1256 static bool equivalentAddressValues(Value *A, Value *B) {
1257 // Test if the values are trivially equivalent.
1258 if (A == B) return true;
1259
1260 // Test if the values come from identical arithmetic instructions.
1261 // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
1262 // it's only used to compare two uses within the same basic block, which
1263 // means that they'll always either have the same value or one of them
1264 // will have an undefined value.
1265 if (isa<BinaryOperator>(A) ||
1266 isa<CastInst>(A) ||
1267 isa<PHINode>(A) ||
1268 isa<GetElementPtrInst>(A))
1269 if (Instruction *BI = dyn_cast<Instruction>(B))
1270 if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
1271 return true;
1272
1273 // Otherwise they may not be equivalent.
1274 return false;
1275 }
1276
1277 /// Converts store (bitcast (load (bitcast (select ...)))) to
1278 /// store (load (select ...)), where select is minmax:
1279 /// select ((cmp load V1, load V2), V1, V2).
1280 static bool removeBitcastsFromLoadStoreOnMinMax(InstCombinerImpl &IC,
1281 StoreInst &SI) {
1282 // bitcast?
1283 if (!match(SI.getPointerOperand(), m_BitCast(m_Value())))
1284 return false;
1285 // load? integer?
1286 Value *LoadAddr;
1287 if (!match(SI.getValueOperand(), m_Load(m_BitCast(m_Value(LoadAddr)))))
1288 return false;
1289 auto *LI = cast<LoadInst>(SI.getValueOperand());
1290 if (!LI->getType()->isIntegerTy())
1291 return false;
1292 Type *CmpLoadTy;
1293 if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy))
1294 return false;
1295
1296 // Make sure the type would actually change.
1297 // This condition can be hit with chains of bitcasts.
1298 if (LI->getType() == CmpLoadTy)
1299 return false;
1300
1301 // Make sure we're not changing the size of the load/store.
1302 const auto &DL = IC.getDataLayout();
1303 if (DL.getTypeStoreSizeInBits(LI->getType()) !=
1304 DL.getTypeStoreSizeInBits(CmpLoadTy))
1305 return false;
1306
1307 if (!all_of(LI->users(), [LI, LoadAddr](User *U) {
1308 auto *SI = dyn_cast<StoreInst>(U);
1309 return SI && SI->getPointerOperand() != LI &&
1310 InstCombiner::peekThroughBitcast(SI->getPointerOperand()) !=
1311 LoadAddr &&
1312 !SI->getPointerOperand()->isSwiftError();
1313 }))
1314 return false;
1315
1316 IC.Builder.SetInsertPoint(LI);
1317 LoadInst *NewLI = IC.combineLoadToNewType(*LI, CmpLoadTy);
1318 // Replace all the stores with stores of the newly loaded value.
1319 for (auto *UI : LI->users()) {
1320 auto *USI = cast<StoreInst>(UI);
1321 IC.Builder.SetInsertPoint(USI);
1322 combineStoreToNewValue(IC, *USI, NewLI);
1323 }
1324 IC.replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
1325 IC.eraseInstFromFunction(*LI);
1326 return true;
1327 }
1328
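// Visit a store: canonicalize the stored type, unpack aggregate stores, do a
// small amount of local dead-store elimination, and remove stores that are
// provably no-ops (to single-use allocas or constant memory, or of undef).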
1329 Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) {
1330 Value *Val = SI.getOperand(0);
1331 Value *Ptr = SI.getOperand(1);
1332
1333 // Try to canonicalize the stored type.
1334 if (combineStoreToValueType(*this, SI))
1335 return eraseInstFromFunction(SI);
1336
1337 // Attempt to improve the alignment.
1338 const Align KnownAlign = getOrEnforceKnownAlignment(
1339 Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT);
1340 if (KnownAlign > SI.getAlign())
1341 SI.setAlignment(KnownAlign);
1342
1343 // Try to canonicalize the stored type.
1344 if (unpackStoreToAggregate(*this, SI))
1345 return eraseInstFromFunction(SI);
1346
1347 if (removeBitcastsFromLoadStoreOnMinMax(*this, SI))
1348 return eraseInstFromFunction(SI);
1349
1350 // Replace GEP indices if possible.
1351 if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
1352 Worklist.push(NewGEPI);
1353 return &SI;
1354 }
1355
1356 // Don't hack volatile/ordered stores.
1357 // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
1358 if (!SI.isUnordered()) return nullptr;
1359
1360 // If the pointer operand is an alloca with a single use, delete the store,
1361 // making the alloca dead.
1362 if (Ptr->hasOneUse()) {
1363 if (isa<AllocaInst>(Ptr))
1364 return eraseInstFromFunction(SI);
1365 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1366 if (isa<AllocaInst>(GEP->getOperand(0))) {
1367 if (GEP->getOperand(0)->hasOneUse())
1368 return eraseInstFromFunction(SI);
1369 }
1370 }
1371 }
1372
1373 // If we have a store to a location which is known constant, we can conclude
1374 // that the store must be storing the constant value (else the memory
1375 // wouldn't be constant), and this must be a noop.
1376 if (AA->pointsToConstantMemory(Ptr))
1377 return eraseInstFromFunction(SI);
1378
1379 // Do really simple DSE, to catch cases where there are several consecutive
1380 // stores to the same location, separated by a few arithmetic operations. This
1381 // situation often occurs with bitfield accesses.
1382 BasicBlock::iterator BBI(SI);
1383 for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
1384 --ScanInsts) {
1385 --BBI;
1386 // Don't count debug info directives, lest they affect codegen,
1387 // and skip pointer-to-pointer bitcasts, which are no-ops.
1388 if (isa<DbgInfoIntrinsic>(BBI) ||
1389 (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
1390 ScanInsts++;
1391 continue;
1392 }
1393
1394 if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
1395 // Prev store isn't volatile, and stores to the same location?
1396 if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1),
1397 SI.getOperand(1))) {
1398 ++NumDeadStore;
1399 // Manually add back the original store to the worklist now, so it will
1400 // be processed after the operands of the removed store, as this may
1401 // expose additional DSE opportunities.
1402 Worklist.push(&SI);
1403 eraseInstFromFunction(*PrevSI);
1404 return nullptr;
1405 }
1406 break;
1407 }
1408
1409 // If this is a load, we have to stop. However, if the loaded value is the
1410 // value we're storing and it is loaded from the pointer we're storing to,
1411 // then *this* store is dead (X = load P; store X -> P).
1412 if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
1413 if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
1414 assert(SI.isUnordered() && "can't eliminate ordering operation");
1415 return eraseInstFromFunction(SI);
1416 }
1417
1418 // Otherwise, this is a load from some other location. Stores before it
1419 // may not be dead.
1420 break;
1421 }
1422
1423 // Don't skip over loads, throws or things that can modify memory.
1424 if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
1425 break;
1426 }
1427
1428 // store X, null -> turns into 'unreachable' in SimplifyCFG
1429 // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
1430 if (canSimplifyNullStoreOrGEP(SI)) {
1431 if (!isa<UndefValue>(Val))
1432 return replaceOperand(SI, 0, UndefValue::get(Val->getType()));
1433 return nullptr; // Do not modify these!
1434 }
1435
1436 // store undef, Ptr -> noop
1437 if (isa<UndefValue>(Val))
1438 return eraseInstFromFunction(SI);
1439
1440 return nullptr;
1441 }
1442
1443 /// Try to transform:
1444 /// if () { *P = v1; } else { *P = v2 }
1445 /// or:
1446 /// *P = v1; if () { *P = v2; }
1447 /// into a phi node with a store in the successor.
1448 bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
1449 if (!SI.isUnordered())
1450 return false; // This code has not been audited for volatile/ordered case.
1451
1452 // Check if the successor block has exactly 2 incoming edges.
1453 BasicBlock *StoreBB = SI.getParent();
1454 BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
1455 if (!DestBB->hasNPredecessors(2))
1456 return false;
1457
1458 // Capture the other block (the block that doesn't contain our store).
1459 pred_iterator PredIter = pred_begin(DestBB);
1460 if (*PredIter == StoreBB)
1461 ++PredIter;
1462 BasicBlock *OtherBB = *PredIter;
1463
1464 // Bail out if the relevant blocks aren't all distinct. This can happen,
1465 // for example, if SI is in an infinite loop.
1466 if (StoreBB == DestBB || OtherBB == DestBB)
1467 return false;
1468
1469 // Verify that the other block ends in a branch and is not otherwise empty.
1470 BasicBlock::iterator BBI(OtherBB->getTerminator());
1471 BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
1472 if (!OtherBr || BBI == OtherBB->begin())
1473 return false;
1474
1475 // If the other block ends in an unconditional branch, check for the 'if then
1476 // else' case. There is an instruction before the branch.
1477 StoreInst *OtherStore = nullptr;
1478 if (OtherBr->isUnconditional()) {
1479 --BBI;
1480 // Skip over debugging info.
1481 while (isa<DbgInfoIntrinsic>(BBI) ||
1482 (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
1483 if (BBI==OtherBB->begin())
1484 return false;
1485 --BBI;
1486 }
1487 // If this isn't a store, isn't a store to the same location, or is not the
1488 // right kind of store, bail out.
1489 OtherStore = dyn_cast<StoreInst>(BBI);
1490 if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
1491 !SI.isSameOperationAs(OtherStore))
1492 return false;
1493 } else {
1494 // Otherwise, the other block ended with a conditional branch. If one of the
1495 // destinations is StoreBB, then we have the if/then case.
1496 if (OtherBr->getSuccessor(0) != StoreBB &&
1497 OtherBr->getSuccessor(1) != StoreBB)
1498 return false;
1499
1500 // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
1501 // if/then triangle. See if there is a store to the same ptr as SI that
1502 // lives in OtherBB.
1503 for (;; --BBI) {
1504 // Check to see if we find the matching store.
1505 if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
1506 if (OtherStore->getOperand(1) != SI.getOperand(1) ||
1507 !SI.isSameOperationAs(OtherStore))
1508 return false;
1509 break;
1510 }
1511 // If we find something that may be using or overwriting the stored
1512 // value, or if we run out of instructions, we can't do the transform.
1513 if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
1514 BBI->mayWriteToMemory() || BBI == OtherBB->begin())
1515 return false;
1516 }
1517
1518 // In order to eliminate the store in OtherBB, we have to make sure nothing
1519 // reads or overwrites the stored value in StoreBB.
1520 for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
1521 // FIXME: This should really be AA driven.
1522 if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
1523 return false;
1524 }
1525 }
1526
1527 // Insert a PHI node now if we need it.
1528 Value *MergedVal = OtherStore->getOperand(0);
1529 // The debug locations of the original instructions might differ. Merge them.
1530 DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
1531 OtherStore->getDebugLoc());
1532 if (MergedVal != SI.getOperand(0)) {
1533 PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
1534 PN->addIncoming(SI.getOperand(0), SI.getParent());
1535 PN->addIncoming(OtherStore->getOperand(0), OtherBB);
1536 MergedVal = InsertNewInstBefore(PN, DestBB->front());
1537 PN->setDebugLoc(MergedLoc);
1538 }
1539
1540 // Advance to a place where it is safe to insert the new store and insert it.
1541 BBI = DestBB->getFirstInsertionPt();
1542 StoreInst *NewSI =
1543 new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), SI.getAlign(),
1544 SI.getOrdering(), SI.getSyncScopeID());
1545 InsertNewInstBefore(NewSI, *BBI);
1546 NewSI->setDebugLoc(MergedLoc);
1547
1548 // If the two stores had AA tags, merge them.
1549 AAMDNodes AATags;
1550 SI.getAAMetadata(AATags);
1551 if (AATags) {
1552 OtherStore->getAAMetadata(AATags, /* Merge = */ true);
1553 NewSI->setAAMetadata(AATags);
1554 }
1555
1556 // Nuke the old stores.
1557 eraseInstFromFunction(SI);
1558 eraseInstFromFunction(*OtherStore);
1559 return true;
1560 }
1561