//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// The cache of @llvm.assume intrinsics.
  AssumptionCacheTracker *ACT;

  // The called function.
  Function &F;

  // The candidate callsite being analyzed. Please do not use this to do
  // analysis in the caller function; we want the inline cost query to be
  // easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  int Threshold;
  int Cost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  // While we walk the potentially-inlined instructions, we build up and
  // maintain a mapping of simplified values specific to this callsite. The
  // idea is to propagate any special information we have about arguments to
  // this call through the inlinable section of the function, and account for
  // likely simplifications post-inlining. The most important aspect we track
  // is CFG altering simplifications -- when we prove a basic block dead, that
  // can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  // Keep track of the values which map back (through function arguments) to
  // allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  // The mapping of caller Alloca values to their accumulated cost savings. If
  // we have to disable SROA for one of the allocas, this tells us how much
  // cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  // Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt> > ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *); void visit(Module &);
  void visit(Function *); void visit(Function &);
  void visit(BasicBlock *); void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
               Function &Callee, int Threshold, CallSite CSArg)
      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Check whether a GEP's indices are all constant.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
      return false;

  return true;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero()) continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

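/// \brief Account for an alloca in the inlined callee.
///
/// Allocas with a statically-known size accumulate into AllocatedSize and are
/// otherwise modeled by the base visitor; a truly dynamic alloca currently
/// disables inlining altogether (see the FIXME below).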
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca, and handle that case.
  if (I.isArrayAllocation()) {
    if (Constant *Size = SimplifiedValues.lookup(I.getArraySize())) {
      ConstantInt *AllocSize = dyn_cast<ConstantInt>(Size);
      assert(AllocSize && "Allocation size not a constant int?");
      Type *Ty = I.getAllocatedType();
      AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue();
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    Type *Ty = I.getAllocatedType();
    AllocatedSize += DL.getTypeAllocSize(Ty);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

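/// \brief Model PHI nodes, which are currently treated as free; no value or
/// SROA information is propagated through them yet (see the FIXMEs below).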
bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We should potentially be tracking values through phi nodes,
  // especially when they collapse to a single value due to deleted CFG edges
  // during inlining.

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}

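/// \brief Model GEPs, folding constant-offset GEPs of tracked pointers.
///
/// Constant GEPs are modeled as free; variable GEPs cost math and disable
/// SROA for the underlying argument.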
bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate = lookupSROAArgAndCost(I.getPointerOperand(),
                                            SROAArg, CostIt);

  // Try to fold GEPs of constant-offset call site argument pointers. This
  // requires target data and inbounds GEPs.
  if (I.isInBounds()) {
    // Check if we have a base + offset for the pointer.
    Value *Ptr = I.getPointerOperand();
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    if (BaseAndOffset.first) {
      // Check if the offset of this GEP is constant, and if so accumulate it
      // into Offset.
      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
        // Non-constant GEPs aren't folded, and disable SROA.
        if (SROACandidate)
          disableSROA(CostIt);
        return false;
      }

      // Add the result as a new mapping to Base + Offset.
      ConstantOffsetPtrs[&I] = BaseAndOffset;

      // Also handle SROA candidates here, we already know that the GEP is
      // all-constant indexed.
      if (SROACandidate)
        SROAArgValues[&I] = SROAArg;

      return true;
    }
  }

  if (isGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return false;
}

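/// \brief Model bitcasts, which are always free.
///
/// Constants, base-and-offset pairs, and SROA candidacy all propagate
/// straight through the cast.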
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offsets through casts
  std::pair<Value *, APInt> BaseAndOffset
    = ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

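/// \brief Model ptrtoint; base-and-offset tracking survives when the integer
/// type is wide enough to hold the pointer, and SROA candidacy is propagated
/// (see the comment in the body).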
bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset
      = ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

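/// \brief Model inttoptr, the mirror image of the ptrtoint handling above.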
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

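/// \brief Fall-back handling for casts not covered above: constant fold the
/// cast where possible, otherwise disable SROA on the operand.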
bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

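/// \brief Try to constant fold a unary instruction on a (possibly simplified)
/// constant operand; otherwise disable SROA on that operand.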
bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Operand);
  if (!COp)
    COp = SimplifiedValues.lookup(Operand);
  if (COp) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
                                               COp, DL)) {
      SimplifiedValues[&I] = C;
      return true;
    }
  }

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  unsigned ArgNo = A->getArgNo();
  return CandidateCS.paramHasAttr(ArgNo+1, Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

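/// \brief Model icmp/fcmp instructions.
///
/// Folds comparisons of simplified constants, of constant-offset pointers
/// with a common base, and of known-non-null values against null.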
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
    if (Constant *CRHS = dyn_cast<Constant>(RHS))
      if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

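/// \brief Model sub, folding the difference of two constant-offset pointers
/// with a common base before falling back to the generic binary-operator
/// handling.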
bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

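/// \brief Model binary operators by running them through InstSimplify with
/// any operands already simplified to constants.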
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV =
        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
    SimplifiedValues[&I] = C;
    return true;
  }

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  return false;
}

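/// \brief Model loads; a simple load from an SROA candidate is credited to
/// that candidate's SROA savings.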
bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

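/// \brief Model stores, mirroring the load handling above.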
bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
  if (!C)
    C = SimplifiedValues.lookup(I.getAggregateOperand());
  if (C) {
    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
  if (!AggC)
    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
  if (!InsertedC)
    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
  if (AggC && InsertedC) {
    SimplifiedValues[&I] = ConstantExpr::getInsertValue(AggC, InsertedC,
                                                        I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
       I != E; ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

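/// \brief Model a call site inside the callee.
///
/// Handles intrinsics, recursion, returns-twice functions, and indirect calls
/// that simplify to a known target, charging argument setup and call
/// penalties where appropriate.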
bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() &&
      cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Otherwise, we need to accumulate a cost proportional to the number of
  // distinct successor blocks. This fan-out in the CFG cannot be represented
  // for free even if we can represent the core switch as a jumptable that
  // takes a single instruction.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
  SuccessorBlocks.insert(SI.getDefaultDest());
  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
    SuccessorBlocks.insert(I.getCaseSuccessor());
  // Add cost corresponding to the number of distinct destinations. The first
  // we model as free because of fallthrough.
  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddresses (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}


/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them during inlining to constants or dead
    // code, is actually used by the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this may
    // eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      bool hasSoftFloatAttr = false;

      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (F.hasFnAttribute("use-soft-float")) {
        Attribute Attr = F.getFnAttribute("use-soft-float");
        StringRef Val = Attr.getValueAsString();
        if (Val == "true")
          hasSoftFloatAttr = true;
      }

      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->mayBeOverridden())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);
  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
      &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If the instruction after the call, or if the normal destination of the
  // invoke is an unreachable instruction, the function is noreturn. As such,
  // there is little point in inlining this unless there is literally zero
  // cost.
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->begin()))
      Threshold = 0;
  } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
    Threshold = 0;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16> > BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    if (!analyzeBlock(BB, EphValues)) {
      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
          HasIndirectBr || HasFrameEscape)
        return false;

      // If the caller is a recursive function then we don't want to inline
      // functions which allocate a lot of stack space because it would increase
      // the caller stack usage dramatically.
      if (IsCallerRecursive &&
          AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
        return false;

      break;
    }

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond
              = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond
            = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);

  return Cost <= std::max(0, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                      true, true)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                    true, true)

char InlineCostAnalysis::ID = 0;

InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID) {}

InlineCostAnalysis::~InlineCostAnalysis() {}

void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesAll();
  AU.addRequired<AssumptionCacheTracker>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
  CallGraphSCCPass::getAnalysisUsage(AU);
}

bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) {
  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
  ACT = &getAnalysis<AssumptionCacheTracker>();
  return false;
}

InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) {
  return getInlineCost(CS, CS.getCalledFunction(), Threshold);
}

/// \brief Test that two functions either both have or both lack the given
/// attribute.
template<typename AttrKind>
static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
}

/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
         attributeMatches(Caller, Callee, Attribute::SanitizeMemory) &&
         attributeMatches(Caller, Callee, Attribute::SanitizeThread);
}

InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
                                             int Threshold) {
  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee,
                                         TTIWP->getTTI(*Callee)))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be redefined at link-time to mean
  // something else. Don't inline functions marked noinline or call sites
  // marked noinline.
  if (Callee->mayBeOverridden() ||
      Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                     << "...\n");

  CallAnalyzer CA(TTIWP->getTTI(*Callee), ACT, *Callee, Threshold, CS);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

bool InlineCostAnalysis::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      // Disallow inlining functions that call @llvm.localescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::localescape)
        return false;
    }
  }

  return true;
}