1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements inline cost analysis.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Analysis/InlineCost.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SetVector.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/AssumptionCache.h"
21 #include "llvm/Analysis/CodeMetrics.h"
22 #include "llvm/Analysis/ConstantFolding.h"
23 #include "llvm/Analysis/InstructionSimplify.h"
24 #include "llvm/Analysis/ProfileSummaryInfo.h"
25 #include "llvm/Analysis/TargetTransformInfo.h"
26 #include "llvm/IR/CallSite.h"
27 #include "llvm/IR/CallingConv.h"
28 #include "llvm/IR/DataLayout.h"
29 #include "llvm/IR/GetElementPtrTypeIterator.h"
30 #include "llvm/IR/GlobalAlias.h"
31 #include "llvm/IR/InstVisitor.h"
32 #include "llvm/IR/IntrinsicInst.h"
33 #include "llvm/IR/Operator.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/raw_ostream.h"
36 
37 using namespace llvm;
38 
39 #define DEBUG_TYPE "inline-cost"
40 
41 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
42 
43 // Threshold to use when optsize is specified (and there is no
44 // -inline-threshold).
45 const int OptSizeThreshold = 75;
46 
47 // Threshold to use when -Oz is specified (and there is no -inline-threshold).
48 const int OptMinSizeThreshold = 25;
49 
50 // Threshold to use when -O[34] is specified (and there is no
51 // -inline-threshold).
52 const int OptAggressiveThreshold = 275;
53 
54 static cl::opt<int> DefaultInlineThreshold(
55     "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
56     cl::desc("Control the amount of inlining to perform (default = 225)"));
57 
58 static cl::opt<int> HintThreshold(
59     "inlinehint-threshold", cl::Hidden, cl::init(325),
60     cl::desc("Threshold for inlining functions with inline hint"));
61 
62 // We introduce this threshold to help performance of instrumentation based
63 // PGO before we actually hook up inliner with analysis passes such as BPI and
64 // BFI.
65 static cl::opt<int> ColdThreshold(
66     "inlinecold-threshold", cl::Hidden, cl::init(225),
67     cl::desc("Threshold for inlining functions with cold attribute"));
68 
69 namespace {
70 
71 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
72   typedef InstVisitor<CallAnalyzer, bool> Base;
73   friend class InstVisitor<CallAnalyzer, bool>;
74 
75   /// The TargetTransformInfo available for this compilation.
76   const TargetTransformInfo &TTI;
77 
78   /// The cache of @llvm.assume intrinsics.
79   AssumptionCacheTracker *ACT;
80 
81   /// Profile summary information.
82   ProfileSummaryInfo *PSI;
83 
84   // The called function.
85   Function &F;
86 
87   // The candidate callsite being analyzed. Please do not use this to do
88   // analysis in the caller function; we want the inline cost query to be
89   // easily cacheable. Instead, use the cover function paramHasAttr.
90   CallSite CandidateCS;
91 
92   int Threshold;
93   int Cost;
94 
95   bool IsCallerRecursive;
96   bool IsRecursiveCall;
97   bool ExposesReturnsTwice;
98   bool HasDynamicAlloca;
99   bool ContainsNoDuplicateCall;
100   bool HasReturn;
101   bool HasIndirectBr;
102   bool HasFrameEscape;
103 
104   /// Number of bytes allocated statically by the callee.
105   uint64_t AllocatedSize;
106   unsigned NumInstructions, NumVectorInstructions;
107   int FiftyPercentVectorBonus, TenPercentVectorBonus;
108   int VectorBonus;
109 
110   // While we walk the potentially-inlined instructions, we build up and
111   // maintain a mapping of simplified values specific to this callsite. The
112   // idea is to propagate any special information we have about arguments to
113   // this call through the inlinable section of the function, and account for
114   // likely simplifications post-inlining. The most important aspect we track
115   // is CFG altering simplifications -- when we prove a basic block dead, that
116   // can cause dramatic shifts in the cost of inlining a function.
117   DenseMap<Value *, Constant *> SimplifiedValues;
118 
119   // Keep track of the values which map back (through function arguments) to
120   // allocas on the caller stack which could be simplified through SROA.
121   DenseMap<Value *, Value *> SROAArgValues;
122 
123   // The mapping of caller Alloca values to their accumulated cost savings. If
124   // we have to disable SROA for one of the allocas, this tells us how much
125   // cost must be added.
126   DenseMap<Value *, int> SROAArgCosts;
127 
128   // Keep track of values which map to a pointer base and constant offset.
129   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
130 
131   // Custom simplification helper routines.
132   bool isAllocaDerivedArg(Value *V);
133   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
134                             DenseMap<Value *, int>::iterator &CostIt);
135   void disableSROA(DenseMap<Value *, int>::iterator CostIt);
136   void disableSROA(Value *V);
137   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
138                           int InstructionCost);
139   bool isGEPOffsetConstant(GetElementPtrInst &GEP);
140   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
141   bool simplifyCallSite(Function *F, CallSite CS);
142   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
143 
144   /// Return true if the given argument to the function being considered for
145   /// inlining has the given attribute set either at the call site or the
146   /// function declaration.  Primarily used to inspect call site specific
147   /// attributes since these can be more precise than the ones on the callee
148   /// itself.
149   bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
150 
151   /// Return true if the given value is known non null within the callee if
152   /// inlined through this particular callsite.
153   bool isKnownNonNullInCallee(Value *V);
154 
155   /// Update Threshold based on callsite properties such as callee
156   /// attributes and callee hotness for PGO builds. The Callee is explicitly
157   /// passed to support analyzing indirect calls whose target is inferred by
158   /// analysis.
159   void updateThreshold(CallSite CS, Function &Callee);
160 
161   /// Return true if size growth is allowed when inlining the callee at CS.
162   bool allowSizeGrowth(CallSite CS);
163 
164   // Custom analysis routines.
165   bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
166 
167   // Disable several entry points to the visitor so we don't accidentally use
168   // them by declaring but not defining them here.
169   void visit(Module *);
170   void visit(Module &);
171   void visit(Function *);
172   void visit(Function &);
173   void visit(BasicBlock *);
174   void visit(BasicBlock &);
175 
176   // Provide base case for our instruction visit.
177   bool visitInstruction(Instruction &I);
178 
179   // Our visit overrides.
180   bool visitAlloca(AllocaInst &I);
181   bool visitPHI(PHINode &I);
182   bool visitGetElementPtr(GetElementPtrInst &I);
183   bool visitBitCast(BitCastInst &I);
184   bool visitPtrToInt(PtrToIntInst &I);
185   bool visitIntToPtr(IntToPtrInst &I);
186   bool visitCastInst(CastInst &I);
187   bool visitUnaryInstruction(UnaryInstruction &I);
188   bool visitCmpInst(CmpInst &I);
189   bool visitSub(BinaryOperator &I);
190   bool visitBinaryOperator(BinaryOperator &I);
191   bool visitLoad(LoadInst &I);
192   bool visitStore(StoreInst &I);
193   bool visitExtractValue(ExtractValueInst &I);
194   bool visitInsertValue(InsertValueInst &I);
195   bool visitCallSite(CallSite CS);
196   bool visitReturnInst(ReturnInst &RI);
197   bool visitBranchInst(BranchInst &BI);
198   bool visitSwitchInst(SwitchInst &SI);
199   bool visitIndirectBrInst(IndirectBrInst &IBI);
200   bool visitResumeInst(ResumeInst &RI);
201   bool visitCleanupReturnInst(CleanupReturnInst &RI);
202   bool visitCatchReturnInst(CatchReturnInst &RI);
203   bool visitUnreachableInst(UnreachableInst &I);
204 
205 public:
206   CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
207                ProfileSummaryInfo *PSI, Function &Callee, int Threshold,
208                CallSite CSArg)
209       : TTI(TTI), ACT(ACT), PSI(PSI), F(Callee), CandidateCS(CSArg),
210         Threshold(Threshold), Cost(0), IsCallerRecursive(false),
211         IsRecursiveCall(false), ExposesReturnsTwice(false),
212         HasDynamicAlloca(false), ContainsNoDuplicateCall(false),
213         HasReturn(false), HasIndirectBr(false), HasFrameEscape(false),
214         AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
215         FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
216         NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
217         NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
218         NumInstructionsSimplified(0), SROACostSavings(0),
219         SROACostSavingsLost(0) {}
220 
221   bool analyzeCall(CallSite CS);
222 
223   int getThreshold() { return Threshold; }
224   int getCost() { return Cost; }
225 
226   // Keep a bunch of stats about the cost savings found so we can print them
227   // out when debugging.
228   unsigned NumConstantArgs;
229   unsigned NumConstantOffsetPtrArgs;
230   unsigned NumAllocaArgs;
231   unsigned NumConstantPtrCmps;
232   unsigned NumConstantPtrDiffs;
233   unsigned NumInstructionsSimplified;
234   unsigned SROACostSavings;
235   unsigned SROACostSavingsLost;
236 
237   void dump();
238 };
239 
240 } // namespace
241 
242 /// \brief Test whether the given value is an Alloca-derived function argument.
243 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
244   return SROAArgValues.count(V);
245 }
246 
247 /// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
248 /// Returns false if V does not map to a SROA-candidate.
249 bool CallAnalyzer::lookupSROAArgAndCost(
250     Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
251   if (SROAArgValues.empty() || SROAArgCosts.empty())
252     return false;
253 
254   DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
255   if (ArgIt == SROAArgValues.end())
256     return false;
257 
258   Arg = ArgIt->second;
259   CostIt = SROAArgCosts.find(Arg);
260   return CostIt != SROAArgCosts.end();
261 }
262 
263 /// \brief Disable SROA for the candidate marked by this cost iterator.
264 ///
265 /// This marks the candidate as no longer viable for SROA, and adds the cost
266 /// savings associated with it back into the inline cost measurement.
267 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
268   // If we're no longer able to perform SROA we need to undo its cost savings
269   // and prevent subsequent analysis.
270   Cost += CostIt->second;
271   SROACostSavings -= CostIt->second;
272   SROACostSavingsLost += CostIt->second;
273   SROAArgCosts.erase(CostIt);
274 }
275 
276 /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
277 void CallAnalyzer::disableSROA(Value *V) {
278   Value *SROAArg;
279   DenseMap<Value *, int>::iterator CostIt;
280   if (lookupSROAArgAndCost(V, SROAArg, CostIt))
281     disableSROA(CostIt);
282 }
283 
284 /// \brief Accumulate the given cost for a particular SROA candidate.
285 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
286                                       int InstructionCost) {
287   CostIt->second += InstructionCost;
288   SROACostSavings += InstructionCost;
289 }
290 
291 /// \brief Check whether a GEP's indices are all constant.
292 ///
293 /// Respects any simplified values known during the analysis of this callsite.
294 bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
295   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
296     if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
297       return false;
298 
299   return true;
300 }
301 
302 /// \brief Accumulate a constant GEP offset into an APInt if possible.
303 ///
304 /// Returns false if unable to compute the offset for any reason. Respects any
305 /// simplified values known during the analysis of this callsite.
306 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
307   const DataLayout &DL = F.getParent()->getDataLayout();
308   unsigned IntPtrWidth = DL.getPointerSizeInBits();
309   assert(IntPtrWidth == Offset.getBitWidth());
310 
311   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
312        GTI != GTE; ++GTI) {
313     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
314     if (!OpC)
315       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
316         OpC = dyn_cast<ConstantInt>(SimpleOp);
317     if (!OpC)
318       return false;
319     if (OpC->isZero())
320       continue;
321 
322     // Handle a struct index, which adds its field offset to the pointer.
323     if (StructType *STy = dyn_cast<StructType>(*GTI)) {
324       unsigned ElementIdx = OpC->getZExtValue();
325       const StructLayout *SL = DL.getStructLayout(STy);
326       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
327       continue;
328     }
329 
330     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
331     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
332   }
333   return true;
334 }
335 
336 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
337   // Check whether inlining will turn a dynamic alloca into a static
338   // alloca and handle that case.
339   if (I.isArrayAllocation()) {
340     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
341     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
342       const DataLayout &DL = F.getParent()->getDataLayout();
343       Type *Ty = I.getAllocatedType();
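      // Saturating arithmetic keeps a huge constant array size from wrapping
      // the running total of statically allocated bytes.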
344       AllocatedSize = SaturatingMultiplyAdd(
345           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
346       return Base::visitAlloca(I);
347     }
348   }
349 
350   // Accumulate the allocated size.
351   if (I.isStaticAlloca()) {
352     const DataLayout &DL = F.getParent()->getDataLayout();
353     Type *Ty = I.getAllocatedType();
354     AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
355   }
356 
357   // We will happily inline static alloca instructions.
358   if (I.isStaticAlloca())
359     return Base::visitAlloca(I);
360 
361   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
362   // a variety of reasons, and so we would like to not inline them into
363   // functions which don't currently have a dynamic alloca. This simply
364   // disables inlining altogether in the presence of a dynamic alloca.
365   HasDynamicAlloca = true;
366   return false;
367 }
368 
369 bool CallAnalyzer::visitPHI(PHINode &I) {
370   // FIXME: We should potentially be tracking values through phi nodes,
371   // especially when they collapse to a single value due to deleted CFG edges
372   // during inlining.
373 
374   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
375   // though we don't want to propagate its bonuses. The idea is to disable
376   // SROA if it *might* be used in an inappropriate manner.
377 
378   // Phi nodes are always zero-cost.
379   return true;
380 }
381 
382 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
383   Value *SROAArg;
384   DenseMap<Value *, int>::iterator CostIt;
385   bool SROACandidate =
386       lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
387 
388   // Try to fold GEPs of constant-offset call site argument pointers. This
389   // requires target data and inbounds GEPs.
390   if (I.isInBounds()) {
391     // Check if we have a base + offset for the pointer.
392     Value *Ptr = I.getPointerOperand();
393     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
394     if (BaseAndOffset.first) {
395       // Check if the offset of this GEP is constant, and if so accumulate it
396       // into Offset.
397       if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
398         // Non-constant GEPs aren't folded, and disable SROA.
399         if (SROACandidate)
400           disableSROA(CostIt);
401         return false;
402       }
403 
404       // Add the result as a new mapping to Base + Offset.
405       ConstantOffsetPtrs[&I] = BaseAndOffset;
406 
407       // Also handle SROA candidates here, we already know that the GEP is
408       // all-constant indexed.
409       if (SROACandidate)
410         SROAArgValues[&I] = SROAArg;
411 
412       return true;
413     }
414   }
415 
416   if (isGEPOffsetConstant(I)) {
417     if (SROACandidate)
418       SROAArgValues[&I] = SROAArg;
419 
420     // Constant GEPs are modeled as free.
421     return true;
422   }
423 
424   // Variable GEPs will require math and will disable SROA.
425   if (SROACandidate)
426     disableSROA(CostIt);
427   return false;
428 }
429 
430 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
431   // Propagate constants through bitcasts.
432   Constant *COp = dyn_cast<Constant>(I.getOperand(0));
433   if (!COp)
434     COp = SimplifiedValues.lookup(I.getOperand(0));
435   if (COp)
436     if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
437       SimplifiedValues[&I] = C;
438       return true;
439     }
440 
441   // Track base/offsets through casts
442   std::pair<Value *, APInt> BaseAndOffset =
443       ConstantOffsetPtrs.lookup(I.getOperand(0));
444   // Casts don't change the offset, just wrap it up.
445   if (BaseAndOffset.first)
446     ConstantOffsetPtrs[&I] = BaseAndOffset;
447 
448   // Also look for SROA candidates here.
449   Value *SROAArg;
450   DenseMap<Value *, int>::iterator CostIt;
451   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
452     SROAArgValues[&I] = SROAArg;
453 
454   // Bitcasts are always zero cost.
455   return true;
456 }
457 
458 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
459   // Propagate constants through ptrtoint.
460   Constant *COp = dyn_cast<Constant>(I.getOperand(0));
461   if (!COp)
462     COp = SimplifiedValues.lookup(I.getOperand(0));
463   if (COp)
464     if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
465       SimplifiedValues[&I] = C;
466       return true;
467     }
468 
469   // Track base/offset pairs when converted to a plain integer provided the
470   // integer is large enough to represent the pointer.
471   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
472   const DataLayout &DL = F.getParent()->getDataLayout();
473   if (IntegerSize >= DL.getPointerSizeInBits()) {
474     std::pair<Value *, APInt> BaseAndOffset =
475         ConstantOffsetPtrs.lookup(I.getOperand(0));
476     if (BaseAndOffset.first)
477       ConstantOffsetPtrs[&I] = BaseAndOffset;
478   }
479 
480   // This is really weird. Technically, ptrtoint will disable SROA. However,
481   // unless that ptrtoint is *used* somewhere in the live basic blocks after
482   // inlining, it will be nuked, and SROA should proceed. All of the uses which
483   // would block SROA would also block SROA if applied directly to a pointer,
484   // and so we can just add the integer in here. The only places where SROA is
485   // preserved either cannot fire on an integer, or won't in-and-of themselves
486   // disable SROA (ext) w/o some later use that we would see and disable.
487   Value *SROAArg;
488   DenseMap<Value *, int>::iterator CostIt;
489   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
490     SROAArgValues[&I] = SROAArg;
491 
492   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
493 }
494 
495 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
496   // Propagate constants through ptrtoint.
497   Constant *COp = dyn_cast<Constant>(I.getOperand(0));
498   if (!COp)
499     COp = SimplifiedValues.lookup(I.getOperand(0));
500   if (COp)
501     if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
502       SimplifiedValues[&I] = C;
503       return true;
504     }
505 
506   // Track base/offset pairs when round-tripped through a pointer without
507   // modifications provided the integer is not too large.
508   Value *Op = I.getOperand(0);
509   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
510   const DataLayout &DL = F.getParent()->getDataLayout();
511   if (IntegerSize <= DL.getPointerSizeInBits()) {
512     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
513     if (BaseAndOffset.first)
514       ConstantOffsetPtrs[&I] = BaseAndOffset;
515   }
516 
517   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
518   Value *SROAArg;
519   DenseMap<Value *, int>::iterator CostIt;
520   if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
521     SROAArgValues[&I] = SROAArg;
522 
523   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
524 }
525 
526 bool CallAnalyzer::visitCastInst(CastInst &I) {
527   // Propagate constants through ptrtoint.
528   Constant *COp = dyn_cast<Constant>(I.getOperand(0));
529   if (!COp)
530     COp = SimplifiedValues.lookup(I.getOperand(0));
531   if (COp)
532     if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
533       SimplifiedValues[&I] = C;
534       return true;
535     }
536 
537   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
538   disableSROA(I.getOperand(0));
539 
540   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
541 }
542 
543 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
544   Value *Operand = I.getOperand(0);
545   Constant *COp = dyn_cast<Constant>(Operand);
546   if (!COp)
547     COp = SimplifiedValues.lookup(Operand);
548   if (COp) {
549     const DataLayout &DL = F.getParent()->getDataLayout();
550     if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
551       SimplifiedValues[&I] = C;
552       return true;
553     }
554   }
555 
556   // Disable any SROA on the argument to arbitrary unary operators.
557   disableSROA(Operand);
558 
559   return false;
560 }
561 
562 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
563   unsigned ArgNo = A->getArgNo();
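  // Call site attribute indices are 1-based for parameters (index 0 refers to
  // the return value), hence the +1 below.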
564   return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
565 }
566 
567 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
568   // Does the *call site* have the NonNull attribute set on an argument?  We
569   // use the attribute on the call site to memoize any analysis done in the
570   // caller. This will also trip if the callee function has a non-null
571   // parameter attribute, but that's a less interesting case because hopefully
572   // the callee would already have been simplified based on that.
573   if (Argument *A = dyn_cast<Argument>(V))
574     if (paramHasAttr(A, Attribute::NonNull))
575       return true;
576 
577   // Is this an alloca in the caller?  This is distinct from the attribute case
578   // above because attributes aren't updated within the inliner itself and we
579   // always want to catch the alloca derived case.
580   if (isAllocaDerivedArg(V))
581     // We can actually predict the result of comparisons between an
582     // alloca-derived value and null. Note that this fires regardless of
583     // SROA firing.
584     return true;
585 
586   return false;
587 }
588 
589 bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
590   // If the normal destination of the invoke or the parent block of the call
591   // site is unreachable-terminated, there is little point in inlining this
592   // unless there is literally zero cost.
593   // FIXME: Note that it is possible that an unreachable-terminated block has a
594   // hot entry. For example, in the scenario below, inlining hot_call_X() may be
595   // beneficial:
596   // main() {
597   //   hot_call_1();
598   //   ...
599   //   hot_call_N()
600   //   exit(0);
601   // }
602   // For now, we are not handling this corner case here as it is rare in real
603   // code. In the future, we should elaborate this based on BPI and BFI in more
604   // general threshold adjusting heuristics in updateThreshold().
605   Instruction *Instr = CS.getInstruction();
606   if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
607     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
608       return false;
609   } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
610     return false;
611 
612   return true;
613 }
614 
615 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
616   // If no size growth is allowed for this inlining, set Threshold to 0.
617   if (!allowSizeGrowth(CS)) {
618     Threshold = 0;
619     return;
620   }
621 
622   Function *Caller = CS.getCaller();
623   if (DefaultInlineThreshold.getNumOccurrences() > 0) {
624     // Explicitly specified -inline-threshold overrides the threshold passed to
625     // CallAnalyzer's constructor.
626     Threshold = DefaultInlineThreshold;
627   } else {
628     // If -inline-threshold is not given, listen to the optsize and minsize
629     // attributes when they would decrease the threshold.
630     if (Caller->optForMinSize() && OptMinSizeThreshold < Threshold)
631       Threshold = OptMinSizeThreshold;
632     else if (Caller->optForSize() && OptSizeThreshold < Threshold)
633       Threshold = OptSizeThreshold;
634   }
635 
636   bool HotCallsite = false;
637   uint64_t TotalWeight;
638   if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
639       PSI->isHotCount(TotalWeight))
640     HotCallsite = true;
641 
642   // Listen to the inlinehint attribute or profile-based hotness information
643   // when it would increase the threshold and the caller does not need to
644   // minimize its size.
645   bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) ||
646                     PSI->isHotFunction(&Callee) ||
647                     HotCallsite;
648   if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize())
649     Threshold = HintThreshold;
650 
651   bool ColdCallee = PSI->isColdFunction(&Callee);
652   // Command line argument for DefaultInlineThreshold will override the default
653   // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold,
654   // do not use the default cold threshold even if it is smaller.
655   if ((DefaultInlineThreshold.getNumOccurrences() == 0 ||
656        ColdThreshold.getNumOccurrences() > 0) &&
657       ColdCallee && ColdThreshold < Threshold)
658     Threshold = ColdThreshold;
659 
660   // Finally, take the target-specific inlining threshold multiplier into
661   // account.
662   Threshold *= TTI.getInliningThresholdMultiplier();
663 }
664 
665 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
666   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
667   // First try to handle simplified comparisons.
668   if (!isa<Constant>(LHS))
669     if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
670       LHS = SimpleLHS;
671   if (!isa<Constant>(RHS))
672     if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
673       RHS = SimpleRHS;
674   if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
675     if (Constant *CRHS = dyn_cast<Constant>(RHS))
676       if (Constant *C =
677               ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
678         SimplifiedValues[&I] = C;
679         return true;
680       }
681   }
682 
683   if (I.getOpcode() == Instruction::FCmp)
684     return false;
685 
686   // Otherwise look for a comparison between constant offset pointers with
687   // a common base.
688   Value *LHSBase, *RHSBase;
689   APInt LHSOffset, RHSOffset;
690   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
691   if (LHSBase) {
692     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
693     if (RHSBase && LHSBase == RHSBase) {
694       // We have common bases, fold the icmp to a constant based on the
695       // offsets.
696       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
697       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
698       if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
699         SimplifiedValues[&I] = C;
700         ++NumConstantPtrCmps;
701         return true;
702       }
703     }
704   }
705 
706   // If the comparison is an equality comparison with null, we can simplify it
707   // if we know the value (argument) can't be null
708   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
709       isKnownNonNullInCallee(I.getOperand(0))) {
710     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
711     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
712                                       : ConstantInt::getFalse(I.getType());
713     return true;
714   }
715   // Finally check for SROA candidates in comparisons.
716   Value *SROAArg;
717   DenseMap<Value *, int>::iterator CostIt;
718   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
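    // A null check against an SROA-able alloca folds away once SROA fires, so
    // record the saving here rather than charging for the compare.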
719     if (isa<ConstantPointerNull>(I.getOperand(1))) {
720       accumulateSROACost(CostIt, InlineConstants::InstrCost);
721       return true;
722     }
723 
724     disableSROA(CostIt);
725   }
726 
727   return false;
728 }
729 
730 bool CallAnalyzer::visitSub(BinaryOperator &I) {
731   // Try to handle a special case: we can fold computing the difference of two
732   // constant-related pointers.
733   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
734   Value *LHSBase, *RHSBase;
735   APInt LHSOffset, RHSOffset;
736   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
737   if (LHSBase) {
738     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
739     if (RHSBase && LHSBase == RHSBase) {
740       // We have common bases, fold the subtract to a constant based on the
741       // offsets.
742       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
743       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
744       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
745         SimplifiedValues[&I] = C;
746         ++NumConstantPtrDiffs;
747         return true;
748       }
749     }
750   }
751 
752   // Otherwise, fall back to the generic logic for simplifying and handling
753   // instructions.
754   return Base::visitSub(I);
755 }
756 
757 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
758   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
759   const DataLayout &DL = F.getParent()->getDataLayout();
760   if (!isa<Constant>(LHS))
761     if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
762       LHS = SimpleLHS;
763   if (!isa<Constant>(RHS))
764     if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
765       RHS = SimpleRHS;
766   Value *SimpleV = nullptr;
767   if (auto FI = dyn_cast<FPMathOperator>(&I))
768     SimpleV =
769         SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
770   else
771     SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
772 
773   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
774     SimplifiedValues[&I] = C;
775     return true;
776   }
777 
778   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
779   disableSROA(LHS);
780   disableSROA(RHS);
781 
782   return false;
783 }
784 
785 bool CallAnalyzer::visitLoad(LoadInst &I) {
786   Value *SROAArg;
787   DenseMap<Value *, int>::iterator CostIt;
788   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
789     if (I.isSimple()) {
790       accumulateSROACost(CostIt, InlineConstants::InstrCost);
791       return true;
792     }
793 
794     disableSROA(CostIt);
795   }
796 
797   return false;
798 }
799 
800 bool CallAnalyzer::visitStore(StoreInst &I) {
801   Value *SROAArg;
802   DenseMap<Value *, int>::iterator CostIt;
803   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
804     if (I.isSimple()) {
805       accumulateSROACost(CostIt, InlineConstants::InstrCost);
806       return true;
807     }
808 
809     disableSROA(CostIt);
810   }
811 
812   return false;
813 }
814 
815 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
816   // Constant folding for extract value is trivial.
817   Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
818   if (!C)
819     C = SimplifiedValues.lookup(I.getAggregateOperand());
820   if (C) {
821     SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
822     return true;
823   }
824 
825   // SROA can look through these but give them a cost.
826   return false;
827 }
828 
829 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
830   // Constant folding for insert value is trivial.
831   Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
832   if (!AggC)
833     AggC = SimplifiedValues.lookup(I.getAggregateOperand());
834   Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
835   if (!InsertedC)
836     InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
837   if (AggC && InsertedC) {
838     SimplifiedValues[&I] =
839         ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
840     return true;
841   }
842 
843   // SROA can look through these but give them a cost.
844   return false;
845 }
846 
847 /// \brief Try to simplify a call site.
848 ///
849 /// Takes a concrete function and callsite and tries to actually simplify it by
850 /// analyzing the arguments and call itself with instsimplify. Returns true if
851 /// it has simplified the callsite to some other entity (a constant), making it
852 /// free.
853 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
854   // FIXME: Using the instsimplify logic directly for this is inefficient
855   // because we have to continually rebuild the argument list even when no
856   // simplifications can be performed. Until that is fixed with remapping
857   // inside of instsimplify, directly constant fold calls here.
858   if (!canConstantFoldCallTo(F))
859     return false;
860 
861   // Try to re-map the arguments to constants.
862   SmallVector<Constant *, 4> ConstantArgs;
863   ConstantArgs.reserve(CS.arg_size());
864   for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
865        ++I) {
866     Constant *C = dyn_cast<Constant>(*I);
867     if (!C)
868       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
869     if (!C)
870       return false; // This argument doesn't map to a constant.
871 
872     ConstantArgs.push_back(C);
873   }
874   if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
875     SimplifiedValues[CS.getInstruction()] = C;
876     return true;
877   }
878 
879   return false;
880 }
881 
882 bool CallAnalyzer::visitCallSite(CallSite CS) {
883   if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
884       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
885     // This aborts the entire analysis.
886     ExposesReturnsTwice = true;
887     return false;
888   }
889   if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
890     ContainsNoDuplicateCall = true;
891 
892   if (Function *F = CS.getCalledFunction()) {
893     // When we have a concrete function, first try to simplify it directly.
894     if (simplifyCallSite(F, CS))
895       return true;
896 
897     // Next check if it is an intrinsic we know about.
898     // FIXME: Lift this into part of the InstVisitor.
899     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
900       switch (II->getIntrinsicID()) {
901       default:
902         return Base::visitCallSite(CS);
903 
904       case Intrinsic::load_relative:
905         // This is normally lowered to 4 LLVM instructions.
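        // Returning false below charges the usual single-instruction cost, so
        // only the three remaining instructions are added here.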
906         Cost += 3 * InlineConstants::InstrCost;
907         return false;
908 
909       case Intrinsic::memset:
910       case Intrinsic::memcpy:
911       case Intrinsic::memmove:
912         // SROA can usually chew through these intrinsics, but they aren't free.
913         return false;
914       case Intrinsic::localescape:
915         HasFrameEscape = true;
916         return false;
917       }
918     }
919 
920     if (F == CS.getInstruction()->getParent()->getParent()) {
921       // This flag will fully abort the analysis, so don't bother with anything
922       // else.
923       IsRecursiveCall = true;
924       return false;
925     }
926 
927     if (TTI.isLoweredToCall(F)) {
928       // We account for the average 1 instruction per call argument setup
929       // here.
930       Cost += CS.arg_size() * InlineConstants::InstrCost;
931 
932       // Everything other than inline ASM will also have a significant cost
933       // merely from making the call.
934       if (!isa<InlineAsm>(CS.getCalledValue()))
935         Cost += InlineConstants::CallPenalty;
936     }
937 
938     return Base::visitCallSite(CS);
939   }
940 
941   // Otherwise we're in a very special case -- an indirect function call. See
942   // if we can be particularly clever about this.
943   Value *Callee = CS.getCalledValue();
944 
945   // First, pay the price of the argument setup. We account for the average
946   // 1 instruction per call argument setup here.
947   Cost += CS.arg_size() * InlineConstants::InstrCost;
948 
949   // Next, check if this happens to be an indirect function call to a known
950   // function in this inline context. If not, we've done all we can.
951   Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
952   if (!F)
953     return Base::visitCallSite(CS);
954 
955   // If we have a constant that we are calling as a function, we can peer
956   // through it and see the function target. This happens not infrequently
957   // during devirtualization and so we want to give it a hefty bonus for
958   // inlining, but cap that bonus in the event that inlining wouldn't pan
959   // out. Pretend to inline the function, with a custom threshold.
960   CallAnalyzer CA(TTI, ACT, PSI, *F, InlineConstants::IndirectCallThreshold,
961                   CS);
962   if (CA.analyzeCall(CS)) {
963     // We were able to inline the indirect call! Subtract the cost from the
964     // threshold to get the bonus we want to apply, but don't go below zero.
965     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
966   }
967 
968   return Base::visitCallSite(CS);
969 }
970 
971 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
972   // At least one return instruction will be free after inlining.
973   bool Free = !HasReturn;
974   HasReturn = true;
975   return Free;
976 }
977 
978 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
979   // We model unconditional branches as essentially free -- they really
980   // shouldn't exist at all, but handling them makes the behavior of the
981   // inliner more regular and predictable. Interestingly, conditional branches
982   // which will fold away are also free.
983   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
984          dyn_cast_or_null<ConstantInt>(
985              SimplifiedValues.lookup(BI.getCondition()));
986 }
987 
988 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
989   // We model unconditional switches as free, see the comments on handling
990   // branches.
991   if (isa<ConstantInt>(SI.getCondition()))
992     return true;
993   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
994     if (isa<ConstantInt>(V))
995       return true;
996 
997   // Otherwise, we need to accumulate a cost proportional to the number of
998   // distinct successor blocks. This fan-out in the CFG cannot be represented
999   // for free even if we can represent the core switch as a jumptable that
1000   // takes a single instruction.
1001   //
1002   // NB: We convert large switches which are just used to initialize large phi
1003   // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1004   // inlining those. It will prevent inlining in cases where the optimization
1005   // does not (yet) fire.
1006   SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
1007   SuccessorBlocks.insert(SI.getDefaultDest());
1008   for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
1009     SuccessorBlocks.insert(I.getCaseSuccessor());
1010   // Add cost corresponding to the number of distinct destinations. The first
1011   // we model as free because of fallthrough.
1012   Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
1013   return false;
1014 }
1015 
1016 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1017   // We never want to inline functions that contain an indirectbr.  This is
1018   // incorrect because all the blockaddress's (in static global initializers
1019   // for example) would be referring to the original function, and this
1020   // indirect jump would jump from the inlined copy of the function into the
1021   // original function which is extremely undefined behavior.
1022   // FIXME: This logic isn't really right; we can safely inline functions with
1023   // indirectbr's as long as no other function or global references the
1024   // blockaddress of a block within the current function.
1025   HasIndirectBr = true;
1026   return false;
1027 }
1028 
1029 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1030   // FIXME: It's not clear that a single instruction is an accurate model for
1031   // the inline cost of a resume instruction.
1032   return false;
1033 }
1034 
1035 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1036   // FIXME: It's not clear that a single instruction is an accurate model for
1037   // the inline cost of a cleanupret instruction.
1038   return false;
1039 }
1040 
1041 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1042   // FIXME: It's not clear that a single instruction is an accurate model for
1043   // the inline cost of a catchret instruction.
1044   return false;
1045 }
1046 
1047 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1048   // FIXME: It might be reasonable to discount the cost of instructions leading
1049   // to unreachable as they have the lowest possible impact on both runtime and
1050   // code size.
1051   return true; // No actual code is needed for unreachable.
1052 }
1053 
1054 bool CallAnalyzer::visitInstruction(Instruction &I) {
1055   // Some instructions are free. All of the free intrinsics can also be
1056   // handled by SROA, etc.
1057   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1058     return true;
1059 
1060   // We found something we don't understand or can't handle. Mark any SROA-able
1061   // values in the operand list as no longer viable.
1062   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1063     disableSROA(*OI);
1064 
1065   return false;
1066 }
1067 
1068 /// \brief Analyze a basic block for its contribution to the inline cost.
1069 ///
1070 /// This method walks the analyzer over every instruction in the given basic
1071 /// block and accounts for their cost during inlining at this callsite. It
1072 /// aborts early if the threshold has been exceeded or an impossible to inline
1073 /// construct has been detected. It returns false if inlining is no longer
1074 /// viable, and true if inlining remains viable.
1075 bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
1076                                 SmallPtrSetImpl<const Value *> &EphValues) {
1077   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1078     // FIXME: Currently, the number of instructions in a function regardless of
1079     // our ability to simplify them during inline to constants or dead code,
1080     // are actually used by the vector bonus heuristic. As long as that's true,
1081     // we have to special case debug intrinsics here to prevent differences in
1082     // inlining due to debug symbols. Eventually, the number of unsimplified
1083     // instructions shouldn't factor into the cost computation, but until then,
1084     // hack around it here.
1085     if (isa<DbgInfoIntrinsic>(I))
1086       continue;
1087 
1088     // Skip ephemeral values.
1089     if (EphValues.count(&*I))
1090       continue;
1091 
1092     ++NumInstructions;
1093     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1094       ++NumVectorInstructions;
1095 
1096     // If the instruction is floating point, and the target says this operation
1097     // is expensive or the function has the "use-soft-float" attribute, this may
1098     // eventually become a library call. Treat the cost as such.
1099     if (I->getType()->isFloatingPointTy()) {
1100       bool hasSoftFloatAttr = false;
1101 
1102       // If the function has the "use-soft-float" attribute, mark it as
1103       // expensive.
1104       if (F.hasFnAttribute("use-soft-float")) {
1105         Attribute Attr = F.getFnAttribute("use-soft-float");
1106         StringRef Val = Attr.getValueAsString();
1107         if (Val == "true")
1108           hasSoftFloatAttr = true;
1109       }
1110 
1111       if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
1112           hasSoftFloatAttr)
1113         Cost += InlineConstants::CallPenalty;
1114     }
1115 
1116     // If the instruction simplified to a constant, there is no cost to this
1117     // instruction. Visit the instructions using our InstVisitor to account for
1118     // all of the per-instruction logic. The visit tree returns true if we
1119     // consumed the instruction in any way, and false if the instruction's base
1120     // cost should count against inlining.
1121     if (Base::visit(&*I))
1122       ++NumInstructionsSimplified;
1123     else
1124       Cost += InlineConstants::InstrCost;
1125 
1126     // If visiting this instruction detected an uninlinable pattern, abort.
1127     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
1128         HasIndirectBr || HasFrameEscape)
1129       return false;
1130 
1131     // If the caller is a recursive function then we don't want to inline
1132     // functions which allocate a lot of stack space because it would increase
1133     // the caller stack usage dramatically.
1134     if (IsCallerRecursive &&
1135         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
1136       return false;
1137 
1138     // Check if we've passed the maximum possible threshold so we don't spin in
1139     // huge basic blocks that will never inline.
1140     if (Cost > Threshold)
1141       return false;
1142   }
1143 
1144   return true;
1145 }
1146 
1147 /// \brief Compute the base pointer and cumulative constant offsets for V.
1148 ///
1149 /// This strips all constant offsets off of V, leaving it the base pointer, and
1150 /// accumulates the total constant offset applied in the returned constant. It
1151 /// returns 0 if V is not a pointer, and returns the constant '0' if there are
1152 /// no constant offsets applied.
1153 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1154   if (!V->getType()->isPointerTy())
1155     return nullptr;
1156 
1157   const DataLayout &DL = F.getParent()->getDataLayout();
1158   unsigned IntPtrWidth = DL.getPointerSizeInBits();
1159   APInt Offset = APInt::getNullValue(IntPtrWidth);
1160 
1161   // Even though we don't look through PHI nodes, we could be called on an
1162   // instruction in an unreachable block, which may be on a cycle.
1163   SmallPtrSet<Value *, 4> Visited;
1164   Visited.insert(V);
1165   do {
1166     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1167       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1168         return nullptr;
1169       V = GEP->getPointerOperand();
1170     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1171       V = cast<Operator>(V)->getOperand(0);
1172     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1173       if (GA->isInterposable())
1174         break;
1175       V = GA->getAliasee();
1176     } else {
1177       break;
1178     }
1179     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1180   } while (Visited.insert(V).second);
1181 
1182   Type *IntPtrTy = DL.getIntPtrType(V->getContext());
1183   return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1184 }
1185 
1186 /// \brief Analyze a call site for potential inlining.
1187 ///
1188 /// Returns true if inlining this call is viable, and false if it is not
1189 /// viable. It computes the cost and adjusts the threshold based on numerous
1190 /// factors and heuristics. If this method returns false but the computed cost
1191 /// is below the computed threshold, then inlining was forcibly disabled by
1192 /// some artifact of the routine.
1193 bool CallAnalyzer::analyzeCall(CallSite CS) {
1194   ++NumCallsAnalyzed;
1195 
1196   // Perform some tweaks to the cost and threshold based on the direct
1197   // callsite information.
1198 
1199   // We want to more aggressively inline vector-dense kernels, so up the
1200   // threshold, and we'll lower it if the % of vector instructions gets too
1201   // low. Note that these bonuses are somewhat arbitrary and evolved over time
1202   // by accident as much as because they are principled bonuses.
1203   //
1204   // FIXME: It would be nice to remove all such bonuses. At least it would be
1205   // nice to base the bonus values on something more scientific.
1206   assert(NumInstructions == 0);
1207   assert(NumVectorInstructions == 0);
1208 
1209   // Update the threshold based on callsite properties
1210   updateThreshold(CS, F);
1211 
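  // Bonuses for vector-dense callees: the full bonus is only kept if vector
  // instructions remain a large fraction of the body; otherwise it is pared
  // back after the instruction walk below.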
1212   FiftyPercentVectorBonus = 3 * Threshold / 2;
1213   TenPercentVectorBonus = 3 * Threshold / 4;
1214   const DataLayout &DL = F.getParent()->getDataLayout();
1215 
1216   // Track whether the post-inlining function would have more than one basic
1217   // block. A single basic block is often intended for inlining. Balloon the
1218   // threshold by 50% until we pass the single-BB phase.
1219   bool SingleBB = true;
1220   int SingleBBBonus = Threshold / 2;
1221 
1222   // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1223   // this Threshold any time, and cost cannot decrease, we can stop processing
1224   // the rest of the function body.
1225   Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
1226 
1227   // Give out bonuses per argument, as the instructions setting them up will
1228   // be gone after inlining.
1229   for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
1230     if (CS.isByValArgument(I)) {
1231       // We approximate the number of loads and stores needed by dividing the
1232       // size of the byval type by the target's pointer size.
1233       PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1234       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1235       unsigned PointerSize = DL.getPointerSizeInBits();
1236       // Ceiling division.
1237       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1238 
1239       // If it generates more than 8 stores it is likely to be expanded as an
1240       // inline memcpy so we take that as an upper bound. Otherwise we assume
1241       // one load and one store per word copied.
1242       // FIXME: The maxStoresPerMemcpy setting from the target should be used
1243       // here instead of a magic number of 8, but it's not available via
1244       // DataLayout.
1245       NumStores = std::min(NumStores, 8U);
1246 
1247       Cost -= 2 * NumStores * InlineConstants::InstrCost;
1248     } else {
1249       // For non-byval arguments subtract off one instruction per call
1250       // argument.
1251       Cost -= InlineConstants::InstrCost;
1252     }
1253   }
1254 
1255   // If there is only one call of the function, and it has internal linkage,
1256   // the cost of inlining it drops dramatically.
1257   bool OnlyOneCallAndLocalLinkage =
1258       F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
1259   if (OnlyOneCallAndLocalLinkage)
1260     Cost += InlineConstants::LastCallToStaticBonus;
1261 
1262   // If this function uses the coldcc calling convention, prefer not to inline
1263   // it.
1264   if (F.getCallingConv() == CallingConv::Cold)
1265     Cost += InlineConstants::ColdccPenalty;
1266 
1267   // Check if we're done. This can happen due to bonuses and penalties.
1268   if (Cost > Threshold)
1269     return false;
1270 
1271   if (F.empty())
1272     return true;
1273 
1274   Function *Caller = CS.getInstruction()->getParent()->getParent();
1275   // Check if the caller function is recursive itself.
1276   for (User *U : Caller->users()) {
1277     CallSite Site(U);
1278     if (!Site)
1279       continue;
1280     Instruction *I = Site.getInstruction();
1281     if (I->getParent()->getParent() == Caller) {
1282       IsCallerRecursive = true;
1283       break;
1284     }
1285   }
1286 
1287   // Populate our simplified values by mapping from function arguments to call
1288   // arguments with known important simplifications.
1289   CallSite::arg_iterator CAI = CS.arg_begin();
1290   for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
1291        FAI != FAE; ++FAI, ++CAI) {
1292     assert(CAI != CS.arg_end());
1293     if (Constant *C = dyn_cast<Constant>(CAI))
1294       SimplifiedValues[&*FAI] = C;
1295 
1296     Value *PtrArg = *CAI;
1297     if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
1298       ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
1299 
1300       // We can SROA any pointer arguments derived from alloca instructions.
1301       if (isa<AllocaInst>(PtrArg)) {
1302         SROAArgValues[&*FAI] = PtrArg;
1303         SROAArgCosts[PtrArg] = 0;
1304       }
1305     }
1306   }
1307   NumConstantArgs = SimplifiedValues.size();
1308   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
1309   NumAllocaArgs = SROAArgValues.size();
1310 
1311   // FIXME: If a caller has multiple calls to a callee, we end up recomputing
1312   // the ephemeral values multiple times (and they're completely determined by
1313   // the callee, so this is purely duplicate work).
1314   SmallPtrSet<const Value *, 32> EphValues;
1315   CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F),
1316                                       EphValues);
1317 
1318   // The worklist of live basic blocks in the callee *after* inlining. We avoid
1319   // adding basic blocks of the callee which can be proven to be dead for this
1320   // particular call site in order to get more accurate cost estimates. This
1321   // requires a somewhat heavyweight iteration pattern: we need to walk the
1322   // basic blocks in a breadth-first order as we insert live successors. To
1323   // accomplish this, we use a small-size-optimized SetVector; iterations tend
1324   // to stay small because we exit as soon as we cross our threshold.
1325   typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1326                     SmallPtrSet<BasicBlock *, 16>>
1327       BBSetVector;
1328   BBSetVector BBWorklist;
1329   BBWorklist.insert(&F.getEntryBlock());
1330   // Note that we *must not* cache the size: this loop grows the worklist.
1331   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1332     // Bail out the moment we cross the threshold. This means we'll under-count
1333     // the cost, but only when undercounting doesn't matter.
1334     if (Cost > Threshold)
1335       break;
1336 
1337     BasicBlock *BB = BBWorklist[Idx];
1338     if (BB->empty())
1339       continue;
1340 
1341     // Disallow inlining a blockaddress. A blockaddress only has defined
1342     // behavior for an indirect branch in the same function, and we do not
1343     // currently support inlining indirect branches. But, the inliner may not
1344     // see an indirect branch that ends up being dead code at a particular call
1345     // site. If the blockaddress escapes the function, e.g., via a global
1346     // variable, inlining may lead to an invalid cross-function reference.
1347     if (BB->hasAddressTaken())
1348       return false;
1349 
1350     // Analyze the cost of this block. If we blow through the threshold, this
1351     // returns false, and we can bail out.
1352     if (!analyzeBlock(BB, EphValues))
1353       return false;
1354 
1355     TerminatorInst *TI = BB->getTerminator();
1356 
1357     // Add in the live successors by first checking whether we have a terminator
1358     // that may be simplified based on the values simplified by this call.
1359     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1360       if (BI->isConditional()) {
1361         Value *Cond = BI->getCondition();
1362         if (ConstantInt *SimpleCond =
1363                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1364           BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
1365           continue;
1366         }
1367       }
1368     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1369       Value *Cond = SI->getCondition();
1370       if (ConstantInt *SimpleCond =
1371               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1372         BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
1373         continue;
1374       }
1375     }
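    // Illustrative example (hypothetical IR): if a callee block ends in
    //   br i1 %flag, label %then, label %else
    // and %flag simplified to the constant true for this call site, only
    // %then is added to the worklist; %else is treated as dead and never
    // costed. A switch whose condition simplified to a constant is pruned the
    // same way, keeping only the matching case successor.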
1376 
1377     // If we're unable to select a particular successor, just count all of
1378     // them.
1379     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1380          ++TIdx)
1381       BBWorklist.insert(TI->getSuccessor(TIdx));
1382 
1383     // If we had any successors at this point, then post-inlining is likely to
1384     // have them as well. Note that we assume any basic blocks which existed
1385     // due to branches or switches which folded above will also fold after
1386     // inlining.
1387     if (SingleBB && TI->getNumSuccessors() > 1) {
1388       // Take off the bonus we applied to the threshold.
1389       Threshold -= SingleBBBonus;
1390       SingleBB = false;
1391     }
1392   }
1393 
1394   // If the callee contains a noduplicate call, we can still inline as long as
1395   // inlining would cause the removal of the callee itself (so the noduplicate
1396   // call is not actually duplicated, just moved into its one remaining caller).
1397   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1398     return false;
1399 
1400   // We applied the maximum possible vector bonus at the beginning. Now,
1401   // subtract the excess bonus, if any, from the Threshold before
1402   // comparing against Cost.
1403   if (NumVectorInstructions <= NumInstructions / 10)
1404     Threshold -= FiftyPercentVectorBonus;
1405   else if (NumVectorInstructions <= NumInstructions / 2)
1406     Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
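  // For example (counts purely illustrative): with 100 analyzed instructions,
  // 5 vector instructions satisfy the first test and the entire
  // FiftyPercentVectorBonus is taken back; 30 vector instructions satisfy only
  // the second, so just the difference is subtracted and the smaller
  // TenPercentVectorBonus remains in effect.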
1407 
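  // Note that comparing against std::max(1, Threshold) means a call whose
  // bonuses drive its cost to zero or below can be deemed profitable even
  // when the threshold itself has been reduced to zero or a negative value.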
1408   return Cost < std::max(1, Threshold);
1409 }
1410 
1411 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1412 /// \brief Dump stats about this call's analysis.
1413 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1414 #define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
1415   DEBUG_PRINT_STAT(NumConstantArgs);
1416   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1417   DEBUG_PRINT_STAT(NumAllocaArgs);
1418   DEBUG_PRINT_STAT(NumConstantPtrCmps);
1419   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1420   DEBUG_PRINT_STAT(NumInstructionsSimplified);
1421   DEBUG_PRINT_STAT(NumInstructions);
1422   DEBUG_PRINT_STAT(SROACostSavings);
1423   DEBUG_PRINT_STAT(SROACostSavingsLost);
1424   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1425   DEBUG_PRINT_STAT(Cost);
1426   DEBUG_PRINT_STAT(Threshold);
1427 #undef DEBUG_PRINT_STAT
1428 }
1429 #endif
1430 
1431 /// \brief Test whether two functions either both have or both lack the given
1432 ///        attribute.
1433 template <typename AttrKind>
1434 static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
1435   return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
1436 }
1437 
1438 /// \brief Test that there are no attribute conflicts between Caller and Callee
1439 ///        that prevent inlining.
1440 static bool functionsHaveCompatibleAttributes(Function *Caller,
1441                                               Function *Callee,
1442                                               TargetTransformInfo &TTI) {
1443   return TTI.areInlineCompatible(Caller, Callee) &&
1444          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1445 }
1446 
1447 InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold,
1448                                TargetTransformInfo &CalleeTTI,
1449                                AssumptionCacheTracker *ACT,
1450                                ProfileSummaryInfo *PSI) {
1451   return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI,
1452                        ACT, PSI);
1453 }
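// A minimal usage sketch (the surrounding variables are assumed, not defined
// in this file): an inliner pass that already holds the callee's
// TargetTransformInfo &TTI, an AssumptionCacheTracker *ACT and a
// ProfileSummaryInfo *PSI could evaluate a call site CS with
//   InlineCost IC = getInlineCost(CS, getDefaultInlineThreshold(), TTI, ACT, PSI);
//   if (IC)
//     ; // cost is under the threshold (or inlining is forced): try to inline
// relying on InlineCost's boolean conversion to report the decision.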
1454 
1455 int llvm::computeThresholdFromOptLevels(unsigned OptLevel,
1456                                         unsigned SizeOptLevel) {
1457   if (OptLevel > 2)
1458     return OptAggressiveThreshold;
1459   if (SizeOptLevel == 1) // -Os
1460     return OptSizeThreshold;
1461   if (SizeOptLevel == 2) // -Oz
1462     return OptMinSizeThreshold;
1463   return DefaultInlineThreshold;
1464 }
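// For example, computeThresholdFromOptLevels(3, 0) yields
// OptAggressiveThreshold (-O3 and above), (2, 1) yields OptSizeThreshold
// (-Os), (2, 2) yields OptMinSizeThreshold (-Oz), and any other combination
// falls back to the -inline-threshold value held in DefaultInlineThreshold.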
1465 
1466 int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; }
1467 
1468 InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
1469                                int DefaultThreshold,
1470                                TargetTransformInfo &CalleeTTI,
1471                                AssumptionCacheTracker *ACT,
1472                                ProfileSummaryInfo *PSI) {
1473 
1474   // Cannot inline indirect calls.
1475   if (!Callee)
1476     return llvm::InlineCost::getNever();
1477 
1478   // Calls to functions with always-inline attributes should be inlined
1479   // whenever possible.
1480   if (CS.hasFnAttr(Attribute::AlwaysInline)) {
1481     if (isInlineViable(*Callee))
1482       return llvm::InlineCost::getAlways();
1483     return llvm::InlineCost::getNever();
1484   }
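  // Even an always-inline callee must pass isInlineViable() first: indirect
  // branches, address-taken blocks, self-recursion, newly exposed
  // returns-twice calls, and calls to @llvm.localescape all make inlining
  // impossible regardless of the attribute.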
1485 
1486   // Never inline functions with conflicting attributes (unless callee has
1487   // always-inline attribute).
1488   if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI))
1489     return llvm::InlineCost::getNever();
1490 
1491   // Don't inline this call if the caller has the optnone attribute.
1492   if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
1493     return llvm::InlineCost::getNever();
1494 
1495   // Don't inline functions which can be interposed at link-time.  Don't inline
1496   // functions marked noinline or call sites marked noinline.
1497   // Note: inlining non-exact non-interposable functions is fine, since we know
1498   // we have *a* correct implementation of the source level function.
1499   if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
1500       CS.isNoInline())
1501     return llvm::InlineCost::getNever();
1502 
1503   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
1504                      << "...\n");
1505 
1506   CallAnalyzer CA(CalleeTTI, ACT, PSI, *Callee, DefaultThreshold, CS);
1507   bool ShouldInline = CA.analyzeCall(CS);
1508 
1509   DEBUG(CA.dump());
1510 
1511   // Check if there was a reason to force inlining or no inlining.
1512   if (!ShouldInline && CA.getCost() < CA.getThreshold())
1513     return InlineCost::getNever();
1514   if (ShouldInline && CA.getCost() >= CA.getThreshold())
1515     return InlineCost::getAlways();
1516 
1517   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
1518 }
1519 
1520 bool llvm::isInlineViable(Function &F) {
1521   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
1522   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
1523     // Disallow inlining of functions which contain indirect branches or
1524     // blockaddresses.
1525     if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
1526       return false;
1527 
1528     for (auto &II : *BI) {
1529       CallSite CS(&II);
1530       if (!CS)
1531         continue;
1532 
1533       // Disallow recursive calls.
1534       if (&F == CS.getCalledFunction())
1535         return false;
1536 
1537       // Disallow calls which expose returns-twice to a function not previously
1538       // attributed as such.
1539       if (!ReturnsTwice && CS.isCall() &&
1540           cast<CallInst>(CS.getInstruction())->canReturnTwice())
1541         return false;
1542 
1543       // Disallow inlining functions that call @llvm.localescape. Doing this
1544       // correctly would require major changes to the inliner.
1545       if (CS.getCalledFunction() &&
1546           CS.getCalledFunction()->getIntrinsicID() ==
1547               llvm::Intrinsic::localescape)
1548         return false;
1549     }
1550   }
1551 
1552   return true;
1553 }
1554