1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements inline cost analysis.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "llvm/Analysis/InlineCost.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SetVector.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/AssumptionCache.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/CodeMetrics.h"
23 #include "llvm/Analysis/ConstantFolding.h"
24 #include "llvm/Analysis/CFG.h"
25 #include "llvm/Analysis/InstructionSimplify.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/Analysis/ValueTracking.h"
29 #include "llvm/Config/llvm-config.h"
30 #include "llvm/IR/CallSite.h"
31 #include "llvm/IR/CallingConv.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/GetElementPtrTypeIterator.h"
34 #include "llvm/IR/GlobalAlias.h"
35 #include "llvm/IR/InstVisitor.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/Operator.h"
38 #include "llvm/Support/Debug.h"
39 #include "llvm/Support/raw_ostream.h"
40
41 using namespace llvm;
42
43 #define DEBUG_TYPE "inline-cost"
44
45 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
46
47 static cl::opt<int> InlineThreshold(
48 "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
49 cl::desc("Control the amount of inlining to perform (default = 225)"));
50
51 static cl::opt<int> HintThreshold(
52 "inlinehint-threshold", cl::Hidden, cl::init(325),
53 cl::desc("Threshold for inlining functions with inline hint"));
54
55 static cl::opt<int>
56 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
57 cl::init(45),
58 cl::desc("Threshold for inlining cold callsites"));
59
60 // We introduce this threshold to help performance of instrumentation-based
61 // PGO before we actually hook up the inliner with analysis passes such as
62 // BPI and BFI.
63 static cl::opt<int> ColdThreshold(
64 "inlinecold-threshold", cl::Hidden, cl::init(45),
65 cl::desc("Threshold for inlining functions with cold attribute"));
66
67 static cl::opt<int>
68 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
69 cl::ZeroOrMore,
70 cl::desc("Threshold for hot callsites "));
71
72 static cl::opt<int> LocallyHotCallSiteThreshold(
73 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
74 cl::desc("Threshold for locally hot callsites "));
75
76 static cl::opt<int> ColdCallSiteRelFreq(
77 "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
78 cl::desc("Maxmimum block frequency, expressed as a percentage of caller's "
79 "entry frequency, for a callsite to be cold in the absence of "
80 "profile information."));
81
82 static cl::opt<int> HotCallSiteRelFreq(
83 "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
84 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
85 "entry frequency, for a callsite to be hot in the absence of "
86 "profile information."));
87
88 static cl::opt<bool> OptComputeFullInlineCost(
89 "inline-cost-full", cl::Hidden, cl::init(false),
90 cl::desc("Compute the full inline cost of a call site even when the cost "
91 "exceeds the threshold."));
92
93 namespace {
94
95 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
96 typedef InstVisitor<CallAnalyzer, bool> Base;
97 friend class InstVisitor<CallAnalyzer, bool>;
98
99 /// The TargetTransformInfo available for this compilation.
100 const TargetTransformInfo &TTI;
101
102 /// Getter for the cache of @llvm.assume intrinsics.
103 std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
104
105 /// Getter for BlockFrequencyInfo
106 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
107
108 /// Profile summary information.
109 ProfileSummaryInfo *PSI;
110
111 /// The called function.
112 Function &F;
113
114 // Cache the DataLayout since we use it a lot.
115 const DataLayout &DL;
116
117 /// The OptimizationRemarkEmitter available for this compilation.
118 OptimizationRemarkEmitter *ORE;
119
120 /// The candidate callsite being analyzed. Please do not use this to do
121 /// analysis in the caller function; we want the inline cost query to be
122 /// easily cacheable. Instead, use the cover function paramHasAttr.
123 CallSite CandidateCS;
124
125 /// Tunable parameters that control the analysis.
126 const InlineParams &Params;
127
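/// The threshold in effect for this callsite and the cost accumulated so far.
/// When ComputeFullInlineCost is set (explicitly via -inline-cost-full, via
/// InlineParams, or because an OptimizationRemarkEmitter is attached), the
/// analysis keeps going even after Cost exceeds Threshold so the full cost
/// can be reported.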
128 int Threshold;
129 int Cost;
130 bool ComputeFullInlineCost;
131
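/// Properties of the callee observed while walking its instructions. Several
/// of these are hard stops for inlining: recursive calls, exposing a
/// returns_twice call, dynamic allocas, indirect branches, uninlineable
/// intrinsics, and va_start/va_end usage.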
132 bool IsCallerRecursive;
133 bool IsRecursiveCall;
134 bool ExposesReturnsTwice;
135 bool HasDynamicAlloca;
136 bool ContainsNoDuplicateCall;
137 bool HasReturn;
138 bool HasIndirectBr;
139 bool HasUninlineableIntrinsic;
140 bool UsesVarArgs;
141
142 /// Number of bytes allocated statically by the callee.
143 uint64_t AllocatedSize;
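/// Number of instructions visited so far and how many of them were vector
/// instructions; the vector fraction decides whether the speculative vector
/// bonus is kept.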
144 unsigned NumInstructions, NumVectorInstructions;
145 int VectorBonus, TenPercentVectorBonus;
146 // Bonus to be applied when the callee has only one reachable basic block.
147 int SingleBBBonus;
148
149 /// While we walk the potentially-inlined instructions, we build up and
150 /// maintain a mapping of simplified values specific to this callsite. The
151 /// idea is to propagate any special information we have about arguments to
152 /// this call through the inlinable section of the function, and account for
153 /// likely simplifications post-inlining. The most important aspect we track
154 /// is CFG altering simplifications -- when we prove a basic block dead, that
155 /// can cause dramatic shifts in the cost of inlining a function.
156 DenseMap<Value *, Constant *> SimplifiedValues;
157
158 /// Keep track of the values which map back (through function arguments) to
159 /// allocas on the caller stack which could be simplified through SROA.
160 DenseMap<Value *, Value *> SROAArgValues;
161
162 /// The mapping of caller Alloca values to their accumulated cost savings. If
163 /// we have to disable SROA for one of the allocas, this tells us how much
164 /// cost must be added.
165 DenseMap<Value *, int> SROAArgCosts;
166
167 /// Keep track of values which map to a pointer base and constant offset.
168 DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
169
170 /// Keep track of dead blocks due to the constant arguments.
171 SetVector<BasicBlock *> DeadBlocks;
172
173 /// The mapping of the blocks to their known unique successors due to the
174 /// constant arguments.
175 DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
176
177 /// Model the elimination of repeated loads that is expected to happen
178 /// whenever we simplify away the stores that would otherwise clobber them
179 /// and prevent their elimination.
180 bool EnableLoadElimination;
181 SmallPtrSet<Value *, 16> LoadAddrSet;
182 int LoadEliminationCost;
183
184 // Custom simplification helper routines.
185 bool isAllocaDerivedArg(Value *V);
186 bool lookupSROAArgAndCost(Value *V, Value *&Arg,
187 DenseMap<Value *, int>::iterator &CostIt);
188 void disableSROA(DenseMap<Value *, int>::iterator CostIt);
189 void disableSROA(Value *V);
190 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
191 void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
192 int InstructionCost);
193 void disableLoadElimination();
194 bool isGEPFree(GetElementPtrInst &GEP);
195 bool canFoldInboundsGEP(GetElementPtrInst &I);
196 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
197 bool simplifyCallSite(Function *F, CallSite CS);
198 template <typename Callable>
199 bool simplifyInstruction(Instruction &I, Callable Evaluate);
200 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
201
202 /// Return true if the given argument to the function being considered for
203 /// inlining has the given attribute set either at the call site or the
204 /// function declaration. Primarily used to inspect call site specific
205 /// attributes since these can be more precise than the ones on the callee
206 /// itself.
207 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
208
209 /// Return true if the given value is known non null within the callee if
210 /// inlined through this particular callsite.
211 bool isKnownNonNullInCallee(Value *V);
212
213 /// Update Threshold based on callsite properties such as callee
214 /// attributes and callee hotness for PGO builds. The Callee is explicitly
215 /// passed to support analyzing indirect calls whose target is inferred by
216 /// analysis.
217 void updateThreshold(CallSite CS, Function &Callee);
218
219 /// Return true if size growth is allowed when inlining the callee at CS.
220 bool allowSizeGrowth(CallSite CS);
221
222 /// Return true if \p CS is a cold callsite.
223 bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
224
225 /// Return a higher threshold if \p CS is a hot callsite.
226 Optional<int> getHotCallSiteThreshold(CallSite CS,
227 BlockFrequencyInfo *CallerBFI);
228
229 // Custom analysis routines.
230 bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
231
232 // Disable several entry points to the visitor so we don't accidentally use
233 // them by declaring but not defining them here.
234 void visit(Module *);
235 void visit(Module &);
236 void visit(Function *);
237 void visit(Function &);
238 void visit(BasicBlock *);
239 void visit(BasicBlock &);
240
241 // Provide base case for our instruction visit.
242 bool visitInstruction(Instruction &I);
243
244 // Our visit overrides.
245 bool visitAlloca(AllocaInst &I);
246 bool visitPHI(PHINode &I);
247 bool visitGetElementPtr(GetElementPtrInst &I);
248 bool visitBitCast(BitCastInst &I);
249 bool visitPtrToInt(PtrToIntInst &I);
250 bool visitIntToPtr(IntToPtrInst &I);
251 bool visitCastInst(CastInst &I);
252 bool visitUnaryInstruction(UnaryInstruction &I);
253 bool visitCmpInst(CmpInst &I);
254 bool visitSub(BinaryOperator &I);
255 bool visitBinaryOperator(BinaryOperator &I);
256 bool visitLoad(LoadInst &I);
257 bool visitStore(StoreInst &I);
258 bool visitExtractValue(ExtractValueInst &I);
259 bool visitInsertValue(InsertValueInst &I);
260 bool visitCallSite(CallSite CS);
261 bool visitReturnInst(ReturnInst &RI);
262 bool visitBranchInst(BranchInst &BI);
263 bool visitSelectInst(SelectInst &SI);
264 bool visitSwitchInst(SwitchInst &SI);
265 bool visitIndirectBrInst(IndirectBrInst &IBI);
266 bool visitResumeInst(ResumeInst &RI);
267 bool visitCleanupReturnInst(CleanupReturnInst &RI);
268 bool visitCatchReturnInst(CatchReturnInst &RI);
269 bool visitUnreachableInst(UnreachableInst &I);
270
271 public:
272 CallAnalyzer(const TargetTransformInfo &TTI,
273 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
274 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
275 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
276 Function &Callee, CallSite CSArg, const InlineParams &Params)
277 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
278 PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
279 CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
280 Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
281 Params.ComputeFullInlineCost || ORE),
282 IsCallerRecursive(false), IsRecursiveCall(false),
283 ExposesReturnsTwice(false), HasDynamicAlloca(false),
284 ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
285 HasUninlineableIntrinsic(false), UsesVarArgs(false), AllocatedSize(0),
286 NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
287 SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
288 NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
289 NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
290 NumInstructionsSimplified(0), SROACostSavings(0),
291 SROACostSavingsLost(0) {}
292
293 bool analyzeCall(CallSite CS);
294
295 int getThreshold() { return Threshold; }
296 int getCost() { return Cost; }
297
298 // Keep a bunch of stats about the cost savings found so we can print them
299 // out when debugging.
300 unsigned NumConstantArgs;
301 unsigned NumConstantOffsetPtrArgs;
302 unsigned NumAllocaArgs;
303 unsigned NumConstantPtrCmps;
304 unsigned NumConstantPtrDiffs;
305 unsigned NumInstructionsSimplified;
306 unsigned SROACostSavings;
307 unsigned SROACostSavingsLost;
308
309 void dump();
310 };
311
312 } // namespace
313
314 /// Test whether the given value is an Alloca-derived function argument.
315 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
316 return SROAArgValues.count(V);
317 }
318
319 /// Lookup the SROA-candidate argument and cost iterator which V maps to.
320 /// Returns false if V does not map to a SROA-candidate.
321 bool CallAnalyzer::lookupSROAArgAndCost(
322 Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
323 if (SROAArgValues.empty() || SROAArgCosts.empty())
324 return false;
325
326 DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
327 if (ArgIt == SROAArgValues.end())
328 return false;
329
330 Arg = ArgIt->second;
331 CostIt = SROAArgCosts.find(Arg);
332 return CostIt != SROAArgCosts.end();
333 }
334
335 /// Disable SROA for the candidate marked by this cost iterator.
336 ///
337 /// This marks the candidate as no longer viable for SROA, and adds the cost
338 /// savings associated with it back into the inline cost measurement.
339 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
340 // If we're no longer able to perform SROA we need to undo its cost savings
341 // and prevent subsequent analysis.
342 Cost += CostIt->second;
343 SROACostSavings -= CostIt->second;
344 SROACostSavingsLost += CostIt->second;
345 SROAArgCosts.erase(CostIt);
346 disableLoadElimination();
347 }
348
349 /// If 'V' maps to a SROA candidate, disable SROA for it.
350 void CallAnalyzer::disableSROA(Value *V) {
351 Value *SROAArg;
352 DenseMap<Value *, int>::iterator CostIt;
353 if (lookupSROAArgAndCost(V, SROAArg, CostIt))
354 disableSROA(CostIt);
355 }
356
357 /// Accumulate the given cost for a particular SROA candidate.
358 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
359 int InstructionCost) {
360 CostIt->second += InstructionCost;
361 SROACostSavings += InstructionCost;
362 }
363
364 void CallAnalyzer::disableLoadElimination() {
365 if (EnableLoadElimination) {
366 Cost += LoadEliminationCost;
367 LoadEliminationCost = 0;
368 EnableLoadElimination = false;
369 }
370 }
371
372 /// Accumulate a constant GEP offset into an APInt if possible.
373 ///
374 /// Returns false if unable to compute the offset for any reason. Respects any
375 /// simplified values known during the analysis of this callsite.
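/// For example, for "getelementptr inbounds %struct.S, %struct.S* %p, i64 1,
/// i32 2" with both indices constant, Offset grows by the alloc size of
/// %struct.S plus the byte offset of field 2 within it.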
376 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
377 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
378 assert(IntPtrWidth == Offset.getBitWidth());
379
380 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
381 GTI != GTE; ++GTI) {
382 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
383 if (!OpC)
384 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
385 OpC = dyn_cast<ConstantInt>(SimpleOp);
386 if (!OpC)
387 return false;
388 if (OpC->isZero())
389 continue;
390
391 // Handle a struct index, which adds its field offset to the pointer.
392 if (StructType *STy = GTI.getStructTypeOrNull()) {
393 unsigned ElementIdx = OpC->getZExtValue();
394 const StructLayout *SL = DL.getStructLayout(STy);
395 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
396 continue;
397 }
398
399 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
400 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
401 }
402 return true;
403 }
404
405 /// Use TTI to check whether a GEP is free.
406 ///
407 /// Respects any simplified values known during the analysis of this callsite.
408 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
409 SmallVector<Value *, 4> Operands;
410 Operands.push_back(GEP.getOperand(0));
411 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
412 if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
413 Operands.push_back(SimpleOp);
414 else
415 Operands.push_back(*I);
416 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
417 }
418
419 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
420 // Check whether inlining will turn a dynamic alloca into a static
421 // alloca and handle that case.
422 if (I.isArrayAllocation()) {
423 Constant *Size = SimplifiedValues.lookup(I.getArraySize());
424 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
425 Type *Ty = I.getAllocatedType();
426 AllocatedSize = SaturatingMultiplyAdd(
427 AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
428 return Base::visitAlloca(I);
429 }
430 }
431
432 // Accumulate the allocated size.
433 if (I.isStaticAlloca()) {
434 Type *Ty = I.getAllocatedType();
435 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
436 }
437
438 // We will happily inline static alloca instructions.
439 if (I.isStaticAlloca())
440 return Base::visitAlloca(I);
441
442 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
443 // a variety of reasons, and so we would like to not inline them into
444 // functions which don't currently have a dynamic alloca. This simply
445 // disables inlining altogether in the presence of a dynamic alloca.
446 HasDynamicAlloca = true;
447 return false;
448 }
449
450 bool CallAnalyzer::visitPHI(PHINode &I) {
451 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
452 // though we don't want to propagate its bonuses. The idea is to disable
453 // SROA if it *might* be used in an inappropriate manner.
454
455 // Phi nodes are always zero-cost.
456 // FIXME: Pointer sizes may differ between address spaces, so do we need to
457 // use the correct address space in the call to getPointerSizeInBits here?
458 // Or could we skip the getPointerSizeInBits call completely? As far as I can
459 // see, ZeroOffset is used as a dummy value, so we could probably use any
460 // bit width for it.
461 APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
462 bool CheckSROA = I.getType()->isPointerTy();
463
464 // Track the constant or pointer with constant offset we've seen so far.
465 Constant *FirstC = nullptr;
466 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
467 Value *FirstV = nullptr;
468
469 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
470 BasicBlock *Pred = I.getIncomingBlock(i);
471 // If the incoming block is dead, skip the incoming block.
472 if (DeadBlocks.count(Pred))
473 continue;
474 // If the parent block of phi is not the known successor of the incoming
475 // block, skip the incoming block.
476 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
477 if (KnownSuccessor && KnownSuccessor != I.getParent())
478 continue;
479
480 Value *V = I.getIncomingValue(i);
481 // If the incoming value is this phi itself, skip the incoming value.
482 if (&I == V)
483 continue;
484
485 Constant *C = dyn_cast<Constant>(V);
486 if (!C)
487 C = SimplifiedValues.lookup(V);
488
489 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
490 if (!C && CheckSROA)
491 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
492
493 if (!C && !BaseAndOffset.first)
494 // The incoming value is neither a constant nor a pointer with constant
495 // offset, exit early.
496 return true;
497
498 if (FirstC) {
499 if (FirstC == C)
500 // If we've seen a constant incoming value before and it is the same
501 // constant we see this time, continue checking the next incoming value.
502 continue;
503 // Otherwise early exit because we either see a different constant or saw
504 // a constant before but we have a pointer with constant offset this time.
505 return true;
506 }
507
508 if (FirstV) {
509 // The same logic as above, but check pointer with constant offset here.
510 if (FirstBaseAndOffset == BaseAndOffset)
511 continue;
512 return true;
513 }
514
515 if (C) {
516 // This is the 1st time we've seen a constant, record it.
517 FirstC = C;
518 continue;
519 }
520
521 // The remaining case is that this is the 1st time we've seen a pointer with
522 // constant offset, record it.
523 FirstV = V;
524 FirstBaseAndOffset = BaseAndOffset;
525 }
526
527 // Check if we can map phi to a constant.
528 if (FirstC) {
529 SimplifiedValues[&I] = FirstC;
530 return true;
531 }
532
533 // Check if we can map phi to a pointer with constant offset.
534 if (FirstBaseAndOffset.first) {
535 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
536
537 Value *SROAArg;
538 DenseMap<Value *, int>::iterator CostIt;
539 if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
540 SROAArgValues[&I] = SROAArg;
541 }
542
543 return true;
544 }
545
546 /// Check we can fold GEPs of constant-offset call site argument pointers.
547 /// This requires target data and inbounds GEPs.
548 ///
549 /// \return true if the specified GEP can be folded.
550 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
551 // Check if we have a base + offset for the pointer.
552 std::pair<Value *, APInt> BaseAndOffset =
553 ConstantOffsetPtrs.lookup(I.getPointerOperand());
554 if (!BaseAndOffset.first)
555 return false;
556
557 // Check if the offset of this GEP is constant, and if so accumulate it
558 // into Offset.
559 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
560 return false;
561
562 // Add the result as a new mapping to Base + Offset.
563 ConstantOffsetPtrs[&I] = BaseAndOffset;
564
565 return true;
566 }
567
568 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
569 Value *SROAArg;
570 DenseMap<Value *, int>::iterator CostIt;
571 bool SROACandidate =
572 lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
573
574 // Lambda to check whether a GEP's indices are all constant.
575 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
576 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
577 if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
578 return false;
579 return true;
580 };
581
582 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
583 if (SROACandidate)
584 SROAArgValues[&I] = SROAArg;
585
586 // Constant GEPs are modeled as free.
587 return true;
588 }
589
590 // Variable GEPs will require math and will disable SROA.
591 if (SROACandidate)
592 disableSROA(CostIt);
593 return isGEPFree(I);
594 }
595
596 /// Simplify \p I if its operands are constants and update SimplifiedValues.
597 /// \p Evaluate is a callable specific to instruction type that evaluates the
598 /// instruction when all the operands are constants.
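/// For example, visitBitCast calls this with a lambda that folds the cast via
/// ConstantExpr::getBitCast once the operand has been resolved to a constant,
/// either directly or through SimplifiedValues.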
599 template <typename Callable>
600 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
601 SmallVector<Constant *, 2> COps;
602 for (Value *Op : I.operands()) {
603 Constant *COp = dyn_cast<Constant>(Op);
604 if (!COp)
605 COp = SimplifiedValues.lookup(Op);
606 if (!COp)
607 return false;
608 COps.push_back(COp);
609 }
610 auto *C = Evaluate(COps);
611 if (!C)
612 return false;
613 SimplifiedValues[&I] = C;
614 return true;
615 }
616
617 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
618 // Propagate constants through bitcasts.
619 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
620 return ConstantExpr::getBitCast(COps[0], I.getType());
621 }))
622 return true;
623
624 // Track base/offsets through casts
625 std::pair<Value *, APInt> BaseAndOffset =
626 ConstantOffsetPtrs.lookup(I.getOperand(0));
627 // Casts don't change the offset, just wrap it up.
628 if (BaseAndOffset.first)
629 ConstantOffsetPtrs[&I] = BaseAndOffset;
630
631 // Also look for SROA candidates here.
632 Value *SROAArg;
633 DenseMap<Value *, int>::iterator CostIt;
634 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
635 SROAArgValues[&I] = SROAArg;
636
637 // Bitcasts are always zero cost.
638 return true;
639 }
640
641 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
642 // Propagate constants through ptrtoint.
643 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
644 return ConstantExpr::getPtrToInt(COps[0], I.getType());
645 }))
646 return true;
647
648 // Track base/offset pairs when converted to a plain integer provided the
649 // integer is large enough to represent the pointer.
650 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
651 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
652 if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
653 std::pair<Value *, APInt> BaseAndOffset =
654 ConstantOffsetPtrs.lookup(I.getOperand(0));
655 if (BaseAndOffset.first)
656 ConstantOffsetPtrs[&I] = BaseAndOffset;
657 }
658
659 // This is really weird. Technically, ptrtoint will disable SROA. However,
660 // unless that ptrtoint is *used* somewhere in the live basic blocks after
661 // inlining, it will be nuked, and SROA should proceed. All of the uses which
662 // would block SROA would also block SROA if applied directly to a pointer,
663 // and so we can just add the integer in here. The only places where SROA is
664 // preserved either cannot fire on an integer, or won't in-and-of themselves
665 // disable SROA (ext) w/o some later use that we would see and disable.
666 Value *SROAArg;
667 DenseMap<Value *, int>::iterator CostIt;
668 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
669 SROAArgValues[&I] = SROAArg;
670
671 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
672 }
673
674 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
675 // Propagate constants through inttoptr.
676 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
677 return ConstantExpr::getIntToPtr(COps[0], I.getType());
678 }))
679 return true;
680
681 // Track base/offset pairs when round-tripped through a pointer without
682 // modifications provided the integer is not too large.
683 Value *Op = I.getOperand(0);
684 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
685 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
686 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
687 if (BaseAndOffset.first)
688 ConstantOffsetPtrs[&I] = BaseAndOffset;
689 }
690
691 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
692 Value *SROAArg;
693 DenseMap<Value *, int>::iterator CostIt;
694 if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
695 SROAArgValues[&I] = SROAArg;
696
697 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
698 }
699
700 bool CallAnalyzer::visitCastInst(CastInst &I) {
701 // Propagate constants through casts.
702 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
703 return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
704 }))
705 return true;
706
707 // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
708 disableSROA(I.getOperand(0));
709
710 // If this is a floating-point cast, and the target says this operation
711 // is expensive, this may eventually become a library call. Treat the cost
712 // as such.
713 switch (I.getOpcode()) {
714 case Instruction::FPTrunc:
715 case Instruction::FPExt:
716 case Instruction::UIToFP:
717 case Instruction::SIToFP:
718 case Instruction::FPToUI:
719 case Instruction::FPToSI:
720 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
721 Cost += InlineConstants::CallPenalty;
722 break;
723 default:
724 break;
725 }
726
727 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
728 }
729
730 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
731 Value *Operand = I.getOperand(0);
732 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
733 return ConstantFoldInstOperands(&I, COps[0], DL);
734 }))
735 return true;
736
737 // Disable any SROA on the argument to arbitrary unary operators.
738 disableSROA(Operand);
739
740 return false;
741 }
742
743 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
744 return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
745 }
746
747 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
748 // Does the *call site* have the NonNull attribute set on an argument? We
749 // use the attribute on the call site to memoize any analysis done in the
750 // caller. This will also trip if the callee function has a non-null
751 // parameter attribute, but that's a less interesting case because hopefully
752 // the callee would already have been simplified based on that.
753 if (Argument *A = dyn_cast<Argument>(V))
754 if (paramHasAttr(A, Attribute::NonNull))
755 return true;
756
757 // Is this an alloca in the caller? This is distinct from the attribute case
758 // above because attributes aren't updated within the inliner itself and we
759 // always want to catch the alloca derived case.
760 if (isAllocaDerivedArg(V))
761 // We can actually predict the result of comparisons between an
762 // alloca-derived value and null. Note that this fires regardless of
763 // SROA firing.
764 return true;
765
766 return false;
767 }
768
769 bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
770 // If the normal destination of the invoke or the parent block of the call
771 // site is unreachable-terminated, there is little point in inlining this
772 // unless there is literally zero cost.
773 // FIXME: Note that it is possible that an unreachable-terminated block has a
774 // hot entry. For example, in below scenario inlining hot_call_X() may be
775 // beneficial :
776 // main() {
777 // hot_call_1();
778 // ...
779 // hot_call_N()
780 // exit(0);
781 // }
782 // For now, we are not handling this corner case here as it is rare in real
783 // code. In future, we should elaborate this based on BPI and BFI in more
784 // general threshold adjusting heuristics in updateThreshold().
785 Instruction *Instr = CS.getInstruction();
786 if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
787 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
788 return false;
789 } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
790 return false;
791
792 return true;
793 }
794
795 bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
796 // If global profile summary is available, then callsite's coldness is
797 // determined based on that.
798 if (PSI && PSI->hasProfileSummary())
799 return PSI->isColdCallSite(CS, CallerBFI);
800
801 // Otherwise we need BFI to be available.
802 if (!CallerBFI)
803 return false;
804
805 // Determine if the callsite is cold relative to caller's entry. We could
806 // potentially cache the computation of scaled entry frequency, but the added
807 // complexity is not worth it unless this scaling shows up high in the
808 // profiles.
809 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
810 auto CallSiteBB = CS.getInstruction()->getParent();
811 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
812 auto CallerEntryFreq =
813 CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
814 return CallSiteFreq < CallerEntryFreq * ColdProb;
815 }
816
817 Optional<int>
818 CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
819 BlockFrequencyInfo *CallerBFI) {
820
821 // If global profile summary is available, then callsite's hotness is
822 // determined based on that.
823 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
824 return Params.HotCallSiteThreshold;
825
826 // Otherwise we need BFI to be available and to have a locally hot callsite
827 // threshold.
828 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
829 return None;
830
831 // Determine if the callsite is hot relative to caller's entry. We could
832 // potentially cache the computation of scaled entry frequency, but the added
833 // complexity is not worth it unless this scaling shows up high in the
834 // profiles.
835 auto CallSiteBB = CS.getInstruction()->getParent();
836 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
837 auto CallerEntryFreq = CallerBFI->getEntryFreq();
838 if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
839 return Params.LocallyHotCallSiteThreshold;
840
841 // Otherwise treat it normally.
842 return None;
843 }
844
845 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
846 // If no size growth is allowed for this inlining, set Threshold to 0.
847 if (!allowSizeGrowth(CS)) {
848 Threshold = 0;
849 return;
850 }
851
852 Function *Caller = CS.getCaller();
853
854 // return min(A, B) if B is valid.
855 auto MinIfValid = [](int A, Optional<int> B) {
856 return B ? std::min(A, B.getValue()) : A;
857 };
858
859 // return max(A, B) if B is valid.
860 auto MaxIfValid = [](int A, Optional<int> B) {
861 return B ? std::max(A, B.getValue()) : A;
862 };
863
864 // Various bonus percentages. These are multiplied by Threshold to get the
865 // bonus values.
866 // SingleBBBonus: This bonus is applied if the callee has a single reachable
867 // basic block at the given callsite context. This is speculatively applied
868 // and withdrawn if more than one basic block is seen.
869 //
870 // Vector bonuses: We want to more aggressively inline vector-dense kernels
871 // and apply this bonus based on the percentage of vector instructions. A
872 // bonus is applied if the vector instructions exceed 50% and half that amount
873 // is applied if it exceeds 10%. Note that these bonuses are somewhat
874 // arbitrary and evolved over time by accident as much as because they are
875 // principled bonuses.
876 // FIXME: It would be nice to base the bonus values on something more
877 // scientific.
878 //
879 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
880 // of the last call to a static function as inlining such functions is
881 // guaranteed to reduce code size.
882 //
883 // These bonus percentages may be set to 0 based on properties of the caller
884 // and the callsite.
885 int SingleBBBonusPercent = 50;
886 int VectorBonusPercent = 150;
887 int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
888
889 // Lambda to set all the above bonus and bonus percentages to 0.
890 auto DisallowAllBonuses = [&]() {
891 SingleBBBonusPercent = 0;
892 VectorBonusPercent = 0;
893 LastCallToStaticBonus = 0;
894 };
895
896 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
897 // and reduce the threshold if the caller has the necessary attribute.
898 if (Caller->optForMinSize()) {
899 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
900 // For minsize, we want to disable the single BB bonus and the vector
901 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
902 // a static function will, at the minimum, eliminate the parameter setup and
903 // call/return instructions.
904 SingleBBBonusPercent = 0;
905 VectorBonusPercent = 0;
906 } else if (Caller->optForSize())
907 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
908
909 // Adjust the threshold based on inlinehint attribute and profile based
910 // hotness information if the caller does not have MinSize attribute.
911 if (!Caller->optForMinSize()) {
912 if (Callee.hasFnAttribute(Attribute::InlineHint))
913 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
914
915 // FIXME: After switching to the new passmanager, simplify the logic below
916 // by checking only the callsite hotness/coldness as we will reliably
917 // have local profile information.
918 //
919 // Callsite hotness and coldness can be determined if sample profile is
920 // used (which adds hotness metadata to calls) or if caller's
921 // BlockFrequencyInfo is available.
922 BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
923 auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
924 if (!Caller->optForSize() && HotCallSiteThreshold) {
925 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
926 // FIXME: This should update the threshold only if it exceeds the
927 // current threshold, but AutoFDO + ThinLTO currently relies on this
928 // behavior to prevent inlining of hot callsites during ThinLTO
929 // compile phase.
930 Threshold = HotCallSiteThreshold.getValue();
931 } else if (isColdCallSite(CS, CallerBFI)) {
932 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
933 // Do not apply bonuses for a cold callsite including the
934 // LastCallToStatic bonus. While this bonus might result in code size
935 // reduction, it can cause the size of a non-cold caller to increase
936 // preventing it from being inlined.
937 DisallowAllBonuses();
938 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
939 } else if (PSI) {
940 // Use callee's global profile information only if we have no way of
941 // determining this via callsite information.
942 if (PSI->isFunctionEntryHot(&Callee)) {
943 LLVM_DEBUG(dbgs() << "Hot callee.\n");
944 // If callsite hotness can not be determined, we may still know
945 // that the callee is hot and treat it as a weaker hint for threshold
946 // increase.
947 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
948 } else if (PSI->isFunctionEntryCold(&Callee)) {
949 LLVM_DEBUG(dbgs() << "Cold callee.\n");
950 // Do not apply bonuses for a cold callee including the
951 // LastCallToStatic bonus. While this bonus might result in code size
952 // reduction, it can cause the size of a non-cold caller to increase
953 // preventing it from being inlined.
954 DisallowAllBonuses();
955 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
956 }
957 }
958 }
959
960 // Finally, take the target-specific inlining threshold multiplier into
961 // account.
962 Threshold *= TTI.getInliningThresholdMultiplier();
963
964 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
965 VectorBonus = Threshold * VectorBonusPercent / 100;
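// For example, with a threshold of 225 (the -inline-threshold default) these
// work out to a SingleBBBonus of 112 and a VectorBonus of 337. Both are
// speculative: the single-BB bonus is withdrawn if more than one reachable
// block is seen, and the vector bonus depends on the fraction of vector
// instructions actually found.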
966
967 bool OnlyOneCallAndLocalLinkage =
968 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
969 // If there is only one call of the function, and it has internal linkage,
970 // the cost of inlining it drops dramatically. It may seem odd to update
971 // Cost in updateThreshold, but the bonus depends on the logic in this method.
972 if (OnlyOneCallAndLocalLinkage)
973 Cost -= LastCallToStaticBonus;
974 }
975
976 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
977 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
978 // First try to handle simplified comparisons.
979 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
980 return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
981 }))
982 return true;
983
984 if (I.getOpcode() == Instruction::FCmp)
985 return false;
986
987 // Otherwise look for a comparison between constant offset pointers with
988 // a common base.
989 Value *LHSBase, *RHSBase;
990 APInt LHSOffset, RHSOffset;
991 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
992 if (LHSBase) {
993 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
994 if (RHSBase && LHSBase == RHSBase) {
995 // We have common bases, fold the icmp to a constant based on the
996 // offsets.
997 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
998 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
999 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1000 SimplifiedValues[&I] = C;
1001 ++NumConstantPtrCmps;
1002 return true;
1003 }
1004 }
1005 }
1006
1007 // If the comparison is an equality comparison with null, we can simplify it
1008 // if we know the value (argument) can't be null.
1009 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1010 isKnownNonNullInCallee(I.getOperand(0))) {
1011 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1012 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1013 : ConstantInt::getFalse(I.getType());
1014 return true;
1015 }
1016 // Finally check for SROA candidates in comparisons.
1017 Value *SROAArg;
1018 DenseMap<Value *, int>::iterator CostIt;
1019 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
1020 if (isa<ConstantPointerNull>(I.getOperand(1))) {
1021 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1022 return true;
1023 }
1024
1025 disableSROA(CostIt);
1026 }
1027
1028 return false;
1029 }
1030
1031 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1032 // Try to handle a special case: we can fold computing the difference of two
1033 // constant-related pointers.
1034 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1035 Value *LHSBase, *RHSBase;
1036 APInt LHSOffset, RHSOffset;
1037 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1038 if (LHSBase) {
1039 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1040 if (RHSBase && LHSBase == RHSBase) {
1041 // We have common bases, fold the subtract to a constant based on the
1042 // offsets.
1043 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1044 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1045 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1046 SimplifiedValues[&I] = C;
1047 ++NumConstantPtrDiffs;
1048 return true;
1049 }
1050 }
1051 }
1052
1053 // Otherwise, fall back to the generic logic for simplifying and handling
1054 // instructions.
1055 return Base::visitSub(I);
1056 }
1057
1058 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1059 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1060 Constant *CLHS = dyn_cast<Constant>(LHS);
1061 if (!CLHS)
1062 CLHS = SimplifiedValues.lookup(LHS);
1063 Constant *CRHS = dyn_cast<Constant>(RHS);
1064 if (!CRHS)
1065 CRHS = SimplifiedValues.lookup(RHS);
1066
1067 Value *SimpleV = nullptr;
1068 if (auto FI = dyn_cast<FPMathOperator>(&I))
1069 SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
1070 CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
1071 else
1072 SimpleV =
1073 SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1074
1075 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1076 SimplifiedValues[&I] = C;
1077
1078 if (SimpleV)
1079 return true;
1080
1081 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1082 disableSROA(LHS);
1083 disableSROA(RHS);
1084
1085 // If the instruction is floating point, and the target says this operation
1086 // is expensive, this may eventually become a library call. Treat the cost
1087 // as such.
1088 if (I.getType()->isFloatingPointTy() &&
1089 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1090 Cost += InlineConstants::CallPenalty;
1091
1092 return false;
1093 }
1094
1095 bool CallAnalyzer::visitLoad(LoadInst &I) {
1096 Value *SROAArg;
1097 DenseMap<Value *, int>::iterator CostIt;
1098 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1099 if (I.isSimple()) {
1100 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1101 return true;
1102 }
1103
1104 disableSROA(CostIt);
1105 }
1106
1107 // If the data is already loaded from this address and hasn't been clobbered
1108 // by any stores or calls, this load is likely to be redundant and can be
1109 // eliminated.
1110 if (EnableLoadElimination &&
1111 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1112 LoadEliminationCost += InlineConstants::InstrCost;
1113 return true;
1114 }
1115
1116 return false;
1117 }
1118
1119 bool CallAnalyzer::visitStore(StoreInst &I) {
1120 Value *SROAArg;
1121 DenseMap<Value *, int>::iterator CostIt;
1122 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1123 if (I.isSimple()) {
1124 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1125 return true;
1126 }
1127
1128 disableSROA(CostIt);
1129 }
1130
1131 // The store can potentially clobber loads and prevent repeated loads from
1132 // being eliminated.
1133 // FIXME:
1134 // 1. We can probably keep an initial set of eliminable loads subtracted
1135 // from the cost even when we finally see a store. We just need to disable
1136 // *further* accumulation of elimination savings.
1137 // 2. We should probably at some point thread MemorySSA for the callee into
1138 // this and then use that to actually compute *really* precise savings.
1139 disableLoadElimination();
1140 return false;
1141 }
1142
1143 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1144 // Constant folding for extract value is trivial.
1145 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1146 return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1147 }))
1148 return true;
1149
1150 // SROA can look through these but give them a cost.
1151 return false;
1152 }
1153
1154 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1155 // Constant folding for insert value is trivial.
1156 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1157 return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1158 /*InsertedValueOperand*/ COps[1],
1159 I.getIndices());
1160 }))
1161 return true;
1162
1163 // SROA can look through these but give them a cost.
1164 return false;
1165 }
1166
1167 /// Try to simplify a call site.
1168 ///
1169 /// Takes a concrete function and callsite and tries to actually simplify it by
1170 /// analyzing the arguments and call itself with instsimplify. Returns true if
1171 /// it has simplified the callsite to some other entity (a constant), making it
1172 /// free.
1173 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
1174 // FIXME: Using the instsimplify logic directly for this is inefficient
1175 // because we have to continually rebuild the argument list even when no
1176 // simplifications can be performed. Until that is fixed with remapping
1177 // inside of instsimplify, directly constant fold calls here.
1178 if (!canConstantFoldCallTo(CS, F))
1179 return false;
1180
1181 // Try to re-map the arguments to constants.
1182 SmallVector<Constant *, 4> ConstantArgs;
1183 ConstantArgs.reserve(CS.arg_size());
1184 for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
1185 ++I) {
1186 Constant *C = dyn_cast<Constant>(*I);
1187 if (!C)
1188 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
1189 if (!C)
1190 return false; // This argument doesn't map to a constant.
1191
1192 ConstantArgs.push_back(C);
1193 }
1194 if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
1195 SimplifiedValues[CS.getInstruction()] = C;
1196 return true;
1197 }
1198
1199 return false;
1200 }
1201
1202 bool CallAnalyzer::visitCallSite(CallSite CS) {
1203 if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
1204 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
1205 // This aborts the entire analysis.
1206 ExposesReturnsTwice = true;
1207 return false;
1208 }
1209 if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
1210 ContainsNoDuplicateCall = true;
1211
1212 if (Function *F = CS.getCalledFunction()) {
1213 // When we have a concrete function, first try to simplify it directly.
1214 if (simplifyCallSite(F, CS))
1215 return true;
1216
1217 // Next check if it is an intrinsic we know about.
1218 // FIXME: Lift this into part of the InstVisitor.
1219 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
1220 switch (II->getIntrinsicID()) {
1221 default:
1222 if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
1223 disableLoadElimination();
1224 return Base::visitCallSite(CS);
1225
1226 case Intrinsic::load_relative:
1227 // This is normally lowered to 4 LLVM instructions.
1228 Cost += 3 * InlineConstants::InstrCost;
1229 return false;
1230
1231 case Intrinsic::memset:
1232 case Intrinsic::memcpy:
1233 case Intrinsic::memmove:
1234 disableLoadElimination();
1235 // SROA can usually chew through these intrinsics, but they aren't free.
1236 return false;
1237 case Intrinsic::icall_branch_funnel:
1238 case Intrinsic::localescape:
1239 HasUninlineableIntrinsic = true;
1240 return false;
1241 case Intrinsic::vastart:
1242 case Intrinsic::vaend:
1243 UsesVarArgs = true;
1244 return false;
1245 }
1246 }
1247
1248 if (F == CS.getInstruction()->getFunction()) {
1249 // This flag will fully abort the analysis, so don't bother with anything
1250 // else.
1251 IsRecursiveCall = true;
1252 return false;
1253 }
1254
1255 if (TTI.isLoweredToCall(F)) {
1256 // We account for the average 1 instruction per call argument setup
1257 // here.
1258 Cost += CS.arg_size() * InlineConstants::InstrCost;
1259
1260 // Everything other than inline ASM will also have a significant cost
1261 // merely from making the call.
1262 if (!isa<InlineAsm>(CS.getCalledValue()))
1263 Cost += InlineConstants::CallPenalty;
1264 }
1265
1266 if (!CS.onlyReadsMemory())
1267 disableLoadElimination();
1268 return Base::visitCallSite(CS);
1269 }
1270
1271 // Otherwise we're in a very special case -- an indirect function call. See
1272 // if we can be particularly clever about this.
1273 Value *Callee = CS.getCalledValue();
1274
1275 // First, pay the price of the argument setup. We account for the average
1276 // 1 instruction per call argument setup here.
1277 Cost += CS.arg_size() * InlineConstants::InstrCost;
1278
1279 // Next, check if this happens to be an indirect function call to a known
1280 // function in this inline context. If not, we've done all we can.
1281 Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
1282 if (!F) {
1283 if (!CS.onlyReadsMemory())
1284 disableLoadElimination();
1285 return Base::visitCallSite(CS);
1286 }
1287
1288 // If we have a constant that we are calling as a function, we can peer
1289 // through it and see the function target. This happens not infrequently
1290 // during devirtualization and so we want to give it a hefty bonus for
1291 // inlining, but cap that bonus in the event that inlining wouldn't pan
1292 // out. Pretend to inline the function, with a custom threshold.
1293 auto IndirectCallParams = Params;
1294 IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
1295 CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
1296 IndirectCallParams);
1297 if (CA.analyzeCall(CS)) {
1298 // We were able to inline the indirect call! Subtract the cost from the
1299 // threshold to get the bonus we want to apply, but don't go below zero.
1300 Cost -= std::max(0, CA.getThreshold() - CA.getCost());
1301 }
1302
1303 if (!F->onlyReadsMemory())
1304 disableLoadElimination();
1305 return Base::visitCallSite(CS);
1306 }
1307
1308 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1309 // At least one return instruction will be free after inlining.
1310 bool Free = !HasReturn;
1311 HasReturn = true;
1312 return Free;
1313 }
1314
1315 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1316 // We model unconditional branches as essentially free -- they really
1317 // shouldn't exist at all, but handling them makes the behavior of the
1318 // inliner more regular and predictable. Interestingly, conditional branches
1319 // which will fold away are also free.
1320 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1321 dyn_cast_or_null<ConstantInt>(
1322 SimplifiedValues.lookup(BI.getCondition()));
1323 }
1324
1325 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
1326 bool CheckSROA = SI.getType()->isPointerTy();
1327 Value *TrueVal = SI.getTrueValue();
1328 Value *FalseVal = SI.getFalseValue();
1329
1330 Constant *TrueC = dyn_cast<Constant>(TrueVal);
1331 if (!TrueC)
1332 TrueC = SimplifiedValues.lookup(TrueVal);
1333 Constant *FalseC = dyn_cast<Constant>(FalseVal);
1334 if (!FalseC)
1335 FalseC = SimplifiedValues.lookup(FalseVal);
1336 Constant *CondC =
1337 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
1338
1339 if (!CondC) {
1340 // Select C, X, X => X
1341 if (TrueC == FalseC && TrueC) {
1342 SimplifiedValues[&SI] = TrueC;
1343 return true;
1344 }
1345
1346 if (!CheckSROA)
1347 return Base::visitSelectInst(SI);
1348
1349 std::pair<Value *, APInt> TrueBaseAndOffset =
1350 ConstantOffsetPtrs.lookup(TrueVal);
1351 std::pair<Value *, APInt> FalseBaseAndOffset =
1352 ConstantOffsetPtrs.lookup(FalseVal);
1353 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
1354 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
1355
1356 Value *SROAArg;
1357 DenseMap<Value *, int>::iterator CostIt;
1358 if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
1359 SROAArgValues[&SI] = SROAArg;
1360 return true;
1361 }
1362
1363 return Base::visitSelectInst(SI);
1364 }
1365
1366 // Select condition is a constant.
1367 Value *SelectedV = CondC->isAllOnesValue()
1368 ? TrueVal
1369 : (CondC->isNullValue()) ? FalseVal : nullptr;
1370 if (!SelectedV) {
1371 // Condition is a vector constant that is not all 1s or all 0s. If all
1372 // operands are constants, ConstantExpr::getSelect() can handle the cases
1373 // such as select vectors.
1374 if (TrueC && FalseC) {
1375 if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
1376 SimplifiedValues[&SI] = C;
1377 return true;
1378 }
1379 }
1380 return Base::visitSelectInst(SI);
1381 }
1382
1383 // Condition is either all 1s or all 0s. SI can be simplified.
1384 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
1385 SimplifiedValues[&SI] = SelectedC;
1386 return true;
1387 }
1388
1389 if (!CheckSROA)
1390 return true;
1391
1392 std::pair<Value *, APInt> BaseAndOffset =
1393 ConstantOffsetPtrs.lookup(SelectedV);
1394 if (BaseAndOffset.first) {
1395 ConstantOffsetPtrs[&SI] = BaseAndOffset;
1396
1397 Value *SROAArg;
1398 DenseMap<Value *, int>::iterator CostIt;
1399 if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
1400 SROAArgValues[&SI] = SROAArg;
1401 }
1402
1403 return true;
1404 }
1405
1406 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
1407 // We model unconditional switches as free, see the comments on handling
1408 // branches.
1409 if (isa<ConstantInt>(SI.getCondition()))
1410 return true;
1411 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1412 if (isa<ConstantInt>(V))
1413 return true;
1414
1415 // Assume the most general case where the switch is lowered into
1416 // either a jump table, bit test, or a balanced binary tree consisting of
1417 // case clusters without merging adjacent clusters with the same
1418 // destination. We do not consider the switches that are lowered with a mix
1419 // of jump table/bit test/binary search tree. The cost of the switch is
1420 // proportional to the size of the tree or the size of jump table range.
1421 //
1422 // NB: We convert large switches which are just used to initialize large phi
1423 // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1424 // inlining those. It will prevent inlining in cases where the optimization
1425 // does not (yet) fire.
1426
1427 // Maximum valid cost increase in this function.
1428 int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
1429
1430 // Exit early for a large switch, assuming one case needs at least one
1431 // instruction.
1432 // FIXME: This is not true for a bit test, but ignore such cases for now to
1433 // save compile time.
1434 int64_t CostLowerBound =
1435 std::min((int64_t)CostUpperBound,
1436 (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
1437
1438 if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
1439 Cost = CostLowerBound;
1440 return false;
1441 }
1442
1443 unsigned JumpTableSize = 0;
1444 unsigned NumCaseCluster =
1445 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
1446
1447 // If suitable for a jump table, consider the cost of the table size and
1448 // the branch to the destination.
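  // As a rough illustration (assumed numbers, not from the source): a switch
  // lowered to a 16-entry jump table would be charged
  //   JTCost = 16 * InstrCost + 4 * InstrCost = 20 * InstrCost
  // by the computation below, where the fixed 4 * InstrCost term is taken to
  // cover the range check and the indirect branch.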
1449 if (JumpTableSize) {
1450 int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
1451 4 * InlineConstants::InstrCost;
1452
1453 Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
1454 return false;
1455 }
1456
1457 // When a binary search tree is formed, the number of nodes in the tree
1458 // equals the number of comparisons when lowered. For a given number of
1459 // clusters, n, we can define a recursive function, f(n), to compute the
1460 // number of nodes in the tree. The recursion is:
1461 // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
1462 // and f(n) = n, when n <= 3.
1463 // This leads to a binary tree whose leaves are either f(2) or f(3) when
1464 // n > 3. So the number of comparisons contributed by the leaves is n, while
1465 // the number of non-leaf nodes is:
1466 // 2^(log2(n) - 1) - 1
1467 // = 2^log2(n) * 2^-1 - 1
1468 // = n / 2 - 1.
1469 // Adding the comparisons from leaf and non-leaf nodes, we can estimate the
1470 // total number of comparisons in a simple closed form:
1471 // n + n / 2 - 1 = n * 3 / 2 - 1.
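  // Worked example (illustrative only): for NumCaseCluster == 8 this gives
  // 3 * 8 / 2 - 1 = 11 expected comparisons; each comparison is charged below
  // as one compare plus one conditional branch, i.e.
  //   SwitchCost = 11 * 2 * InlineConstants::InstrCost.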
1472 if (NumCaseCluster <= 3) {
1473 // Suppose a comparison includes one compare and one conditional branch.
1474 Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
1475 return false;
1476 }
1477
1478 int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
1479 int64_t SwitchCost =
1480 ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1481
1482 Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
1483 return false;
1484 }
1485
1486 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1487 // We never want to inline functions that contain an indirectbr. This is
1488 // incorrect because all the blockaddress's (in static global initializers
1489 // for example) would be referring to the original function, and this
1490 // indirect jump would jump from the inlined copy of the function into the
1491 // original function which is extremely undefined behavior.
1492 // FIXME: This logic isn't really right; we can safely inline functions with
1493 // indirectbr's as long as no other function or global references the
1494 // blockaddress of a block within the current function.
1495 HasIndirectBr = true;
1496 return false;
1497 }
1498
1499 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1500 // FIXME: It's not clear that a single instruction is an accurate model for
1501 // the inline cost of a resume instruction.
1502 return false;
1503 }
1504
1505 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1506 // FIXME: It's not clear that a single instruction is an accurate model for
1507 // the inline cost of a cleanupret instruction.
1508 return false;
1509 }
1510
1511 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1512 // FIXME: It's not clear that a single instruction is an accurate model for
1513 // the inline cost of a catchret instruction.
1514 return false;
1515 }
1516
1517 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1518 // FIXME: It might be reasonable to discount the cost of instructions leading
1519 // to unreachable as they have the lowest possible impact on both runtime and
1520 // code size.
1521 return true; // No actual code is needed for unreachable.
1522 }
1523
1524 bool CallAnalyzer::visitInstruction(Instruction &I) {
1525 // Some instructions are free. All of the free intrinsics can also be
1526 // handled by SROA, etc.
1527 if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1528 return true;
1529
1530 // We found something we don't understand or can't handle. Mark any SROA-able
1531 // values in the operand list as no longer viable.
1532 for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1533 disableSROA(*OI);
1534
1535 return false;
1536 }
1537
1538 /// Analyze a basic block for its contribution to the inline cost.
1539 ///
1540 /// This method walks the analyzer over every instruction in the given basic
1541 /// block and accounts for their cost during inlining at this callsite. It
1542 /// aborts early if the threshold has been exceeded or an impossible-to-inline
1543 /// construct has been detected. It returns false if inlining is no longer
1544 /// viable, and true if inlining remains viable.
1545 bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
1546 SmallPtrSetImpl<const Value *> &EphValues) {
1547 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1548 // FIXME: Currently, the number of instructions in a function, regardless of
1549 // our ability to simplify them to constants or dead code during inlining,
1550 // is used by the vector bonus heuristic. As long as that's true,
1551 // we have to special-case debug intrinsics here to prevent differences in
1552 // inlining due to debug symbols. Eventually, the number of unsimplified
1553 // instructions shouldn't factor into the cost computation, but until then,
1554 // hack around it here.
1555 if (isa<DbgInfoIntrinsic>(I))
1556 continue;
1557
1558 // Skip ephemeral values.
1559 if (EphValues.count(&*I))
1560 continue;
1561
1562 ++NumInstructions;
1563 if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1564 ++NumVectorInstructions;
1565
1566 // If the instruction simplified to a constant, there is no cost to this
1567 // instruction. Visit the instructions using our InstVisitor to account for
1568 // all of the per-instruction logic. The visit tree returns true if we
1569 // consumed the instruction in any way, and false if the instruction's base
1570 // cost should count against inlining.
1571 if (Base::visit(&*I))
1572 ++NumInstructionsSimplified;
1573 else
1574 Cost += InlineConstants::InstrCost;
1575
1576 using namespace ore;
1577 // If visiting this instruction detected an uninlinable pattern, abort.
1578 if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
1579 HasIndirectBr || HasUninlineableIntrinsic || UsesVarArgs) {
1580 if (ORE)
1581 ORE->emit([&]() {
1582 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1583 CandidateCS.getInstruction())
1584 << NV("Callee", &F)
1585 << " has uninlinable pattern and cost is not fully computed";
1586 });
1587 return false;
1588 }
1589
1590 // If the caller is a recursive function then we don't want to inline
1591 // functions which allocate a lot of stack space because it would increase
1592 // the caller stack usage dramatically.
1593 if (IsCallerRecursive &&
1594 AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1595 if (ORE)
1596 ORE->emit([&]() {
1597 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1598 CandidateCS.getInstruction())
1599 << NV("Callee", &F)
1600 << " is recursive and allocates too much stack space. Cost is "
1601 "not fully computed";
1602 });
1603 return false;
1604 }
1605
1606 // Check if we've passed the maximum possible threshold so we don't spin in
1607 // huge basic blocks that will never inline.
1608 if (Cost >= Threshold && !ComputeFullInlineCost)
1609 return false;
1610 }
1611
1612 return true;
1613 }
1614
1615 /// Compute the base pointer and cumulative constant offsets for V.
1616 ///
1617 /// This strips all constant offsets off of V, leaving it the base pointer, and
1618 /// accumulates the total constant offset applied in the returned constant. It
1619 /// returns null if V is not a pointer, and returns the constant '0' if there are
1620 /// no constant offsets applied.
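/// A minimal illustration (hypothetical IR, not from this file): given
///   %p = getelementptr inbounds [4 x i32], [4 x i32]* %base, i32 0, i32 2
/// passing %p here rewrites V to %base and returns a ConstantInt holding the
/// byte offset 8 (two i32 elements past the start).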
1621 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1622 if (!V->getType()->isPointerTy())
1623 return nullptr;
1624
1625 unsigned AS = V->getType()->getPointerAddressSpace();
1626 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
1627 APInt Offset = APInt::getNullValue(IntPtrWidth);
1628
1629 // Even though we don't look through PHI nodes, we could be called on an
1630 // instruction in an unreachable block, which may be on a cycle.
1631 SmallPtrSet<Value *, 4> Visited;
1632 Visited.insert(V);
1633 do {
1634 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1635 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1636 return nullptr;
1637 V = GEP->getPointerOperand();
1638 } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1639 V = cast<Operator>(V)->getOperand(0);
1640 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1641 if (GA->isInterposable())
1642 break;
1643 V = GA->getAliasee();
1644 } else {
1645 break;
1646 }
1647 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1648 } while (Visited.insert(V).second);
1649
1650 Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
1651 return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1652 }
1653
1654 /// Find dead blocks due to deleted CFG edges during inlining.
1655 ///
1656 /// If we know the successor of the current block, \p CurrBB, has to be \p
1657 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1658 /// no live incoming CFG edges. If one block is found to be dead, we can
1659 /// continue growing the dead block list by checking the successors of the dead
1660 /// blocks to see if all their incoming edges are dead or not.
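/// Illustrative sketch (hypothetical CFG, not taken from the source):
///
///   CurrBB --(condition folded, taken)--> NextBB
///      \----(dead edge)---------------->   S   -->  T
///
/// If S has no other live predecessor it becomes newly dead, and T, reachable
/// only through S, is added to DeadBlocks on a later iteration of the loop
/// below.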
1661 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1662 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1663 // A CFG edge is dead if the predecessor is dead or the predecessor has a
1664 // known successor which is not the one under examination.
1665 return (DeadBlocks.count(Pred) ||
1666 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1667 };
1668
1669 auto IsNewlyDead = [&](BasicBlock *BB) {
1670 // If all the edges to a block are dead, the block is also dead.
1671 return (!DeadBlocks.count(BB) &&
1672 llvm::all_of(predecessors(BB),
1673 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1674 };
1675
1676 for (BasicBlock *Succ : successors(CurrBB)) {
1677 if (Succ == NextBB || !IsNewlyDead(Succ))
1678 continue;
1679 SmallVector<BasicBlock *, 4> NewDead;
1680 NewDead.push_back(Succ);
1681 while (!NewDead.empty()) {
1682 BasicBlock *Dead = NewDead.pop_back_val();
1683 if (DeadBlocks.insert(Dead))
1684 // Continue growing the dead block lists.
1685 for (BasicBlock *S : successors(Dead))
1686 if (IsNewlyDead(S))
1687 NewDead.push_back(S);
1688 }
1689 }
1690 }
1691
1692 /// Analyze a call site for potential inlining.
1693 ///
1694 /// Returns true if inlining this call is viable, and false if it is not
1695 /// viable. It computes the cost and adjusts the threshold based on numerous
1696 /// factors and heuristics. If this method returns false but the computed cost
1697 /// is below the computed threshold, then inlining was forcibly disabled by
1698 /// some artifact of the routine.
1699 bool CallAnalyzer::analyzeCall(CallSite CS) {
1700 ++NumCallsAnalyzed;
1701
1702 // Perform some tweaks to the cost and threshold based on the direct
1703 // callsite information.
1704
1705 // We want to more aggressively inline vector-dense kernels, so up the
1706 // threshold, and we'll lower it if the % of vector instructions gets too
1707 // low. Note that these bonuses are somewhat arbitrary and have evolved over time
1708 // by accident as much as because they are principled bonuses.
1709 //
1710 // FIXME: It would be nice to remove all such bonuses. At least it would be
1711 // nice to base the bonus values on something more scientific.
1712 assert(NumInstructions == 0);
1713 assert(NumVectorInstructions == 0);
1714
1715 // Update the threshold based on callsite properties
1716 updateThreshold(CS, F);
1717
1718 // Speculatively apply all possible bonuses to Threshold. If the cost exceeds
1719 // this Threshold at any time, and the cost cannot decrease, we can stop
1720 // processing the rest of the function body.
1721 Threshold += (SingleBBBonus + VectorBonus);
1722
1723 // Give out bonuses for the callsite, as the instructions setting them up
1724 // will be gone after inlining.
1725 Cost -= getCallsiteCost(CS, DL);
1726
1727 // If this function uses the coldcc calling convention, prefer not to inline
1728 // it.
1729 if (F.getCallingConv() == CallingConv::Cold)
1730 Cost += InlineConstants::ColdccPenalty;
1731
1732 // Check if we're done. This can happen due to bonuses and penalties.
1733 if (Cost >= Threshold && !ComputeFullInlineCost)
1734 return false;
1735
1736 if (F.empty())
1737 return true;
1738
1739 Function *Caller = CS.getInstruction()->getFunction();
1740 // Check if the caller function is recursive itself.
1741 for (User *U : Caller->users()) {
1742 CallSite Site(U);
1743 if (!Site)
1744 continue;
1745 Instruction *I = Site.getInstruction();
1746 if (I->getFunction() == Caller) {
1747 IsCallerRecursive = true;
1748 break;
1749 }
1750 }
1751
1752 // Populate our simplified values by mapping from function arguments to call
1753 // arguments with known important simplifications.
1754 CallSite::arg_iterator CAI = CS.arg_begin();
1755 for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
1756 FAI != FAE; ++FAI, ++CAI) {
1757 assert(CAI != CS.arg_end());
1758 if (Constant *C = dyn_cast<Constant>(CAI))
1759 SimplifiedValues[&*FAI] = C;
1760
1761 Value *PtrArg = *CAI;
1762 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
1763 ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
1764
1765 // We can SROA any pointer arguments derived from alloca instructions.
1766 if (isa<AllocaInst>(PtrArg)) {
1767 SROAArgValues[&*FAI] = PtrArg;
1768 SROAArgCosts[PtrArg] = 0;
1769 }
1770 }
1771 }
1772 NumConstantArgs = SimplifiedValues.size();
1773 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
1774 NumAllocaArgs = SROAArgValues.size();
1775
1776 // FIXME: If a caller has multiple calls to a callee, we end up recomputing
1777 // the ephemeral values multiple times (and they're completely determined by
1778 // the callee, so this is purely duplicate work).
1779 SmallPtrSet<const Value *, 32> EphValues;
1780 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
1781
1782 // The worklist of live basic blocks in the callee *after* inlining. We avoid
1783 // adding basic blocks of the callee which can be proven to be dead for this
1784 // particular call site in order to get more accurate cost estimates. This
1785 // requires a somewhat heavyweight iteration pattern: we need to walk the
1786 // basic blocks in a breadth-first order as we insert live successors. To
1787 // accomplish this, and because we expect to exit early after crossing our
1788 // threshold, we use a SetVector optimized for small sizes.
1789 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1790 SmallPtrSet<BasicBlock *, 16>>
1791 BBSetVector;
1792 BBSetVector BBWorklist;
1793 BBWorklist.insert(&F.getEntryBlock());
1794 bool SingleBB = true;
1795 // Note that we *must not* cache the size, this loop grows the worklist.
1796 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1797 // Bail out the moment we cross the threshold. This means we'll under-count
1798 // the cost, but only when undercounting doesn't matter.
1799 if (Cost >= Threshold && !ComputeFullInlineCost)
1800 break;
1801
1802 BasicBlock *BB = BBWorklist[Idx];
1803 if (BB->empty())
1804 continue;
1805
1806 // Disallow inlining a blockaddress. A blockaddress only has defined
1807 // behavior for an indirect branch in the same function, and we do not
1808 // currently support inlining indirect branches. But, the inliner may not
1809 // see an indirect branch that ends up being dead code at a particular call
1810 // site. If the blockaddress escapes the function, e.g., via a global
1811 // variable, inlining may lead to an invalid cross-function reference.
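    // A small illustrative sketch (assumed IR, not from this file): a global
    //   @tbl = global i8* blockaddress(@callee, %bb)
    // marks %bb in @callee as address-taken even when the indirectbr that
    // consumes @tbl is dead at this particular call site, so we conservatively
    // refuse to inline.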
1812 if (BB->hasAddressTaken())
1813 return false;
1814
1815 // Analyze the cost of this block. If we blow through the threshold, this
1816 // returns false, and we can bail out.
1817 if (!analyzeBlock(BB, EphValues))
1818 return false;
1819
1820 TerminatorInst *TI = BB->getTerminator();
1821
1822 // Add in the live successors by first checking whether we have a terminator
1823 // that may be simplified based on the values simplified by this call.
1824 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1825 if (BI->isConditional()) {
1826 Value *Cond = BI->getCondition();
1827 if (ConstantInt *SimpleCond =
1828 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1829 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
1830 BBWorklist.insert(NextBB);
1831 KnownSuccessors[BB] = NextBB;
1832 findDeadBlocks(BB, NextBB);
1833 continue;
1834 }
1835 }
1836 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1837 Value *Cond = SI->getCondition();
1838 if (ConstantInt *SimpleCond =
1839 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1840 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
1841 BBWorklist.insert(NextBB);
1842 KnownSuccessors[BB] = NextBB;
1843 findDeadBlocks(BB, NextBB);
1844 continue;
1845 }
1846 }
1847
1848 // If we're unable to select a particular successor, just count all of
1849 // them.
1850 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1851 ++TIdx)
1852 BBWorklist.insert(TI->getSuccessor(TIdx));
1853
1854 // If we had any successors at this point, then post-inlining is likely to
1855 // have them as well. Note that we assume any basic blocks which existed
1856 // due to branches or switches which folded above will also fold after
1857 // inlining.
1858 if (SingleBB && TI->getNumSuccessors() > 1) {
1859 // Take off the bonus we applied to the threshold.
1860 Threshold -= SingleBBBonus;
1861 SingleBB = false;
1862 }
1863 }
1864
1865 bool OnlyOneCallAndLocalLinkage =
1866 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
1867 // If this is a noduplicate call, we can still inline as long as
1868 // inlining this would cause the removal of the caller (so the instruction
1869 // is not actually duplicated, just moved).
1870 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1871 return false;
1872
1873 // We applied the maximum possible vector bonus at the beginning. Now,
1874 // subtract the excess bonus, if any, from the Threshold before
1875 // comparing against Cost.
1876 if (NumVectorInstructions <= NumInstructions / 10)
1877 Threshold -= VectorBonus;
1878 else if (NumVectorInstructions <= NumInstructions / 2)
1879 Threshold -= VectorBonus / 2;
1880
1881 return Cost < std::max(1, Threshold);
1882 }
1883
1884 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1885 /// Dump stats about this call's analysis.
1886 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1887 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
1888 DEBUG_PRINT_STAT(NumConstantArgs);
1889 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1890 DEBUG_PRINT_STAT(NumAllocaArgs);
1891 DEBUG_PRINT_STAT(NumConstantPtrCmps);
1892 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1893 DEBUG_PRINT_STAT(NumInstructionsSimplified);
1894 DEBUG_PRINT_STAT(NumInstructions);
1895 DEBUG_PRINT_STAT(SROACostSavings);
1896 DEBUG_PRINT_STAT(SROACostSavingsLost);
1897 DEBUG_PRINT_STAT(LoadEliminationCost);
1898 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1899 DEBUG_PRINT_STAT(Cost);
1900 DEBUG_PRINT_STAT(Threshold);
1901 #undef DEBUG_PRINT_STAT
1902 }
1903 #endif
1904
1905 /// Test that there are no attribute conflicts between Caller and Callee
1906 /// that prevent inlining.
1907 static bool functionsHaveCompatibleAttributes(Function *Caller,
1908 Function *Callee,
1909 TargetTransformInfo &TTI) {
1910 return TTI.areInlineCompatible(Caller, Callee) &&
1911 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1912 }
1913
1914 int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
1915 int Cost = 0;
1916 for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
1917 if (CS.isByValArgument(I)) {
1918 // We approximate the number of loads and stores needed by dividing the
1919 // size of the byval type by the target's pointer size.
1920 PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1921 unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1922 unsigned AS = PTy->getAddressSpace();
1923 unsigned PointerSize = DL.getPointerSizeInBits(AS);
1924 // Ceiling division.
1925 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1926
1927 // If it generates more than 8 stores it is likely to be expanded as an
1928 // inline memcpy so we take that as an upper bound. Otherwise we assume
1929 // one load and one store per word copied.
1930 // FIXME: The maxStoresPerMemcpy setting from the target should be used
1931 // here instead of a magic number of 8, but it's not available via
1932 // DataLayout.
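      // Worked example (illustrative numbers only): a 256-bit byval struct on
      // a target with 64-bit pointers gives NumStores = (256 + 63) / 64 = 4,
      // so the copy is charged 2 * 4 * InlineConstants::InstrCost below.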
1933 NumStores = std::min(NumStores, 8U);
1934
1935 Cost += 2 * NumStores * InlineConstants::InstrCost;
1936 } else {
1937 // For non-byval arguments subtract off one instruction per call
1938 // argument.
1939 Cost += InlineConstants::InstrCost;
1940 }
1941 }
1942 // The call instruction also disappears after inlining.
1943 Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
1944 return Cost;
1945 }
1946
1947 InlineCost llvm::getInlineCost(
1948 CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
1949 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1950 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1951 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1952 return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
1953 GetAssumptionCache, GetBFI, PSI, ORE);
1954 }
1955
1956 InlineCost llvm::getInlineCost(
1957 CallSite CS, Function *Callee, const InlineParams &Params,
1958 TargetTransformInfo &CalleeTTI,
1959 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1960 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1961 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1962
1963 // Cannot inline indirect calls.
1964 if (!Callee)
1965 return llvm::InlineCost::getNever();
1966
1967 // Never inline calls with byval arguments that do not have the alloca
1968 // address space. Since byval arguments can be replaced with a copy to an
1969 // alloca, the inlined code would need to be adjusted to handle that the
1970 // argument is in the alloca address space (so it is a little bit complicated
1971 // to solve).
1972 unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
1973 for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
1974 if (CS.isByValArgument(I)) {
1975 PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1976 if (PTy->getAddressSpace() != AllocaAS)
1977 return llvm::InlineCost::getNever();
1978 }
1979
1980 // Calls to functions with always-inline attributes should be inlined
1981 // whenever possible.
1982 if (CS.hasFnAttr(Attribute::AlwaysInline)) {
1983 if (isInlineViable(*Callee))
1984 return llvm::InlineCost::getAlways();
1985 return llvm::InlineCost::getNever();
1986 }
1987
1988 // Never inline functions with conflicting attributes (unless the callee has
1989 // the always-inline attribute).
1990 Function *Caller = CS.getCaller();
1991 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
1992 return llvm::InlineCost::getNever();
1993
1994 // Don't inline this call if the caller has the optnone attribute.
1995 if (Caller->hasFnAttribute(Attribute::OptimizeNone))
1996 return llvm::InlineCost::getNever();
1997
1998 // Don't inline a function that treats null pointer as valid into a caller
1999 // that does not have this attribute.
2000 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2001 return llvm::InlineCost::getNever();
2002
2003 // Don't inline functions which can be interposed at link-time. Don't inline
2004 // functions marked noinline or call sites marked noinline.
2005 // Note: inlining non-exact non-interposable functions is fine, since we know
2006 // we have *a* correct implementation of the source level function.
2007 if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
2008 CS.isNoInline())
2009 return llvm::InlineCost::getNever();
2010
2011 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
2012 << "... (caller:" << Caller->getName() << ")\n");
2013
2014 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
2015 Params);
2016 bool ShouldInline = CA.analyzeCall(CS);
2017
2018 LLVM_DEBUG(CA.dump());
2019
2020 // Check if there was a reason to force inlining or no inlining.
2021 if (!ShouldInline && CA.getCost() < CA.getThreshold())
2022 return InlineCost::getNever();
2023 if (ShouldInline && CA.getCost() >= CA.getThreshold())
2024 return InlineCost::getAlways();
2025
2026 return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2027 }
2028
2029 bool llvm::isInlineViable(Function &F) {
2030 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2031 for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2032 // Disallow inlining of functions which contain indirect branches or
2033 // blockaddresses.
2034 if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
2035 return false;
2036
2037 for (auto &II : *BI) {
2038 CallSite CS(&II);
2039 if (!CS)
2040 continue;
2041
2042 // Disallow recursive calls.
2043 if (&F == CS.getCalledFunction())
2044 return false;
2045
2046 // Disallow calls which expose returns-twice to a function not previously
2047 // attributed as such.
2048 if (!ReturnsTwice && CS.isCall() &&
2049 cast<CallInst>(CS.getInstruction())->canReturnTwice())
2050 return false;
2051
2052 if (CS.getCalledFunction())
2053 switch (CS.getCalledFunction()->getIntrinsicID()) {
2054 default:
2055 break;
2056 // Disallow inlining of @llvm.icall.branch.funnel because the current
2057 // backend can't separate call targets from call arguments.
2058 case llvm::Intrinsic::icall_branch_funnel:
2059 // Disallow inlining functions that call @llvm.localescape. Doing this
2060 // correctly would require major changes to the inliner.
2061 case llvm::Intrinsic::localescape:
2062 // Disallow inlining of functions that access VarArgs.
2063 case llvm::Intrinsic::vastart:
2064 case llvm::Intrinsic::vaend:
2065 return false;
2066 }
2067 }
2068 }
2069
2070 return true;
2071 }
2072
2073 // APIs to create InlineParams based on command line flags and/or other
2074 // parameters.
2075
2076 InlineParams llvm::getInlineParams(int Threshold) {
2077 InlineParams Params;
2078
2079 // This field is the threshold to use for a callee by default. This is
2080 // derived from one or more of:
2081 // * optimization or size-optimization levels,
2082 // * a value passed to the createFunctionInliningPass function, or
2083 // * the -inline-threshold flag.
2084 // If the -inline-threshold flag is explicitly specified, that is used
2085 // irrespective of anything else.
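  // For example (illustrative values): with createFunctionInliningPass(275)
  // but -inline-threshold=500 given on the command line, DefaultThreshold
  // ends up as 500; without the flag it would be 275.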
2086 if (InlineThreshold.getNumOccurrences() > 0)
2087 Params.DefaultThreshold = InlineThreshold;
2088 else
2089 Params.DefaultThreshold = Threshold;
2090
2091 // Set the HintThreshold knob from the -inlinehint-threshold.
2092 Params.HintThreshold = HintThreshold;
2093
2094 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2095 Params.HotCallSiteThreshold = HotCallSiteThreshold;
2096
2097 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2098 // populate LocallyHotCallSiteThreshold. Later, we populate
2099 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2100 // we know that optimization level is O3 (in the getInlineParams variant that
2101 // takes the opt and size levels).
2102 // FIXME: Remove this check (and make the assignment unconditional) after
2103 // addressing size regression issues at O2.
2104 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2105 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2106
2107 // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
2108 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2109
2110 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2111 // -inline-threshold command-line option is not explicitly given. If that
2112 // option is present, then its value applies even for callees with size and
2113 // minsize attributes.
2114 // If the -inline-threshold is not specified, set the ColdThreshold from the
2115 // -inlinecold-threshold even if it is not explicitly passed. If
2116 // -inline-threshold is specified, then -inlinecold-threshold needs to be
2117 // explicitly specified to set the ColdThreshold knob.
2118 if (InlineThreshold.getNumOccurrences() == 0) {
2119 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2120 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2121 Params.ColdThreshold = ColdThreshold;
2122 } else if (ColdThreshold.getNumOccurrences() > 0) {
2123 Params.ColdThreshold = ColdThreshold;
2124 }
2125 return Params;
2126 }
2127
2128 InlineParams llvm::getInlineParams() {
2129 return getInlineParams(InlineThreshold);
2130 }
2131
2132 // Compute the default threshold for inlining based on the opt level and the
2133 // size opt level.
2134 static int computeThresholdFromOptLevels(unsigned OptLevel,
2135 unsigned SizeOptLevel) {
2136 if (OptLevel > 2)
2137 return InlineConstants::OptAggressiveThreshold;
2138 if (SizeOptLevel == 1) // -Os
2139 return InlineConstants::OptSizeThreshold;
2140 if (SizeOptLevel == 2) // -Oz
2141 return InlineConstants::OptMinSizeThreshold;
2142 return InlineThreshold;
2143 }
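// Illustrative mapping (values taken from InlineConstants and the flag
// defaults above): computeThresholdFromOptLevels(3, 0) yields
// OptAggressiveThreshold, (2, 1) yields OptSizeThreshold for -Os, and
// (2, 0) falls back to the -inline-threshold default.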
2144
2145 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2146 auto Params =
2147 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2148 // At O3, use the value of -locally-hot-callsite-threshold option to populate
2149 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2150 // when it is specified explicitly.
2151 if (OptLevel > 2)
2152 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2153 return Params;
2154 }
2155