1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/Analysis/InlineCost.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetLibraryInfo.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/CallSite.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/DiagnosticInfo.h"
36 #include "llvm/IR/Dominators.h"
37 #include "llvm/IR/Function.h"
38 #include "llvm/IR/InstrTypes.h"
39 #include "llvm/IR/Instruction.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/IntrinsicInst.h"
42 #include "llvm/IR/Intrinsics.h"
43 #include "llvm/IR/Module.h"
44 #include "llvm/IR/User.h"
45 #include "llvm/InitializePasses.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Support/BlockFrequency.h"
48 #include "llvm/Support/BranchProbability.h"
49 #include "llvm/Support/Casting.h"
50 #include "llvm/Support/CommandLine.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Transforms/IPO.h"
53 #include "llvm/Transforms/Utils/Cloning.h"
54 #include "llvm/Transforms/Utils/CodeExtractor.h"
55 #include "llvm/Transforms/Utils/ValueMapper.h"
56 #include <algorithm>
57 #include <cassert>
58 #include <cstdint>
59 #include <functional>
60 #include <iterator>
61 #include <memory>
62 #include <tuple>
63 #include <vector>
64
65 using namespace llvm;
66
67 #define DEBUG_TYPE "partial-inlining"
68
69 STATISTIC(NumPartialInlined,
70 "Number of callsites functions partially inlined into.");
71 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
72 "cold outlined regions were partially "
73 "inlined into its caller(s).");
74 STATISTIC(NumColdRegionsFound,
75 "Number of cold single entry/exit regions found.");
76 STATISTIC(NumColdRegionsOutlined,
77 "Number of cold single entry/exit regions outlined.");
78
79 // Command line option to disable partial-inlining. The default is false:
80 static cl::opt<bool>
81 DisablePartialInlining("disable-partial-inlining", cl::init(false),
82 cl::Hidden, cl::desc("Disable partial inlining"));
83 // Command line option to disable multi-region partial-inlining. The default is
84 // false:
85 static cl::opt<bool> DisableMultiRegionPartialInline(
86 "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
87 cl::desc("Disable multi-region partial inlining"));
88
89 // Command line option to force outlining in regions with live exit variables.
90 // The default is false:
91 static cl::opt<bool>
92 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
93 cl::desc("Force outline regions with live exits"));
94
95 // Command line option to enable marking outline functions with Cold Calling
96 // Convention. The default is false:
97 static cl::opt<bool>
98 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
99 cl::desc("Mark outline function calls with ColdCC"));
100
101 #ifndef NDEBUG
102 // Command line option to debug partial-inlining. The default is none:
103 static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
104 cl::init(false), cl::Hidden,
105 cl::desc("Trace partial inlining."));
106 #endif
107
108 // This is an option used by testing:
109 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
110 cl::init(false), cl::ZeroOrMore,
111 cl::ReallyHidden,
112 cl::desc("Skip Cost Analysis"));
113 // Used to determine if a cold region is worth outlining based on
114 // its inlining cost compared to the original function. Default is set at 10%.
115 // ie. if the cold region reduces the inlining cost of the original function by
116 // at least 10%.
117 static cl::opt<float> MinRegionSizeRatio(
118 "min-region-size-ratio", cl::init(0.1), cl::Hidden,
119 cl::desc("Minimum ratio comparing relative sizes of each "
120 "outline candidate and original function"));
121 // Used to tune the minimum number of execution counts needed in the predecessor
122 // block to the cold edge. ie. confidence interval.
123 static cl::opt<unsigned>
124 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
125 cl::desc("Minimum block executions to consider "
126 "its BranchProbabilityInfo valid"));
127 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
128 // if the branch probability is 10% or less, then it is deemed as 'cold'.
129 static cl::opt<float> ColdBranchRatio(
130 "cold-branch-ratio", cl::init(0.1), cl::Hidden,
131 cl::desc("Minimum BranchProbability to consider a region cold."));
132
133 static cl::opt<unsigned> MaxNumInlineBlocks(
134 "max-num-inline-blocks", cl::init(5), cl::Hidden,
135 cl::desc("Max number of blocks to be partially inlined"));
136
137 // Command line option to set the maximum number of partial inlining allowed
138 // for the module. The default value of -1 means no limit.
139 static cl::opt<int> MaxNumPartialInlining(
140 "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
141 cl::desc("Max number of partial inlining. The default is unlimited"));
142
143 // Used only when PGO or user annotated branch data is absent. It is
144 // the least value that is used to weigh the outline region. If BFI
145 // produces larger value, the BFI value will be used.
146 static cl::opt<int>
147 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
148 cl::Hidden, cl::ZeroOrMore,
149 cl::desc("Relative frequency of outline region to "
150 "the entry block"));
151
152 static cl::opt<unsigned> ExtraOutliningPenalty(
153 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
154 cl::desc("A debug option to add additional penalty to the computed one."));
155
156 namespace {
157
158 struct FunctionOutliningInfo {
159 FunctionOutliningInfo() = default;
160
161 // Returns the number of blocks to be inlined including all blocks
162 // in Entries and one return block.
GetNumInlinedBlocks__anon97bf38560111::FunctionOutliningInfo163 unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
164
165 // A set of blocks including the function entry that guard
166 // the region to be outlined.
167 SmallVector<BasicBlock *, 4> Entries;
168
169 // The return block that is not included in the outlined region.
170 BasicBlock *ReturnBlock = nullptr;
171
172 // The dominating block of the region to be outlined.
173 BasicBlock *NonReturnBlock = nullptr;
174
175 // The set of blocks in Entries that that are predecessors to ReturnBlock
176 SmallVector<BasicBlock *, 4> ReturnBlockPreds;
177 };
178
179 struct FunctionOutliningMultiRegionInfo {
FunctionOutliningMultiRegionInfo__anon97bf38560111::FunctionOutliningMultiRegionInfo180 FunctionOutliningMultiRegionInfo()
181 : ORI() {}
182
183 // Container for outline regions
184 struct OutlineRegionInfo {
OutlineRegionInfo__anon97bf38560111::FunctionOutliningMultiRegionInfo::OutlineRegionInfo185 OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
186 BasicBlock *EntryBlock, BasicBlock *ExitBlock,
187 BasicBlock *ReturnBlock)
188 : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
189 ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
190 SmallVector<BasicBlock *, 8> Region;
191 BasicBlock *EntryBlock;
192 BasicBlock *ExitBlock;
193 BasicBlock *ReturnBlock;
194 };
195
196 SmallVector<OutlineRegionInfo, 4> ORI;
197 };
198
199 struct PartialInlinerImpl {
200
PartialInlinerImpl__anon97bf38560111::PartialInlinerImpl201 PartialInlinerImpl(
202 std::function<AssumptionCache &(Function &)> *GetAC,
203 function_ref<AssumptionCache *(Function &)> LookupAC,
204 std::function<TargetTransformInfo &(Function &)> *GTTI,
205 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
206 ProfileSummaryInfo *ProfSI)
207 : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
208 GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
209
210 bool run(Module &M);
211 // Main part of the transformation that calls helper functions to find
212 // outlining candidates, clone & outline the function, and attempt to
213 // partially inline the resulting function. Returns true if
214 // inlining was successful, false otherwise. Also returns the outline
215 // function (only if we partially inlined early returns) as there is a
216 // possibility to further "peel" early return statements that were left in the
217 // outline function due to code size.
218 std::pair<bool, Function *> unswitchFunction(Function *F);
219
220 // This class speculatively clones the function to be partial inlined.
221 // At the end of partial inlining, the remaining callsites to the cloned
222 // function that are not partially inlined will be fixed up to reference
223 // the original function, and the cloned function will be erased.
224 struct FunctionCloner {
225 // Two constructors, one for single region outlining, the other for
226 // multi-region outlining.
227 FunctionCloner(Function *F, FunctionOutliningInfo *OI,
228 OptimizationRemarkEmitter &ORE,
229 function_ref<AssumptionCache *(Function &)> LookupAC);
230 FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
231 OptimizationRemarkEmitter &ORE,
232 function_ref<AssumptionCache *(Function &)> LookupAC);
233 ~FunctionCloner();
234
235 // Prepare for function outlining: making sure there is only
236 // one incoming edge from the extracted/outlined region to
237 // the return block.
238 void NormalizeReturnBlock();
239
240 // Do function outlining for cold regions.
241 bool doMultiRegionFunctionOutlining();
242 // Do function outlining for region after early return block(s).
243 // NOTE: For vararg functions that do the vararg handling in the outlined
244 // function, we temporarily generate IR that does not properly
245 // forward varargs to the outlined function. Calling InlineFunction
246 // will update calls to the outlined functions to properly forward
247 // the varargs.
248 Function *doSingleRegionFunctionOutlining();
249
250 Function *OrigFunc = nullptr;
251 Function *ClonedFunc = nullptr;
252
253 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
254 // Keep track of Outlined Functions and the basic block they're called from.
255 SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
256
257 // ClonedFunc is inlined in one of its callers after function
258 // outlining.
259 bool IsFunctionInlined = false;
260 // The cost of the region to be outlined.
261 int OutlinedRegionCost = 0;
262 // ClonedOI is specific to outlining non-early return blocks.
263 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
264 // ClonedOMRI is specific to outlining cold regions.
265 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
266 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
267 OptimizationRemarkEmitter &ORE;
268 function_ref<AssumptionCache *(Function &)> LookupAC;
269 };
270
271 private:
272 int NumPartialInlining = 0;
273 std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
274 function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
275 std::function<TargetTransformInfo &(Function &)> *GetTTI;
276 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
277 ProfileSummaryInfo *PSI;
278
279 // Return the frequency of the OutlininingBB relative to F's entry point.
280 // The result is no larger than 1 and is represented using BP.
281 // (Note that the outlined region's 'head' block can only have incoming
282 // edges from the guarding entry blocks).
283 BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
284
285 // Return true if the callee of CS should be partially inlined with
286 // profit.
287 bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
288 BlockFrequency WeightedOutliningRcost,
289 OptimizationRemarkEmitter &ORE);
290
291 // Try to inline DuplicateFunction (cloned from F with call to
292 // the OutlinedFunction into its callers. Return true
293 // if there is any successful inlining.
294 bool tryPartialInline(FunctionCloner &Cloner);
295
296 // Compute the mapping from use site of DuplicationFunction to the enclosing
297 // BB's profile count.
298 void computeCallsiteToProfCountMap(Function *DuplicateFunction,
299 DenseMap<User *, uint64_t> &SiteCountMap);
300
IsLimitReached__anon97bf38560111::PartialInlinerImpl301 bool IsLimitReached() {
302 return (MaxNumPartialInlining != -1 &&
303 NumPartialInlining >= MaxNumPartialInlining);
304 }
305
getCallSite__anon97bf38560111::PartialInlinerImpl306 static CallSite getCallSite(User *U) {
307 CallSite CS;
308 if (CallInst *CI = dyn_cast<CallInst>(U))
309 CS = CallSite(CI);
310 else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
311 CS = CallSite(II);
312 else
313 llvm_unreachable("All uses must be calls");
314 return CS;
315 }
316
getOneCallSiteTo__anon97bf38560111::PartialInlinerImpl317 static CallSite getOneCallSiteTo(Function *F) {
318 User *User = *F->user_begin();
319 return getCallSite(User);
320 }
321
getOneDebugLoc__anon97bf38560111::PartialInlinerImpl322 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
323 CallSite CS = getOneCallSiteTo(F);
324 DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
325 BasicBlock *Block = CS.getParent();
326 return std::make_tuple(DLoc, Block);
327 }
328
329 // Returns the costs associated with function outlining:
330 // - The first value is the non-weighted runtime cost for making the call
331 // to the outlined function, including the addtional setup cost in the
332 // outlined function itself;
333 // - The second value is the estimated size of the new call sequence in
334 // basic block Cloner.OutliningCallBB;
335 std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
336
337 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
338 // approximate both the size and runtime cost (Note that in the current
339 // inline cost analysis, there is no clear distinction there either).
340 static int computeBBInlineCost(BasicBlock *BB);
341
342 std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
343 std::unique_ptr<FunctionOutliningMultiRegionInfo>
344 computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
345 };
346
347 struct PartialInlinerLegacyPass : public ModulePass {
348 static char ID; // Pass identification, replacement for typeid
349
PartialInlinerLegacyPass__anon97bf38560111::PartialInlinerLegacyPass350 PartialInlinerLegacyPass() : ModulePass(ID) {
351 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
352 }
353
getAnalysisUsage__anon97bf38560111::PartialInlinerLegacyPass354 void getAnalysisUsage(AnalysisUsage &AU) const override {
355 AU.addRequired<AssumptionCacheTracker>();
356 AU.addRequired<ProfileSummaryInfoWrapperPass>();
357 AU.addRequired<TargetTransformInfoWrapperPass>();
358 }
359
runOnModule__anon97bf38560111::PartialInlinerLegacyPass360 bool runOnModule(Module &M) override {
361 if (skipModule(M))
362 return false;
363
364 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
365 TargetTransformInfoWrapperPass *TTIWP =
366 &getAnalysis<TargetTransformInfoWrapperPass>();
367 ProfileSummaryInfo *PSI =
368 &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
369
370 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
371 [&ACT](Function &F) -> AssumptionCache & {
372 return ACT->getAssumptionCache(F);
373 };
374
375 auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
376 return ACT->lookupAssumptionCache(F);
377 };
378
379 std::function<TargetTransformInfo &(Function &)> GetTTI =
380 [&TTIWP](Function &F) -> TargetTransformInfo & {
381 return TTIWP->getTTI(F);
382 };
383
384 return PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache,
385 &GetTTI, NoneType::None, PSI)
386 .run(M);
387 }
388 };
389
390 } // end anonymous namespace
391
392 std::unique_ptr<FunctionOutliningMultiRegionInfo>
computeOutliningColdRegionsInfo(Function * F,OptimizationRemarkEmitter & ORE)393 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
394 OptimizationRemarkEmitter &ORE) {
395 BasicBlock *EntryBlock = &F->front();
396
397 DominatorTree DT(*F);
398 LoopInfo LI(DT);
399 BranchProbabilityInfo BPI(*F, LI);
400 std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
401 BlockFrequencyInfo *BFI;
402 if (!GetBFI) {
403 ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
404 BFI = ScopedBFI.get();
405 } else
406 BFI = &(*GetBFI)(*F);
407
408 // Return if we don't have profiling information.
409 if (!PSI->hasInstrumentationProfile())
410 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
411
412 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
413 std::make_unique<FunctionOutliningMultiRegionInfo>();
414
415 auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
416 BasicBlock *Dom = BlockList.front();
417 return BlockList.size() > 1 && Dom->hasNPredecessors(1);
418 };
419
420 auto IsSingleExit =
421 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
422 BasicBlock *ExitBlock = nullptr;
423 for (auto *Block : BlockList) {
424 for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
425 if (!is_contained(BlockList, *SI)) {
426 if (ExitBlock) {
427 ORE.emit([&]() {
428 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
429 &SI->front())
430 << "Region dominated by "
431 << ore::NV("Block", BlockList.front()->getName())
432 << " has more than one region exit edge.";
433 });
434 return nullptr;
435 } else
436 ExitBlock = Block;
437 }
438 }
439 }
440 return ExitBlock;
441 };
442
443 auto BBProfileCount = [BFI](BasicBlock *BB) {
444 return BFI->getBlockProfileCount(BB)
445 ? BFI->getBlockProfileCount(BB).getValue()
446 : 0;
447 };
448
449 // Use the same computeBBInlineCost function to compute the cost savings of
450 // the outlining the candidate region.
451 int OverallFunctionCost = 0;
452 for (auto &BB : *F)
453 OverallFunctionCost += computeBBInlineCost(&BB);
454
455 #ifndef NDEBUG
456 if (TracePartialInlining)
457 dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
458 #endif
459 int MinOutlineRegionCost =
460 static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
461 BranchProbability MinBranchProbability(
462 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
463 MinBlockCounterExecution);
464 bool ColdCandidateFound = false;
465 BasicBlock *CurrEntry = EntryBlock;
466 std::vector<BasicBlock *> DFS;
467 DenseMap<BasicBlock *, bool> VisitedMap;
468 DFS.push_back(CurrEntry);
469 VisitedMap[CurrEntry] = true;
470 // Use Depth First Search on the basic blocks to find CFG edges that are
471 // considered cold.
472 // Cold regions considered must also have its inline cost compared to the
473 // overall inline cost of the original function. The region is outlined only
474 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
475 // more.
476 while (!DFS.empty()) {
477 auto *thisBB = DFS.back();
478 DFS.pop_back();
479 // Only consider regions with predecessor blocks that are considered
480 // not-cold (default: part of the top 99.99% of all block counters)
481 // AND greater than our minimum block execution count (default: 100).
482 if (PSI->isColdBlock(thisBB, BFI) ||
483 BBProfileCount(thisBB) < MinBlockCounterExecution)
484 continue;
485 for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
486 if (VisitedMap[*SI])
487 continue;
488 VisitedMap[*SI] = true;
489 DFS.push_back(*SI);
490 // If branch isn't cold, we skip to the next one.
491 BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
492 if (SuccProb > MinBranchProbability)
493 continue;
494 #ifndef NDEBUG
495 if (TracePartialInlining) {
496 dbgs() << "Found cold edge: " << thisBB->getName() << "->"
497 << (*SI)->getName() << "\nBranch Probability = " << SuccProb
498 << "\n";
499 }
500 #endif
501 SmallVector<BasicBlock *, 8> DominateVector;
502 DT.getDescendants(*SI, DominateVector);
503 // We can only outline single entry regions (for now).
504 if (!IsSingleEntry(DominateVector))
505 continue;
506 BasicBlock *ExitBlock = nullptr;
507 // We can only outline single exit regions (for now).
508 if (!(ExitBlock = IsSingleExit(DominateVector)))
509 continue;
510 int OutlineRegionCost = 0;
511 for (auto *BB : DominateVector)
512 OutlineRegionCost += computeBBInlineCost(BB);
513
514 #ifndef NDEBUG
515 if (TracePartialInlining)
516 dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
517 #endif
518
519 if (OutlineRegionCost < MinOutlineRegionCost) {
520 ORE.emit([&]() {
521 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
522 &SI->front())
523 << ore::NV("Callee", F) << " inline cost-savings smaller than "
524 << ore::NV("Cost", MinOutlineRegionCost);
525 });
526 continue;
527 }
528 // For now, ignore blocks that belong to a SISE region that is a
529 // candidate for outlining. In the future, we may want to look
530 // at inner regions because the outer region may have live-exit
531 // variables.
532 for (auto *BB : DominateVector)
533 VisitedMap[BB] = true;
534 // ReturnBlock here means the block after the outline call
535 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
536 // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
537 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
538 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
539 OutliningInfo->ORI.push_back(RegInfo);
540 #ifndef NDEBUG
541 if (TracePartialInlining) {
542 dbgs() << "Found Cold Candidate starting at block: "
543 << DominateVector.front()->getName() << "\n";
544 }
545 #endif
546 ColdCandidateFound = true;
547 NumColdRegionsFound++;
548 }
549 }
550 if (ColdCandidateFound)
551 return OutliningInfo;
552 else
553 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
554 }
555
556 std::unique_ptr<FunctionOutliningInfo>
computeOutliningInfo(Function * F)557 PartialInlinerImpl::computeOutliningInfo(Function *F) {
558 BasicBlock *EntryBlock = &F->front();
559 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
560 if (!BR || BR->isUnconditional())
561 return std::unique_ptr<FunctionOutliningInfo>();
562
563 // Returns true if Succ is BB's successor
564 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
565 return is_contained(successors(BB), Succ);
566 };
567
568 auto IsReturnBlock = [](BasicBlock *BB) {
569 Instruction *TI = BB->getTerminator();
570 return isa<ReturnInst>(TI);
571 };
572
573 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
574 if (IsReturnBlock(Succ1))
575 return std::make_tuple(Succ1, Succ2);
576 if (IsReturnBlock(Succ2))
577 return std::make_tuple(Succ2, Succ1);
578
579 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
580 };
581
582 // Detect a triangular shape:
583 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
584 if (IsSuccessor(Succ1, Succ2))
585 return std::make_tuple(Succ1, Succ2);
586 if (IsSuccessor(Succ2, Succ1))
587 return std::make_tuple(Succ2, Succ1);
588
589 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
590 };
591
592 std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
593 std::make_unique<FunctionOutliningInfo>();
594
595 BasicBlock *CurrEntry = EntryBlock;
596 bool CandidateFound = false;
597 do {
598 // The number of blocks to be inlined has already reached
599 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
600 // disables partial inlining for the function.
601 if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
602 break;
603
604 if (succ_size(CurrEntry) != 2)
605 break;
606
607 BasicBlock *Succ1 = *succ_begin(CurrEntry);
608 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
609
610 BasicBlock *ReturnBlock, *NonReturnBlock;
611 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
612
613 if (ReturnBlock) {
614 OutliningInfo->Entries.push_back(CurrEntry);
615 OutliningInfo->ReturnBlock = ReturnBlock;
616 OutliningInfo->NonReturnBlock = NonReturnBlock;
617 CandidateFound = true;
618 break;
619 }
620
621 BasicBlock *CommSucc;
622 BasicBlock *OtherSucc;
623 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
624
625 if (!CommSucc)
626 break;
627
628 OutliningInfo->Entries.push_back(CurrEntry);
629 CurrEntry = OtherSucc;
630 } while (true);
631
632 if (!CandidateFound)
633 return std::unique_ptr<FunctionOutliningInfo>();
634
635 // Do sanity check of the entries: threre should not
636 // be any successors (not in the entry set) other than
637 // {ReturnBlock, NonReturnBlock}
638 assert(OutliningInfo->Entries[0] == &F->front() &&
639 "Function Entry must be the first in Entries vector");
640 DenseSet<BasicBlock *> Entries;
641 for (BasicBlock *E : OutliningInfo->Entries)
642 Entries.insert(E);
643
644 // Returns true of BB has Predecessor which is not
645 // in Entries set.
646 auto HasNonEntryPred = [Entries](BasicBlock *BB) {
647 for (auto Pred : predecessors(BB)) {
648 if (!Entries.count(Pred))
649 return true;
650 }
651 return false;
652 };
653 auto CheckAndNormalizeCandidate =
654 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
655 for (BasicBlock *E : OutliningInfo->Entries) {
656 for (auto Succ : successors(E)) {
657 if (Entries.count(Succ))
658 continue;
659 if (Succ == OutliningInfo->ReturnBlock)
660 OutliningInfo->ReturnBlockPreds.push_back(E);
661 else if (Succ != OutliningInfo->NonReturnBlock)
662 return false;
663 }
664 // There should not be any outside incoming edges either:
665 if (HasNonEntryPred(E))
666 return false;
667 }
668 return true;
669 };
670
671 if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
672 return std::unique_ptr<FunctionOutliningInfo>();
673
674 // Now further growing the candidate's inlining region by
675 // peeling off dominating blocks from the outlining region:
676 while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
677 BasicBlock *Cand = OutliningInfo->NonReturnBlock;
678 if (succ_size(Cand) != 2)
679 break;
680
681 if (HasNonEntryPred(Cand))
682 break;
683
684 BasicBlock *Succ1 = *succ_begin(Cand);
685 BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
686
687 BasicBlock *ReturnBlock, *NonReturnBlock;
688 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
689 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
690 break;
691
692 if (NonReturnBlock->getSinglePredecessor() != Cand)
693 break;
694
695 // Now grow and update OutlininigInfo:
696 OutliningInfo->Entries.push_back(Cand);
697 OutliningInfo->NonReturnBlock = NonReturnBlock;
698 OutliningInfo->ReturnBlockPreds.push_back(Cand);
699 Entries.insert(Cand);
700 }
701
702 return OutliningInfo;
703 }
704
705 // Check if there is PGO data or user annotated branch data:
hasProfileData(Function * F,FunctionOutliningInfo * OI)706 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
707 if (F->hasProfileData())
708 return true;
709 // Now check if any of the entry block has MD_prof data:
710 for (auto *E : OI->Entries) {
711 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
712 if (!BR || BR->isUnconditional())
713 continue;
714 uint64_t T, F;
715 if (BR->extractProfMetadata(T, F))
716 return true;
717 }
718 return false;
719 }
720
721 BranchProbability
getOutliningCallBBRelativeFreq(FunctionCloner & Cloner)722 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
723 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
724 auto EntryFreq =
725 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
726 auto OutliningCallFreq =
727 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
728 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
729 // we outlined any regions, so we may encounter situations where the
730 // OutliningCallFreq is *slightly* bigger than the EntryFreq.
731 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
732 OutliningCallFreq = EntryFreq;
733 }
734 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
735 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
736
737 if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
738 return OutlineRegionRelFreq;
739
740 // When profile data is not available, we need to be conservative in
741 // estimating the overall savings. Static branch prediction can usually
742 // guess the branch direction right (taken/non-taken), but the guessed
743 // branch probability is usually not biased enough. In case when the
744 // outlined region is predicted to be likely, its probability needs
745 // to be made higher (more biased) to not under-estimate the cost of
746 // function outlining. On the other hand, if the outlined region
747 // is predicted to be less likely, the predicted probablity is usually
748 // higher than the actual. For instance, the actual probability of the
749 // less likely target is only 5%, but the guessed probablity can be
750 // 40%. In the latter case, there is no need for further adjustement.
751 // FIXME: add an option for this.
752 if (OutlineRegionRelFreq < BranchProbability(45, 100))
753 return OutlineRegionRelFreq;
754
755 OutlineRegionRelFreq = std::max(
756 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
757
758 return OutlineRegionRelFreq;
759 }
760
shouldPartialInline(CallSite CS,FunctionCloner & Cloner,BlockFrequency WeightedOutliningRcost,OptimizationRemarkEmitter & ORE)761 bool PartialInlinerImpl::shouldPartialInline(
762 CallSite CS, FunctionCloner &Cloner,
763 BlockFrequency WeightedOutliningRcost,
764 OptimizationRemarkEmitter &ORE) {
765 using namespace ore;
766
767 Instruction *Call = CS.getInstruction();
768 Function *Callee = CS.getCalledFunction();
769 assert(Callee == Cloner.ClonedFunc);
770
771 if (SkipCostAnalysis)
772 return isInlineViable(*Callee);
773
774 Function *Caller = CS.getCaller();
775 auto &CalleeTTI = (*GetTTI)(*Callee);
776 bool RemarksEnabled =
777 Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
778 DEBUG_TYPE);
779 assert(Call && "invalid callsite for partial inline");
780 InlineCost IC = getInlineCost(cast<CallBase>(*Call), getInlineParams(),
781 CalleeTTI, *GetAssumptionCache, GetBFI, PSI,
782 RemarksEnabled ? &ORE : nullptr);
783
784 if (IC.isAlways()) {
785 ORE.emit([&]() {
786 return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
787 << NV("Callee", Cloner.OrigFunc)
788 << " should always be fully inlined, not partially";
789 });
790 return false;
791 }
792
793 if (IC.isNever()) {
794 ORE.emit([&]() {
795 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
796 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
797 << NV("Caller", Caller)
798 << " because it should never be inlined (cost=never)";
799 });
800 return false;
801 }
802
803 if (!IC) {
804 ORE.emit([&]() {
805 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
806 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
807 << NV("Caller", Caller) << " because too costly to inline (cost="
808 << NV("Cost", IC.getCost()) << ", threshold="
809 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
810 });
811 return false;
812 }
813 const DataLayout &DL = Caller->getParent()->getDataLayout();
814
815 // The savings of eliminating the call:
816 int NonWeightedSavings = getCallsiteCost(cast<CallBase>(*Call), DL);
817 BlockFrequency NormWeightedSavings(NonWeightedSavings);
818
819 // Weighted saving is smaller than weighted cost, return false
820 if (NormWeightedSavings < WeightedOutliningRcost) {
821 ORE.emit([&]() {
822 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
823 Call)
824 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
825 << NV("Caller", Caller) << " runtime overhead (overhead="
826 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
827 << ", savings="
828 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
829 << ")"
830 << " of making the outlined call is too high";
831 });
832
833 return false;
834 }
835
836 ORE.emit([&]() {
837 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
838 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
839 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
840 << " (threshold="
841 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
842 });
843 return true;
844 }
845
846 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
847 // For now use a simplified version. The returned 'InlineCost' will be used
848 // to esimate the size cost as well as runtime cost of the BB.
computeBBInlineCost(BasicBlock * BB)849 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
850 int InlineCost = 0;
851 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
852 for (Instruction &I : BB->instructionsWithoutDebug()) {
853 // Skip free instructions.
854 switch (I.getOpcode()) {
855 case Instruction::BitCast:
856 case Instruction::PtrToInt:
857 case Instruction::IntToPtr:
858 case Instruction::Alloca:
859 case Instruction::PHI:
860 continue;
861 case Instruction::GetElementPtr:
862 if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
863 continue;
864 break;
865 default:
866 break;
867 }
868
869 if (I.isLifetimeStartOrEnd())
870 continue;
871
872 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
873 InlineCost += getCallsiteCost(*CI, DL);
874 continue;
875 }
876
877 if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
878 InlineCost += getCallsiteCost(*II, DL);
879 continue;
880 }
881
882 if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
883 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
884 continue;
885 }
886 InlineCost += InlineConstants::InstrCost;
887 }
888 return InlineCost;
889 }
890
891 std::tuple<int, int>
computeOutliningCosts(FunctionCloner & Cloner)892 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
893 int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
894 for (auto FuncBBPair : Cloner.OutlinedFunctions) {
895 Function *OutlinedFunc = FuncBBPair.first;
896 BasicBlock* OutliningCallBB = FuncBBPair.second;
897 // Now compute the cost of the call sequence to the outlined function
898 // 'OutlinedFunction' in BB 'OutliningCallBB':
899 OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
900
901 // Now compute the cost of the extracted/outlined function itself:
902 for (BasicBlock &BB : *OutlinedFunc)
903 OutlinedFunctionCost += computeBBInlineCost(&BB);
904 }
905 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
906 "Outlined function cost should be no less than the outlined region");
907
908 // The code extractor introduces a new root and exit stub blocks with
909 // additional unconditional branches. Those branches will be eliminated
910 // later with bb layout. The cost should be adjusted accordingly:
911 OutlinedFunctionCost -=
912 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
913
914 int OutliningRuntimeOverhead =
915 OutliningFuncCallCost +
916 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
917 ExtraOutliningPenalty;
918
919 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
920 }
921
922 // Create the callsite to profile count map which is
923 // used to update the original function's entry count,
924 // after the function is partially inlined into the callsite.
computeCallsiteToProfCountMap(Function * DuplicateFunction,DenseMap<User *,uint64_t> & CallSiteToProfCountMap)925 void PartialInlinerImpl::computeCallsiteToProfCountMap(
926 Function *DuplicateFunction,
927 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
928 std::vector<User *> Users(DuplicateFunction->user_begin(),
929 DuplicateFunction->user_end());
930 Function *CurrentCaller = nullptr;
931 std::unique_ptr<BlockFrequencyInfo> TempBFI;
932 BlockFrequencyInfo *CurrentCallerBFI = nullptr;
933
934 auto ComputeCurrBFI = [&,this](Function *Caller) {
935 // For the old pass manager:
936 if (!GetBFI) {
937 DominatorTree DT(*Caller);
938 LoopInfo LI(DT);
939 BranchProbabilityInfo BPI(*Caller, LI);
940 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
941 CurrentCallerBFI = TempBFI.get();
942 } else {
943 // New pass manager:
944 CurrentCallerBFI = &(*GetBFI)(*Caller);
945 }
946 };
947
948 for (User *User : Users) {
949 CallSite CS = getCallSite(User);
950 Function *Caller = CS.getCaller();
951 if (CurrentCaller != Caller) {
952 CurrentCaller = Caller;
953 ComputeCurrBFI(Caller);
954 } else {
955 assert(CurrentCallerBFI && "CallerBFI is not set");
956 }
957 BasicBlock *CallBB = CS.getInstruction()->getParent();
958 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
959 if (Count)
960 CallSiteToProfCountMap[User] = *Count;
961 else
962 CallSiteToProfCountMap[User] = 0;
963 }
964 }
965
FunctionCloner(Function * F,FunctionOutliningInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC)966 PartialInlinerImpl::FunctionCloner::FunctionCloner(
967 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
968 function_ref<AssumptionCache *(Function &)> LookupAC)
969 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
970 ClonedOI = std::make_unique<FunctionOutliningInfo>();
971
972 // Clone the function, so that we can hack away on it.
973 ValueToValueMapTy VMap;
974 ClonedFunc = CloneFunction(F, VMap);
975
976 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
977 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
978 for (BasicBlock *BB : OI->Entries) {
979 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
980 }
981 for (BasicBlock *E : OI->ReturnBlockPreds) {
982 BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
983 ClonedOI->ReturnBlockPreds.push_back(NewE);
984 }
985 // Go ahead and update all uses to the duplicate, so that we can just
986 // use the inliner functionality when we're done hacking.
987 F->replaceAllUsesWith(ClonedFunc);
988 }
989
FunctionCloner(Function * F,FunctionOutliningMultiRegionInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC)990 PartialInlinerImpl::FunctionCloner::FunctionCloner(
991 Function *F, FunctionOutliningMultiRegionInfo *OI,
992 OptimizationRemarkEmitter &ORE,
993 function_ref<AssumptionCache *(Function &)> LookupAC)
994 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
995 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
996
997 // Clone the function, so that we can hack away on it.
998 ValueToValueMapTy VMap;
999 ClonedFunc = CloneFunction(F, VMap);
1000
1001 // Go through all Outline Candidate Regions and update all BasicBlock
1002 // information.
1003 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1004 OI->ORI) {
1005 SmallVector<BasicBlock *, 8> Region;
1006 for (BasicBlock *BB : RegionInfo.Region) {
1007 Region.push_back(cast<BasicBlock>(VMap[BB]));
1008 }
1009 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1010 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1011 BasicBlock *NewReturnBlock = nullptr;
1012 if (RegionInfo.ReturnBlock)
1013 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1014 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1015 Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1016 ClonedOMRI->ORI.push_back(MappedRegionInfo);
1017 }
1018 // Go ahead and update all uses to the duplicate, so that we can just
1019 // use the inliner functionality when we're done hacking.
1020 F->replaceAllUsesWith(ClonedFunc);
1021 }
1022
NormalizeReturnBlock()1023 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
1024 auto getFirstPHI = [](BasicBlock *BB) {
1025 BasicBlock::iterator I = BB->begin();
1026 PHINode *FirstPhi = nullptr;
1027 while (I != BB->end()) {
1028 PHINode *Phi = dyn_cast<PHINode>(I);
1029 if (!Phi)
1030 break;
1031 if (!FirstPhi) {
1032 FirstPhi = Phi;
1033 break;
1034 }
1035 }
1036 return FirstPhi;
1037 };
1038
1039 // Shouldn't need to normalize PHIs if we're not outlining non-early return
1040 // blocks.
1041 if (!ClonedOI)
1042 return;
1043
1044 // Special hackery is needed with PHI nodes that have inputs from more than
1045 // one extracted block. For simplicity, just split the PHIs into a two-level
1046 // sequence of PHIs, some of which will go in the extracted region, and some
1047 // of which will go outside.
1048 BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1049 // only split block when necessary:
1050 PHINode *FirstPhi = getFirstPHI(PreReturn);
1051 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1052
1053 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1054 return;
1055
1056 auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1057 Value *CommonValue = PN->getIncomingValue(0);
1058 if (all_of(PN->incoming_values(),
1059 [&](Value *V) { return V == CommonValue; }))
1060 return CommonValue;
1061 return nullptr;
1062 };
1063
1064 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1065 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1066 BasicBlock::iterator I = PreReturn->begin();
1067 Instruction *Ins = &ClonedOI->ReturnBlock->front();
1068 SmallVector<Instruction *, 4> DeadPhis;
1069 while (I != PreReturn->end()) {
1070 PHINode *OldPhi = dyn_cast<PHINode>(I);
1071 if (!OldPhi)
1072 break;
1073
1074 PHINode *RetPhi =
1075 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1076 OldPhi->replaceAllUsesWith(RetPhi);
1077 Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1078
1079 RetPhi->addIncoming(&*I, PreReturn);
1080 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1081 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1082 OldPhi->removeIncomingValue(E);
1083 }
1084
1085 // After incoming values splitting, the old phi may become trivial.
1086 // Keeping the trivial phi can introduce definition inside the outline
1087 // region which is live-out, causing necessary overhead (load, store
1088 // arg passing etc).
1089 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1090 OldPhi->replaceAllUsesWith(OldPhiVal);
1091 DeadPhis.push_back(OldPhi);
1092 }
1093 ++I;
1094 }
1095 for (auto *DP : DeadPhis)
1096 DP->eraseFromParent();
1097
1098 for (auto E : ClonedOI->ReturnBlockPreds) {
1099 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1100 }
1101 }
1102
doMultiRegionFunctionOutlining()1103 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1104
1105 auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
1106 int Cost = 0;
1107 for (BasicBlock* BB : Region)
1108 Cost += computeBBInlineCost(BB);
1109 return Cost;
1110 };
1111
1112 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1113
1114 if (ClonedOMRI->ORI.empty())
1115 return false;
1116
1117 // The CodeExtractor needs a dominator tree.
1118 DominatorTree DT;
1119 DT.recalculate(*ClonedFunc);
1120
1121 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1122 LoopInfo LI(DT);
1123 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1124 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1125
1126 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1127 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1128
1129 SetVector<Value *> Inputs, Outputs, Sinks;
1130 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1131 ClonedOMRI->ORI) {
1132 int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1133
1134 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1135 ClonedFuncBFI.get(), &BPI,
1136 LookupAC(*RegionInfo.EntryBlock->getParent()),
1137 /* AllowVarargs */ false);
1138
1139 CE.findInputsOutputs(Inputs, Outputs, Sinks);
1140
1141 #ifndef NDEBUG
1142 if (TracePartialInlining) {
1143 dbgs() << "inputs: " << Inputs.size() << "\n";
1144 dbgs() << "outputs: " << Outputs.size() << "\n";
1145 for (Value *value : Inputs)
1146 dbgs() << "value used in func: " << *value << "\n";
1147 for (Value *output : Outputs)
1148 dbgs() << "instr used in func: " << *output << "\n";
1149 }
1150 #endif
1151 // Do not extract regions that have live exit variables.
1152 if (Outputs.size() > 0 && !ForceLiveExit)
1153 continue;
1154
1155 Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
1156
1157 if (OutlinedFunc) {
1158 CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
1159 BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
1160 assert(OutliningCallBB->getParent() == ClonedFunc);
1161 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1162 NumColdRegionsOutlined++;
1163 OutlinedRegionCost += CurrentOutlinedRegionCost;
1164
1165 if (MarkOutlinedColdCC) {
1166 OutlinedFunc->setCallingConv(CallingConv::Cold);
1167 OCS.setCallingConv(CallingConv::Cold);
1168 }
1169 } else
1170 ORE.emit([&]() {
1171 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1172 &RegionInfo.Region.front()->front())
1173 << "Failed to extract region at block "
1174 << ore::NV("Block", RegionInfo.Region.front());
1175 });
1176 }
1177
1178 return !OutlinedFunctions.empty();
1179 }
1180
1181 Function *
doSingleRegionFunctionOutlining()1182 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1183 // Returns true if the block is to be partial inlined into the caller
1184 // (i.e. not to be extracted to the out of line function)
1185 auto ToBeInlined = [&, this](BasicBlock *BB) {
1186 return BB == ClonedOI->ReturnBlock ||
1187 (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
1188 ClonedOI->Entries.end());
1189 };
1190
1191 assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1192 // The CodeExtractor needs a dominator tree.
1193 DominatorTree DT;
1194 DT.recalculate(*ClonedFunc);
1195
1196 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1197 LoopInfo LI(DT);
1198 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1199 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1200
1201 // Gather up the blocks that we're going to extract.
1202 std::vector<BasicBlock *> ToExtract;
1203 ToExtract.push_back(ClonedOI->NonReturnBlock);
1204 OutlinedRegionCost +=
1205 PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
1206 for (BasicBlock &BB : *ClonedFunc)
1207 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1208 ToExtract.push_back(&BB);
1209 // FIXME: the code extractor may hoist/sink more code
1210 // into the outlined function which may make the outlining
1211 // overhead (the difference of the outlined function cost
1212 // and OutliningRegionCost) look larger.
1213 OutlinedRegionCost += computeBBInlineCost(&BB);
1214 }
1215
1216 // Extract the body of the if.
1217 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1218 Function *OutlinedFunc =
1219 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1220 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1221 /* AllowVarargs */ true)
1222 .extractCodeRegion(CEAC);
1223
1224 if (OutlinedFunc) {
1225 BasicBlock *OutliningCallBB =
1226 PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
1227 .getInstruction()
1228 ->getParent();
1229 assert(OutliningCallBB->getParent() == ClonedFunc);
1230 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1231 } else
1232 ORE.emit([&]() {
1233 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1234 &ToExtract.front()->front())
1235 << "Failed to extract region at block "
1236 << ore::NV("Block", ToExtract.front());
1237 });
1238
1239 return OutlinedFunc;
1240 }
1241
~FunctionCloner()1242 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1243 // Ditch the duplicate, since we're done with it, and rewrite all remaining
1244 // users (function pointers, etc.) back to the original function.
1245 ClonedFunc->replaceAllUsesWith(OrigFunc);
1246 ClonedFunc->eraseFromParent();
1247 if (!IsFunctionInlined) {
1248 // Remove each function that was speculatively created if there is no
1249 // reference.
1250 for (auto FuncBBPair : OutlinedFunctions) {
1251 Function *Func = FuncBBPair.first;
1252 Func->eraseFromParent();
1253 }
1254 }
1255 }
1256
unswitchFunction(Function * F)1257 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
1258
1259 if (F->hasAddressTaken())
1260 return {false, nullptr};
1261
1262 // Let inliner handle it
1263 if (F->hasFnAttribute(Attribute::AlwaysInline))
1264 return {false, nullptr};
1265
1266 if (F->hasFnAttribute(Attribute::NoInline))
1267 return {false, nullptr};
1268
1269 if (PSI->isFunctionEntryCold(F))
1270 return {false, nullptr};
1271
1272 if (F->users().empty())
1273 return {false, nullptr};
1274
1275 OptimizationRemarkEmitter ORE(F);
1276
1277 // Only try to outline cold regions if we have a profile summary, which
1278 // implies we have profiling information.
1279 if (PSI->hasProfileSummary() && F->hasProfileData() &&
1280 !DisableMultiRegionPartialInline) {
1281 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1282 computeOutliningColdRegionsInfo(F, ORE);
1283 if (OMRI) {
1284 FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);
1285
1286 #ifndef NDEBUG
1287 if (TracePartialInlining) {
1288 dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
1289 dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
1290 << "\n";
1291 }
1292 #endif
1293 bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1294
1295 if (DidOutline) {
1296 #ifndef NDEBUG
1297 if (TracePartialInlining) {
1298 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1299 Cloner.ClonedFunc->print(dbgs());
1300 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1301 }
1302 #endif
1303
1304 if (tryPartialInline(Cloner))
1305 return {true, nullptr};
1306 }
1307 }
1308 }
1309
1310 // Fall-thru to regular partial inlining if we:
1311 // i) can't find any cold regions to outline, or
1312 // ii) can't inline the outlined function anywhere.
1313 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1314 if (!OI)
1315 return {false, nullptr};
1316
1317 FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
1318 Cloner.NormalizeReturnBlock();
1319
1320 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1321
1322 if (!OutlinedFunction)
1323 return {false, nullptr};
1324
1325 bool AnyInline = tryPartialInline(Cloner);
1326
1327 if (AnyInline)
1328 return {true, OutlinedFunction};
1329
1330 return {false, nullptr};
1331 }
1332
tryPartialInline(FunctionCloner & Cloner)1333 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1334 if (Cloner.OutlinedFunctions.empty())
1335 return false;
1336
1337 int SizeCost = 0;
1338 BlockFrequency WeightedRcost;
1339 int NonWeightedRcost;
1340 std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
1341
1342 // Only calculate RelativeToEntryFreq when we are doing single region
1343 // outlining.
1344 BranchProbability RelativeToEntryFreq;
1345 if (Cloner.ClonedOI) {
1346 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1347 } else
1348 // RelativeToEntryFreq doesn't make sense when we have more than one
1349 // outlined call because each call will have a different relative frequency
1350 // to the entry block. We can consider using the average, but the
1351 // usefulness of that information is questionable. For now, assume we never
1352 // execute the calls to outlined functions.
1353 RelativeToEntryFreq = BranchProbability(0, 1);
1354
1355 WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1356
1357 // The call sequence(s) to the outlined function(s) are larger than the sum of
1358 // the original outlined region size(s), it does not increase the chances of
1359 // inlining the function with outlining (The inliner uses the size increase to
1360 // model the cost of inlining a callee).
1361 if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1362 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1363 DebugLoc DLoc;
1364 BasicBlock *Block;
1365 std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
1366 OrigFuncORE.emit([&]() {
1367 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1368 DLoc, Block)
1369 << ore::NV("Function", Cloner.OrigFunc)
1370 << " not partially inlined into callers (Original Size = "
1371 << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1372 << ", Size of call sequence to outlined function = "
1373 << ore::NV("NewSize", SizeCost) << ")";
1374 });
1375 return false;
1376 }
1377
1378 assert(Cloner.OrigFunc->users().empty() &&
1379 "F's users should all be replaced!");
1380
1381 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1382 Cloner.ClonedFunc->user_end());
1383
1384 DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1385 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1386 if (CalleeEntryCount)
1387 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1388
1389 uint64_t CalleeEntryCountV =
1390 (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
1391
1392 bool AnyInline = false;
1393 for (User *User : Users) {
1394 CallSite CS = getCallSite(User);
1395
1396 if (IsLimitReached())
1397 continue;
1398
1399 OptimizationRemarkEmitter CallerORE(CS.getCaller());
1400 if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
1401 continue;
1402
1403 // Construct remark before doing the inlining, as after successful inlining
1404 // the callsite is removed.
1405 OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
1406 OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1407 << ore::NV("Caller", CS.getCaller());
1408
1409 InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
1410 // We can only forward varargs when we outlined a single region, else we
1411 // bail on vararg functions.
1412 if (!InlineFunction(CS, IFI, nullptr, true,
1413 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1414 : nullptr)))
1415 continue;
1416
1417 CallerORE.emit(OR);
1418
1419 // Now update the entry count:
1420 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1421 uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1422 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1423 }
1424
1425 AnyInline = true;
1426 NumPartialInlining++;
1427 // Update the stats
1428 if (Cloner.ClonedOI)
1429 NumPartialInlined++;
1430 else
1431 NumColdOutlinePartialInlined++;
1432
1433 }
1434
1435 if (AnyInline) {
1436 Cloner.IsFunctionInlined = true;
1437 if (CalleeEntryCount)
1438 Cloner.OrigFunc->setEntryCount(
1439 CalleeEntryCount.setCount(CalleeEntryCountV));
1440 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1441 OrigFuncORE.emit([&]() {
1442 return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1443 << "Partially inlined into at least one caller";
1444 });
1445
1446 }
1447
1448 return AnyInline;
1449 }
1450
run(Module & M)1451 bool PartialInlinerImpl::run(Module &M) {
1452 if (DisablePartialInlining)
1453 return false;
1454
1455 std::vector<Function *> Worklist;
1456 Worklist.reserve(M.size());
1457 for (Function &F : M)
1458 if (!F.use_empty() && !F.isDeclaration())
1459 Worklist.push_back(&F);
1460
1461 bool Changed = false;
1462 while (!Worklist.empty()) {
1463 Function *CurrFunc = Worklist.back();
1464 Worklist.pop_back();
1465
1466 if (CurrFunc->use_empty())
1467 continue;
1468
1469 bool Recursive = false;
1470 for (User *U : CurrFunc->users())
1471 if (Instruction *I = dyn_cast<Instruction>(U))
1472 if (I->getParent()->getParent() == CurrFunc) {
1473 Recursive = true;
1474 break;
1475 }
1476 if (Recursive)
1477 continue;
1478
1479 std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
1480 if (Result.second)
1481 Worklist.push_back(Result.second);
1482 Changed |= Result.first;
1483 }
1484
1485 return Changed;
1486 }
1487
1488 char PartialInlinerLegacyPass::ID = 0;
1489
1490 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1491 "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)1492 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1493 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
1494 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1495 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1496 "Partial Inliner", false, false)
1497
1498 ModulePass *llvm::createPartialInliningPass() {
1499 return new PartialInlinerLegacyPass();
1500 }
1501
run(Module & M,ModuleAnalysisManager & AM)1502 PreservedAnalyses PartialInlinerPass::run(Module &M,
1503 ModuleAnalysisManager &AM) {
1504 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1505
1506 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
1507 [&FAM](Function &F) -> AssumptionCache & {
1508 return FAM.getResult<AssumptionAnalysis>(F);
1509 };
1510
1511 auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1512 return FAM.getCachedResult<AssumptionAnalysis>(F);
1513 };
1514
1515 std::function<BlockFrequencyInfo &(Function &)> GetBFI =
1516 [&FAM](Function &F) -> BlockFrequencyInfo & {
1517 return FAM.getResult<BlockFrequencyAnalysis>(F);
1518 };
1519
1520 std::function<TargetTransformInfo &(Function &)> GetTTI =
1521 [&FAM](Function &F) -> TargetTransformInfo & {
1522 return FAM.getResult<TargetIRAnalysis>(F);
1523 };
1524
1525 ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
1526
1527 if (PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache, &GetTTI,
1528 {GetBFI}, PSI)
1529 .run(M))
1530 return PreservedAnalyses::none();
1531 return PreservedAnalyses::all();
1532 }
1533