1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
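// For example, a scalar loop like
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
// is widened at VF = 4 into a loop whose body loads b[i..i+3], adds the
// splat <1, 1, 1, 1> and stores a[i..i+3], with the induction variable
// advancing by 4 instead of 1.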
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
192 /// Loops with a known constant trip count below this number are vectorized only
193 /// if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
200 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
201 // and predication is preferred; this enum lists the options. I.e., the
202 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
203 // and predicate the instructions accordingly. If tail-folding fails, there are
204 // different fallback strategies depending on these values:
205 namespace PreferPredicateTy {
206   enum Option {
207     ScalarEpilogue = 0,
208     PredicateElseScalarEpilogue,
209     PredicateOrDontVectorize
210   };
211 } // namespace PreferPredicateTy
212 
213 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
214     "prefer-predicate-over-epilogue",
215     cl::init(PreferPredicateTy::ScalarEpilogue),
216     cl::Hidden,
217     cl::desc("Tail-folding and predication preferences over creating a scalar "
218              "epilogue loop."),
219     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
220                          "scalar-epilogue",
221                          "Don't tail-predicate loops, create scalar epilogue"),
222               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
223                          "predicate-else-scalar-epilogue",
224                          "prefer tail-folding, create scalar epilogue if tail "
225                          "folding fails."),
226               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
227                          "predicate-dont-vectorize",
228                          "prefer tail-folding, don't attempt vectorization if "
229                          "tail-folding fails.")));
230 
231 static cl::opt<bool> MaximizeBandwidth(
232     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
233     cl::desc("Maximize bandwidth when selecting the vectorization factor, which "
234              "will be determined by the smallest type in the loop."));
235 
236 static cl::opt<bool> EnableInterleavedMemAccesses(
237     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
238     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
239 
240 /// An interleave-group may need masking if it resides in a block that needs
241 /// predication, or in order to mask away gaps.
242 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
243     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
245 
246 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
247     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
248     cl::desc("We don't interleave loops with an estimated constant trip count "
249              "below this number"));
250 
251 static cl::opt<unsigned> ForceTargetNumScalarRegs(
252     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
253     cl::desc("A flag that overrides the target's number of scalar registers."));
254 
255 static cl::opt<unsigned> ForceTargetNumVectorRegs(
256     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of vector registers."));
258 
259 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
260     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's max interleave factor for "
262              "scalar loops."));
263 
264 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
265     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
266     cl::desc("A flag that overrides the target's max interleave factor for "
267              "vectorized loops."));
268 
269 static cl::opt<unsigned> ForceTargetInstructionCost(
270     "force-target-instruction-cost", cl::init(0), cl::Hidden,
271     cl::desc("A flag that overrides the target's expected cost for "
272              "an instruction to a single constant value. Mostly "
273              "useful for getting consistent testing."));
274 
275 static cl::opt<unsigned> SmallLoopCost(
276     "small-loop-cost", cl::init(20), cl::Hidden,
277     cl::desc(
278         "The cost of a loop that is considered 'small' by the interleaver."));
279 
280 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
281     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
282     cl::desc("Enable the use of the block frequency analysis to access PGO "
283              "heuristics minimizing code growth in cold regions and being more "
284              "aggressive in hot regions."));
285 
286 // Runtime interleave loops for load/store throughput.
287 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
288     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
289     cl::desc(
290         "Enable runtime interleaving until load/store ports are saturated"));
291 
292 /// Interleave small loops with scalar reductions.
293 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
294     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
295     cl::desc("Enable interleaving for loops with small iteration counts that "
296              "contain scalar reductions to expose ILP."));
297 
298 /// The number of stores in a loop that are allowed to need predication.
299 static cl::opt<unsigned> NumberOfStoresToPredicate(
300     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
301     cl::desc("Max number of stores to be predicated behind an if."));
302 
303 static cl::opt<bool> EnableIndVarRegisterHeur(
304     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
305     cl::desc("Count the induction variable only once when interleaving"));
306 
307 static cl::opt<bool> EnableCondStoresVectorization(
308     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
309     cl::desc("Enable if-predication of stores during vectorization."));
310 
311 static cl::opt<unsigned> MaxNestedScalarReductionIC(
312     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
313     cl::desc("The maximum interleave count to use when interleaving a scalar "
314              "reduction in a nested loop."));
315 
316 static cl::opt<bool>
317     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
318                            cl::Hidden,
319                            cl::desc("Prefer in-loop vector reductions, "
320                                     "overriding the target's preference."));
321 
322 static cl::opt<bool> PreferPredicatedReductionSelect(
323     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
324     cl::desc(
325         "Prefer predicating a reduction operation over an after-loop select."));
326 
327 cl::opt<bool> EnableVPlanNativePath(
328     "enable-vplan-native-path", cl::init(false), cl::Hidden,
329     cl::desc("Enable VPlan-native vectorization path with "
330              "support for outer loop vectorization."));
331 
332 // FIXME: Remove this switch once we have divergence analysis. Currently we
333 // assume divergent non-backedge branches when this switch is true.
334 cl::opt<bool> EnableVPlanPredication(
335     "enable-vplan-predication", cl::init(false), cl::Hidden,
336     cl::desc("Enable VPlan-native vectorization path predicator with "
337              "support for outer loop vectorization."));
338 
339 // This flag enables the stress testing of the VPlan H-CFG construction in the
340 // VPlan-native vectorization path. It must be used in conjunction with
341 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
342 // verification of the H-CFGs built.
343 static cl::opt<bool> VPlanBuildStressTest(
344     "vplan-build-stress-test", cl::init(false), cl::Hidden,
345     cl::desc(
346         "Build VPlan for every supported loop nest in the function and bail "
347         "out right after the build (stress test the VPlan H-CFG construction "
348         "in the VPlan-native vectorization path)."));
349 
350 cl::opt<bool> llvm::EnableLoopInterleaving(
351     "interleave-loops", cl::init(true), cl::Hidden,
352     cl::desc("Enable loop interleaving in Loop vectorization passes"));
353 cl::opt<bool> llvm::EnableLoopVectorization(
354     "vectorize-loops", cl::init(true), cl::Hidden,
355     cl::desc("Run the Loop vectorization passes"));
356 
357 /// A helper function that returns the type of a loaded or stored value.
358 static Type *getMemInstValueType(Value *I) {
359   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
360          "Expected Load or Store instruction");
361   if (auto *LI = dyn_cast<LoadInst>(I))
362     return LI->getType();
363   return cast<StoreInst>(I)->getValueOperand()->getType();
364 }
365 
366 /// A helper function that returns true if the given type is irregular. The
367 /// type is irregular if its allocated size doesn't equal the store size of an
368 /// element of the corresponding vector type at the given vectorization factor.
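/// For example, on common targets x86_fp80 is irregular: it stores 80 bits of
/// data but is allocated 96 or 128 bits, so an array of x86_fp80 is not
/// bitcast-compatible with a <VF x x86_fp80> vector.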
369 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
370   // Determine if an array of VF elements of type Ty is "bitcast compatible"
371   // with a <VF x Ty> vector.
372   if (VF.isVector()) {
373     auto *VectorTy = VectorType::get(Ty, VF);
374     return TypeSize::get(VF.getKnownMinValue() *
375                              DL.getTypeAllocSize(Ty).getFixedValue(),
376                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
377   }
378 
379   // If the vectorization factor is one, we just check if an array of type Ty
380   // requires padding between elements.
381   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
382 }
383 
384 /// A helper function that returns the reciprocal of the block probability of
385 /// predicated blocks. If we return X, we are assuming the predicated block
386 /// will execute once for every X iterations of the loop header.
387 ///
388 /// TODO: We should use actual block probability here, if available. Currently,
389 ///       we always assume predicated blocks have a 50% chance of executing.
390 static unsigned getReciprocalPredBlockProb() { return 2; }
391 
392 /// A helper function that adds a 'fast' flag to floating-point operations.
393 static Value *addFastMathFlag(Value *V) {
394   if (isa<FPMathOperator>(V))
395     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
396   return V;
397 }
398 
399 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
400   if (isa<FPMathOperator>(V))
401     cast<Instruction>(V)->setFastMathFlags(FMF);
402   return V;
403 }
404 
405 /// A helper function that returns an integer or floating-point constant with
406 /// value C.
407 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
408   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
409                            : ConstantFP::get(Ty, C);
410 }
411 
412 /// Returns "best known" trip count for the specified loop \p L as defined by
413 /// the following procedure:
414 ///   1) Returns exact trip count if it is known.
415 ///   2) Returns expected trip count according to profile data if any.
416 ///   3) Returns upper bound estimate if it is known.
417 ///   4) Returns None if all of the above failed.
418 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
419   // Check if exact trip count is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
421     return ExpectedTC;
422 
423   // Check if there is an expected trip count available from profile data.
424   if (LoopVectorizeWithBlockFrequency)
425     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
426       return EstimatedTC;
427 
428   // Check if upper bound estimate is known.
429   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
430     return ExpectedTC;
431 
432   return None;
433 }
434 
435 namespace llvm {
436 
437 /// InnerLoopVectorizer vectorizes loops which contain only one basic
438 /// block to a specified vectorization factor (VF).
439 /// This class performs the widening of scalars into vectors, or multiple
440 /// scalars. This class also implements the following features:
441 /// * It inserts an epilogue loop for handling loops that don't have iteration
442 ///   counts that are known to be a multiple of the vectorization factor.
443 /// * It handles the code generation for reduction variables.
444 /// * Scalarization (implementation using scalars) of un-vectorizable
445 ///   instructions.
446 /// InnerLoopVectorizer does not perform any vectorization-legality
447 /// checks, and relies on the caller to check for the different legality
448 /// aspects. The InnerLoopVectorizer relies on the
449 /// LoopVectorizationLegality class to provide information about the induction
450 /// and reduction variables that were found for a given vectorization factor.
451 class InnerLoopVectorizer {
452 public:
453   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
454                       LoopInfo *LI, DominatorTree *DT,
455                       const TargetLibraryInfo *TLI,
456                       const TargetTransformInfo *TTI, AssumptionCache *AC,
457                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
458                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
459                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
460                       ProfileSummaryInfo *PSI)
461       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
462         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
463         Builder(PSE.getSE()->getContext()),
464         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
465         BFI(BFI), PSI(PSI) {
466     // Query this against the original loop and save it here because the profile
467     // of the original loop header may change as the transformation happens.
468     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
469         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
470   }
471 
472   virtual ~InnerLoopVectorizer() = default;
473 
474   /// Create a new empty loop that will contain vectorized instructions later
475   /// on, while the old loop will be used as the scalar remainder. Control flow
476   /// is generated around the vectorized (and scalar epilogue) loops consisting
477   /// of various checks and bypasses. Return the pre-header block of the new
478   /// loop.
479   /// In the case of epilogue vectorization, this function is overridden to
480   /// handle the more complex control flow around the loops.
481   virtual BasicBlock *createVectorizedLoopSkeleton();
482 
483   /// Widen a single instruction within the innermost loop.
484   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
485                         VPTransformState &State);
486 
487   /// Widen a single call instruction within the innermost loop.
488   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
489                             VPTransformState &State);
490 
491   /// Widen a single select instruction within the innermost loop.
492   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
493                               bool InvariantCond, VPTransformState &State);
494 
495   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
496   void fixVectorizedLoop();
497 
498   // Return true if any runtime check is added.
499   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
500 
501   /// A type for vectorized values in the new loop. Each value from the
502   /// original loop, when vectorized, is represented by UF vector values in the
503   /// new unrolled loop, where UF is the unroll factor.
504   using VectorParts = SmallVector<Value *, 2>;
505 
506   /// Vectorize a single GetElementPtrInst based on information gathered and
507   /// decisions taken during planning.
508   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
509                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
510                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
511 
512   /// Vectorize a single PHINode in a block. This method handles the induction
513   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
514   /// arbitrary length vectors.
515   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
516 
517   /// A helper function to scalarize a single Instruction in the innermost loop.
518   /// Generates a sequence of scalar instances for each lane between \p MinLane
519   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
520   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
521   /// Instr's operands.
522   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
523                             const VPIteration &Instance, bool IfPredicateInstr,
524                             VPTransformState &State);
525 
526   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
527   /// is provided, the integer induction variable will first be truncated to
528   /// the corresponding type.
529   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
530 
531   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
532   /// vector or scalar value on-demand if one is not yet available. When
533   /// vectorizing a loop, we visit the definition of an instruction before its
534   /// uses. When visiting the definition, we either vectorize or scalarize the
535   /// instruction, creating an entry for it in the corresponding map. (In some
536   /// cases, such as induction variables, we will create both vector and scalar
537   /// entries.) Then, as we encounter uses of the definition, we derive values
538   /// for each scalar or vector use unless such a value is already available.
539   /// For example, if we scalarize a definition and one of its uses is vector,
540   /// we build the required vector on-demand with an insertelement sequence
541   /// when visiting the use. Otherwise, if the use is scalar, we can use the
542   /// existing scalar definition.
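  /// For example, at VF = 4 a scalarized definition has four scalar copies,
  /// one per lane; the first vector use triggers an insertelement sequence
  /// that packs those lanes into a single wide value, which is then cached in
  /// VectorLoopValueMap for subsequent uses.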
543   ///
544   /// Return a value in the new loop corresponding to \p V from the original
545   /// loop at unroll index \p Part. If the value has already been vectorized,
546   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
547   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
548   /// a new vector value on-demand by inserting the scalar values into a vector
549   /// with an insertelement sequence. If the value has been neither vectorized
550   /// nor scalarized, it must be loop invariant, so we simply broadcast the
551   /// value into a vector.
552   Value *getOrCreateVectorValue(Value *V, unsigned Part);
553 
554   void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
555     VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
556   }
557 
558   /// Return a value in the new loop corresponding to \p V from the original
559   /// loop at unroll and vector indices \p Instance. If the value has been
560   /// vectorized but not scalarized, the necessary extractelement instruction
561   /// will be generated.
562   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
563 
564   /// Construct the vector value of a scalarized value \p V one lane at a time.
565   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
566 
567   /// Try to vectorize interleaved access group \p Group with the base address
568   /// given in \p Addr, optionally masking the vector operations if \p
569   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
570   /// values in the vectorized loop.
571   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
572                                 VPTransformState &State, VPValue *Addr,
573                                 ArrayRef<VPValue *> StoredValues,
574                                 VPValue *BlockInMask = nullptr);
575 
576   /// Vectorize Load and Store instructions with the base address given in \p
577   /// Addr, optionally masking the vector operations if \p BlockInMask is
578   /// non-null. Use \p State to translate given VPValues to IR values in the
579   /// vectorized loop.
580   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
581                                   VPValue *Def, VPValue *Addr,
582                                   VPValue *StoredValue, VPValue *BlockInMask);
583 
584   /// Set the debug location in the builder using the debug location in
585   /// the instruction.
586   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
587 
588   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
589   void fixNonInductionPHIs(void);
590 
591 protected:
592   friend class LoopVectorizationPlanner;
593 
594   /// A small list of PHINodes.
595   using PhiVector = SmallVector<PHINode *, 4>;
596 
597   /// A type for scalarized values in the new loop. Each value from the
598   /// original loop, when scalarized, is represented by UF x VF scalar values
599   /// in the new unrolled loop, where UF is the unroll factor and VF is the
600   /// vectorization factor.
601   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
602 
603   /// Set up the values of the IVs correctly when exiting the vector loop.
604   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
605                     Value *CountRoundDown, Value *EndValue,
606                     BasicBlock *MiddleBlock);
607 
608   /// Create a new induction variable inside L.
609   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
610                                    Value *Step, Instruction *DL);
611 
612   /// Handle all cross-iteration phis in the header.
613   void fixCrossIterationPHIs();
614 
615   /// Fix a first-order recurrence. This is the second phase of vectorizing
616   /// this phi node.
617   void fixFirstOrderRecurrence(PHINode *Phi);
618 
619   /// Fix a reduction cross-iteration phi. This is the second phase of
620   /// vectorizing this phi node.
621   void fixReduction(PHINode *Phi);
622 
623   /// Clear NSW/NUW flags from reduction instructions if necessary.
624   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
625 
626   /// The Loop exit block may have single value PHI nodes with some
627   /// incoming value. While vectorizing we only handled real values
628   /// that were defined inside the loop and we should have one value for
629   /// each predecessor of its parent basic block. See PR14725.
630   void fixLCSSAPHIs();
631 
632   /// Iteratively sink the scalarized operands of a predicated instruction into
633   /// the block that was created for it.
634   void sinkScalarOperands(Instruction *PredInst);
635 
636   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
637   /// represented as.
638   void truncateToMinimalBitwidths();
639 
640   /// Create a broadcast instruction. This method generates a broadcast
641   /// instruction (shuffle) for loop invariant values and for the induction
642   /// value. If this is the induction variable then we extend it to N, N+1, ...
643   /// this is needed because each iteration in the loop corresponds to a SIMD
644   /// element.
645   virtual Value *getBroadcastInstrs(Value *V);
646 
647   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
648   /// to each vector element of Val. The sequence starts at StartIdx.
649   /// \p Opcode is relevant for FP induction variables.
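  /// For example, with StartIdx == 0, \p Step == S and VF == 4, a splat
  /// <A, A, A, A> becomes <A, A + S, A + 2*S, A + 3*S>.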
650   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
651                                Instruction::BinaryOps Opcode =
652                                Instruction::BinaryOpsEnd);
653 
654   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
655   /// variable on which to base the steps, \p Step is the size of the step, and
656   /// \p EntryVal is the value from the original loop that maps to the steps.
657   /// Note that \p EntryVal doesn't have to be an induction variable - it
658   /// can also be a truncate instruction.
659   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
660                         const InductionDescriptor &ID);
661 
662   /// Create a vector induction phi node based on an existing scalar one. \p
663   /// EntryVal is the value from the original loop that maps to the vector phi
664   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
665   /// truncate instruction, instead of widening the original IV, we widen a
666   /// version of the IV truncated to \p EntryVal's type.
667   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
668                                        Value *Step, Instruction *EntryVal);
669 
670   /// Returns true if an instruction \p I should be scalarized instead of
671   /// vectorized for the chosen vectorization factor.
672   bool shouldScalarizeInstruction(Instruction *I) const;
673 
674   /// Returns true if we should generate a scalar version of \p IV.
675   bool needsScalarInduction(Instruction *IV) const;
676 
677   /// If there is a cast involved in the induction variable \p ID, which should
678   /// be ignored in the vectorized loop body, this function records the
679   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
680   /// cast. We had already proved that the casted Phi is equal to the uncasted
681   /// Phi in the vectorized loop (under a runtime guard), and therefore
682   /// there is no need to vectorize the cast - the same value can be used in the
683   /// vector loop for both the Phi and the cast.
684   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
685   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
686   ///
687   /// \p EntryVal is the value from the original loop that maps to the vector
688   /// phi node and is used to distinguish what is the IV currently being
689   /// processed - original one (if \p EntryVal is a phi corresponding to the
690   /// original IV) or the "newly-created" one based on the proof mentioned above
691   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
692   /// latter case \p EntryVal is a TruncInst and we must not record anything for
693   /// that IV, but it's error-prone to expect callers of this routine to care
694   /// about that, hence this explicit parameter.
695   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
696                                              const Instruction *EntryVal,
697                                              Value *VectorLoopValue,
698                                              unsigned Part,
699                                              unsigned Lane = UINT_MAX);
700 
701   /// Generate a shuffle sequence that will reverse the vector Vec.
702   virtual Value *reverseVector(Value *Vec);
703 
704   /// Returns (and creates if needed) the original loop trip count.
705   Value *getOrCreateTripCount(Loop *NewLoop);
706 
707   /// Returns (and creates if needed) the trip count of the widened loop.
708   Value *getOrCreateVectorTripCount(Loop *NewLoop);
709 
710   /// Returns a bitcasted value to the requested vector type.
711   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
712   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
713                                 const DataLayout &DL);
714 
715   /// Emit a bypass check to see if the vector trip count is zero, including if
716   /// it overflows.
717   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
718 
719   /// Emit a bypass check to see if all of the SCEV assumptions we've
720   /// had to make are correct.
721   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
722 
723   /// Emit bypass checks to check any memory assumptions we may have made.
724   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
725 
726   /// Compute the transformed value of Index at offset StartValue using step
727   /// StepValue.
728   /// For integer induction, returns StartValue + Index * StepValue.
729   /// For pointer induction, returns StartValue[Index * StepValue].
730   /// FIXME: The newly created binary instructions should contain nsw/nuw
731   /// flags, which can be found from the original scalar operations.
732   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
733                               const DataLayout &DL,
734                               const InductionDescriptor &ID) const;
735 
736   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
737   /// vector loop preheader, middle block and scalar preheader. Also
738   /// allocate a loop object for the new vector loop and return it.
739   Loop *createVectorLoopSkeleton(StringRef Prefix);
740 
741   /// Create new phi nodes for the induction variables to resume iteration count
742   /// in the scalar epilogue, from where the vectorized loop left off (given by
743   /// \p VectorTripCount).
744   /// In cases where the loop skeleton is more complicated (e.g. epilogue
745   /// vectorization) and the resume values can come from an additional bypass
746   /// block, the \p AdditionalBypass pair provides information about the bypass
747   /// block and the end value on the edge from bypass to this loop.
748   void createInductionResumeValues(
749       Loop *L, Value *VectorTripCount,
750       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
751 
752   /// Complete the loop skeleton by adding debug MDs, creating appropriate
753   /// conditional branches in the middle block, preparing the builder and
754   /// running the verifier. Take in the vector loop \p L as argument, and return
755   /// the preheader of the completed vector loop.
756   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
757 
758   /// Add additional metadata to \p To that was not present on \p Orig.
759   ///
760   /// Currently this is used to add the noalias annotations based on the
761   /// inserted memchecks.  Use this for instructions that are *cloned* into the
762   /// vector loop.
763   void addNewMetadata(Instruction *To, const Instruction *Orig);
764 
765   /// Add metadata from one instruction to another.
766   ///
767   /// This includes both the original MDs from \p From and additional ones (\see
768   /// addNewMetadata).  Use this for *newly created* instructions in the vector
769   /// loop.
770   void addMetadata(Instruction *To, Instruction *From);
771 
772   /// Similar to the previous function but it adds the metadata to a
773   /// vector of instructions.
774   void addMetadata(ArrayRef<Value *> To, Instruction *From);
775 
776   /// Allow subclasses to override and print debug traces before/after vplan
777   /// execution, when trace information is requested.
778   virtual void printDebugTracesAtStart(){};
779   virtual void printDebugTracesAtEnd(){};
780 
781   /// The original loop.
782   Loop *OrigLoop;
783 
784   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
785   /// dynamic knowledge to simplify SCEV expressions and converts them to a
786   /// more usable form.
787   PredicatedScalarEvolution &PSE;
788 
789   /// Loop Info.
790   LoopInfo *LI;
791 
792   /// Dominator Tree.
793   DominatorTree *DT;
794 
795   /// Alias Analysis.
796   AAResults *AA;
797 
798   /// Target Library Info.
799   const TargetLibraryInfo *TLI;
800 
801   /// Target Transform Info.
802   const TargetTransformInfo *TTI;
803 
804   /// Assumption Cache.
805   AssumptionCache *AC;
806 
807   /// Interface to emit optimization remarks.
808   OptimizationRemarkEmitter *ORE;
809 
810   /// LoopVersioning.  It's only set up (non-null) if memchecks were
811   /// used.
812   ///
813   /// This is currently only used to add no-alias metadata based on the
814   /// memchecks.  The actual versioning is performed manually.
815   std::unique_ptr<LoopVersioning> LVer;
816 
817   /// The vectorization SIMD factor to use. Each vector will have this many
818   /// vector elements.
819   ElementCount VF;
820 
821   /// The vectorization unroll factor to use. Each scalar is vectorized to this
822   /// many different vector instructions.
823   unsigned UF;
824 
825   /// The builder that we use
826   IRBuilder<> Builder;
827 
828   // --- Vectorization state ---
829 
830   /// The vector-loop preheader.
831   BasicBlock *LoopVectorPreHeader;
832 
833   /// The scalar-loop preheader.
834   BasicBlock *LoopScalarPreHeader;
835 
836   /// Middle Block between the vector and the scalar.
837   BasicBlock *LoopMiddleBlock;
838 
839   /// The ExitBlock of the scalar loop.
840   BasicBlock *LoopExitBlock;
841 
842   /// The vector loop body.
843   BasicBlock *LoopVectorBody;
844 
845   /// The scalar loop body.
846   BasicBlock *LoopScalarBody;
847 
848   /// A list of all bypass blocks. The first block is the entry of the loop.
849   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
850 
851   /// The new Induction variable which was added to the new block.
852   PHINode *Induction = nullptr;
853 
854   /// The induction variable of the old basic block.
855   PHINode *OldInduction = nullptr;
856 
857   /// Maps values from the original loop to their corresponding values in the
858   /// vectorized loop. A key value can map to either vector values, scalar
859   /// values or both kinds of values, depending on whether the key was
860   /// vectorized and scalarized.
861   VectorizerValueMap VectorLoopValueMap;
862 
863   /// Store instructions that were predicated.
864   SmallVector<Instruction *, 4> PredicatedInstructions;
865 
866   /// Trip count of the original loop.
867   Value *TripCount = nullptr;
868 
869   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
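  /// For example, a TripCount of 103 with VF = 4 and UF = 2 gives a
  /// VectorTripCount of 96.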
870   Value *VectorTripCount = nullptr;
871 
872   /// The legality analysis.
873   LoopVectorizationLegality *Legal;
874 
875   /// The profitablity analysis.
876   LoopVectorizationCostModel *Cost;
877 
878   // Record whether runtime checks are added.
879   bool AddedSafetyChecks = false;
880 
881   // Holds the end values for each induction variable. We save the end values
882   // so we can later fix-up the external users of the induction variables.
883   DenseMap<PHINode *, Value *> IVEndValues;
884 
885   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
886   // fixed up at the end of vector code generation.
887   SmallVector<PHINode *, 8> OrigPHIsToFix;
888 
889   /// BFI and PSI are used to check for profile guided size optimizations.
890   BlockFrequencyInfo *BFI;
891   ProfileSummaryInfo *PSI;
892 
893   // Whether this loop should be optimized for size based on profile guided size
894   // optimizations.
895   bool OptForSizeBasedOnProfile;
896 };
897 
898 class InnerLoopUnroller : public InnerLoopVectorizer {
899 public:
900   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
901                     LoopInfo *LI, DominatorTree *DT,
902                     const TargetLibraryInfo *TLI,
903                     const TargetTransformInfo *TTI, AssumptionCache *AC,
904                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
905                     LoopVectorizationLegality *LVL,
906                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
907                     ProfileSummaryInfo *PSI)
908       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
909                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
910                             BFI, PSI) {}
911 
912 private:
913   Value *getBroadcastInstrs(Value *V) override;
914   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
915                        Instruction::BinaryOps Opcode =
916                        Instruction::BinaryOpsEnd) override;
917   Value *reverseVector(Value *Vec) override;
918 };
919 
920 /// Encapsulate information regarding vectorization of a loop and its epilogue.
921 /// This information is meant to be updated and used across two stages of
922 /// epilogue vectorization.
923 struct EpilogueLoopVectorizationInfo {
924   ElementCount MainLoopVF = ElementCount::getFixed(0);
925   unsigned MainLoopUF = 0;
926   ElementCount EpilogueVF = ElementCount::getFixed(0);
927   unsigned EpilogueUF = 0;
928   BasicBlock *MainLoopIterationCountCheck = nullptr;
929   BasicBlock *EpilogueIterationCountCheck = nullptr;
930   BasicBlock *SCEVSafetyCheck = nullptr;
931   BasicBlock *MemSafetyCheck = nullptr;
932   Value *TripCount = nullptr;
933   Value *VectorTripCount = nullptr;
934 
935   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
936                                 unsigned EUF)
937       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
938         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
939     assert(EUF == 1 &&
940            "A high UF for the epilogue loop is likely not beneficial.");
941   }
942 };
943 
944 /// An extension of the inner loop vectorizer that creates a skeleton for a
945 /// vectorized loop that has its epilogue (residual) also vectorized.
946 /// The idea is to run the vplan on a given loop twice, first to set up the
947 /// skeleton and vectorize the main loop, and second to complete the skeleton
948 /// from the first step and vectorize the epilogue.  This is achieved by
949 /// deriving two concrete strategy classes from this base class and invoking
950 /// them in succession from the loop vectorizer planner.
951 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
952 public:
953   InnerLoopAndEpilogueVectorizer(
954       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
955       DominatorTree *DT, const TargetLibraryInfo *TLI,
956       const TargetTransformInfo *TTI, AssumptionCache *AC,
957       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
958       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
959       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
960       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
961                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
962         EPI(EPI) {}
963 
964   // Override this function to handle the more complex control flow around the
965   // three loops.
966   BasicBlock *createVectorizedLoopSkeleton() final override {
967     return createEpilogueVectorizedLoopSkeleton();
968   }
969 
970   /// The interface for creating a vectorized skeleton using one of two
971   /// different strategies, each corresponding to one execution of the vplan
972   /// as described above.
973   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
974 
975   /// Holds and updates state information required to vectorize the main loop
976   /// and its epilogue in two separate passes. This setup helps us avoid
977   /// regenerating and recomputing runtime safety checks. It also helps us to
978   /// shorten the iteration-count-check path length for the cases where the
979   /// iteration count of the loop is so small that the main vector loop is
980   /// completely skipped.
981   EpilogueLoopVectorizationInfo &EPI;
982 };
983 
984 /// A specialized derived class of inner loop vectorizer that performs
985 /// vectorization of *main* loops in the process of vectorizing loops and their
986 /// epilogues.
987 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
988 public:
989   EpilogueVectorizerMainLoop(
990       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
991       DominatorTree *DT, const TargetLibraryInfo *TLI,
992       const TargetTransformInfo *TTI, AssumptionCache *AC,
993       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
994       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
995       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
996       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
997                                        EPI, LVL, CM, BFI, PSI) {}
998   /// Implements the interface for creating a vectorized skeleton using the
999   /// *main loop* strategy (i.e. the first pass of vplan execution).
1000   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1001 
1002 protected:
1003   /// Emits an iteration count bypass check once for the main loop (when \p
1004   /// ForEpilogue is false) and once for the epilogue loop (when \p
1005   /// ForEpilogue is true).
1006   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
1007                                              bool ForEpilogue);
1008   void printDebugTracesAtStart() override;
1009   void printDebugTracesAtEnd() override;
1010 };
1011 
1012 // A specialized derived class of inner loop vectorizer that performs
1013 // vectorization of *epilogue* loops in the process of vectorizing loops and
1014 // their epilogues.
1015 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1016 public:
1017   EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
1018                     LoopInfo *LI, DominatorTree *DT,
1019                     const TargetLibraryInfo *TLI,
1020                     const TargetTransformInfo *TTI, AssumptionCache *AC,
1021                     OptimizationRemarkEmitter *ORE,
1022                     EpilogueLoopVectorizationInfo &EPI,
1023                     LoopVectorizationLegality *LVL,
1024                     llvm::LoopVectorizationCostModel *CM,
1025                     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
1026       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1027                                        EPI, LVL, CM, BFI, PSI) {}
1028   /// Implements the interface for creating a vectorized skeleton using the
1029   /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
1030   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1031 
1032 protected:
1033   /// Emits an iteration count bypass check after the main vector loop has
1034   /// finished to see if there are any iterations left to execute by either
1035   /// the vector epilogue or the scalar epilogue.
1036   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1037                                                       BasicBlock *Bypass,
1038                                                       BasicBlock *Insert);
1039   void printDebugTracesAtStart() override;
1040   void printDebugTracesAtEnd() override;
1041 };
1042 } // end namespace llvm
1043 
1044 /// Look for a meaningful debug location on the instruction or its
1045 /// operands.
1046 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1047   if (!I)
1048     return I;
1049 
1050   DebugLoc Empty;
1051   if (I->getDebugLoc() != Empty)
1052     return I;
1053 
1054   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
1055     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
1056       if (OpInst->getDebugLoc() != Empty)
1057         return OpInst;
1058   }
1059 
1060   return I;
1061 }
1062 
1063 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1064   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1065     const DILocation *DIL = Inst->getDebugLoc();
1066     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1067         !isa<DbgInfoIntrinsic>(Inst)) {
1068       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1069       auto NewDIL =
1070           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1071       if (NewDIL)
1072         B.SetCurrentDebugLocation(NewDIL.getValue());
1073       else
1074         LLVM_DEBUG(dbgs()
1075                    << "Failed to create new discriminator: "
1076                    << DIL->getFilename() << " Line: " << DIL->getLine());
1077     }
1078     else
1079       B.SetCurrentDebugLocation(DIL);
1080   } else
1081     B.SetCurrentDebugLocation(DebugLoc());
1082 }
1083 
1084 /// Write a record \p DebugMsg about vectorization failure to the debug
1085 /// output stream. If \p I is passed, it is an instruction that prevents
1086 /// vectorization.
1087 #ifndef NDEBUG
1088 static void debugVectorizationFailure(const StringRef DebugMsg,
1089     Instruction *I) {
1090   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1091   if (I != nullptr)
1092     dbgs() << " " << *I;
1093   else
1094     dbgs() << '.';
1095   dbgs() << '\n';
1096 }
1097 #endif
1098 
1099 /// Create an analysis remark that explains why vectorization failed
1100 ///
1101 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1102 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1103 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1104 /// the location of the remark.  \return the remark object that can be
1105 /// streamed to.
1106 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1107     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1108   Value *CodeRegion = TheLoop->getHeader();
1109   DebugLoc DL = TheLoop->getStartLoc();
1110 
1111   if (I) {
1112     CodeRegion = I->getParent();
1113     // If there is no debug location attached to the instruction, fall back to
1114     // using the loop's.
1115     if (I->getDebugLoc())
1116       DL = I->getDebugLoc();
1117   }
1118 
1119   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1120   R << "loop not vectorized: ";
1121   return R;
1122 }
1123 
1124 /// Return a value for Step multiplied by VF.
1125 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1126   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1127   Constant *StepVal = ConstantInt::get(
1128       Step->getType(),
1129       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
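  // For a scalable VF the actual step is StepVal * vscale, which is only known
  // at runtime, so emit a call to llvm.vscale and scale the constant by it.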
1130   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1131 }
1132 
1133 namespace llvm {
1134 
1135 void reportVectorizationFailure(const StringRef DebugMsg,
1136     const StringRef OREMsg, const StringRef ORETag,
1137     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1138   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1139   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1140   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1141                 ORETag, TheLoop, I) << OREMsg);
1142 }
1143 
1144 } // end namespace llvm
1145 
1146 #ifndef NDEBUG
1147 /// \return string containing a file name and a line # for the given loop.
1148 static std::string getDebugLocString(const Loop *L) {
1149   std::string Result;
1150   if (L) {
1151     raw_string_ostream OS(Result);
1152     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1153       LoopDbgLoc.print(OS);
1154     else
1155       // Just print the module name.
1156       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1157     OS.flush();
1158   }
1159   return Result;
1160 }
1161 #endif
1162 
1163 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1164                                          const Instruction *Orig) {
1165   // If the loop was versioned with memchecks, add the corresponding no-alias
1166   // metadata.
1167   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1168     LVer->annotateInstWithNoAlias(To, Orig);
1169 }
1170 
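/// Attach metadata to the widened instruction \p To: propagate the metadata
/// of \p From and, if the loop was versioned with memchecks, the corresponding
/// no-alias scopes as well.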
1171 void InnerLoopVectorizer::addMetadata(Instruction *To,
1172                                       Instruction *From) {
1173   propagateMetadata(To, From);
1174   addNewMetadata(To, From);
1175 }
1176 
1177 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1178                                       Instruction *From) {
1179   for (Value *V : To) {
1180     if (Instruction *I = dyn_cast<Instruction>(V))
1181       addMetadata(I, From);
1182   }
1183 }
1184 
1185 namespace llvm {
1186 
1187 // Hints for the loop vectorization cost model on how the scalar epilogue loop
1188 // should be lowered.
1189 enum ScalarEpilogueLowering {
1190 
1191   // The default: allowing scalar epilogues.
1192   CM_ScalarEpilogueAllowed,
1193 
1194   // Vectorization with OptForSize: don't allow epilogues.
1195   CM_ScalarEpilogueNotAllowedOptSize,
1196 
1197   // A special case of vectorization with OptForSize: loops with a very small
1198   // trip count are considered for vectorization under OptForSize, thereby
1199   // making sure the cost of their loop body is dominant, free of runtime
1200   // guards and scalar iteration overheads.
1201   CM_ScalarEpilogueNotAllowedLowTripLoop,
1202 
1203   // Loop hint predicate indicating an epilogue is undesired.
1204   CM_ScalarEpilogueNotNeededUsePredicate
1205 };
1206 
1207 /// LoopVectorizationCostModel - estimates the expected speedups due to
1208 /// vectorization.
1209 /// In many cases vectorization is not profitable. This can happen because of
1210 /// a number of reasons. In this class we mainly attempt to predict the
1211 /// expected speedup/slowdowns due to the supported instruction set. We use the
1212 /// TargetTransformInfo to query the different backends for the cost of
1213 /// different operations.
1214 class LoopVectorizationCostModel {
1215 public:
1216   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1217                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1218                              LoopVectorizationLegality *Legal,
1219                              const TargetTransformInfo &TTI,
1220                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1221                              AssumptionCache *AC,
1222                              OptimizationRemarkEmitter *ORE, const Function *F,
1223                              const LoopVectorizeHints *Hints,
1224                              InterleavedAccessInfo &IAI)
1225       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1226         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1227         Hints(Hints), InterleaveInfo(IAI) {}
1228 
1229   /// \return An upper bound for the vectorization factor, or None if
1230   /// vectorization and interleaving should be avoided up front.
1231   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1232 
1233   /// \return True if runtime checks are required for vectorization, and false
1234   /// otherwise.
1235   bool runtimeChecksRequired();
1236 
1237   /// \return The most profitable vectorization factor and the cost of that VF.
1238   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1239   /// then this vectorization factor will be selected if vectorization is
1240   /// possible.
1241   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1242   VectorizationFactor
1243   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1244                                     const LoopVectorizationPlanner &LVP);
1245 
1246   /// Setup cost-based decisions for user vectorization factor.
1247   void selectUserVectorizationFactor(ElementCount UserVF) {
1248     collectUniformsAndScalars(UserVF);
1249     collectInstsToScalarize(UserVF);
1250   }
1251 
1252   /// \return The size (in bits) of the smallest and widest types in the code
1253   /// that needs to be vectorized. We ignore values that remain scalar such as
1254   /// 64 bit loop indices.
1255   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1256 
1257   /// \return The desired interleave count.
1258   /// If interleave count has been specified by metadata it will be returned.
1259   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1260   /// are the selected vectorization factor and the cost of the selected VF.
1261   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1262 
1263   /// A memory access instruction may be vectorized in more than one way.
1264   /// The form of the instruction after vectorization depends on its cost.
1265   /// This function takes cost-based decisions for Load/Store instructions
1266   /// and collects them in a map. This decision map is used for building
1267   /// the lists of loop-uniform and loop-scalar instructions.
1268   /// The calculated cost is saved with the widening decision in order to
1269   /// avoid redundant calculations.
1270   void setCostBasedWideningDecision(ElementCount VF);
1271 
1272   /// A struct that represents some properties of the register usage
1273   /// of a loop.
1274   struct RegisterUsage {
1275     /// Holds the number of loop invariant values that are used in the loop.
1276     /// The key is ClassID of target-provided register class.
1277     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1278     /// Holds the maximum number of concurrent live intervals in the loop.
1279     /// The key is ClassID of target-provided register class.
1280     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1281   };
1282 
1283   /// \return Information about the register usage of the loop for the
1284   /// given vectorization factors.
1285   SmallVector<RegisterUsage, 8>
1286   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1287 
1288   /// Collect values we want to ignore in the cost model.
1289   void collectValuesToIgnore();
1290 
1291   /// Split reductions into those that happen in the loop, and those that happen
1292   /// outside. In-loop reductions are collected into InLoopReductionChains.
1293   void collectInLoopReductions();
1294 
1295   /// \returns The smallest bitwidth each instruction can be represented with.
1296   /// The vector equivalents of these instructions should be truncated to this
1297   /// type.
1298   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1299     return MinBWs;
1300   }
1301 
1302   /// \returns True if it is more profitable to scalarize instruction \p I for
1303   /// vectorization factor \p VF.
1304   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1305     assert(VF.isVector() &&
1306            "Profitable to scalarize relevant only for VF > 1.");
1307 
1308     // Cost model is not run in the VPlan-native path - return conservative
1309     // result until this changes.
1310     if (EnableVPlanNativePath)
1311       return false;
1312 
1313     auto Scalars = InstsToScalarize.find(VF);
1314     assert(Scalars != InstsToScalarize.end() &&
1315            "VF not yet analyzed for scalarization profitability");
1316     return Scalars->second.find(I) != Scalars->second.end();
1317   }
1318 
1319   /// Returns true if \p I is known to be uniform after vectorization.
1320   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1321     if (VF.isScalar())
1322       return true;
1323 
1324     // Cost model is not run in the VPlan-native path - return conservative
1325     // result until this changes.
1326     if (EnableVPlanNativePath)
1327       return false;
1328 
1329     auto UniformsPerVF = Uniforms.find(VF);
1330     assert(UniformsPerVF != Uniforms.end() &&
1331            "VF not yet analyzed for uniformity");
1332     return UniformsPerVF->second.count(I);
1333   }
1334 
1335   /// Returns true if \p I is known to be scalar after vectorization.
1336   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1337     if (VF.isScalar())
1338       return true;
1339 
1340     // Cost model is not run in the VPlan-native path - return conservative
1341     // result until this changes.
1342     if (EnableVPlanNativePath)
1343       return false;
1344 
1345     auto ScalarsPerVF = Scalars.find(VF);
1346     assert(ScalarsPerVF != Scalars.end() &&
1347            "Scalar values are not calculated for VF");
1348     return ScalarsPerVF->second.count(I);
1349   }
1350 
1351   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1352   /// for vectorization factor \p VF.
1353   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1354     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1355            !isProfitableToScalarize(I, VF) &&
1356            !isScalarAfterVectorization(I, VF);
1357   }
1358 
1359   /// Decision that was taken during cost calculation for memory instruction.
1360   enum InstWidening {
1361     CM_Unknown,
1362     CM_Widen,         // For consecutive accesses with stride +1.
1363     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1364     CM_Interleave,
1365     CM_GatherScatter,
1366     CM_Scalarize
1367   };
1368 
1369   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1370   /// instruction \p I and vector width \p VF.
1371   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1372                            unsigned Cost) {
1373     assert(VF.isVector() && "Expected VF >=2");
1374     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1375   }
1376 
1377   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1378   /// interleaving group \p Grp and vector width \p VF.
1379   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1380                            ElementCount VF, InstWidening W, unsigned Cost) {
1381     assert(VF.isVector() && "Expected VF >=2");
1382     /// Broadcast this decision to all instructions inside the group.
1383     /// But the cost will be assigned to one instruction only.
1384     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1385       if (auto *I = Grp->getMember(i)) {
1386         if (Grp->getInsertPos() == I)
1387           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1388         else
1389           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1390       }
1391     }
1392   }
1393 
1394   /// Return the cost model decision for the given instruction \p I and vector
1395   /// width \p VF. Return CM_Unknown if this instruction did not pass
1396   /// through the cost modeling.
1397   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1398     assert(VF.isVector() && "Expected VF to be a vector VF");
1399     // Cost model is not run in the VPlan-native path - return conservative
1400     // result until this changes.
1401     if (EnableVPlanNativePath)
1402       return CM_GatherScatter;
1403 
1404     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1405     auto Itr = WideningDecisions.find(InstOnVF);
1406     if (Itr == WideningDecisions.end())
1407       return CM_Unknown;
1408     return Itr->second.first;
1409   }
1410 
1411   /// Return the vectorization cost for the given instruction \p I and vector
1412   /// width \p VF.
1413   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1414     assert(VF.isVector() && "Expected VF >=2");
1415     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1416     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1417            "The cost is not calculated");
1418     return WideningDecisions[InstOnVF].second;
1419   }
1420 
1421   /// Return True if instruction \p I is an optimizable truncate whose operand
1422   /// is an induction variable. Such a truncate will be removed by adding a new
1423   /// induction variable with the destination type.
1424   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1425     // If the instruction is not a truncate, return false.
1426     auto *Trunc = dyn_cast<TruncInst>(I);
1427     if (!Trunc)
1428       return false;
1429 
1430     // Get the source and destination types of the truncate.
1431     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1432     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1433 
1434     // If the truncate is free for the given types, return false. Replacing a
1435     // free truncate with an induction variable would add an induction variable
1436     // update instruction to each iteration of the loop. We exclude from this
1437     // check the primary induction variable since it will need an update
1438     // instruction regardless.
1439     Value *Op = Trunc->getOperand(0);
1440     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1441       return false;
1442 
1443     // If the truncated value is not an induction variable, return false.
1444     return Legal->isInductionPhi(Op);
1445   }
1446 
1447   /// Collects the instructions to scalarize for each predicated instruction in
1448   /// the loop.
1449   void collectInstsToScalarize(ElementCount VF);
1450 
1451   /// Collect Uniform and Scalar values for the given \p VF.
1452   /// The sets depend on CM decision for Load/Store instructions
1453   /// that may be vectorized as interleave, gather-scatter or scalarized.
1454   void collectUniformsAndScalars(ElementCount VF) {
1455     // Do the analysis once.
1456     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1457       return;
1458     setCostBasedWideningDecision(VF);
1459     collectLoopUniforms(VF);
1460     collectLoopScalars(VF);
1461   }
1462 
1463   /// Returns true if the target machine supports masked store operation
1464   /// for the given \p DataType and kind of access to \p Ptr.
1465   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1466     return Legal->isConsecutivePtr(Ptr) &&
1467            TTI.isLegalMaskedStore(DataType, Alignment);
1468   }
1469 
1470   /// Returns true if the target machine supports masked load operation
1471   /// for the given \p DataType and kind of access to \p Ptr.
1472   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1473     return Legal->isConsecutivePtr(Ptr) &&
1474            TTI.isLegalMaskedLoad(DataType, Alignment);
1475   }
1476 
1477   /// Returns true if the target machine supports masked scatter operation
1478   /// for the given \p DataType.
1479   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1480     return TTI.isLegalMaskedScatter(DataType, Alignment);
1481   }
1482 
1483   /// Returns true if the target machine supports masked gather operation
1484   /// for the given \p DataType.
1485   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1486     return TTI.isLegalMaskedGather(DataType, Alignment);
1487   }
1488 
1489   /// Returns true if the target machine can represent \p V as a masked gather
1490   /// or scatter operation.
1491   bool isLegalGatherOrScatter(Value *V) {
1492     bool LI = isa<LoadInst>(V);
1493     bool SI = isa<StoreInst>(V);
1494     if (!LI && !SI)
1495       return false;
1496     auto *Ty = getMemInstValueType(V);
1497     Align Align = getLoadStoreAlignment(V);
1498     return (LI && isLegalMaskedGather(Ty, Align)) ||
1499            (SI && isLegalMaskedScatter(Ty, Align));
1500   }
1501 
1502   /// Returns true if \p I is an instruction that will be scalarized with
1503   /// predication. Such instructions include conditional stores and
1504   /// instructions that may divide by zero.
1505   /// If a non-zero VF has been calculated, we check if I will be scalarized
1506   /// with predication for that VF.
1507   bool isScalarWithPredication(Instruction *I,
1508                                ElementCount VF = ElementCount::getFixed(1));
1509 
1510   // Returns true if \p I is an instruction that will be predicated either
1511   // through scalar predication or masked load/store or masked gather/scatter.
1512   // Superset of instructions that return true for isScalarWithPredication.
1513   bool isPredicatedInst(Instruction *I) {
1514     if (!blockNeedsPredication(I->getParent()))
1515       return false;
1516     // Loads and stores that need some form of masked operation are predicated
1517     // instructions.
1518     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1519       return Legal->isMaskRequired(I);
1520     return isScalarWithPredication(I);
1521   }
1522 
1523   /// Returns true if \p I is a memory instruction with consecutive memory
1524   /// access that can be widened.
1525   bool
1526   memoryInstructionCanBeWidened(Instruction *I,
1527                                 ElementCount VF = ElementCount::getFixed(1));
1528 
1529   /// Returns true if \p I is a memory instruction in an interleaved-group
1530   /// of memory accesses that can be vectorized with wide vector loads/stores
1531   /// and shuffles.
1532   bool
1533   interleavedAccessCanBeWidened(Instruction *I,
1534                                 ElementCount VF = ElementCount::getFixed(1));
1535 
1536   /// Check if \p Instr belongs to any interleaved access group.
1537   bool isAccessInterleaved(Instruction *Instr) {
1538     return InterleaveInfo.isInterleaved(Instr);
1539   }
1540 
1541   /// Get the interleaved access group that \p Instr belongs to.
1542   const InterleaveGroup<Instruction> *
1543   getInterleavedAccessGroup(Instruction *Instr) {
1544     return InterleaveInfo.getInterleaveGroup(Instr);
1545   }
1546 
1547   /// Returns true if an interleaved group requires a scalar iteration
1548   /// to handle accesses with gaps, and there is nothing preventing us from
1549   /// creating a scalar epilogue.
1550   bool requiresScalarEpilogue() const {
1551     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1552   }
1553 
1554   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1555   /// loop hint annotation.
1556   bool isScalarEpilogueAllowed() const {
1557     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1558   }
1559 
1560   /// Returns true if all loop blocks should be masked to fold tail loop.
1561   bool foldTailByMasking() const { return FoldTailByMasking; }
1562 
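  /// Returns true if the instructions in \p BB will have to be predicated,
  /// either because the block is conditionally executed in the original loop
  /// or because the tail is folded by masking.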
1563   bool blockNeedsPredication(BasicBlock *BB) {
1564     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1565   }
1566 
1567   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1568   /// nodes to the chain of instructions representing the reductions. Uses a
1569   /// MapVector to ensure deterministic iteration order.
1570   using ReductionChainMap =
1571       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1572 
1573   /// Return the chain of instructions representing an inloop reduction.
1574   const ReductionChainMap &getInLoopReductionChains() const {
1575     return InLoopReductionChains;
1576   }
1577 
1578   /// Returns true if the Phi is part of an inloop reduction.
1579   bool isInLoopReduction(PHINode *Phi) const {
1580     return InLoopReductionChains.count(Phi);
1581   }
1582 
1583   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1584   /// with factor VF.  Return the cost of the instruction, including
1585   /// scalarization overhead if it's needed.
1586   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1587 
1588   /// Estimate cost of a call instruction CI if it were vectorized with factor
1589   /// VF. Return the cost of the instruction, including scalarization overhead
1590   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1591   /// scalarized -
1592   /// i.e. either a vector version isn't available, or it is too expensive.
1593   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1594                              bool &NeedToScalarize);
1595 
1596   /// Invalidates decisions already taken by the cost model.
1597   void invalidateCostModelingDecisions() {
1598     WideningDecisions.clear();
1599     Uniforms.clear();
1600     Scalars.clear();
1601   }
1602 
1603 private:
1604   unsigned NumPredStores = 0;
1605 
1606   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1607   /// than zero. One is returned if vectorization should best be avoided due
1608   /// to cost.
1609   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1610                                     ElementCount UserVF);
1611 
1612   /// The vectorization cost is a combination of the cost itself and a boolean
1613   /// indicating whether any of the contributing operations will actually
1614   /// operate on vector values after type legalization in the backend.
1615   /// If this latter value is false, then all operations will be scalarized
1616   /// (i.e. no vectorization has actually taken place).
1619   using VectorizationCostTy = std::pair<unsigned, bool>;
1620 
1621   /// Returns the expected execution cost. The unit of the cost does
1622   /// not matter because we use the 'cost' units to compare different
1623   /// vector widths. The cost that is returned is *not* normalized by
1624   /// the factor width.
1625   VectorizationCostTy expectedCost(ElementCount VF);
1626 
1627   /// Returns the execution time cost of an instruction for a given vector
1628   /// width. Vector width of one means scalar.
1629   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1630 
1631   /// The cost-computation logic from getInstructionCost which provides
1632   /// the vector type as an output parameter.
1633   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1634 
1635   /// Calculate vectorization cost of memory instruction \p I.
1636   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1637 
1638   /// The cost computation for scalarized memory instruction.
1639   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1640 
1641   /// The cost computation for interleaving group of memory instructions.
1642   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1643 
1644   /// The cost computation for Gather/Scatter instruction.
1645   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1646 
1647   /// The cost computation for widening instruction \p I with consecutive
1648   /// memory access.
1649   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1650 
1651   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1652   /// Load: scalar load + broadcast.
1653   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1654   /// element)
1655   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1656 
1657   /// Estimate the overhead of scalarizing an instruction. This is a
1658   /// convenience wrapper for the type-based getScalarizationOverhead API.
1659   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1660 
1661   /// Returns whether the instruction is a load or store and will be emitted
1662   /// as a vector operation.
1663   bool isConsecutiveLoadOrStore(Instruction *I);
1664 
1665   /// Returns true if an artificially high cost for emulated masked memrefs
1666   /// should be used.
1667   bool useEmulatedMaskMemRefHack(Instruction *I);
1668 
1669   /// Map of scalar integer values to the smallest bitwidth they can be legally
1670   /// represented as. The vector equivalents of these values should be truncated
1671   /// to this type.
1672   MapVector<Instruction *, uint64_t> MinBWs;
1673 
1674   /// A type representing the costs for instructions if they were to be
1675   /// scalarized rather than vectorized. The entries are Instruction-Cost
1676   /// pairs.
1677   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1678 
1679   /// A set containing all BasicBlocks that are known to be present after
1680   /// vectorization as predicated blocks.
1681   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1682 
1683   /// Records whether it is allowed to have the original scalar loop execute at
1684   /// least once. This may be needed as a fallback loop in case runtime
1685   /// aliasing/dependence checks fail, or to handle the tail/remainder
1686   /// iterations when the trip count is unknown or is not a multiple of the VF,
1687   /// or as a peel-loop to handle gaps in interleave-groups.
1688   /// Under optsize and when the trip count is very small we don't allow any
1689   /// iterations to execute in the scalar loop.
1690   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1691 
1692   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1693   bool FoldTailByMasking = false;
1694 
1695   /// A map holding scalar costs for different vectorization factors. The
1696   /// presence of a cost for an instruction in the mapping indicates that the
1697   /// instruction will be scalarized when vectorizing with the associated
1698   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1699   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1700 
1701   /// Holds the instructions known to be uniform after vectorization.
1702   /// The data is collected per VF.
1703   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1704 
1705   /// Holds the instructions known to be scalar after vectorization.
1706   /// The data is collected per VF.
1707   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1708 
1709   /// Holds the instructions (address computations) that are forced to be
1710   /// scalarized.
1711   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1712 
1713   /// PHINodes of the reductions that should be expanded in-loop along with
1714   /// their associated chains of reduction operations, in program order from top
1715   /// (PHI) to bottom.
1716   ReductionChainMap InLoopReductionChains;
1717 
1718   /// Returns the expected difference in cost from scalarizing the expression
1719   /// feeding a predicated instruction \p PredInst. The instructions to
1720   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1721   /// non-negative return value implies the expression will be scalarized.
1722   /// Currently, only single-use chains are considered for scalarization.
1723   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1724                               ElementCount VF);
1725 
1726   /// Collect the instructions that are uniform after vectorization. An
1727   /// instruction is uniform if we represent it with a single scalar value in
1728   /// the vectorized loop corresponding to each vector iteration. Examples of
1729   /// uniform instructions include pointer operands of consecutive or
1730   /// interleaved memory accesses. Note that although uniformity implies an
1731   /// instruction will be scalar, the reverse is not true. In general, a
1732   /// scalarized instruction will be represented by VF scalar values in the
1733   /// vectorized loop, each corresponding to an iteration of the original
1734   /// scalar loop.
1735   void collectLoopUniforms(ElementCount VF);
1736 
1737   /// Collect the instructions that are scalar after vectorization. An
1738   /// instruction is scalar if it is known to be uniform or will be scalarized
1739   /// during vectorization. Non-uniform scalarized instructions will be
1740   /// represented by VF values in the vectorized loop, each corresponding to an
1741   /// iteration of the original scalar loop.
1742   void collectLoopScalars(ElementCount VF);
1743 
1744   /// Keeps cost model vectorization decision and cost for instructions.
1745   /// Right now it is used for memory instructions only.
1746   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1747                                 std::pair<InstWidening, unsigned>>;
1748 
1749   DecisionList WideningDecisions;
1750 
1751   /// Returns true if \p V is expected to be vectorized and it needs to be
1752   /// extracted.
1753   bool needsExtract(Value *V, ElementCount VF) const {
1754     Instruction *I = dyn_cast<Instruction>(V);
1755     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1756         TheLoop->isLoopInvariant(I))
1757       return false;
1758 
1759     // Assume we can vectorize V (and hence we need extraction) if the
1760     // scalars are not computed yet. This can happen, because it is called
1761     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1762     // the scalars are collected. That should be a safe assumption in most
1763     // cases, because we check if the operands have vectorizable types
1764     // beforehand in LoopVectorizationLegality.
1765     return Scalars.find(VF) == Scalars.end() ||
1766            !isScalarAfterVectorization(I, VF);
1767   };
1768 
1769   /// Returns a range containing only operands needing to be extracted.
1770   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1771                                                    ElementCount VF) {
1772     return SmallVector<Value *, 4>(make_filter_range(
1773         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1774   }
1775 
1776   /// Determines if we have the infrastructure to vectorize loop \p L and its
1777   /// epilogue, assuming the main loop is vectorized by \p VF.
1778   bool isCandidateForEpilogueVectorization(const Loop &L,
1779                                            const ElementCount VF) const;
1780 
1781   /// Returns true if epilogue vectorization is considered profitable, and
1782   /// false otherwise.
1783   /// \p VF is the vectorization factor chosen for the original loop.
1784   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1785 
1786 public:
1787   /// The loop that we evaluate.
1788   Loop *TheLoop;
1789 
1790   /// Predicated scalar evolution analysis.
1791   PredicatedScalarEvolution &PSE;
1792 
1793   /// Loop Info analysis.
1794   LoopInfo *LI;
1795 
1796   /// Vectorization legality.
1797   LoopVectorizationLegality *Legal;
1798 
1799   /// Vector target information.
1800   const TargetTransformInfo &TTI;
1801 
1802   /// Target Library Info.
1803   const TargetLibraryInfo *TLI;
1804 
1805   /// Demanded bits analysis.
1806   DemandedBits *DB;
1807 
1808   /// Assumption cache.
1809   AssumptionCache *AC;
1810 
1811   /// Interface to emit optimization remarks.
1812   OptimizationRemarkEmitter *ORE;
1813 
1814   const Function *TheFunction;
1815 
1816   /// Loop Vectorize Hint.
1817   const LoopVectorizeHints *Hints;
1818 
1819   /// The interleave access information contains groups of interleaved accesses
1820   /// with the same stride and close to each other.
1821   InterleavedAccessInfo &InterleaveInfo;
1822 
1823   /// Values to ignore in the cost model.
1824   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1825 
1826   /// Values to ignore in the cost model when VF > 1.
1827   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1828 
1829   /// Profitable vector factors.
1830   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1831 };
1832 
1833 } // end namespace llvm
1834 
1835 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1836 // vectorization. The loop needs to be annotated with #pragma omp simd
1837 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1838 // vector length information is not provided, vectorization is not considered
1839 // explicit. Interleave hints are not allowed either. These limitations will be
1840 // relaxed in the future.
1841 // Please note that we are currently forced to abuse the pragma 'clang
1842 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1843 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1844 // provides *explicit vectorization hints* (LV can bypass legal checks and
1845 // assume that vectorization is legal). However, both hints are implemented
1846 // using the same metadata (llvm.loop.vectorize, processed by
1847 // LoopVectorizeHints). This will be fixed in the future when the native IR
1848 // representation for pragma 'omp simd' is introduced.
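//
// For illustration only, an outer loop annotated like the following
// (hypothetical source) is a candidate for the VPlan-native path:
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)       // outer loop, explicitly annotated
//     for (int j = 0; j < M; ++j)     // inner loop
//       A[i][j] += B[i][j];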
1849 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1850                                    OptimizationRemarkEmitter *ORE) {
1851   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1852   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1853 
1854   // Only outer loops with an explicit vectorization hint are supported.
1855   // Unannotated outer loops are ignored.
1856   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1857     return false;
1858 
1859   Function *Fn = OuterLp->getHeader()->getParent();
1860   if (!Hints.allowVectorization(Fn, OuterLp,
1861                                 true /*VectorizeOnlyWhenForced*/)) {
1862     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1863     return false;
1864   }
1865 
1866   if (Hints.getInterleave() > 1) {
1867     // TODO: Interleave support is future work.
1868     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1869                          "outer loops.\n");
1870     Hints.emitRemarkWithHints();
1871     return false;
1872   }
1873 
1874   return true;
1875 }
1876 
1877 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1878                                   OptimizationRemarkEmitter *ORE,
1879                                   SmallVectorImpl<Loop *> &V) {
1880   // Collect inner loops and outer loops without irreducible control flow. For
1881   // now, only collect outer loops that have explicit vectorization hints. If we
1882   // are stress testing the VPlan H-CFG construction, we collect the outermost
1883   // loop of every loop nest.
1884   if (L.isInnermost() || VPlanBuildStressTest ||
1885       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1886     LoopBlocksRPO RPOT(&L);
1887     RPOT.perform(LI);
1888     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1889       V.push_back(&L);
1890       // TODO: Collect inner loops inside marked outer loops in case
1891       // vectorization fails for the outer loop. Do not invoke
1892       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1893       // already known to be reducible. We can use an inherited attribute for
1894       // that.
1895       return;
1896     }
1897   }
1898   for (Loop *InnerL : L)
1899     collectSupportedLoops(*InnerL, LI, ORE, V);
1900 }
1901 
1902 namespace {
1903 
1904 /// The LoopVectorize Pass.
1905 struct LoopVectorize : public FunctionPass {
1906   /// Pass identification, replacement for typeid
1907   static char ID;
1908 
1909   LoopVectorizePass Impl;
1910 
1911   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1912                          bool VectorizeOnlyWhenForced = false)
1913       : FunctionPass(ID),
1914         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1915     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1916   }
1917 
1918   bool runOnFunction(Function &F) override {
1919     if (skipFunction(F))
1920       return false;
1921 
1922     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1923     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1924     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1925     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1926     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1927     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1928     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1929     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1930     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1931     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1932     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1933     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1934     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1935 
1936     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1937         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1938 
1939     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1940                         GetLAA, *ORE, PSI).MadeAnyChange;
1941   }
1942 
1943   void getAnalysisUsage(AnalysisUsage &AU) const override {
1944     AU.addRequired<AssumptionCacheTracker>();
1945     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1946     AU.addRequired<DominatorTreeWrapperPass>();
1947     AU.addRequired<LoopInfoWrapperPass>();
1948     AU.addRequired<ScalarEvolutionWrapperPass>();
1949     AU.addRequired<TargetTransformInfoWrapperPass>();
1950     AU.addRequired<AAResultsWrapperPass>();
1951     AU.addRequired<LoopAccessLegacyAnalysis>();
1952     AU.addRequired<DemandedBitsWrapperPass>();
1953     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1954     AU.addRequired<InjectTLIMappingsLegacy>();
1955 
1956     // We currently do not preserve loopinfo/dominator analyses with outer loop
1957     // vectorization. Until this is addressed, mark these analyses as preserved
1958     // only for non-VPlan-native path.
1959     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1960     if (!EnableVPlanNativePath) {
1961       AU.addPreserved<LoopInfoWrapperPass>();
1962       AU.addPreserved<DominatorTreeWrapperPass>();
1963     }
1964 
1965     AU.addPreserved<BasicAAWrapperPass>();
1966     AU.addPreserved<GlobalsAAWrapperPass>();
1967     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1968   }
1969 };
1970 
1971 } // end anonymous namespace
1972 
1973 //===----------------------------------------------------------------------===//
1974 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1975 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1976 //===----------------------------------------------------------------------===//
1977 
1978 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1979   // We need to place the broadcast of invariant variables outside the loop,
1980   // but only if it's proven safe to do so. Otherwise, the broadcast will be
1981   // inside the vector loop body.
1982   Instruction *Instr = dyn_cast<Instruction>(V);
1983   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1984                      (!Instr ||
1985                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1986   // Place the code for broadcasting invariant variables in the new preheader.
1987   IRBuilder<>::InsertPointGuard Guard(Builder);
1988   if (SafeToHoist)
1989     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1990 
1991   // Broadcast the scalar into all locations in the vector.
1992   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1993 
1994   return Shuf;
1995 }
1996 
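/// Create a vector phi for the induction described by \p II: the start value
/// is splatted and stepped per-lane, each unrolled part advances by a further
/// Step * VF, and the resulting per-part values are recorded for \p EntryVal.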
1997 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1998     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1999   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2000          "Expected either an induction phi-node or a truncate of it!");
2001   Value *Start = II.getStartValue();
2002 
2003   // Construct the initial value of the vector IV in the vector loop preheader
2004   auto CurrIP = Builder.saveIP();
2005   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2006   if (isa<TruncInst>(EntryVal)) {
2007     assert(Start->getType()->isIntegerTy() &&
2008            "Truncation requires an integer type");
2009     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2010     Step = Builder.CreateTrunc(Step, TruncType);
2011     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2012   }
2013   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2014   Value *SteppedStart =
2015       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2016 
2017   // We create vector phi nodes for both integer and floating-point induction
2018   // variables. Here, we determine the kind of arithmetic we will perform.
2019   Instruction::BinaryOps AddOp;
2020   Instruction::BinaryOps MulOp;
2021   if (Step->getType()->isIntegerTy()) {
2022     AddOp = Instruction::Add;
2023     MulOp = Instruction::Mul;
2024   } else {
2025     AddOp = II.getInductionOpcode();
2026     MulOp = Instruction::FMul;
2027   }
2028 
2029   // Multiply the vectorization factor by the step using integer or
2030   // floating-point arithmetic as appropriate.
2031   Value *ConstVF =
2032       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2033   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2034 
2035   // Create a vector splat to use in the induction update.
2036   //
2037   // FIXME: If the step is non-constant, we create the vector splat with
2038   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2039   //        handle a constant vector splat.
2040   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2041   Value *SplatVF = isa<Constant>(Mul)
2042                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2043                        : Builder.CreateVectorSplat(VF, Mul);
2044   Builder.restoreIP(CurrIP);
2045 
2046   // We may need to add the step a number of times, depending on the unroll
2047   // factor. The last of those goes into the PHI.
2048   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2049                                     &*LoopVectorBody->getFirstInsertionPt());
2050   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2051   Instruction *LastInduction = VecInd;
2052   for (unsigned Part = 0; Part < UF; ++Part) {
2053     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2054 
2055     if (isa<TruncInst>(EntryVal))
2056       addMetadata(LastInduction, EntryVal);
2057     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
2058 
2059     LastInduction = cast<Instruction>(addFastMathFlag(
2060         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2061     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2062   }
2063 
2064   // Move the last step to the end of the latch block. This ensures consistent
2065   // placement of all induction updates.
2066   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2067   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2068   auto *ICmp = cast<Instruction>(Br->getCondition());
2069   LastInduction->moveBefore(ICmp);
2070   LastInduction->setName("vec.ind.next");
2071 
2072   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2073   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2074 }
2075 
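/// Returns true if the cost model decided that \p I must be kept scalar,
/// either because it is scalar after vectorization or because scalarizing it
/// is the more profitable choice for the chosen VF.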
2076 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2077   return Cost->isScalarAfterVectorization(I, VF) ||
2078          Cost->isProfitableToScalarize(I, VF);
2079 }
2080 
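/// Returns true if induction variable \p IV needs a scalar copy: either the
/// IV itself is scalarized, or some user of it inside the loop is.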
2081 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2082   if (shouldScalarizeInstruction(IV))
2083     return true;
2084   auto isScalarInst = [&](User *U) -> bool {
2085     auto *I = cast<Instruction>(U);
2086     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2087   };
2088   return llvm::any_of(IV->users(), isScalarInst);
2089 }
2090 
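/// If the induction described by \p ID is wrapped in casts that were proven
/// redundant, record \p VectorLoopVal as the value of the first such cast as
/// well, so that users of the cast are served by the new induction directly.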
2091 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2092     const InductionDescriptor &ID, const Instruction *EntryVal,
2093     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2094   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2095          "Expected either an induction phi-node or a truncate of it!");
2096 
2097   // This induction variable is not the phi from the original loop but the
2098   // newly-created IV, based on the proof that the casted Phi is equal to the
2099   // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
2100   // re-uses the same InductionDescriptor as the original IV, but we don't
2101   // have to do any recording in this case - that is done when the original IV
2102   // is processed.
2103   if (isa<TruncInst>(EntryVal))
2104     return;
2105 
2106   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2107   if (Casts.empty())
2108     return;
2109   // Only the first Cast instruction in the Casts vector is of interest.
2110   // The rest of the Casts (if they exist) have no uses outside the
2111   // induction update chain itself.
2112   Instruction *CastInst = *Casts.begin();
2113   if (Lane < UINT_MAX)
2114     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2115   else
2116     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2117 }
2118 
2119 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2120   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2121          "Primary induction variable must have an integer type");
2122 
2123   auto II = Legal->getInductionVars().find(IV);
2124   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2125 
2126   auto ID = II->second;
2127   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2128 
2129   // The value from the original loop to which we are mapping the new induction
2130   // variable.
2131   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2132 
2133   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2134 
2135   // Generate code for the induction step. Note that induction steps are
2136   // required to be loop-invariant.
2137   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2138     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2139            "Induction step should be loop invariant");
2140     if (PSE.getSE()->isSCEVable(IV->getType())) {
2141       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2142       return Exp.expandCodeFor(Step, Step->getType(),
2143                                LoopVectorPreHeader->getTerminator());
2144     }
2145     return cast<SCEVUnknown>(Step)->getValue();
2146   };
2147 
2148   // The scalar value to broadcast. This is derived from the canonical
2149   // induction variable. If a truncation type is given, truncate the canonical
2150   // induction variable and step. Otherwise, derive these values from the
2151   // induction descriptor.
2152   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2153     Value *ScalarIV = Induction;
2154     if (IV != OldInduction) {
2155       ScalarIV = IV->getType()->isIntegerTy()
2156                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2157                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2158                                           IV->getType());
2159       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2160       ScalarIV->setName("offset.idx");
2161     }
2162     if (Trunc) {
2163       auto *TruncType = cast<IntegerType>(Trunc->getType());
2164       assert(Step->getType()->isIntegerTy() &&
2165              "Truncation requires an integer step");
2166       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2167       Step = Builder.CreateTrunc(Step, TruncType);
2168     }
2169     return ScalarIV;
2170   };
2171 
2172   // Create the vector values from the scalar IV, for the case where no
2173   // vector IV is created.
2174   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2175     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2176     for (unsigned Part = 0; Part < UF; ++Part) {
2177       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2178       Value *EntryPart =
2179           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2180                         ID.getInductionOpcode());
2181       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2182       if (Trunc)
2183         addMetadata(EntryPart, Trunc);
2184       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2185     }
2186   };
2187 
2188   // Now do the actual transformations, and start with creating the step value.
2189   Value *Step = CreateStepValue(ID.getStep());
2190   if (VF.isZero() || VF.isScalar()) {
2191     Value *ScalarIV = CreateScalarIV(Step);
2192     CreateSplatIV(ScalarIV, Step);
2193     return;
2194   }
2195 
2196   // Determine if we want a scalar version of the induction variable. This is
2197   // true if the induction variable itself is not widened, or if it has at
2198   // least one user in the loop that is not widened.
2199   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2200   if (!NeedsScalarIV) {
2201     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2202     return;
2203   }
2204 
2205   // Try to create a new independent vector induction variable. If we can't
2206   // create the phi node, we will splat the scalar induction variable in each
2207   // loop iteration.
2208   if (!shouldScalarizeInstruction(EntryVal)) {
2209     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2210     Value *ScalarIV = CreateScalarIV(Step);
2211     // Create scalar steps that can be used by instructions we will later
2212     // scalarize. Note that the addition of the scalar steps will not increase
2213     // the number of instructions in the loop in the common case prior to
2214     // InstCombine. We will be trading one vector extract for each scalar step.
2215     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2216     return;
2217   }
2218 
2219   // All IV users are scalar instructions, so only emit a scalar IV, not a
2220   // vectorized IV. Except when we tail-fold, then the splat IV feeds the
2221   // predicate used by the masked loads/stores.
2222   Value *ScalarIV = CreateScalarIV(Step);
2223   if (!Cost->isScalarEpilogueAllowed())
2224     CreateSplatIV(ScalarIV, Step);
2225   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2226 }
2227 
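/// Return a vector whose lane L holds Val[L] BinOp (StartIdx + L) * Step,
/// using integer arithmetic for integer inductions and (fast) FP arithmetic
/// for floating-point inductions.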
2228 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2229                                           Instruction::BinaryOps BinOp) {
2230   // Create and check the types.
2231   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2232   int VLen = ValVTy->getNumElements();
2233 
2234   Type *STy = Val->getType()->getScalarType();
2235   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2236          "Induction Step must be an integer or FP");
2237   assert(Step->getType() == STy && "Step has wrong type");
2238 
2239   SmallVector<Constant *, 8> Indices;
2240 
2241   if (STy->isIntegerTy()) {
2242     // Create a vector of consecutive numbers starting at StartIdx.
2243     for (int i = 0; i < VLen; ++i)
2244       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2245 
2246     // Add the consecutive indices to the vector value.
2247     Constant *Cv = ConstantVector::get(Indices);
2248     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2249     Step = Builder.CreateVectorSplat(VLen, Step);
2250     assert(Step->getType() == Val->getType() && "Invalid step vec");
2251     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2252     // which can be found from the original scalar operations.
2253     Step = Builder.CreateMul(Cv, Step);
2254     return Builder.CreateAdd(Val, Step, "induction");
2255   }
2256 
2257   // Floating point induction.
2258   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2259          "Binary Opcode should be specified for FP induction");
2260   // Create a vector of consecutive numbers from zero to VF.
2261   for (int i = 0; i < VLen; ++i)
2262     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2263 
2264   // Add the consecutive indices to the vector value.
2265   Constant *Cv = ConstantVector::get(Indices);
2266 
2267   Step = Builder.CreateVectorSplat(VLen, Step);
2268 
2269   // Floating point operations had to be 'fast' to enable the induction.
2270   FastMathFlags Flags;
2271   Flags.setFast();
2272 
2273   Value *MulOp = Builder.CreateFMul(Cv, Step);
2274   if (isa<Instruction>(MulOp))
2275     // We have to check because MulOp may be a constant.
2276     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2277 
2278   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2279   if (isa<Instruction>(BOp))
2280     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2281   return BOp;
2282 }
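// Worked example (added, illustrative values): with a broadcast value
// Val = <4 x i32> <%v, %v, %v, %v>, StartIdx = 4 and Step = 1, the code above
// emits roughly
//   %offsets   = mul <4 x i32> <i32 4, i32 5, i32 6, i32 7>,
//                              <i32 1, i32 1, i32 1, i32 1>
//   %induction = add <4 x i32> %val, %offsets
// i.e. lane L of the result holds %v + (StartIdx + L) * Step.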
2283 
2284 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2285                                            Instruction *EntryVal,
2286                                            const InductionDescriptor &ID) {
2287   // We shouldn't have to build scalar steps if we aren't vectorizing.
2288   assert(VF.isVector() && "VF should be greater than one");
2289   // Get the value type and ensure it and the step have the same integer type.
2290   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2291   assert(ScalarIVTy == Step->getType() &&
2292          "Val and Step should have the same type");
2293 
2294   // We build scalar steps for both integer and floating-point induction
2295   // variables. Here, we determine the kind of arithmetic we will perform.
2296   Instruction::BinaryOps AddOp;
2297   Instruction::BinaryOps MulOp;
2298   if (ScalarIVTy->isIntegerTy()) {
2299     AddOp = Instruction::Add;
2300     MulOp = Instruction::Mul;
2301   } else {
2302     AddOp = ID.getInductionOpcode();
2303     MulOp = Instruction::FMul;
2304   }
2305 
2306   // Determine the number of scalars we need to generate for each unroll
2307   // iteration. If EntryVal is uniform, we only need to generate the first
2308   // lane. Otherwise, we generate all VF values.
2309   unsigned Lanes =
2310       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2311           ? 1
2312           : VF.getKnownMinValue();
2313   assert((!VF.isScalable() || Lanes == 1) &&
2314          "Should never scalarize a scalable vector");
2315   // Compute the scalar steps and save the results in VectorLoopValueMap.
2316   for (unsigned Part = 0; Part < UF; ++Part) {
2317     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2318       auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2319                                          ScalarIVTy->getScalarSizeInBits());
2320       Value *StartIdx =
2321           createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2322       if (ScalarIVTy->isFloatingPointTy())
2323         StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2324       StartIdx = addFastMathFlag(Builder.CreateBinOp(
2325           AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
2326       // The step returned by `createStepForVF` is a runtime-evaluated value
2327       // when VF is scalable. Otherwise, it should be folded into a Constant.
2328       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2329              "Expected StartIdx to be folded to a constant when VF is not "
2330              "scalable");
2331       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2332       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2333       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2334       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2335     }
2336   }
2337 }
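// Illustrative example (added; assumes VF = 4, UF = 2, an integer IV with
// step 1, and a non-uniform EntryVal): the loops above record eight scalar
// values keyed by {Part, Lane} in VectorLoopValueMap,
//   Part 0: ScalarIV + 0, ScalarIV + 1, ScalarIV + 2, ScalarIV + 3
//   Part 1: ScalarIV + 4, ScalarIV + 5, ScalarIV + 6, ScalarIV + 7
// so later scalarized users can read them directly instead of extracting
// lanes from a vector induction value.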
2338 
2339 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2340   assert(V != Induction && "The new induction variable should not be used.");
2341   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2342   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2343 
2344   // If we have a stride that is replaced by one, do it here. Defer this for
2345   // the VPlan-native path until we start running Legal checks in that path.
2346   if (!EnableVPlanNativePath && Legal->hasStride(V))
2347     V = ConstantInt::get(V->getType(), 1);
2348 
2349   // If we have a vector mapped to this value, return it.
2350   if (VectorLoopValueMap.hasVectorValue(V, Part))
2351     return VectorLoopValueMap.getVectorValue(V, Part);
2352 
2353   // If the value has not been vectorized, check if it has been scalarized
2354   // instead. If it has been scalarized, and we actually need the value in
2355   // vector form, we will construct the vector values on demand.
2356   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2357     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2358 
2359     // If we've scalarized a value, that value should be an instruction.
2360     auto *I = cast<Instruction>(V);
2361 
2362     // If we aren't vectorizing, we can just copy the scalar map values over to
2363     // the vector map.
2364     if (VF.isScalar()) {
2365       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2366       return ScalarValue;
2367     }
2368 
2369     // Get the last scalar instruction we generated for V and Part. If the value
2370     // is known to be uniform after vectorization, this corresponds to lane zero
2371     // of the Part unroll iteration. Otherwise, the last instruction is the one
2372     // we created for the last vector lane of the Part unroll iteration.
2373     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2374                             ? 0
2375                             : VF.getKnownMinValue() - 1;
2376     assert((!VF.isScalable() || LastLane == 0) &&
2377            "Scalable vectorization can't lead to any scalarized values.");
2378     auto *LastInst = cast<Instruction>(
2379         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2380 
2381     // Set the insert point after the last scalarized instruction. This ensures
2382     // the insertelement sequence will directly follow the scalar definitions.
2383     auto OldIP = Builder.saveIP();
2384     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2385     Builder.SetInsertPoint(&*NewIP);
2386 
2387     // However, if we are vectorizing, we need to construct the vector values.
2388     // If the value is known to be uniform after vectorization, we can just
2389     // broadcast the scalar value corresponding to lane zero for each unroll
2390     // iteration. Otherwise, we construct the vector values using insertelement
2391     // instructions. Since the resulting vectors are stored in
2392     // VectorLoopValueMap, we will only generate the insertelements once.
2393     Value *VectorValue = nullptr;
2394     if (Cost->isUniformAfterVectorization(I, VF)) {
2395       VectorValue = getBroadcastInstrs(ScalarValue);
2396       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2397     } else {
2398       // Initialize packing with insertelements to start from undef.
2399       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2400       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2401       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2402       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2403         packScalarIntoVectorValue(V, {Part, Lane});
2404       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2405     }
2406     Builder.restoreIP(OldIP);
2407     return VectorValue;
2408   }
2409 
2410   // If this scalar is unknown, assume that it is a constant or that it is
2411   // loop invariant. Broadcast V and save the value for future uses.
2412   Value *B = getBroadcastInstrs(V);
2413   VectorLoopValueMap.setVectorValue(V, Part, B);
2414   return B;
2415 }
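// Sketch of the non-uniform packing path above (added, illustrative; assumes
// VF = 4 and an i32 value): starting from undef, the previously generated
// scalars for lanes 0..3 are inserted one by one,
//   %p0 = insertelement <4 x i32> undef, i32 %s0, i32 0
//   %p1 = insertelement <4 x i32> %p0,   i32 %s1, i32 1
//   %p2 = insertelement <4 x i32> %p1,   i32 %s2, i32 2
//   %p3 = insertelement <4 x i32> %p2,   i32 %s3, i32 3
// and %p3 becomes the cached vector value for this unroll part.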
2416 
2417 Value *
2418 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2419                                             const VPIteration &Instance) {
2420   // If the value is not an instruction contained in the loop, it should
2421   // already be scalar.
2422   if (OrigLoop->isLoopInvariant(V))
2423     return V;
2424 
2425   assert(Instance.Lane > 0
2426              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2427              : true && "Uniform values only have lane zero");
2428 
2429   // If the value from the original loop has not been vectorized, it is
2430   // represented by UF x VF scalar values in the new loop. Return the requested
2431   // scalar value.
2432   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2433     return VectorLoopValueMap.getScalarValue(V, Instance);
2434 
2435   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2436   // for the given unroll part. If this entry is not a vector type (i.e., the
2437   // vectorization factor is one), there is no need to generate an
2438   // extractelement instruction.
2439   auto *U = getOrCreateVectorValue(V, Instance.Part);
2440   if (!U->getType()->isVectorTy()) {
2441     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2442     return U;
2443   }
2444 
2445   // Otherwise, the value from the original loop has been vectorized and is
2446   // represented by UF vector values. Extract and return the requested scalar
2447   // value from the appropriate vector lane.
2448   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2449 }
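// For illustration (added): if V was widened to a vector %v.part for unroll
// part P, requesting instance {Part = P, Lane = 2} emits
//   %scalar = extractelement <4 x i32> %v.part, i32 2
// whereas values that were scalarized (or are loop-invariant) are returned
// without creating any new instructions.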
2450 
2451 void InnerLoopVectorizer::packScalarIntoVectorValue(
2452     Value *V, const VPIteration &Instance) {
2453   assert(V != Induction && "The new induction variable should not be used.");
2454   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2455   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2456 
2457   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2458   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2459   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2460                                             Builder.getInt32(Instance.Lane));
2461   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2462 }
2463 
2464 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2465   assert(Vec->getType()->isVectorTy() && "Invalid type");
2466   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2467   SmallVector<int, 8> ShuffleMask;
2468   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2469     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2470 
2471   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2472 }
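// Example (added, assuming VF = 4): the mask built above is <3, 2, 1, 0>, so
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// returns the lanes of %vec in reverse order.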
2473 
2474 // Return whether we allow using masked interleave-groups (for dealing with
2475 // strided loads/stores that reside in predicated blocks, or for dealing
2476 // with gaps).
2477 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2478   // If an override option has been passed in for interleaved accesses, use it.
2479   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2480     return EnableMaskedInterleavedMemAccesses;
2481 
2482   return TTI.enableMaskedInterleavedAccessVectorization();
2483 }
2484 
2485 // Try to vectorize the interleave group that \p Instr belongs to.
2486 //
2487 // E.g. Translate following interleaved load group (factor = 3):
2488 //   for (i = 0; i < N; i+=3) {
2489 //     R = Pic[i];             // Member of index 0
2490 //     G = Pic[i+1];           // Member of index 1
2491 //     B = Pic[i+2];           // Member of index 2
2492 //     ... // do something to R, G, B
2493 //   }
2494 // To:
2495 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2496 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2497 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2498 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2499 //
2500 // Or translate following interleaved store group (factor = 3):
2501 //   for (i = 0; i < N; i+=3) {
2502 //     ... do something to R, G, B
2503 //     Pic[i]   = R;           // Member of index 0
2504 //     Pic[i+1] = G;           // Member of index 1
2505 //     Pic[i+2] = B;           // Member of index 2
2506 //   }
2507 // To:
2508 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2509 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2510 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2511 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2512 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2513 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2514     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2515     VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) {
2516   Instruction *Instr = Group->getInsertPos();
2517   const DataLayout &DL = Instr->getModule()->getDataLayout();
2518 
2519   // Prepare for the vector type of the interleaved load/store.
2520   Type *ScalarTy = getMemInstValueType(Instr);
2521   unsigned InterleaveFactor = Group->getFactor();
2522   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2523   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2524 
2525   // Prepare for the new pointers.
2526   SmallVector<Value *, 2> AddrParts;
2527   unsigned Index = Group->getIndex(Instr);
2528 
2529   // TODO: extend the masked interleaved-group support to reversed access.
2530   assert((!BlockInMask || !Group->isReverse()) &&
2531          "Reversed masked interleave-group not supported.");
2532 
2533   // If the group is reverse, adjust the index to refer to the last vector lane
2534   // instead of the first. We adjust the index from the first vector lane,
2535   // rather than directly getting the pointer for lane VF - 1, because the
2536   // pointer operand of the interleaved access is supposed to be uniform. For
2537   // uniform instructions, we're only required to generate a value for the
2538   // first vector lane in each unroll iteration.
2539   assert(!VF.isScalable() &&
2540          "scalable vector reverse operation is not implemented");
2541   if (Group->isReverse())
2542     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2543 
2544   for (unsigned Part = 0; Part < UF; Part++) {
2545     Value *AddrPart = State.get(Addr, {Part, 0});
2546     setDebugLocFromInst(Builder, AddrPart);
2547 
2548     // Note that the current instruction could be at any member index, so we
2549     // need to adjust the address to that of the member at index 0.
2550     //
2551     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2552     //       b = A[i];       // Member of index 0
2553     // The current pointer points to A[i+1]; adjust it to A[i].
2554     //
2555     // E.g.  A[i+1] = a;     // Member of index 1
2556     //       A[i]   = b;     // Member of index 0
2557     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2558     // The current pointer points to A[i+2]; adjust it to A[i].
2559 
2560     bool InBounds = false;
2561     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2562       InBounds = gep->isInBounds();
2563     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2564     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2565 
2566     // Cast to the vector pointer type.
2567     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2568     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2569     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2570   }
2571 
2572   setDebugLocFromInst(Builder, Instr);
2573   Value *UndefVec = UndefValue::get(VecTy);
2574 
2575   Value *MaskForGaps = nullptr;
2576   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2577     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2578     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2579     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2580   }
2581 
2582   // Vectorize the interleaved load group.
2583   if (isa<LoadInst>(Instr)) {
2584     // For each unroll part, create a wide load for the group.
2585     SmallVector<Value *, 2> NewLoads;
2586     for (unsigned Part = 0; Part < UF; Part++) {
2587       Instruction *NewLoad;
2588       if (BlockInMask || MaskForGaps) {
2589         assert(useMaskedInterleavedAccesses(*TTI) &&
2590                "masked interleaved groups are not allowed.");
2591         Value *GroupMask = MaskForGaps;
2592         if (BlockInMask) {
2593           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2594           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2595           Value *ShuffledMask = Builder.CreateShuffleVector(
2596               BlockInMaskPart,
2597               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2598               "interleaved.mask");
2599           GroupMask = MaskForGaps
2600                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2601                                                 MaskForGaps)
2602                           : ShuffledMask;
2603         }
2604         NewLoad =
2605             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2606                                      GroupMask, UndefVec, "wide.masked.vec");
2607       }
2608       else
2609         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2610                                             Group->getAlign(), "wide.vec");
2611       Group->addMetadata(NewLoad);
2612       NewLoads.push_back(NewLoad);
2613     }
2614 
2615     // For each member in the group, shuffle out the appropriate data from the
2616     // wide loads.
2617     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2618       Instruction *Member = Group->getMember(I);
2619 
2620       // Skip the gaps in the group.
2621       if (!Member)
2622         continue;
2623 
2624       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2625       auto StrideMask =
2626           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2627       for (unsigned Part = 0; Part < UF; Part++) {
2628         Value *StridedVec = Builder.CreateShuffleVector(
2629             NewLoads[Part], StrideMask, "strided.vec");
2630 
2631         // If this member has a different type, cast the result to that type.
2632         if (Member->getType() != ScalarTy) {
2633           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2634           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2635           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2636         }
2637 
2638         if (Group->isReverse())
2639           StridedVec = reverseVector(StridedVec);
2640 
2641         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2642       }
2643     }
2644     return;
2645   }
2646 
2647   // The sub-vector type for the current instruction.
2648   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2649   auto *SubVT = VectorType::get(ScalarTy, VF);
2650 
2651   // Vectorize the interleaved store group.
2652   for (unsigned Part = 0; Part < UF; Part++) {
2653     // Collect the stored vector from each member.
2654     SmallVector<Value *, 4> StoredVecs;
2655     for (unsigned i = 0; i < InterleaveFactor; i++) {
2656       // An interleaved store group doesn't allow gaps, so each index has a member.
2657       assert(Group->getMember(i) && "Failed to get a member from an interleaved store group");
2658 
2659       Value *StoredVec = State.get(StoredValues[i], Part);
2660 
2661       if (Group->isReverse())
2662         StoredVec = reverseVector(StoredVec);
2663 
2664       // If this member has a different type, cast it to the unified type.
2665 
2666       if (StoredVec->getType() != SubVT)
2667         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2668 
2669       StoredVecs.push_back(StoredVec);
2670     }
2671 
2672     // Concatenate all vectors into a wide vector.
2673     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2674 
2675     // Interleave the elements in the wide vector.
2676     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2677     Value *IVec = Builder.CreateShuffleVector(
2678         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2679         "interleaved.vec");
2680 
2681     Instruction *NewStoreInstr;
2682     if (BlockInMask) {
2683       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2684       Value *ShuffledMask = Builder.CreateShuffleVector(
2685           BlockInMaskPart,
2686           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2687           "interleaved.mask");
2688       NewStoreInstr = Builder.CreateMaskedStore(
2689           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2690     }
2691     else
2692       NewStoreInstr =
2693           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2694 
2695     Group->addMetadata(NewStoreInstr);
2696   }
2697 }
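// Added note (illustrative): when the interleave group sits in a predicated
// block, the per-lane block mask is replicated across the interleave factor
// before the masked load/store. E.g. with VF = 4 and factor 2, a lane mask
// <m0, m1, m2, m3> becomes <m0, m0, m1, m1, m2, m2, m3, m3>, so both members
// of a tuple are accessed only when their originating lane is active; a gap
// mask, if required, is AND'ed on top of that.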
2698 
2699 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2700     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2701     VPValue *StoredValue, VPValue *BlockInMask) {
2702   // Attempt to issue a wide load.
2703   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2704   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2705 
2706   assert((LI || SI) && "Invalid Load/Store instruction");
2707   assert((!SI || StoredValue) && "No stored value provided for widened store");
2708   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2709 
2710   LoopVectorizationCostModel::InstWidening Decision =
2711       Cost->getWideningDecision(Instr, VF);
2712   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2713           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2714           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2715          "CM decision is not to widen the memory instruction");
2716 
2717   Type *ScalarDataTy = getMemInstValueType(Instr);
2718 
2719   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2720   const Align Alignment = getLoadStoreAlignment(Instr);
2721 
2722   // Determine if the pointer operand of the access is either consecutive or
2723   // reverse consecutive.
2724   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2725   bool ConsecutiveStride =
2726       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2727   bool CreateGatherScatter =
2728       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2729 
2730   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2731   // gather/scatter. Otherwise Decision should have been to Scalarize.
2732   assert((ConsecutiveStride || CreateGatherScatter) &&
2733          "The instruction should be scalarized");
2734   (void)ConsecutiveStride;
2735 
2736   VectorParts BlockInMaskParts(UF);
2737   bool isMaskRequired = BlockInMask;
2738   if (isMaskRequired)
2739     for (unsigned Part = 0; Part < UF; ++Part)
2740       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2741 
2742   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2743     // Calculate the pointer for the specific unroll-part.
2744     GetElementPtrInst *PartPtr = nullptr;
2745 
2746     bool InBounds = false;
2747     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2748       InBounds = gep->isInBounds();
2749 
2750     if (Reverse) {
2751       assert(!VF.isScalable() &&
2752              "Reversing vectors is not yet supported for scalable vectors.");
2753 
2754       // If the address is consecutive but reversed, then the
2755       // wide store needs to start at the last vector element.
2756       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2757           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2758       PartPtr->setIsInBounds(InBounds);
2759       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2760           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2761       PartPtr->setIsInBounds(InBounds);
2762       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2763         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2764     } else {
2765       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2766       PartPtr = cast<GetElementPtrInst>(
2767           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2768       PartPtr->setIsInBounds(InBounds);
2769     }
2770 
2771     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2772     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2773   };
2774 
2775   // Handle Stores:
2776   if (SI) {
2777     setDebugLocFromInst(Builder, SI);
2778 
2779     for (unsigned Part = 0; Part < UF; ++Part) {
2780       Instruction *NewSI = nullptr;
2781       Value *StoredVal = State.get(StoredValue, Part);
2782       if (CreateGatherScatter) {
2783         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2784         Value *VectorGep = State.get(Addr, Part);
2785         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2786                                             MaskPart);
2787       } else {
2788         if (Reverse) {
2789           // If we store to reverse consecutive memory locations, then we need
2790           // to reverse the order of elements in the stored value.
2791           StoredVal = reverseVector(StoredVal);
2792           // We don't want to update the value in the map as it might be used in
2793           // another expression. So don't call resetVectorValue(StoredVal).
2794         }
2795         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2796         if (isMaskRequired)
2797           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2798                                             BlockInMaskParts[Part]);
2799         else
2800           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2801       }
2802       addMetadata(NewSI, SI);
2803     }
2804     return;
2805   }
2806 
2807   // Handle loads.
2808   assert(LI && "Must have a load instruction");
2809   setDebugLocFromInst(Builder, LI);
2810   for (unsigned Part = 0; Part < UF; ++Part) {
2811     Value *NewLI;
2812     if (CreateGatherScatter) {
2813       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2814       Value *VectorGep = State.get(Addr, Part);
2815       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2816                                          nullptr, "wide.masked.gather");
2817       addMetadata(NewLI, LI);
2818     } else {
2819       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2820       if (isMaskRequired)
2821         NewLI = Builder.CreateMaskedLoad(
2822             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2823             "wide.masked.load");
2824       else
2825         NewLI =
2826             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2827 
2828       // Add metadata to the load, but set the vector value to the reverse shuffle.
2829       addMetadata(NewLI, LI);
2830       if (Reverse)
2831         NewLI = reverseVector(NewLI);
2832     }
2833 
2834     State.set(Def, Instr, NewLI, Part);
2835   }
2836 }
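// Illustrative sketch (added; assumes VF = 4, UF = 1, an i32 element type):
// for a reverse-consecutive load such as
//   for (i = n - 1; i >= 0; --i) ... = A[i];
// CreateVecPtr above steps the pointer back by VF - 1 elements before the
// wide load, and the loaded value is reversed afterwards:
//   %gep     = getelementptr i32, i32* %ptr, i32 0
//   %gep.rev = getelementptr i32, i32* %gep, i32 -3
//   %cast    = bitcast i32* %gep.rev to <4 x i32>*
//   %wide    = load <4 x i32>, <4 x i32>* %cast
//   %reverse = shufflevector <4 x i32> %wide, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>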
2837 
2838 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2839                                                const VPIteration &Instance,
2840                                                bool IfPredicateInstr,
2841                                                VPTransformState &State) {
2842   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2843 
2844   setDebugLocFromInst(Builder, Instr);
2845 
2846   // Does this instruction return a value?
2847   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2848 
2849   Instruction *Cloned = Instr->clone();
2850   if (!IsVoidRetTy)
2851     Cloned->setName(Instr->getName() + ".cloned");
2852 
2853   // Replace the operands of the cloned instructions with their scalar
2854   // equivalents in the new loop.
2855   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2856     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2857     auto InputInstance = Instance;
2858     if (!Operand || !OrigLoop->contains(Operand) ||
2859         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2860       InputInstance.Lane = 0;
2861     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2862     Cloned->setOperand(op, NewOp);
2863   }
2864   addNewMetadata(Cloned, Instr);
2865 
2866   // Place the cloned scalar in the new loop.
2867   Builder.Insert(Cloned);
2868 
2869   // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2870   // representing scalar values in VPTransformState. Add the cloned scalar to
2871   // the scalar map entry.
2872   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2873 
2874   // If we just cloned a new assumption, add it the assumption cache.
2875   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2876     if (II->getIntrinsicID() == Intrinsic::assume)
2877       AC->registerAssumption(II);
2878 
2879   // End if-block.
2880   if (IfPredicateInstr)
2881     PredicatedInstructions.push_back(Cloned);
2882 }
2883 
2884 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2885                                                       Value *End, Value *Step,
2886                                                       Instruction *DL) {
2887   BasicBlock *Header = L->getHeader();
2888   BasicBlock *Latch = L->getLoopLatch();
2889   // As we're just creating this loop, it's possible no latch exists
2890   // yet. If so, use the header as this will be a single block loop.
2891   if (!Latch)
2892     Latch = Header;
2893 
2894   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2895   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2896   setDebugLocFromInst(Builder, OldInst);
2897   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2898 
2899   Builder.SetInsertPoint(Latch->getTerminator());
2900   setDebugLocFromInst(Builder, OldInst);
2901 
2902   // Create i+1 and fill the PHINode.
2903   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2904   Induction->addIncoming(Start, L->getLoopPreheader());
2905   Induction->addIncoming(Next, Latch);
2906   // Create the compare.
2907   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2908   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2909 
2910   // Now we have two terminators. Remove the old one from the block.
2911   Latch->getTerminator()->eraseFromParent();
2912 
2913   return Induction;
2914 }
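// For illustration (added sketch with generic names): the canonical induction
// created above looks roughly like
//   vector.body:
//     %index      = phi i64 [ %start, %vector.ph ], [ %index.next, %vector.body ]
//     ...
//     %index.next = add i64 %index, %step
//     %cmp        = icmp eq i64 %index.next, %end
//     br i1 %cmp, label %exit.block, label %vector.body
// where %step is typically VF * UF for the main vector loop.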
2915 
2916 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2917   if (TripCount)
2918     return TripCount;
2919 
2920   assert(L && "Create Trip Count for null loop.");
2921   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2922   // Find the loop boundaries.
2923   ScalarEvolution *SE = PSE.getSE();
2924   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2925   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2926          "Invalid loop count");
2927 
2928   Type *IdxTy = Legal->getWidestInductionType();
2929   assert(IdxTy && "No type for induction");
2930 
2931   // The exit count might have type i64 while the phi is i32. This can
2932   // happen if we have an induction variable that is sign extended before the
2933   // compare. The only way that we get a backedge taken count is that the
2934   // induction variable was signed and as such will not overflow. In such a case
2935   // truncation is legal.
2936   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2937       IdxTy->getPrimitiveSizeInBits())
2938     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2939   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2940 
2941   // Get the total trip count from the count by adding 1.
2942   const SCEV *ExitCount = SE->getAddExpr(
2943       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2944 
2945   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2946 
2947   // Expand the trip count and place the new instructions in the preheader.
2948   // Notice that the pre-header does not change, only the loop body.
2949   SCEVExpander Exp(*SE, DL, "induction");
2950 
2951   // Count holds the overall loop count (N).
2952   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2953                                 L->getLoopPreheader()->getTerminator());
2954 
2955   if (TripCount->getType()->isPointerTy())
2956     TripCount =
2957         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2958                                     L->getLoopPreheader()->getTerminator());
2959 
2960   return TripCount;
2961 }
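// Added note (illustrative): conceptually, for a loop
//   for (i = 0; i != n; ++i)
// SCEV reports a backedge-taken count of n - 1, so the trip count expanded
// here is (n - 1) + 1, which simplifies back to n; the expansion is placed in
// the preheader so it is available to the runtime checks and the vector loop.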
2962 
2963 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2964   if (VectorTripCount)
2965     return VectorTripCount;
2966 
2967   Value *TC = getOrCreateTripCount(L);
2968   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2969 
2970   Type *Ty = TC->getType();
2971   // This is where we can make the step a runtime constant.
2972   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
2973 
2974   // If the tail is to be folded by masking, round the number of iterations N
2975   // up to a multiple of Step instead of rounding down. This is done by first
2976   // adding Step-1 and then rounding down. Note that it's ok if this addition
2977   // overflows: the vector induction variable will eventually wrap to zero given
2978   // that it starts at zero and its Step is a power of two; the loop will then
2979   // exit, with the last early-exit vector comparison also producing all-true.
2980   if (Cost->foldTailByMasking()) {
2981     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2982            "VF*UF must be a power of 2 when folding tail by masking");
2983     assert(!VF.isScalable() &&
2984            "Tail folding not yet supported for scalable vectors");
2985     TC = Builder.CreateAdd(
2986         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2987   }
2988 
2989   // Now we need to generate the expression for the part of the loop that the
2990   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2991   // iterations are not required for correctness, or N - Step, otherwise. Step
2992   // is equal to the vectorization factor (number of SIMD elements) times the
2993   // unroll factor (number of SIMD instructions).
2994   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2995 
2996   // If there is a non-reversed interleaved group that may speculatively access
2997   // memory out-of-bounds, we need to ensure that there will be at least one
2998   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2999   // the trip count, we set the remainder to be equal to the step. If the step
3000   // does not evenly divide the trip count, no adjustment is necessary since
3001   // there will already be scalar iterations. Note that the minimum iterations
3002   // check ensures that N >= Step.
3003   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3004     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3005     R = Builder.CreateSelect(IsZero, Step, R);
3006   }
3007 
3008   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3009 
3010   return VectorTripCount;
3011 }
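// Worked example (added, illustrative numbers): with trip count N = 21 and
// VF * UF = 8,
//   n.mod.vf = 21 urem 8 = 5   and   n.vec = 21 - 5 = 16,
// so the vector loop runs two iterations and the scalar epilogue handles the
// remaining five. If a scalar epilogue is required and N = 24, the remainder
// of 0 is bumped up to 8 so that n.vec = 16 still leaves a full final step for
// the epilogue. With tail folding, N is instead rounded up first (21 -> 24),
// giving n.vec = 24 and no scalar remainder.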
3012 
3013 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3014                                                    const DataLayout &DL) {
3015   // Verify that V is a vector type with same number of elements as DstVTy.
3016   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3017   unsigned VF = DstFVTy->getNumElements();
3018   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3019   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3020   Type *SrcElemTy = SrcVecTy->getElementType();
3021   Type *DstElemTy = DstFVTy->getElementType();
3022   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3023          "Vector elements must have same size");
3024 
3025   // Do a direct cast if element types are castable.
3026   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3027     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3028   }
3029   // V cannot be directly casted to desired vector type.
3030   // May happen when V is a floating point vector but DstVTy is a vector of
3031   // pointers or vice-versa. Handle this using a two-step bitcast using an
3032   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3033   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3034          "Only one type should be a pointer type");
3035   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3036          "Only one type should be a floating point type");
3037   Type *IntTy =
3038       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3039   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3040   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3041   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3042 }
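// Illustrative example (added): a <4 x float> cannot be bitcast directly to a
// <4 x i8*> vector, so the two-step path above goes through an integer vector
// of matching element width; e.g. on a target with 32-bit pointers:
//   %as.int = bitcast <4 x float> %v to <4 x i32>
//   %as.ptr = inttoptr <4 x i32> %as.int to <4 x i8*>
// CreateBitOrPointerCast picks bitcast or inttoptr/ptrtoint as appropriate for
// each of the two steps.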
3043 
3044 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3045                                                          BasicBlock *Bypass) {
3046   Value *Count = getOrCreateTripCount(L);
3047   // Reuse existing vector loop preheader for TC checks.
3048   // Note that new preheader block is generated for vector loop.
3049   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3050   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3051 
3052   // Generate code to check if the loop's trip count is less than VF * UF, or
3053   // equal to it in case a scalar epilogue is required; this implies that the
3054   // vector trip count is zero. This check also covers the case where adding one
3055   // to the backedge-taken count overflowed leading to an incorrect trip count
3056   // of zero. In this case we will also jump to the scalar loop.
3057   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3058                                           : ICmpInst::ICMP_ULT;
3059 
3060   // If tail is to be folded, vector loop takes care of all iterations.
3061   Value *CheckMinIters = Builder.getFalse();
3062   if (!Cost->foldTailByMasking()) {
3063     Value *Step =
3064         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3065     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3066   }
3067   // Create new preheader for vector loop.
3068   LoopVectorPreHeader =
3069       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3070                  "vector.ph");
3071 
3072   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3073                                DT->getNode(Bypass)->getIDom()) &&
3074          "TC check is expected to dominate Bypass");
3075 
3076   // Update dominator for Bypass & LoopExit.
3077   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3078   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3079 
3080   ReplaceInstWithInst(
3081       TCCheckBlock->getTerminator(),
3082       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3083   LoopBypassBlocks.push_back(TCCheckBlock);
3084 }
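// Sketch of the emitted guard (added; assumes VF = 4, UF = 2, no tail folding,
// no scalar-epilogue requirement):
//   %min.iters.check = icmp ult i64 %trip.count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// i.e. control bypasses the vector loop whenever fewer than VF * UF iterations
// are available (or exactly VF * UF when a scalar epilogue is required, in
// which case ule is used instead of ult).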
3085 
3086 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3087   // Reuse existing vector loop preheader for SCEV checks.
3088   // Note that new preheader block is generated for vector loop.
3089   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
3090 
3091   // Generate the code to check the SCEV assumptions that we made.
3092   // We want the new basic block to start at the first instruction in a
3093   // sequence of instructions that form a check.
3094   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3095                    "scev.check");
3096   Value *SCEVCheck = Exp.expandCodeForPredicate(
3097       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
3098 
3099   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3100     if (C->isZero())
3101       return;
3102 
3103   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3104            (OptForSizeBasedOnProfile &&
3105             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3106          "Cannot SCEV check stride or overflow when optimizing for size");
3107 
3108   SCEVCheckBlock->setName("vector.scevcheck");
3109   // Create new preheader for vector loop.
3110   LoopVectorPreHeader =
3111       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
3112                  nullptr, "vector.ph");
3113 
3114   // Update dominator only if this is first RT check.
3115   if (LoopBypassBlocks.empty()) {
3116     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3117     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3118   }
3119 
3120   ReplaceInstWithInst(
3121       SCEVCheckBlock->getTerminator(),
3122       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
3123   LoopBypassBlocks.push_back(SCEVCheckBlock);
3124   AddedSafetyChecks = true;
3125 }
3126 
3127 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3128   // VPlan-native path does not do any analysis for runtime checks currently.
3129   if (EnableVPlanNativePath)
3130     return;
3131 
3132   // Reuse existing vector loop preheader for runtime memory checks.
3133   // Note that new preheader block is generated for vector loop.
3134   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3135 
3136   // Generate the code that checks at runtime whether the arrays overlap. We put
3137   // the checks into a separate block to make the more common case of few
3138   // elements faster.
3139   auto *LAI = Legal->getLAI();
3140   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3141   if (!RtPtrChecking.Need)
3142     return;
3143 
3144   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3145     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3146            "Cannot emit memory checks when optimizing for size, unless forced "
3147            "to vectorize.");
3148     ORE->emit([&]() {
3149       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3150                                         L->getStartLoc(), L->getHeader())
3151              << "Code-size may be reduced by not forcing "
3152                 "vectorization, or by source-code modifications "
3153                 "eliminating the need for runtime checks "
3154                 "(e.g., adding 'restrict').";
3155     });
3156   }
3157 
3158   MemCheckBlock->setName("vector.memcheck");
3159   // Create new preheader for vector loop.
3160   LoopVectorPreHeader =
3161       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3162                  "vector.ph");
3163 
3164   auto *CondBranch = cast<BranchInst>(
3165       Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
3166   ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
3167   LoopBypassBlocks.push_back(MemCheckBlock);
3168   AddedSafetyChecks = true;
3169 
3170   // Update dominator only if this is first RT check.
3171   if (LoopBypassBlocks.empty()) {
3172     DT->changeImmediateDominator(Bypass, MemCheckBlock);
3173     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
3174   }
3175 
3176   Instruction *FirstCheckInst;
3177   Instruction *MemRuntimeCheck;
3178   std::tie(FirstCheckInst, MemRuntimeCheck) =
3179       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
3180                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
3181   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3182                             "claimed checks are required");
3183   CondBranch->setCondition(MemRuntimeCheck);
3184 
3185   // We currently don't use LoopVersioning for the actual loop cloning but we
3186   // still use it to add the noalias metadata.
3187   LVer = std::make_unique<LoopVersioning>(
3188       *Legal->getLAI(),
3189       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3190       DT, PSE.getSE());
3191   LVer->prepareNoAliasMetadata();
3192 }
3193 
3194 Value *InnerLoopVectorizer::emitTransformedIndex(
3195     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3196     const InductionDescriptor &ID) const {
3197 
3198   SCEVExpander Exp(*SE, DL, "induction");
3199   auto Step = ID.getStep();
3200   auto StartValue = ID.getStartValue();
3201   assert(Index->getType() == Step->getType() &&
3202          "Index type does not match StepValue type");
3203 
3204   // Note: the IR at this point is broken. We cannot use SE to create any new
3205   // SCEV and then expand it, hoping that SCEV's simplification will give us
3206   // more optimal code. Unfortunately, attempting to do so on invalid IR may
3207   // lead to various SCEV crashes. So all we can do is use the builder and rely
3208   // on InstCombine for future simplifications. Here we handle some trivial
3209   // cases only.
3210   auto CreateAdd = [&B](Value *X, Value *Y) {
3211     assert(X->getType() == Y->getType() && "Types don't match!");
3212     if (auto *CX = dyn_cast<ConstantInt>(X))
3213       if (CX->isZero())
3214         return Y;
3215     if (auto *CY = dyn_cast<ConstantInt>(Y))
3216       if (CY->isZero())
3217         return X;
3218     return B.CreateAdd(X, Y);
3219   };
3220 
3221   auto CreateMul = [&B](Value *X, Value *Y) {
3222     assert(X->getType() == Y->getType() && "Types don't match!");
3223     if (auto *CX = dyn_cast<ConstantInt>(X))
3224       if (CX->isOne())
3225         return Y;
3226     if (auto *CY = dyn_cast<ConstantInt>(Y))
3227       if (CY->isOne())
3228         return X;
3229     return B.CreateMul(X, Y);
3230   };
3231 
3232   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3233   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3234   // the DomTree is not kept up-to-date for additional blocks generated in the
3235   // vector loop. By using the header as insertion point, we guarantee that the
3236   // expanded instructions dominate all their uses.
3237   auto GetInsertPoint = [this, &B]() {
3238     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3239     if (InsertBB != LoopVectorBody &&
3240         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3241       return LoopVectorBody->getTerminator();
3242     return &*B.GetInsertPoint();
3243   };
3244   switch (ID.getKind()) {
3245   case InductionDescriptor::IK_IntInduction: {
3246     assert(Index->getType() == StartValue->getType() &&
3247            "Index type does not match StartValue type");
3248     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3249       return B.CreateSub(StartValue, Index);
3250     auto *Offset = CreateMul(
3251         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3252     return CreateAdd(StartValue, Offset);
3253   }
3254   case InductionDescriptor::IK_PtrInduction: {
3255     assert(isa<SCEVConstant>(Step) &&
3256            "Expected constant step for pointer induction");
3257     return B.CreateGEP(
3258         StartValue->getType()->getPointerElementType(), StartValue,
3259         CreateMul(Index,
3260                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3261   }
3262   case InductionDescriptor::IK_FpInduction: {
3263     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3264     auto InductionBinOp = ID.getInductionBinOp();
3265     assert(InductionBinOp &&
3266            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3267             InductionBinOp->getOpcode() == Instruction::FSub) &&
3268            "Original bin op should be defined for FP induction");
3269 
3270     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3271 
3272     // Floating point operations had to be 'fast' to enable the induction.
3273     FastMathFlags Flags;
3274     Flags.setFast();
3275 
3276     Value *MulExp = B.CreateFMul(StepValue, Index);
3277     if (isa<Instruction>(MulExp))
3278       // We have to check because MulExp may be a constant.
3279       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3280 
3281     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3282                                "induction");
3283     if (isa<Instruction>(BOp))
3284       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3285 
3286     return BOp;
3287   }
3288   case InductionDescriptor::IK_NoInduction:
3289     return nullptr;
3290   }
3291   llvm_unreachable("invalid enum");
3292 }
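// For illustration (added): for an integer induction with start value %start
// and step %step, the transformed index of %index is essentially
//   %offset      = mul i64 %index, %step       ; folded away when step == 1
//   %transformed = add i64 %start, %offset     ; a sub is used when step == -1
// Pointer inductions emit a GEP off the start pointer instead, and FP
// inductions use fmul plus fadd/fsub with fast-math flags.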
3293 
3294 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3295   LoopScalarBody = OrigLoop->getHeader();
3296   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3297   LoopExitBlock = OrigLoop->getExitBlock();
3298   assert(LoopExitBlock && "Must have an exit block");
3299   assert(LoopVectorPreHeader && "Invalid loop structure");
3300 
3301   LoopMiddleBlock =
3302       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3303                  LI, nullptr, Twine(Prefix) + "middle.block");
3304   LoopScalarPreHeader =
3305       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3306                  nullptr, Twine(Prefix) + "scalar.ph");
3307   // We intentionally don't let SplitBlock update LoopInfo, since
3308   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3309   // LoopVectorBody is explicitly added to the correct place a few lines later.
3310   LoopVectorBody =
3311       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3312                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3313 
3314   // Update dominator for loop exit.
3315   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3316 
3317   // Create and register the new vector loop.
3318   Loop *Lp = LI->AllocateLoop();
3319   Loop *ParentLoop = OrigLoop->getParentLoop();
3320 
3321   // Insert the new loop into the loop nest and register the new basic blocks
3322   // before calling any utilities such as SCEV that require valid LoopInfo.
3323   if (ParentLoop) {
3324     ParentLoop->addChildLoop(Lp);
3325   } else {
3326     LI->addTopLevelLoop(Lp);
3327   }
3328   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3329   return Lp;
3330 }
3331 
3332 void InnerLoopVectorizer::createInductionResumeValues(
3333     Loop *L, Value *VectorTripCount,
3334     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3335   assert(VectorTripCount && L && "Expected valid arguments");
3336   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3337           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3338          "Inconsistent information about additional bypass.");
3339   // We are going to resume the execution of the scalar loop.
3340   // Go over all of the induction variables that we found and fix the
3341   // PHIs that are left in the scalar version of the loop.
3342   // The starting values of PHI nodes depend on the counter of the last
3343   // iteration in the vectorized loop.
3344   // If we come from a bypass edge then we need to start from the original
3345   // start value.
3346   for (auto &InductionEntry : Legal->getInductionVars()) {
3347     PHINode *OrigPhi = InductionEntry.first;
3348     InductionDescriptor II = InductionEntry.second;
3349 
3350     // Create phi nodes to merge from the backedge-taken check block.
3351     PHINode *BCResumeVal =
3352         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3353                         LoopScalarPreHeader->getTerminator());
3354     // Copy original phi DL over to the new one.
3355     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3356     Value *&EndValue = IVEndValues[OrigPhi];
3357     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3358     if (OrigPhi == OldInduction) {
3359       // We know what the end value is.
3360       EndValue = VectorTripCount;
3361     } else {
3362       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3363       Type *StepType = II.getStep()->getType();
3364       Instruction::CastOps CastOp =
3365           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3366       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3367       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3368       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3369       EndValue->setName("ind.end");
3370 
3371       // Compute the end value for the additional bypass (if applicable).
3372       if (AdditionalBypass.first) {
3373         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3374         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3375                                          StepType, true);
3376         CRD =
3377             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3378         EndValueFromAdditionalBypass =
3379             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3380         EndValueFromAdditionalBypass->setName("ind.end");
3381       }
3382     }
3383     // The new PHI merges the original incoming value, in case of a bypass,
3384     // or the value at the end of the vectorized loop.
3385     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3386 
3387     // Fix the scalar body counter (PHI node).
3388     // The old induction's phi node in the scalar body needs the truncated
3389     // value.
3390     for (BasicBlock *BB : LoopBypassBlocks)
3391       BCResumeVal->addIncoming(II.getStartValue(), BB);
3392 
3393     if (AdditionalBypass.first)
3394       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3395                                             EndValueFromAdditionalBypass);
3396 
3397     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3398   }
3399 }
3400 
3401 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3402                                                       MDNode *OrigLoopID) {
3403   assert(L && "Expected valid loop.");
3404 
3405   // The trip counts should be cached by now.
3406   Value *Count = getOrCreateTripCount(L);
3407   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3408 
3409   // We need the OrigLoop (scalar loop part) latch terminator to help
3410   // produce correct debug info for the middle block BB instructions.
3411   // The legality check stage guarantees that the loop will have a single
3412   // latch.
3413   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3414          "Scalar loop latch terminator isn't a branch");
3415   BranchInst *ScalarLatchBr =
3416       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3417 
3418   // Add a check in the middle block to see if we have completed
3419   // all of the iterations in the first vector loop.
3420   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3421   // If tail is to be folded, we know we don't need to run the remainder.
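  // For example, with an original trip count N of 10 and VF * UF of 4, the
  // vector trip count N - N % (VF * UF) is 8; since 8 != 10 the middle block
  // branches to the scalar loop, which runs the remaining two iterations.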
3422   Value *CmpN = Builder.getTrue();
3423   if (!Cost->foldTailByMasking()) {
3424     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3425                            VectorTripCount, "cmp.n",
3426                            LoopMiddleBlock->getTerminator());
3427 
3428     // Here we use the same DebugLoc as the scalar loop latch branch instead
3429     // of the corresponding compare because they may have ended up with
3430     // different line numbers and we want to avoid awkward line stepping while
3431     // debugging. E.g., the compare may have a line number inside the loop.
3432     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3433   }
3434 
3435   BranchInst *BrInst =
3436       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3437   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3438   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3439 
3440   // Get ready to start creating new instructions into the vectorized body.
3441   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3442          "Inconsistent vector loop preheader");
3443   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3444 
3445   Optional<MDNode *> VectorizedLoopID =
3446       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3447                                       LLVMLoopVectorizeFollowupVectorized});
3448   if (VectorizedLoopID.hasValue()) {
3449     L->setLoopID(VectorizedLoopID.getValue());
3450 
3451     // Do not setAlreadyVectorized if loop attributes have been defined
3452     // explicitly.
3453     return LoopVectorPreHeader;
3454   }
3455 
3456   // Keep all loop hints from the original loop on the vector loop (we'll
3457   // replace the vectorizer-specific hints below).
3458   if (MDNode *LID = OrigLoop->getLoopID())
3459     L->setLoopID(LID);
3460 
3461   LoopVectorizeHints Hints(L, true, *ORE);
3462   Hints.setAlreadyVectorized();
3463 
3464 #ifdef EXPENSIVE_CHECKS
3465   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3466   LI->verify(*DT);
3467 #endif
3468 
3469   return LoopVectorPreHeader;
3470 }
3471 
3472 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3473   /*
3474    In this function we generate a new loop. The new loop will contain
3475    the vectorized instructions while the old loop will continue to run the
3476    scalar remainder.
3477 
3478        [ ] <-- loop iteration number check.
3479     /   |
3480    /    v
3481   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3482   |  /  |
3483   | /   v
3484   ||   [ ]     <-- vector pre header.
3485   |/    |
3486   |     v
3487   |    [  ] \
3488   |    [  ]_|   <-- vector loop.
3489   |     |
3490   |     v
3491   |   -[ ]   <--- middle-block.
3492   |  /  |
3493   | /   v
3494   -|- >[ ]     <--- new preheader.
3495    |    |
3496    |    v
3497    |   [ ] \
3498    |   [ ]_|   <-- old scalar loop to handle remainder.
3499     \   |
3500      \  v
3501       >[ ]     <-- exit block.
3502    ...
3503    */
3504 
3505   // Get the metadata of the original loop before it gets modified.
3506   MDNode *OrigLoopID = OrigLoop->getLoopID();
3507 
3508   // Create an empty vector loop, and prepare basic blocks for the runtime
3509   // checks.
3510   Loop *Lp = createVectorLoopSkeleton("");
3511 
3512   // Now, compare the new count to zero. If it is zero skip the vector loop and
3513   // jump to the scalar loop. This check also covers the case where the
3514   // backedge-taken count is uint##_max: adding one to it will overflow leading
3515   // to an incorrect trip count of zero. In this (rare) case we will also jump
3516   // to the scalar loop.
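  // For example, with an 8-bit trip count, a backedge-taken count of 255 would
  // wrap to a trip count of 0, so such loops bypass the vector loop entirely.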
3517   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3518 
3519   // Generate the code to check any assumptions that we've made for SCEV
3520   // expressions.
3521   emitSCEVChecks(Lp, LoopScalarPreHeader);
3522 
3523   // Generate the code that checks in runtime if arrays overlap. We put the
3524   // checks into a separate block to make the more common case of few elements
3525   // faster.
3526   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3527 
3528   // Some loops have a single integer induction variable, while other loops
3529   // don't. One example is C++ iterators, which often have multiple pointer
3530   // induction variables. In the code below we also support a case where we
3531   // don't have a single induction variable.
3532   //
3533   // We try as hard as possible to reuse an induction variable from the
3534   // original loop. However, if we don't find one that:
3535   //   - is an integer
3536   //   - counts from zero, stepping by one
3537   //   - is the size of the widest induction variable type
3538   // then we create a new one.
3539   OldInduction = Legal->getPrimaryInduction();
3540   Type *IdxTy = Legal->getWidestInductionType();
3541   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3542   // The loop step is equal to the vectorization factor (num of SIMD elements)
3543   // times the unroll factor (num of SIMD instructions).
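  // For example, with VF = 4 and UF = 2 the canonical induction variable is
  // advanced by 8 on each iteration of the vector loop.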
3544   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3545   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3546   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3547   Induction =
3548       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3549                               getDebugLocFromInstOrOperands(OldInduction));
3550 
3551   // Emit phis for the new starting index of the scalar loop.
3552   createInductionResumeValues(Lp, CountRoundDown);
3553 
3554   return completeLoopSkeleton(Lp, OrigLoopID);
3555 }
3556 
3557 // Fix up external users of the induction variable. At this point, we are
3558 // in LCSSA form, with all external PHIs that use the IV having one input value,
3559 // coming from the remainder loop. We need those PHIs to also have a correct
3560 // value for the IV when arriving directly from the middle block.
3561 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3562                                        const InductionDescriptor &II,
3563                                        Value *CountRoundDown, Value *EndValue,
3564                                        BasicBlock *MiddleBlock) {
3565   // There are two kinds of external IV usages - those that use the value
3566   // computed in the last iteration (the PHI) and those that use the penultimate
3567   // value (the value that feeds into the phi from the loop latch).
3568   // We allow both, but they, obviously, have different values.
3569 
3570   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3571 
3572   DenseMap<Value *, Value *> MissingVals;
3573 
3574   // An external user of the last iteration's value should see the value that
3575   // the remainder loop uses to initialize its own IV.
3576   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3577   for (User *U : PostInc->users()) {
3578     Instruction *UI = cast<Instruction>(U);
3579     if (!OrigLoop->contains(UI)) {
3580       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3581       MissingVals[UI] = EndValue;
3582     }
3583   }
3584 
3585   // An external user of the penultimate value needs to see EndValue - Step.
3586   // The simplest way to get this is to recompute it from the constituent SCEVs,
3587   // that is Start + (Step * (CRD - 1)).
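  // For example, for a canonical IV starting at 0 with step 1 and a vector
  // trip count of 8, a user of the latch-incremented value sees 8 (the
  // EndValue), while a user of the phi itself sees 7.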
3588   for (User *U : OrigPhi->users()) {
3589     auto *UI = cast<Instruction>(U);
3590     if (!OrigLoop->contains(UI)) {
3591       const DataLayout &DL =
3592           OrigLoop->getHeader()->getModule()->getDataLayout();
3593       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3594 
3595       IRBuilder<> B(MiddleBlock->getTerminator());
3596       Value *CountMinusOne = B.CreateSub(
3597           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3598       Value *CMO =
3599           !II.getStep()->getType()->isIntegerTy()
3600               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3601                              II.getStep()->getType())
3602               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3603       CMO->setName("cast.cmo");
3604       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3605       Escape->setName("ind.escape");
3606       MissingVals[UI] = Escape;
3607     }
3608   }
3609 
3610   for (auto &I : MissingVals) {
3611     PHINode *PHI = cast<PHINode>(I.first);
3612     // One corner case we have to handle is two IVs "chasing" each-other,
3613     // that is %IV2 = phi [...], [ %IV1, %latch ]
3614     // In this case, if IV1 has an external use, we need to avoid adding both
3615     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3616     // don't already have an incoming value for the middle block.
3617     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3618       PHI->addIncoming(I.second, MiddleBlock);
3619   }
3620 }
3621 
3622 namespace {
3623 
3624 struct CSEDenseMapInfo {
3625   static bool canHandle(const Instruction *I) {
3626     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3627            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3628   }
3629 
3630   static inline Instruction *getEmptyKey() {
3631     return DenseMapInfo<Instruction *>::getEmptyKey();
3632   }
3633 
3634   static inline Instruction *getTombstoneKey() {
3635     return DenseMapInfo<Instruction *>::getTombstoneKey();
3636   }
3637 
3638   static unsigned getHashValue(const Instruction *I) {
3639     assert(canHandle(I) && "Unknown instruction!");
3640     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3641                                                            I->value_op_end()));
3642   }
3643 
3644   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3645     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3646         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3647       return LHS == RHS;
3648     return LHS->isIdenticalTo(RHS);
3649   }
3650 };
3651 
3652 } // end anonymous namespace
3653 
3654 /// Perform CSE of induction variable instructions.
3655 static void cse(BasicBlock *BB) {
3656   // Perform simple cse.
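  // For example, two identical 'extractelement' instructions created while
  // scalarizing a vector value hash to the same key; the second is replaced by
  // the first and erased.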
3657   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3658   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3659     Instruction *In = &*I++;
3660 
3661     if (!CSEDenseMapInfo::canHandle(In))
3662       continue;
3663 
3664     // Check if we can replace this instruction with any of the
3665     // visited instructions.
3666     if (Instruction *V = CSEMap.lookup(In)) {
3667       In->replaceAllUsesWith(V);
3668       In->eraseFromParent();
3669       continue;
3670     }
3671 
3672     CSEMap[In] = In;
3673   }
3674 }
3675 
3676 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3677                                                        ElementCount VF,
3678                                                        bool &NeedToScalarize) {
3679   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3680   Function *F = CI->getCalledFunction();
3681   Type *ScalarRetTy = CI->getType();
3682   SmallVector<Type *, 4> Tys, ScalarTys;
3683   for (auto &ArgOp : CI->arg_operands())
3684     ScalarTys.push_back(ArgOp->getType());
3685 
3686   // Estimate cost of scalarized vector call. The source operands are assumed
3687   // to be vectors, so we need to extract individual elements from there,
3688   // execute VF scalar calls, and then gather the result into the vector return
3689   // value.
3690   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3691                                                  TTI::TCK_RecipThroughput);
3692   if (VF.isScalar())
3693     return ScalarCallCost;
3694 
3695   // Compute corresponding vector type for return value and arguments.
3696   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3697   for (Type *ScalarTy : ScalarTys)
3698     Tys.push_back(ToVectorTy(ScalarTy, VF));
3699 
3700   // Compute costs of unpacking argument values for the scalar calls and
3701   // packing the return values to a vector.
3702   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3703 
3704   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
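  // For example, with VF = 4, a scalar call cost of 10 and a scalarization
  // overhead of 6, the scalarized estimate is 4 * 10 + 6 = 46; a vector
  // library call is used below only if its cost is lower than this.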
3705 
3706   // If we can't emit a vector call for this function, then the currently found
3707   // cost is the cost we need to return.
3708   NeedToScalarize = true;
3709   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3710   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3711 
3712   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3713     return Cost;
3714 
3715   // If the corresponding vector cost is cheaper, return its cost.
3716   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3717                                                  TTI::TCK_RecipThroughput);
3718   if (VectorCallCost < Cost) {
3719     NeedToScalarize = false;
3720     return VectorCallCost;
3721   }
3722   return Cost;
3723 }
3724 
3725 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3726                                                             ElementCount VF) {
3727   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3728   assert(ID && "Expected intrinsic call!");
3729 
3730   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3731   return TTI.getIntrinsicInstrCost(CostAttrs,
3732                                    TargetTransformInfo::TCK_RecipThroughput);
3733 }
3734 
3735 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3736   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3737   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3738   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3739 }
3740 
3741 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3742   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3743   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3744   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3745 }
3746 
3747 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3748   // For every instruction `I` in MinBWs, truncate the operands, create a
3749   // truncated version of `I` and reextend its result. InstCombine runs
3750   // later and will remove any ext/trunc pairs.
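  // For example, if an i32 add is known to need only 8 bits, its widened
  // <4 x i32> form is rewritten to truncate the operands (or strip a matching
  // zext), perform the add in <4 x i8>, and zero-extend the result back to
  // <4 x i32> for its users.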
3751   SmallPtrSet<Value *, 4> Erased;
3752   for (const auto &KV : Cost->getMinimalBitwidths()) {
3753     // If the value wasn't vectorized, we must maintain the original scalar
3754     // type. The absence of the value from VectorLoopValueMap indicates that it
3755     // wasn't vectorized.
3756     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3757       continue;
3758     for (unsigned Part = 0; Part < UF; ++Part) {
3759       Value *I = getOrCreateVectorValue(KV.first, Part);
3760       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3761         continue;
3762       Type *OriginalTy = I->getType();
3763       Type *ScalarTruncatedTy =
3764           IntegerType::get(OriginalTy->getContext(), KV.second);
3765       auto *TruncatedTy = FixedVectorType::get(
3766           ScalarTruncatedTy,
3767           cast<FixedVectorType>(OriginalTy)->getNumElements());
3768       if (TruncatedTy == OriginalTy)
3769         continue;
3770 
3771       IRBuilder<> B(cast<Instruction>(I));
3772       auto ShrinkOperand = [&](Value *V) -> Value * {
3773         if (auto *ZI = dyn_cast<ZExtInst>(V))
3774           if (ZI->getSrcTy() == TruncatedTy)
3775             return ZI->getOperand(0);
3776         return B.CreateZExtOrTrunc(V, TruncatedTy);
3777       };
3778 
3779       // The actual instruction modification depends on the instruction type,
3780       // unfortunately.
3781       Value *NewI = nullptr;
3782       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3783         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3784                              ShrinkOperand(BO->getOperand(1)));
3785 
3786         // Any wrapping introduced by shrinking this operation shouldn't be
3787         // considered undefined behavior. So, we can't unconditionally copy
3788         // arithmetic wrapping flags to NewI.
3789         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3790       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3791         NewI =
3792             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3793                          ShrinkOperand(CI->getOperand(1)));
3794       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3795         NewI = B.CreateSelect(SI->getCondition(),
3796                               ShrinkOperand(SI->getTrueValue()),
3797                               ShrinkOperand(SI->getFalseValue()));
3798       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3799         switch (CI->getOpcode()) {
3800         default:
3801           llvm_unreachable("Unhandled cast!");
3802         case Instruction::Trunc:
3803           NewI = ShrinkOperand(CI->getOperand(0));
3804           break;
3805         case Instruction::SExt:
3806           NewI = B.CreateSExtOrTrunc(
3807               CI->getOperand(0),
3808               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3809           break;
3810         case Instruction::ZExt:
3811           NewI = B.CreateZExtOrTrunc(
3812               CI->getOperand(0),
3813               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3814           break;
3815         }
3816       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3817         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3818                              ->getNumElements();
3819         auto *O0 = B.CreateZExtOrTrunc(
3820             SI->getOperand(0),
3821             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3822         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3823                              ->getNumElements();
3824         auto *O1 = B.CreateZExtOrTrunc(
3825             SI->getOperand(1),
3826             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3827 
3828         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3829       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3830         // Don't do anything with the operands, just extend the result.
3831         continue;
3832       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3833         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3834                             ->getNumElements();
3835         auto *O0 = B.CreateZExtOrTrunc(
3836             IE->getOperand(0),
3837             FixedVectorType::get(ScalarTruncatedTy, Elements));
3838         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3839         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3840       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3841         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3842                             ->getNumElements();
3843         auto *O0 = B.CreateZExtOrTrunc(
3844             EE->getOperand(0),
3845             FixedVectorType::get(ScalarTruncatedTy, Elements));
3846         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3847       } else {
3848         // If we don't know what to do, be conservative and don't do anything.
3849         continue;
3850       }
3851 
3852       // Lastly, extend the result.
3853       NewI->takeName(cast<Instruction>(I));
3854       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3855       I->replaceAllUsesWith(Res);
3856       cast<Instruction>(I)->eraseFromParent();
3857       Erased.insert(I);
3858       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3859     }
3860   }
3861 
3862   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3863   for (const auto &KV : Cost->getMinimalBitwidths()) {
3864     // If the value wasn't vectorized, we must maintain the original scalar
3865     // type. The absence of the value from VectorLoopValueMap indicates that it
3866     // wasn't vectorized.
3867     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3868       continue;
3869     for (unsigned Part = 0; Part < UF; ++Part) {
3870       Value *I = getOrCreateVectorValue(KV.first, Part);
3871       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3872       if (Inst && Inst->use_empty()) {
3873         Value *NewI = Inst->getOperand(0);
3874         Inst->eraseFromParent();
3875         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3876       }
3877     }
3878   }
3879 }
3880 
3881 void InnerLoopVectorizer::fixVectorizedLoop() {
3882   // Insert truncates and extends for any truncated instructions as hints to
3883   // InstCombine.
3884   if (VF.isVector())
3885     truncateToMinimalBitwidths();
3886 
3887   // Fix widened non-induction PHIs by setting up the PHI operands.
3888   if (OrigPHIsToFix.size()) {
3889     assert(EnableVPlanNativePath &&
3890            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3891     fixNonInductionPHIs();
3892   }
3893 
3894   // At this point every instruction in the original loop is widened to a
3895   // vector form. Now we need to fix the recurrences in the loop. These PHI
3896   // nodes are currently empty because we did not want to introduce cycles.
3897   // This is the second stage of vectorizing recurrences.
3898   fixCrossIterationPHIs();
3899 
3900   // Forget the original basic block.
3901   PSE.getSE()->forgetLoop(OrigLoop);
3902 
3903   // Fix-up external users of the induction variables.
3904   for (auto &Entry : Legal->getInductionVars())
3905     fixupIVUsers(Entry.first, Entry.second,
3906                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3907                  IVEndValues[Entry.first], LoopMiddleBlock);
3908 
3909   fixLCSSAPHIs();
3910   for (Instruction *PI : PredicatedInstructions)
3911     sinkScalarOperands(&*PI);
3912 
3913   // Remove redundant induction instructions.
3914   cse(LoopVectorBody);
3915 
3916   // Set/update profile weights for the vector and remainder loops as original
3917   // loop iterations are now distributed among them. Note that the original loop
3918   // represented by LoopScalarBody becomes the remainder loop after vectorization.
3919   //
3920   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3921   // get slightly less accurate weights, but that should be OK since the
3922   // profile is not inherently precise anyway. A possible bypass of the vector
3923   // code caused by the legality checks is also ignored, optimistically
3924   // assigning all the weight to the vector loop.
3925   //
3926   // For scalable vectorization we can't know at compile time how many iterations
3927   // of the loop are handled in one vector iteration, so instead assume a pessimistic
3928   // vscale of '1'.
3929   setProfileInfoAfterUnrolling(
3930       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3931       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3932 }
3933 
3934 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3935   // In order to support recurrences we need to be able to vectorize Phi nodes.
3936   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3937   // stage #2: We now need to fix the recurrences by adding incoming edges to
3938   // the currently empty PHI nodes. At this point every instruction in the
3939   // original loop is widened to a vector form so we can use them to construct
3940   // the incoming edges.
3941   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3942     // Handle first-order recurrences and reductions that need to be fixed.
3943     if (Legal->isFirstOrderRecurrence(&Phi))
3944       fixFirstOrderRecurrence(&Phi);
3945     else if (Legal->isReductionVariable(&Phi))
3946       fixReduction(&Phi);
3947   }
3948 }
3949 
3950 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3951   // This is the second phase of vectorizing first-order recurrences. An
3952   // overview of the transformation is described below. Suppose we have the
3953   // following loop.
3954   //
3955   //   for (int i = 0; i < n; ++i)
3956   //     b[i] = a[i] - a[i - 1];
3957   //
3958   // There is a first-order recurrence on "a". For this loop, the shorthand
3959   // scalar IR looks like:
3960   //
3961   //   scalar.ph:
3962   //     s_init = a[-1]
3963   //     br scalar.body
3964   //
3965   //   scalar.body:
3966   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3967   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3968   //     s2 = a[i]
3969   //     b[i] = s2 - s1
3970   //     br cond, scalar.body, ...
3971   //
3972   // In this example, s1 is a recurrence because its value depends on the
3973   // previous iteration. In the first phase of vectorization, we created a
3974   // temporary value for s1. We now complete the vectorization and produce the
3975   // shorthand vector IR shown below (for VF = 4, UF = 1).
3976   //
3977   //   vector.ph:
3978   //     v_init = vector(..., ..., ..., a[-1])
3979   //     br vector.body
3980   //
3981   //   vector.body
3982   //     i = phi [0, vector.ph], [i+4, vector.body]
3983   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3984   //     v2 = a[i, i+1, i+2, i+3];
3985   //     v3 = vector(v1(3), v2(0, 1, 2))
3986   //     b[i, i+1, i+2, i+3] = v2 - v3
3987   //     br cond, vector.body, middle.block
3988   //
3989   //   middle.block:
3990   //     x = v2(3)
3991   //     br scalar.ph
3992   //
3993   //   scalar.ph:
3994   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3995   //     br scalar.body
3996   //
3997   // After the vector loop finishes executing, we extract the next value of
3998   // the recurrence (x) to use as the initial value in the scalar loop.
3999 
4000   // Get the original loop preheader and single loop latch.
4001   auto *Preheader = OrigLoop->getLoopPreheader();
4002   auto *Latch = OrigLoop->getLoopLatch();
4003 
4004   // Get the initial and previous values of the scalar recurrence.
4005   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4006   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4007 
4008   // Create a vector from the initial value.
4009   auto *VectorInit = ScalarInit;
4010   if (VF.isVector()) {
4011     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4012     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4013     VectorInit = Builder.CreateInsertElement(
4014         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4015         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4016   }
4017 
4018   // We constructed a temporary phi node in the first phase of vectorization.
4019   // This phi node will eventually be deleted.
4020   Builder.SetInsertPoint(
4021       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4022 
4023   // Create a phi node for the new recurrence. The current value will either be
4024   // the initial value inserted into a vector or loop-varying vector value.
4025   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4026   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4027 
4028   // Get the vectorized previous value of the last part UF - 1. It appears last
4029   // among all unrolled iterations, due to the order of their construction.
4030   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4031 
4032   // Find and set the insertion point after the previous value if it is an
4033   // instruction.
4034   BasicBlock::iterator InsertPt;
4035   // Note that the previous value may have been constant-folded so it is not
4036   // guaranteed to be an instruction in the vector loop.
4037   // FIXME: Loop invariant values do not form recurrences. We should deal with
4038   //        them earlier.
4039   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4040     InsertPt = LoopVectorBody->getFirstInsertionPt();
4041   else {
4042     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4043     if (isa<PHINode>(PreviousLastPart))
4044       // If the previous value is a phi node, we should insert after all the phi
4045       // nodes in the block containing the PHI to avoid breaking basic block
4046       // verification. Note that the basic block may be different from
4047       // LoopVectorBody, in case we predicate the loop.
4048       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4049     else
4050       InsertPt = ++PreviousInst->getIterator();
4051   }
4052   Builder.SetInsertPoint(&*InsertPt);
4053 
4054   // We will construct a vector for the recurrence by combining the values for
4055   // the current and previous iterations. This is the required shuffle mask.
4056   assert(!VF.isScalable());
4057   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4058   ShuffleMask[0] = VF.getKnownMinValue() - 1;
4059   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4060     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
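  // For VF = 4 the mask is <3, 4, 5, 6>: lane 0 takes the last element of the
  // incoming vector (the value carried over from the previous part) and lanes
  // 1-3 take the first three elements of the current part.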
4061 
4062   // The vector from which to take the initial value for the current iteration
4063   // (actual or unrolled). Initially, this is the vector phi node.
4064   Value *Incoming = VecPhi;
4065 
4066   // Shuffle the current and previous vector and update the vector parts.
4067   for (unsigned Part = 0; Part < UF; ++Part) {
4068     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4069     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4070     auto *Shuffle =
4071         VF.isVector()
4072             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4073             : Incoming;
4074     PhiPart->replaceAllUsesWith(Shuffle);
4075     cast<Instruction>(PhiPart)->eraseFromParent();
4076     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4077     Incoming = PreviousPart;
4078   }
4079 
4080   // Fix the latch value of the new recurrence in the vector loop.
4081   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4082 
4083   // Extract the last vector element in the middle block. This will be the
4084   // initial value for the recurrence when jumping to the scalar loop.
4085   auto *ExtractForScalar = Incoming;
4086   if (VF.isVector()) {
4087     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4088     ExtractForScalar = Builder.CreateExtractElement(
4089         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4090         "vector.recur.extract");
4091   }
4092   // Extract the second last element in the middle block if the
4093   // Phi is used outside the loop. We need to extract the phi itself
4094   // and not the last element (the phi update in the current iteration). This
4095   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4096   // when the scalar loop is not run at all.
4097   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4098   if (VF.isVector())
4099     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4100         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4101         "vector.recur.extract.for.phi");
4102   // When the loop is unrolled without vectorizing, initialize
4103   // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
4104   // `Incoming`. This is analogous to the vectorized case above: extracting the
4105   // second last element when VF > 1.
4106   else if (UF > 1)
4107     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4108 
4109   // Fix the initial value of the original recurrence in the scalar loop.
4110   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4111   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4112   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4113     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4114     Start->addIncoming(Incoming, BB);
4115   }
4116 
4117   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4118   Phi->setName("scalar.recur");
4119 
4120   // Finally, fix users of the recurrence outside the loop. The users will need
4121   // either the last value of the scalar recurrence or the last value of the
4122   // vector recurrence we extracted in the middle block. Since the loop is in
4123   // LCSSA form, we just need to find all the phi nodes for the original scalar
4124   // recurrence in the exit block, and then add an edge for the middle block.
4125   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4126     if (LCSSAPhi.getIncomingValue(0) == Phi) {
4127       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4128     }
4129   }
4130 }
4131 
4132 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4133   Constant *Zero = Builder.getInt32(0);
4134 
4135   // Get its reduction variable descriptor.
4136   assert(Legal->isReductionVariable(Phi) &&
4137          "Unable to find the reduction variable");
4138   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4139 
4140   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4141   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4142   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4143   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4144     RdxDesc.getMinMaxRecurrenceKind();
4145   setDebugLocFromInst(Builder, ReductionStartValue);
4146   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4147 
4148   // We need to generate a reduction vector from the incoming scalar.
4149   // To do so, we need to generate the 'identity' vector and override
4150   // one of the elements with the incoming scalar reduction. We need
4151   // to do it in the vector-loop preheader.
4152   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4153 
4154   // This is the vector-clone of the value that leaves the loop.
4155   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4156 
4157   // Find the reduction identity variable. Zero for addition, or, xor,
4158   // one for multiplication, -1 for And.
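  // For example, for an integer add reduction with VF = 4 and start value
  // %init, the identity is 0 and the vector start value is <%init, 0, 0, 0>,
  // so that summing all lanes after the loop yields the correct result.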
4159   Value *Identity;
4160   Value *VectorStart;
4161   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4162       RK == RecurrenceDescriptor::RK_FloatMinMax) {
4163     // MinMax reductions have the start value as their identity.
4164     if (VF.isScalar() || IsInLoopReductionPhi) {
4165       VectorStart = Identity = ReductionStartValue;
4166     } else {
4167       VectorStart = Identity =
4168         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4169     }
4170   } else {
4171     // Handle other reduction kinds:
4172     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4173         RK, MinMaxKind, VecTy->getScalarType());
4174     if (VF.isScalar() || IsInLoopReductionPhi) {
4175       Identity = Iden;
4176       // This vector is the Identity vector where the first element is the
4177       // incoming scalar reduction.
4178       VectorStart = ReductionStartValue;
4179     } else {
4180       Identity = ConstantVector::getSplat(VF, Iden);
4181 
4182       // This vector is the Identity vector where the first element is the
4183       // incoming scalar reduction.
4184       VectorStart =
4185         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4186     }
4187   }
4188 
4189   // Wrap flags are in general invalid after vectorization, clear them.
4190   clearReductionWrapFlags(RdxDesc);
4191 
4192   // Fix the vector-loop phi.
4193 
4194   // Reductions do not have to start at zero. They can start with
4195   // any loop invariant values.
4196   BasicBlock *Latch = OrigLoop->getLoopLatch();
4197   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4198 
4199   for (unsigned Part = 0; Part < UF; ++Part) {
4200     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4201     Value *Val = getOrCreateVectorValue(LoopVal, Part);
4202     // Make sure to add the reduction start value only to the
4203     // first unroll part.
4204     Value *StartVal = (Part == 0) ? VectorStart : Identity;
4205     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4206     cast<PHINode>(VecRdxPhi)
4207       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4208   }
4209 
4210   // Before each round, move the insertion point right between
4211   // the PHIs and the values we are going to write.
4212   // This allows us to write both PHINodes and the extractelement
4213   // instructions.
4214   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4215 
4216   setDebugLocFromInst(Builder, LoopExitInst);
4217 
4218   // If tail is folded by masking, the vector value to leave the loop should be
4219   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4220   // instead of the former. For an inloop reduction the reduction will already
4221   // be predicated, and does not need to be handled here.
4222   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4223     for (unsigned Part = 0; Part < UF; ++Part) {
4224       Value *VecLoopExitInst =
4225           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4226       Value *Sel = nullptr;
4227       for (User *U : VecLoopExitInst->users()) {
4228         if (isa<SelectInst>(U)) {
4229           assert(!Sel && "Reduction exit feeding two selects");
4230           Sel = U;
4231         } else
4232           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4233       }
4234       assert(Sel && "Reduction exit feeds no select");
4235       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4236 
4237       // If the target can create a predicated operator for the reduction at no
4238       // extra cost in the loop (for example a predicated vadd), it can be
4239       // cheaper for the select to remain in the loop than be sunk out of it,
4240       // and so use the select value for the phi instead of the old
4241       // LoopExitValue.
4242       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4243       if (PreferPredicatedReductionSelect ||
4244           TTI->preferPredicatedReductionSelect(
4245               RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4246               TargetTransformInfo::ReductionFlags())) {
4247         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4248         VecRdxPhi->setIncomingValueForBlock(
4249             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4250       }
4251     }
4252   }
4253 
4254   // If the vector reduction can be performed in a smaller type, we truncate
4255   // then extend the loop exit value to enable InstCombine to evaluate the
4256   // entire expression in the smaller type.
4257   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4258     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4259     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4260     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4261     Builder.SetInsertPoint(
4262         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4263     VectorParts RdxParts(UF);
4264     for (unsigned Part = 0; Part < UF; ++Part) {
4265       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4266       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4267       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4268                                         : Builder.CreateZExt(Trunc, VecTy);
4269       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4270            UI != RdxParts[Part]->user_end();)
4271         if (*UI != Trunc) {
4272           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4273           RdxParts[Part] = Extnd;
4274         } else {
4275           ++UI;
4276         }
4277     }
4278     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4279     for (unsigned Part = 0; Part < UF; ++Part) {
4280       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4281       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4282     }
4283   }
4284 
4285   // Reduce all of the unrolled parts into a single vector.
4286   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4287   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4288 
4289   // The middle block terminator has already been assigned a DebugLoc here (the
4290   // OrigLoop's single latch terminator). We want the whole middle block to
4291   // appear to execute on this line because: (a) it is all compiler generated,
4292   // (b) these instructions are always executed after evaluating the latch
4293   // conditional branch, and (c) other passes may add new predecessors which
4294   // terminate on this line. This is the easiest way to ensure we don't
4295   // accidentally cause an extra step back into the loop while debugging.
4296   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4297   for (unsigned Part = 1; Part < UF; ++Part) {
4298     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4299     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4300       // Floating point operations had to be 'fast' to enable the reduction.
4301       ReducedPartRdx = addFastMathFlag(
4302           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4303                               ReducedPartRdx, "bin.rdx"),
4304           RdxDesc.getFastMathFlags());
4305     else
4306       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4307                                       RdxPart);
4308   }
4309 
4310   // Create the reduction after the loop. Note that inloop reductions create the
4311   // target reduction in the loop using a Reduction recipe.
4312   if (VF.isVector() && !IsInLoopReductionPhi) {
4313     bool NoNaN = Legal->hasFunNoNaNAttr();
4314     ReducedPartRdx =
4315         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4316     // If the reduction can be performed in a smaller type, we need to extend
4317     // the reduction to the wider type before we branch to the original loop.
4318     if (Phi->getType() != RdxDesc.getRecurrenceType())
4319       ReducedPartRdx =
4320         RdxDesc.isSigned()
4321         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4322         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4323   }
4324 
4325   // Create a phi node that merges control-flow from the backedge-taken check
4326   // block and the middle block.
4327   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4328                                         LoopScalarPreHeader->getTerminator());
4329   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4330     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4331   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4332 
4333   // Now, we need to fix the users of the reduction variable
4334   // inside and outside of the scalar remainder loop.
4335   // We know that the loop is in LCSSA form. We need to update the
4336   // PHI nodes in the exit blocks.
4337   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4338     // All PHINodes need to have a single entry edge, or two if
4339     // we already fixed them.
4340     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4341 
4342     // We found a reduction value exit-PHI. Update it with the
4343     // incoming bypass edge.
4344     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4345       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4346   } // end of the LCSSA phi scan.
4347 
4348   // Fix the scalar loop reduction variable with the incoming reduction sum
4349   // from the vector body and from the backedge value.
4350   int IncomingEdgeBlockIdx =
4351     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4352   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4353   // Pick the other block.
4354   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4355   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4356   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4357 }
4358 
4359 void InnerLoopVectorizer::clearReductionWrapFlags(
4360     RecurrenceDescriptor &RdxDesc) {
4361   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4362   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4363       RK != RecurrenceDescriptor::RK_IntegerMult)
4364     return;
4365 
4366   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4367   assert(LoopExitInstr && "null loop exit instruction");
4368   SmallVector<Instruction *, 8> Worklist;
4369   SmallPtrSet<Instruction *, 8> Visited;
4370   Worklist.push_back(LoopExitInstr);
4371   Visited.insert(LoopExitInstr);
4372 
4373   while (!Worklist.empty()) {
4374     Instruction *Cur = Worklist.pop_back_val();
4375     if (isa<OverflowingBinaryOperator>(Cur))
4376       for (unsigned Part = 0; Part < UF; ++Part) {
4377         Value *V = getOrCreateVectorValue(Cur, Part);
4378         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4379       }
4380 
4381     for (User *U : Cur->users()) {
4382       Instruction *UI = cast<Instruction>(U);
4383       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4384           Visited.insert(UI).second)
4385         Worklist.push_back(UI);
4386     }
4387   }
4388 }
4389 
4390 void InnerLoopVectorizer::fixLCSSAPHIs() {
4391   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4392     if (LCSSAPhi.getNumIncomingValues() == 1) {
4393       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4394       // Non-instruction incoming values will have only one value.
4395       unsigned LastLane = 0;
4396       if (isa<Instruction>(IncomingValue))
4397         LastLane = Cost->isUniformAfterVectorization(
4398                        cast<Instruction>(IncomingValue), VF)
4399                        ? 0
4400                        : VF.getKnownMinValue() - 1;
4401       assert((!VF.isScalable() || LastLane == 0) &&
4402            "scalable vectors don't support non-uniform scalars yet");
4403       // Can be a loop invariant incoming value or the last scalar value to be
4404       // extracted from the vectorized loop.
4405       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4406       Value *lastIncomingValue =
4407           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4408       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4409     }
4410   }
4411 }
4412 
4413 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4414   // The basic block and loop containing the predicated instruction.
4415   auto *PredBB = PredInst->getParent();
4416   auto *VectorLoop = LI->getLoopFor(PredBB);
4417 
4418   // Initialize a worklist with the operands of the predicated instruction.
4419   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4420 
4421   // Holds instructions that we need to analyze again. An instruction may be
4422   // reanalyzed if we don't yet know if we can sink it or not.
4423   SmallVector<Instruction *, 8> InstsToReanalyze;
4424 
4425   // Returns true if a given use occurs in the predicated block. Phi nodes use
4426   // their operands in their corresponding predecessor blocks.
4427   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4428     auto *I = cast<Instruction>(U.getUser());
4429     BasicBlock *BB = I->getParent();
4430     if (auto *Phi = dyn_cast<PHINode>(I))
4431       BB = Phi->getIncomingBlock(
4432           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4433     return BB == PredBB;
4434   };
4435 
4436   // Iteratively sink the scalarized operands of the predicated instruction
4437   // into the block we created for it. When an instruction is sunk, its
4438   // operands are then added to the worklist. The algorithm ends after one pass
4439   // through the worklist doesn't sink a single instruction.
4440   bool Changed;
4441   do {
4442     // Add the instructions that need to be reanalyzed to the worklist, and
4443     // reset the changed indicator.
4444     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4445     InstsToReanalyze.clear();
4446     Changed = false;
4447 
4448     while (!Worklist.empty()) {
4449       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4450 
4451       // We can't sink an instruction if it is a phi node, is already in the
4452       // predicated block, is not in the loop, or may have side effects.
4453       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4454           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4455         continue;
4456 
4457       // It's legal to sink the instruction if all its uses occur in the
4458       // predicated block. Otherwise, there's nothing to do yet, and we may
4459       // need to reanalyze the instruction.
4460       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4461         InstsToReanalyze.push_back(I);
4462         continue;
4463       }
4464 
4465       // Move the instruction to the beginning of the predicated block, and add
4466       // its operands to the worklist.
4467       I->moveBefore(&*PredBB->getFirstInsertionPt());
4468       Worklist.insert(I->op_begin(), I->op_end());
4469 
4470       // The sinking may have enabled other instructions to be sunk, so we will
4471       // need to iterate.
4472       Changed = true;
4473     }
4474   } while (Changed);
4475 }
4476 
4477 void InnerLoopVectorizer::fixNonInductionPHIs() {
4478   for (PHINode *OrigPhi : OrigPHIsToFix) {
4479     PHINode *NewPhi =
4480         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4481     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4482 
4483     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4484         predecessors(OrigPhi->getParent()));
4485     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4486         predecessors(NewPhi->getParent()));
4487     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4488            "Scalar and Vector BB should have the same number of predecessors");
4489 
4490     // The insertion point in Builder may be invalidated by the time we get
4491     // here. Force the Builder insertion point to something valid so that we do
4492     // not run into issues during insertion point restore in
4493     // getOrCreateVectorValue calls below.
4494     Builder.SetInsertPoint(NewPhi);
4495 
4496     // The predecessor order is preserved and we can rely on mapping between
4497     // scalar and vector block predecessors.
4498     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4499       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4500 
4501       // When looking up the new scalar/vector values to fix up, use incoming
4502       // values from original phi.
4503       Value *ScIncV =
4504           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4505 
4506       // Scalar incoming value may need a broadcast
4507       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4508       NewPhi->addIncoming(NewIncV, NewPredBB);
4509     }
4510   }
4511 }
4512 
4513 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4514                                    VPUser &Operands, unsigned UF,
4515                                    ElementCount VF, bool IsPtrLoopInvariant,
4516                                    SmallBitVector &IsIndexLoopInvariant,
4517                                    VPTransformState &State) {
4518   // Construct a vector GEP by widening the operands of the scalar GEP as
4519   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4520   // results in a vector of pointers when at least one operand of the GEP
4521   // is vector-typed. Thus, to keep the representation compact, we only use
4522   // vector-typed operands for loop-varying values.
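  // For example, widening 'getelementptr i32, i32* %base, i64 %iv' with a
  // loop-invariant %base keeps %base scalar and uses the widened <4 x i64>
  // value of %iv (for VF = 4), producing a <4 x i32*> result.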
4523 
4524   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4525     // If we are vectorizing, but the GEP has only loop-invariant operands,
4526     // the GEP we build (by only using vector-typed operands for
4527     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4528     // produce a vector of pointers, we need to either arbitrarily pick an
4529     // operand to broadcast, or broadcast a clone of the original GEP.
4530     // Here, we broadcast a clone of the original.
4531     //
4532     // TODO: If at some point we decide to scalarize instructions having
4533     //       loop-invariant operands, this special case will no longer be
4534     //       required. We would add the scalarization decision to
4535     //       collectLoopScalars() and teach getVectorValue() to broadcast
4536     //       the lane-zero scalar value.
4537     auto *Clone = Builder.Insert(GEP->clone());
4538     for (unsigned Part = 0; Part < UF; ++Part) {
4539       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4540       State.set(VPDef, GEP, EntryPart, Part);
4541       addMetadata(EntryPart, GEP);
4542     }
4543   } else {
4544     // If the GEP has at least one loop-varying operand, we are sure to
4545     // produce a vector of pointers. But if we are only unrolling, we want
4546     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4547     // produce with the code below will be scalar (if VF == 1) or vector
4548     // (otherwise). Note that for the unroll-only case, we still maintain
4549     // values in the vector mapping with initVector, as we do for other
4550     // instructions.
4551     for (unsigned Part = 0; Part < UF; ++Part) {
4552       // The pointer operand of the new GEP. If it's loop-invariant, we
4553       // won't broadcast it.
4554       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4555                                      : State.get(Operands.getOperand(0), Part);
4556 
4557       // Collect all the indices for the new GEP. If any index is
4558       // loop-invariant, we won't broadcast it.
4559       SmallVector<Value *, 4> Indices;
4560       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4561         VPValue *Operand = Operands.getOperand(I);
4562         if (IsIndexLoopInvariant[I - 1])
4563           Indices.push_back(State.get(Operand, {0, 0}));
4564         else
4565           Indices.push_back(State.get(Operand, Part));
4566       }
4567 
4568       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4569       // but it should be a vector, otherwise.
4570       auto *NewGEP =
4571           GEP->isInBounds()
4572               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4573                                           Indices)
4574               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4575       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4576              "NewGEP is not a pointer vector");
4577       State.set(VPDef, GEP, NewGEP, Part);
4578       addMetadata(NewGEP, GEP);
4579     }
4580   }
4581 }
4582 
4583 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4584                                               ElementCount VF) {
4585   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4586   PHINode *P = cast<PHINode>(PN);
4587   if (EnableVPlanNativePath) {
4588     // Currently we enter here in the VPlan-native path for non-induction
4589     // PHIs where all control flow is uniform. We simply widen these PHIs.
4590     // Create a vector phi with no operands - the vector phi operands will be
4591     // set at the end of vector code generation.
4592     Type *VecTy =
4593         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4594     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4595     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4596     OrigPHIsToFix.push_back(P);
4597 
4598     return;
4599   }
4600 
4601   assert(PN->getParent() == OrigLoop->getHeader() &&
4602          "Non-header phis should have been handled elsewhere");
4603 
4604   // In order to support recurrences we need to be able to vectorize Phi nodes.
4605   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4606   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4607   // this value when we vectorize all of the instructions that use the PHI.
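  // For example (illustrative only), a reduction phi at VF = 4 is first
  // emitted as a placeholder with no incoming values, roughly
  //   %vec.phi = phi <4 x i32>
  // and its incoming edges are filled in during stage #2, once the vectorized
  // values feeding the cycle have been created.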
4608   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4609     for (unsigned Part = 0; Part < UF; ++Part) {
4610       // This is phase one of vectorizing PHIs.
4611       bool ScalarPHI =
4612           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4613       Type *VecTy =
4614           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4615       Value *EntryPart = PHINode::Create(
4616           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4617       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4618     }
4619     return;
4620   }
4621 
4622   setDebugLocFromInst(Builder, P);
4623 
4624   // This PHINode must be an induction variable.
4625   // Make sure that we know about it.
4626   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4627 
4628   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4629   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4630 
4631   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4632   // which can be found from the original scalar operations.
4633   switch (II.getKind()) {
4634   case InductionDescriptor::IK_NoInduction:
4635     llvm_unreachable("Unknown induction");
4636   case InductionDescriptor::IK_IntInduction:
4637   case InductionDescriptor::IK_FpInduction:
4638     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4639   case InductionDescriptor::IK_PtrInduction: {
4640     // Handle the pointer induction variable case.
4641     assert(P->getType()->isPointerTy() && "Unexpected type.");
4642 
4643     if (Cost->isScalarAfterVectorization(P, VF)) {
4644       // This is the normalized GEP that starts counting at zero.
4645       Value *PtrInd =
4646           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4647       // Determine the number of scalars we need to generate for each unroll
4648       // iteration. If the instruction is uniform, we only need to generate the
4649       // first lane. Otherwise, we generate all VF values.
4650       unsigned Lanes =
4651           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4652       for (unsigned Part = 0; Part < UF; ++Part) {
4653         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4654           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4655                                            Lane + Part * VF.getKnownMinValue());
4656           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4657           Value *SclrGep =
4658               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4659           SclrGep->setName("next.gep");
4660           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4661         }
4662       }
4663       return;
4664     }
4665     assert(isa<SCEVConstant>(II.getStep()) &&
4666            "Induction step not a SCEV constant!");
4667     Type *PhiType = II.getStep()->getType();
4668 
4669     // Build a pointer phi
4670     Value *ScalarStartValue = II.getStartValue();
4671     Type *ScStValueType = ScalarStartValue->getType();
4672     PHINode *NewPointerPhi =
4673         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4674     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4675 
4676     // A pointer induction, performed by using a gep
4677     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4678     Instruction *InductionLoc = LoopLatch->getTerminator();
4679     const SCEV *ScalarStep = II.getStep();
4680     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4681     Value *ScalarStepValue =
4682         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4683     Value *InductionGEP = GetElementPtrInst::Create(
4684         ScStValueType->getPointerElementType(), NewPointerPhi,
4685         Builder.CreateMul(
4686             ScalarStepValue,
4687             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4688         "ptr.ind", InductionLoc);
4689     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4690 
4691     // Create UF many actual address geps that use the pointer
4692     // phi as base and a vectorized version of the step value
4693     // (<step*0, ..., step*N>) as offset.
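    // For instance (illustrative, assuming VF = 4, UF = 1 and step 1), part 0
    // uses the offset vector <0, 1, 2, 3>, yielding roughly
    //   %vector.gep = getelementptr i32, i32* %pointer.phi, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
    // where the element type is taken from the pointee type of the pointer.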
4694     for (unsigned Part = 0; Part < UF; ++Part) {
4695       SmallVector<Constant *, 8> Indices;
4696       // Create a vector of consecutive numbers from zero to VF.
4697       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4698         Indices.push_back(
4699             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4700       Constant *StartOffset = ConstantVector::get(Indices);
4701 
4702       Value *GEP = Builder.CreateGEP(
4703           ScStValueType->getPointerElementType(), NewPointerPhi,
4704           Builder.CreateMul(
4705               StartOffset,
4706               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4707               "vector.gep"));
4708       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4709     }
4710   }
4711   }
4712 }
4713 
4714 /// A helper function for checking whether an integer division-related
4715 /// instruction may divide by zero (in which case it must be predicated if
4716 /// executed conditionally in the scalar code).
4717 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4718 /// Non-zero divisors that are not compile-time constants will not be
4719 /// converted into multiplication, so we will still end up scalarizing
4720 /// the division, but can do so w/o predication.
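/// For example, 'udiv i32 %x, %y' may divide by zero when %y is not a
/// compile-time constant, whereas 'udiv i32 %x, 7' provably cannot; only the
/// former needs predication when executed conditionally.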
4721 static bool mayDivideByZero(Instruction &I) {
4722   assert((I.getOpcode() == Instruction::UDiv ||
4723           I.getOpcode() == Instruction::SDiv ||
4724           I.getOpcode() == Instruction::URem ||
4725           I.getOpcode() == Instruction::SRem) &&
4726          "Unexpected instruction");
4727   Value *Divisor = I.getOperand(1);
4728   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4729   return !CInt || CInt->isZero();
4730 }
4731 
4732 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4733                                            VPUser &User,
4734                                            VPTransformState &State) {
4735   switch (I.getOpcode()) {
4736   case Instruction::Call:
4737   case Instruction::Br:
4738   case Instruction::PHI:
4739   case Instruction::GetElementPtr:
4740   case Instruction::Select:
4741     llvm_unreachable("This instruction is handled by a different recipe.");
4742   case Instruction::UDiv:
4743   case Instruction::SDiv:
4744   case Instruction::SRem:
4745   case Instruction::URem:
4746   case Instruction::Add:
4747   case Instruction::FAdd:
4748   case Instruction::Sub:
4749   case Instruction::FSub:
4750   case Instruction::FNeg:
4751   case Instruction::Mul:
4752   case Instruction::FMul:
4753   case Instruction::FDiv:
4754   case Instruction::FRem:
4755   case Instruction::Shl:
4756   case Instruction::LShr:
4757   case Instruction::AShr:
4758   case Instruction::And:
4759   case Instruction::Or:
4760   case Instruction::Xor: {
4761     // Just widen unops and binops.
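    // E.g. (illustrative), at VF = 4 a scalar 'add i32 %a, %b' is widened into
    // 'add <4 x i32> %vec.a, %vec.b', with one such instruction per unroll
    // part.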
4762     setDebugLocFromInst(Builder, &I);
4763 
4764     for (unsigned Part = 0; Part < UF; ++Part) {
4765       SmallVector<Value *, 2> Ops;
4766       for (VPValue *VPOp : User.operands())
4767         Ops.push_back(State.get(VPOp, Part));
4768 
4769       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4770 
4771       if (auto *VecOp = dyn_cast<Instruction>(V))
4772         VecOp->copyIRFlags(&I);
4773 
4774       // Use this vector value for all users of the original instruction.
4775       State.set(Def, &I, V, Part);
4776       addMetadata(V, &I);
4777     }
4778 
4779     break;
4780   }
4781   case Instruction::ICmp:
4782   case Instruction::FCmp: {
4783     // Widen compares. Generate vector compares.
4784     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4785     auto *Cmp = cast<CmpInst>(&I);
4786     setDebugLocFromInst(Builder, Cmp);
4787     for (unsigned Part = 0; Part < UF; ++Part) {
4788       Value *A = State.get(User.getOperand(0), Part);
4789       Value *B = State.get(User.getOperand(1), Part);
4790       Value *C = nullptr;
4791       if (FCmp) {
4792         // Propagate fast math flags.
4793         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4794         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4795         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4796       } else {
4797         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4798       }
4799       State.set(Def, &I, C, Part);
4800       addMetadata(C, &I);
4801     }
4802 
4803     break;
4804   }
4805 
4806   case Instruction::ZExt:
4807   case Instruction::SExt:
4808   case Instruction::FPToUI:
4809   case Instruction::FPToSI:
4810   case Instruction::FPExt:
4811   case Instruction::PtrToInt:
4812   case Instruction::IntToPtr:
4813   case Instruction::SIToFP:
4814   case Instruction::UIToFP:
4815   case Instruction::Trunc:
4816   case Instruction::FPTrunc:
4817   case Instruction::BitCast: {
4818     auto *CI = cast<CastInst>(&I);
4819     setDebugLocFromInst(Builder, CI);
4820 
4821     /// Vectorize casts.
4822     Type *DestTy =
4823         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4824 
4825     for (unsigned Part = 0; Part < UF; ++Part) {
4826       Value *A = State.get(User.getOperand(0), Part);
4827       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4828       State.set(Def, &I, Cast, Part);
4829       addMetadata(Cast, &I);
4830     }
4831     break;
4832   }
4833   default:
4834     // This instruction is not vectorized by simple widening.
4835     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4836     llvm_unreachable("Unhandled instruction!");
4837   } // end of switch.
4838 }
4839 
4840 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4841                                                VPUser &ArgOperands,
4842                                                VPTransformState &State) {
4843   assert(!isa<DbgInfoIntrinsic>(I) &&
4844          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4845   setDebugLocFromInst(Builder, &I);
4846 
4847   Module *M = I.getParent()->getParent()->getParent();
4848   auto *CI = cast<CallInst>(&I);
4849 
4850   SmallVector<Type *, 4> Tys;
4851   for (Value *ArgOperand : CI->arg_operands())
4852     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4853 
4854   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4855 
4856   // This flag indicates whether we use an intrinsic or a regular call for the
4857   // vectorized version of the instruction, i.e. whether it is more beneficial
4858   // to perform the intrinsic call than the library call.
4859   bool NeedToScalarize = false;
4860   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4861   bool UseVectorIntrinsic =
4862       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4863   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4864          "Instruction should be scalarized elsewhere.");
4865 
4866   for (unsigned Part = 0; Part < UF; ++Part) {
4867     SmallVector<Value *, 4> Args;
4868     for (auto &I : enumerate(ArgOperands.operands())) {
4869       // Some intrinsics have a scalar argument - don't replace it with a
4870       // vector.
4871       Value *Arg;
4872       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4873         Arg = State.get(I.value(), Part);
4874       else
4875         Arg = State.get(I.value(), {0, 0});
4876       Args.push_back(Arg);
4877     }
4878 
4879     Function *VectorF;
4880     if (UseVectorIntrinsic) {
4881       // Use vector version of the intrinsic.
4882       Type *TysForDecl[] = {CI->getType()};
4883       if (VF.isVector()) {
4884         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4885         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4886       }
4887       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4888       assert(VectorF && "Can't retrieve vector intrinsic.");
4889     } else {
4890       // Use vector version of the function call.
4891       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4892 #ifndef NDEBUG
4893       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4894              "Can't create vector function.");
4895 #endif
4896       VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4897     }
4898     SmallVector<OperandBundleDef, 1> OpBundles;
4899     CI->getOperandBundlesAsDefs(OpBundles);
4900     CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4901 
4902     if (isa<FPMathOperator>(V))
4903       V->copyFastMathFlags(CI);
4904 
4905     State.set(Def, &I, V, Part);
4906     addMetadata(V, &I);
4907   }
4908 }
4909 
4910 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4911                                                  VPUser &Operands,
4912                                                  bool InvariantCond,
4913                                                  VPTransformState &State) {
4914   setDebugLocFromInst(Builder, &I);
4915 
4916   // The condition can be loop invariant  but still defined inside the
4917   // loop. This means that we can't just use the original 'cond' value.
4918   // We have to take the 'vectorized' value and pick the first lane.
4919   // Instcombine will make this a no-op.
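  // Sketch (illustrative): with an invariant condition, lane 0 of the
  // vectorized condition is used as a scalar i1 for every part, producing
  //   %sel = select i1 %cond.lane0, <4 x i32> %op0, <4 x i32> %op1
  // rather than a per-lane <4 x i1> mask.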
4920   auto *InvarCond =
4921       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4922 
4923   for (unsigned Part = 0; Part < UF; ++Part) {
4924     Value *Cond =
4925         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4926     Value *Op0 = State.get(Operands.getOperand(1), Part);
4927     Value *Op1 = State.get(Operands.getOperand(2), Part);
4928     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4929     State.set(VPDef, &I, Sel, Part);
4930     addMetadata(Sel, &I);
4931   }
4932 }
4933 
4934 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4935   // We should not collect Scalars more than once per VF. Right now, this
4936   // function is called from collectUniformsAndScalars(), which already does
4937   // this check. Collecting Scalars for VF=1 does not make any sense.
4938   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4939          "This function should not be visited twice for the same VF");
4940 
4941   SmallSetVector<Instruction *, 8> Worklist;
4942 
4943   // These sets are used to seed the analysis with pointers used by memory
4944   // accesses that will remain scalar.
4945   SmallSetVector<Instruction *, 8> ScalarPtrs;
4946   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4947   auto *Latch = TheLoop->getLoopLatch();
4948 
4949   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4950   // The pointer operands of loads and stores will be scalar as long as the
4951   // memory access is not a gather or scatter operation. The value operand of a
4952   // store will remain scalar if the store is scalarized.
4953   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4954     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4955     assert(WideningDecision != CM_Unknown &&
4956            "Widening decision should be ready at this moment");
4957     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4958       if (Ptr == Store->getValueOperand())
4959         return WideningDecision == CM_Scalarize;
4960     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4961            "Ptr is neither a value or pointer operand");
4962     return WideningDecision != CM_GatherScatter;
4963   };
4964 
4965   // A helper that returns true if the given value is a bitcast or
4966   // getelementptr instruction contained in the loop.
4967   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4968     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4969             isa<GetElementPtrInst>(V)) &&
4970            !TheLoop->isLoopInvariant(V);
4971   };
4972 
4973   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4974     if (!isa<PHINode>(Ptr) ||
4975         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4976       return false;
4977     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4978     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4979       return false;
4980     return isScalarUse(MemAccess, Ptr);
4981   };
4982 
4983   // A helper that evaluates a memory access's use of a pointer. If the
4984   // pointer is actually the pointer induction of a loop, it is inserted
4985   // into the Worklist. If the use will be a scalar use, and the
4986   // pointer is only used by memory accesses, we place the pointer in
4987   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4988   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4989     if (isScalarPtrInduction(MemAccess, Ptr)) {
4990       Worklist.insert(cast<Instruction>(Ptr));
4991       Instruction *Update = cast<Instruction>(
4992           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4993       Worklist.insert(Update);
4994       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4995                         << "\n");
4996       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4997                         << "\n");
4998       return;
4999     }
5000     // We only care about bitcast and getelementptr instructions contained in
5001     // the loop.
5002     if (!isLoopVaryingBitCastOrGEP(Ptr))
5003       return;
5004 
5005     // If the pointer has already been identified as scalar (e.g., if it was
5006     // also identified as uniform), there's nothing to do.
5007     auto *I = cast<Instruction>(Ptr);
5008     if (Worklist.count(I))
5009       return;
5010 
5011     // If the use of the pointer will be a scalar use, and all users of the
5012     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5013     // place the pointer in PossibleNonScalarPtrs.
5014     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5015           return isa<LoadInst>(U) || isa<StoreInst>(U);
5016         }))
5017       ScalarPtrs.insert(I);
5018     else
5019       PossibleNonScalarPtrs.insert(I);
5020   };
5021 
5022   // We seed the scalars analysis with two classes of instructions: (1)
5023   // instructions marked uniform-after-vectorization and (2) bitcast,
5024   // getelementptr and (pointer) phi instructions used by memory accesses
5025   // requiring a scalar use.
5026   //
5027   // (1) Add to the worklist all instructions that have been identified as
5028   // uniform-after-vectorization.
5029   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5030 
5031   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5032   // memory accesses requiring a scalar use. The pointer operands of loads and
5033   // stores will be scalar as long as the memory access is not a gather or
5034   // scatter operation. The value operand of a store will remain scalar if the
5035   // store is scalarized.
5036   for (auto *BB : TheLoop->blocks())
5037     for (auto &I : *BB) {
5038       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5039         evaluatePtrUse(Load, Load->getPointerOperand());
5040       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5041         evaluatePtrUse(Store, Store->getPointerOperand());
5042         evaluatePtrUse(Store, Store->getValueOperand());
5043       }
5044     }
5045   for (auto *I : ScalarPtrs)
5046     if (!PossibleNonScalarPtrs.count(I)) {
5047       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5048       Worklist.insert(I);
5049     }
5050 
5051   // Insert the forced scalars.
5052   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5053   // induction variable when the PHI user is scalarized.
5054   auto ForcedScalar = ForcedScalars.find(VF);
5055   if (ForcedScalar != ForcedScalars.end())
5056     for (auto *I : ForcedScalar->second)
5057       Worklist.insert(I);
5058 
5059   // Expand the worklist by looking through any bitcasts and getelementptr
5060   // instructions we've already identified as scalar. This is similar to the
5061   // expansion step in collectLoopUniforms(); however, here we're only
5062   // expanding to include additional bitcasts and getelementptr instructions.
5063   unsigned Idx = 0;
5064   while (Idx != Worklist.size()) {
5065     Instruction *Dst = Worklist[Idx++];
5066     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5067       continue;
5068     auto *Src = cast<Instruction>(Dst->getOperand(0));
5069     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5070           auto *J = cast<Instruction>(U);
5071           return !TheLoop->contains(J) || Worklist.count(J) ||
5072                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5073                   isScalarUse(J, Src));
5074         })) {
5075       Worklist.insert(Src);
5076       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5077     }
5078   }
5079 
5080   // An induction variable will remain scalar if all users of the induction
5081   // variable and induction variable update remain scalar.
5082   for (auto &Induction : Legal->getInductionVars()) {
5083     auto *Ind = Induction.first;
5084     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5085 
5086     // If tail-folding is applied, the primary induction variable will be used
5087     // to feed a vector compare.
5088     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5089       continue;
5090 
5091     // Determine if all users of the induction variable are scalar after
5092     // vectorization.
5093     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5094       auto *I = cast<Instruction>(U);
5095       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5096     });
5097     if (!ScalarInd)
5098       continue;
5099 
5100     // Determine if all users of the induction variable update instruction are
5101     // scalar after vectorization.
5102     auto ScalarIndUpdate =
5103         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5104           auto *I = cast<Instruction>(U);
5105           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5106         });
5107     if (!ScalarIndUpdate)
5108       continue;
5109 
5110     // The induction variable and its update instruction will remain scalar.
5111     Worklist.insert(Ind);
5112     Worklist.insert(IndUpdate);
5113     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5114     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5115                       << "\n");
5116   }
5117 
5118   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5119 }
5120 
5121 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5122                                                          ElementCount VF) {
5123   if (!blockNeedsPredication(I->getParent()))
5124     return false;
5125   switch(I->getOpcode()) {
5126   default:
5127     break;
5128   case Instruction::Load:
5129   case Instruction::Store: {
5130     if (!Legal->isMaskRequired(I))
5131       return false;
5132     auto *Ptr = getLoadStorePointerOperand(I);
5133     auto *Ty = getMemInstValueType(I);
5134     // We have already decided how to vectorize this instruction, get that
5135     // result.
5136     if (VF.isVector()) {
5137       InstWidening WideningDecision = getWideningDecision(I, VF);
5138       assert(WideningDecision != CM_Unknown &&
5139              "Widening decision should be ready at this moment");
5140       return WideningDecision == CM_Scalarize;
5141     }
5142     const Align Alignment = getLoadStoreAlignment(I);
5143     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5144                                 isLegalMaskedGather(Ty, Alignment))
5145                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5146                                 isLegalMaskedScatter(Ty, Alignment));
5147   }
5148   case Instruction::UDiv:
5149   case Instruction::SDiv:
5150   case Instruction::SRem:
5151   case Instruction::URem:
5152     return mayDivideByZero(*I);
5153   }
5154   return false;
5155 }
5156 
5157 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5158     Instruction *I, ElementCount VF) {
5159   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5160   assert(getWideningDecision(I, VF) == CM_Unknown &&
5161          "Decision should not be set yet.");
5162   auto *Group = getInterleavedAccessGroup(I);
5163   assert(Group && "Must have a group.");
5164 
5165   // If the instruction's allocated size doesn't equal its type size, it
5166   // requires padding and will be scalarized.
5167   auto &DL = I->getModule()->getDataLayout();
5168   auto *ScalarTy = getMemInstValueType(I);
5169   if (hasIrregularType(ScalarTy, DL, VF))
5170     return false;
5171 
5172   // Check if masking is required.
5173   // A Group may need masking for one of two reasons: it resides in a block that
5174   // needs predication, or it was decided to use masking to deal with gaps.
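  // Illustrative example of the second reason: loads of A[3*i] and A[3*i+1]
  // form an interleave group with factor 3 and a gap at A[3*i+2]; the widened
  // access of the last vector iteration may read past what the scalar loop
  // would, unless it is masked or a scalar epilogue runs the final iterations.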
5175   bool PredicatedAccessRequiresMasking =
5176       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5177   bool AccessWithGapsRequiresMasking =
5178       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5179   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5180     return true;
5181 
5182   // If masked interleaving is required, we expect that the user/target had
5183   // enabled it, because otherwise it either wouldn't have been created or
5184   // it should have been invalidated by the CostModel.
5185   assert(useMaskedInterleavedAccesses(TTI) &&
5186          "Masked interleave-groups for predicated accesses are not enabled.");
5187 
5188   auto *Ty = getMemInstValueType(I);
5189   const Align Alignment = getLoadStoreAlignment(I);
5190   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5191                           : TTI.isLegalMaskedStore(Ty, Alignment);
5192 }
5193 
5194 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5195     Instruction *I, ElementCount VF) {
5196   // Get and ensure we have a valid memory instruction.
5197   LoadInst *LI = dyn_cast<LoadInst>(I);
5198   StoreInst *SI = dyn_cast<StoreInst>(I);
5199   assert((LI || SI) && "Invalid memory instruction");
5200 
5201   auto *Ptr = getLoadStorePointerOperand(I);
5202 
5203   // First of all, in order to be widened, the pointer must be consecutive.
5204   if (!Legal->isConsecutivePtr(Ptr))
5205     return false;
5206 
5207   // If the instruction is a store located in a predicated block, it will be
5208   // scalarized.
5209   if (isScalarWithPredication(I))
5210     return false;
5211 
5212   // If the instruction's allocated size doesn't equal its type size, it
5213   // requires padding and will be scalarized.
5214   auto &DL = I->getModule()->getDataLayout();
5215   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5216   if (hasIrregularType(ScalarTy, DL, VF))
5217     return false;
5218 
5219   return true;
5220 }
5221 
5222 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5223   // We should not collect Uniforms more than once per VF. Right now,
5224   // this function is called from collectUniformsAndScalars(), which
5225   // already does this check. Collecting Uniforms for VF=1 does not make any
5226   // sense.
5227 
5228   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5229          "This function should not be visited twice for the same VF");
5230 
5231   // Create the entry for this VF up front. Even if we do not find any uniform
5232   // value, Uniforms.count(VF) will return 1, so we will not analyze it again.
5233   Uniforms[VF].clear();
5234 
5235   // We now know that the loop is vectorizable!
5236   // Collect instructions inside the loop that will remain uniform after
5237   // vectorization.
5238 
5239   // Global values, params and instructions outside of current loop are out of
5240   // scope.
5241   auto isOutOfScope = [&](Value *V) -> bool {
5242     Instruction *I = dyn_cast<Instruction>(V);
5243     return (!I || !TheLoop->contains(I));
5244   };
5245 
5246   SetVector<Instruction *> Worklist;
5247   BasicBlock *Latch = TheLoop->getLoopLatch();
5248 
5249   // Instructions that are scalar with predication must not be considered
5250   // uniform after vectorization, because that would create an erroneous
5251   // replicating region where only a single instance out of VF should be formed.
5252   // TODO: optimize such seldom cases if found important, see PR40816.
5253   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5254     if (isOutOfScope(I)) {
5255       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5256                         << *I << "\n");
5257       return;
5258     }
5259     if (isScalarWithPredication(I, VF)) {
5260       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5261                         << *I << "\n");
5262       return;
5263     }
5264     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5265     Worklist.insert(I);
5266   };
5267 
5268   // Start with the conditional branch. If the branch condition is an
5269   // instruction contained in the loop that is only used by the branch, it is
5270   // uniform.
5271   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5272   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5273     addToWorklistIfAllowed(Cmp);
5274 
5275   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5276     InstWidening WideningDecision = getWideningDecision(I, VF);
5277     assert(WideningDecision != CM_Unknown &&
5278            "Widening decision should be ready at this moment");
5279 
5280     // A uniform memory op is itself uniform.  We exclude uniform stores
5281     // here as they demand the last lane, not the first one.
5282     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5283       assert(WideningDecision == CM_Scalarize);
5284       return true;
5285     }
5286 
5287     return (WideningDecision == CM_Widen ||
5288             WideningDecision == CM_Widen_Reverse ||
5289             WideningDecision == CM_Interleave);
5290   };
5291 
5292 
5293   // Returns true if Ptr is the pointer operand of a memory access instruction
5294   // I, and I is known to not require scalarization.
5295   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5296     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5297   };
5298 
5299   // Holds a list of values which are known to have at least one uniform use.
5300   // Note that there may be other uses which aren't uniform.  A "uniform use"
5301   // here is something which only demands lane 0 of the unrolled iterations;
5302   // it does not imply that all lanes produce the same value (e.g. this is not
5303   // the usual meaning of uniform)
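  // For example (illustrative), the address of a uniform load such as
  //   %v = load i32, i32* %invariant.ptr
  // only demands lane 0: every unrolled iteration reads the same location,
  // even though other users of %invariant.ptr may still demand all lanes.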
5304   SmallPtrSet<Value *, 8> HasUniformUse;
5305 
5306   // Scan the loop for instructions which are either a) known to have only
5307   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5308   for (auto *BB : TheLoop->blocks())
5309     for (auto &I : *BB) {
5310       // If there's no pointer operand, there's nothing to do.
5311       auto *Ptr = getLoadStorePointerOperand(&I);
5312       if (!Ptr)
5313         continue;
5314 
5315       // A uniform memory op is itself uniform.  We exclude uniform stores
5316       // here as they demand the last lane, not the first one.
5317       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5318         addToWorklistIfAllowed(&I);
5319 
5320       if (isUniformDecision(&I, VF)) {
5321         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5322         HasUniformUse.insert(Ptr);
5323       }
5324     }
5325 
5326   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5327   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5328   // disallows uses outside the loop as well.
5329   for (auto *V : HasUniformUse) {
5330     if (isOutOfScope(V))
5331       continue;
5332     auto *I = cast<Instruction>(V);
5333     auto UsersAreMemAccesses =
5334       llvm::all_of(I->users(), [&](User *U) -> bool {
5335         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5336       });
5337     if (UsersAreMemAccesses)
5338       addToWorklistIfAllowed(I);
5339   }
5340 
5341   // Expand Worklist in topological order: whenever a new instruction
5342   // is added, its users should already be inside Worklist. This ensures
5343   // that a uniform instruction is only used by other uniform instructions.
5344   unsigned idx = 0;
5345   while (idx != Worklist.size()) {
5346     Instruction *I = Worklist[idx++];
5347 
5348     for (auto OV : I->operand_values()) {
5349       // isOutOfScope operands cannot be uniform instructions.
5350       if (isOutOfScope(OV))
5351         continue;
5352       // First order recurrence Phi's should typically be considered
5353       // non-uniform.
5354       auto *OP = dyn_cast<PHINode>(OV);
5355       if (OP && Legal->isFirstOrderRecurrence(OP))
5356         continue;
5357       // If all the users of the operand are uniform, then add the
5358       // operand into the uniform worklist.
5359       auto *OI = cast<Instruction>(OV);
5360       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5361             auto *J = cast<Instruction>(U);
5362             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5363           }))
5364         addToWorklistIfAllowed(OI);
5365     }
5366   }
5367 
5368   // For an instruction to be added into Worklist above, all its users inside
5369   // the loop should also be in Worklist. However, this condition cannot be
5370   // true for phi nodes that form a cyclic dependence. We must process phi
5371   // nodes separately. An induction variable will remain uniform if all users
5372   // of the induction variable and induction variable update remain uniform.
5373   // The code below handles both pointer and non-pointer induction variables.
5374   for (auto &Induction : Legal->getInductionVars()) {
5375     auto *Ind = Induction.first;
5376     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5377 
5378     // Determine if all users of the induction variable are uniform after
5379     // vectorization.
5380     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5381       auto *I = cast<Instruction>(U);
5382       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5383              isVectorizedMemAccessUse(I, Ind);
5384     });
5385     if (!UniformInd)
5386       continue;
5387 
5388     // Determine if all users of the induction variable update instruction are
5389     // uniform after vectorization.
5390     auto UniformIndUpdate =
5391         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5392           auto *I = cast<Instruction>(U);
5393           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5394                  isVectorizedMemAccessUse(I, IndUpdate);
5395         });
5396     if (!UniformIndUpdate)
5397       continue;
5398 
5399     // The induction variable and its update instruction will remain uniform.
5400     addToWorklistIfAllowed(Ind);
5401     addToWorklistIfAllowed(IndUpdate);
5402   }
5403 
5404   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5405 }
5406 
5407 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5408   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5409 
5410   if (Legal->getRuntimePointerChecking()->Need) {
5411     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5412         "runtime pointer checks needed. Enable vectorization of this "
5413         "loop with '#pragma clang loop vectorize(enable)' when "
5414         "compiling with -Os/-Oz",
5415         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5416     return true;
5417   }
5418 
5419   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5420     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5421         "runtime SCEV checks needed. Enable vectorization of this "
5422         "loop with '#pragma clang loop vectorize(enable)' when "
5423         "compiling with -Os/-Oz",
5424         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5425     return true;
5426   }
5427 
5428   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5429   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5430     reportVectorizationFailure("Runtime stride check for small trip count",
5431         "runtime stride == 1 checks needed. Enable vectorization of "
5432         "this loop without such check by compiling with -Os/-Oz",
5433         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5434     return true;
5435   }
5436 
5437   return false;
5438 }
5439 
5440 Optional<ElementCount>
5441 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5442   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5443     // TODO: It may be useful to do this, since the check is still likely to be
5444     // dynamically uniform if the target can skip it.
5445     reportVectorizationFailure(
5446         "Not inserting runtime ptr check for divergent target",
5447         "runtime pointer checks needed. Not enabled for divergent target",
5448         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5449     return None;
5450   }
5451 
5452   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5453   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5454   if (TC == 1) {
5455     reportVectorizationFailure("Single iteration (non) loop",
5456         "loop trip count is one, irrelevant for vectorization",
5457         "SingleIterationLoop", ORE, TheLoop);
5458     return None;
5459   }
5460 
5461   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5462 
5463   switch (ScalarEpilogueStatus) {
5464   case CM_ScalarEpilogueAllowed:
5465     return MaxVF;
5466   case CM_ScalarEpilogueNotNeededUsePredicate:
5467     LLVM_DEBUG(
5468         dbgs() << "LV: vector predicate hint/switch found.\n"
5469                << "LV: Not allowing scalar epilogue, creating predicated "
5470                << "vector loop.\n");
5471     break;
5472   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5473     // fallthrough as a special case of OptForSize
5474   case CM_ScalarEpilogueNotAllowedOptSize:
5475     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5476       LLVM_DEBUG(
5477           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5478     else
5479       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5480                         << "count.\n");
5481 
5482     // Bail if runtime checks are required, which are not good when optimising
5483     // for size.
5484     if (runtimeChecksRequired())
5485       return None;
5486     break;
5487   }
5488 
5489   // Now try the tail folding
5490 
5491   // Invalidate interleave groups that require an epilogue if we can't mask
5492   // the interleave-group.
5493   if (!useMaskedInterleavedAccesses(TTI)) {
5494     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5495            "No decisions should have been taken at this point");
5496     // Note: There is no need to invalidate any cost modeling decisions here, as
5497     // non where taken so far.
5498     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5499   }
5500 
5501   assert(!MaxVF.isScalable() &&
5502          "Scalable vectors do not yet support tail folding");
5503   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5504          "MaxVF must be a power of 2");
5505   unsigned MaxVFtimesIC =
5506       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5507   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5508     // Accept MaxVF if we do not have a tail.
5509     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5510     return MaxVF;
5511   }
5512 
5513   // If we don't know the precise trip count, or if the trip count that we
5514   // found modulo the vectorization factor is not zero, try to fold the tail
5515   // by masking.
5516   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
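  // Worked example (illustrative): with TC = 17 and MaxVF * IC = 8, two full
  // vector iterations cover 16 elements and one element remains; folding the
  // tail by masking lets a final masked vector iteration handle that single
  // element instead of falling back to a scalar epilogue.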
5517   if (Legal->prepareToFoldTailByMasking()) {
5518     FoldTailByMasking = true;
5519     return MaxVF;
5520   }
5521 
5522   // If there was a tail-folding hint/switch, but we can't fold the tail by
5523   // masking, fallback to a vectorization with a scalar epilogue.
5524   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5525     if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
5526       LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5527       return None;
5528     }
5529     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5530                          "scalar epilogue instead.\n");
5531     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5532     return MaxVF;
5533   }
5534 
5535   if (TC == 0) {
5536     reportVectorizationFailure(
5537         "Unable to calculate the loop count due to complex control flow",
5538         "unable to calculate the loop count due to complex control flow",
5539         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5540     return None;
5541   }
5542 
5543   reportVectorizationFailure(
5544       "Cannot optimize for size and vectorize at the same time.",
5545       "cannot optimize for size and vectorize at the same time. "
5546       "Enable vectorization of this loop with '#pragma clang loop "
5547       "vectorize(enable)' when compiling with -Os/-Oz",
5548       "NoTailLoopWithOptForSize", ORE, TheLoop);
5549   return None;
5550 }
5551 
5552 ElementCount
5553 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5554                                                  ElementCount UserVF) {
5555   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5556   unsigned SmallestType, WidestType;
5557   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5558   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5559 
5560   // Get the maximum safe dependence distance in bits computed by LAA.
5561   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5562   // the memory access that is most restrictive (involved in the smallest
5563   // dependence distance).
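  // Worked example (illustrative): if the most restrictive dependence allows
  // MaxVF = 8 for i32 accesses, then MaxSafeVectorWidthInBits = 8 * 4 * 8 =
  // 256 bits, and the candidate VFs below are clamped so that
  // VF * WidestType does not exceed this width.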
5564   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5565 
5566   if (UserVF.isNonZero()) {
5567     // For now, don't verify legality of scalable vectors.
5568     // This will be addressed properly in https://reviews.llvm.org/D91718.
5569     if (UserVF.isScalable())
5570       return UserVF;
5571 
5572     // If legally unsafe, clamp the user vectorization factor to a safe value.
5573     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5574     if (UserVF.getFixedValue() <= MaxSafeVF)
5575       return UserVF;
5576 
5577     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5578                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5579                       << ".\n");
5580     ORE->emit([&]() {
5581       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5582                                         TheLoop->getStartLoc(),
5583                                         TheLoop->getHeader())
5584              << "User-specified vectorization factor "
5585              << ore::NV("UserVectorizationFactor", UserVF)
5586              << " is unsafe, clamping to maximum safe vectorization factor "
5587              << ore::NV("VectorizationFactor", MaxSafeVF);
5588     });
5589     return ElementCount::getFixed(MaxSafeVF);
5590   }
5591 
5592   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5593 
5594   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5595   // Note that both WidestRegister and WidestType may not be powers of 2.
5596   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5597 
5598   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5599                     << " / " << WidestType << " bits.\n");
5600   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5601                     << WidestRegister << " bits.\n");
5602 
5603   assert(MaxVectorSize <= WidestRegister &&
5604          "Did not expect to pack so many elements"
5605          " into one vector!");
5606   if (MaxVectorSize == 0) {
5607     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5608     MaxVectorSize = 1;
5609     return ElementCount::getFixed(MaxVectorSize);
5610   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5611              isPowerOf2_32(ConstTripCount)) {
5612     // We need to clamp the VF to be the ConstTripCount. There is no point in
5613     // choosing a higher viable VF as done in the loop below.
5614     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5615                       << ConstTripCount << "\n");
5616     MaxVectorSize = ConstTripCount;
5617     return ElementCount::getFixed(MaxVectorSize);
5618   }
5619 
5620   unsigned MaxVF = MaxVectorSize;
5621   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5622       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5623     // Collect all viable vectorization factors larger than the default MaxVF
5624     // (i.e. MaxVectorSize).
5625     SmallVector<ElementCount, 8> VFs;
5626     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5627     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5628       VFs.push_back(ElementCount::getFixed(VS));
5629 
5630     // For each VF calculate its register usage.
5631     auto RUs = calculateRegisterUsage(VFs);
5632 
5633     // Select the largest VF which doesn't require more registers than existing
5634     // ones.
5635     for (int i = RUs.size() - 1; i >= 0; --i) {
5636       bool Selected = true;
5637       for (auto& pair : RUs[i].MaxLocalUsers) {
5638         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5639         if (pair.second > TargetNumRegisters)
5640           Selected = false;
5641       }
5642       if (Selected) {
5643         MaxVF = VFs[i].getKnownMinValue();
5644         break;
5645       }
5646     }
5647     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5648       if (MaxVF < MinVF) {
5649         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5650                           << ") with target's minimum: " << MinVF << '\n');
5651         MaxVF = MinVF;
5652       }
5653     }
5654   }
5655   return ElementCount::getFixed(MaxVF);
5656 }
5657 
5658 VectorizationFactor
5659 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5660   // FIXME: This can be fixed for scalable vectors later, because at this stage
5661   // the LoopVectorizer will only consider vectorizing a loop with scalable
5662   // vectors when the loop has a hint to enable vectorization for a given VF.
5663   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5664 
5665   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5666   const float ScalarCost = Cost;
5667   unsigned Width = 1;
5668   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5669 
5670   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5671   if (ForceVectorization && MaxVF.isVector()) {
5672     // Ignore scalar width, because the user explicitly wants vectorization.
5673     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5674     // evaluation.
5675     Cost = std::numeric_limits<float>::max();
5676   }
5677 
5678   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
5679     // Notice that the vector loop needs to be executed fewer times, so
5680     // we need to divide the cost of the vector loop by the width of
5681     // the vector elements.
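    // E.g. (illustrative numbers): with a scalar cost of 8 per iteration, a
    // VF = 4 loop body costing 20 amounts to 20 / 4 = 5 per scalar iteration,
    // which would beat the scalar loop.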
5682     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5683     float VectorCost = C.first / (float)i;
5684     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5685                       << " costs: " << (int)VectorCost << ".\n");
5686     if (!C.second && !ForceVectorization) {
5687       LLVM_DEBUG(
5688           dbgs() << "LV: Not considering vector loop of width " << i
5689                  << " because it will not generate any vector instructions.\n");
5690       continue;
5691     }
5692 
5693     // If profitable, add it to the ProfitableVFs list.
5694     if (VectorCost < ScalarCost) {
5695       ProfitableVFs.push_back(VectorizationFactor(
5696           {ElementCount::getFixed(i), (unsigned)VectorCost}));
5697     }
5698 
5699     if (VectorCost < Cost) {
5700       Cost = VectorCost;
5701       Width = i;
5702     }
5703   }
5704 
5705   if (!EnableCondStoresVectorization && NumPredStores) {
5706     reportVectorizationFailure("There are conditional stores.",
5707         "store that is conditionally executed prevents vectorization",
5708         "ConditionalStore", ORE, TheLoop);
5709     Width = 1;
5710     Cost = ScalarCost;
5711   }
5712 
5713   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5714              << "LV: Vectorization seems to be not beneficial, "
5715              << "but was forced by a user.\n");
5716   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5717   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5718                                 (unsigned)(Width * Cost)};
5719   return Factor;
5720 }
5721 
5722 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5723     const Loop &L, ElementCount VF) const {
5724   // Cross iteration phis such as reductions need special handling and are
5725   // currently unsupported.
5726   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5727         return Legal->isFirstOrderRecurrence(&Phi) ||
5728                Legal->isReductionVariable(&Phi);
5729       }))
5730     return false;
5731 
5732   // Phis with uses outside of the loop require special handling and are
5733   // currently unsupported.
5734   for (auto &Entry : Legal->getInductionVars()) {
5735     // Look for uses of the value of the induction at the last iteration.
5736     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5737     for (User *U : PostInc->users())
5738       if (!L.contains(cast<Instruction>(U)))
5739         return false;
5740     // Look for uses of penultimate value of the induction.
5741     for (User *U : Entry.first->users())
5742       if (!L.contains(cast<Instruction>(U)))
5743         return false;
5744   }
5745 
5746   // Induction variables that are widened require special handling that is
5747   // currently not supported.
5748   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5749         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5750                  this->isProfitableToScalarize(Entry.first, VF));
5751       }))
5752     return false;
5753 
5754   return true;
5755 }
5756 
5757 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5758     const ElementCount VF) const {
5759   // FIXME: We need a much better cost-model to take different parameters such
5760   // as register pressure, code size increase and cost of extra branches into
5761   // account. For now we apply a very crude heuristic and only consider loops
5762   // with vectorization factors larger than a certain value.
5763   // We also consider epilogue vectorization unprofitable for targets that don't
5764   // consider interleaving beneficial (e.g. MVE).
5765   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5766     return false;
5767   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5768     return true;
5769   return false;
5770 }
5771 
5772 VectorizationFactor
5773 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5774     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5775   VectorizationFactor Result = VectorizationFactor::Disabled();
5776   if (!EnableEpilogueVectorization) {
5777     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5778     return Result;
5779   }
5780 
5781   if (!isScalarEpilogueAllowed()) {
5782     LLVM_DEBUG(
5783         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5784                   "allowed.\n";);
5785     return Result;
5786   }
5787 
5788   // Not really a cost consideration, but check for unsupported cases here to
5789   // simplify the logic.
5790   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5791     LLVM_DEBUG(
5792         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5793                   "not a supported candidate.\n";);
5794     return Result;
5795   }
5796 
5797   if (EpilogueVectorizationForceVF > 1) {
5798     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5799     if (LVP.hasPlanWithVFs(
5800             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5801       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5802     else {
5803       LLVM_DEBUG(
5804           dbgs()
5805               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5806       return Result;
5807     }
5808   }
5809 
5810   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5811       TheLoop->getHeader()->getParent()->hasMinSize()) {
5812     LLVM_DEBUG(
5813         dbgs()
5814             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5815     return Result;
5816   }
5817 
5818   if (!isEpilogueVectorizationProfitable(MainLoopVF))
5819     return Result;
5820 
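  // Illustrative sketch of the selection below, with hypothetical values: the
  // loop scans the remembered ProfitableVFs for the cheapest factor that is
  // strictly narrower than MainLoopVF and for which a VPlan exists alongside
  // MainLoopVF. For example, if MainLoopVF is 8 and ProfitableVFs holds
  // candidates for VF = 4 and VF = 2, the cheaper of the two that has such a
  // plan would be chosen as the epilogue VF.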
5821   for (auto &NextVF : ProfitableVFs)
5822     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
5823         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
5824         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
5825       Result = NextVF;
5826 
5827   if (Result != VectorizationFactor::Disabled())
5828     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5829                       << Result.Width.getFixedValue() << "\n";);
5830   return Result;
5831 }
5832 
5833 std::pair<unsigned, unsigned>
5834 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5835   unsigned MinWidth = -1U;
5836   unsigned MaxWidth = 8;
5837   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5838 
5839   // For each block.
5840   for (BasicBlock *BB : TheLoop->blocks()) {
5841     // For each instruction in the loop.
5842     for (Instruction &I : BB->instructionsWithoutDebug()) {
5843       Type *T = I.getType();
5844 
5845       // Skip ignored values.
5846       if (ValuesToIgnore.count(&I))
5847         continue;
5848 
5849       // Only examine Loads, Stores and PHINodes.
5850       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5851         continue;
5852 
5853       // Examine PHI nodes that are reduction variables. Update the type to
5854       // account for the recurrence type.
5855       if (auto *PN = dyn_cast<PHINode>(&I)) {
5856         if (!Legal->isReductionVariable(PN))
5857           continue;
5858         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5859         T = RdxDesc.getRecurrenceType();
5860       }
5861 
5862       // Examine the stored values.
5863       if (auto *ST = dyn_cast<StoreInst>(&I))
5864         T = ST->getValueOperand()->getType();
5865 
5866       // Ignore loaded pointer types and stored pointer types that are not
5867       // vectorizable.
5868       //
5869       // FIXME: The check here attempts to predict whether a load or store will
5870       //        be vectorized. We only know this for certain after a VF has
5871       //        been selected. Here, we assume that if an access can be
5872       //        vectorized, it will be. We should also look at extending this
5873       //        optimization to non-pointer types.
5874       //
5875       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5876           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5877         continue;
5878 
5879       MinWidth = std::min(MinWidth,
5880                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5881       MaxWidth = std::max(MaxWidth,
5882                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5883     }
5884   }
5885 
5886   return {MinWidth, MaxWidth};
5887 }
5888 
5889 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5890                                                            unsigned LoopCost) {
5891   // -- The interleave heuristics --
5892   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5893   // There are many micro-architectural considerations that we can't predict
5894   // at this level. For example, frontend pressure (on decode or fetch) due to
5895   // code size, or the number and capabilities of the execution ports.
5896   //
5897   // We use the following heuristics to select the interleave count:
5898   // 1. If the code has reductions, then we interleave to break the cross
5899   // iteration dependency.
5900   // 2. If the loop is really small, then we interleave to reduce the loop
5901   // overhead.
5902   // 3. We don't interleave if we think that we will spill registers to memory
5903   // due to the increased register pressure.
5904 
5905   if (!isScalarEpilogueAllowed())
5906     return 1;
5907 
5908   // The max safe dependence distance already constrains vectorization; do not interleave.
5909   if (Legal->getMaxSafeDepDistBytes() != -1U)
5910     return 1;
5911 
5912   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5913   const bool HasReductions = !Legal->getReductionVars().empty();
5914   // Do not interleave loops with a relatively small known or estimated trip
5915   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5916   // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5917   // because with the above conditions interleaving can expose ILP and break
5918   // cross iteration dependences for reductions.
5919   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5920       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5921     return 1;
5922 
5923   RegisterUsage R = calculateRegisterUsage({VF})[0];
5924   // We divide by these constants so assume that we have at least one
5925   // instruction that uses at least one register.
5926   for (auto& pair : R.MaxLocalUsers) {
5927     pair.second = std::max(pair.second, 1U);
5928   }
5929 
5930   // We calculate the interleave count using the following formula.
5931   // Subtract the number of loop invariants from the number of available
5932   // registers. These registers are used by all of the interleaved instances.
5933   // Next, divide the remaining registers by the number of registers that is
5934   // required by the loop, in order to estimate how many parallel instances
5935   // fit without causing spills. All of this is rounded down if necessary to be
5936   // a power of two. We want power of two interleave count to simplify any
5937   // addressing operations or alignment considerations.
5938   // We also want power of two interleave counts to ensure that the induction
5939   // variable of the vector loop wraps to zero, when tail is folded by masking;
5940   // this currently happens when OptForSize, in which case IC is set to 1 above.
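  // A worked example with hypothetical numbers: with 16 registers in a class,
  // 2 of them consumed by loop invariants and at most 3 values live at once,
  // the plain formula gives PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4
  // interleaved instances; with the induction-variable heuristic below it is
  // PowerOf2Floor((16 - 2 - 1) / max(1, 3 - 1)) = PowerOf2Floor(6) = 4 as well.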
5941   unsigned IC = UINT_MAX;
5942 
5943   for (auto& pair : R.MaxLocalUsers) {
5944     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5945     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5946                       << " registers of "
5947                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5948     if (VF.isScalar()) {
5949       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5950         TargetNumRegisters = ForceTargetNumScalarRegs;
5951     } else {
5952       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5953         TargetNumRegisters = ForceTargetNumVectorRegs;
5954     }
5955     unsigned MaxLocalUsers = pair.second;
5956     unsigned LoopInvariantRegs = 0;
5957     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5958       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5959 
5960     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5961     // Don't count the induction variable as interleaved.
5962     if (EnableIndVarRegisterHeur) {
5963       TmpIC =
5964           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5965                         std::max(1U, (MaxLocalUsers - 1)));
5966     }
5967 
5968     IC = std::min(IC, TmpIC);
5969   }
5970 
5971   // Clamp the interleave ranges to reasonable counts.
5972   unsigned MaxInterleaveCount =
5973       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5974 
5975   // Check if the user has overridden the max.
5976   if (VF.isScalar()) {
5977     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5978       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5979   } else {
5980     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5981       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5982   }
5983 
5984   // If trip count is known or estimated compile time constant, limit the
5985   // interleave count to be less than the trip count divided by VF, provided it
5986   // is at least 1.
5987   //
5988   // For scalable vectors we can't know if interleaving is beneficial. It may
5989   // not be beneficial for small loops if none of the lanes in the second vector
5990   // iterations is enabled. However, for larger loops, there is likely to be a
5991   // similar benefit as for fixed-width vectors. For now, we choose to leave
5992   // the InterleaveCount as if vscale is '1', although if some information about
5993   // the vector is known (e.g. min vector size), we can make a better decision.
5994   if (BestKnownTC) {
5995     MaxInterleaveCount =
5996         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5997     // Make sure MaxInterleaveCount is greater than 0.
5998     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5999   }
6000 
6001   assert(MaxInterleaveCount > 0 &&
6002          "Maximum interleave count must be greater than 0");
6003 
6004   // Clamp the calculated IC to be between 1 and the max interleave count
6005   // that the target and trip count allow.
6006   if (IC > MaxInterleaveCount)
6007     IC = MaxInterleaveCount;
6008   else
6009     // Make sure IC is greater than 0.
6010     IC = std::max(1u, IC);
6011 
6012   assert(IC > 0 && "Interleave count must be greater than 0.");
6013 
6014   // If we did not calculate the cost for VF (because the user selected the VF)
6015   // then we calculate the cost of VF here.
6016   if (LoopCost == 0)
6017     LoopCost = expectedCost(VF).first;
6018 
6019   assert(LoopCost && "Non-zero loop cost expected");
6020 
6021   // Interleave if we vectorized this loop and there is a reduction that could
6022   // benefit from interleaving.
6023   if (VF.isVector() && HasReductions) {
6024     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6025     return IC;
6026   }
6027 
6028   // Note that if we've already vectorized the loop we will have done the
6029   // runtime check and so interleaving won't require further checks.
6030   bool InterleavingRequiresRuntimePointerCheck =
6031       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6032 
6033   // We want to interleave small loops in order to reduce the loop overhead and
6034   // potentially expose ILP opportunities.
6035   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6036                     << "LV: IC is " << IC << '\n'
6037                     << "LV: VF is " << VF << '\n');
6038   const bool AggressivelyInterleaveReductions =
6039       TTI.enableAggressiveInterleaving(HasReductions);
6040   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6041     // We assume that the cost overhead is 1 and we use the cost model
6042     // to estimate the cost of the loop and interleave until the cost of the
6043     // loop overhead is about 5% of the cost of the loop.
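    // For instance, assuming a SmallLoopCost threshold of 20 and a computed
    // LoopCost of 3 (both hypothetical), PowerOf2Floor(20 / 3) =
    // PowerOf2Floor(6) = 4, so SmallIC becomes min(IC, 4).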
6044     unsigned SmallIC =
6045         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6046 
6047     // Interleave until store/load ports (estimated by max interleave count) are
6048     // saturated.
6049     unsigned NumStores = Legal->getNumStores();
6050     unsigned NumLoads = Legal->getNumLoads();
6051     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6052     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6053 
6054     // If we have a scalar reduction (vector reductions are already dealt with
6055     // by this point), we can increase the critical path length if the loop
6056     // we're interleaving is inside another loop. Limit this, by default, to 2 so
6057     // that the critical path only gets increased by one reduction operation.
6058     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6059       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6060       SmallIC = std::min(SmallIC, F);
6061       StoresIC = std::min(StoresIC, F);
6062       LoadsIC = std::min(LoadsIC, F);
6063     }
6064 
6065     if (EnableLoadStoreRuntimeInterleave &&
6066         std::max(StoresIC, LoadsIC) > SmallIC) {
6067       LLVM_DEBUG(
6068           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6069       return std::max(StoresIC, LoadsIC);
6070     }
6071 
6072     // If there are scalar reductions and TTI has enabled aggressive
6073     // interleaving for reductions, we will interleave to expose ILP.
6074     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6075         AggressivelyInterleaveReductions) {
6076       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6077       // Interleave no less than SmallIC but not as aggressive as the normal IC
6078       // to satisfy the rare situation when resources are too limited.
6079       return std::max(IC / 2, SmallIC);
6080     } else {
6081       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6082       return SmallIC;
6083     }
6084   }
6085 
6086   // Interleave if this is a large loop (small loops are already dealt with by
6087   // this point) that could benefit from interleaving.
6088   if (AggressivelyInterleaveReductions) {
6089     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6090     return IC;
6091   }
6092 
6093   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6094   return 1;
6095 }
6096 
6097 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6098 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6099   // This function calculates the register usage by measuring the highest number
6100   // of values that are alive at a single location. Obviously, this is a very
6101   // rough estimation. We scan the loop in topological order and
6102   // assign a number to each instruction. We use RPO to ensure that defs are
6103   // met before their users. We assume that each instruction that has in-loop
6104   // users starts an interval. We record every time that an in-loop value is
6105   // used, so we have a list of the first and last occurrences of each
6106   // instruction. Next, we transpose this data structure into a multi map that
6107   // holds the list of intervals that *end* at a specific location. This multi
6108   // map allows us to perform a linear search. We scan the instructions linearly
6109   // and record each time that a new interval starts, by placing it in a set.
6110   // If we find this value in the multi-map then we remove it from the set.
6111   // The max register usage is the maximum size of the set.
6112   // We also search for instructions that are defined outside the loop, but are
6113   // used inside the loop. We need this number separately from the max-interval
6114   // usage number because when we unroll, loop-invariant values do not take
6115   // more registers.
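  // As a toy illustration of the interval idea (hypothetical values): if %a is
  // live from instruction 0 to instruction 3 and %b from instruction 1 to
  // instruction 2, both are live at the same time around instruction 2, so the
  // maximum usage recorded for their register class would be 2.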
6116   LoopBlocksDFS DFS(TheLoop);
6117   DFS.perform(LI);
6118 
6119   RegisterUsage RU;
6120 
6121   // Each 'key' in the map opens a new interval. The values
6122   // of the map are the index of the 'last seen' usage of the
6123   // instruction that is the key.
6124   using IntervalMap = DenseMap<Instruction *, unsigned>;
6125 
6126   // Maps instruction to its index.
6127   SmallVector<Instruction *, 64> IdxToInstr;
6128   // Marks the end of each interval.
6129   IntervalMap EndPoint;
6130   // Saves the list of instruction indices that are used in the loop.
6131   SmallPtrSet<Instruction *, 8> Ends;
6132   // Saves the list of values that are used in the loop but are
6133   // defined outside the loop, such as arguments and constants.
6134   SmallPtrSet<Value *, 8> LoopInvariants;
6135 
6136   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6137     for (Instruction &I : BB->instructionsWithoutDebug()) {
6138       IdxToInstr.push_back(&I);
6139 
6140       // Save the end location of each USE.
6141       for (Value *U : I.operands()) {
6142         auto *Instr = dyn_cast<Instruction>(U);
6143 
6144         // Ignore non-instruction values such as arguments, constants, etc.
6145         if (!Instr)
6146           continue;
6147 
6148         // If this instruction is outside the loop then record it and continue.
6149         if (!TheLoop->contains(Instr)) {
6150           LoopInvariants.insert(Instr);
6151           continue;
6152         }
6153 
6154         // Overwrite previous end points.
6155         EndPoint[Instr] = IdxToInstr.size();
6156         Ends.insert(Instr);
6157       }
6158     }
6159   }
6160 
6161   // Saves the list of intervals that end with the index in 'key'.
6162   using InstrList = SmallVector<Instruction *, 2>;
6163   DenseMap<unsigned, InstrList> TransposeEnds;
6164 
6165   // Transpose the EndPoints to a list of values that end at each index.
6166   for (auto &Interval : EndPoint)
6167     TransposeEnds[Interval.second].push_back(Interval.first);
6168 
6169   SmallPtrSet<Instruction *, 8> OpenIntervals;
6170   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6171   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6172 
6173   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6174 
6175   // A lambda that gets the register usage for the given type and VF.
6176   const auto &TTICapture = TTI;
6177   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6178     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6179       return 0U;
6180     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6181   };
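  // For example, GetRegUsage(i32, VF = 4) asks the target about <4 x i32>,
  // which on a typical target with 128-bit vector registers is reported as a
  // single register, while <4 x i64> would usually need two; the actual answer
  // always comes from TTI.getRegUsageForType above.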
6182 
6183   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6184     Instruction *I = IdxToInstr[i];
6185 
6186     // Remove all of the instructions that end at this location.
6187     InstrList &List = TransposeEnds[i];
6188     for (Instruction *ToRemove : List)
6189       OpenIntervals.erase(ToRemove);
6190 
6191     // Ignore instructions that are never used within the loop.
6192     if (!Ends.count(I))
6193       continue;
6194 
6195     // Skip ignored values.
6196     if (ValuesToIgnore.count(I))
6197       continue;
6198 
6199     // For each VF find the maximum usage of registers.
6200     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6201       // Count the number of live intervals.
6202       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6203 
6204       if (VFs[j].isScalar()) {
6205         for (auto Inst : OpenIntervals) {
6206           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6207           if (RegUsage.find(ClassID) == RegUsage.end())
6208             RegUsage[ClassID] = 1;
6209           else
6210             RegUsage[ClassID] += 1;
6211         }
6212       } else {
6213         collectUniformsAndScalars(VFs[j]);
6214         for (auto Inst : OpenIntervals) {
6215           // Skip ignored values for VF > 1.
6216           if (VecValuesToIgnore.count(Inst))
6217             continue;
6218           if (isScalarAfterVectorization(Inst, VFs[j])) {
6219             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6220             if (RegUsage.find(ClassID) == RegUsage.end())
6221               RegUsage[ClassID] = 1;
6222             else
6223               RegUsage[ClassID] += 1;
6224           } else {
6225             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6226             if (RegUsage.find(ClassID) == RegUsage.end())
6227               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6228             else
6229               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6230           }
6231         }
6232       }
6233 
6234       for (auto& pair : RegUsage) {
6235         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6236           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6237         else
6238           MaxUsages[j][pair.first] = pair.second;
6239       }
6240     }
6241 
6242     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6243                       << OpenIntervals.size() << '\n');
6244 
6245     // Add the current instruction to the list of open intervals.
6246     OpenIntervals.insert(I);
6247   }
6248 
6249   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6250     SmallMapVector<unsigned, unsigned, 4> Invariant;
6251 
6252     for (auto Inst : LoopInvariants) {
6253       unsigned Usage =
6254           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6255       unsigned ClassID =
6256           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6257       if (Invariant.find(ClassID) == Invariant.end())
6258         Invariant[ClassID] = Usage;
6259       else
6260         Invariant[ClassID] += Usage;
6261     }
6262 
6263     LLVM_DEBUG({
6264       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6265       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6266              << " item\n";
6267       for (const auto &pair : MaxUsages[i]) {
6268         dbgs() << "LV(REG): RegisterClass: "
6269                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6270                << " registers\n";
6271       }
6272       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6273              << " item\n";
6274       for (const auto &pair : Invariant) {
6275         dbgs() << "LV(REG): RegisterClass: "
6276                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6277                << " registers\n";
6278       }
6279     });
6280 
6281     RU.LoopInvariantRegs = Invariant;
6282     RU.MaxLocalUsers = MaxUsages[i];
6283     RUs[i] = RU;
6284   }
6285 
6286   return RUs;
6287 }
6288 
6289 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6290   // TODO: Cost model for emulated masked load/store is completely
6291   // broken. This hack guides the cost model to use an artificially
6292   // high enough value to practically disable vectorization with such
6293   // operations, except where the previously deployed legality hack allowed
6294   // using very low cost values. This is to avoid regressions coming simply
6295   // from moving the "masked load/store" check from legality to the cost model.
6296   // Masked Load/Gather emulation was previously never allowed.
6297   // A limited amount of Masked Store/Scatter emulation was allowed.
6298   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6299   return isa<LoadInst>(I) ||
6300          (isa<StoreInst>(I) &&
6301           NumPredStores > NumberOfStoresToPredicate);
6302 }
6303 
6304 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6305   // If we aren't vectorizing the loop, or if we've already collected the
6306   // instructions to scalarize, there's nothing to do. Collection may already
6307   // have occurred if we have a user-selected VF and are now computing the
6308   // expected cost for interleaving.
6309   if (VF.isScalar() || VF.isZero() ||
6310       InstsToScalarize.find(VF) != InstsToScalarize.end())
6311     return;
6312 
6313   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6314   // not profitable to scalarize any instructions, the presence of VF in the
6315   // map will indicate that we've analyzed it already.
6316   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6317 
6318   // Find all the instructions that are scalar with predication in the loop and
6319   // determine if it would be better to not if-convert the blocks they are in.
6320   // If so, we also record the instructions to scalarize.
6321   for (BasicBlock *BB : TheLoop->blocks()) {
6322     if (!blockNeedsPredication(BB))
6323       continue;
6324     for (Instruction &I : *BB)
6325       if (isScalarWithPredication(&I)) {
6326         ScalarCostsTy ScalarCosts;
6327         // Do not apply discount logic if hacked cost is needed
6328         // for emulated masked memrefs.
6329         if (!useEmulatedMaskMemRefHack(&I) &&
6330             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6331           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6332         // Remember that BB will remain after vectorization.
6333         PredicatedBBsAfterVectorization.insert(BB);
6334       }
6335   }
6336 }
6337 
6338 int LoopVectorizationCostModel::computePredInstDiscount(
6339     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6340     ElementCount VF) {
6341   assert(!isUniformAfterVectorization(PredInst, VF) &&
6342          "Instruction marked uniform-after-vectorization will be predicated");
6343 
6344   // Initialize the discount to zero, meaning that the scalar version and the
6345   // vector version cost the same.
6346   int Discount = 0;
6347 
6348   // Holds instructions to analyze. The instructions we visit are mapped in
6349   // ScalarCosts. Those instructions are the ones that would be scalarized if
6350   // we find that the scalar version costs less.
6351   SmallVector<Instruction *, 8> Worklist;
6352 
6353   // Returns true if the given instruction can be scalarized.
6354   auto canBeScalarized = [&](Instruction *I) -> bool {
6355     // We only attempt to scalarize instructions forming a single-use chain
6356     // from the original predicated block that would otherwise be vectorized.
6357     // Although not strictly necessary, we give up on instructions we know will
6358     // already be scalar to avoid traversing chains that are unlikely to be
6359     // beneficial.
6360     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6361         isScalarAfterVectorization(I, VF))
6362       return false;
6363 
6364     // If the instruction is scalar with predication, it will be analyzed
6365     // separately. We ignore it within the context of PredInst.
6366     if (isScalarWithPredication(I))
6367       return false;
6368 
6369     // If any of the instruction's operands are uniform after vectorization,
6370     // the instruction cannot be scalarized. This prevents, for example, a
6371     // masked load from being scalarized.
6372     //
6373     // We assume we will only emit a value for lane zero of an instruction
6374     // marked uniform after vectorization, rather than VF identical values.
6375     // Thus, if we scalarize an instruction that uses a uniform, we would
6376     // create uses of values corresponding to the lanes we aren't emitting code
6377     // for. This behavior can be changed by allowing getScalarValue to clone
6378     // the lane zero values for uniforms rather than asserting.
6379     for (Use &U : I->operands())
6380       if (auto *J = dyn_cast<Instruction>(U.get()))
6381         if (isUniformAfterVectorization(J, VF))
6382           return false;
6383 
6384     // Otherwise, we can scalarize the instruction.
6385     return true;
6386   };
6387 
6388   // Compute the expected cost discount from scalarizing the entire expression
6389   // feeding the predicated instruction. We currently only consider expressions
6390   // that are single-use instruction chains.
6391   Worklist.push_back(PredInst);
6392   while (!Worklist.empty()) {
6393     Instruction *I = Worklist.pop_back_val();
6394 
6395     // If we've already analyzed the instruction, there's nothing to do.
6396     if (ScalarCosts.find(I) != ScalarCosts.end())
6397       continue;
6398 
6399     // Compute the cost of the vector instruction. Note that this cost already
6400     // includes the scalarization overhead of the predicated instruction.
6401     unsigned VectorCost = getInstructionCost(I, VF).first;
6402 
6403     // Compute the cost of the scalarized instruction. This cost is the cost of
6404     // the instruction as if it wasn't if-converted and instead remained in the
6405     // predicated block. We will scale this cost by block probability after
6406     // computing the scalarization overhead.
6407     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6408     unsigned ScalarCost =
6409         VF.getKnownMinValue() *
6410         getInstructionCost(I, ElementCount::getFixed(1)).first;
6411 
6412     // Compute the scalarization overhead of needed insertelement instructions
6413     // and phi nodes.
6414     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6415       ScalarCost += TTI.getScalarizationOverhead(
6416           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6417           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6418       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6419       ScalarCost +=
6420           VF.getKnownMinValue() *
6421           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6422     }
6423 
6424     // Compute the scalarization overhead of needed extractelement
6425     // instructions. For each of the instruction's operands, if the operand can
6426     // be scalarized, add it to the worklist; otherwise, account for the
6427     // overhead.
6428     for (Use &U : I->operands())
6429       if (auto *J = dyn_cast<Instruction>(U.get())) {
6430         assert(VectorType::isValidElementType(J->getType()) &&
6431                "Instruction has non-scalar type");
6432         if (canBeScalarized(J))
6433           Worklist.push_back(J);
6434         else if (needsExtract(J, VF)) {
6435           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6436           ScalarCost += TTI.getScalarizationOverhead(
6437               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6438               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6439         }
6440       }
6441 
6442     // Scale the total scalar cost by block probability.
6443     ScalarCost /= getReciprocalPredBlockProb();
6444 
6445     // Compute the discount. A non-negative discount means the vector version
6446     // of the instruction costs more, and scalarizing would be beneficial.
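    // For example (hypothetical costs): if VectorCost is 12 and the
    // probability-scaled ScalarCost is 8, this chain member adds 4 to the
    // discount, i.e. scalarizing it is expected to save 4 units of cost.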
6447     Discount += VectorCost - ScalarCost;
6448     ScalarCosts[I] = ScalarCost;
6449   }
6450 
6451   return Discount;
6452 }
6453 
6454 LoopVectorizationCostModel::VectorizationCostTy
6455 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6456   VectorizationCostTy Cost;
6457 
6458   // For each block.
6459   for (BasicBlock *BB : TheLoop->blocks()) {
6460     VectorizationCostTy BlockCost;
6461 
6462     // For each instruction in the old loop.
6463     for (Instruction &I : BB->instructionsWithoutDebug()) {
6464       // Skip ignored values.
6465       if (ValuesToIgnore.count(&I) ||
6466           (VF.isVector() && VecValuesToIgnore.count(&I)))
6467         continue;
6468 
6469       VectorizationCostTy C = getInstructionCost(&I, VF);
6470 
6471       // Check if we should override the cost.
6472       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6473         C.first = ForceTargetInstructionCost;
6474 
6475       BlockCost.first += C.first;
6476       BlockCost.second |= C.second;
6477       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6478                         << " for VF " << VF << " For instruction: " << I
6479                         << '\n');
6480     }
6481 
6482     // If we are vectorizing a predicated block, it will have been
6483     // if-converted. This means that the block's instructions (aside from
6484     // stores and instructions that may divide by zero) will now be
6485     // unconditionally executed. For the scalar case, we may not always execute
6486     // the predicated block. Thus, scale the block's cost by the probability of
6487     // executing it.
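    // For example, if a predicated block's instructions sum to a scalar cost
    // of 8 and the block is assumed to execute half the time (a reciprocal
    // probability of 2), it contributes 4 to the scalar loop cost.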
6488     if (VF.isScalar() && blockNeedsPredication(BB))
6489       BlockCost.first /= getReciprocalPredBlockProb();
6490 
6491     Cost.first += BlockCost.first;
6492     Cost.second |= BlockCost.second;
6493   }
6494 
6495   return Cost;
6496 }
6497 
6498 /// Gets Address Access SCEV after verifying that the access pattern
6499 /// is loop invariant except the induction variable dependence.
6500 ///
6501 /// This SCEV can be sent to the Target in order to estimate the address
6502 /// calculation cost.
6503 static const SCEV *getAddressAccessSCEV(
6504               Value *Ptr,
6505               LoopVectorizationLegality *Legal,
6506               PredicatedScalarEvolution &PSE,
6507               const Loop *TheLoop) {
6508 
6509   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6510   if (!Gep)
6511     return nullptr;
6512 
6513   // We are looking for a gep with all loop invariant indices except for one
6514   // which should be an induction variable.
6515   auto SE = PSE.getSE();
6516   unsigned NumOperands = Gep->getNumOperands();
6517   for (unsigned i = 1; i < NumOperands; ++i) {
6518     Value *Opd = Gep->getOperand(i);
6519     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6520         !Legal->isInductionVariable(Opd))
6521       return nullptr;
6522   }
6523 
6524   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6525   return PSE.getSCEV(Ptr);
6526 }
6527 
6528 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6529   return Legal->hasStride(I->getOperand(0)) ||
6530          Legal->hasStride(I->getOperand(1));
6531 }
6532 
6533 unsigned
6534 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6535                                                         ElementCount VF) {
6536   assert(VF.isVector() &&
6537          "Scalarization cost of instruction implies vectorization.");
6538   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6539   Type *ValTy = getMemInstValueType(I);
6540   auto SE = PSE.getSE();
6541 
6542   unsigned AS = getLoadStoreAddressSpace(I);
6543   Value *Ptr = getLoadStorePointerOperand(I);
6544   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6545 
6546   // Figure out whether the access is strided and get the stride value
6547   // if it's known at compile time.
6548   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6549 
6550   // Get the cost of the scalar memory instruction and address computation.
6551   unsigned Cost =
6552       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6553 
6554   // Don't pass *I here, since it is scalar but will actually be part of a
6555   // vectorized loop where the user of it is a vectorized instruction.
6556   const Align Alignment = getLoadStoreAlignment(I);
6557   Cost += VF.getKnownMinValue() *
6558           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6559                               AS, TTI::TCK_RecipThroughput);
6560 
6561   // Get the overhead of the extractelement and insertelement instructions
6562   // we might create due to scalarization.
6563   Cost += getScalarizationOverhead(I, VF);
6564 
6565   // If we have a predicated store, it may not be executed for each vector
6566   // lane. Scale the cost by the probability of executing the predicated
6567   // block.
6568   if (isPredicatedInst(I)) {
6569     Cost /= getReciprocalPredBlockProb();
6570 
6571     if (useEmulatedMaskMemRefHack(I))
6572       // Artificially setting to a high enough value to practically disable
6573       // vectorization with such operations.
6574       Cost = 3000000;
6575   }
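  // Sketch with hypothetical numbers: a predicated scalarized access with a
  // raw cost of 40 is scaled down to 20 for a reciprocal block probability of
  // 2, but when the emulated-masked-memref hack applies the cost is pinned at
  // 3000000 so that such loops are effectively never vectorized.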
6576 
6577   return Cost;
6578 }
6579 
6580 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6581                                                              ElementCount VF) {
6582   Type *ValTy = getMemInstValueType(I);
6583   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6584   Value *Ptr = getLoadStorePointerOperand(I);
6585   unsigned AS = getLoadStoreAddressSpace(I);
6586   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6587   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6588 
6589   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6590          "Stride should be 1 or -1 for consecutive memory access");
6591   const Align Alignment = getLoadStoreAlignment(I);
6592   unsigned Cost = 0;
6593   if (Legal->isMaskRequired(I))
6594     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6595                                       CostKind);
6596   else
6597     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6598                                 CostKind, I);
6599 
6600   bool Reverse = ConsecutiveStride < 0;
6601   if (Reverse)
6602     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6603   return Cost;
6604 }
6605 
6606 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6607                                                          ElementCount VF) {
6608   assert(Legal->isUniformMemOp(*I));
6609 
6610   Type *ValTy = getMemInstValueType(I);
6611   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6612   const Align Alignment = getLoadStoreAlignment(I);
6613   unsigned AS = getLoadStoreAddressSpace(I);
6614   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6615   if (isa<LoadInst>(I)) {
6616     return TTI.getAddressComputationCost(ValTy) +
6617            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6618                                CostKind) +
6619            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6620   }
6621   StoreInst *SI = cast<StoreInst>(I);
6622 
6623   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6624   return TTI.getAddressComputationCost(ValTy) +
6625          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6626                              CostKind) +
6627          (isLoopInvariantStoreValue
6628               ? 0
6629               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6630                                        VF.getKnownMinValue() - 1));
6631 }
6632 
6633 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6634                                                           ElementCount VF) {
6635   Type *ValTy = getMemInstValueType(I);
6636   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6637   const Align Alignment = getLoadStoreAlignment(I);
6638   const Value *Ptr = getLoadStorePointerOperand(I);
6639 
6640   return TTI.getAddressComputationCost(VectorTy) +
6641          TTI.getGatherScatterOpCost(
6642              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6643              TargetTransformInfo::TCK_RecipThroughput, I);
6644 }
6645 
6646 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6647                                                             ElementCount VF) {
6648   Type *ValTy = getMemInstValueType(I);
6649   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6650   unsigned AS = getLoadStoreAddressSpace(I);
6651 
6652   auto Group = getInterleavedAccessGroup(I);
6653   assert(Group && "Failed to get an interleaved access group.");
6654 
6655   unsigned InterleaveFactor = Group->getFactor();
6656   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6657   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
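  // For example, a load group of factor 2 accessed at VF = 4 over i32 members
  // is costed below as a single wide <8 x i32> access.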
6658 
6659   // Holds the indices of existing members in an interleaved load group.
6660   // An interleaved store group doesn't need this as it doesn't allow gaps.
6661   SmallVector<unsigned, 4> Indices;
6662   if (isa<LoadInst>(I)) {
6663     for (unsigned i = 0; i < InterleaveFactor; i++)
6664       if (Group->getMember(i))
6665         Indices.push_back(i);
6666   }
6667 
6668   // Calculate the cost of the whole interleaved group.
6669   bool UseMaskForGaps =
6670       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6671   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6672       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6673       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6674 
6675   if (Group->isReverse()) {
6676     // TODO: Add support for reversed masked interleaved access.
6677     assert(!Legal->isMaskRequired(I) &&
6678            "Reverse masked interleaved access not supported.");
6679     Cost += Group->getNumMembers() *
6680             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6681   }
6682   return Cost;
6683 }
6684 
6685 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6686                                                               ElementCount VF) {
6687   // Calculate scalar cost only. Vectorization cost should be ready at this
6688   // moment.
6689   if (VF.isScalar()) {
6690     Type *ValTy = getMemInstValueType(I);
6691     const Align Alignment = getLoadStoreAlignment(I);
6692     unsigned AS = getLoadStoreAddressSpace(I);
6693 
6694     return TTI.getAddressComputationCost(ValTy) +
6695            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6696                                TTI::TCK_RecipThroughput, I);
6697   }
6698   return getWideningCost(I, VF);
6699 }
6700 
6701 LoopVectorizationCostModel::VectorizationCostTy
6702 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6703                                                ElementCount VF) {
6704   // If we know that this instruction will remain uniform, check the cost of
6705   // the scalar version.
6706   if (isUniformAfterVectorization(I, VF))
6707     VF = ElementCount::getFixed(1);
6708 
6709   if (VF.isVector() && isProfitableToScalarize(I, VF))
6710     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6711 
6712   // Forced scalars do not have any scalarization overhead.
6713   auto ForcedScalar = ForcedScalars.find(VF);
6714   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6715     auto InstSet = ForcedScalar->second;
6716     if (InstSet.count(I))
6717       return VectorizationCostTy(
6718           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6719            VF.getKnownMinValue()),
6720           false);
6721   }
6722 
6723   Type *VectorTy;
6724   unsigned C = getInstructionCost(I, VF, VectorTy);
6725 
6726   bool TypeNotScalarized =
6727       VF.isVector() && VectorTy->isVectorTy() &&
6728       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6729   return VectorizationCostTy(C, TypeNotScalarized);
6730 }
6731 
6732 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6733                                                               ElementCount VF) {
6734 
6735   assert(!VF.isScalable() &&
6736          "cannot compute scalarization overhead for scalable vectorization");
6737   if (VF.isScalar())
6738     return 0;
6739 
6740   unsigned Cost = 0;
6741   Type *RetTy = ToVectorTy(I->getType(), VF);
6742   if (!RetTy->isVoidTy() &&
6743       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6744     Cost += TTI.getScalarizationOverhead(
6745         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6746         true, false);
6747 
6748   // Some targets keep addresses scalar.
6749   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6750     return Cost;
6751 
6752   // Some targets support efficient element stores.
6753   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6754     return Cost;
6755 
6756   // Collect operands to consider.
6757   CallInst *CI = dyn_cast<CallInst>(I);
6758   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6759 
6760   // Skip operands that do not require extraction/scalarization and do not incur
6761   // any overhead.
6762   return Cost + TTI.getOperandsScalarizationOverhead(
6763                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6764 }
6765 
6766 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6767   if (VF.isScalar())
6768     return;
6769   NumPredStores = 0;
6770   for (BasicBlock *BB : TheLoop->blocks()) {
6771     // For each instruction in the old loop.
6772     for (Instruction &I : *BB) {
6773       Value *Ptr = getLoadStorePointerOperand(&I);
6774       if (!Ptr)
6775         continue;
6776 
6777       // TODO: We should generate better code and update the cost model for
6778       // predicated uniform stores. Today they are treated as any other
6779       // predicated store (see added test cases in
6780       // invariant-store-vectorization.ll).
6781       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6782         NumPredStores++;
6783 
6784       if (Legal->isUniformMemOp(I)) {
6785         // TODO: Avoid replicating loads and stores instead of
6786         // relying on instcombine to remove them.
6787         // Load: Scalar load + broadcast
6788         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6789         unsigned Cost = getUniformMemOpCost(&I, VF);
6790         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6791         continue;
6792       }
6793 
6794       // We assume that widening is the best solution when possible.
6795       if (memoryInstructionCanBeWidened(&I, VF)) {
6796         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6797         int ConsecutiveStride =
6798                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6799         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6800                "Expected consecutive stride.");
6801         InstWidening Decision =
6802             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6803         setWideningDecision(&I, VF, Decision, Cost);
6804         continue;
6805       }
6806 
6807       // Choose between Interleaving, Gather/Scatter or Scalarization.
6808       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6809       unsigned NumAccesses = 1;
6810       if (isAccessInterleaved(&I)) {
6811         auto Group = getInterleavedAccessGroup(&I);
6812         assert(Group && "Failed to get an interleaved access group.");
6813 
6814         // Make one decision for the whole group.
6815         if (getWideningDecision(&I, VF) != CM_Unknown)
6816           continue;
6817 
6818         NumAccesses = Group->getNumMembers();
6819         if (interleavedAccessCanBeWidened(&I, VF))
6820           InterleaveCost = getInterleaveGroupCost(&I, VF);
6821       }
6822 
6823       unsigned GatherScatterCost =
6824           isLegalGatherOrScatter(&I)
6825               ? getGatherScatterCost(&I, VF) * NumAccesses
6826               : std::numeric_limits<unsigned>::max();
6827 
6828       unsigned ScalarizationCost =
6829           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6830 
6831       // Choose better solution for the current VF,
6832       // write down this decision and use it during vectorization.
6833       unsigned Cost;
6834       InstWidening Decision;
6835       if (InterleaveCost <= GatherScatterCost &&
6836           InterleaveCost < ScalarizationCost) {
6837         Decision = CM_Interleave;
6838         Cost = InterleaveCost;
6839       } else if (GatherScatterCost < ScalarizationCost) {
6840         Decision = CM_GatherScatter;
6841         Cost = GatherScatterCost;
6842       } else {
6843         Decision = CM_Scalarize;
6844         Cost = ScalarizationCost;
6845       }
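      // For instance, with hypothetical per-group costs of 8 (interleave),
      // 12 (gather/scatter) and 20 (scalarization), the logic above selects
      // CM_Interleave with a cost of 8.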
6846       // If the instruction belongs to an interleave group, the whole group
6847       // receives the same decision. The cost is computed for the whole group,
6848       // but it will actually be assigned to one member instruction.
6849       if (auto Group = getInterleavedAccessGroup(&I))
6850         setWideningDecision(Group, VF, Decision, Cost);
6851       else
6852         setWideningDecision(&I, VF, Decision, Cost);
6853     }
6854   }
6855 
6856   // Make sure that any load of an address and any other address computation
6857   // remains scalar unless there is gather/scatter support. This avoids
6858   // inevitable extracts into address registers, and also has the benefit of
6859   // activating LSR more, since that pass can't optimize vectorized
6860   // addresses.
6861   if (TTI.prefersVectorizedAddressing())
6862     return;
6863 
6864   // Start with all scalar pointer uses.
6865   SmallPtrSet<Instruction *, 8> AddrDefs;
6866   for (BasicBlock *BB : TheLoop->blocks())
6867     for (Instruction &I : *BB) {
6868       Instruction *PtrDef =
6869         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6870       if (PtrDef && TheLoop->contains(PtrDef) &&
6871           getWideningDecision(&I, VF) != CM_GatherScatter)
6872         AddrDefs.insert(PtrDef);
6873     }
6874 
6875   // Add all instructions used to generate the addresses.
6876   SmallVector<Instruction *, 4> Worklist;
6877   for (auto *I : AddrDefs)
6878     Worklist.push_back(I);
6879   while (!Worklist.empty()) {
6880     Instruction *I = Worklist.pop_back_val();
6881     for (auto &Op : I->operands())
6882       if (auto *InstOp = dyn_cast<Instruction>(Op))
6883         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6884             AddrDefs.insert(InstOp).second)
6885           Worklist.push_back(InstOp);
6886   }
6887 
6888   for (auto *I : AddrDefs) {
6889     if (isa<LoadInst>(I)) {
6890       // Setting the desired widening decision should ideally be handled by
6891       // the cost functions, but since this involves the task of finding out
6892       // if the loaded register is involved in an address computation, it is
6893       // instead changed here when we know this is the case.
6894       InstWidening Decision = getWideningDecision(I, VF);
6895       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6896         // Scalarize a widened load of address.
6897         setWideningDecision(
6898             I, VF, CM_Scalarize,
6899             (VF.getKnownMinValue() *
6900              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6901       else if (auto Group = getInterleavedAccessGroup(I)) {
6902         // Scalarize an interleave group of address loads.
6903         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6904           if (Instruction *Member = Group->getMember(I))
6905             setWideningDecision(
6906                 Member, VF, CM_Scalarize,
6907                 (VF.getKnownMinValue() *
6908                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6909         }
6910       }
6911     } else
6912       // Make sure I gets scalarized and a cost estimate without
6913       // scalarization overhead.
6914       ForcedScalars[VF].insert(I);
6915   }
6916 }
6917 
6918 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6919                                                         ElementCount VF,
6920                                                         Type *&VectorTy) {
6921   Type *RetTy = I->getType();
6922   if (canTruncateToMinimalBitwidth(I, VF))
6923     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6924   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6925   auto SE = PSE.getSE();
6926   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6927 
6928   // TODO: We need to estimate the cost of intrinsic calls.
6929   switch (I->getOpcode()) {
6930   case Instruction::GetElementPtr:
6931     // We mark this instruction as zero-cost because the cost of GEPs in
6932     // vectorized code depends on whether the corresponding memory instruction
6933     // is scalarized or not. Therefore, we handle GEPs with the memory
6934     // instruction cost.
6935     return 0;
6936   case Instruction::Br: {
6937     // In cases of scalarized and predicated instructions, there will be VF
6938     // predicated blocks in the vectorized loop. Each branch around these
6939     // blocks also requires an extract of its vector compare i1 element.
6940     bool ScalarPredicatedBB = false;
6941     BranchInst *BI = cast<BranchInst>(I);
6942     if (VF.isVector() && BI->isConditional() &&
6943         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6944          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6945       ScalarPredicatedBB = true;
6946 
6947     if (ScalarPredicatedBB) {
6948       // Return cost for branches around scalarized and predicated blocks.
6949       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6950       auto *Vec_i1Ty =
6951           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6952       return (TTI.getScalarizationOverhead(
6953                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6954                   false, true) +
6955               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6956                VF.getKnownMinValue()));
6957     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6958       // The back-edge branch will remain, as will all scalar branches.
6959       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6960     else
6961       // This branch will be eliminated by if-conversion.
6962       return 0;
6963     // Note: We currently assume zero cost for an unconditional branch inside
6964     // a predicated block since it will become a fall-through, although we
6965     // may decide in the future to call TTI for all branches.
6966   }
6967   case Instruction::PHI: {
6968     auto *Phi = cast<PHINode>(I);
6969 
6970     // First-order recurrences are replaced by vector shuffles inside the loop.
6971     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6972     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6973       return TTI.getShuffleCost(
6974           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6975           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6976 
6977     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6978     // converted into select instructions. We require N - 1 selects per phi
6979     // node, where N is the number of incoming values.
6980     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6981       return (Phi->getNumIncomingValues() - 1) *
6982              TTI.getCmpSelInstrCost(
6983                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6984                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6985                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6986 
6987     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6988   }
6989   case Instruction::UDiv:
6990   case Instruction::SDiv:
6991   case Instruction::URem:
6992   case Instruction::SRem:
6993     // If we have a predicated instruction, it may not be executed for each
6994     // vector lane. Get the scalarization cost and scale this amount by the
6995     // probability of executing the predicated block. If the instruction is not
6996     // predicated, we fall through to the next case.
6997     if (VF.isVector() && isScalarWithPredication(I)) {
6998       unsigned Cost = 0;
6999 
7000       // These instructions have a non-void type, so account for the phi nodes
7001       // that we will create. This cost is likely to be zero. The phi node
7002       // cost, if any, should be scaled by the block probability because it
7003       // models a copy at the end of each predicated block.
7004       Cost += VF.getKnownMinValue() *
7005               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7006 
7007       // The cost of the non-predicated instruction.
7008       Cost += VF.getKnownMinValue() *
7009               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7010 
7011       // The cost of insertelement and extractelement instructions needed for
7012       // scalarization.
7013       Cost += getScalarizationOverhead(I, VF);
7014 
7015       // Scale the cost by the probability of executing the predicated blocks.
7016       // This assumes the predicated block for each vector lane is equally
7017       // likely.
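           // Illustrative example: with VF = 4, if the scalarized divides, their
           // phis and the insert/extract overhead sum to cost C, the value
           // returned is C divided by the reciprocal block probability (assuming
           // getReciprocalPredBlockProb() keeps its usual value of 2, i.e. a
           // ~50% execution estimate per predicated block).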
7018       return Cost / getReciprocalPredBlockProb();
7019     }
7020     LLVM_FALLTHROUGH;
7021   case Instruction::Add:
7022   case Instruction::FAdd:
7023   case Instruction::Sub:
7024   case Instruction::FSub:
7025   case Instruction::Mul:
7026   case Instruction::FMul:
7027   case Instruction::FDiv:
7028   case Instruction::FRem:
7029   case Instruction::Shl:
7030   case Instruction::LShr:
7031   case Instruction::AShr:
7032   case Instruction::And:
7033   case Instruction::Or:
7034   case Instruction::Xor: {
7035     // Since we will replace the stride by 1, the multiplication should go away.
7036     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7037       return 0;
7038     // Certain instructions can be cheaper to vectorize if they have a constant
7039     // second vector operand. One example of this is shifts on x86.
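         // For example, a shift whose second operand is loop invariant (or a
         // constant splat such as "shl <4 x i32> %x, <i32 3, ...>") can often be
         // lowered to an immediate/uniform-shift form, which is why a
         // loop-invariant second operand is reported as OK_UniformValue below.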
7040     Value *Op2 = I->getOperand(1);
7041     TargetTransformInfo::OperandValueProperties Op2VP;
7042     TargetTransformInfo::OperandValueKind Op2VK =
7043         TTI.getOperandInfo(Op2, Op2VP);
7044     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7045       Op2VK = TargetTransformInfo::OK_UniformValue;
7046 
7047     SmallVector<const Value *, 4> Operands(I->operand_values());
7048     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7049     return N * TTI.getArithmeticInstrCost(
7050                    I->getOpcode(), VectorTy, CostKind,
7051                    TargetTransformInfo::OK_AnyValue,
7052                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7053   }
7054   case Instruction::FNeg: {
7055     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7056     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7057     return N * TTI.getArithmeticInstrCost(
7058                    I->getOpcode(), VectorTy, CostKind,
7059                    TargetTransformInfo::OK_AnyValue,
7060                    TargetTransformInfo::OK_AnyValue,
7061                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7062                    I->getOperand(0), I);
7063   }
7064   case Instruction::Select: {
7065     SelectInst *SI = cast<SelectInst>(I);
7066     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7067     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7068     Type *CondTy = SI->getCondition()->getType();
7069     if (!ScalarCond) {
7070       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7071       CondTy = VectorType::get(CondTy, VF);
7072     }
7073     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7074                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7075   }
7076   case Instruction::ICmp:
7077   case Instruction::FCmp: {
7078     Type *ValTy = I->getOperand(0)->getType();
7079     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7080     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7081       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7082     VectorTy = ToVectorTy(ValTy, VF);
7083     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7084                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7085   }
7086   case Instruction::Store:
7087   case Instruction::Load: {
7088     ElementCount Width = VF;
7089     if (Width.isVector()) {
7090       InstWidening Decision = getWideningDecision(I, Width);
7091       assert(Decision != CM_Unknown &&
7092              "CM decision should be taken at this point");
7093       if (Decision == CM_Scalarize)
7094         Width = ElementCount::getFixed(1);
7095     }
7096     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7097     return getMemoryInstructionCost(I, VF);
7098   }
7099   case Instruction::ZExt:
7100   case Instruction::SExt:
7101   case Instruction::FPToUI:
7102   case Instruction::FPToSI:
7103   case Instruction::FPExt:
7104   case Instruction::PtrToInt:
7105   case Instruction::IntToPtr:
7106   case Instruction::SIToFP:
7107   case Instruction::UIToFP:
7108   case Instruction::Trunc:
7109   case Instruction::FPTrunc:
7110   case Instruction::BitCast: {
7111     // Computes the CastContextHint from a Load/Store instruction.
7112     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7113       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7114              "Expected a load or a store!");
7115 
7116       if (VF.isScalar() || !TheLoop->contains(I))
7117         return TTI::CastContextHint::Normal;
7118 
7119       switch (getWideningDecision(I, VF)) {
7120       case LoopVectorizationCostModel::CM_GatherScatter:
7121         return TTI::CastContextHint::GatherScatter;
7122       case LoopVectorizationCostModel::CM_Interleave:
7123         return TTI::CastContextHint::Interleave;
7124       case LoopVectorizationCostModel::CM_Scalarize:
7125       case LoopVectorizationCostModel::CM_Widen:
7126         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7127                                         : TTI::CastContextHint::Normal;
7128       case LoopVectorizationCostModel::CM_Widen_Reverse:
7129         return TTI::CastContextHint::Reversed;
7130       case LoopVectorizationCostModel::CM_Unknown:
7131         llvm_unreachable("Instr did not go through cost modelling?");
7132       }
7133 
7134       llvm_unreachable("Unhandled case!");
7135     };
7136 
7137     unsigned Opcode = I->getOpcode();
7138     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7139     // For Trunc/FPTrunc, the context is the single user, which must be a StoreInst.
7140     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7141       if (I->hasOneUse())
7142         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7143           CCH = ComputeCCH(Store);
7144     }
7145     // For ZExt/SExt/FPExt, the context is the operand, which must be a LoadInst.
7146     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7147              Opcode == Instruction::FPExt) {
7148       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7149         CCH = ComputeCCH(Load);
7150     }
7151 
7152     // We optimize the truncation of induction variables having constant
7153     // integer steps. The cost of these truncations is the same as the scalar
7154     // operation.
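         // For example (illustrative), "trunc i64 %iv to i32" where %iv has a
         // constant step simply becomes a narrower i32 induction, so only the
         // scalar trunc cost is charged here.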
7155     if (isOptimizableIVTruncate(I, VF)) {
7156       auto *Trunc = cast<TruncInst>(I);
7157       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7158                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7159     }
7160 
7161     Type *SrcScalarTy = I->getOperand(0)->getType();
7162     Type *SrcVecTy =
7163         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7164     if (canTruncateToMinimalBitwidth(I, VF)) {
7165       // This cast is going to be shrunk. This may remove the cast or it might
7166       // turn it into slightly different cast. For example, if MinBW == 16,
7167       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7168       //
7169       // Calculate the modified src and dest types.
7170       Type *MinVecTy = VectorTy;
7171       if (Opcode == Instruction::Trunc) {
7172         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7173         VectorTy =
7174             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7175       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7176         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7177         VectorTy =
7178             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7179       }
7180     }
7181 
7182     assert(!VF.isScalable() && "VF is assumed to be non scalable");
7183     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7184     return N *
7185            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7186   }
7187   case Instruction::Call: {
7188     bool NeedToScalarize;
7189     CallInst *CI = cast<CallInst>(I);
7190     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7191     if (getVectorIntrinsicIDForCall(CI, TLI))
7192       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
7193     return CallCost;
7194   }
7195   case Instruction::ExtractValue:
7196     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7197   default:
7198     // The cost of executing VF copies of the scalar instruction. This opcode
7199     // is unknown. Assume that it is the same as 'mul'.
7200     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7201                                        Instruction::Mul, VectorTy, CostKind) +
7202            getScalarizationOverhead(I, VF);
7203   } // end of switch.
7204 }
7205 
7206 char LoopVectorize::ID = 0;
7207 
7208 static const char lv_name[] = "Loop Vectorization";
7209 
7210 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7211 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7212 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7213 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7214 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7215 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7216 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7217 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7218 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7219 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7220 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7221 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7222 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7223 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7224 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7225 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7226 
7227 namespace llvm {
7228 
7229 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7230 
7231 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7232                               bool VectorizeOnlyWhenForced) {
7233   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7234 }
7235 
7236 } // end namespace llvm
7237 
7238 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7239   // Check if the pointer operand of a load or store instruction is
7240   // consecutive.
7241   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7242     return Legal->isConsecutivePtr(Ptr);
7243   return false;
7244 }
7245 
7246 void LoopVectorizationCostModel::collectValuesToIgnore() {
7247   // Ignore ephemeral values.
7248   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7249 
7250   // Ignore type-promoting instructions we identified during reduction
7251   // detection.
7252   for (auto &Reduction : Legal->getReductionVars()) {
7253     RecurrenceDescriptor &RedDes = Reduction.second;
7254     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7255     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7256   }
7257   // Ignore type-casting instructions we identified during induction
7258   // detection.
7259   for (auto &Induction : Legal->getInductionVars()) {
7260     InductionDescriptor &IndDes = Induction.second;
7261     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7262     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7263   }
7264 }
7265 
7266 void LoopVectorizationCostModel::collectInLoopReductions() {
7267   for (auto &Reduction : Legal->getReductionVars()) {
7268     PHINode *Phi = Reduction.first;
7269     RecurrenceDescriptor &RdxDesc = Reduction.second;
7270 
7271     // We don't collect reductions that are type promoted (yet).
7272     if (RdxDesc.getRecurrenceType() != Phi->getType())
7273       continue;
7274 
7275     // If the target would prefer this reduction to happen "in-loop", then we
7276     // want to record it as such.
7277     unsigned Opcode = RdxDesc.getRecurrenceBinOp();
7278     if (!PreferInLoopReductions &&
7279         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7280                                    TargetTransformInfo::ReductionFlags()))
7281       continue;
7282 
7283     // Check that we can correctly put the reductions into the loop, by
7284     // finding the chain of operations that leads from the phi to the loop
7285     // exit value.
7286     SmallVector<Instruction *, 4> ReductionOperations =
7287         RdxDesc.getReductionOpChain(Phi, TheLoop);
7288     bool InLoop = !ReductionOperations.empty();
7289     if (InLoop)
7290       InLoopReductionChains[Phi] = ReductionOperations;
7291     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7292                       << " reduction for phi: " << *Phi << "\n");
7293   }
7294 }
7295 
7296 // TODO: we could return a pair of values that specify the max VF and
7297 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7298 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7299 // doesn't have a cost model that can choose which plan to execute if
7300 // more than one is generated.
7301 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7302                                  LoopVectorizationCostModel &CM) {
7303   unsigned WidestType;
7304   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
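       // For example, 256-bit wide vector registers and a widest scalar type of
       // 32 bits yield a VF of 8.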
7305   return WidestVectorRegBits / WidestType;
7306 }
7307 
7308 VectorizationFactor
7309 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7310   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7311   ElementCount VF = UserVF;
7312   // Outer loop handling: They may require CFG and instruction level
7313   // transformations before even evaluating whether vectorization is profitable.
7314   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7315   // the vectorization pipeline.
7316   if (!OrigLoop->isInnermost()) {
7317     // If the user doesn't provide a vectorization factor, determine a
7318     // reasonable one.
7319     if (UserVF.isZero()) {
7320       VF = ElementCount::getFixed(
7321           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7322       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7323 
7324       // Make sure we have a VF > 1 for stress testing.
7325       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7326         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7327                           << "overriding computed VF.\n");
7328         VF = ElementCount::getFixed(4);
7329       }
7330     }
7331     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7332     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7333            "VF needs to be a power of two");
7334     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7335                       << "VF " << VF << " to build VPlans.\n");
7336     buildVPlans(VF, VF);
7337 
7338     // For VPlan build stress testing, we bail out after VPlan construction.
7339     if (VPlanBuildStressTest)
7340       return VectorizationFactor::Disabled();
7341 
7342     return {VF, 0 /*Cost*/};
7343   }
7344 
7345   LLVM_DEBUG(
7346       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7347                 "VPlan-native path.\n");
7348   return VectorizationFactor::Disabled();
7349 }
7350 
7351 Optional<VectorizationFactor>
7352 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7353   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7354   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7355   if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
7356     return None;
7357 
7358   // Invalidate interleave groups if all blocks of the loop will be predicated.
7359   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7360       !useMaskedInterleavedAccesses(*TTI)) {
7361     LLVM_DEBUG(
7362         dbgs()
7363         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7364            "which requires masked-interleaved support.\n");
7365     if (CM.InterleaveInfo.invalidateGroups())
7366       // Invalidating interleave groups also requires invalidating all decisions
7367       // based on them, which includes widening decisions and uniform and scalar
7368       // values.
7369       CM.invalidateCostModelingDecisions();
7370   }
7371 
7372   ElementCount MaxVF = MaybeMaxVF.getValue();
7373   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7374 
7375   if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
7376     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7377     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7378            "VF needs to be a power of two");
7379     // Collect the instructions (and their associated costs) that will be more
7380     // profitable to scalarize.
7381     CM.selectUserVectorizationFactor(UserVF);
7382     CM.collectInLoopReductions();
7383     buildVPlansWithVPRecipes(UserVF, UserVF);
7384     LLVM_DEBUG(printPlans(dbgs()));
7385     return {{UserVF, 0}};
7386   }
7387 
7388   assert(!MaxVF.isScalable() &&
7389          "Scalable vectors not yet supported beyond this point");
7390 
7391   for (ElementCount VF = ElementCount::getFixed(1);
7392        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7393     // Collect Uniform and Scalar instructions after vectorization with VF.
7394     CM.collectUniformsAndScalars(VF);
7395 
7396     // Collect the instructions (and their associated costs) that will be more
7397     // profitable to scalarize.
7398     if (VF.isVector())
7399       CM.collectInstsToScalarize(VF);
7400   }
7401 
7402   CM.collectInLoopReductions();
7403 
7404   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7405   LLVM_DEBUG(printPlans(dbgs()));
7406   if (MaxVF.isScalar())
7407     return VectorizationFactor::Disabled();
7408 
7409   // Select the optimal vectorization factor.
7410   return CM.selectVectorizationFactor(MaxVF);
7411 }
7412 
7413 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7414   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7415                     << '\n');
7416   BestVF = VF;
7417   BestUF = UF;
7418 
7419   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7420     return !Plan->hasVF(VF);
7421   });
7422   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
7423 }
7424 
7425 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7426                                            DominatorTree *DT) {
7427   // Perform the actual loop transformation.
7428 
7429   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7430   VPCallbackILV CallbackILV(ILV);
7431 
7432   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7433 
7434   VPTransformState State{*BestVF, BestUF,      LI,
7435                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7436                          &ILV,    CallbackILV};
7437   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7438   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7439   State.CanonicalIV = ILV.Induction;
7440 
7441   ILV.printDebugTracesAtStart();
7442 
7443   //===------------------------------------------------===//
7444   //
7445   // Notice: any optimization or new instruction that goes
7446   // into the code below should also be implemented in
7447   // the cost model.
7448   //
7449   //===------------------------------------------------===//
7450 
7451   // 2. Copy and widen instructions from the old loop into the new loop.
7452   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7453   VPlans.front()->execute(&State);
7454 
7455   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7456   //    predication, updating analyses.
7457   ILV.fixVectorizedLoop();
7458 
7459   ILV.printDebugTracesAtEnd();
7460 }
7461 
7462 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7463     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7464   BasicBlock *Latch = OrigLoop->getLoopLatch();
7465 
7466   // We create new control-flow for the vectorized loop, so the original
7467   // condition will be dead after vectorization if it's only used by the
7468   // branch.
7469   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7470   if (Cmp && Cmp->hasOneUse()) {
7471     DeadInstructions.insert(Cmp);
7472 
7473     // An operand of the icmp is often a dead trunc, used by IndUpdate.
7474     for (Value *Op : Cmp->operands()) {
7475       if (isa<TruncInst>(Op) && Op->hasOneUse())
7476           DeadInstructions.insert(cast<Instruction>(Op));
7477     }
7478   }
7479 
7480   // We create new "steps" for induction variable updates to which the original
7481   // induction variables map. An original update instruction will be dead if
7482   // all its users except the induction variable are dead.
7483   for (auto &Induction : Legal->getInductionVars()) {
7484     PHINode *Ind = Induction.first;
7485     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7486 
7487     // If the tail is to be folded by masking, the primary induction variable,
7488     // if exists, isn't dead: it will be used for masking. Don't kill it.
7489     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7490       continue;
7491 
7492     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7493           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7494         }))
7495       DeadInstructions.insert(IndUpdate);
7496 
7497     // We also record as "Dead" the type-casting instructions we had identified
7498     // during induction analysis. We don't need any handling for them in the
7499     // vectorized loop because we have proven that, under a proper runtime
7500     // test guarding the vectorized loop, the value of the phi, and the casted
7501     // value of the phi, are the same. The last instruction in this casting chain
7502     // will get its scalar/vector/widened def from the scalar/vector/widened def
7503     // of the respective phi node. Any other casts in the induction def-use chain
7504     // have no other uses outside the phi update chain, and will be ignored.
7505     InductionDescriptor &IndDes = Induction.second;
7506     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7507     DeadInstructions.insert(Casts.begin(), Casts.end());
7508   }
7509 }
7510 
7511 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7512 
7513 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7514 
7515 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7516                                         Instruction::BinaryOps BinOp) {
7517   // When unrolling and the VF is 1, we only need to add a simple scalar.
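       // Illustrative sketch: with UF = 3 the unrolled parts receive
       // Val + 0 * Step, Val + 1 * Step and Val + 2 * Step from this helper.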
7518   Type *Ty = Val->getType();
7519   assert(!Ty->isVectorTy() && "Val must be a scalar");
7520 
7521   if (Ty->isFloatingPointTy()) {
7522     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7523 
7524     // Floating point operations had to be 'fast' to enable the unrolling.
7525     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7526     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7527   }
7528   Constant *C = ConstantInt::get(Ty, StartIdx);
7529   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7530 }
7531 
7532 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7533   SmallVector<Metadata *, 4> MDs;
7534   // Reserve first location for self reference to the LoopID metadata node.
7535   MDs.push_back(nullptr);
7536   bool IsUnrollMetadata = false;
7537   MDNode *LoopID = L->getLoopID();
7538   if (LoopID) {
7539     // First find existing loop unrolling disable metadata.
7540     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7541       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7542       if (MD) {
7543         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7544         IsUnrollMetadata =
7545             S && S->getString().startswith("llvm.loop.unroll.disable");
7546       }
7547       MDs.push_back(LoopID->getOperand(i));
7548     }
7549   }
7550 
7551   if (!IsUnrollMetadata) {
7552     // Add runtime unroll disable metadata.
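         // The resulting loop metadata has the (illustrative) form:
         //   !0 = distinct !{!0, ..., !1}   ; self-referential loop ID
         //   !1 = !{!"llvm.loop.unroll.runtime.disable"}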
7553     LLVMContext &Context = L->getHeader()->getContext();
7554     SmallVector<Metadata *, 1> DisableOperands;
7555     DisableOperands.push_back(
7556         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7557     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7558     MDs.push_back(DisableNode);
7559     MDNode *NewLoopID = MDNode::get(Context, MDs);
7560     // Set operand 0 to refer to the loop id itself.
7561     NewLoopID->replaceOperandWith(0, NewLoopID);
7562     L->setLoopID(NewLoopID);
7563   }
7564 }
7565 
7566 //===--------------------------------------------------------------------===//
7567 // EpilogueVectorizerMainLoop
7568 //===--------------------------------------------------------------------===//
7569 
7570 /// This function is partially responsible for generating the control flow
7571 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7572 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7573   MDNode *OrigLoopID = OrigLoop->getLoopID();
7574   Loop *Lp = createVectorLoopSkeleton("");
7575 
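       // Rough sketch of the checks created by this first pass (block names as
       // set below): iter.check -> [SCEV checks] -> [memory checks] ->
       // vector.main.loop.iter.check -> vector.ph, with failing checks
       // branching to the scalar preheader.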
7576   // Generate the code to check the minimum iteration count of the vector
7577   // epilogue (see below).
7578   EPI.EpilogueIterationCountCheck =
7579       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7580   EPI.EpilogueIterationCountCheck->setName("iter.check");
7581 
7582   // Generate the code to check any assumptions that we've made for SCEV
7583   // expressions.
7584   BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7585   emitSCEVChecks(Lp, LoopScalarPreHeader);
7586 
7587   // If a safety check was generated, save it.
7588   if (SavedPreHeader != LoopVectorPreHeader)
7589     EPI.SCEVSafetyCheck = SavedPreHeader;
7590 
7591   // Generate the code that checks at runtime if arrays overlap. We put the
7592   // checks into a separate block to make the more common case of few elements
7593   // faster.
7594   SavedPreHeader = LoopVectorPreHeader;
7595   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7596 
7597   // If a safety check was generated, save/overwrite it.
7598   if (SavedPreHeader != LoopVectorPreHeader)
7599     EPI.MemSafetyCheck = SavedPreHeader;
7600 
7601   // Generate the iteration count check for the main loop, *after* the check
7602   // for the epilogue loop, so that the path-length is shorter for the case
7603   // that goes directly through the vector epilogue. The longer-path length for
7604   // the main loop is compensated for, by the gain from vectorizing the larger
7605   // trip count. Note: the branch will get updated later on when we vectorize
7606   // the epilogue.
7607   EPI.MainLoopIterationCountCheck =
7608       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7609 
7610   // Generate the induction variable.
7611   OldInduction = Legal->getPrimaryInduction();
7612   Type *IdxTy = Legal->getWidestInductionType();
7613   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7614   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7615   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7616   EPI.VectorTripCount = CountRoundDown;
7617   Induction =
7618       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7619                               getDebugLocFromInstOrOperands(OldInduction));
7620 
7621   // Skip creating induction resume values here; they will be created in the
7622   // second pass. If we created them here, they wouldn't be used anyway,
7623   // because the VPlan in the second pass still contains the inductions from
7624   // the original loop.
7625 
7626   return completeLoopSkeleton(Lp, OrigLoopID);
7627 }
7628 
7629 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7630   LLVM_DEBUG({
7631     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7632            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7633            << ", Main Loop UF:" << EPI.MainLoopUF
7634            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7635            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7636   });
7637 }
7638 
7639 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7640   DEBUG_WITH_TYPE(VerboseDebug, {
7641     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7642   });
7643 }
7644 
7645 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7646     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7647   assert(L && "Expected valid Loop.");
7648   assert(Bypass && "Expected valid bypass basic block.");
7649   unsigned VFactor =
7650       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7651   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7652   Value *Count = getOrCreateTripCount(L);
7653   // Reuse existing vector loop preheader for TC checks.
7654   // Note that new preheader block is generated for vector loop.
7655   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7656   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7657 
7658   // Generate code to check if the loop's trip count is less than VF * UF of the
7659   // main vector loop.
7660   auto P =
7661       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
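       // For example (illustrative), with VF = 8 and UF = 2 the bypass is taken
       // when Count < 16, or when Count <= 16 if a scalar epilogue is required,
       // so that at least one scalar iteration remains.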
7662 
7663   Value *CheckMinIters = Builder.CreateICmp(
7664       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
7665       "min.iters.check");
7666 
7667   if (!ForEpilogue)
7668     TCCheckBlock->setName("vector.main.loop.iter.check");
7669 
7670   // Create new preheader for vector loop.
7671   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7672                                    DT, LI, nullptr, "vector.ph");
7673 
7674   if (ForEpilogue) {
7675     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7676                                  DT->getNode(Bypass)->getIDom()) &&
7677            "TC check is expected to dominate Bypass");
7678 
7679     // Update dominator for Bypass & LoopExit.
7680     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7681     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7682 
7683     LoopBypassBlocks.push_back(TCCheckBlock);
7684 
7685     // Save the trip count so we don't have to regenerate it in the
7686     // vec.epilog.iter.check. This is safe to do because the trip count
7687     // generated here dominates the vector epilog iter check.
7688     EPI.TripCount = Count;
7689   }
7690 
7691   ReplaceInstWithInst(
7692       TCCheckBlock->getTerminator(),
7693       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7694 
7695   return TCCheckBlock;
7696 }
7697 
7698 //===--------------------------------------------------------------------===//
7699 // EpilogueVectorizerEpilogueLoop
7700 //===--------------------------------------------------------------------===//
7701 
7702 /// This function is partially responsible for generating the control flow
7703 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7704 BasicBlock *
7705 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7706   MDNode *OrigLoopID = OrigLoop->getLoopID();
7707   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7708 
7709   // Now, compare the remaining count and, if there aren't enough iterations to
7710   // execute the vectorized epilogue, skip to the scalar part.
7711   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7712   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7713   LoopVectorPreHeader =
7714       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7715                  LI, nullptr, "vec.epilog.ph");
7716   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7717                                           VecEpilogueIterationCountCheck);
7718 
7719   // Adjust the control flow taking the state info from the main loop
7720   // vectorization into account.
7721   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7722          "expected this to be saved from the previous pass.");
7723   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7724       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7725 
7726   DT->changeImmediateDominator(LoopVectorPreHeader,
7727                                EPI.MainLoopIterationCountCheck);
7728 
7729   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7730       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7731 
7732   if (EPI.SCEVSafetyCheck)
7733     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7734         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7735   if (EPI.MemSafetyCheck)
7736     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7737         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7738 
7739   DT->changeImmediateDominator(
7740       VecEpilogueIterationCountCheck,
7741       VecEpilogueIterationCountCheck->getSinglePredecessor());
7742 
7743   DT->changeImmediateDominator(LoopScalarPreHeader,
7744                                EPI.EpilogueIterationCountCheck);
7745   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
7746 
7747   // Keep track of bypass blocks, as they feed start values to the induction
7748   // phis in the scalar loop preheader.
7749   if (EPI.SCEVSafetyCheck)
7750     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7751   if (EPI.MemSafetyCheck)
7752     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7753   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7754 
7755   // Generate a resume induction for the vector epilogue and put it in the
7756   // vector epilogue preheader
7757   Type *IdxTy = Legal->getWidestInductionType();
7758   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7759                                          LoopVectorPreHeader->getFirstNonPHI());
7760   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7761   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7762                            EPI.MainLoopIterationCountCheck);
7763 
7764   // Generate the induction variable.
7765   OldInduction = Legal->getPrimaryInduction();
7766   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7767   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7768   Value *StartIdx = EPResumeVal;
7769   Induction =
7770       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7771                               getDebugLocFromInstOrOperands(OldInduction));
7772 
7773   // Generate induction resume values. These variables save the new starting
7774   // indexes for the scalar loop. They are used to test if there are any tail
7775   // iterations left once the vector loop has completed.
7776   // Note that when the vectorized epilogue is skipped due to iteration count
7777   // check, then the resume value for the induction variable comes from
7778   // the trip count of the main vector loop, hence passing the AdditionalBypass
7779   // argument.
7780   createInductionResumeValues(Lp, CountRoundDown,
7781                               {VecEpilogueIterationCountCheck,
7782                                EPI.VectorTripCount} /* AdditionalBypass */);
7783 
7784   AddRuntimeUnrollDisableMetaData(Lp);
7785   return completeLoopSkeleton(Lp, OrigLoopID);
7786 }
7787 
7788 BasicBlock *
7789 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7790     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7791 
7792   assert(EPI.TripCount &&
7793          "Expected trip count to have been saved in the first pass.");
7794   assert(
7795       (!isa<Instruction>(EPI.TripCount) ||
7796        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7797       "saved trip count does not dominate insertion point.");
7798   Value *TC = EPI.TripCount;
7799   IRBuilder<> Builder(Insert->getTerminator());
7800   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7801 
7802   // Generate code to check if the loop's trip count is less than VF * UF of the
7803   // vector epilogue loop.
7804   auto P =
7805       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7806 
7807   Value *CheckMinIters = Builder.CreateICmp(
7808       P, Count,
7809       ConstantInt::get(Count->getType(),
7810                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7811       "min.epilog.iters.check");
7812 
7813   ReplaceInstWithInst(
7814       Insert->getTerminator(),
7815       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7816 
7817   LoopBypassBlocks.push_back(Insert);
7818   return Insert;
7819 }
7820 
7821 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7822   LLVM_DEBUG({
7823     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7824            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7825            << ", Main Loop UF:" << EPI.MainLoopUF
7826            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7827            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7828   });
7829 }
7830 
7831 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7832   DEBUG_WITH_TYPE(VerboseDebug, {
7833     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
7834   });
7835 }
7836 
7837 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7838     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7839   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7840   bool PredicateAtRangeStart = Predicate(Range.Start);
7841 
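       // For example (illustrative), for Range = [4, 32) where the predicate
       // holds for 4 and 8 but not for 16, End is clamped to 16 and the result
       // for the start of the range (true) is returned.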
7842   for (ElementCount TmpVF = Range.Start * 2;
7843        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7844     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7845       Range.End = TmpVF;
7846       break;
7847     }
7848 
7849   return PredicateAtRangeStart;
7850 }
7851 
7852 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7853 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7854 /// of VF's starting at a given VF and extending it as much as possible. Each
7855 /// vectorization decision can potentially shorten this sub-range during
7856 /// buildVPlan().
7857 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7858                                            ElementCount MaxVF) {
7859   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7860   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7861     VFRange SubRange = {VF, MaxVFPlusOne};
7862     VPlans.push_back(buildVPlan(SubRange));
7863     VF = SubRange.End;
7864   }
7865 }
7866 
7867 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7868                                          VPlanPtr &Plan) {
7869   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7870 
7871   // Look for cached value.
7872   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7873   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7874   if (ECEntryIt != EdgeMaskCache.end())
7875     return ECEntryIt->second;
7876 
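       // Conceptually the mask being built is:
       //   EdgeMask(Src, Dst) = BlockMask(Src) & (Cond or !Cond, depending on
       //   which successor Dst is), with an all-one mask represented as null.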
7877   VPValue *SrcMask = createBlockInMask(Src, Plan);
7878 
7879   // The terminator has to be a branch inst!
7880   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7881   assert(BI && "Unexpected terminator found");
7882 
7883   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7884     return EdgeMaskCache[Edge] = SrcMask;
7885 
7886   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7887   assert(EdgeMask && "No Edge Mask found for condition");
7888 
7889   if (BI->getSuccessor(0) != Dst)
7890     EdgeMask = Builder.createNot(EdgeMask);
7891 
7892   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7893     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7894 
7895   return EdgeMaskCache[Edge] = EdgeMask;
7896 }
7897 
7898 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7899   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7900 
7901   // Look for cached value.
7902   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7903   if (BCEntryIt != BlockMaskCache.end())
7904     return BCEntryIt->second;
7905 
7906   // All-one mask is modelled as no-mask following the convention for masked
7907   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7908   VPValue *BlockMask = nullptr;
7909 
7910   if (OrigLoop->getHeader() == BB) {
7911     if (!CM.blockNeedsPredication(BB))
7912       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7913 
7914     // Create the block in mask as the first non-phi instruction in the block.
7915     VPBuilder::InsertPointGuard Guard(Builder);
7916     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7917     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7918 
7919     // Introduce the early-exit compare IV <= BTC to form header block mask.
7920     // This is used instead of IV < TC because TC may wrap, unlike BTC.
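         // For example (illustrative), with a trip count of 10 and VF = 4, BTC
         // is 9 and the last vector iteration compares lanes {8,9,10,11} <= 9,
         // masking off the two out-of-range lanes.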
7921     // Start by constructing the desired canonical IV.
7922     VPValue *IV = nullptr;
7923     if (Legal->getPrimaryInduction())
7924       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7925     else {
7926       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7927       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7928       IV = IVRecipe->getVPValue();
7929     }
7930     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7931     bool TailFolded = !CM.isScalarEpilogueAllowed();
7932 
7933     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7934       // While ActiveLaneMask is a binary op that consumes the loop tripcount
7935       // as a second argument, we only pass the IV here and extract the
7936       // tripcount from the transform state where codegen of the VP instructions
7937       // happens.
7938       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7939     } else {
7940       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7941     }
7942     return BlockMaskCache[BB] = BlockMask;
7943   }
7944 
7945   // This is the block mask. We OR all incoming edges.
7946   for (auto *Predecessor : predecessors(BB)) {
7947     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7948     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7949       return BlockMaskCache[BB] = EdgeMask;
7950 
7951     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7952       BlockMask = EdgeMask;
7953       continue;
7954     }
7955 
7956     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7957   }
7958 
7959   return BlockMaskCache[BB] = BlockMask;
7960 }
7961 
7962 VPWidenMemoryInstructionRecipe *
7963 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7964                                   VPlanPtr &Plan) {
7965   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7966          "Must be called with either a load or store");
7967 
7968   auto willWiden = [&](ElementCount VF) -> bool {
7969     if (VF.isScalar())
7970       return false;
7971     LoopVectorizationCostModel::InstWidening Decision =
7972         CM.getWideningDecision(I, VF);
7973     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7974            "CM decision should be taken at this point.");
7975     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7976       return true;
7977     if (CM.isScalarAfterVectorization(I, VF) ||
7978         CM.isProfitableToScalarize(I, VF))
7979       return false;
7980     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7981   };
7982 
7983   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7984     return nullptr;
7985 
7986   VPValue *Mask = nullptr;
7987   if (Legal->isMaskRequired(I))
7988     Mask = createBlockInMask(I->getParent(), Plan);
7989 
7990   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7991   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7992     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7993 
7994   StoreInst *Store = cast<StoreInst>(I);
7995   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7996   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7997 }
7998 
7999 VPWidenIntOrFpInductionRecipe *
8000 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
8001   // Check if this is an integer or fp induction. If so, build the recipe that
8002   // produces its scalar and vector values.
8003   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8004   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8005       II.getKind() == InductionDescriptor::IK_FpInduction)
8006     return new VPWidenIntOrFpInductionRecipe(Phi);
8007 
8008   return nullptr;
8009 }
8010 
8011 VPWidenIntOrFpInductionRecipe *
8012 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
8013                                                 VFRange &Range) const {
8014   // Optimize the special case where the source is a constant integer
8015   // induction variable. Notice that we can only optimize the 'trunc' case
8016   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8017   // (c) other casts depend on pointer size.
8018 
8019   // Determine whether \p K is a truncation based on an induction variable that
8020   // can be optimized.
8021   auto isOptimizableIVTruncate =
8022       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8023     return [=](ElementCount VF) -> bool {
8024       return CM.isOptimizableIVTruncate(K, VF);
8025     };
8026   };
8027 
8028   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8029           isOptimizableIVTruncate(I), Range))
8030     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8031                                              I);
8032   return nullptr;
8033 }
8034 
8035 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8036   // We know that all PHIs in non-header blocks are converted into selects, so
8037   // we don't have to worry about the insertion order and we can just use the
8038   // builder. At this point we generate the predication tree. There may be
8039   // duplications since this is a simple recursive scan, but future
8040   // optimizations will clean it up.
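       // For example, a phi with incoming values (v0 from B0, v1 from B1) yields
       // a blend recipe with operands (v0, mask(B0->phi-block), v1,
       // mask(B1->phi-block)), which is later lowered to vector selects.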
8041 
8042   SmallVector<VPValue *, 2> Operands;
8043   unsigned NumIncoming = Phi->getNumIncomingValues();
8044   for (unsigned In = 0; In < NumIncoming; In++) {
8045     VPValue *EdgeMask =
8046       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8047     assert((EdgeMask || NumIncoming == 1) &&
8048            "Multiple predecessors with one having a full mask");
8049     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8050     if (EdgeMask)
8051       Operands.push_back(EdgeMask);
8052   }
8053   return new VPBlendRecipe(Phi, Operands);
8054 }
8055 
8056 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8057                                                    VPlan &Plan) const {
8058 
8059   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8060       [this, CI](ElementCount VF) {
8061         return CM.isScalarWithPredication(CI, VF);
8062       },
8063       Range);
8064 
8065   if (IsPredicated)
8066     return nullptr;
8067 
8068   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8069   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8070              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8071              ID == Intrinsic::pseudoprobe))
8072     return nullptr;
8073 
8074   auto willWiden = [&](ElementCount VF) -> bool {
8075     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8076     // The following case may be scalarized depending on the VF.
8077     // NeedToScalarize reports whether a plain vector call would have to be
8078     // scalarized; the cost comparison below decides whether an intrinsic or a
8079     // regular call is the cheaper way to vectorize this instruction.
8080     bool NeedToScalarize = false;
8081     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8082     bool UseVectorIntrinsic =
8083         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
8084     return UseVectorIntrinsic || !NeedToScalarize;
8085   };
8086 
8087   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8088     return nullptr;
8089 
8090   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8091 }
8092 
8093 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8094   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8095          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8096   // Instruction should be widened, unless it is scalar after vectorization,
8097   // scalarization is profitable or it is predicated.
8098   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8099     return CM.isScalarAfterVectorization(I, VF) ||
8100            CM.isProfitableToScalarize(I, VF) ||
8101            CM.isScalarWithPredication(I, VF);
8102   };
8103   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8104                                                              Range);
8105 }
8106 
8107 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8108   auto IsVectorizableOpcode = [](unsigned Opcode) {
8109     switch (Opcode) {
8110     case Instruction::Add:
8111     case Instruction::And:
8112     case Instruction::AShr:
8113     case Instruction::BitCast:
8114     case Instruction::FAdd:
8115     case Instruction::FCmp:
8116     case Instruction::FDiv:
8117     case Instruction::FMul:
8118     case Instruction::FNeg:
8119     case Instruction::FPExt:
8120     case Instruction::FPToSI:
8121     case Instruction::FPToUI:
8122     case Instruction::FPTrunc:
8123     case Instruction::FRem:
8124     case Instruction::FSub:
8125     case Instruction::ICmp:
8126     case Instruction::IntToPtr:
8127     case Instruction::LShr:
8128     case Instruction::Mul:
8129     case Instruction::Or:
8130     case Instruction::PtrToInt:
8131     case Instruction::SDiv:
8132     case Instruction::Select:
8133     case Instruction::SExt:
8134     case Instruction::Shl:
8135     case Instruction::SIToFP:
8136     case Instruction::SRem:
8137     case Instruction::Sub:
8138     case Instruction::Trunc:
8139     case Instruction::UDiv:
8140     case Instruction::UIToFP:
8141     case Instruction::URem:
8142     case Instruction::Xor:
8143     case Instruction::ZExt:
8144       return true;
8145     }
8146     return false;
8147   };
8148 
8149   if (!IsVectorizableOpcode(I->getOpcode()))
8150     return nullptr;
8151 
8152   // Success: widen this instruction.
8153   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8154 }
8155 
8156 VPBasicBlock *VPRecipeBuilder::handleReplication(
8157     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8158     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8159     VPlanPtr &Plan) {
8160   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8161       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8162       Range);
8163 
8164   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8165       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8166       Range);
8167 
8168   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8169                                        IsUniform, IsPredicated);
8170   setRecipe(I, Recipe);
8171   Plan->addVPValue(I, Recipe);
8172 
8173   // Find if I uses a predicated instruction. If so, it will use its scalar
8174   // value. Avoid hoisting the insert-element which packs the scalar value into
8175   // a vector value, as that happens iff all users use the vector value.
8176   for (auto &Op : I->operands())
8177     if (auto *PredInst = dyn_cast<Instruction>(Op))
8178       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8179         PredInst2Recipe[PredInst]->setAlsoPack(false);
8180 
8181   // Finalize the recipe for Instr, first if it is not predicated.
8182   if (!IsPredicated) {
8183     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8184     VPBB->appendRecipe(Recipe);
8185     return VPBB;
8186   }
8187   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8188   assert(VPBB->getSuccessors().empty() &&
8189          "VPBB has successors when handling predicated replication.");
8190   // Record predicated instructions for above packing optimizations.
8191   PredInst2Recipe[I] = Recipe;
8192   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8193   VPBlockUtils::insertBlockAfter(Region, VPBB);
8194   auto *RegSucc = new VPBasicBlock();
8195   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8196   return RegSucc;
8197 }
8198 
8199 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8200                                                       VPRecipeBase *PredRecipe,
8201                                                       VPlanPtr &Plan) {
8202   // Instructions marked for predication are replicated and placed under an
8203   // if-then construct to prevent side-effects.
8204 
8205   // Generate recipes to compute the block mask for this region.
8206   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8207 
8208   // Build the triangular if-then region.
8209   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8210   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8211   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8212   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8213   auto *PHIRecipe = Instr->getType()->isVoidTy()
8214                         ? nullptr
8215                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8216   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8217   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8218   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8219 
8220   // Note: first set Entry as region entry and then connect successors starting
8221   // from it in order, to propagate the "parent" of each VPBasicBlock.
8222   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8223   VPBlockUtils::connectBlocks(Pred, Exit);
8224 
8225   return Region;
8226 }
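// A sketch of the triangular region built above for a hypothetical predicated
// sdiv (block names follow the RegionName scheme used in this function):
//
//   pred.sdiv.entry:              BRANCH-ON-MASK(BlockInMask)
//        |            \
//        |        pred.sdiv.if:   PredRecipe (the replicated, predicated sdiv)
//        |            /
//   pred.sdiv.continue:           PRED-INST-PHI merging the conditional value
//
// The entry block either branches into the ".if" block or goes directly to
// ".continue", matching the insertTwoBlocksAfter/connectBlocks calls above.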
8227 
8228 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8229                                                       VFRange &Range,
8230                                                       VPlanPtr &Plan) {
8231   // First, check for specific widening recipes that deal with calls, memory
8232   // operations, inductions and Phi nodes.
8233   if (auto *CI = dyn_cast<CallInst>(Instr))
8234     return tryToWidenCall(CI, Range, *Plan);
8235 
8236   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8237     return tryToWidenMemory(Instr, Range, Plan);
8238 
8239   VPRecipeBase *Recipe;
8240   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8241     if (Phi->getParent() != OrigLoop->getHeader())
8242       return tryToBlend(Phi, Plan);
8243     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
8244       return Recipe;
8245     return new VPWidenPHIRecipe(Phi);
8246   }
8247 
8248   if (isa<TruncInst>(Instr) &&
8249       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
8250     return Recipe;
8251 
8252   if (!shouldWiden(Instr, Range))
8253     return nullptr;
8254 
8255   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8256     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8257                                 OrigLoop);
8258 
8259   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8260     bool InvariantCond =
8261         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8262     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8263                                    InvariantCond);
8264   }
8265 
8266   return tryToWiden(Instr, *Plan);
8267 }
8268 
8269 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8270                                                         ElementCount MaxVF) {
8271   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8272 
8273   // Collect instructions from the original loop that will become trivially dead
8274   // in the vectorized loop. We don't need to vectorize these instructions. For
8275   // example, original induction update instructions can become dead because we
8276   // separately emit induction "steps" when generating code for the new loop.
8277   // Similarly, we create a new latch condition when setting up the structure
8278   // of the new loop, so the old one can become dead.
8279   SmallPtrSet<Instruction *, 4> DeadInstructions;
8280   collectTriviallyDeadInstructions(DeadInstructions);
8281 
8282   // Add assume instructions we need to drop to DeadInstructions, to prevent
8283   // them from being added to the VPlan.
8284   // TODO: We only need to drop assumes in blocks that get flattened. If the
8285   // control flow is preserved, we should keep them.
8286   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8287   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8288 
8289   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8290   // Dead instructions do not need sinking. Remove them from SinkAfter.
8291   for (Instruction *I : DeadInstructions)
8292     SinkAfter.erase(I);
8293 
8294   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8295   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8296     VFRange SubRange = {VF, MaxVFPlusOne};
8297     VPlans.push_back(
8298         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8299     VF = SubRange.End;
8300   }
8301 }
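// Illustrative example (hypothetical values): with MinVF = 2 and MaxVF = 8 the
// loop above starts from SubRange = [2, 9). If all per-VF decisions agree for
// VF = 2, 4 and 8, a single VPlan covers the whole range; if some decision
// changes at VF = 8, buildVPlanWithVPRecipes clamps SubRange.End to 8, the
// first VPlan covers {2,4}, and the next iteration builds a plan for [8, 9).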
8302 
8303 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8304     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8305     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8306 
8307   // Hold a mapping from predicated instructions to their recipes, in order to
8308   // fix their AlsoPack behavior if a user is determined to replicate and use a
8309   // scalar instead of vector value.
8310   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8311 
8312   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8313 
8314   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8315 
8316   // ---------------------------------------------------------------------------
8317   // Pre-construction: record ingredients whose recipes we'll need to further
8318   // process after constructing the initial VPlan.
8319   // ---------------------------------------------------------------------------
8320 
8321   // Mark instructions we'll need to sink later and their targets as
8322   // ingredients whose recipe we'll need to record.
8323   for (auto &Entry : SinkAfter) {
8324     RecipeBuilder.recordRecipeOf(Entry.first);
8325     RecipeBuilder.recordRecipeOf(Entry.second);
8326   }
8327   for (auto &Reduction : CM.getInLoopReductionChains()) {
8328     PHINode *Phi = Reduction.first;
8329     RecurrenceDescriptor::RecurrenceKind Kind =
8330         Legal->getReductionVars()[Phi].getRecurrenceKind();
8331     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8332 
8333     RecipeBuilder.recordRecipeOf(Phi);
8334     for (auto &R : ReductionOperations) {
8335       RecipeBuilder.recordRecipeOf(R);
8336       // For min/max reductions, where we have a pair of icmp/select, we also
8337       // need to record the ICmp recipe, so it can be removed later.
8338       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8339           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8340         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8341       }
8342     }
8343   }
8344 
8345   // For each interleave group which is relevant for this (possibly trimmed)
8346   // Range, add it to the set of groups to be later applied to the VPlan and add
8347   // placeholders for its members' Recipes which we'll be replacing with a
8348   // single VPInterleaveRecipe.
8349   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8350     auto applyIG = [IG, this](ElementCount VF) -> bool {
8351       return (VF.isVector() && // Query is illegal for VF == 1
8352               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8353                   LoopVectorizationCostModel::CM_Interleave);
8354     };
8355     if (!getDecisionAndClampRange(applyIG, Range))
8356       continue;
8357     InterleaveGroups.insert(IG);
8358     for (unsigned i = 0; i < IG->getFactor(); i++)
8359       if (Instruction *Member = IG->getMember(i))
8360         RecipeBuilder.recordRecipeOf(Member);
8361   };
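  // Illustrative example (loop not taken from this file): in
  //   for (i = 0; i < n; i++) sum += A[2*i] + A[2*i+1];
  // the two loads of A form a single interleave group with factor 2. If the
  // cost model chose CM_Interleave for the group's insert position, both
  // member loads are recorded here and later replaced by one
  // VPInterleaveRecipe.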
8362 
8363   // ---------------------------------------------------------------------------
8364   // Build initial VPlan: Scan the body of the loop in a topological order to
8365   // visit each basic block after having visited its predecessor basic blocks.
8366   // ---------------------------------------------------------------------------
8367 
8368   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8369   auto Plan = std::make_unique<VPlan>();
8370   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8371   Plan->setEntry(VPBB);
8372 
8373   // Scan the body of the loop in a topological order to visit each basic block
8374   // after having visited its predecessor basic blocks.
8375   LoopBlocksDFS DFS(OrigLoop);
8376   DFS.perform(LI);
8377 
8378   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8379     // Relevant instructions from basic block BB will be grouped into VPRecipe
8380     // ingredients and fill a new VPBasicBlock.
8381     unsigned VPBBsForBB = 0;
8382     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8383     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8384     VPBB = FirstVPBBForBB;
8385     Builder.setInsertPoint(VPBB);
8386 
8387     // Introduce each ingredient into VPlan.
8388     // TODO: Model and preserve debug intrinsics in VPlan.
8389     for (Instruction &I : BB->instructionsWithoutDebug()) {
8390       Instruction *Instr = &I;
8391 
8392       // First filter out irrelevant instructions, to ensure no recipes are
8393       // built for them.
8394       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8395         continue;
8396 
8397       if (auto Recipe =
8398               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8399         // Check if the recipe can be converted to a VPValue. We need the extra
8400         // down-casting step until VPRecipeBase inherits from VPValue.
8401         VPValue *MaybeVPValue = Recipe->toVPValue();
8402         if (!Instr->getType()->isVoidTy() && MaybeVPValue)
8403           Plan->addVPValue(Instr, MaybeVPValue);
8404 
8405         RecipeBuilder.setRecipe(Instr, Recipe);
8406         VPBB->appendRecipe(Recipe);
8407         continue;
8408       }
8409 
8410       // Otherwise, if all widening options failed, Instruction is to be
8411       // replicated. This may create a successor for VPBB.
8412       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8413           Instr, Range, VPBB, PredInst2Recipe, Plan);
8414       if (NextVPBB != VPBB) {
8415         VPBB = NextVPBB;
8416         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8417                                     : "");
8418       }
8419     }
8420   }
8421 
8422   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8423   // may also be empty, such as the last one (VPBB), reflecting original
8424   // basic blocks with no recipes.
8425   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8426   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8427   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8428   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8429   delete PreEntry;
8430 
8431   // ---------------------------------------------------------------------------
8432   // Transform initial VPlan: Apply previously taken decisions, in order, to
8433   // bring the VPlan to its final state.
8434   // ---------------------------------------------------------------------------
8435 
8436   // Apply Sink-After legal constraints.
8437   for (auto &Entry : SinkAfter) {
8438     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8439     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8440     Sink->moveAfter(Target);
8441   }
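  // Illustrative note: SinkAfter is typically populated for first-order
  // recurrences, e.g. (hypothetical source)
  //   for (i) { b[i] = a[i] * last; last = a[i] + 1; }
  // where the recipe of the multiply, which uses the recurrence phi 'last',
  // has to be moved after the recipe computing the phi's next value so the
  // vectorized recurrence can be formed.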
8442 
8443   // Interleave memory: for each Interleave Group we marked earlier as relevant
8444   // for this VPlan, replace the Recipes widening its memory instructions with a
8445   // single VPInterleaveRecipe at its insertion point.
8446   for (auto IG : InterleaveGroups) {
8447     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8448         RecipeBuilder.getRecipe(IG->getInsertPos()));
8449     SmallVector<VPValue *, 4> StoredValues;
8450     for (unsigned i = 0; i < IG->getFactor(); ++i)
8451       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8452         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8453 
8454     (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8455                             Recipe->getMask()))
8456         ->insertBefore(Recipe);
8457 
8458     for (unsigned i = 0; i < IG->getFactor(); ++i)
8459       if (Instruction *Member = IG->getMember(i)) {
8460         if (!Member->getType()->isVoidTy()) {
8461           VPValue *OriginalV = Plan->getVPValue(Member);
8462           Plan->removeVPValueFor(Member);
8463           OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member));
8464         }
8465         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8466       }
8467   }
8468 
8469   // Adjust the recipes for any inloop reductions.
8470   if (Range.Start.isVector())
8471     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8472 
8473   // Finally, if tail is folded by masking, introduce selects between the phi
8474   // and the live-out instruction of each reduction, at the end of the latch.
8475   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8476     Builder.setInsertPoint(VPBB);
8477     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8478     for (auto &Reduction : Legal->getReductionVars()) {
8479       if (CM.isInLoopReduction(Reduction.first))
8480         continue;
8481       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8482       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8483       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8484     }
8485   }
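  // Roughly, each select created above yields IR of the form (illustrative
  // names; %mask is the header's block-in mask):
  //   %rdx.sel = select <VF x i1> %mask, <VF x i32> %rdx.exit, <VF x i32> %rdx.phi
  // so that lanes disabled by tail folding keep the phi's value rather than a
  // partially updated reduction value.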
8486 
8487   std::string PlanName;
8488   raw_string_ostream RSO(PlanName);
8489   ElementCount VF = Range.Start;
8490   Plan->addVF(VF);
8491   RSO << "Initial VPlan for VF={" << VF;
8492   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8493     Plan->addVF(VF);
8494     RSO << "," << VF;
8495   }
8496   RSO << "},UF>=1";
8497   RSO.flush();
8498   Plan->setName(PlanName);
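  // E.g., for Range = [4, 16) the name built above is
  // "Initial VPlan for VF={4,8},UF>=1" (illustrative values).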
8499 
8500   return Plan;
8501 }
8502 
8503 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8504   // Outer loop handling: outer loops may require CFG and instruction level
8505   // transformations before even evaluating whether vectorization is profitable.
8506   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8507   // the vectorization pipeline.
8508   assert(!OrigLoop->isInnermost());
8509   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8510 
8511   // Create new empty VPlan
8512   auto Plan = std::make_unique<VPlan>();
8513 
8514   // Build hierarchical CFG
8515   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8516   HCFGBuilder.buildHierarchicalCFG();
8517 
8518   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8519        VF *= 2)
8520     Plan->addVF(VF);
8521 
8522   if (EnableVPlanPredication) {
8523     VPlanPredicator VPP(*Plan);
8524     VPP.predicate();
8525 
8526     // Avoid running transformation to recipes until masked code generation in
8527     // VPlan-native path is in place.
8528     return Plan;
8529   }
8530 
8531   SmallPtrSet<Instruction *, 1> DeadInstructions;
8532   VPlanTransforms::VPInstructionsToVPRecipes(
8533       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8534   return Plan;
8535 }
8536 
8537 // Adjust the recipes for any inloop reductions. The chain of instructions
8538 // leading from the loop exit instr to the phi needs to be converted to
8539 // reductions, with one operand being vector and the other being the scalar
8540 // reduction chain.
8541 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8542     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8543   for (auto &Reduction : CM.getInLoopReductionChains()) {
8544     PHINode *Phi = Reduction.first;
8545     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8546     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8547 
8548     // ReductionOperations are ordered top-down from the phi's use to the
8549     // LoopExitValue. We keep track of the previous item (the Chain) to tell
8550     // which of the two operands will remain scalar and which will be reduced.
8551     // For min/max the chain will be the select instructions.
8552     Instruction *Chain = Phi;
8553     for (Instruction *R : ReductionOperations) {
8554       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8555       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
8556 
8557       VPValue *ChainOp = Plan->getVPValue(Chain);
8558       unsigned FirstOpId;
8559       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8560           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8561         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8562                "Expected to replace a VPWidenSelectSC");
8563         FirstOpId = 1;
8564       } else {
8565         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8566                "Expected to replace a VPWidenSC");
8567         FirstOpId = 0;
8568       }
8569       unsigned VecOpId =
8570           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8571       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8572 
8573       auto *CondOp = CM.foldTailByMasking()
8574                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8575                          : nullptr;
8576       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8577           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
8578       WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe);
8579       Plan->removeVPValueFor(R);
8580       Plan->addVPValue(R, RedRecipe);
8581       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8582       WidenRecipe->eraseFromParent();
8583 
8584       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8585           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8586         VPRecipeBase *CompareRecipe =
8587             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8588         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8589                "Expected to replace a VPWidenSC");
8590         assert(CompareRecipe->toVPValue()->getNumUsers() == 0 &&
8591                "Expected no remaining users");
8592         CompareRecipe->eraseFromParent();
8593       }
8594       Chain = R;
8595     }
8596   }
8597 }
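// Illustrative example (hypothetical in-loop integer add reduction):
//   for (i) sum += a[i];
// The widened 'add' recipe for the reduction update is replaced above by a
// VPReductionRecipe whose ChainOp is the previous link of the scalar chain
// (initially the phi) and whose VecOp is the widened a[i]. For min/max chains
// the select recipe is replaced instead and its now-unused compare recipe is
// erased.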
8598 
8599 Value* LoopVectorizationPlanner::VPCallbackILV::
8600 getOrCreateVectorValues(Value *V, unsigned Part) {
8601   return ILV.getOrCreateVectorValue(V, Part);
8602 }
8603 
8604 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8605     Value *V, const VPIteration &Instance) {
8606   return ILV.getOrCreateScalarValue(V, Instance);
8607 }
8608 
8609 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8610                                VPSlotTracker &SlotTracker) const {
8611   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8612   IG->getInsertPos()->printAsOperand(O, false);
8613   O << ", ";
8614   getAddr()->printAsOperand(O, SlotTracker);
8615   VPValue *Mask = getMask();
8616   if (Mask) {
8617     O << ", ";
8618     Mask->printAsOperand(O, SlotTracker);
8619   }
8620   for (unsigned i = 0; i < IG->getFactor(); ++i)
8621     if (Instruction *I = IG->getMember(i))
8622       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8623 }
8624 
8625 void VPWidenCallRecipe::execute(VPTransformState &State) {
8626   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8627                                   *this, State);
8628 }
8629 
8630 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8631   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8632                                     this, *this, InvariantCond, State);
8633 }
8634 
8635 void VPWidenRecipe::execute(VPTransformState &State) {
8636   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8637 }
8638 
8639 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8640   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8641                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8642                       IsIndexLoopInvariant, State);
8643 }
8644 
8645 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8646   assert(!State.Instance && "Int or FP induction being replicated.");
8647   State.ILV->widenIntOrFpInduction(IV, Trunc);
8648 }
8649 
8650 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8651   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8652 }
8653 
8654 void VPBlendRecipe::execute(VPTransformState &State) {
8655   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8656   // We know that all PHIs in non-header blocks are converted into
8657   // selects, so we don't have to worry about the insertion order and we
8658   // can just use the builder.
8659   // At this point we generate the predication tree. There may be
8660   // duplications since this is a simple recursive scan, but future
8661   // optimizations will clean it up.
8662 
8663   unsigned NumIncoming = getNumIncomingValues();
8664 
8665   // Generate a sequence of selects of the form:
8666   // SELECT(Mask3, In3,
8667   //        SELECT(Mask2, In2,
8668   //               SELECT(Mask1, In1,
8669   //                      In0)))
8670   // Note that Mask0 is never used: lanes for which no path reaches this phi and
8671   // are essentially undef are taken from In0.
8672   InnerLoopVectorizer::VectorParts Entry(State.UF);
8673   for (unsigned In = 0; In < NumIncoming; ++In) {
8674     for (unsigned Part = 0; Part < State.UF; ++Part) {
8675       // We might have single edge PHIs (blocks) - use an identity
8676       // 'select' for the first PHI operand.
8677       Value *In0 = State.get(getIncomingValue(In), Part);
8678       if (In == 0)
8679         Entry[Part] = In0; // Initialize with the first incoming value.
8680       else {
8681         // Select between the current value and the previous incoming edge
8682         // based on the incoming mask.
8683         Value *Cond = State.get(getMask(In), Part);
8684         Entry[Part] =
8685             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8686       }
8687     }
8688   }
8689   for (unsigned Part = 0; Part < State.UF; ++Part)
8690     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8691 }
8692 
8693 void VPInterleaveRecipe::execute(VPTransformState &State) {
8694   assert(!State.Instance && "Interleave group being replicated.");
8695   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(),
8696                                       getMask());
8697 }
8698 
8699 void VPReductionRecipe::execute(VPTransformState &State) {
8700   assert(!State.Instance && "Reduction being replicated.");
8701   for (unsigned Part = 0; Part < State.UF; ++Part) {
8702     RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8703     Value *NewVecOp = State.get(getVecOp(), Part);
8704     if (VPValue *Cond = getCondOp()) {
8705       Value *NewCond = State.get(Cond, Part);
8706       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8707       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8708           Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8709       Constant *IdenVec =
8710           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8711       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8712       NewVecOp = Select;
8713     }
8714     Value *NewRed =
8715         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8716     Value *PrevInChain = State.get(getChainOp(), Part);
8717     Value *NextInChain;
8718     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8719         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8720       NextInChain =
8721           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8722                          NewRed, PrevInChain);
8723     } else {
8724       NextInChain = State.Builder.CreateBinOp(
8725           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8726           PrevInChain);
8727     }
8728     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8729   }
8730 }
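// Roughly, for a masked i32 add reduction with VF=4, the code generated above
// has the shape (illustrative IR; the reduction is emitted via
// createTargetReduction, and the identity for add is zero):
//   %sel  = select <4 x i1> %mask, <4 x i32> %vec.op, <4 x i32> zeroinitializer
//   %rdx  = <target add reduction of %sel to i32>
//   %next = add i32 %rdx, %prev.chain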
8731 
8732 void VPReplicateRecipe::execute(VPTransformState &State) {
8733   if (State.Instance) { // Generate a single instance.
8734     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
8735     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8736                                     *State.Instance, IsPredicated, State);
8737     // Insert scalar instance packing it into a vector.
8738     if (AlsoPack && State.VF.isVector()) {
8739       // If we're constructing lane 0, initialize to start from undef.
8740       if (State.Instance->Lane == 0) {
8741         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8742         Value *Undef = UndefValue::get(
8743             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8744         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8745                                       State.Instance->Part, Undef);
8746       }
8747       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8748                                            *State.Instance);
8749     }
8750     return;
8751   }
8752 
8753   // Generate scalar instances for all VF lanes of all UF parts, unless the
8754   // instruction is uniform, in which case generate only the first lane for each
8755   // of the UF parts.
8756   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8757   assert((!State.VF.isScalable() || IsUniform) &&
8758          "Can't scalarize a scalable vector");
8759   for (unsigned Part = 0; Part < State.UF; ++Part)
8760     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8761       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8762                                       IsPredicated, State);
8763 }
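// For example, with UF=2 and VF=4 a non-uniform replicated instruction is
// cloned eight times (one scalar copy per lane per part), while a uniform one
// is cloned only twice (lane 0 of each part), per the EndLane computation
// above.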
8764 
8765 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8766   assert(State.Instance && "Branch on Mask works only on single instance.");
8767 
8768   unsigned Part = State.Instance->Part;
8769   unsigned Lane = State.Instance->Lane;
8770 
8771   Value *ConditionBit = nullptr;
8772   VPValue *BlockInMask = getMask();
8773   if (BlockInMask) {
8774     ConditionBit = State.get(BlockInMask, Part);
8775     if (ConditionBit->getType()->isVectorTy())
8776       ConditionBit = State.Builder.CreateExtractElement(
8777           ConditionBit, State.Builder.getInt32(Lane));
8778   } else // Block in mask is all-one.
8779     ConditionBit = State.Builder.getTrue();
8780 
8781   // Replace the temporary unreachable terminator with a new conditional branch,
8782   // whose two destinations will be set later when they are created.
8783   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8784   assert(isa<UnreachableInst>(CurrentTerminator) &&
8785          "Expected to replace unreachable terminator with conditional branch.");
8786   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8787   CondBr->setSuccessor(0, nullptr);
8788   ReplaceInstWithInst(CurrentTerminator, CondBr);
8789 }
8790 
8791 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8792   assert(State.Instance && "Predicated instruction PHI works per instance.");
8793   Instruction *ScalarPredInst =
8794       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8795   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8796   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8797   assert(PredicatingBB && "Predicated block has no single predecessor.");
8798 
8799   // By current pack/unpack logic we need to generate only a single phi node: if
8800   // a vector value for the predicated instruction exists at this point it means
8801   // the instruction has vector users only, and a phi for the vector value is
8802   // needed. In this case the recipe of the predicated instruction is marked to
8803   // also do that packing, thereby "hoisting" the insert-element sequence.
8804   // Otherwise, a phi node for the scalar value is needed.
8805   unsigned Part = State.Instance->Part;
8806   Instruction *PredInst =
8807       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8808   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8809     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8810     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8811     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8812     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8813     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8814     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8815   } else {
8816     Type *PredInstType = PredInst->getType();
8817     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8818     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8819     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8820     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8821   }
8822 }
8823 
8824 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8825   Instruction *Instr = getUnderlyingInstr();
8826   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
8827   State.ILV->vectorizeMemoryInstruction(Instr, State,
8828                                         StoredValue ? nullptr : this, getAddr(),
8829                                         StoredValue, getMask());
8830 }
8831 
8832 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8833 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8834 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8835 // for predication.
8836 static ScalarEpilogueLowering getScalarEpilogueLowering(
8837     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8838     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8839     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8840     LoopVectorizationLegality &LVL) {
8841   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8842   // don't look at hints or options, and don't request a scalar epilogue.
8843   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8844   // LoopAccessInfo (due to code dependency and not being able to reliably get
8845   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8846   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8847   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8848   // back to the old way and vectorize with versioning when forced. See D81345.)
8849   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8850                                                       PGSOQueryType::IRPass) &&
8851                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8852     return CM_ScalarEpilogueNotAllowedOptSize;
8853 
8854   bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8855                               !PreferPredicateOverEpilogue;
8856 
8857   // 2) Next, if disabling predication is requested on the command line, honour
8858   // this and request a scalar epilogue.
8859   if (PredicateOptDisabled)
8860     return CM_ScalarEpilogueAllowed;
8861 
8862   // 3) and 4) look if enabling predication is requested on the command line,
8863   // with a loop hint, or if the TTI hook indicates this is profitable, request
8864   // predication.
8865   if (PreferPredicateOverEpilogue ||
8866       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8867       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8868                                         LVL.getLAI()) &&
8869        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8870     return CM_ScalarEpilogueNotNeededUsePredicate;
8871 
8872   return CM_ScalarEpilogueAllowed;
8873 }
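// Illustrative examples: a function compiled for minimum size (hasOptSize)
// falls into rule 1) and gets CM_ScalarEpilogueNotAllowedOptSize, while a loop
// annotated with
//   #pragma clang loop vectorize_predicate(enable)
// reaches rule 3) and gets CM_ScalarEpilogueNotNeededUsePredicate, unless
// predication was explicitly disabled on the command line (rule 2).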
8874 
8875 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8876                            unsigned Part) {
8877   set(Def, V, Part);
8878   ILV->setVectorValue(IRDef, Part, V);
8879 }
8880 
8881 // Process the loop in the VPlan-native vectorization path. This path builds
8882 // VPlan upfront in the vectorization pipeline, which allows applying
8883 // VPlan-to-VPlan transformations from the very beginning without modifying the
8884 // input LLVM IR.
8885 static bool processLoopInVPlanNativePath(
8886     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8887     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8888     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8889     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8890     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8891 
8892   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8893     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8894     return false;
8895   }
8896   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8897   Function *F = L->getHeader()->getParent();
8898   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8899 
8900   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8901       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8902 
8903   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8904                                 &Hints, IAI);
8905   // Use the planner for outer loop vectorization.
8906   // TODO: CM is not used at this point inside the planner. Turn CM into an
8907   // optional argument if we don't need it in the future.
8908   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8909 
8910   // Get user vectorization factor.
8911   ElementCount UserVF = Hints.getWidth();
8912 
8913   // Plan how to best vectorize, return the best VF and its cost.
8914   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8915 
8916   // If we are stress testing VPlan builds, do not attempt to generate vector
8917   // code. Masked vector code generation support will follow soon.
8918   // Also, do not attempt to vectorize if no vector code will be produced.
8919   if (VPlanBuildStressTest || EnableVPlanPredication ||
8920       VectorizationFactor::Disabled() == VF)
8921     return false;
8922 
8923   LVP.setBestPlan(VF.Width, 1);
8924 
8925   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8926                          &CM, BFI, PSI);
8927   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8928                     << L->getHeader()->getParent()->getName() << "\"\n");
8929   LVP.executePlan(LB, DT);
8930 
8931   // Mark the loop as already vectorized to avoid vectorizing again.
8932   Hints.setAlreadyVectorized();
8933 
8934   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8935   return true;
8936 }
8937 
8938 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8939     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8940                                !EnableLoopInterleaving),
8941       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8942                               !EnableLoopVectorization) {}
8943 
8944 bool LoopVectorizePass::processLoop(Loop *L) {
8945   assert((EnableVPlanNativePath || L->isInnermost()) &&
8946          "VPlan-native path is not enabled. Only process inner loops.");
8947 
8948 #ifndef NDEBUG
8949   const std::string DebugLocStr = getDebugLocString(L);
8950 #endif /* NDEBUG */
8951 
8952   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8953                     << L->getHeader()->getParent()->getName() << "\" from "
8954                     << DebugLocStr << "\n");
8955 
8956   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8957 
8958   LLVM_DEBUG(
8959       dbgs() << "LV: Loop hints:"
8960              << " force="
8961              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8962                      ? "disabled"
8963                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8964                             ? "enabled"
8965                             : "?"))
8966              << " width=" << Hints.getWidth()
8967              << " unroll=" << Hints.getInterleave() << "\n");
8968 
8969   // Function containing loop
8970   Function *F = L->getHeader()->getParent();
8971 
8972   // Looking at the diagnostic output is the only way to determine if a loop
8973   // was vectorized (other than looking at the IR or machine code), so it
8974   // is important to generate an optimization remark for each loop. Most of
8975   // these messages are generated as OptimizationRemarkAnalysis. Remarks
8976   // generated as OptimizationRemark and OptimizationRemarkMissed are
8977   // less verbose reporting vectorized loops and unvectorized loops that may
8978   // benefit from vectorization, respectively.
8979 
8980   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8981     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8982     return false;
8983   }
8984 
8985   PredicatedScalarEvolution PSE(*SE, *L);
8986 
8987   // Check if it is legal to vectorize the loop.
8988   LoopVectorizationRequirements Requirements(*ORE);
8989   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8990                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8991   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8992     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8993     Hints.emitRemarkWithHints();
8994     return false;
8995   }
8996 
8997   // Check the function attributes and profiles to find out if this function
8998   // should be optimized for size.
8999   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9000       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9001 
9002   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9003   // here. They may require CFG and instruction level transformations before
9004   // even evaluating whether vectorization is profitable. Since we cannot modify
9005   // the incoming IR, we need to build VPlan upfront in the vectorization
9006   // pipeline.
9007   if (!L->isInnermost())
9008     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9009                                         ORE, BFI, PSI, Hints);
9010 
9011   assert(L->isInnermost() && "Inner loop expected.");
9012 
9013   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9014   // count by optimizing for size, to minimize overheads.
9015   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9016   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9017     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9018                       << "This loop is worth vectorizing only if no scalar "
9019                       << "iteration overheads are incurred.");
9020     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9021       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9022     else {
9023       LLVM_DEBUG(dbgs() << "\n");
9024       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9025     }
9026   }
9027 
9028   // Check the function attributes to see if implicit floats are allowed.
9029   // FIXME: This check doesn't seem like it could possibly be correct -- what if
9030   // the loop is an integer loop and the vector instructions selected are purely
9031   // vector instructions?
9032   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9033     reportVectorizationFailure(
9034         "Can't vectorize when the NoImplicitFloat attribute is used",
9035         "loop not vectorized due to NoImplicitFloat attribute",
9036         "NoImplicitFloat", ORE, L);
9037     Hints.emitRemarkWithHints();
9038     return false;
9039   }
9040 
9041   // Check if the target supports potentially unsafe FP vectorization.
9042   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9043   // for the target we're vectorizing for, to make sure none of the
9044   // additional fp-math flags can help.
9045   if (Hints.isPotentiallyUnsafe() &&
9046       TTI->isFPVectorizationPotentiallyUnsafe()) {
9047     reportVectorizationFailure(
9048         "Potentially unsafe FP op prevents vectorization",
9049         "loop not vectorized due to unsafe FP support.",
9050         "UnsafeFP", ORE, L);
9051     Hints.emitRemarkWithHints();
9052     return false;
9053   }
9054 
9055   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9056   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9057 
9058   // If an override option has been passed in for interleaved accesses, use it.
9059   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9060     UseInterleaved = EnableInterleavedMemAccesses;
9061 
9062   // Analyze interleaved memory accesses.
9063   if (UseInterleaved) {
9064     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9065   }
9066 
9067   // Use the cost model.
9068   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9069                                 F, &Hints, IAI);
9070   CM.collectValuesToIgnore();
9071 
9072   // Use the planner for vectorization.
9073   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9074 
9075   // Get user vectorization factor and interleave count.
9076   ElementCount UserVF = Hints.getWidth();
9077   unsigned UserIC = Hints.getInterleave();
9078 
9079   // Plan how to best vectorize, return the best VF and its cost.
9080   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9081 
9082   VectorizationFactor VF = VectorizationFactor::Disabled();
9083   unsigned IC = 1;
9084 
9085   if (MaybeVF) {
9086     VF = *MaybeVF;
9087     // Select the interleave count.
9088     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9089   }
9090 
9091   // Identify the diagnostic messages that should be produced.
9092   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9093   bool VectorizeLoop = true, InterleaveLoop = true;
9094   if (Requirements.doesNotMeet(F, L, Hints)) {
9095     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9096                          "requirements.\n");
9097     Hints.emitRemarkWithHints();
9098     return false;
9099   }
9100 
9101   if (VF.Width.isScalar()) {
9102     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9103     VecDiagMsg = std::make_pair(
9104         "VectorizationNotBeneficial",
9105         "the cost-model indicates that vectorization is not beneficial");
9106     VectorizeLoop = false;
9107   }
9108 
9109   if (!MaybeVF && UserIC > 1) {
9110     // Tell the user interleaving was avoided up-front, despite being explicitly
9111     // requested.
9112     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9113                          "interleaving should be avoided up front\n");
9114     IntDiagMsg = std::make_pair(
9115         "InterleavingAvoided",
9116         "Ignoring UserIC, because interleaving was avoided up front");
9117     InterleaveLoop = false;
9118   } else if (IC == 1 && UserIC <= 1) {
9119     // Tell the user interleaving is not beneficial.
9120     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9121     IntDiagMsg = std::make_pair(
9122         "InterleavingNotBeneficial",
9123         "the cost-model indicates that interleaving is not beneficial");
9124     InterleaveLoop = false;
9125     if (UserIC == 1) {
9126       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9127       IntDiagMsg.second +=
9128           " and is explicitly disabled or interleave count is set to 1";
9129     }
9130   } else if (IC > 1 && UserIC == 1) {
9131     // Tell the user interleaving is beneficial, but it is explicitly disabled.
9132     LLVM_DEBUG(
9133         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9134     IntDiagMsg = std::make_pair(
9135         "InterleavingBeneficialButDisabled",
9136         "the cost-model indicates that interleaving is beneficial "
9137         "but is explicitly disabled or interleave count is set to 1");
9138     InterleaveLoop = false;
9139   }
9140 
9141   // Override IC if user provided an interleave count.
9142   IC = UserIC > 0 ? UserIC : IC;
9143 
9144   // Emit diagnostic messages, if any.
9145   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9146   if (!VectorizeLoop && !InterleaveLoop) {
9147     // Do not vectorize or interleave the loop.
9148     ORE->emit([&]() {
9149       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9150                                       L->getStartLoc(), L->getHeader())
9151              << VecDiagMsg.second;
9152     });
9153     ORE->emit([&]() {
9154       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9155                                       L->getStartLoc(), L->getHeader())
9156              << IntDiagMsg.second;
9157     });
9158     return false;
9159   } else if (!VectorizeLoop && InterleaveLoop) {
9160     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9161     ORE->emit([&]() {
9162       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9163                                         L->getStartLoc(), L->getHeader())
9164              << VecDiagMsg.second;
9165     });
9166   } else if (VectorizeLoop && !InterleaveLoop) {
9167     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9168                       << ") in " << DebugLocStr << '\n');
9169     ORE->emit([&]() {
9170       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9171                                         L->getStartLoc(), L->getHeader())
9172              << IntDiagMsg.second;
9173     });
9174   } else if (VectorizeLoop && InterleaveLoop) {
9175     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9176                       << ") in " << DebugLocStr << '\n');
9177     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9178   }
9179 
9180   LVP.setBestPlan(VF.Width, IC);
9181 
9182   using namespace ore;
9183   bool DisableRuntimeUnroll = false;
9184   MDNode *OrigLoopID = L->getLoopID();
9185 
9186   if (!VectorizeLoop) {
9187     assert(IC > 1 && "interleave count should not be 1 or 0");
9188     // If we decided that it is not legal to vectorize the loop, then
9189     // interleave it.
9190     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
9191                                BFI, PSI);
9192     LVP.executePlan(Unroller, DT);
9193 
9194     ORE->emit([&]() {
9195       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9196                                 L->getHeader())
9197              << "interleaved loop (interleaved count: "
9198              << NV("InterleaveCount", IC) << ")";
9199     });
9200   } else {
9201     // If we decided that it is *legal* to vectorize the loop, then do it.
9202 
9203     // Consider vectorizing the epilogue too if it's profitable.
9204     VectorizationFactor EpilogueVF =
9205       CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9206     if (EpilogueVF.Width.isVector()) {
9207 
9208       // The first pass vectorizes the main loop and creates a scalar epilogue
9209       // to be vectorized by executing the plan (potentially with a different
9210       // factor) again shortly afterwards.
9211       EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9212                                         EpilogueVF.Width.getKnownMinValue(), 1);
9213       EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
9214                                          &LVL, &CM, BFI, PSI);
9215 
9216       LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9217       LVP.executePlan(MainILV, DT);
9218       ++LoopsVectorized;
9219 
9220       simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9221       formLCSSARecursively(*L, *DT, LI, SE);
9222 
9223       // Second pass vectorizes the epilogue and adjusts the control flow
9224       // edges from the first pass.
9225       LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9226       EPI.MainLoopVF = EPI.EpilogueVF;
9227       EPI.MainLoopUF = EPI.EpilogueUF;
9228       EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9229                                                ORE, EPI, &LVL, &CM, BFI, PSI);
9230       LVP.executePlan(EpilogILV, DT);
9231       ++LoopsEpilogueVectorized;
9232 
9233       if (!MainILV.areSafetyChecksAdded())
9234         DisableRuntimeUnroll = true;
9235     } else {
9236       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9237                              &LVL, &CM, BFI, PSI);
9238       LVP.executePlan(LB, DT);
9239       ++LoopsVectorized;
9240 
9241       // Add metadata to disable runtime unrolling a scalar loop when there are
9242       // no runtime checks about strides and memory. A scalar loop that is
9243       // rarely used is not worth unrolling.
9244       if (!LB.areSafetyChecksAdded())
9245         DisableRuntimeUnroll = true;
9246     }
9247 
9248     // Report the vectorization decision.
9249     ORE->emit([&]() {
9250       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9251                                 L->getHeader())
9252              << "vectorized loop (vectorization width: "
9253              << NV("VectorizationFactor", VF.Width)
9254              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9255     });
9256   }
9257 
9258   Optional<MDNode *> RemainderLoopID =
9259       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9260                                       LLVMLoopVectorizeFollowupEpilogue});
9261   if (RemainderLoopID.hasValue()) {
9262     L->setLoopID(RemainderLoopID.getValue());
9263   } else {
9264     if (DisableRuntimeUnroll)
9265       AddRuntimeUnrollDisableMetaData(L);
9266 
9267     // Mark the loop as already vectorized to avoid vectorizing again.
9268     Hints.setAlreadyVectorized();
9269   }
9270 
9271   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9272   return true;
9273 }
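// Note (summary, not new behavior): Hints.setAlreadyVectorized() records the
// decision in the loop's !llvm.loop metadata (the "llvm.loop.isvectorized"
// hint), which is what makes later vectorizer runs skip this loop.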
9274 
9275 LoopVectorizeResult LoopVectorizePass::runImpl(
9276     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9277     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9278     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9279     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9280     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9281   SE = &SE_;
9282   LI = &LI_;
9283   TTI = &TTI_;
9284   DT = &DT_;
9285   BFI = &BFI_;
9286   TLI = TLI_;
9287   AA = &AA_;
9288   AC = &AC_;
9289   GetLAA = &GetLAA_;
9290   DB = &DB_;
9291   ORE = &ORE_;
9292   PSI = PSI_;
9293 
9294   // Don't attempt if
9295   // 1. the target claims to have no vector registers, and
9296   // 2. interleaving won't help ILP.
9297   //
9298   // The second condition is necessary because, even if the target has no
9299   // vector registers, loop vectorization may still enable scalar
9300   // interleaving.
9301   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9302       TTI->getMaxInterleaveFactor(1) < 2)
9303     return LoopVectorizeResult(false, false);
9304 
9305   bool Changed = false, CFGChanged = false;
9306 
9307   // The vectorizer requires loops to be in simplified form.
9308   // Since simplification may add new inner loops, it has to run before the
9309   // legality and profitability checks. This means running the loop vectorizer
9310   // will simplify all loops, regardless of whether anything ends up being
9311   // vectorized.
9312   for (auto &L : *LI)
9313     Changed |= CFGChanged |=
9314         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9315 
9316   // Build up a worklist of inner-loops to vectorize. This is necessary as
9317   // the act of vectorizing or partially unrolling a loop creates new loops
9318   // and can invalidate iterators across the loops.
9319   SmallVector<Loop *, 8> Worklist;
9320 
9321   for (Loop *L : *LI)
9322     collectSupportedLoops(*L, LI, ORE, Worklist);
9323 
9324   LoopsAnalyzed += Worklist.size();
9325 
9326   // Now walk the identified inner loops.
9327   while (!Worklist.empty()) {
9328     Loop *L = Worklist.pop_back_val();
9329 
9330     // For the inner loops we actually process, form LCSSA to simplify the
9331     // transform.
9332     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9333 
9334     Changed |= CFGChanged |= processLoop(L);
9335   }
9336 
9337   // Process each loop nest in the function.
9338   return LoopVectorizeResult(Changed, CFGChanged);
9339 }
9340 
9341 PreservedAnalyses LoopVectorizePass::run(Function &F,
9342                                          FunctionAnalysisManager &AM) {
9343     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
9344     auto &LI = AM.getResult<LoopAnalysis>(F);
9345     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
9346     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
9347     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
9348     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
9349     auto &AA = AM.getResult<AAManager>(F);
9350     auto &AC = AM.getResult<AssumptionAnalysis>(F);
9351     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
9352     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
9353     MemorySSA *MSSA = EnableMSSALoopDependency
9354                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
9355                           : nullptr;
9356 
9357     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
9358     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
9359         [&](Loop &L) -> const LoopAccessInfo & {
9360       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
9361                                         TLI, TTI, nullptr, MSSA};
9362       return LAM.getResult<LoopAccessAnalysis>(L, AR);
9363     };
9364     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
9365     ProfileSummaryInfo *PSI =
9366         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
9367     LoopVectorizeResult Result =
9368         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
9369     if (!Result.MadeAnyChange)
9370       return PreservedAnalyses::all();
9371     PreservedAnalyses PA;
9372 
9373     // We currently do not preserve loopinfo/dominator analyses with outer loop
9374     // vectorization. Until this is addressed, mark these analyses as preserved
9375     // only for non-VPlan-native path.
9376     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
9377     if (!EnableVPlanNativePath) {
9378       PA.preserve<LoopAnalysis>();
9379       PA.preserve<DominatorTreeAnalysis>();
9380     }
9381     PA.preserve<BasicAA>();
9382     PA.preserve<GlobalsAA>();
9383     if (!Result.MadeCFGChange)
9384       PA.preserveSet<CFGAnalyses>();
9385     return PA;
9386 }
9387