1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 // of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 // widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 // of vectorization. It decides on the optimal vector width, which
26 // can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154
155 using namespace llvm;
156
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168 "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170 "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176
177 static cl::opt<bool> EnableEpilogueVectorization(
178 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179 cl::desc("Enable vectorization of epilogue loops."));
180
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183 cl::desc("When epilogue vectorization is enabled, and a value greater than "
184 "1 is specified, forces the given VF for all applicable epilogue "
185 "loops."));
186
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189 cl::desc("Only loops with vectorization factor equal to or larger than "
190 "the specified value are considered for epilogue vectorization."));
191
192 /// Loops with a known constant trip count below this number are vectorized only
193 /// if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196 cl::desc("Loops with a constant trip count that is smaller than this "
197 "value are vectorized only if no scalar iteration overheads "
198 "are incurred."));
199
200 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
201 // that predication is preferred, and this lists all options. I.e., the
202 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
203 // and predicate the instructions accordingly. If tail-folding fails, there are
204 // different fallback strategies depending on these values:
205 namespace PreferPredicateTy {
206 enum Option {
207 ScalarEpilogue = 0,
208 PredicateElseScalarEpilogue,
209 PredicateOrDontVectorize
210 };
211 } // namespace PreferPredicateTy
212
213 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
214 "prefer-predicate-over-epilogue",
215 cl::init(PreferPredicateTy::ScalarEpilogue),
216 cl::Hidden,
217 cl::desc("Tail-folding and predication preferences over creating a scalar "
218 "epilogue loop."),
219 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
220 "scalar-epilogue",
221 "Don't tail-predicate loops, create scalar epilogue"),
222 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
223 "predicate-else-scalar-epilogue",
224 "prefer tail-folding, create scalar epilogue if tail "
225 "folding fails."),
226 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
227 "predicate-dont-vectorize",
228 "prefers tail-folding, don't attempt vectorization if "
229 "tail-folding fails.")));
230
231 static cl::opt<bool> MaximizeBandwidth(
232 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
233 cl::desc("Maximize bandwidth when selecting vectorization factor which "
234 "will be determined by the smallest type in loop."));
235
236 static cl::opt<bool> EnableInterleavedMemAccesses(
237 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
238 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
239
240 /// An interleave-group may need masking if it resides in a block that needs
241 /// predication, or in order to mask away gaps.
242 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
243 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
245
246 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
247 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
248 cl::desc("We don't interleave loops with a estimated constant trip count "
249 "below this number"));
250
251 static cl::opt<unsigned> ForceTargetNumScalarRegs(
252 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
253 cl::desc("A flag that overrides the target's number of scalar registers."));
254
255 static cl::opt<unsigned> ForceTargetNumVectorRegs(
256 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
257 cl::desc("A flag that overrides the target's number of vector registers."));
258
259 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
260 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
261 cl::desc("A flag that overrides the target's max interleave factor for "
262 "scalar loops."));
263
264 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
265 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
266 cl::desc("A flag that overrides the target's max interleave factor for "
267 "vectorized loops."));
268
269 static cl::opt<unsigned> ForceTargetInstructionCost(
270 "force-target-instruction-cost", cl::init(0), cl::Hidden,
271 cl::desc("A flag that overrides the target's expected cost for "
272 "an instruction to a single constant value. Mostly "
273 "useful for getting consistent testing."));
274
275 static cl::opt<unsigned> SmallLoopCost(
276 "small-loop-cost", cl::init(20), cl::Hidden,
277 cl::desc(
278 "The cost of a loop that is considered 'small' by the interleaver."));
279
280 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
281 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
282 cl::desc("Enable the use of the block frequency analysis to access PGO "
283 "heuristics minimizing code growth in cold regions and being more "
284 "aggressive in hot regions."));
285
286 // Runtime interleave loops for load/store throughput.
287 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
288 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
289 cl::desc(
290 "Enable runtime interleaving until load/store ports are saturated"));
291
292 /// Interleave small loops with scalar reductions.
293 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
294 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
295 cl::desc("Enable interleaving for loops with small iteration counts that "
296 "contain scalar reductions to expose ILP."));
297
298 /// The number of stores in a loop that are allowed to need predication.
299 static cl::opt<unsigned> NumberOfStoresToPredicate(
300 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
301 cl::desc("Max number of stores to be predicated behind an if."));
302
303 static cl::opt<bool> EnableIndVarRegisterHeur(
304 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
305 cl::desc("Count the induction variable only once when interleaving"));
306
307 static cl::opt<bool> EnableCondStoresVectorization(
308 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
309 cl::desc("Enable if predication of stores during vectorization."));
310
311 static cl::opt<unsigned> MaxNestedScalarReductionIC(
312 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
313 cl::desc("The maximum interleave count to use when interleaving a scalar "
314 "reduction in a nested loop."));
315
316 static cl::opt<bool>
317 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
318 cl::Hidden,
319 cl::desc("Prefer in-loop vector reductions, "
320 "overriding the targets preference."));
321
322 static cl::opt<bool> PreferPredicatedReductionSelect(
323 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
324 cl::desc(
325 "Prefer predicating a reduction operation over an after loop select."));
326
327 cl::opt<bool> EnableVPlanNativePath(
328 "enable-vplan-native-path", cl::init(false), cl::Hidden,
329 cl::desc("Enable VPlan-native vectorization path with "
330 "support for outer loop vectorization."));
331
332 // FIXME: Remove this switch once we have divergence analysis. Currently we
333 // assume divergent non-backedge branches when this switch is true.
334 cl::opt<bool> EnableVPlanPredication(
335 "enable-vplan-predication", cl::init(false), cl::Hidden,
336 cl::desc("Enable VPlan-native vectorization path predicator with "
337 "support for outer loop vectorization."));
338
339 // This flag enables the stress testing of the VPlan H-CFG construction in the
340 // VPlan-native vectorization path. It must be used in conjuction with
341 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
342 // verification of the H-CFGs built.
343 static cl::opt<bool> VPlanBuildStressTest(
344 "vplan-build-stress-test", cl::init(false), cl::Hidden,
345 cl::desc(
346 "Build VPlan for every supported loop nest in the function and bail "
347 "out right after the build (stress test the VPlan H-CFG construction "
348 "in the VPlan-native vectorization path)."));
349
350 cl::opt<bool> llvm::EnableLoopInterleaving(
351 "interleave-loops", cl::init(true), cl::Hidden,
352 cl::desc("Enable loop interleaving in Loop vectorization passes"));
353 cl::opt<bool> llvm::EnableLoopVectorization(
354 "vectorize-loops", cl::init(true), cl::Hidden,
355 cl::desc("Run the Loop vectorization passes"));
356
357 /// A helper function that returns the type of loaded or stored value.
getMemInstValueType(Value * I)358 static Type *getMemInstValueType(Value *I) {
359 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
360 "Expected Load or Store instruction");
361 if (auto *LI = dyn_cast<LoadInst>(I))
362 return LI->getType();
363 return cast<StoreInst>(I)->getValueOperand()->getType();
364 }
365
366 /// A helper function that returns true if the given type is irregular. The
367 /// type is irregular if its allocated size doesn't equal the store size of an
368 /// element of the corresponding vector type at the given vectorization factor.
hasIrregularType(Type * Ty,const DataLayout & DL,ElementCount VF)369 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
370 // Determine if an array of VF elements of type Ty is "bitcast compatible"
371 // with a <VF x Ty> vector.
372 if (VF.isVector()) {
373 auto *VectorTy = VectorType::get(Ty, VF);
374 return TypeSize::get(VF.getKnownMinValue() *
375 DL.getTypeAllocSize(Ty).getFixedValue(),
376 VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
377 }
378
379 // If the vectorization factor is one, we just check if an array of type Ty
380 // requires padding between elements.
381 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
382 }
383
384 /// A helper function that returns the reciprocal of the block probability of
385 /// predicated blocks. If we return X, we are assuming the predicated block
386 /// will execute once for every X iterations of the loop header.
387 ///
388 /// TODO: We should use actual block probability here, if available. Currently,
389 /// we always assume predicated blocks have a 50% chance of executing.
getReciprocalPredBlockProb()390 static unsigned getReciprocalPredBlockProb() { return 2; }
391
392 /// A helper function that adds a 'fast' flag to floating-point operations.
addFastMathFlag(Value * V)393 static Value *addFastMathFlag(Value *V) {
394 if (isa<FPMathOperator>(V))
395 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
396 return V;
397 }
398
addFastMathFlag(Value * V,FastMathFlags FMF)399 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
400 if (isa<FPMathOperator>(V))
401 cast<Instruction>(V)->setFastMathFlags(FMF);
402 return V;
403 }
404
405 /// A helper function that returns an integer or floating-point constant with
406 /// value C.
getSignedIntOrFpConstant(Type * Ty,int64_t C)407 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
408 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
409 : ConstantFP::get(Ty, C);
410 }
411
412 /// Returns "best known" trip count for the specified loop \p L as defined by
413 /// the following procedure:
414 /// 1) Returns exact trip count if it is known.
415 /// 2) Returns expected trip count according to profile data if any.
416 /// 3) Returns upper bound estimate if it is known.
417 /// 4) Returns None if all of the above failed.
getSmallBestKnownTC(ScalarEvolution & SE,Loop * L)418 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
419 // Check if exact trip count is known.
420 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
421 return ExpectedTC;
422
423 // Check if there is an expected trip count available from profile data.
424 if (LoopVectorizeWithBlockFrequency)
425 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
426 return EstimatedTC;
427
428 // Check if upper bound estimate is known.
429 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
430 return ExpectedTC;
431
432 return None;
433 }
434
435 namespace llvm {
436
437 /// InnerLoopVectorizer vectorizes loops which contain only one basic
438 /// block to a specified vectorization factor (VF).
439 /// This class performs the widening of scalars into vectors, or multiple
440 /// scalars. This class also implements the following features:
441 /// * It inserts an epilogue loop for handling loops that don't have iteration
442 /// counts that are known to be a multiple of the vectorization factor.
443 /// * It handles the code generation for reduction variables.
444 /// * Scalarization (implementation using scalars) of un-vectorizable
445 /// instructions.
446 /// InnerLoopVectorizer does not perform any vectorization-legality
447 /// checks, and relies on the caller to check for the different legality
448 /// aspects. The InnerLoopVectorizer relies on the
449 /// LoopVectorizationLegality class to provide information about the induction
450 /// and reduction variables that were found to a given vectorization factor.
451 class InnerLoopVectorizer {
452 public:
InnerLoopVectorizer(Loop * OrigLoop,PredicatedScalarEvolution & PSE,LoopInfo * LI,DominatorTree * DT,const TargetLibraryInfo * TLI,const TargetTransformInfo * TTI,AssumptionCache * AC,OptimizationRemarkEmitter * ORE,ElementCount VecWidth,unsigned UnrollFactor,LoopVectorizationLegality * LVL,LoopVectorizationCostModel * CM,BlockFrequencyInfo * BFI,ProfileSummaryInfo * PSI)453 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
454 LoopInfo *LI, DominatorTree *DT,
455 const TargetLibraryInfo *TLI,
456 const TargetTransformInfo *TTI, AssumptionCache *AC,
457 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
458 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
459 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
460 ProfileSummaryInfo *PSI)
461 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
462 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
463 Builder(PSE.getSE()->getContext()),
464 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
465 BFI(BFI), PSI(PSI) {
466 // Query this against the original loop and save it here because the profile
467 // of the original loop header may change as the transformation happens.
468 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
469 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
470 }
471
472 virtual ~InnerLoopVectorizer() = default;
473
474 /// Create a new empty loop that will contain vectorized instructions later
475 /// on, while the old loop will be used as the scalar remainder. Control flow
476 /// is generated around the vectorized (and scalar epilogue) loops consisting
477 /// of various checks and bypasses. Return the pre-header block of the new
478 /// loop.
479 /// In the case of epilogue vectorization, this function is overriden to
480 /// handle the more complex control flow around the loops.
481 virtual BasicBlock *createVectorizedLoopSkeleton();
482
483 /// Widen a single instruction within the innermost loop.
484 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
485 VPTransformState &State);
486
487 /// Widen a single call instruction within the innermost loop.
488 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
489 VPTransformState &State);
490
491 /// Widen a single select instruction within the innermost loop.
492 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
493 bool InvariantCond, VPTransformState &State);
494
495 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
496 void fixVectorizedLoop();
497
498 // Return true if any runtime check is added.
areSafetyChecksAdded()499 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
500
501 /// A type for vectorized values in the new loop. Each value from the
502 /// original loop, when vectorized, is represented by UF vector values in the
503 /// new unrolled loop, where UF is the unroll factor.
504 using VectorParts = SmallVector<Value *, 2>;
505
506 /// Vectorize a single GetElementPtrInst based on information gathered and
507 /// decisions taken during planning.
508 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
509 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
510 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
511
512 /// Vectorize a single PHINode in a block. This method handles the induction
513 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
514 /// arbitrary length vectors.
515 void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
516
517 /// A helper function to scalarize a single Instruction in the innermost loop.
518 /// Generates a sequence of scalar instances for each lane between \p MinLane
519 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
520 /// inclusive. Uses the VPValue operands from \p Operands instead of \p
521 /// Instr's operands.
522 void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
523 const VPIteration &Instance, bool IfPredicateInstr,
524 VPTransformState &State);
525
526 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
527 /// is provided, the integer induction variable will first be truncated to
528 /// the corresponding type.
529 void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
530
531 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
532 /// vector or scalar value on-demand if one is not yet available. When
533 /// vectorizing a loop, we visit the definition of an instruction before its
534 /// uses. When visiting the definition, we either vectorize or scalarize the
535 /// instruction, creating an entry for it in the corresponding map. (In some
536 /// cases, such as induction variables, we will create both vector and scalar
537 /// entries.) Then, as we encounter uses of the definition, we derive values
538 /// for each scalar or vector use unless such a value is already available.
539 /// For example, if we scalarize a definition and one of its uses is vector,
540 /// we build the required vector on-demand with an insertelement sequence
541 /// when visiting the use. Otherwise, if the use is scalar, we can use the
542 /// existing scalar definition.
543 ///
544 /// Return a value in the new loop corresponding to \p V from the original
545 /// loop at unroll index \p Part. If the value has already been vectorized,
546 /// the corresponding vector entry in VectorLoopValueMap is returned. If,
547 /// however, the value has a scalar entry in VectorLoopValueMap, we construct
548 /// a new vector value on-demand by inserting the scalar values into a vector
549 /// with an insertelement sequence. If the value has been neither vectorized
550 /// nor scalarized, it must be loop invariant, so we simply broadcast the
551 /// value into a vector.
552 Value *getOrCreateVectorValue(Value *V, unsigned Part);
553
setVectorValue(Value * Scalar,unsigned Part,Value * Vector)554 void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
555 VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
556 }
557
558 /// Return a value in the new loop corresponding to \p V from the original
559 /// loop at unroll and vector indices \p Instance. If the value has been
560 /// vectorized but not scalarized, the necessary extractelement instruction
561 /// will be generated.
562 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
563
564 /// Construct the vector value of a scalarized value \p V one lane at a time.
565 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
566
567 /// Try to vectorize interleaved access group \p Group with the base address
568 /// given in \p Addr, optionally masking the vector operations if \p
569 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
570 /// values in the vectorized loop.
571 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
572 VPTransformState &State, VPValue *Addr,
573 ArrayRef<VPValue *> StoredValues,
574 VPValue *BlockInMask = nullptr);
575
576 /// Vectorize Load and Store instructions with the base address given in \p
577 /// Addr, optionally masking the vector operations if \p BlockInMask is
578 /// non-null. Use \p State to translate given VPValues to IR values in the
579 /// vectorized loop.
580 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
581 VPValue *Def, VPValue *Addr,
582 VPValue *StoredValue, VPValue *BlockInMask);
583
584 /// Set the debug location in the builder using the debug location in
585 /// the instruction.
586 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
587
588 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
589 void fixNonInductionPHIs(void);
590
591 protected:
592 friend class LoopVectorizationPlanner;
593
594 /// A small list of PHINodes.
595 using PhiVector = SmallVector<PHINode *, 4>;
596
597 /// A type for scalarized values in the new loop. Each value from the
598 /// original loop, when scalarized, is represented by UF x VF scalar values
599 /// in the new unrolled loop, where UF is the unroll factor and VF is the
600 /// vectorization factor.
601 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
602
603 /// Set up the values of the IVs correctly when exiting the vector loop.
604 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
605 Value *CountRoundDown, Value *EndValue,
606 BasicBlock *MiddleBlock);
607
608 /// Create a new induction variable inside L.
609 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
610 Value *Step, Instruction *DL);
611
612 /// Handle all cross-iteration phis in the header.
613 void fixCrossIterationPHIs();
614
615 /// Fix a first-order recurrence. This is the second phase of vectorizing
616 /// this phi node.
617 void fixFirstOrderRecurrence(PHINode *Phi);
618
619 /// Fix a reduction cross-iteration phi. This is the second phase of
620 /// vectorizing this phi node.
621 void fixReduction(PHINode *Phi);
622
623 /// Clear NSW/NUW flags from reduction instructions if necessary.
624 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
625
626 /// The Loop exit block may have single value PHI nodes with some
627 /// incoming value. While vectorizing we only handled real values
628 /// that were defined inside the loop and we should have one value for
629 /// each predecessor of its parent basic block. See PR14725.
630 void fixLCSSAPHIs();
631
632 /// Iteratively sink the scalarized operands of a predicated instruction into
633 /// the block that was created for it.
634 void sinkScalarOperands(Instruction *PredInst);
635
636 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
637 /// represented as.
638 void truncateToMinimalBitwidths();
639
640 /// Create a broadcast instruction. This method generates a broadcast
641 /// instruction (shuffle) for loop invariant values and for the induction
642 /// value. If this is the induction variable then we extend it to N, N+1, ...
643 /// this is needed because each iteration in the loop corresponds to a SIMD
644 /// element.
645 virtual Value *getBroadcastInstrs(Value *V);
646
647 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
648 /// to each vector element of Val. The sequence starts at StartIndex.
649 /// \p Opcode is relevant for FP induction variable.
650 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
651 Instruction::BinaryOps Opcode =
652 Instruction::BinaryOpsEnd);
653
654 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
655 /// variable on which to base the steps, \p Step is the size of the step, and
656 /// \p EntryVal is the value from the original loop that maps to the steps.
657 /// Note that \p EntryVal doesn't have to be an induction variable - it
658 /// can also be a truncate instruction.
659 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
660 const InductionDescriptor &ID);
661
662 /// Create a vector induction phi node based on an existing scalar one. \p
663 /// EntryVal is the value from the original loop that maps to the vector phi
664 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
665 /// truncate instruction, instead of widening the original IV, we widen a
666 /// version of the IV truncated to \p EntryVal's type.
667 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
668 Value *Step, Instruction *EntryVal);
669
670 /// Returns true if an instruction \p I should be scalarized instead of
671 /// vectorized for the chosen vectorization factor.
672 bool shouldScalarizeInstruction(Instruction *I) const;
673
674 /// Returns true if we should generate a scalar version of \p IV.
675 bool needsScalarInduction(Instruction *IV) const;
676
677 /// If there is a cast involved in the induction variable \p ID, which should
678 /// be ignored in the vectorized loop body, this function records the
679 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
680 /// cast. We had already proved that the casted Phi is equal to the uncasted
681 /// Phi in the vectorized loop (under a runtime guard), and therefore
682 /// there is no need to vectorize the cast - the same value can be used in the
683 /// vector loop for both the Phi and the cast.
684 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
685 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
686 ///
687 /// \p EntryVal is the value from the original loop that maps to the vector
688 /// phi node and is used to distinguish what is the IV currently being
689 /// processed - original one (if \p EntryVal is a phi corresponding to the
690 /// original IV) or the "newly-created" one based on the proof mentioned above
691 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
692 /// latter case \p EntryVal is a TruncInst and we must not record anything for
693 /// that IV, but it's error-prone to expect callers of this routine to care
694 /// about that, hence this explicit parameter.
695 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
696 const Instruction *EntryVal,
697 Value *VectorLoopValue,
698 unsigned Part,
699 unsigned Lane = UINT_MAX);
700
701 /// Generate a shuffle sequence that will reverse the vector Vec.
702 virtual Value *reverseVector(Value *Vec);
703
704 /// Returns (and creates if needed) the original loop trip count.
705 Value *getOrCreateTripCount(Loop *NewLoop);
706
707 /// Returns (and creates if needed) the trip count of the widened loop.
708 Value *getOrCreateVectorTripCount(Loop *NewLoop);
709
710 /// Returns a bitcasted value to the requested vector type.
711 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
712 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
713 const DataLayout &DL);
714
715 /// Emit a bypass check to see if the vector trip count is zero, including if
716 /// it overflows.
717 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
718
719 /// Emit a bypass check to see if all of the SCEV assumptions we've
720 /// had to make are correct.
721 void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
722
723 /// Emit bypass checks to check any memory assumptions we may have made.
724 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
725
726 /// Compute the transformed value of Index at offset StartValue using step
727 /// StepValue.
728 /// For integer induction, returns StartValue + Index * StepValue.
729 /// For pointer induction, returns StartValue[Index * StepValue].
730 /// FIXME: The newly created binary instructions should contain nsw/nuw
731 /// flags, which can be found from the original scalar operations.
732 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
733 const DataLayout &DL,
734 const InductionDescriptor &ID) const;
735
736 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
737 /// vector loop preheader, middle block and scalar preheader. Also
738 /// allocate a loop object for the new vector loop and return it.
739 Loop *createVectorLoopSkeleton(StringRef Prefix);
740
741 /// Create new phi nodes for the induction variables to resume iteration count
742 /// in the scalar epilogue, from where the vectorized loop left off (given by
743 /// \p VectorTripCount).
744 /// In cases where the loop skeleton is more complicated (eg. epilogue
745 /// vectorization) and the resume values can come from an additional bypass
746 /// block, the \p AdditionalBypass pair provides information about the bypass
747 /// block and the end value on the edge from bypass to this loop.
748 void createInductionResumeValues(
749 Loop *L, Value *VectorTripCount,
750 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
751
752 /// Complete the loop skeleton by adding debug MDs, creating appropriate
753 /// conditional branches in the middle block, preparing the builder and
754 /// running the verifier. Take in the vector loop \p L as argument, and return
755 /// the preheader of the completed vector loop.
756 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
757
758 /// Add additional metadata to \p To that was not present on \p Orig.
759 ///
760 /// Currently this is used to add the noalias annotations based on the
761 /// inserted memchecks. Use this for instructions that are *cloned* into the
762 /// vector loop.
763 void addNewMetadata(Instruction *To, const Instruction *Orig);
764
765 /// Add metadata from one instruction to another.
766 ///
767 /// This includes both the original MDs from \p From and additional ones (\see
768 /// addNewMetadata). Use this for *newly created* instructions in the vector
769 /// loop.
770 void addMetadata(Instruction *To, Instruction *From);
771
772 /// Similar to the previous function but it adds the metadata to a
773 /// vector of instructions.
774 void addMetadata(ArrayRef<Value *> To, Instruction *From);
775
776 /// Allow subclasses to override and print debug traces before/after vplan
777 /// execution, when trace information is requested.
printDebugTracesAtStart()778 virtual void printDebugTracesAtStart(){};
printDebugTracesAtEnd()779 virtual void printDebugTracesAtEnd(){};
780
781 /// The original loop.
782 Loop *OrigLoop;
783
784 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
785 /// dynamic knowledge to simplify SCEV expressions and converts them to a
786 /// more usable form.
787 PredicatedScalarEvolution &PSE;
788
789 /// Loop Info.
790 LoopInfo *LI;
791
792 /// Dominator Tree.
793 DominatorTree *DT;
794
795 /// Alias Analysis.
796 AAResults *AA;
797
798 /// Target Library Info.
799 const TargetLibraryInfo *TLI;
800
801 /// Target Transform Info.
802 const TargetTransformInfo *TTI;
803
804 /// Assumption Cache.
805 AssumptionCache *AC;
806
807 /// Interface to emit optimization remarks.
808 OptimizationRemarkEmitter *ORE;
809
810 /// LoopVersioning. It's only set up (non-null) if memchecks were
811 /// used.
812 ///
813 /// This is currently only used to add no-alias metadata based on the
814 /// memchecks. The actually versioning is performed manually.
815 std::unique_ptr<LoopVersioning> LVer;
816
817 /// The vectorization SIMD factor to use. Each vector will have this many
818 /// vector elements.
819 ElementCount VF;
820
821 /// The vectorization unroll factor to use. Each scalar is vectorized to this
822 /// many different vector instructions.
823 unsigned UF;
824
825 /// The builder that we use
826 IRBuilder<> Builder;
827
828 // --- Vectorization state ---
829
830 /// The vector-loop preheader.
831 BasicBlock *LoopVectorPreHeader;
832
833 /// The scalar-loop preheader.
834 BasicBlock *LoopScalarPreHeader;
835
836 /// Middle Block between the vector and the scalar.
837 BasicBlock *LoopMiddleBlock;
838
839 /// The ExitBlock of the scalar loop.
840 BasicBlock *LoopExitBlock;
841
842 /// The vector loop body.
843 BasicBlock *LoopVectorBody;
844
845 /// The scalar loop body.
846 BasicBlock *LoopScalarBody;
847
848 /// A list of all bypass blocks. The first block is the entry of the loop.
849 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
850
851 /// The new Induction variable which was added to the new block.
852 PHINode *Induction = nullptr;
853
854 /// The induction variable of the old basic block.
855 PHINode *OldInduction = nullptr;
856
857 /// Maps values from the original loop to their corresponding values in the
858 /// vectorized loop. A key value can map to either vector values, scalar
859 /// values or both kinds of values, depending on whether the key was
860 /// vectorized and scalarized.
861 VectorizerValueMap VectorLoopValueMap;
862
863 /// Store instructions that were predicated.
864 SmallVector<Instruction *, 4> PredicatedInstructions;
865
866 /// Trip count of the original loop.
867 Value *TripCount = nullptr;
868
869 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
870 Value *VectorTripCount = nullptr;
871
872 /// The legality analysis.
873 LoopVectorizationLegality *Legal;
874
875 /// The profitablity analysis.
876 LoopVectorizationCostModel *Cost;
877
878 // Record whether runtime checks are added.
879 bool AddedSafetyChecks = false;
880
881 // Holds the end values for each induction variable. We save the end values
882 // so we can later fix-up the external users of the induction variables.
883 DenseMap<PHINode *, Value *> IVEndValues;
884
885 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
886 // fixed up at the end of vector code generation.
887 SmallVector<PHINode *, 8> OrigPHIsToFix;
888
889 /// BFI and PSI are used to check for profile guided size optimizations.
890 BlockFrequencyInfo *BFI;
891 ProfileSummaryInfo *PSI;
892
893 // Whether this loop should be optimized for size based on profile guided size
894 // optimizatios.
895 bool OptForSizeBasedOnProfile;
896 };
897
898 class InnerLoopUnroller : public InnerLoopVectorizer {
899 public:
InnerLoopUnroller(Loop * OrigLoop,PredicatedScalarEvolution & PSE,LoopInfo * LI,DominatorTree * DT,const TargetLibraryInfo * TLI,const TargetTransformInfo * TTI,AssumptionCache * AC,OptimizationRemarkEmitter * ORE,unsigned UnrollFactor,LoopVectorizationLegality * LVL,LoopVectorizationCostModel * CM,BlockFrequencyInfo * BFI,ProfileSummaryInfo * PSI)900 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
901 LoopInfo *LI, DominatorTree *DT,
902 const TargetLibraryInfo *TLI,
903 const TargetTransformInfo *TTI, AssumptionCache *AC,
904 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
905 LoopVectorizationLegality *LVL,
906 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
907 ProfileSummaryInfo *PSI)
908 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
909 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
910 BFI, PSI) {}
911
912 private:
913 Value *getBroadcastInstrs(Value *V) override;
914 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
915 Instruction::BinaryOps Opcode =
916 Instruction::BinaryOpsEnd) override;
917 Value *reverseVector(Value *Vec) override;
918 };
919
920 /// Encapsulate information regarding vectorization of a loop and its epilogue.
921 /// This information is meant to be updated and used across two stages of
922 /// epilogue vectorization.
923 struct EpilogueLoopVectorizationInfo {
924 ElementCount MainLoopVF = ElementCount::getFixed(0);
925 unsigned MainLoopUF = 0;
926 ElementCount EpilogueVF = ElementCount::getFixed(0);
927 unsigned EpilogueUF = 0;
928 BasicBlock *MainLoopIterationCountCheck = nullptr;
929 BasicBlock *EpilogueIterationCountCheck = nullptr;
930 BasicBlock *SCEVSafetyCheck = nullptr;
931 BasicBlock *MemSafetyCheck = nullptr;
932 Value *TripCount = nullptr;
933 Value *VectorTripCount = nullptr;
934
EpilogueLoopVectorizationInfollvm::EpilogueLoopVectorizationInfo935 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
936 unsigned EUF)
937 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
938 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
939 assert(EUF == 1 &&
940 "A high UF for the epilogue loop is likely not beneficial.");
941 }
942 };
943
944 /// An extension of the inner loop vectorizer that creates a skeleton for a
945 /// vectorized loop that has its epilogue (residual) also vectorized.
946 /// The idea is to run the vplan on a given loop twice, firstly to setup the
947 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
948 /// from the first step and vectorize the epilogue. This is achieved by
949 /// deriving two concrete strategy classes from this base class and invoking
950 /// them in succession from the loop vectorizer planner.
951 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
952 public:
InnerLoopAndEpilogueVectorizer(Loop * OrigLoop,PredicatedScalarEvolution & PSE,LoopInfo * LI,DominatorTree * DT,const TargetLibraryInfo * TLI,const TargetTransformInfo * TTI,AssumptionCache * AC,OptimizationRemarkEmitter * ORE,EpilogueLoopVectorizationInfo & EPI,LoopVectorizationLegality * LVL,llvm::LoopVectorizationCostModel * CM,BlockFrequencyInfo * BFI,ProfileSummaryInfo * PSI)953 InnerLoopAndEpilogueVectorizer(
954 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
955 DominatorTree *DT, const TargetLibraryInfo *TLI,
956 const TargetTransformInfo *TTI, AssumptionCache *AC,
957 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
958 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
959 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
960 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
961 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
962 EPI(EPI) {}
963
964 // Override this function to handle the more complex control flow around the
965 // three loops.
createVectorizedLoopSkeleton()966 BasicBlock *createVectorizedLoopSkeleton() final override {
967 return createEpilogueVectorizedLoopSkeleton();
968 }
969
970 /// The interface for creating a vectorized skeleton using one of two
971 /// different strategies, each corresponding to one execution of the vplan
972 /// as described above.
973 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
974
975 /// Holds and updates state information required to vectorize the main loop
976 /// and its epilogue in two separate passes. This setup helps us avoid
977 /// regenerating and recomputing runtime safety checks. It also helps us to
978 /// shorten the iteration-count-check path length for the cases where the
979 /// iteration count of the loop is so small that the main vector loop is
980 /// completely skipped.
981 EpilogueLoopVectorizationInfo &EPI;
982 };
983
984 /// A specialized derived class of inner loop vectorizer that performs
985 /// vectorization of *main* loops in the process of vectorizing loops and their
986 /// epilogues.
987 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
988 public:
EpilogueVectorizerMainLoop(Loop * OrigLoop,PredicatedScalarEvolution & PSE,LoopInfo * LI,DominatorTree * DT,const TargetLibraryInfo * TLI,const TargetTransformInfo * TTI,AssumptionCache * AC,OptimizationRemarkEmitter * ORE,EpilogueLoopVectorizationInfo & EPI,LoopVectorizationLegality * LVL,llvm::LoopVectorizationCostModel * CM,BlockFrequencyInfo * BFI,ProfileSummaryInfo * PSI)989 EpilogueVectorizerMainLoop(
990 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
991 DominatorTree *DT, const TargetLibraryInfo *TLI,
992 const TargetTransformInfo *TTI, AssumptionCache *AC,
993 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
994 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
995 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
996 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
997 EPI, LVL, CM, BFI, PSI) {}
998 /// Implements the interface for creating a vectorized skeleton using the
999 /// *main loop* strategy (ie the first pass of vplan execution).
1000 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1001
1002 protected:
1003 /// Emits an iteration count bypass check once for the main loop (when \p
1004 /// ForEpilogue is false) and once for the epilogue loop (when \p
1005 /// ForEpilogue is true).
1006 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
1007 bool ForEpilogue);
1008 void printDebugTracesAtStart() override;
1009 void printDebugTracesAtEnd() override;
1010 };
1011
1012 // A specialized derived class of inner loop vectorizer that performs
1013 // vectorization of *epilogue* loops in the process of vectorizing loops and
1014 // their epilogues.
1015 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1016 public:
EpilogueVectorizerEpilogueLoop(Loop * OrigLoop,PredicatedScalarEvolution & PSE,LoopInfo * LI,DominatorTree * DT,const TargetLibraryInfo * TLI,const TargetTransformInfo * TTI,AssumptionCache * AC,OptimizationRemarkEmitter * ORE,EpilogueLoopVectorizationInfo & EPI,LoopVectorizationLegality * LVL,llvm::LoopVectorizationCostModel * CM,BlockFrequencyInfo * BFI,ProfileSummaryInfo * PSI)1017 EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
1018 LoopInfo *LI, DominatorTree *DT,
1019 const TargetLibraryInfo *TLI,
1020 const TargetTransformInfo *TTI, AssumptionCache *AC,
1021 OptimizationRemarkEmitter *ORE,
1022 EpilogueLoopVectorizationInfo &EPI,
1023 LoopVectorizationLegality *LVL,
1024 llvm::LoopVectorizationCostModel *CM,
1025 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
1026 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1027 EPI, LVL, CM, BFI, PSI) {}
1028 /// Implements the interface for creating a vectorized skeleton using the
1029 /// *epilogue loop* strategy (ie the second pass of vplan execution).
1030 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1031
1032 protected:
1033 /// Emits an iteration count bypass check after the main vector loop has
1034 /// finished to see if there are any iterations left to execute by either
1035 /// the vector epilogue or the scalar epilogue.
1036 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1037 BasicBlock *Bypass,
1038 BasicBlock *Insert);
1039 void printDebugTracesAtStart() override;
1040 void printDebugTracesAtEnd() override;
1041 };
1042 } // end namespace llvm
1043
1044 /// Look for a meaningful debug location on the instruction or it's
1045 /// operands.
getDebugLocFromInstOrOperands(Instruction * I)1046 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1047 if (!I)
1048 return I;
1049
1050 DebugLoc Empty;
1051 if (I->getDebugLoc() != Empty)
1052 return I;
1053
1054 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
1055 if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
1056 if (OpInst->getDebugLoc() != Empty)
1057 return OpInst;
1058 }
1059
1060 return I;
1061 }
1062
setDebugLocFromInst(IRBuilder<> & B,const Value * Ptr)1063 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1064 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1065 const DILocation *DIL = Inst->getDebugLoc();
1066 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1067 !isa<DbgInfoIntrinsic>(Inst)) {
1068 assert(!VF.isScalable() && "scalable vectors not yet supported.");
1069 auto NewDIL =
1070 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1071 if (NewDIL)
1072 B.SetCurrentDebugLocation(NewDIL.getValue());
1073 else
1074 LLVM_DEBUG(dbgs()
1075 << "Failed to create new discriminator: "
1076 << DIL->getFilename() << " Line: " << DIL->getLine());
1077 }
1078 else
1079 B.SetCurrentDebugLocation(DIL);
1080 } else
1081 B.SetCurrentDebugLocation(DebugLoc());
1082 }
1083
1084 /// Write a record \p DebugMsg about vectorization failure to the debug
1085 /// output stream. If \p I is passed, it is an instruction that prevents
1086 /// vectorization.
1087 #ifndef NDEBUG
debugVectorizationFailure(const StringRef DebugMsg,Instruction * I)1088 static void debugVectorizationFailure(const StringRef DebugMsg,
1089 Instruction *I) {
1090 dbgs() << "LV: Not vectorizing: " << DebugMsg;
1091 if (I != nullptr)
1092 dbgs() << " " << *I;
1093 else
1094 dbgs() << '.';
1095 dbgs() << '\n';
1096 }
1097 #endif
1098
1099 /// Create an analysis remark that explains why vectorization failed
1100 ///
1101 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
1102 /// RemarkName is the identifier for the remark. If \p I is passed it is an
1103 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
1104 /// the location of the remark. \return the remark object that can be
1105 /// streamed to.
createLVAnalysis(const char * PassName,StringRef RemarkName,Loop * TheLoop,Instruction * I)1106 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1107 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1108 Value *CodeRegion = TheLoop->getHeader();
1109 DebugLoc DL = TheLoop->getStartLoc();
1110
1111 if (I) {
1112 CodeRegion = I->getParent();
1113 // If there is no debug location attached to the instruction, revert back to
1114 // using the loop's.
1115 if (I->getDebugLoc())
1116 DL = I->getDebugLoc();
1117 }
1118
1119 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1120 R << "loop not vectorized: ";
1121 return R;
1122 }
1123
1124 /// Return a value for Step multiplied by VF.
createStepForVF(IRBuilder<> & B,Constant * Step,ElementCount VF)1125 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1126 assert(isa<ConstantInt>(Step) && "Expected an integer step");
1127 Constant *StepVal = ConstantInt::get(
1128 Step->getType(),
1129 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1130 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1131 }
1132
1133 namespace llvm {
1134
reportVectorizationFailure(const StringRef DebugMsg,const StringRef OREMsg,const StringRef ORETag,OptimizationRemarkEmitter * ORE,Loop * TheLoop,Instruction * I)1135 void reportVectorizationFailure(const StringRef DebugMsg,
1136 const StringRef OREMsg, const StringRef ORETag,
1137 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1138 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1139 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1140 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1141 ORETag, TheLoop, I) << OREMsg);
1142 }
1143
1144 } // end namespace llvm
1145
1146 #ifndef NDEBUG
1147 /// \return string containing a file name and a line # for the given loop.
getDebugLocString(const Loop * L)1148 static std::string getDebugLocString(const Loop *L) {
1149 std::string Result;
1150 if (L) {
1151 raw_string_ostream OS(Result);
1152 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1153 LoopDbgLoc.print(OS);
1154 else
1155 // Just print the module name.
1156 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1157 OS.flush();
1158 }
1159 return Result;
1160 }
1161 #endif
1162
addNewMetadata(Instruction * To,const Instruction * Orig)1163 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1164 const Instruction *Orig) {
1165 // If the loop was versioned with memchecks, add the corresponding no-alias
1166 // metadata.
1167 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1168 LVer->annotateInstWithNoAlias(To, Orig);
1169 }
1170
addMetadata(Instruction * To,Instruction * From)1171 void InnerLoopVectorizer::addMetadata(Instruction *To,
1172 Instruction *From) {
1173 propagateMetadata(To, From);
1174 addNewMetadata(To, From);
1175 }
1176
addMetadata(ArrayRef<Value * > To,Instruction * From)1177 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1178 Instruction *From) {
1179 for (Value *V : To) {
1180 if (Instruction *I = dyn_cast<Instruction>(V))
1181 addMetadata(I, From);
1182 }
1183 }
1184
1185 namespace llvm {
1186
1187 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1188 // lowered.
1189 enum ScalarEpilogueLowering {
1190
1191 // The default: allowing scalar epilogues.
1192 CM_ScalarEpilogueAllowed,
1193
1194 // Vectorization with OptForSize: don't allow epilogues.
1195 CM_ScalarEpilogueNotAllowedOptSize,
1196
1197 // A special case of vectorisation with OptForSize: loops with a very small
1198 // trip count are considered for vectorization under OptForSize, thereby
1199 // making sure the cost of their loop body is dominant, free of runtime
1200 // guards and scalar iteration overheads.
1201 CM_ScalarEpilogueNotAllowedLowTripLoop,
1202
1203 // Loop hint predicate indicating an epilogue is undesired.
1204 CM_ScalarEpilogueNotNeededUsePredicate
1205 };
1206
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}
1228
1229 /// \return An upper bound for the vectorization factor, or None if
1230 /// vectorization and interleaving should be avoided up front.
1231 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1232
1233 /// \return True if runtime checks are required for vectorization, and false
1234 /// otherwise.
1235 bool runtimeChecksRequired();
1236
1237 /// \return The most profitable vectorization factor and the cost of that VF.
1238 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1239 /// then this vectorization factor will be selected if vectorization is
1240 /// possible.
1241 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1242 VectorizationFactor
1243 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1244 const LoopVectorizationPlanner &LVP);
1245
1246 /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }
1251
1252 /// \return The size (in bits) of the smallest and widest types in the code
1253 /// that needs to be vectorized. We ignore values that remain scalar such as
1254 /// 64 bit loop indices.
1255 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1256
1257 /// \return The desired interleave count.
1258 /// If interleave count has been specified by metadata it will be returned.
1259 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1260 /// are the selected vectorization factor and the cost of the selected VF.
1261 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1262
1263 /// Memory access instruction may be vectorized in more than one way.
1264 /// Form of instruction after vectorization depends on cost.
1265 /// This function takes cost-based decisions for Load/Store instructions
1266 /// and collects them in a map. This decisions map is used for building
1267 /// the lists of loop-uniform and loop-scalar instructions.
1268 /// The calculated cost is saved with widening decision in order to
1269 /// avoid redundant calculations.
1270 void setCostBasedWideningDecision(ElementCount VF);
1271
1272 /// A struct that represents some properties of the register usage
1273 /// of a loop.
1274 struct RegisterUsage {
1275 /// Holds the number of loop invariant values that are used in the loop.
1276 /// The key is ClassID of target-provided register class.
1277 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1278 /// Holds the maximum number of concurrent live intervals in the loop.
1279 /// The key is ClassID of target-provided register class.
1280 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1281 };
1282
1283 /// \return Returns information about the register usages of the loop for the
1284 /// given vectorization factors.
1285 SmallVector<RegisterUsage, 8>
1286 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1287
1288 /// Collect values we want to ignore in the cost model.
1289 void collectValuesToIgnore();
1290
1291 /// Split reductions into those that happen in the loop, and those that happen
1292 /// outside. In loop reductions are collected into InLoopReductionChains.
1293 void collectInLoopReductions();
1294
1295 /// \returns The smallest bitwidth each instruction can be represented with.
1296 /// The vector equivalents of these instructions should be truncated to this
1297 /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
1301
1302 /// \returns True if it is more profitable to scalarize instruction \p I for
1303 /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }
1318
1319 /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }
1334
1335 /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }
1350
1351 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1352 /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }
1358
1359 /// Decision that was taken during cost calculation for memory instruction.
1360 enum InstWidening {
1361 CM_Unknown,
1362 CM_Widen, // For consecutive accesses with stride +1.
1363 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1364 CM_Interleave,
1365 CM_GatherScatter,
1366 CM_Scalarize
1367 };
1368
1369 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1370 /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }
1376
1377 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1378 /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }
1393
1394 /// Return the cost model decision for the given instruction \p I and vector
1395 /// width \p VF. Return CM_Unknown if this instruction did not pass
1396 /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }
1410
1411 /// Return the vectorization cost for the given instruction \p I and vector
1412 /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }
1420
1421 /// Return True if instruction \p I is an optimizable truncate whose operand
1422 /// is an induction variable. Such a truncate will be removed by adding a new
1423 /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }
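  // For example, in a loop such as
  //   for (i64 i = 0; i < n; ++i) { i32 j = (i32)i; a[j] = j; }
  // the truncate of the wide induction variable can be optimized away by
  // introducing a new i32 induction variable that is stepped in lock-step
  // with i, provided the truncate is not already free for the target.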
1446
1447 /// Collects the instructions to scalarize for each predicated instruction in
1448 /// the loop.
1449 void collectInstsToScalarize(ElementCount VF);
1450
1451 /// Collect Uniform and Scalar values for the given \p VF.
1452 /// The sets depend on CM decision for Load/Store instructions
1453 /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }
1462
1463 /// Returns true if the target machine supports masked store operation
1464 /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }
1501
1502 /// Returns true if \p I is an instruction that will be scalarized with
1503 /// predication. Such instructions include conditional stores and
1504 /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));
1509
1510 // Returns true if \p I is an instruction that will be predicated either
1511 // through scalar predication or masked load/store or masked gather/scatter.
1512 // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }
1522
1523 /// Returns true if \p I is a memory instruction with consecutive memory
1524 /// access that can be widened.
1525 bool
1526 memoryInstructionCanBeWidened(Instruction *I,
1527 ElementCount VF = ElementCount::getFixed(1));
1528
1529 /// Returns true if \p I is a memory instruction in an interleaved-group
1530 /// of memory accesses that can be vectorized with wide vector loads/stores
1531 /// and shuffles.
1532 bool
1533 interleavedAccessCanBeWidened(Instruction *I,
1534 ElementCount VF = ElementCount::getFixed(1));
1535
1536 /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
1546
1547 /// Returns true if an interleaved group requires a scalar iteration
1548 /// to handle accesses with gaps, and there is nothing preventing us from
1549 /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }
1566
1567 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1568 /// nodes to the chain of instructions representing the reductions. Uses a
1569 /// MapVector to ensure deterministic iteration order.
1570 using ReductionChainMap =
1571 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1572
1573 /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }
1582
1583 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1584 /// with factor VF. Return the cost of the instruction, including
1585 /// scalarization overhead if it's needed.
1586 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1587
1588 /// Estimate cost of a call instruction CI if it were vectorized with factor
1589 /// VF. Return the cost of the instruction, including scalarization overhead
1590 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e., either a vector version isn't available, or it is too
  /// expensive.
1593 unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1594 bool &NeedToScalarize);
1595
1596 /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }
1602
1603 private:
1604 unsigned NumPredStores = 0;
1605
1606 /// \return An upper bound for the vectorization factor, a power-of-2 larger
1607 /// than zero. One is returned if vectorization should best be avoided due
1608 /// to cost.
1609 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1610 ElementCount UserVF);
1611
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;
1620
1621 /// Returns the expected execution cost. The unit of the cost does
1622 /// not matter because we use the 'cost' units to compare different
1623 /// vector widths. The cost that is returned is *not* normalized by
1624 /// the factor width.
1625 VectorizationCostTy expectedCost(ElementCount VF);
1626
1627 /// Returns the execution time cost of an instruction for a given vector
1628 /// width. Vector width of one means scalar.
1629 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1630
1631 /// The cost-computation logic from getInstructionCost which provides
1632 /// the vector type as an output parameter.
1633 unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1634
1635 /// Calculate vectorization cost of memory instruction \p I.
1636 unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1637
1638 /// The cost computation for scalarized memory instruction.
1639 unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1640
1641 /// The cost computation for interleaving group of memory instructions.
1642 unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1643
1644 /// The cost computation for Gather/Scatter instruction.
1645 unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1646
1647 /// The cost computation for widening instruction \p I with consecutive
1648 /// memory access.
1649 unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1650
1651 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1652 /// Load: scalar load + broadcast.
1653 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1654 /// element)
1655 unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1656
1657 /// Estimate the overhead of scalarizing an instruction. This is a
1658 /// convenience wrapper for the type-based getScalarizationOverhead API.
1659 unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1660
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);
1664
1665 /// Returns true if an artificially high cost for emulated masked memrefs
1666 /// should be used.
1667 bool useEmulatedMaskMemRefHack(Instruction *I);
1668
1669 /// Map of scalar integer values to the smallest bitwidth they can be legally
1670 /// represented as. The vector equivalents of these values should be truncated
1671 /// to this type.
1672 MapVector<Instruction *, uint64_t> MinBWs;
1673
1674 /// A type representing the costs for instructions if they were to be
1675 /// scalarized rather than vectorized. The entries are Instruction-Cost
1676 /// pairs.
1677 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1678
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1682
  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1691
1692 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1693 bool FoldTailByMasking = false;
1694
1695 /// A map holding scalar costs for different vectorization factors. The
1696 /// presence of a cost for an instruction in the mapping indicates that the
1697 /// instruction will be scalarized when vectorizing with the associated
1698 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1699 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1700
1701 /// Holds the instructions known to be uniform after vectorization.
1702 /// The data is collected per VF.
1703 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1704
1705 /// Holds the instructions known to be scalar after vectorization.
1706 /// The data is collected per VF.
1707 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1708
1709 /// Holds the instructions (address computations) that are forced to be
1710 /// scalarized.
1711 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1712
  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;
1717
1718 /// Returns the expected difference in cost from scalarizing the expression
1719 /// feeding a predicated instruction \p PredInst. The instructions to
1720 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1721 /// non-negative return value implies the expression will be scalarized.
1722 /// Currently, only single-use chains are considered for scalarization.
1723 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1724 ElementCount VF);
1725
1726 /// Collect the instructions that are uniform after vectorization. An
1727 /// instruction is uniform if we represent it with a single scalar value in
1728 /// the vectorized loop corresponding to each vector iteration. Examples of
1729 /// uniform instructions include pointer operands of consecutive or
1730 /// interleaved memory accesses. Note that although uniformity implies an
1731 /// instruction will be scalar, the reverse is not true. In general, a
1732 /// scalarized instruction will be represented by VF scalar values in the
1733 /// vectorized loop, each corresponding to an iteration of the original
1734 /// scalar loop.
1735 void collectLoopUniforms(ElementCount VF);
1736
1737 /// Collect the instructions that are scalar after vectorization. An
1738 /// instruction is scalar if it is known to be uniform or will be scalarized
1739 /// during vectorization. Non-uniform scalarized instructions will be
1740 /// represented by VF values in the vectorized loop, each corresponding to an
1741 /// iteration of the original scalar loop.
1742 void collectLoopScalars(ElementCount VF);
1743
1744 /// Keeps cost model vectorization decision and cost for instructions.
1745 /// Right now it is used for memory instructions only.
1746 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1747 std::pair<InstWidening, unsigned>>;
1748
1749 DecisionList WideningDecisions;
1750
1751 /// Returns true if \p V is expected to be vectorized and it needs to be
1752 /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  };
1768
1769 /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   ElementCount VF) {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }
1775
1776 /// Determines if we have the infrastructure to vectorize loop \p L and its
1777 /// epilogue, assuming the main loop is vectorized by \p VF.
1778 bool isCandidateForEpilogueVectorization(const Loop &L,
1779 const ElementCount VF) const;
1780
1781 /// Returns true if epilogue vectorization is considered profitable, and
1782 /// false otherwise.
1783 /// \p VF is the vectorization factor chosen for the original loop.
1784 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1785
1786 public:
1787 /// The loop that we evaluate.
1788 Loop *TheLoop;
1789
1790 /// Predicated scalar evolution analysis.
1791 PredicatedScalarEvolution &PSE;
1792
1793 /// Loop Info analysis.
1794 LoopInfo *LI;
1795
1796 /// Vectorization legality.
1797 LoopVectorizationLegality *Legal;
1798
1799 /// Vector target information.
1800 const TargetTransformInfo &TTI;
1801
1802 /// Target Library Info.
1803 const TargetLibraryInfo *TLI;
1804
1805 /// Demanded bits analysis.
1806 DemandedBits *DB;
1807
1808 /// Assumption cache.
1809 AssumptionCache *AC;
1810
1811 /// Interface to emit optimization remarks.
1812 OptimizationRemarkEmitter *ORE;
1813
1814 const Function *TheFunction;
1815
1816 /// Loop Vectorize Hint.
1817 const LoopVectorizeHints *Hints;
1818
1819 /// The interleave access information contains groups of interleaved accesses
1820 /// with the same stride and close to each other.
1821 InterleavedAccessInfo &InterleaveInfo;
1822
1823 /// Values to ignore in the cost model.
1824 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1825
1826 /// Values to ignore in the cost model when VF > 1.
1827 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1828
1829 /// Profitable vector factors.
1830 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1831 };
1832
1833 } // end namespace llvm
1834
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
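// For example, an outer loop like the following would be considered an
// explicit vectorization candidate (illustrative sketch, assuming the usual
// clang lowering of the pragma to llvm.loop.vectorize metadata):
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)        // outer loop, explicitly annotated
//     for (int j = 0; j < M; ++j)      // inner loop
//       A[i][j] = B[i][j] + C[i][j];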
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}
1876
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}
1901
1902 namespace {
1903
1904 /// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }
1917
  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }
1942
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
1969 };
1970
1971 } // end anonymous namespace
1972
1973 //===----------------------------------------------------------------------===//
1974 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1975 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1976 //===----------------------------------------------------------------------===//
1977
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}
1996
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1999 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2000 "Expected either an induction phi-node or a truncate of it!");
2001 Value *Start = II.getStartValue();
2002
2003 // Construct the initial value of the vector IV in the vector loop preheader
2004 auto CurrIP = Builder.saveIP();
2005 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2006 if (isa<TruncInst>(EntryVal)) {
2007 assert(Start->getType()->isIntegerTy() &&
2008 "Truncation requires an integer type");
2009 auto *TruncType = cast<IntegerType>(EntryVal->getType());
2010 Step = Builder.CreateTrunc(Step, TruncType);
2011 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2012 }
2013 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2014 Value *SteppedStart =
2015 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2016
2017 // We create vector phi nodes for both integer and floating-point induction
2018 // variables. Here, we determine the kind of arithmetic we will perform.
2019 Instruction::BinaryOps AddOp;
2020 Instruction::BinaryOps MulOp;
2021 if (Step->getType()->isIntegerTy()) {
2022 AddOp = Instruction::Add;
2023 MulOp = Instruction::Mul;
2024 } else {
2025 AddOp = II.getInductionOpcode();
2026 MulOp = Instruction::FMul;
2027 }
2028
2029 // Multiply the vectorization factor by the step using integer or
2030 // floating-point arithmetic as appropriate.
2031 Value *ConstVF =
2032 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2033 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2034
2035 // Create a vector splat to use in the induction update.
2036 //
2037 // FIXME: If the step is non-constant, we create the vector splat with
2038 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2039 // handle a constant vector splat.
2040 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2041 Value *SplatVF = isa<Constant>(Mul)
2042 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2043 : Builder.CreateVectorSplat(VF, Mul);
2044 Builder.restoreIP(CurrIP);
2045
2046 // We may need to add the step a number of times, depending on the unroll
2047 // factor. The last of those goes into the PHI.
2048 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2049 &*LoopVectorBody->getFirstInsertionPt());
2050 VecInd->setDebugLoc(EntryVal->getDebugLoc());
2051 Instruction *LastInduction = VecInd;
2052 for (unsigned Part = 0; Part < UF; ++Part) {
2053 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2054
2055 if (isa<TruncInst>(EntryVal))
2056 addMetadata(LastInduction, EntryVal);
2057 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
2058
2059 LastInduction = cast<Instruction>(addFastMathFlag(
2060 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2061 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2062 }
2063
2064 // Move the last step to the end of the latch block. This ensures consistent
2065 // placement of all induction updates.
2066 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2067 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2068 auto *ICmp = cast<Instruction>(Br->getCondition());
2069 LastInduction->moveBefore(ICmp);
2070 LastInduction->setName("vec.ind.next");
2071
2072 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2073 VecInd->addIncoming(LastInduction, LoopVectorLatch);
2074 }
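// As an illustrative sketch, the code generated above for a fixed VF of 4,
// UF of 1 and an integer IV with step 1 looks roughly like:
//   vector.ph:
//     ; SteppedStart = <start, start+1, start+2, start+3>
//   vector.body:
//     %vec.ind = phi <4 x i32> [ %SteppedStart, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>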
2075
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}
2080
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}
2090
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2094 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2095 "Expected either an induction phi-node or a truncate of it!");
2096
2097 // This induction variable is not the phi from the original loop but the
2098 // newly-created IV based on the proof that casted Phi is equal to the
2099 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
2100 // re-uses the same InductionDescriptor that original IV uses but we don't
2101 // have to do any recording in this case - that is done when original IV is
2102 // processed.
2103 if (isa<TruncInst>(EntryVal))
2104 return;
2105
2106 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2107 if (Casts.empty())
2108 return;
2109 // Only the first Cast instruction in the Casts vector is of interest.
2110 // The rest of the Casts (if exist) have no uses outside the
2111 // induction update chain itself.
2112 Instruction *CastInst = *Casts.begin();
2113 if (Lane < UINT_MAX)
2114 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2115 else
2116 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2117 }
2118
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2120 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2121 "Primary induction variable must have an integer type");
2122
2123 auto II = Legal->getInductionVars().find(IV);
2124 assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2125
2126 auto ID = II->second;
2127 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2128
2129 // The value from the original loop to which we are mapping the new induction
2130 // variable.
2131 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2132
2133 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2134
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2137 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2138 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2139 "Induction step should be loop invariant");
2140 if (PSE.getSE()->isSCEVable(IV->getType())) {
2141 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2142 return Exp.expandCodeFor(Step, Step->getType(),
2143 LoopVectorPreHeader->getTerminator());
2144 }
2145 return cast<SCEVUnknown>(Step)->getValue();
2146 };
2147
2148 // The scalar value to broadcast. This is derived from the canonical
2149 // induction variable. If a truncation type is given, truncate the canonical
2150 // induction variable and step. Otherwise, derive these values from the
2151 // induction descriptor.
2152 auto CreateScalarIV = [&](Value *&Step) -> Value * {
2153 Value *ScalarIV = Induction;
2154 if (IV != OldInduction) {
2155 ScalarIV = IV->getType()->isIntegerTy()
2156 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2157 : Builder.CreateCast(Instruction::SIToFP, Induction,
2158 IV->getType());
2159 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2160 ScalarIV->setName("offset.idx");
2161 }
2162 if (Trunc) {
2163 auto *TruncType = cast<IntegerType>(Trunc->getType());
2164 assert(Step->getType()->isIntegerTy() &&
2165 "Truncation requires an integer step");
2166 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2167 Step = Builder.CreateTrunc(Step, TruncType);
2168 }
2169 return ScalarIV;
2170 };
2171
2172 // Create the vector values from the scalar IV, in the absence of creating a
2173 // vector IV.
2174 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2175 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2176 for (unsigned Part = 0; Part < UF; ++Part) {
2177 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2178 Value *EntryPart =
2179 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2180 ID.getInductionOpcode());
2181 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2182 if (Trunc)
2183 addMetadata(EntryPart, Trunc);
2184 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2185 }
2186 };
2187
2188 // Now do the actual transformations, and start with creating the step value.
2189 Value *Step = CreateStepValue(ID.getStep());
2190 if (VF.isZero() || VF.isScalar()) {
2191 Value *ScalarIV = CreateScalarIV(Step);
2192 CreateSplatIV(ScalarIV, Step);
2193 return;
2194 }
2195
2196 // Determine if we want a scalar version of the induction variable. This is
2197 // true if the induction variable itself is not widened, or if it has at
2198 // least one user in the loop that is not widened.
2199 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2200 if (!NeedsScalarIV) {
2201 createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2202 return;
2203 }
2204
2205 // Try to create a new independent vector induction variable. If we can't
2206 // create the phi node, we will splat the scalar induction variable in each
2207 // loop iteration.
2208 if (!shouldScalarizeInstruction(EntryVal)) {
2209 createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2210 Value *ScalarIV = CreateScalarIV(Step);
2211 // Create scalar steps that can be used by instructions we will later
2212 // scalarize. Note that the addition of the scalar steps will not increase
2213 // the number of instructions in the loop in the common case prior to
2214 // InstCombine. We will be trading one vector extract for each scalar step.
2215 buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2216 return;
2217 }
2218
2219 // All IV users are scalar instructions, so only emit a scalar IV, not a
2220 // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2221 // predicate used by the masked loads/stores.
2222 Value *ScalarIV = CreateScalarIV(Step);
2223 if (!Cost->isScalarEpilogueAllowed())
2224 CreateSplatIV(ScalarIV, Step);
2225 buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2226 }
2227
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
2230 // Create and check the types.
2231 auto *ValVTy = cast<FixedVectorType>(Val->getType());
2232 int VLen = ValVTy->getNumElements();
2233
2234 Type *STy = Val->getType()->getScalarType();
2235 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2236 "Induction Step must be an integer or FP");
2237 assert(Step->getType() == STy && "Step has wrong type");
2238
2239 SmallVector<Constant *, 8> Indices;
2240
2241 if (STy->isIntegerTy()) {
2242 // Create a vector of consecutive numbers from zero to VF.
2243 for (int i = 0; i < VLen; ++i)
2244 Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2245
2246 // Add the consecutive indices to the vector value.
2247 Constant *Cv = ConstantVector::get(Indices);
2248 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2249 Step = Builder.CreateVectorSplat(VLen, Step);
2250 assert(Step->getType() == Val->getType() && "Invalid step vec");
2251 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2252 // which can be found from the original scalar operations.
2253 Step = Builder.CreateMul(Cv, Step);
2254 return Builder.CreateAdd(Val, Step, "induction");
2255 }
2256
2257 // Floating point induction.
2258 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2259 "Binary Opcode should be specified for FP induction");
2260 // Create a vector of consecutive numbers from zero to VF.
2261 for (int i = 0; i < VLen; ++i)
2262 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2263
2264 // Add the consecutive indices to the vector value.
2265 Constant *Cv = ConstantVector::get(Indices);
2266
2267 Step = Builder.CreateVectorSplat(VLen, Step);
2268
2269 // Floating point operations had to be 'fast' to enable the induction.
2270 FastMathFlags Flags;
2271 Flags.setFast();
2272
2273 Value *MulOp = Builder.CreateFMul(Cv, Step);
2274 if (isa<Instruction>(MulOp))
2275 // Have to check, MulOp may be a constant
2276 cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2277
2278 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2279 if (isa<Instruction>(BOp))
2280 cast<Instruction>(BOp)->setFastMathFlags(Flags);
2281 return BOp;
2282 }
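// For example, with a fixed VF of 4, Val = splat(%x), StartIdx = 4 and an
// integer Step of 1, the result of getStepVector is equivalent to
// <%x + 4, %x + 5, %x + 6, %x + 7>.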
2283
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID) {
2287 // We shouldn't have to build scalar steps if we aren't vectorizing.
2288 assert(VF.isVector() && "VF should be greater than one");
2289 // Get the value type and ensure it and the step have the same integer type.
2290 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2291 assert(ScalarIVTy == Step->getType() &&
2292 "Val and Step should have the same type");
2293
2294 // We build scalar steps for both integer and floating-point induction
2295 // variables. Here, we determine the kind of arithmetic we will perform.
2296 Instruction::BinaryOps AddOp;
2297 Instruction::BinaryOps MulOp;
2298 if (ScalarIVTy->isIntegerTy()) {
2299 AddOp = Instruction::Add;
2300 MulOp = Instruction::Mul;
2301 } else {
2302 AddOp = ID.getInductionOpcode();
2303 MulOp = Instruction::FMul;
2304 }
2305
2306 // Determine the number of scalars we need to generate for each unroll
2307 // iteration. If EntryVal is uniform, we only need to generate the first
2308 // lane. Otherwise, we generate all VF values.
2309 unsigned Lanes =
2310 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2311 ? 1
2312 : VF.getKnownMinValue();
2313 assert((!VF.isScalable() || Lanes == 1) &&
2314 "Should never scalarize a scalable vector");
2315 // Compute the scalar steps and save the results in VectorLoopValueMap.
2316 for (unsigned Part = 0; Part < UF; ++Part) {
2317 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2318 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2319 ScalarIVTy->getScalarSizeInBits());
2320 Value *StartIdx =
2321 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2322 if (ScalarIVTy->isFloatingPointTy())
2323 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2324 StartIdx = addFastMathFlag(Builder.CreateBinOp(
2325 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
2326 // The step returned by `createStepForVF` is a runtime-evaluated value
2327 // when VF is scalable. Otherwise, it should be folded into a Constant.
2328 assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2329 "Expected StartIdx to be folded to a constant when VF is not "
2330 "scalable");
2331 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2332 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2333 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2334 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2335 }
2336 }
2337 }
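// As an illustrative example, for a uniform EntryVal with a fixed VF of 4 and
// UF of 2, only lane 0 of each part is materialized:
//   Part 0: ScalarIV + (0 * VF + 0) * Step
//   Part 1: ScalarIV + (1 * VF + 0) * Step
// whereas a non-uniform EntryVal additionally gets lanes 1..3 of each part.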
2338
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2340 assert(V != Induction && "The new induction variable should not be used.");
2341 assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2342 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2343
2344 // If we have a stride that is replaced by one, do it here. Defer this for
2345 // the VPlan-native path until we start running Legal checks in that path.
2346 if (!EnableVPlanNativePath && Legal->hasStride(V))
2347 V = ConstantInt::get(V->getType(), 1);
2348
2349 // If we have a vector mapped to this value, return it.
2350 if (VectorLoopValueMap.hasVectorValue(V, Part))
2351 return VectorLoopValueMap.getVectorValue(V, Part);
2352
2353 // If the value has not been vectorized, check if it has been scalarized
2354 // instead. If it has been scalarized, and we actually need the value in
2355 // vector form, we will construct the vector values on demand.
2356 if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2357 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2358
2359 // If we've scalarized a value, that value should be an instruction.
2360 auto *I = cast<Instruction>(V);
2361
2362 // If we aren't vectorizing, we can just copy the scalar map values over to
2363 // the vector map.
2364 if (VF.isScalar()) {
2365 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2366 return ScalarValue;
2367 }
2368
2369 // Get the last scalar instruction we generated for V and Part. If the value
2370 // is known to be uniform after vectorization, this corresponds to lane zero
2371 // of the Part unroll iteration. Otherwise, the last instruction is the one
2372 // we created for the last vector lane of the Part unroll iteration.
2373 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2374 ? 0
2375 : VF.getKnownMinValue() - 1;
2376 assert((!VF.isScalable() || LastLane == 0) &&
2377 "Scalable vectorization can't lead to any scalarized values.");
2378 auto *LastInst = cast<Instruction>(
2379 VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2380
2381 // Set the insert point after the last scalarized instruction. This ensures
2382 // the insertelement sequence will directly follow the scalar definitions.
2383 auto OldIP = Builder.saveIP();
2384 auto NewIP = std::next(BasicBlock::iterator(LastInst));
2385 Builder.SetInsertPoint(&*NewIP);
2386
2387 // However, if we are vectorizing, we need to construct the vector values.
2388 // If the value is known to be uniform after vectorization, we can just
2389 // broadcast the scalar value corresponding to lane zero for each unroll
2390 // iteration. Otherwise, we construct the vector values using insertelement
2391 // instructions. Since the resulting vectors are stored in
2392 // VectorLoopValueMap, we will only generate the insertelements once.
2393 Value *VectorValue = nullptr;
2394 if (Cost->isUniformAfterVectorization(I, VF)) {
2395 VectorValue = getBroadcastInstrs(ScalarValue);
2396 VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2397 } else {
2398 // Initialize packing with insertelements to start from undef.
2399 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2400 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2401 VectorLoopValueMap.setVectorValue(V, Part, Undef);
2402 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2403 packScalarIntoVectorValue(V, {Part, Lane});
2404 VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2405 }
2406 Builder.restoreIP(OldIP);
2407 return VectorValue;
2408 }
2409
2410 // If this scalar is unknown, assume that it is a constant or that it is
2411 // loop invariant. Broadcast V and save the value for future uses.
2412 Value *B = getBroadcastInstrs(V);
2413 VectorLoopValueMap.setVectorValue(V, Part, B);
2414 return B;
2415 }
2416
Value *
InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
                                            const VPIteration &Instance) {
2420 // If the value is not an instruction contained in the loop, it should
2421 // already be scalar.
2422 if (OrigLoop->isLoopInvariant(V))
2423 return V;
2424
2425 assert(Instance.Lane > 0
2426 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2427 : true && "Uniform values only have lane zero");
2428
2429 // If the value from the original loop has not been vectorized, it is
2430 // represented by UF x VF scalar values in the new loop. Return the requested
2431 // scalar value.
2432 if (VectorLoopValueMap.hasScalarValue(V, Instance))
2433 return VectorLoopValueMap.getScalarValue(V, Instance);
2434
2435 // If the value has not been scalarized, get its entry in VectorLoopValueMap
2436 // for the given unroll part. If this entry is not a vector type (i.e., the
2437 // vectorization factor is one), there is no need to generate an
2438 // extractelement instruction.
2439 auto *U = getOrCreateVectorValue(V, Instance.Part);
2440 if (!U->getType()->isVectorTy()) {
2441 assert(VF.isScalar() && "Value not scalarized has non-vector type");
2442 return U;
2443 }
2444
2445 // Otherwise, the value from the original loop has been vectorized and is
2446 // represented by UF vector values. Extract and return the requested scalar
2447 // value from the appropriate vector lane.
2448 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2449 }
2450
2451 void InnerLoopVectorizer::packScalarIntoVectorValue(
2452 Value *V, const VPIteration &Instance) {
2453 assert(V != Induction && "The new induction variable should not be used.");
2454 assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2455 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2456
2457 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2458 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2459 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2460 Builder.getInt32(Instance.Lane));
2461 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2462 }
2463
2464 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2465 assert(Vec->getType()->isVectorTy() && "Invalid type");
2466 assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2467 SmallVector<int, 8> ShuffleMask;
2468 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2469 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
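// For example (illustrative only): with VF = 4 the mask built above is
// <3, 2, 1, 0>, so the shuffle below turns <a, b, c, d> into <d, c, b, a>:
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>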
2470
2471 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2472 }
2473
2474 // Return whether we allow using masked interleave-groups (for dealing with
2475 // strided loads/stores that reside in predicated blocks, or for dealing
2476 // with gaps).
2477 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2478 // If an override option has been passed in for interleaved accesses, use it.
2479 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2480 return EnableMaskedInterleavedMemAccesses;
2481
2482 return TTI.enableMaskedInterleavedAccessVectorization();
2483 }
2484
2485 // Try to vectorize the interleave group that \p Instr belongs to.
2486 //
2487 // E.g. translate the following interleaved load group (factor = 3):
2488 // for (i = 0; i < N; i+=3) {
2489 // R = Pic[i]; // Member of index 0
2490 // G = Pic[i+1]; // Member of index 1
2491 // B = Pic[i+2]; // Member of index 2
2492 // ... // do something to R, G, B
2493 // }
2494 // To:
2495 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2496 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2497 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2498 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2499 //
2500 // Or translate the following interleaved store group (factor = 3):
2501 // for (i = 0; i < N; i+=3) {
2502 // ... do something to R, G, B
2503 // Pic[i] = R; // Member of index 0
2504 // Pic[i+1] = G; // Member of index 1
2505 // Pic[i+2] = B; // Member of index 2
2506 // }
2507 // To:
2508 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2509 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2510 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2511 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2512 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2513 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2514 const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2515 VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) {
2516 Instruction *Instr = Group->getInsertPos();
2517 const DataLayout &DL = Instr->getModule()->getDataLayout();
2518
2519 // Prepare for the vector type of the interleaved load/store.
2520 Type *ScalarTy = getMemInstValueType(Instr);
2521 unsigned InterleaveFactor = Group->getFactor();
2522 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2523 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2524
2525 // Prepare for the new pointers.
2526 SmallVector<Value *, 2> AddrParts;
2527 unsigned Index = Group->getIndex(Instr);
2528
2529 // TODO: extend the masked interleaved-group support to reversed access.
2530 assert((!BlockInMask || !Group->isReverse()) &&
2531 "Reversed masked interleave-group not supported.");
2532
2533 // If the group is reverse, adjust the index to refer to the last vector lane
2534 // instead of the first. We adjust the index from the first vector lane,
2535 // rather than directly getting the pointer for lane VF - 1, because the
2536 // pointer operand of the interleaved access is supposed to be uniform. For
2537 // uniform instructions, we're only required to generate a value for the
2538 // first vector lane in each unroll iteration.
2539 assert(!VF.isScalable() &&
2540 "scalable vector reverse operation is not implemented");
2541 if (Group->isReverse())
2542 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
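// Worked example (values assumed for illustration): with VF = 4 and an
// interleave factor of 3, a reversed group adjusts the index by
// (4 - 1) * 3 = 9 elements, so the single uniform pointer produced per part
// refers to the group member of the last vector lane rather than the first.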
2543
2544 for (unsigned Part = 0; Part < UF; Part++) {
2545 Value *AddrPart = State.get(Addr, {Part, 0});
2546 setDebugLocFromInst(Builder, AddrPart);
2547
2548 // Note that the current instruction could be any member of the group. We
2549 // need to adjust the address to point to the member of index 0.
2550 //
2551 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2552 // b = A[i]; // Member of index 0
2553 // The current pointer points to A[i+1]; adjust it to A[i].
2554 //
2555 // E.g. A[i+1] = a; // Member of index 1
2556 // A[i] = b; // Member of index 0
2557 // A[i+2] = c; // Member of index 2 (Current instruction)
2558 // The current pointer points to A[i+2]; adjust it to A[i].
2559
2560 bool InBounds = false;
2561 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2562 InBounds = gep->isInBounds();
2563 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2564 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2565
2566 // Cast to the vector pointer type.
2567 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2568 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2569 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2570 }
2571
2572 setDebugLocFromInst(Builder, Instr);
2573 Value *UndefVec = UndefValue::get(VecTy);
2574
2575 Value *MaskForGaps = nullptr;
2576 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2577 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2578 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2579 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2580 }
2581
2582 // Vectorize the interleaved load group.
2583 if (isa<LoadInst>(Instr)) {
2584 // For each unroll part, create a wide load for the group.
2585 SmallVector<Value *, 2> NewLoads;
2586 for (unsigned Part = 0; Part < UF; Part++) {
2587 Instruction *NewLoad;
2588 if (BlockInMask || MaskForGaps) {
2589 assert(useMaskedInterleavedAccesses(*TTI) &&
2590 "masked interleaved groups are not allowed.");
2591 Value *GroupMask = MaskForGaps;
2592 if (BlockInMask) {
2593 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2594 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2595 Value *ShuffledMask = Builder.CreateShuffleVector(
2596 BlockInMaskPart,
2597 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2598 "interleaved.mask");
2599 GroupMask = MaskForGaps
2600 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2601 MaskForGaps)
2602 : ShuffledMask;
2603 }
2604 NewLoad =
2605 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2606 GroupMask, UndefVec, "wide.masked.vec");
2607 }
2608 else
2609 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2610 Group->getAlign(), "wide.vec");
2611 Group->addMetadata(NewLoad);
2612 NewLoads.push_back(NewLoad);
2613 }
2614
2615 // For each member in the group, shuffle out the appropriate data from the
2616 // wide loads.
2617 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2618 Instruction *Member = Group->getMember(I);
2619
2620 // Skip the gaps in the group.
2621 if (!Member)
2622 continue;
2623
2624 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2625 auto StrideMask =
2626 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2627 for (unsigned Part = 0; Part < UF; Part++) {
2628 Value *StridedVec = Builder.CreateShuffleVector(
2629 NewLoads[Part], StrideMask, "strided.vec");
2630
2631 // If this member has a different type, cast the result to that type.
2632 if (Member->getType() != ScalarTy) {
2633 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2634 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2635 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2636 }
2637
2638 if (Group->isReverse())
2639 StridedVec = reverseVector(StridedVec);
2640
2641 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2642 }
2643 }
2644 return;
2645 }
2646
2647 // The sub-vector type for the current instruction.
2648 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2649 auto *SubVT = VectorType::get(ScalarTy, VF);
2650
2651 // Vectorize the interleaved store group.
2652 for (unsigned Part = 0; Part < UF; Part++) {
2653 // Collect the stored vector from each member.
2654 SmallVector<Value *, 4> StoredVecs;
2655 for (unsigned i = 0; i < InterleaveFactor; i++) {
2656 // An interleaved store group doesn't allow gaps, so each index has a member.
2657 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
2658
2659 Value *StoredVec = State.get(StoredValues[i], Part);
2660
2661 if (Group->isReverse())
2662 StoredVec = reverseVector(StoredVec);
2663
2664 // If this member has a different type, cast it to the unified type.
2665
2666 if (StoredVec->getType() != SubVT)
2667 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2668
2669 StoredVecs.push_back(StoredVec);
2670 }
2671
2672 // Concatenate all vectors into a wide vector.
2673 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2674
2675 // Interleave the elements in the wide vector.
2676 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2677 Value *IVec = Builder.CreateShuffleVector(
2678 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2679 "interleaved.vec");
2680
2681 Instruction *NewStoreInstr;
2682 if (BlockInMask) {
2683 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2684 Value *ShuffledMask = Builder.CreateShuffleVector(
2685 BlockInMaskPart,
2686 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2687 "interleaved.mask");
2688 NewStoreInstr = Builder.CreateMaskedStore(
2689 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2690 }
2691 else
2692 NewStoreInstr =
2693 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2694
2695 Group->addMetadata(NewStoreInstr);
2696 }
2697 }
2698
2699 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2700 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2701 VPValue *StoredValue, VPValue *BlockInMask) {
2702 // Attempt to issue a wide load.
2703 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2704 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2705
2706 assert((LI || SI) && "Invalid Load/Store instruction");
2707 assert((!SI || StoredValue) && "No stored value provided for widened store");
2708 assert((!LI || !StoredValue) && "Stored value provided for widened load");
2709
2710 LoopVectorizationCostModel::InstWidening Decision =
2711 Cost->getWideningDecision(Instr, VF);
2712 assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2713 Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2714 Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2715 "CM decision is not to widen the memory instruction");
2716
2717 Type *ScalarDataTy = getMemInstValueType(Instr);
2718
2719 auto *DataTy = VectorType::get(ScalarDataTy, VF);
2720 const Align Alignment = getLoadStoreAlignment(Instr);
2721
2722 // Determine if the pointer operand of the access is either consecutive or
2723 // reverse consecutive.
2724 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2725 bool ConsecutiveStride =
2726 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2727 bool CreateGatherScatter =
2728 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2729
2730 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2731 // gather/scatter. Otherwise Decision should have been to Scalarize.
2732 assert((ConsecutiveStride || CreateGatherScatter) &&
2733 "The instruction should be scalarized");
2734 (void)ConsecutiveStride;
2735
2736 VectorParts BlockInMaskParts(UF);
2737 bool isMaskRequired = BlockInMask;
2738 if (isMaskRequired)
2739 for (unsigned Part = 0; Part < UF; ++Part)
2740 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2741
2742 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2743 // Calculate the pointer for the specific unroll-part.
2744 GetElementPtrInst *PartPtr = nullptr;
2745
2746 bool InBounds = false;
2747 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2748 InBounds = gep->isInBounds();
2749
2750 if (Reverse) {
2751 assert(!VF.isScalable() &&
2752 "Reversing vectors is not yet supported for scalable vectors.");
2753
2754 // If the address is consecutive but reversed, then the
2755 // wide store needs to start at the last vector element.
2756 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2757 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2758 PartPtr->setIsInBounds(InBounds);
2759 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2760 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2761 PartPtr->setIsInBounds(InBounds);
2762 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2763 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2764 } else {
2765 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2766 PartPtr = cast<GetElementPtrInst>(
2767 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2768 PartPtr->setIsInBounds(InBounds);
2769 }
2770
2771 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2772 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2773 };
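// Illustrative example (assumed VF = 4, UF = 2, i32 elements, names not from
// this file): for the reverse case and Part = 1, the two GEPs above compute
//   Ptr + (-1 * 4) + (1 - 4)  ==  Ptr - 7
// so the <4 x i32> access covers Ptr[-7..-4]; after reverseVector the lanes
// line up with the scalar order Ptr[-4], Ptr[-5], Ptr[-6], Ptr[-7].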
2774
2775 // Handle Stores:
2776 if (SI) {
2777 setDebugLocFromInst(Builder, SI);
2778
2779 for (unsigned Part = 0; Part < UF; ++Part) {
2780 Instruction *NewSI = nullptr;
2781 Value *StoredVal = State.get(StoredValue, Part);
2782 if (CreateGatherScatter) {
2783 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2784 Value *VectorGep = State.get(Addr, Part);
2785 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2786 MaskPart);
2787 } else {
2788 if (Reverse) {
2789 // If we store to reverse consecutive memory locations, then we need
2790 // to reverse the order of elements in the stored value.
2791 StoredVal = reverseVector(StoredVal);
2792 // We don't want to update the value in the map as it might be used in
2793 // another expression. So don't call resetVectorValue(StoredVal).
2794 }
2795 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2796 if (isMaskRequired)
2797 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2798 BlockInMaskParts[Part]);
2799 else
2800 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2801 }
2802 addMetadata(NewSI, SI);
2803 }
2804 return;
2805 }
2806
2807 // Handle loads.
2808 assert(LI && "Must have a load instruction");
2809 setDebugLocFromInst(Builder, LI);
2810 for (unsigned Part = 0; Part < UF; ++Part) {
2811 Value *NewLI;
2812 if (CreateGatherScatter) {
2813 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2814 Value *VectorGep = State.get(Addr, Part);
2815 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2816 nullptr, "wide.masked.gather");
2817 addMetadata(NewLI, LI);
2818 } else {
2819 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2820 if (isMaskRequired)
2821 NewLI = Builder.CreateMaskedLoad(
2822 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2823 "wide.masked.load");
2824 else
2825 NewLI =
2826 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2827
2828 // Add metadata to the load, but setVectorValue to the reverse shuffle.
2829 addMetadata(NewLI, LI);
2830 if (Reverse)
2831 NewLI = reverseVector(NewLI);
2832 }
2833
2834 State.set(Def, Instr, NewLI, Part);
2835 }
2836 }
2837
2838 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2839 const VPIteration &Instance,
2840 bool IfPredicateInstr,
2841 VPTransformState &State) {
2842 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2843
2844 setDebugLocFromInst(Builder, Instr);
2845
2846 // Does this instruction return a value ?
2847 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2848
2849 Instruction *Cloned = Instr->clone();
2850 if (!IsVoidRetTy)
2851 Cloned->setName(Instr->getName() + ".cloned");
2852
2853 // Replace the operands of the cloned instructions with their scalar
2854 // equivalents in the new loop.
2855 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2856 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2857 auto InputInstance = Instance;
2858 if (!Operand || !OrigLoop->contains(Operand) ||
2859 (Cost->isUniformAfterVectorization(Operand, State.VF)))
2860 InputInstance.Lane = 0;
2861 auto *NewOp = State.get(User.getOperand(op), InputInstance);
2862 Cloned->setOperand(op, NewOp);
2863 }
2864 addNewMetadata(Cloned, Instr);
2865
2866 // Place the cloned scalar in the new loop.
2867 Builder.Insert(Cloned);
2868
2869 // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2870 // representing scalar values in VPTransformState. Add the cloned scalar to
2871 // the scalar map entry.
2872 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2873
2874 // If we just cloned a new assumption, add it to the assumption cache.
2875 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2876 if (II->getIntrinsicID() == Intrinsic::assume)
2877 AC->registerAssumption(II);
2878
2879 // End if-block.
2880 if (IfPredicateInstr)
2881 PredicatedInstructions.push_back(Cloned);
2882 }
2883
2884 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2885 Value *End, Value *Step,
2886 Instruction *DL) {
2887 BasicBlock *Header = L->getHeader();
2888 BasicBlock *Latch = L->getLoopLatch();
2889 // As we're just creating this loop, it's possible that no latch exists
2890 // yet. If so, use the header, as this will be a single-block loop.
2891 if (!Latch)
2892 Latch = Header;
2893
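// Rough shape of the IR produced below (illustrative only; block and value
// names are assumed, not taken from this file):
//   vector.body:
//     %index = phi i64 [ %start, %vector.ph ], [ %index.next, %vector.body ]
//     ...
//     %index.next = add i64 %index, %step
//     %cmp = icmp eq i64 %index.next, %end
//     br i1 %cmp, label %exit, label %vector.body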
2894 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2895 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2896 setDebugLocFromInst(Builder, OldInst);
2897 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2898
2899 Builder.SetInsertPoint(Latch->getTerminator());
2900 setDebugLocFromInst(Builder, OldInst);
2901
2902 // Create i+1 and fill the PHINode.
2903 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2904 Induction->addIncoming(Start, L->getLoopPreheader());
2905 Induction->addIncoming(Next, Latch);
2906 // Create the compare.
2907 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2908 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2909
2910 // Now we have two terminators. Remove the old one from the block.
2911 Latch->getTerminator()->eraseFromParent();
2912
2913 return Induction;
2914 }
2915
2916 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2917 if (TripCount)
2918 return TripCount;
2919
2920 assert(L && "Create Trip Count for null loop.");
2921 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2922 // Find the loop boundaries.
2923 ScalarEvolution *SE = PSE.getSE();
2924 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2925 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2926 "Invalid loop count");
2927
2928 Type *IdxTy = Legal->getWidestInductionType();
2929 assert(IdxTy && "No type for induction");
2930
2931 // The exit count might have the type of i64 while the phi is i32. This can
2932 // happen if we have an induction variable that is sign extended before the
2933 // compare. The only way that we get a backedge taken count is that the
2934 // induction variable was signed and as such will not overflow. In such a case
2935 // truncation is legal.
2936 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2937 IdxTy->getPrimitiveSizeInBits())
2938 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2939 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2940
2941 // Get the total trip count from the count by adding 1.
2942 const SCEV *ExitCount = SE->getAddExpr(
2943 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
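// For instance (illustrative): a loop `for (i = 0; i < n; ++i)` with n > 0
// has a backedge-taken count of n - 1, so the trip count expanded below is
// (n - 1) + 1 == n.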
2944
2945 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2946
2947 // Expand the trip count and place the new instructions in the preheader.
2948 // Notice that the pre-header does not change, only the loop body.
2949 SCEVExpander Exp(*SE, DL, "induction");
2950
2951 // Count holds the overall loop count (N).
2952 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2953 L->getLoopPreheader()->getTerminator());
2954
2955 if (TripCount->getType()->isPointerTy())
2956 TripCount =
2957 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2958 L->getLoopPreheader()->getTerminator());
2959
2960 return TripCount;
2961 }
2962
2963 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2964 if (VectorTripCount)
2965 return VectorTripCount;
2966
2967 Value *TC = getOrCreateTripCount(L);
2968 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2969
2970 Type *Ty = TC->getType();
2971 // This is where we can make the step a runtime constant.
2972 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
2973
2974 // If the tail is to be folded by masking, round the number of iterations N
2975 // up to a multiple of Step instead of rounding down. This is done by first
2976 // adding Step-1 and then rounding down. Note that it's ok if this addition
2977 // overflows: the vector induction variable will eventually wrap to zero given
2978 // that it starts at zero and its Step is a power of two; the loop will then
2979 // exit, with the last early-exit vector comparison also producing all-true.
2980 if (Cost->foldTailByMasking()) {
2981 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2982 "VF*UF must be a power of 2 when folding tail by masking");
2983 assert(!VF.isScalable() &&
2984 "Tail folding not yet supported for scalable vectors");
2985 TC = Builder.CreateAdd(
2986 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2987 }
2988
2989 // Now we need to generate the expression for the part of the loop that the
2990 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2991 // iterations are not required for correctness, or N - Step, otherwise. Step
2992 // is equal to the vectorization factor (number of SIMD elements) times the
2993 // unroll factor (number of SIMD instructions).
2994 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2995
2996 // If there is a non-reversed interleaved group that may speculatively access
2997 // memory out-of-bounds, we need to ensure that there will be at least one
2998 // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2999 // the trip count, we set the remainder to be equal to the step. If the step
3000 // does not evenly divide the trip count, no adjustment is necessary since
3001 // there will already be scalar iterations. Note that the minimum iterations
3002 // check ensures that N >= Step.
3003 if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3004 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3005 R = Builder.CreateSelect(IsZero, Step, R);
3006 }
3007
3008 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
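// Worked example (numbers assumed for illustration): with N = 10 and
// Step = VF * UF = 4, the normal path gives n.vec = 10 - (10 % 4) = 8.
// When folding the tail by masking, N is first rounded up to 10 + 3 = 13,
// so n.vec = 13 - (13 % 4) = 12, i.e. three masked vector iterations cover
// all ten scalar iterations.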
3009
3010 return VectorTripCount;
3011 }
3012
3013 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3014 const DataLayout &DL) {
3015 // Verify that V is a vector type with same number of elements as DstVTy.
3016 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3017 unsigned VF = DstFVTy->getNumElements();
3018 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3019 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3020 Type *SrcElemTy = SrcVecTy->getElementType();
3021 Type *DstElemTy = DstFVTy->getElementType();
3022 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3023 "Vector elements must have same size");
3024
3025 // Do a direct cast if element types are castable.
3026 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3027 return Builder.CreateBitOrPointerCast(V, DstFVTy);
3028 }
3029 // V cannot be directly casted to desired vector type.
3030 // May happen when V is a floating point vector but DstVTy is a vector of
3031 // pointers or vice-versa. Handle this using a two-step bitcast using an
3032 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
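// Illustrative example (assuming 64-bit pointers; types chosen only for the
// example): casting <2 x double> to <2 x i8*> becomes
//   %tmp = bitcast <2 x double> %v to <2 x i64>
//   %res = inttoptr <2 x i64> %tmp to <2 x i8*>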
3033 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3034 "Only one type should be a pointer type");
3035 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3036 "Only one type should be a floating point type");
3037 Type *IntTy =
3038 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3039 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3040 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3041 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3042 }
3043
3044 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3045 BasicBlock *Bypass) {
3046 Value *Count = getOrCreateTripCount(L);
3047 // Reuse existing vector loop preheader for TC checks.
3048 // Note that new preheader block is generated for vector loop.
3049 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3050 IRBuilder<> Builder(TCCheckBlock->getTerminator());
3051
3052 // Generate code to check if the loop's trip count is less than VF * UF, or
3053 // equal to it in case a scalar epilogue is required; this implies that the
3054 // vector trip count is zero. This check also covers the case where adding one
3055 // to the backedge-taken count overflowed leading to an incorrect trip count
3056 // of zero. In this case we will also jump to the scalar loop.
3057 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3058 : ICmpInst::ICMP_ULT;
3059
3060 // If tail is to be folded, vector loop takes care of all iterations.
3061 Value *CheckMinIters = Builder.getFalse();
3062 if (!Cost->foldTailByMasking()) {
3063 Value *Step =
3064 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3065 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3066 }
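// Illustrative shape of the emitted guard (names assumed, not from this
// file): for VF = 4 and UF = 2 without tail folding this is roughly
//   %min.iters.check = icmp ult i64 %trip.count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// (icmp ule is used instead when a scalar epilogue is required).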
3067 // Create new preheader for vector loop.
3068 LoopVectorPreHeader =
3069 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3070 "vector.ph");
3071
3072 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3073 DT->getNode(Bypass)->getIDom()) &&
3074 "TC check is expected to dominate Bypass");
3075
3076 // Update dominator for Bypass & LoopExit.
3077 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3078 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3079
3080 ReplaceInstWithInst(
3081 TCCheckBlock->getTerminator(),
3082 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3083 LoopBypassBlocks.push_back(TCCheckBlock);
3084 }
3085
3086 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3087 // Reuse existing vector loop preheader for SCEV checks.
3088 // Note that new preheader block is generated for vector loop.
3089 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
3090
3091 // Generate the code that checks the SCEV assumptions we made.
3092 // We want the new basic block to start at the first instruction in a
3093 // sequence of instructions that form a check.
3094 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3095 "scev.check");
3096 Value *SCEVCheck = Exp.expandCodeForPredicate(
3097 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
3098
3099 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3100 if (C->isZero())
3101 return;
3102
3103 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3104 (OptForSizeBasedOnProfile &&
3105 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3106 "Cannot SCEV check stride or overflow when optimizing for size");
3107
3108 SCEVCheckBlock->setName("vector.scevcheck");
3109 // Create new preheader for vector loop.
3110 LoopVectorPreHeader =
3111 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
3112 nullptr, "vector.ph");
3113
3114 // Update dominator only if this is first RT check.
3115 if (LoopBypassBlocks.empty()) {
3116 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3117 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3118 }
3119
3120 ReplaceInstWithInst(
3121 SCEVCheckBlock->getTerminator(),
3122 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
3123 LoopBypassBlocks.push_back(SCEVCheckBlock);
3124 AddedSafetyChecks = true;
3125 }
3126
3127 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3128 // VPlan-native path does not do any analysis for runtime checks currently.
3129 if (EnableVPlanNativePath)
3130 return;
3131
3132 // Reuse existing vector loop preheader for runtime memory checks.
3133 // Note that new preheader block is generated for vector loop.
3134 BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3135
3136 // Generate the code that checks in runtime if arrays overlap. We put the
3137 // checks into a separate block to make the more common case of few elements
3138 // faster.
3139 auto *LAI = Legal->getLAI();
3140 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3141 if (!RtPtrChecking.Need)
3142 return;
3143
3144 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3145 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3146 "Cannot emit memory checks when optimizing for size, unless forced "
3147 "to vectorize.");
3148 ORE->emit([&]() {
3149 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3150 L->getStartLoc(), L->getHeader())
3151 << "Code-size may be reduced by not forcing "
3152 "vectorization, or by source-code modifications "
3153 "eliminating the need for runtime checks "
3154 "(e.g., adding 'restrict').";
3155 });
3156 }
3157
3158 MemCheckBlock->setName("vector.memcheck");
3159 // Create new preheader for vector loop.
3160 LoopVectorPreHeader =
3161 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3162 "vector.ph");
3163
3164 auto *CondBranch = cast<BranchInst>(
3165 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
3166 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
3167 LoopBypassBlocks.push_back(MemCheckBlock);
3168 AddedSafetyChecks = true;
3169
3170 // Update dominator only if this is first RT check.
3171 if (LoopBypassBlocks.empty()) {
3172 DT->changeImmediateDominator(Bypass, MemCheckBlock);
3173 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
3174 }
3175
3176 Instruction *FirstCheckInst;
3177 Instruction *MemRuntimeCheck;
3178 std::tie(FirstCheckInst, MemRuntimeCheck) =
3179 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
3180 RtPtrChecking.getChecks(), RtPtrChecking.getSE());
3181 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3182 "claimed checks are required");
3183 CondBranch->setCondition(MemRuntimeCheck);
3184
3185 // We currently don't use LoopVersioning for the actual loop cloning but we
3186 // still use it to add the noalias metadata.
3187 LVer = std::make_unique<LoopVersioning>(
3188 *Legal->getLAI(),
3189 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3190 DT, PSE.getSE());
3191 LVer->prepareNoAliasMetadata();
3192 }
3193
3194 Value *InnerLoopVectorizer::emitTransformedIndex(
3195 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3196 const InductionDescriptor &ID) const {
3197
3198 SCEVExpander Exp(*SE, DL, "induction");
3199 auto Step = ID.getStep();
3200 auto StartValue = ID.getStartValue();
3201 assert(Index->getType() == Step->getType() &&
3202 "Index type does not match StepValue type");
3203
3204 // Note: the IR at this point is broken. We cannot use SE to create any new
3205 // SCEV and then expand it, hoping that SCEV's simplification will give us a
3206 // more optimal code. Unfortunately, attempting to do so on invalid IR may
3207 // lead to various SCEV crashes. So all we can do is use the builder and rely
3208 // on InstCombine for future simplifications. Here we handle some trivial
3209 // cases only.
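// For example (illustrative): for an integer induction with start S and step
// C, the transformed index is S + Index * C; the helpers below fold the
// trivial cases so that a step of 1 yields just S + Index and a start of 0
// yields Index * C, without creating dead add/mul instructions.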
3210 auto CreateAdd = [&B](Value *X, Value *Y) {
3211 assert(X->getType() == Y->getType() && "Types don't match!");
3212 if (auto *CX = dyn_cast<ConstantInt>(X))
3213 if (CX->isZero())
3214 return Y;
3215 if (auto *CY = dyn_cast<ConstantInt>(Y))
3216 if (CY->isZero())
3217 return X;
3218 return B.CreateAdd(X, Y);
3219 };
3220
3221 auto CreateMul = [&B](Value *X, Value *Y) {
3222 assert(X->getType() == Y->getType() && "Types don't match!");
3223 if (auto *CX = dyn_cast<ConstantInt>(X))
3224 if (CX->isOne())
3225 return Y;
3226 if (auto *CY = dyn_cast<ConstantInt>(Y))
3227 if (CY->isOne())
3228 return X;
3229 return B.CreateMul(X, Y);
3230 };
3231
3232 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3233 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3234 // the DomTree is not kept up-to-date for additional blocks generated in the
3235 // vector loop. By using the header as insertion point, we guarantee that the
3236 // expanded instructions dominate all their uses.
3237 auto GetInsertPoint = [this, &B]() {
3238 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3239 if (InsertBB != LoopVectorBody &&
3240 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3241 return LoopVectorBody->getTerminator();
3242 return &*B.GetInsertPoint();
3243 };
3244 switch (ID.getKind()) {
3245 case InductionDescriptor::IK_IntInduction: {
3246 assert(Index->getType() == StartValue->getType() &&
3247 "Index type does not match StartValue type");
3248 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3249 return B.CreateSub(StartValue, Index);
3250 auto *Offset = CreateMul(
3251 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3252 return CreateAdd(StartValue, Offset);
3253 }
3254 case InductionDescriptor::IK_PtrInduction: {
3255 assert(isa<SCEVConstant>(Step) &&
3256 "Expected constant step for pointer induction");
3257 return B.CreateGEP(
3258 StartValue->getType()->getPointerElementType(), StartValue,
3259 CreateMul(Index,
3260 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3261 }
3262 case InductionDescriptor::IK_FpInduction: {
3263 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3264 auto InductionBinOp = ID.getInductionBinOp();
3265 assert(InductionBinOp &&
3266 (InductionBinOp->getOpcode() == Instruction::FAdd ||
3267 InductionBinOp->getOpcode() == Instruction::FSub) &&
3268 "Original bin op should be defined for FP induction");
3269
3270 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3271
3272 // Floating point operations had to be 'fast' to enable the induction.
3273 FastMathFlags Flags;
3274 Flags.setFast();
3275
3276 Value *MulExp = B.CreateFMul(StepValue, Index);
3277 if (isa<Instruction>(MulExp))
3278 // We have to check because MulExp may be a constant.
3279 cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3280
3281 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3282 "induction");
3283 if (isa<Instruction>(BOp))
3284 cast<Instruction>(BOp)->setFastMathFlags(Flags);
3285
3286 return BOp;
3287 }
3288 case InductionDescriptor::IK_NoInduction:
3289 return nullptr;
3290 }
3291 llvm_unreachable("invalid enum");
3292 }
3293
3294 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3295 LoopScalarBody = OrigLoop->getHeader();
3296 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3297 LoopExitBlock = OrigLoop->getExitBlock();
3298 assert(LoopExitBlock && "Must have an exit block");
3299 assert(LoopVectorPreHeader && "Invalid loop structure");
3300
3301 LoopMiddleBlock =
3302 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3303 LI, nullptr, Twine(Prefix) + "middle.block");
3304 LoopScalarPreHeader =
3305 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3306 nullptr, Twine(Prefix) + "scalar.ph");
3307 // We intentionally don't let SplitBlock update LoopInfo since
3308 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3309 // LoopVectorBody is explicitly added to the correct place a few lines later.
3310 LoopVectorBody =
3311 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3312 nullptr, nullptr, Twine(Prefix) + "vector.body");
3313
3314 // Update dominator for loop exit.
3315 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3316
3317 // Create and register the new vector loop.
3318 Loop *Lp = LI->AllocateLoop();
3319 Loop *ParentLoop = OrigLoop->getParentLoop();
3320
3321 // Insert the new loop into the loop nest and register the new basic blocks
3322 // before calling any utilities such as SCEV that require valid LoopInfo.
3323 if (ParentLoop) {
3324 ParentLoop->addChildLoop(Lp);
3325 } else {
3326 LI->addTopLevelLoop(Lp);
3327 }
3328 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3329 return Lp;
3330 }
3331
3332 void InnerLoopVectorizer::createInductionResumeValues(
3333 Loop *L, Value *VectorTripCount,
3334 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3335 assert(VectorTripCount && L && "Expected valid arguments");
3336 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3337 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3338 "Inconsistent information about additional bypass.");
3339 // We are going to resume the execution of the scalar loop.
3340 // Go over all of the induction variables that we found and fix the
3341 // PHIs that are left in the scalar version of the loop.
3342 // The starting values of PHI nodes depend on the counter of the last
3343 // iteration in the vectorized loop.
3344 // If we come from a bypass edge then we need to start from the original
3345 // start value.
3346 for (auto &InductionEntry : Legal->getInductionVars()) {
3347 PHINode *OrigPhi = InductionEntry.first;
3348 InductionDescriptor II = InductionEntry.second;
3349
3350 // Create phi nodes to merge from the backedge-taken check block.
3351 PHINode *BCResumeVal =
3352 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3353 LoopScalarPreHeader->getTerminator());
3354 // Copy original phi DL over to the new one.
3355 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3356 Value *&EndValue = IVEndValues[OrigPhi];
3357 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3358 if (OrigPhi == OldInduction) {
3359 // We know what the end value is.
3360 EndValue = VectorTripCount;
3361 } else {
3362 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3363 Type *StepType = II.getStep()->getType();
3364 Instruction::CastOps CastOp =
3365 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3366 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3367 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3368 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3369 EndValue->setName("ind.end");
3370
3371 // Compute the end value for the additional bypass (if applicable).
3372 if (AdditionalBypass.first) {
3373 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3374 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3375 StepType, true);
3376 CRD =
3377 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3378 EndValueFromAdditionalBypass =
3379 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3380 EndValueFromAdditionalBypass->setName("ind.end");
3381 }
3382 }
3383 // The new PHI merges the original incoming value, in case of a bypass,
3384 // or the value at the end of the vectorized loop.
3385 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3386
3387 // Fix the scalar body counter (PHI node).
3388 // The old induction's phi node in the scalar body needs the truncated
3389 // value.
3390 for (BasicBlock *BB : LoopBypassBlocks)
3391 BCResumeVal->addIncoming(II.getStartValue(), BB);
3392
3393 if (AdditionalBypass.first)
3394 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3395 EndValueFromAdditionalBypass);
3396
3397 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3398 }
3399 }
3400
3401 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3402 MDNode *OrigLoopID) {
3403 assert(L && "Expected valid loop.");
3404
3405 // The trip counts should be cached by now.
3406 Value *Count = getOrCreateTripCount(L);
3407 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3408
3409 // We need the OrigLoop (scalar loop part) latch terminator to help
3410 // produce correct debug info for the middle block BB instructions.
3411 // The legality check stage guarantees that the loop will have a single
3412 // latch.
3413 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3414 "Scalar loop latch terminator isn't a branch");
3415 BranchInst *ScalarLatchBr =
3416 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3417
3418 // Add a check in the middle block to see if we have completed
3419 // all of the iterations in the first vector loop.
3420 // If (N - N%VF) == N, then we *don't* need to run the remainder.
3421 // If tail is to be folded, we know we don't need to run the remainder.
3422 Value *CmpN = Builder.getTrue();
3423 if (!Cost->foldTailByMasking()) {
3424 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3425 VectorTripCount, "cmp.n",
3426 LoopMiddleBlock->getTerminator());
3427
3428 // Here we use the same DebugLoc as the scalar loop latch branch instead
3429 // of the corresponding compare because they may have ended up with
3430 // different line numbers and we want to avoid awkward line stepping while
3431 // debugging. E.g., if the compare has a line number inside the loop.
3432 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3433 }
3434
3435 BranchInst *BrInst =
3436 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3437 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3438 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3439
3440 // Get ready to start creating new instructions into the vectorized body.
3441 assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3442 "Inconsistent vector loop preheader");
3443 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3444
3445 Optional<MDNode *> VectorizedLoopID =
3446 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3447 LLVMLoopVectorizeFollowupVectorized});
3448 if (VectorizedLoopID.hasValue()) {
3449 L->setLoopID(VectorizedLoopID.getValue());
3450
3451 // Do not setAlreadyVectorized if loop attributes have been defined
3452 // explicitly.
3453 return LoopVectorPreHeader;
3454 }
3455
3456 // Keep all loop hints from the original loop on the vector loop (we'll
3457 // replace the vectorizer-specific hints below).
3458 if (MDNode *LID = OrigLoop->getLoopID())
3459 L->setLoopID(LID);
3460
3461 LoopVectorizeHints Hints(L, true, *ORE);
3462 Hints.setAlreadyVectorized();
3463
3464 #ifdef EXPENSIVE_CHECKS
3465 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3466 LI->verify(*DT);
3467 #endif
3468
3469 return LoopVectorPreHeader;
3470 }
3471
3472 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3473 /*
3474 In this function we generate a new loop. The new loop will contain
3475 the vectorized instructions while the old loop will continue to run the
3476 scalar remainder.
3477
3478 [ ] <-- loop iteration number check.
3479 / |
3480 / v
3481 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3482 | / |
3483 | / v
3484 || [ ] <-- vector pre header.
3485 |/ |
3486 | v
3487 | [ ] \
3488 | [ ]_| <-- vector loop.
3489 | |
3490 | v
3491 | -[ ] <--- middle-block.
3492 | / |
3493 | / v
3494 -|- >[ ] <--- new preheader.
3495 | |
3496 | v
3497 | [ ] \
3498 | [ ]_| <-- old scalar loop to handle remainder.
3499 \ |
3500 \ v
3501 >[ ] <-- exit block.
3502 ...
3503 */
3504
3505 // Get the metadata of the original loop before it gets modified.
3506 MDNode *OrigLoopID = OrigLoop->getLoopID();
3507
3508 // Create an empty vector loop, and prepare basic blocks for the runtime
3509 // checks.
3510 Loop *Lp = createVectorLoopSkeleton("");
3511
3512 // Now, compare the new count to zero. If it is zero skip the vector loop and
3513 // jump to the scalar loop. This check also covers the case where the
3514 // backedge-taken count is uint##_max: adding one to it will overflow leading
3515 // to an incorrect trip count of zero. In this (rare) case we will also jump
3516 // to the scalar loop.
3517 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3518
3519 // Generate the code to check any assumptions that we've made for SCEV
3520 // expressions.
3521 emitSCEVChecks(Lp, LoopScalarPreHeader);
3522
3523 // Generate the code that checks in runtime if arrays overlap. We put the
3524 // checks into a separate block to make the more common case of few elements
3525 // faster.
3526 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3527
3528 // Some loops have a single integer induction variable, while other loops
3529 // don't. One example is c++ iterators that often have multiple pointer
3530 // induction variables. In the code below we also support a case where we
3531 // don't have a single induction variable.
3532 //
3533 // We try to obtain an induction variable from the original loop as hard
3534 // as possible. However if we don't find one that:
3535 // - is an integer
3536 // - counts from zero, stepping by one
3537 // - is the size of the widest induction variable type
3538 // then we create a new one.
3539 OldInduction = Legal->getPrimaryInduction();
3540 Type *IdxTy = Legal->getWidestInductionType();
3541 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3542 // The loop step is equal to the vectorization factor (num of SIMD elements)
3543 // times the unroll factor (num of SIMD instructions).
3544 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3545 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3546 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3547 Induction =
3548 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3549 getDebugLocFromInstOrOperands(OldInduction));
3550
3551 // Emit phis for the new starting index of the scalar loop.
3552 createInductionResumeValues(Lp, CountRoundDown);
3553
3554 return completeLoopSkeleton(Lp, OrigLoopID);
3555 }
3556
3557 // Fix up external users of the induction variable. At this point, we are
3558 // in LCSSA form, with all external PHIs that use the IV having one input value,
3559 // coming from the remainder loop. We need those PHIs to also have a correct
3560 // value for the IV when arriving directly from the middle block.
3561 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3562 const InductionDescriptor &II,
3563 Value *CountRoundDown, Value *EndValue,
3564 BasicBlock *MiddleBlock) {
3565 // There are two kinds of external IV usages - those that use the value
3566 // computed in the last iteration (the PHI) and those that use the penultimate
3567 // value (the value that feeds into the phi from the loop latch).
3568 // We allow both, but they, obviously, have different values.
3569
3570 assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3571
3572 DenseMap<Value *, Value *> MissingVals;
3573
3574 // An external user of the last iteration's value should see the value that
3575 // the remainder loop uses to initialize its own IV.
3576 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3577 for (User *U : PostInc->users()) {
3578 Instruction *UI = cast<Instruction>(U);
3579 if (!OrigLoop->contains(UI)) {
3580 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3581 MissingVals[UI] = EndValue;
3582 }
3583 }
3584
3585 // An external user of the penultimate value needs to see EndValue - Step.
3586 // The simplest way to get this is to recompute it from the constituent SCEVs,
3587 // that is Start + (Step * (CRD - 1)).
3588 for (User *U : OrigPhi->users()) {
3589 auto *UI = cast<Instruction>(U);
3590 if (!OrigLoop->contains(UI)) {
3591 const DataLayout &DL =
3592 OrigLoop->getHeader()->getModule()->getDataLayout();
3593 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3594
3595 IRBuilder<> B(MiddleBlock->getTerminator());
3596 Value *CountMinusOne = B.CreateSub(
3597 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3598 Value *CMO =
3599 !II.getStep()->getType()->isIntegerTy()
3600 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3601 II.getStep()->getType())
3602 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3603 CMO->setName("cast.cmo");
3604 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3605 Escape->setName("ind.escape");
3606 MissingVals[UI] = Escape;
3607 }
3608 }
3609
3610 for (auto &I : MissingVals) {
3611 PHINode *PHI = cast<PHINode>(I.first);
3612 // One corner case we have to handle is two IVs "chasing" each-other,
3613 // that is %IV2 = phi [...], [ %IV1, %latch ]
3614 // In this case, if IV1 has an external use, we need to avoid adding both
3615 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3616 // don't already have an incoming value for the middle block.
3617 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3618 PHI->addIncoming(I.second, MiddleBlock);
3619 }
3620 }
3621
3622 namespace {
3623
3624 struct CSEDenseMapInfo {
3625 static bool canHandle(const Instruction *I) {
3626 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3627 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3628 }
3629
3630 static inline Instruction *getEmptyKey() {
3631 return DenseMapInfo<Instruction *>::getEmptyKey();
3632 }
3633
3634 static inline Instruction *getTombstoneKey() {
3635 return DenseMapInfo<Instruction *>::getTombstoneKey();
3636 }
3637
3638 static unsigned getHashValue(const Instruction *I) {
3639 assert(canHandle(I) && "Unknown instruction!");
3640 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3641 I->value_op_end()));
3642 }
3643
3644 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3645 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3646 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3647 return LHS == RHS;
3648 return LHS->isIdenticalTo(RHS);
3649 }
3650 };
3651
3652 } // end anonymous namespace
3653
3654 /// Perform CSE of induction variable instructions.
3655 static void cse(BasicBlock *BB) {
3656 // Perform simple cse.
3657 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
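// Illustrative example (assumed IR, not from this file): two identical
// extracts such as
//   %a = extractelement <4 x i32> %vec, i32 0
//   %b = extractelement <4 x i32> %vec, i32 0
// hash to the same map entry, so uses of %b are rewritten to %a and %b is
// erased.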
3658 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3659 Instruction *In = &*I++;
3660
3661 if (!CSEDenseMapInfo::canHandle(In))
3662 continue;
3663
3664 // Check if we can replace this instruction with any of the
3665 // visited instructions.
3666 if (Instruction *V = CSEMap.lookup(In)) {
3667 In->replaceAllUsesWith(V);
3668 In->eraseFromParent();
3669 continue;
3670 }
3671
3672 CSEMap[In] = In;
3673 }
3674 }
3675
3676 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3677 ElementCount VF,
3678 bool &NeedToScalarize) {
3679 assert(!VF.isScalable() && "scalable vectors not yet supported.");
3680 Function *F = CI->getCalledFunction();
3681 Type *ScalarRetTy = CI->getType();
3682 SmallVector<Type *, 4> Tys, ScalarTys;
3683 for (auto &ArgOp : CI->arg_operands())
3684 ScalarTys.push_back(ArgOp->getType());
3685
3686 // Estimate cost of scalarized vector call. The source operands are assumed
3687 // to be vectors, so we need to extract individual elements from there,
3688 // execute VF scalar calls, and then gather the result into the vector return
3689 // value.
3690 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3691 TTI::TCK_RecipThroughput);
3692 if (VF.isScalar())
3693 return ScalarCallCost;
3694
3695 // Compute corresponding vector type for return value and arguments.
3696 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3697 for (Type *ScalarTy : ScalarTys)
3698 Tys.push_back(ToVectorTy(ScalarTy, VF));
3699
3700 // Compute costs of unpacking argument values for the scalar calls and
3701 // packing the return values to a vector.
3702 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3703
3704 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
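// Worked example (numbers assumed for illustration): with VF = 4, a scalar
// call cost of 10 and a scalarization overhead of 12, the scalarized cost is
// 10 * 4 + 12 = 52; if the target provides a vector variant whose call cost
// is below 52, that cost is returned instead and NeedToScalarize is cleared.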
3705
3706 // If we can't emit a vector call for this function, then the currently found
3707 // cost is the cost we need to return.
3708 NeedToScalarize = true;
3709 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3710 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3711
3712 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3713 return Cost;
3714
3715 // If the corresponding vector cost is cheaper, return its cost.
3716 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3717 TTI::TCK_RecipThroughput);
3718 if (VectorCallCost < Cost) {
3719 NeedToScalarize = false;
3720 return VectorCallCost;
3721 }
3722 return Cost;
3723 }
3724
3725 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3726 ElementCount VF) {
3727 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3728 assert(ID && "Expected intrinsic call!");
3729
3730 IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3731 return TTI.getIntrinsicInstrCost(CostAttrs,
3732 TargetTransformInfo::TCK_RecipThroughput);
3733 }
3734
3735 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3736 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3737 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3738 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3739 }
3740
3741 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3742 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3743 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3744 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3745 }
3746
3747 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3748 // For every instruction `I` in MinBWs, truncate the operands, create a
3749 // truncated version of `I` and reextend its result. InstCombine runs
3750 // later and will remove any ext/trunc pairs.
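// For example (a sketch, assuming MinBWs says an i32 add only needs 8 bits
// and VF = 4), a widened 'add <4 x i32>' is rewritten roughly as:
//   %a.tr = trunc <4 x i32> %a to <4 x i8>
//   %b.tr = trunc <4 x i32> %b to <4 x i8>
//   %add  = add <4 x i8> %a.tr, %b.tr
//   %res  = zext <4 x i8> %add to <4 x i32>
// leaving InstCombine to clean up any redundant trunc/ext pairs.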
3751 SmallPtrSet<Value *, 4> Erased;
3752 for (const auto &KV : Cost->getMinimalBitwidths()) {
3753 // If the value wasn't vectorized, we must maintain the original scalar
3754 // type. The absence of the value from VectorLoopValueMap indicates that it
3755 // wasn't vectorized.
3756 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3757 continue;
3758 for (unsigned Part = 0; Part < UF; ++Part) {
3759 Value *I = getOrCreateVectorValue(KV.first, Part);
3760 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3761 continue;
3762 Type *OriginalTy = I->getType();
3763 Type *ScalarTruncatedTy =
3764 IntegerType::get(OriginalTy->getContext(), KV.second);
3765 auto *TruncatedTy = FixedVectorType::get(
3766 ScalarTruncatedTy,
3767 cast<FixedVectorType>(OriginalTy)->getNumElements());
3768 if (TruncatedTy == OriginalTy)
3769 continue;
3770
3771 IRBuilder<> B(cast<Instruction>(I));
3772 auto ShrinkOperand = [&](Value *V) -> Value * {
3773 if (auto *ZI = dyn_cast<ZExtInst>(V))
3774 if (ZI->getSrcTy() == TruncatedTy)
3775 return ZI->getOperand(0);
3776 return B.CreateZExtOrTrunc(V, TruncatedTy);
3777 };
3778
3779 // The actual instruction modification depends on the instruction type,
3780 // unfortunately.
3781 Value *NewI = nullptr;
3782 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3783 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3784 ShrinkOperand(BO->getOperand(1)));
3785
3786 // Any wrapping introduced by shrinking this operation shouldn't be
3787 // considered undefined behavior. So, we can't unconditionally copy
3788 // arithmetic wrapping flags to NewI.
3789 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3790 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3791 NewI =
3792 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3793 ShrinkOperand(CI->getOperand(1)));
3794 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3795 NewI = B.CreateSelect(SI->getCondition(),
3796 ShrinkOperand(SI->getTrueValue()),
3797 ShrinkOperand(SI->getFalseValue()));
3798 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3799 switch (CI->getOpcode()) {
3800 default:
3801 llvm_unreachable("Unhandled cast!");
3802 case Instruction::Trunc:
3803 NewI = ShrinkOperand(CI->getOperand(0));
3804 break;
3805 case Instruction::SExt:
3806 NewI = B.CreateSExtOrTrunc(
3807 CI->getOperand(0),
3808 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3809 break;
3810 case Instruction::ZExt:
3811 NewI = B.CreateZExtOrTrunc(
3812 CI->getOperand(0),
3813 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3814 break;
3815 }
3816 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3817 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3818 ->getNumElements();
3819 auto *O0 = B.CreateZExtOrTrunc(
3820 SI->getOperand(0),
3821 FixedVectorType::get(ScalarTruncatedTy, Elements0));
3822 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3823 ->getNumElements();
3824 auto *O1 = B.CreateZExtOrTrunc(
3825 SI->getOperand(1),
3826 FixedVectorType::get(ScalarTruncatedTy, Elements1));
3827
3828 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3829 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3830 // Don't do anything with the operands, just extend the result.
3831 continue;
3832 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3833 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3834 ->getNumElements();
3835 auto *O0 = B.CreateZExtOrTrunc(
3836 IE->getOperand(0),
3837 FixedVectorType::get(ScalarTruncatedTy, Elements));
3838 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3839 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3840 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3841 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3842 ->getNumElements();
3843 auto *O0 = B.CreateZExtOrTrunc(
3844 EE->getOperand(0),
3845 FixedVectorType::get(ScalarTruncatedTy, Elements));
3846 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3847 } else {
3848 // If we don't know what to do, be conservative and don't do anything.
3849 continue;
3850 }
3851
3852 // Lastly, extend the result.
3853 NewI->takeName(cast<Instruction>(I));
3854 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3855 I->replaceAllUsesWith(Res);
3856 cast<Instruction>(I)->eraseFromParent();
3857 Erased.insert(I);
3858 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3859 }
3860 }
3861
3862 // We'll have created a number of ZExts that are now dead (unused). Clean them up.
3863 for (const auto &KV : Cost->getMinimalBitwidths()) {
3864 // If the value wasn't vectorized, we must maintain the original scalar
3865 // type. The absence of the value from VectorLoopValueMap indicates that it
3866 // wasn't vectorized.
3867 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3868 continue;
3869 for (unsigned Part = 0; Part < UF; ++Part) {
3870 Value *I = getOrCreateVectorValue(KV.first, Part);
3871 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3872 if (Inst && Inst->use_empty()) {
3873 Value *NewI = Inst->getOperand(0);
3874 Inst->eraseFromParent();
3875 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3876 }
3877 }
3878 }
3879 }
3880
3881 void InnerLoopVectorizer::fixVectorizedLoop() {
3882 // Insert truncates and extends for any truncated instructions as hints to
3883 // InstCombine.
3884 if (VF.isVector())
3885 truncateToMinimalBitwidths();
3886
3887 // Fix widened non-induction PHIs by setting up the PHI operands.
3888 if (OrigPHIsToFix.size()) {
3889 assert(EnableVPlanNativePath &&
3890 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3891 fixNonInductionPHIs();
3892 }
3893
3894 // At this point every instruction in the original loop is widened to a
3895 // vector form. Now we need to fix the recurrences in the loop. These PHI
3896 // nodes are currently empty because we did not want to introduce cycles.
3897 // This is the second stage of vectorizing recurrences.
3898 fixCrossIterationPHIs();
3899
3900 // Forget the original basic block.
3901 PSE.getSE()->forgetLoop(OrigLoop);
3902
3903 // Fix-up external users of the induction variables.
3904 for (auto &Entry : Legal->getInductionVars())
3905 fixupIVUsers(Entry.first, Entry.second,
3906 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3907 IVEndValues[Entry.first], LoopMiddleBlock);
3908
3909 fixLCSSAPHIs();
3910 for (Instruction *PI : PredicatedInstructions)
3911 sinkScalarOperands(&*PI);
3912
3913 // Remove redundant induction instructions.
3914 cse(LoopVectorBody);
3915
3916 // Set/update profile weights for the vector and remainder loops as the
3917 // original loop iterations are now distributed among them. Note that the
3918 // original loop, represented by LoopScalarBody, becomes the remainder loop.
3919 //
3920 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3921 // end up with a slightly less accurate result, but that should be OK since
3922 // the profile is not inherently precise anyway. Note also that a possible
3923 // bypass of the vector code due to legality checks is ignored; all the
3924 // weight is optimistically assigned to the vector loop.
3925 //
3926 // For scalable vectorization we can't know at compile time how many loop
3927 // iterations are handled in one vector iteration, so assume a pessimistic
3928 // vscale of '1'.
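// For example (hypothetical counts), if the original loop's profile says it
// ran 1000 iterations and VF * UF = 8, the vector loop is assigned roughly
// 1000 / 8 = 125 iterations and the scalar remainder loop the rest; the
// exact redistribution is handled by setProfileInfoAfterUnrolling().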
3929 setProfileInfoAfterUnrolling(
3930 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3931 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3932 }
3933
3934 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3935 // In order to support recurrences we need to be able to vectorize Phi nodes.
3936 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3937 // stage #2: We now need to fix the recurrences by adding incoming edges to
3938 // the currently empty PHI nodes. At this point every instruction in the
3939 // original loop is widened to a vector form so we can use them to construct
3940 // the incoming edges.
3941 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3942 // Handle first-order recurrences and reductions that need to be fixed.
3943 if (Legal->isFirstOrderRecurrence(&Phi))
3944 fixFirstOrderRecurrence(&Phi);
3945 else if (Legal->isReductionVariable(&Phi))
3946 fixReduction(&Phi);
3947 }
3948 }
3949
3950 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3951 // This is the second phase of vectorizing first-order recurrences. An
3952 // overview of the transformation is described below. Suppose we have the
3953 // following loop.
3954 //
3955 // for (int i = 0; i < n; ++i)
3956 // b[i] = a[i] - a[i - 1];
3957 //
3958 // There is a first-order recurrence on "a". For this loop, the shorthand
3959 // scalar IR looks like:
3960 //
3961 // scalar.ph:
3962 // s_init = a[-1]
3963 // br scalar.body
3964 //
3965 // scalar.body:
3966 // i = phi [0, scalar.ph], [i+1, scalar.body]
3967 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3968 // s2 = a[i]
3969 // b[i] = s2 - s1
3970 // br cond, scalar.body, ...
3971 //
3972 // In this example, s1 is a recurrence because its value depends on the
3973 // previous iteration. In the first phase of vectorization, we created a
3974 // temporary value for s1. We now complete the vectorization and produce the
3975 // shorthand vector IR shown below (for VF = 4, UF = 1).
3976 //
3977 // vector.ph:
3978 // v_init = vector(..., ..., ..., a[-1])
3979 // br vector.body
3980 //
3981 // vector.body
3982 // i = phi [0, vector.ph], [i+4, vector.body]
3983 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3984 // v2 = a[i, i+1, i+2, i+3];
3985 // v3 = vector(v1(3), v2(0, 1, 2))
3986 // b[i, i+1, i+2, i+3] = v2 - v3
3987 // br cond, vector.body, middle.block
3988 //
3989 // middle.block:
3990 // x = v2(3)
3991 // br scalar.ph
3992 //
3993 // scalar.ph:
3994 // s_init = phi [x, middle.block], [a[-1], otherwise]
3995 // br scalar.body
3996 //
3997 // After the vector loop completes execution, we extract the next value of
3998 // the recurrence (x) to use as the initial value in the scalar loop.
3999
4000 // Get the original loop preheader and single loop latch.
4001 auto *Preheader = OrigLoop->getLoopPreheader();
4002 auto *Latch = OrigLoop->getLoopLatch();
4003
4004 // Get the initial and previous values of the scalar recurrence.
4005 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4006 auto *Previous = Phi->getIncomingValueForBlock(Latch);
4007
4008 // Create a vector from the initial value.
4009 auto *VectorInit = ScalarInit;
4010 if (VF.isVector()) {
4011 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4012 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4013 VectorInit = Builder.CreateInsertElement(
4014 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4015 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4016 }
4017
4018 // We constructed a temporary phi node in the first phase of vectorization.
4019 // This phi node will eventually be deleted.
4020 Builder.SetInsertPoint(
4021 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4022
4023 // Create a phi node for the new recurrence. The current value will either be
4024 // the initial value inserted into a vector or loop-varying vector value.
4025 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4026 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4027
4028 // Get the vectorized previous value of the last part UF - 1. It appears last
4029 // among all unrolled iterations, due to the order of their construction.
4030 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4031
4032 // Find and set the insertion point after the previous value if it is an
4033 // instruction.
4034 BasicBlock::iterator InsertPt;
4035 // Note that the previous value may have been constant-folded so it is not
4036 // guaranteed to be an instruction in the vector loop.
4037 // FIXME: Loop invariant values do not form recurrences. We should deal with
4038 // them earlier.
4039 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4040 InsertPt = LoopVectorBody->getFirstInsertionPt();
4041 else {
4042 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4043 if (isa<PHINode>(PreviousLastPart))
4044 // If the previous value is a phi node, we should insert after all the phi
4045 // nodes in the block containing the PHI to avoid breaking basic block
4046 // verification. Note that the basic block may be different to
4047 // LoopVectorBody, in case we predicate the loop.
4048 InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4049 else
4050 InsertPt = ++PreviousInst->getIterator();
4051 }
4052 Builder.SetInsertPoint(&*InsertPt);
4053
4054 // We will construct a vector for the recurrence by combining the values for
4055 // the current and previous iterations. This is the required shuffle mask.
4056 assert(!VF.isScalable());
4057 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4058 ShuffleMask[0] = VF.getKnownMinValue() - 1;
4059 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4060 ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
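// For example, with VF = 4 the mask built above is <3, 4, 5, 6>: lane 0 takes
// the last element of the previous vector and lanes 1-3 take the first three
// elements of the current one, matching the v3 = vector(v1(3), v2(0, 1, 2))
// step in the sketch at the top of this function.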
4061
4062 // The vector from which to take the initial value for the current iteration
4063 // (actual or unrolled). Initially, this is the vector phi node.
4064 Value *Incoming = VecPhi;
4065
4066 // Shuffle the current and previous vector and update the vector parts.
4067 for (unsigned Part = 0; Part < UF; ++Part) {
4068 Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4069 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4070 auto *Shuffle =
4071 VF.isVector()
4072 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4073 : Incoming;
4074 PhiPart->replaceAllUsesWith(Shuffle);
4075 cast<Instruction>(PhiPart)->eraseFromParent();
4076 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4077 Incoming = PreviousPart;
4078 }
4079
4080 // Fix the latch value of the new recurrence in the vector loop.
4081 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4082
4083 // Extract the last vector element in the middle block. This will be the
4084 // initial value for the recurrence when jumping to the scalar loop.
4085 auto *ExtractForScalar = Incoming;
4086 if (VF.isVector()) {
4087 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4088 ExtractForScalar = Builder.CreateExtractElement(
4089 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4090 "vector.recur.extract");
4091 }
4092 // Extract the second last element in the middle block if the
4093 // Phi is used outside the loop. We need to extract the phi itself
4094 // and not the last element (the phi update in the current iteration). This
4095 // will be the value when jumping to the exit block from the LoopMiddleBlock,
4096 // when the scalar loop is not run at all.
4097 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4098 if (VF.isVector())
4099 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4100 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4101 "vector.recur.extract.for.phi");
4102 // When the loop is unrolled without vectorizing, initialize
4103 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value of
4104 // `Incoming`. This is analogous to the vectorized case above: extracting the
4105 // second last element when VF > 1.
4106 else if (UF > 1)
4107 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4108
4109 // Fix the initial value of the original recurrence in the scalar loop.
4110 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4111 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4112 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4113 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4114 Start->addIncoming(Incoming, BB);
4115 }
4116
4117 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4118 Phi->setName("scalar.recur");
4119
4120 // Finally, fix users of the recurrence outside the loop. The users will need
4121 // either the last value of the scalar recurrence or the last value of the
4122 // vector recurrence we extracted in the middle block. Since the loop is in
4123 // LCSSA form, we just need to find all the phi nodes for the original scalar
4124 // recurrence in the exit block, and then add an edge for the middle block.
4125 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4126 if (LCSSAPhi.getIncomingValue(0) == Phi) {
4127 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4128 }
4129 }
4130 }
4131
4132 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4133 Constant *Zero = Builder.getInt32(0);
4134
4135 // Get its reduction variable descriptor.
4136 assert(Legal->isReductionVariable(Phi) &&
4137 "Unable to find the reduction variable");
4138 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4139
4140 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4141 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4142 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4143 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4144 RdxDesc.getMinMaxRecurrenceKind();
4145 setDebugLocFromInst(Builder, ReductionStartValue);
4146 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4147
4148 // We need to generate a reduction vector from the incoming scalar.
4149 // To do so, we need to generate the 'identity' vector and override
4150 // one of the elements with the incoming scalar reduction. We need
4151 // to do it in the vector-loop preheader.
4152 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4153
4154 // This is the vector-clone of the value that leaves the loop.
4155 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4156
4157 // Find the reduction identity variable. Zero for addition, or, xor,
4158 // one for multiplication, -1 for And.
4159 Value *Identity;
4160 Value *VectorStart;
4161 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4162 RK == RecurrenceDescriptor::RK_FloatMinMax) {
4163 // MinMax reductions have the start value as their identity.
4164 if (VF.isScalar() || IsInLoopReductionPhi) {
4165 VectorStart = Identity = ReductionStartValue;
4166 } else {
4167 VectorStart = Identity =
4168 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4169 }
4170 } else {
4171 // Handle other reduction kinds:
4172 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4173 RK, MinMaxKind, VecTy->getScalarType());
4174 if (VF.isScalar() || IsInLoopReductionPhi) {
4175 Identity = Iden;
4176 // This vector is the Identity vector where the first element is the
4177 // incoming scalar reduction.
4178 VectorStart = ReductionStartValue;
4179 } else {
4180 Identity = ConstantVector::getSplat(VF, Iden);
4181
4182 // This vector is the Identity vector where the first element is the
4183 // incoming scalar reduction.
4184 VectorStart =
4185 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4186 }
4187 }
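// As an illustration (assuming VF = 4, an integer add reduction and a scalar
// start value %s), Identity is <0, 0, 0, 0> and VectorStart becomes
// <%s, 0, 0, 0>, so the start value is folded into lane 0 exactly once.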
4188
4189 // Wrap flags are in general invalid after vectorization, clear them.
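// For instance, an 'add nsw' feeding an integer add reduction may wrap once
// partial sums are accumulated per vector lane, even if the original scalar
// running sum never did, so keeping nsw/nuw here could introduce poison.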
4190 clearReductionWrapFlags(RdxDesc);
4191
4192 // Fix the vector-loop phi.
4193
4194 // Reductions do not have to start at zero. They can start with
4195 // any loop invariant values.
4196 BasicBlock *Latch = OrigLoop->getLoopLatch();
4197 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4198
4199 for (unsigned Part = 0; Part < UF; ++Part) {
4200 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4201 Value *Val = getOrCreateVectorValue(LoopVal, Part);
4202 // Make sure to add the reduction start value only to the
4203 // first unroll part.
4204 Value *StartVal = (Part == 0) ? VectorStart : Identity;
4205 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4206 cast<PHINode>(VecRdxPhi)
4207 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4208 }
4209
4210 // Before each round, move the insertion point right between
4211 // the PHIs and the values we are going to write.
4212 // This allows us to write both PHINodes and the extractelement
4213 // instructions.
4214 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4215
4216 setDebugLocFromInst(Builder, LoopExitInst);
4217
4218 // If tail is folded by masking, the vector value to leave the loop should be
4219 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4220 // instead of the former. For an inloop reduction the reduction will already
4221 // be predicated, and does not need to be handled here.
4222 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4223 for (unsigned Part = 0; Part < UF; ++Part) {
4224 Value *VecLoopExitInst =
4225 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4226 Value *Sel = nullptr;
4227 for (User *U : VecLoopExitInst->users()) {
4228 if (isa<SelectInst>(U)) {
4229 assert(!Sel && "Reduction exit feeding two selects");
4230 Sel = U;
4231 } else
4232 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4233 }
4234 assert(Sel && "Reduction exit feeds no select");
4235 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4236
4237 // If the target can create a predicated operator for the reduction at no
4238 // extra cost in the loop (for example a predicated vadd), it can be
4239 // cheaper for the select to remain in the loop than be sunk out of it,
4240 // and so use the select value for the phi instead of the old
4241 // LoopExitValue.
4242 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4243 if (PreferPredicatedReductionSelect ||
4244 TTI->preferPredicatedReductionSelect(
4245 RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4246 TargetTransformInfo::ReductionFlags())) {
4247 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4248 VecRdxPhi->setIncomingValueForBlock(
4249 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4250 }
4251 }
4252 }
4253
4254 // If the vector reduction can be performed in a smaller type, we truncate
4255 // then extend the loop exit value to enable InstCombine to evaluate the
4256 // entire expression in the smaller type.
4257 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4258 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4259 assert(!VF.isScalable() && "scalable vectors not yet supported.");
4260 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4261 Builder.SetInsertPoint(
4262 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4263 VectorParts RdxParts(UF);
4264 for (unsigned Part = 0; Part < UF; ++Part) {
4265 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4266 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4267 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4268 : Builder.CreateZExt(Trunc, VecTy);
4269 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4270 UI != RdxParts[Part]->user_end();)
4271 if (*UI != Trunc) {
4272 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4273 RdxParts[Part] = Extnd;
4274 } else {
4275 ++UI;
4276 }
4277 }
4278 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4279 for (unsigned Part = 0; Part < UF; ++Part) {
4280 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4281 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4282 }
4283 }
4284
4285 // Reduce all of the unrolled parts into a single vector.
4286 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4287 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4288
4289 // The middle block terminator has already been assigned a DebugLoc here (the
4290 // OrigLoop's single latch terminator). We want the whole middle block to
4291 // appear to execute on this line because: (a) it is all compiler generated,
4292 // (b) these instructions are always executed after evaluating the latch
4293 // conditional branch, and (c) other passes may add new predecessors which
4294 // terminate on this line. This is the easiest way to ensure we don't
4295 // accidentally cause an extra step back into the loop while debugging.
4296 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4297 for (unsigned Part = 1; Part < UF; ++Part) {
4298 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4299 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4300 // Floating point operations had to be 'fast' to enable the reduction.
4301 ReducedPartRdx = addFastMathFlag(
4302 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4303 ReducedPartRdx, "bin.rdx"),
4304 RdxDesc.getFastMathFlags());
4305 else
4306 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4307 RdxPart);
4308 }
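// For example, with UF = 2 and an integer add reduction, the loop above emits
// a single 'bin.rdx' add combining the two unrolled vector parts into one
// vector value before the final horizontal reduction is formed.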
4309
4310 // Create the reduction after the loop. Note that inloop reductions create the
4311 // target reduction in the loop using a Reduction recipe.
4312 if (VF.isVector() && !IsInLoopReductionPhi) {
4313 bool NoNaN = Legal->hasFunNoNaNAttr();
4314 ReducedPartRdx =
4315 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4316 // If the reduction can be performed in a smaller type, we need to extend
4317 // the reduction to the wider type before we branch to the original loop.
4318 if (Phi->getType() != RdxDesc.getRecurrenceType())
4319 ReducedPartRdx =
4320 RdxDesc.isSigned()
4321 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4322 : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4323 }
4324
4325 // Create a phi node that merges control-flow from the backedge-taken check
4326 // block and the middle block.
4327 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4328 LoopScalarPreHeader->getTerminator());
4329 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4330 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4331 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4332
4333 // Now, we need to fix the users of the reduction variable
4334 // inside and outside of the scalar remainder loop.
4335 // We know that the loop is in LCSSA form. We need to update the
4336 // PHI nodes in the exit blocks.
4337 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4338 // All PHINodes need to have a single entry edge, or two if
4339 // we already fixed them.
4340 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4341
4342 // We found a reduction value exit-PHI. Update it with the
4343 // incoming bypass edge.
4344 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4345 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4346 } // end of the LCSSA phi scan.
4347
4348 // Fix the scalar loop reduction variable with the incoming reduction sum
4349 // from the vector body and from the backedge value.
4350 int IncomingEdgeBlockIdx =
4351 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4352 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4353 // Pick the other block.
4354 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4355 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4356 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4357 }
4358
4359 void InnerLoopVectorizer::clearReductionWrapFlags(
4360 RecurrenceDescriptor &RdxDesc) {
4361 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4362 if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4363 RK != RecurrenceDescriptor::RK_IntegerMult)
4364 return;
4365
4366 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4367 assert(LoopExitInstr && "null loop exit instruction");
4368 SmallVector<Instruction *, 8> Worklist;
4369 SmallPtrSet<Instruction *, 8> Visited;
4370 Worklist.push_back(LoopExitInstr);
4371 Visited.insert(LoopExitInstr);
4372
4373 while (!Worklist.empty()) {
4374 Instruction *Cur = Worklist.pop_back_val();
4375 if (isa<OverflowingBinaryOperator>(Cur))
4376 for (unsigned Part = 0; Part < UF; ++Part) {
4377 Value *V = getOrCreateVectorValue(Cur, Part);
4378 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4379 }
4380
4381 for (User *U : Cur->users()) {
4382 Instruction *UI = cast<Instruction>(U);
4383 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4384 Visited.insert(UI).second)
4385 Worklist.push_back(UI);
4386 }
4387 }
4388 }
4389
4390 void InnerLoopVectorizer::fixLCSSAPHIs() {
4391 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4392 if (LCSSAPhi.getNumIncomingValues() == 1) {
4393 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4394 // Non-instruction incoming values will have only one value.
4395 unsigned LastLane = 0;
4396 if (isa<Instruction>(IncomingValue))
4397 LastLane = Cost->isUniformAfterVectorization(
4398 cast<Instruction>(IncomingValue), VF)
4399 ? 0
4400 : VF.getKnownMinValue() - 1;
4401 assert((!VF.isScalable() || LastLane == 0) &&
4402 "scalable vectors dont support non-uniform scalars yet");
4403 // Can be a loop invariant incoming value or the last scalar value to be
4404 // extracted from the vectorized loop.
4405 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4406 Value *lastIncomingValue =
4407 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4408 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4409 }
4410 }
4411 }
4412
4413 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4414 // The basic block and loop containing the predicated instruction.
4415 auto *PredBB = PredInst->getParent();
4416 auto *VectorLoop = LI->getLoopFor(PredBB);
4417
4418 // Initialize a worklist with the operands of the predicated instruction.
4419 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4420
4421 // Holds instructions that we need to analyze again. An instruction may be
4422 // reanalyzed if we don't yet know if we can sink it or not.
4423 SmallVector<Instruction *, 8> InstsToReanalyze;
4424
4425 // Returns true if a given use occurs in the predicated block. Phi nodes use
4426 // their operands in their corresponding predecessor blocks.
4427 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4428 auto *I = cast<Instruction>(U.getUser());
4429 BasicBlock *BB = I->getParent();
4430 if (auto *Phi = dyn_cast<PHINode>(I))
4431 BB = Phi->getIncomingBlock(
4432 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4433 return BB == PredBB;
4434 };
4435
4436 // Iteratively sink the scalarized operands of the predicated instruction
4437 // into the block we created for it. When an instruction is sunk, its
4438 // operands are then added to the worklist. The algorithm ends when a pass
4439 // through the worklist doesn't sink a single instruction.
4440 bool Changed;
4441 do {
4442 // Add the instructions that need to be reanalyzed to the worklist, and
4443 // reset the changed indicator.
4444 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4445 InstsToReanalyze.clear();
4446 Changed = false;
4447
4448 while (!Worklist.empty()) {
4449 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4450
4451 // We can't sink an instruction if it is a phi node, is already in the
4452 // predicated block, is not in the loop, or may have side effects.
4453 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4454 !VectorLoop->contains(I) || I->mayHaveSideEffects())
4455 continue;
4456
4457 // It's legal to sink the instruction if all its uses occur in the
4458 // predicated block. Otherwise, there's nothing to do yet, and we may
4459 // need to reanalyze the instruction.
4460 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4461 InstsToReanalyze.push_back(I);
4462 continue;
4463 }
4464
4465 // Move the instruction to the beginning of the predicated block, and add
4466 // its operands to the worklist.
4467 I->moveBefore(&*PredBB->getFirstInsertionPt());
4468 Worklist.insert(I->op_begin(), I->op_end());
4469
4470 // The sinking may have enabled other instructions to be sunk, so we will
4471 // need to iterate.
4472 Changed = true;
4473 }
4474 } while (Changed);
4475 }
4476
4477 void InnerLoopVectorizer::fixNonInductionPHIs() {
4478 for (PHINode *OrigPhi : OrigPHIsToFix) {
4479 PHINode *NewPhi =
4480 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4481 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4482
4483 SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4484 predecessors(OrigPhi->getParent()));
4485 SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4486 predecessors(NewPhi->getParent()));
4487 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4488 "Scalar and Vector BB should have the same number of predecessors");
4489
4490 // The insertion point in Builder may be invalidated by the time we get
4491 // here. Force the Builder insertion point to something valid so that we do
4492 // not run into issues during insertion point restore in
4493 // getOrCreateVectorValue calls below.
4494 Builder.SetInsertPoint(NewPhi);
4495
4496 // The predecessor order is preserved and we can rely on mapping between
4497 // scalar and vector block predecessors.
4498 for (unsigned i = 0; i < NumIncomingValues; ++i) {
4499 BasicBlock *NewPredBB = VectorBBPredecessors[i];
4500
4501 // When looking up the new scalar/vector values to fix up, use incoming
4502 // values from original phi.
4503 Value *ScIncV =
4504 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4505
4506 // Scalar incoming value may need a broadcast
4507 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4508 NewPhi->addIncoming(NewIncV, NewPredBB);
4509 }
4510 }
4511 }
4512
4513 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4514 VPUser &Operands, unsigned UF,
4515 ElementCount VF, bool IsPtrLoopInvariant,
4516 SmallBitVector &IsIndexLoopInvariant,
4517 VPTransformState &State) {
4518 // Construct a vector GEP by widening the operands of the scalar GEP as
4519 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4520 // results in a vector of pointers when at least one operand of the GEP
4521 // is vector-typed. Thus, to keep the representation compact, we only use
4522 // vector-typed operands for loop-varying values.
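// As a sketch, a scalar 'getelementptr i32, i32* %base, i64 %iv' with a
// loop-invariant base and a loop-varying index is widened (for VF = 4) into a
// GEP whose index operand is a <4 x i64> vector, producing a <4 x i32*>
// result; only the loop-varying operand becomes vector-typed.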
4523
4524 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4525 // If we are vectorizing, but the GEP has only loop-invariant operands,
4526 // the GEP we build (by only using vector-typed operands for
4527 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4528 // produce a vector of pointers, we need to either arbitrarily pick an
4529 // operand to broadcast, or broadcast a clone of the original GEP.
4530 // Here, we broadcast a clone of the original.
4531 //
4532 // TODO: If at some point we decide to scalarize instructions having
4533 // loop-invariant operands, this special case will no longer be
4534 // required. We would add the scalarization decision to
4535 // collectLoopScalars() and teach getVectorValue() to broadcast
4536 // the lane-zero scalar value.
4537 auto *Clone = Builder.Insert(GEP->clone());
4538 for (unsigned Part = 0; Part < UF; ++Part) {
4539 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4540 State.set(VPDef, GEP, EntryPart, Part);
4541 addMetadata(EntryPart, GEP);
4542 }
4543 } else {
4544 // If the GEP has at least one loop-varying operand, we are sure to
4545 // produce a vector of pointers. But if we are only unrolling, we want
4546 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4547 // produce with the code below will be scalar (if VF == 1) or vector
4548 // (otherwise). Note that for the unroll-only case, we still maintain
4549 // values in the vector mapping with initVector, as we do for other
4550 // instructions.
4551 for (unsigned Part = 0; Part < UF; ++Part) {
4552 // The pointer operand of the new GEP. If it's loop-invariant, we
4553 // won't broadcast it.
4554 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4555 : State.get(Operands.getOperand(0), Part);
4556
4557 // Collect all the indices for the new GEP. If any index is
4558 // loop-invariant, we won't broadcast it.
4559 SmallVector<Value *, 4> Indices;
4560 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4561 VPValue *Operand = Operands.getOperand(I);
4562 if (IsIndexLoopInvariant[I - 1])
4563 Indices.push_back(State.get(Operand, {0, 0}));
4564 else
4565 Indices.push_back(State.get(Operand, Part));
4566 }
4567
4568 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4569 // but it should be a vector, otherwise.
4570 auto *NewGEP =
4571 GEP->isInBounds()
4572 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4573 Indices)
4574 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4575 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4576 "NewGEP is not a pointer vector");
4577 State.set(VPDef, GEP, NewGEP, Part);
4578 addMetadata(NewGEP, GEP);
4579 }
4580 }
4581 }
4582
4583 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4584 ElementCount VF) {
4585 assert(!VF.isScalable() && "scalable vectors not yet supported.");
4586 PHINode *P = cast<PHINode>(PN);
4587 if (EnableVPlanNativePath) {
4588 // Currently we enter here in the VPlan-native path for non-induction
4589 // PHIs where all control flow is uniform. We simply widen these PHIs.
4590 // Create a vector phi with no operands - the vector phi operands will be
4591 // set at the end of vector code generation.
4592 Type *VecTy =
4593 (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4594 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4595 VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4596 OrigPHIsToFix.push_back(P);
4597
4598 return;
4599 }
4600
4601 assert(PN->getParent() == OrigLoop->getHeader() &&
4602 "Non-header phis should have been handled elsewhere");
4603
4604 // In order to support recurrences we need to be able to vectorize Phi nodes.
4605 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4606 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4607 // this value when we vectorize all of the instructions that use the PHI.
4608 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4609 for (unsigned Part = 0; Part < UF; ++Part) {
4610 // This is phase one of vectorizing PHIs.
4611 bool ScalarPHI =
4612 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4613 Type *VecTy =
4614 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4615 Value *EntryPart = PHINode::Create(
4616 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4617 VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4618 }
4619 return;
4620 }
4621
4622 setDebugLocFromInst(Builder, P);
4623
4624 // This PHINode must be an induction variable.
4625 // Make sure that we know about it.
4626 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4627
4628 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4629 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4630
4631 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4632 // which can be found from the original scalar operations.
4633 switch (II.getKind()) {
4634 case InductionDescriptor::IK_NoInduction:
4635 llvm_unreachable("Unknown induction");
4636 case InductionDescriptor::IK_IntInduction:
4637 case InductionDescriptor::IK_FpInduction:
4638 llvm_unreachable("Integer/fp induction is handled elsewhere.");
4639 case InductionDescriptor::IK_PtrInduction: {
4640 // Handle the pointer induction variable case.
4641 assert(P->getType()->isPointerTy() && "Unexpected type.");
4642
4643 if (Cost->isScalarAfterVectorization(P, VF)) {
4644 // This is the normalized GEP that starts counting at zero.
4645 Value *PtrInd =
4646 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4647 // Determine the number of scalars we need to generate for each unroll
4648 // iteration. If the instruction is uniform, we only need to generate the
4649 // first lane. Otherwise, we generate all VF values.
4650 unsigned Lanes =
4651 Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
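// For example (a sketch), with VF = 4 and UF = 2 a non-uniform scalarized
// pointer induction produces 8 scalar GEPs below, one per lane of each
// unrolled part, while a uniform one produces just one GEP per part.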
4652 for (unsigned Part = 0; Part < UF; ++Part) {
4653 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4654 Constant *Idx = ConstantInt::get(PtrInd->getType(),
4655 Lane + Part * VF.getKnownMinValue());
4656 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4657 Value *SclrGep =
4658 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4659 SclrGep->setName("next.gep");
4660 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4661 }
4662 }
4663 return;
4664 }
4665 assert(isa<SCEVConstant>(II.getStep()) &&
4666 "Induction step not a SCEV constant!");
4667 Type *PhiType = II.getStep()->getType();
4668
4669 // Build a pointer phi
4670 Value *ScalarStartValue = II.getStartValue();
4671 Type *ScStValueType = ScalarStartValue->getType();
4672 PHINode *NewPointerPhi =
4673 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4674 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4675
4676 // A pointer induction, performed by using a gep
4677 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4678 Instruction *InductionLoc = LoopLatch->getTerminator();
4679 const SCEV *ScalarStep = II.getStep();
4680 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4681 Value *ScalarStepValue =
4682 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4683 Value *InductionGEP = GetElementPtrInst::Create(
4684 ScStValueType->getPointerElementType(), NewPointerPhi,
4685 Builder.CreateMul(
4686 ScalarStepValue,
4687 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4688 "ptr.ind", InductionLoc);
4689 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4690
4691 // Create UF many actual address geps that use the pointer
4692 // phi as base and a vectorized version of the step value
4693 // (<step*0, ..., step*N>) as offset.
4694 for (unsigned Part = 0; Part < UF; ++Part) {
4695 SmallVector<Constant *, 8> Indices;
4696 // Create a vector of consecutive numbers from zero to VF.
4697 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4698 Indices.push_back(
4699 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4700 Constant *StartOffset = ConstantVector::get(Indices);
4701
4702 Value *GEP = Builder.CreateGEP(
4703 ScStValueType->getPointerElementType(), NewPointerPhi,
4704 Builder.CreateMul(
4705 StartOffset,
4706 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4707 "vector.gep"));
4708 VectorLoopValueMap.setVectorValue(P, Part, GEP);
4709 }
4710 }
4711 }
4712 }
4713
4714 /// A helper function for checking whether an integer division-related
4715 /// instruction may divide by zero (in which case it must be predicated if
4716 /// executed conditionally in the scalar code).
4717 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4718 /// Non-zero divisors that are not compile-time constants will not be
4719 /// converted into multiplication, so we will still end up scalarizing
4720 /// the division, but can do so w/o predication.
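/// For example, 'udiv i32 %x, 7' can never divide by zero and needs no
/// predication, whereas 'udiv i32 %x, %y' might, so it must be predicated
/// when executed conditionally.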
4721 static bool mayDivideByZero(Instruction &I) {
4722 assert((I.getOpcode() == Instruction::UDiv ||
4723 I.getOpcode() == Instruction::SDiv ||
4724 I.getOpcode() == Instruction::URem ||
4725 I.getOpcode() == Instruction::SRem) &&
4726 "Unexpected instruction");
4727 Value *Divisor = I.getOperand(1);
4728 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4729 return !CInt || CInt->isZero();
4730 }
4731
4732 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4733 VPUser &User,
4734 VPTransformState &State) {
4735 switch (I.getOpcode()) {
4736 case Instruction::Call:
4737 case Instruction::Br:
4738 case Instruction::PHI:
4739 case Instruction::GetElementPtr:
4740 case Instruction::Select:
4741 llvm_unreachable("This instruction is handled by a different recipe.");
4742 case Instruction::UDiv:
4743 case Instruction::SDiv:
4744 case Instruction::SRem:
4745 case Instruction::URem:
4746 case Instruction::Add:
4747 case Instruction::FAdd:
4748 case Instruction::Sub:
4749 case Instruction::FSub:
4750 case Instruction::FNeg:
4751 case Instruction::Mul:
4752 case Instruction::FMul:
4753 case Instruction::FDiv:
4754 case Instruction::FRem:
4755 case Instruction::Shl:
4756 case Instruction::LShr:
4757 case Instruction::AShr:
4758 case Instruction::And:
4759 case Instruction::Or:
4760 case Instruction::Xor: {
4761 // Just widen unops and binops.
4762 setDebugLocFromInst(Builder, &I);
4763
4764 for (unsigned Part = 0; Part < UF; ++Part) {
4765 SmallVector<Value *, 2> Ops;
4766 for (VPValue *VPOp : User.operands())
4767 Ops.push_back(State.get(VPOp, Part));
4768
4769 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4770
4771 if (auto *VecOp = dyn_cast<Instruction>(V))
4772 VecOp->copyIRFlags(&I);
4773
4774 // Use this vector value for all users of the original instruction.
4775 State.set(Def, &I, V, Part);
4776 addMetadata(V, &I);
4777 }
4778
4779 break;
4780 }
4781 case Instruction::ICmp:
4782 case Instruction::FCmp: {
4783 // Widen compares. Generate vector compares.
4784 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4785 auto *Cmp = cast<CmpInst>(&I);
4786 setDebugLocFromInst(Builder, Cmp);
4787 for (unsigned Part = 0; Part < UF; ++Part) {
4788 Value *A = State.get(User.getOperand(0), Part);
4789 Value *B = State.get(User.getOperand(1), Part);
4790 Value *C = nullptr;
4791 if (FCmp) {
4792 // Propagate fast math flags.
4793 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4794 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4795 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4796 } else {
4797 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4798 }
4799 State.set(Def, &I, C, Part);
4800 addMetadata(C, &I);
4801 }
4802
4803 break;
4804 }
4805
4806 case Instruction::ZExt:
4807 case Instruction::SExt:
4808 case Instruction::FPToUI:
4809 case Instruction::FPToSI:
4810 case Instruction::FPExt:
4811 case Instruction::PtrToInt:
4812 case Instruction::IntToPtr:
4813 case Instruction::SIToFP:
4814 case Instruction::UIToFP:
4815 case Instruction::Trunc:
4816 case Instruction::FPTrunc:
4817 case Instruction::BitCast: {
4818 auto *CI = cast<CastInst>(&I);
4819 setDebugLocFromInst(Builder, CI);
4820
4821 /// Vectorize casts.
4822 Type *DestTy =
4823 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4824
4825 for (unsigned Part = 0; Part < UF; ++Part) {
4826 Value *A = State.get(User.getOperand(0), Part);
4827 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4828 State.set(Def, &I, Cast, Part);
4829 addMetadata(Cast, &I);
4830 }
4831 break;
4832 }
4833 default:
4834 // This instruction is not vectorized by simple widening.
4835 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4836 llvm_unreachable("Unhandled instruction!");
4837 } // end of switch.
4838 }
4839
4840 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4841 VPUser &ArgOperands,
4842 VPTransformState &State) {
4843 assert(!isa<DbgInfoIntrinsic>(I) &&
4844 "DbgInfoIntrinsic should have been dropped during VPlan construction");
4845 setDebugLocFromInst(Builder, &I);
4846
4847 Module *M = I.getParent()->getParent()->getParent();
4848 auto *CI = cast<CallInst>(&I);
4849
4850 SmallVector<Type *, 4> Tys;
4851 for (Value *ArgOperand : CI->arg_operands())
4852 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4853
4854 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4855
4856 // The flag shows whether we use an intrinsic or an ordinary call for the
4857 // vectorized version of the instruction, i.e. whether it is beneficial to
4858 // perform an intrinsic call instead of a library call.
4859 bool NeedToScalarize = false;
4860 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4861 bool UseVectorIntrinsic =
4862 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4863 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4864 "Instruction should be scalarized elsewhere.");
4865
4866 for (unsigned Part = 0; Part < UF; ++Part) {
4867 SmallVector<Value *, 4> Args;
4868 for (auto &I : enumerate(ArgOperands.operands())) {
4869 // Some intrinsics have a scalar argument - don't replace it with a
4870 // vector.
4871 Value *Arg;
4872 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4873 Arg = State.get(I.value(), Part);
4874 else
4875 Arg = State.get(I.value(), {0, 0});
4876 Args.push_back(Arg);
4877 }
4878
4879 Function *VectorF;
4880 if (UseVectorIntrinsic) {
4881 // Use vector version of the intrinsic.
4882 Type *TysForDecl[] = {CI->getType()};
4883 if (VF.isVector()) {
4884 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4885 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4886 }
4887 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4888 assert(VectorF && "Can't retrieve vector intrinsic.");
4889 } else {
4890 // Use vector version of the function call.
4891 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4892 #ifndef NDEBUG
4893 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4894 "Can't create vector function.");
4895 #endif
4896 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4897 }
4898 SmallVector<OperandBundleDef, 1> OpBundles;
4899 CI->getOperandBundlesAsDefs(OpBundles);
4900 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4901
4902 if (isa<FPMathOperator>(V))
4903 V->copyFastMathFlags(CI);
4904
4905 State.set(Def, &I, V, Part);
4906 addMetadata(V, &I);
4907 }
4908 }
4909
4910 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4911 VPUser &Operands,
4912 bool InvariantCond,
4913 VPTransformState &State) {
4914 setDebugLocFromInst(Builder, &I);
4915
4916 // The condition can be loop invariant but still defined inside the
4917 // loop. This means that we can't just use the original 'cond' value.
4918 // We have to take the 'vectorized' value and pick the first lane.
4919 // Instcombine will make this a no-op.
4920 auto *InvarCond =
4921 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4922
4923 for (unsigned Part = 0; Part < UF; ++Part) {
4924 Value *Cond =
4925 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4926 Value *Op0 = State.get(Operands.getOperand(1), Part);
4927 Value *Op1 = State.get(Operands.getOperand(2), Part);
4928 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4929 State.set(VPDef, &I, Sel, Part);
4930 addMetadata(Sel, &I);
4931 }
4932 }
4933
4934 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4935 // We should not collect Scalars more than once per VF. Right now, this
4936 // function is called from collectUniformsAndScalars(), which already does
4937 // this check. Collecting Scalars for VF=1 does not make any sense.
4938 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4939 "This function should not be visited twice for the same VF");
4940
4941 SmallSetVector<Instruction *, 8> Worklist;
4942
4943 // These sets are used to seed the analysis with pointers used by memory
4944 // accesses that will remain scalar.
4945 SmallSetVector<Instruction *, 8> ScalarPtrs;
4946 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4947 auto *Latch = TheLoop->getLoopLatch();
4948
4949 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4950 // The pointer operands of loads and stores will be scalar as long as the
4951 // memory access is not a gather or scatter operation. The value operand of a
4952 // store will remain scalar if the store is scalarized.
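// For example (a sketch), for a consecutive load of a[i] that is widened into
// a single vector load, the address computation feeding its pointer operand
// stays scalar, whereas the pointer feeding a gather would need a vector of
// addresses and is therefore not treated as a scalar use.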
4953 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4954 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4955 assert(WideningDecision != CM_Unknown &&
4956 "Widening decision should be ready at this moment");
4957 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4958 if (Ptr == Store->getValueOperand())
4959 return WideningDecision == CM_Scalarize;
4960 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4961 "Ptr is neither a value or pointer operand");
4962 return WideningDecision != CM_GatherScatter;
4963 };
4964
4965 // A helper that returns true if the given value is a bitcast or
4966 // getelementptr instruction contained in the loop.
4967 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4968 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4969 isa<GetElementPtrInst>(V)) &&
4970 !TheLoop->isLoopInvariant(V);
4971 };
4972
4973 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4974 if (!isa<PHINode>(Ptr) ||
4975 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4976 return false;
4977 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4978 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4979 return false;
4980 return isScalarUse(MemAccess, Ptr);
4981 };
4982
4983 // A helper that evaluates a memory access's use of a pointer. If the
4984 // pointer is actually the pointer induction of a loop, it is inserted
4985 // into the Worklist. If the use will be a scalar use, and the
4986 // pointer is only used by memory accesses, we place the pointer in
4987 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4988 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4989 if (isScalarPtrInduction(MemAccess, Ptr)) {
4990 Worklist.insert(cast<Instruction>(Ptr));
4991 Instruction *Update = cast<Instruction>(
4992 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4993 Worklist.insert(Update);
4994 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4995 << "\n");
4996 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4997 << "\n");
4998 return;
4999 }
5000 // We only care about bitcast and getelementptr instructions contained in
5001 // the loop.
5002 if (!isLoopVaryingBitCastOrGEP(Ptr))
5003 return;
5004
5005 // If the pointer has already been identified as scalar (e.g., if it was
5006 // also identified as uniform), there's nothing to do.
5007 auto *I = cast<Instruction>(Ptr);
5008 if (Worklist.count(I))
5009 return;
5010
5011 // If the use of the pointer will be a scalar use, and all users of the
5012 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5013 // place the pointer in PossibleNonScalarPtrs.
5014 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5015 return isa<LoadInst>(U) || isa<StoreInst>(U);
5016 }))
5017 ScalarPtrs.insert(I);
5018 else
5019 PossibleNonScalarPtrs.insert(I);
5020 };
5021
5022 // We seed the scalars analysis with two classes of instructions: (1)
5023 // instructions marked uniform-after-vectorization and (2) bitcast,
5024 // getelementptr and (pointer) phi instructions used by memory accesses
5025 // requiring a scalar use.
5026 //
5027 // (1) Add to the worklist all instructions that have been identified as
5028 // uniform-after-vectorization.
5029 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5030
5031 // (2) Add to the worklist all bitcast and getelementptr instructions used by
5032 // memory accesses requiring a scalar use. The pointer operands of loads and
5033 // stores will be scalar as long as the memory access is not a gather or
5034 // scatter operation. The value operand of a store will remain scalar if the
5035 // store is scalarized.
5036 for (auto *BB : TheLoop->blocks())
5037 for (auto &I : *BB) {
5038 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5039 evaluatePtrUse(Load, Load->getPointerOperand());
5040 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5041 evaluatePtrUse(Store, Store->getPointerOperand());
5042 evaluatePtrUse(Store, Store->getValueOperand());
5043 }
5044 }
5045 for (auto *I : ScalarPtrs)
5046 if (!PossibleNonScalarPtrs.count(I)) {
5047 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5048 Worklist.insert(I);
5049 }
5050
5051 // Insert the forced scalars.
5052 // FIXME: Currently widenPHIInstruction() often creates a dead vector
5053 // induction variable when the PHI user is scalarized.
5054 auto ForcedScalar = ForcedScalars.find(VF);
5055 if (ForcedScalar != ForcedScalars.end())
5056 for (auto *I : ForcedScalar->second)
5057 Worklist.insert(I);
5058
5059 // Expand the worklist by looking through any bitcasts and getelementptr
5060 // instructions we've already identified as scalar. This is similar to the
5061 // expansion step in collectLoopUniforms(); however, here we're only
5062 // expanding to include additional bitcasts and getelementptr instructions.
5063 unsigned Idx = 0;
5064 while (Idx != Worklist.size()) {
5065 Instruction *Dst = Worklist[Idx++];
5066 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5067 continue;
5068 auto *Src = cast<Instruction>(Dst->getOperand(0));
5069 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5070 auto *J = cast<Instruction>(U);
5071 return !TheLoop->contains(J) || Worklist.count(J) ||
5072 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5073 isScalarUse(J, Src));
5074 })) {
5075 Worklist.insert(Src);
5076 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5077 }
5078 }
5079
5080 // An induction variable will remain scalar if all users of the induction
5081 // variable and induction variable update remain scalar.
5082 for (auto &Induction : Legal->getInductionVars()) {
5083 auto *Ind = Induction.first;
5084 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5085
5086 // If tail-folding is applied, the primary induction variable will be used
5087 // to feed a vector compare.
5088 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5089 continue;
5090
5091 // Determine if all users of the induction variable are scalar after
5092 // vectorization.
5093 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5094 auto *I = cast<Instruction>(U);
5095 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5096 });
5097 if (!ScalarInd)
5098 continue;
5099
5100 // Determine if all users of the induction variable update instruction are
5101 // scalar after vectorization.
5102 auto ScalarIndUpdate =
5103 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5104 auto *I = cast<Instruction>(U);
5105 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5106 });
5107 if (!ScalarIndUpdate)
5108 continue;
5109
5110 // The induction variable and its update instruction will remain scalar.
5111 Worklist.insert(Ind);
5112 Worklist.insert(IndUpdate);
5113 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5114 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5115 << "\n");
5116 }
5117
5118 Scalars[VF].insert(Worklist.begin(), Worklist.end());
5119 }
5120
5121 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5122 ElementCount VF) {
5123 if (!blockNeedsPredication(I->getParent()))
5124 return false;
5125 switch(I->getOpcode()) {
5126 default:
5127 break;
5128 case Instruction::Load:
5129 case Instruction::Store: {
5130 if (!Legal->isMaskRequired(I))
5131 return false;
5132 auto *Ptr = getLoadStorePointerOperand(I);
5133 auto *Ty = getMemInstValueType(I);
5134 // We have already decided how to vectorize this instruction, get that
5135 // result.
5136 if (VF.isVector()) {
5137 InstWidening WideningDecision = getWideningDecision(I, VF);
5138 assert(WideningDecision != CM_Unknown &&
5139 "Widening decision should be ready at this moment");
5140 return WideningDecision == CM_Scalarize;
5141 }
5142 const Align Alignment = getLoadStoreAlignment(I);
5143 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5144 isLegalMaskedGather(Ty, Alignment))
5145 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5146 isLegalMaskedScatter(Ty, Alignment));
5147 }
5148 case Instruction::UDiv:
5149 case Instruction::SDiv:
5150 case Instruction::SRem:
5151 case Instruction::URem:
5152 return mayDivideByZero(*I);
5153 }
5154 return false;
5155 }
5156
5157 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5158 Instruction *I, ElementCount VF) {
5159 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5160 assert(getWideningDecision(I, VF) == CM_Unknown &&
5161 "Decision should not be set yet.");
5162 auto *Group = getInterleavedAccessGroup(I);
5163 assert(Group && "Must have a group.");
5164
5165 // If the instruction's allocated size doesn't equal its type size, it
5166 // requires padding and will be scalarized.
5167 auto &DL = I->getModule()->getDataLayout();
5168 auto *ScalarTy = getMemInstValueType(I);
5169 if (hasIrregularType(ScalarTy, DL, VF))
5170 return false;
5171
5172 // Check if masking is required.
5173 // A Group may need masking for one of two reasons: it resides in a block that
5174 // needs predication, or it was decided to use masking to deal with gaps.
5175 bool PredicatedAccessRequiresMasking =
5176 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5177 bool AccessWithGapsRequiresMasking =
5178 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5179 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5180 return true;
5181
5182 // If masked interleaving is required, we expect that the user/target had
5183 // enabled it, because otherwise it either wouldn't have been created or
5184 // it should have been invalidated by the CostModel.
5185 assert(useMaskedInterleavedAccesses(TTI) &&
5186 "Masked interleave-groups for predicated accesses are not enabled.");
5187
5188 auto *Ty = getMemInstValueType(I);
5189 const Align Alignment = getLoadStoreAlignment(I);
5190 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5191 : TTI.isLegalMaskedStore(Ty, Alignment);
5192 }
5193
5194 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5195 Instruction *I, ElementCount VF) {
5196 // Get and ensure we have a valid memory instruction.
5197 LoadInst *LI = dyn_cast<LoadInst>(I);
5198 StoreInst *SI = dyn_cast<StoreInst>(I);
5199 assert((LI || SI) && "Invalid memory instruction");
5200
5201 auto *Ptr = getLoadStorePointerOperand(I);
5202
5203 // First of all, in order to be widened the pointer has to be consecutive.
5204 if (!Legal->isConsecutivePtr(Ptr))
5205 return false;
5206
5207 // If the instruction is a store located in a predicated block, it will be
5208 // scalarized.
5209 if (isScalarWithPredication(I))
5210 return false;
5211
5212 // If the instruction's allocated size doesn't equal its type size, it
5213 // requires padding and will be scalarized.
5214 auto &DL = I->getModule()->getDataLayout();
5215 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5216 if (hasIrregularType(ScalarTy, DL, VF))
5217 return false;
5218
5219 return true;
5220 }
5221
5222 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5223 // We should not collect Uniforms more than once per VF. Right now,
5224 // this function is called from collectUniformsAndScalars(), which
5225 // already does this check. Collecting Uniforms for VF=1 does not make any
5226 // sense.
5227
5228 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5229 "This function should not be visited twice for the same VF");
5230
5231 // Visit the list of Uniforms. Even if we find no uniform value, creating the
5232 // entry ensures we will not analyze this VF again: Uniforms.count(VF) returns 1.
5233 Uniforms[VF].clear();
5234
5235 // We now know that the loop is vectorizable!
5236 // Collect instructions inside the loop that will remain uniform after
5237 // vectorization.
5238
5239 // Global values, params and instructions outside of current loop are out of
5240 // scope.
5241 auto isOutOfScope = [&](Value *V) -> bool {
5242 Instruction *I = dyn_cast<Instruction>(V);
5243 return (!I || !TheLoop->contains(I));
5244 };
5245
5246 SetVector<Instruction *> Worklist;
5247 BasicBlock *Latch = TheLoop->getLoopLatch();
5248
5249 // Instructions that are scalar with predication must not be considered
5250 // uniform after vectorization, because that would create an erroneous
5251 // replicating region where only a single instance out of VF should be formed.
5252 // TODO: optimize such seldom cases if found important, see PR40816.
5253 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5254 if (isOutOfScope(I)) {
5255 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5256 << *I << "\n");
5257 return;
5258 }
5259 if (isScalarWithPredication(I, VF)) {
5260 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5261 << *I << "\n");
5262 return;
5263 }
5264 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5265 Worklist.insert(I);
5266 };
5267
5268 // Start with the conditional branch. If the branch condition is an
5269 // instruction contained in the loop that is only used by the branch, it is
5270 // uniform.
5271 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5272 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5273 addToWorklistIfAllowed(Cmp);
5274
5275 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5276 InstWidening WideningDecision = getWideningDecision(I, VF);
5277 assert(WideningDecision != CM_Unknown &&
5278 "Widening decision should be ready at this moment");
5279
5280 // A uniform memory op is itself uniform. We exclude uniform stores
5281 // here as they demand the last lane, not the first one.
5282 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5283 assert(WideningDecision == CM_Scalarize);
5284 return true;
5285 }
5286
5287 return (WideningDecision == CM_Widen ||
5288 WideningDecision == CM_Widen_Reverse ||
5289 WideningDecision == CM_Interleave);
5290 };
5291
5292
5293 // Returns true if Ptr is the pointer operand of a memory access instruction
5294 // I, and I is known to not require scalarization.
5295 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5296 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5297 };
5298
5299 // Holds a list of values which are known to have at least one uniform use.
5300 // Note that there may be other uses which aren't uniform. A "uniform use"
5301 // here is something which only demands lane 0 of the unrolled iterations;
5302 // it does not imply that all lanes produce the same value (e.g. this is not
5303 // the usual meaning of uniform)
5304 SmallPtrSet<Value *, 8> HasUniformUse;
5305
5306 // Scan the loop for instructions which are either a) known to have only
5307 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5308 for (auto *BB : TheLoop->blocks())
5309 for (auto &I : *BB) {
5310 // If there's no pointer operand, there's nothing to do.
5311 auto *Ptr = getLoadStorePointerOperand(&I);
5312 if (!Ptr)
5313 continue;
5314
5315 // A uniform memory op is itself uniform. We exclude uniform stores
5316 // here as they demand the last lane, not the first one.
5317 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5318 addToWorklistIfAllowed(&I);
5319
5320 if (isUniformDecision(&I, VF)) {
5321 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5322 HasUniformUse.insert(Ptr);
5323 }
5324 }
5325
5326 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5327 // demanding) users. Since loops are assumed to be in LCSSA form, this
5328 // disallows uses outside the loop as well.
5329 for (auto *V : HasUniformUse) {
5330 if (isOutOfScope(V))
5331 continue;
5332 auto *I = cast<Instruction>(V);
5333 auto UsersAreMemAccesses =
5334 llvm::all_of(I->users(), [&](User *U) -> bool {
5335 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5336 });
5337 if (UsersAreMemAccesses)
5338 addToWorklistIfAllowed(I);
5339 }
5340
5341 // Expand Worklist in topological order: whenever a new instruction
5342 // is added, its users should already be inside Worklist. This ensures
5343 // that a uniform instruction will only be used by uniform instructions.
5344 unsigned idx = 0;
5345 while (idx != Worklist.size()) {
5346 Instruction *I = Worklist[idx++];
5347
5348 for (auto OV : I->operand_values()) {
5349 // isOutOfScope operands cannot be uniform instructions.
5350 if (isOutOfScope(OV))
5351 continue;
5352 // First-order recurrence phis should typically be considered
5353 // non-uniform.
5354 auto *OP = dyn_cast<PHINode>(OV);
5355 if (OP && Legal->isFirstOrderRecurrence(OP))
5356 continue;
5357 // If all the users of the operand are uniform, then add the
5358 // operand into the uniform worklist.
5359 auto *OI = cast<Instruction>(OV);
5360 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5361 auto *J = cast<Instruction>(U);
5362 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5363 }))
5364 addToWorklistIfAllowed(OI);
5365 }
5366 }
5367
5368 // For an instruction to be added into Worklist above, all its users inside
5369 // the loop should also be in Worklist. However, this condition cannot be
5370 // true for phi nodes that form a cyclic dependence. We must process phi
5371 // nodes separately. An induction variable will remain uniform if all users
5372 // of the induction variable and induction variable update remain uniform.
5373 // The code below handles both pointer and non-pointer induction variables.
5374 for (auto &Induction : Legal->getInductionVars()) {
5375 auto *Ind = Induction.first;
5376 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5377
5378 // Determine if all users of the induction variable are uniform after
5379 // vectorization.
5380 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5381 auto *I = cast<Instruction>(U);
5382 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5383 isVectorizedMemAccessUse(I, Ind);
5384 });
5385 if (!UniformInd)
5386 continue;
5387
5388 // Determine if all users of the induction variable update instruction are
5389 // uniform after vectorization.
5390 auto UniformIndUpdate =
5391 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5392 auto *I = cast<Instruction>(U);
5393 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5394 isVectorizedMemAccessUse(I, IndUpdate);
5395 });
5396 if (!UniformIndUpdate)
5397 continue;
5398
5399 // The induction variable and its update instruction will remain uniform.
5400 addToWorklistIfAllowed(Ind);
5401 addToWorklistIfAllowed(IndUpdate);
5402 }
5403
5404 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5405 }
5406
5407 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5408 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5409
5410 if (Legal->getRuntimePointerChecking()->Need) {
5411 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5412 "runtime pointer checks needed. Enable vectorization of this "
5413 "loop with '#pragma clang loop vectorize(enable)' when "
5414 "compiling with -Os/-Oz",
5415 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5416 return true;
5417 }
5418
5419 if (!PSE.getUnionPredicate().getPredicates().empty()) {
5420 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5421 "runtime SCEV checks needed. Enable vectorization of this "
5422 "loop with '#pragma clang loop vectorize(enable)' when "
5423 "compiling with -Os/-Oz",
5424 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5425 return true;
5426 }
5427
5428 // FIXME: Avoid specializing for stride==1 instead of bailing out.
5429 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5430 reportVectorizationFailure("Runtime stride check for small trip count",
5431 "runtime stride == 1 checks needed. Enable vectorization of "
5432 "this loop without such check by compiling with -Os/-Oz",
5433 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5434 return true;
5435 }
5436
5437 return false;
5438 }
5439
5440 Optional<ElementCount>
5441 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5442 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5443 // TODO: It may be useful to insert the check anyway, since it is still likely
5444 // to be dynamically uniform if the target can skip it.
5445 reportVectorizationFailure(
5446 "Not inserting runtime ptr check for divergent target",
5447 "runtime pointer checks needed. Not enabled for divergent target",
5448 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5449 return None;
5450 }
5451
5452 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5453 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5454 if (TC == 1) {
5455 reportVectorizationFailure("Single iteration (non) loop",
5456 "loop trip count is one, irrelevant for vectorization",
5457 "SingleIterationLoop", ORE, TheLoop);
5458 return None;
5459 }
5460
5461 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5462
5463 switch (ScalarEpilogueStatus) {
5464 case CM_ScalarEpilogueAllowed:
5465 return MaxVF;
5466 case CM_ScalarEpilogueNotNeededUsePredicate:
5467 LLVM_DEBUG(
5468 dbgs() << "LV: vector predicate hint/switch found.\n"
5469 << "LV: Not allowing scalar epilogue, creating predicated "
5470 << "vector loop.\n");
5471 break;
5472 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5473 // fallthrough as a special case of OptForSize
5474 case CM_ScalarEpilogueNotAllowedOptSize:
5475 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5476 LLVM_DEBUG(
5477 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5478 else
5479 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5480 << "count.\n");
5481
5482 // Bail if runtime checks are required, which are not good when optimising
5483 // for size.
5484 if (runtimeChecksRequired())
5485 return None;
5486 break;
5487 }
5488
5489 // Now try the tail folding
5490
5491 // Invalidate interleave groups that require an epilogue if we can't mask
5492 // the interleave-group.
5493 if (!useMaskedInterleavedAccesses(TTI)) {
5494 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5495 "No decisions should have been taken at this point");
5496 // Note: There is no need to invalidate any cost modeling decisions here, as
5497 // none were taken so far.
5498 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5499 }
5500
5501 assert(!MaxVF.isScalable() &&
5502 "Scalable vectors do not yet support tail folding");
5503 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5504 "MaxVF must be a power of 2");
5505 unsigned MaxVFtimesIC =
5506 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
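// Illustrative example (numbers are hypothetical, not from any target): with
// a known trip count TC = 128, MaxVF = 8 and a user interleave count of 4,
// MaxVFtimesIC = 32 and 128 % 32 == 0, so no tail remains and MaxVF is
// accepted below without tail folding.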
5507 if (TC > 0 && TC % MaxVFtimesIC == 0) {
5508 // Accept MaxVF if we do not have a tail.
5509 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5510 return MaxVF;
5511 }
5512
5513 // If we don't know the precise trip count, or if the trip count that we
5514 // found modulo the vectorization factor is not zero, try to fold the tail
5515 // by masking.
5516 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5517 if (Legal->prepareToFoldTailByMasking()) {
5518 FoldTailByMasking = true;
5519 return MaxVF;
5520 }
5521
5522 // If there was a tail-folding hint/switch, but we can't fold the tail by
5523 // masking, fall back to vectorization with a scalar epilogue.
5524 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5525 if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
5526 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5527 return None;
5528 }
5529 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5530 "scalar epilogue instead.\n");
5531 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5532 return MaxVF;
5533 }
5534
5535 if (TC == 0) {
5536 reportVectorizationFailure(
5537 "Unable to calculate the loop count due to complex control flow",
5538 "unable to calculate the loop count due to complex control flow",
5539 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5540 return None;
5541 }
5542
5543 reportVectorizationFailure(
5544 "Cannot optimize for size and vectorize at the same time.",
5545 "cannot optimize for size and vectorize at the same time. "
5546 "Enable vectorization of this loop with '#pragma clang loop "
5547 "vectorize(enable)' when compiling with -Os/-Oz",
5548 "NoTailLoopWithOptForSize", ORE, TheLoop);
5549 return None;
5550 }
5551
5552 ElementCount
5553 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5554 ElementCount UserVF) {
5555 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5556 unsigned SmallestType, WidestType;
5557 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5558 unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5559
5560 // Get the maximum safe dependence distance in bits computed by LAA.
5561 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5562 // the memory access that is most restrictive (involved in the smallest
5563 // dependence distance).
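// Purely illustrative example: if LAA determines that at most 8 elements of
// the most restrictive i32 access may be processed per iteration, the bound
// is 8 * 4 * 8 = 256 bits, i.e. vectors wider than <8 x i32> are unsafe.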
5564 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5565
5566 if (UserVF.isNonZero()) {
5567 // For now, don't verify legality of scalable vectors.
5568 // This will be addressed properly in https://reviews.llvm.org/D91718.
5569 if (UserVF.isScalable())
5570 return UserVF;
5571
5572 // If legally unsafe, clamp the user vectorization factor to a safe value.
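// Illustrative example: with MaxSafeVectorWidthInBits = 256 and a widest
// element type of 32 bits, MaxSafeVF = PowerOf2Floor(256 / 32) = 8, so a
// user-requested fixed VF of 16 would be clamped to 8 here.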
5573 unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5574 if (UserVF.getFixedValue() <= MaxSafeVF)
5575 return UserVF;
5576
5577 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5578 << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5579 << ".\n");
5580 ORE->emit([&]() {
5581 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5582 TheLoop->getStartLoc(),
5583 TheLoop->getHeader())
5584 << "User-specified vectorization factor "
5585 << ore::NV("UserVectorizationFactor", UserVF)
5586 << " is unsafe, clamping to maximum safe vectorization factor "
5587 << ore::NV("VectorizationFactor", MaxSafeVF);
5588 });
5589 return ElementCount::getFixed(MaxSafeVF);
5590 }
5591
5592 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5593
5594 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5595 // Note that both WidestRegister and WidestType may not be powers of 2.
5596 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5597
5598 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5599 << " / " << WidestType << " bits.\n");
5600 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5601 << WidestRegister << " bits.\n");
5602
5603 assert(MaxVectorSize <= WidestRegister &&
5604 "Did not expect to pack so many elements"
5605 " into one vector!");
5606 if (MaxVectorSize == 0) {
5607 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5608 MaxVectorSize = 1;
5609 return ElementCount::getFixed(MaxVectorSize);
5610 } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5611 isPowerOf2_32(ConstTripCount)) {
5612 // We need to clamp the VF to be the ConstTripCount. There is no point in
5613 // choosing a higher viable VF as done in the loop below.
5614 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5615 << ConstTripCount << "\n");
5616 MaxVectorSize = ConstTripCount;
5617 return ElementCount::getFixed(MaxVectorSize);
5618 }
5619
5620 unsigned MaxVF = MaxVectorSize;
5621 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5622 (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5623 // Collect all viable vectorization factors larger than the default MaxVF
5624 // (i.e. MaxVectorSize).
5625 SmallVector<ElementCount, 8> VFs;
5626 unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5627 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5628 VFs.push_back(ElementCount::getFixed(VS));
5629
5630 // For each VF calculate its register usage.
5631 auto RUs = calculateRegisterUsage(VFs);
5632
5633 // Select the largest VF which doesn't require more registers than existing
5634 // ones.
5635 for (int i = RUs.size() - 1; i >= 0; --i) {
5636 bool Selected = true;
5637 for (auto& pair : RUs[i].MaxLocalUsers) {
5638 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5639 if (pair.second > TargetNumRegisters)
5640 Selected = false;
5641 }
5642 if (Selected) {
5643 MaxVF = VFs[i].getKnownMinValue();
5644 break;
5645 }
5646 }
5647 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5648 if (MaxVF < MinVF) {
5649 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5650 << ") with target's minimum: " << MinVF << '\n');
5651 MaxVF = MinVF;
5652 }
5653 }
5654 }
5655 return ElementCount::getFixed(MaxVF);
5656 }
5657
5658 VectorizationFactor
5659 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5660 // FIXME: This can be fixed for scalable vectors later, because at this stage
5661 // the LoopVectorizer will only consider vectorizing a loop with scalable
5662 // vectors when the loop has a hint to enable vectorization for a given VF.
5663 assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5664
5665 float Cost = expectedCost(ElementCount::getFixed(1)).first;
5666 const float ScalarCost = Cost;
5667 unsigned Width = 1;
5668 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5669
5670 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5671 if (ForceVectorization && MaxVF.isVector()) {
5672 // Ignore scalar width, because the user explicitly wants vectorization.
5673 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5674 // evaluation.
5675 Cost = std::numeric_limits<float>::max();
5676 }
5677
5678 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
5679 // Notice that the vector loop needs to be executed fewer times, so
5680 // we need to divide the cost of the vector loop by the width of
5681 // the vector elements.
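// Illustrative example: if the scalar loop costs 8 per iteration and the
// VF = 4 vector body costs 20, the normalized vector cost is 20 / 4 = 5,
// which compares favorably against the scalar cost of 8.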
5682 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5683 float VectorCost = C.first / (float)i;
5684 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5685 << " costs: " << (int)VectorCost << ".\n");
5686 if (!C.second && !ForceVectorization) {
5687 LLVM_DEBUG(
5688 dbgs() << "LV: Not considering vector loop of width " << i
5689 << " because it will not generate any vector instructions.\n");
5690 continue;
5691 }
5692
5693 // If profitable, add it to the ProfitableVFs list.
5694 if (VectorCost < ScalarCost) {
5695 ProfitableVFs.push_back(VectorizationFactor(
5696 {ElementCount::getFixed(i), (unsigned)VectorCost}));
5697 }
5698
5699 if (VectorCost < Cost) {
5700 Cost = VectorCost;
5701 Width = i;
5702 }
5703 }
5704
5705 if (!EnableCondStoresVectorization && NumPredStores) {
5706 reportVectorizationFailure("There are conditional stores.",
5707 "store that is conditionally executed prevents vectorization",
5708 "ConditionalStore", ORE, TheLoop);
5709 Width = 1;
5710 Cost = ScalarCost;
5711 }
5712
5713 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5714 << "LV: Vectorization seems to be not beneficial, "
5715 << "but was forced by a user.\n");
5716 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5717 VectorizationFactor Factor = {ElementCount::getFixed(Width),
5718 (unsigned)(Width * Cost)};
5719 return Factor;
5720 }
5721
5722 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5723 const Loop &L, ElementCount VF) const {
5724 // Cross iteration phis such as reductions need special handling and are
5725 // currently unsupported.
5726 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5727 return Legal->isFirstOrderRecurrence(&Phi) ||
5728 Legal->isReductionVariable(&Phi);
5729 }))
5730 return false;
5731
5732 // Phis with uses outside of the loop require special handling and are
5733 // currently unsupported.
5734 for (auto &Entry : Legal->getInductionVars()) {
5735 // Look for uses of the value of the induction at the last iteration.
5736 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5737 for (User *U : PostInc->users())
5738 if (!L.contains(cast<Instruction>(U)))
5739 return false;
5740 // Look for uses of penultimate value of the induction.
5741 for (User *U : Entry.first->users())
5742 if (!L.contains(cast<Instruction>(U)))
5743 return false;
5744 }
5745
5746 // Induction variables that are widened require special handling that is
5747 // currently not supported.
5748 if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5749 return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5750 this->isProfitableToScalarize(Entry.first, VF));
5751 }))
5752 return false;
5753
5754 return true;
5755 }
5756
5757 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5758 const ElementCount VF) const {
5759 // FIXME: We need a much better cost-model to take different parameters such
5760 // as register pressure, code size increase and cost of extra branches into
5761 // account. For now we apply a very crude heuristic and only consider loops
5762 // with vectorization factors larger than a certain value.
5763 // We also consider epilogue vectorization unprofitable for targets that don't
5764 // consider interleaving beneficial (e.g. MVE).
5765 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5766 return false;
5767 if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5768 return true;
5769 return false;
5770 }
5771
5772 VectorizationFactor
5773 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5774 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5775 VectorizationFactor Result = VectorizationFactor::Disabled();
5776 if (!EnableEpilogueVectorization) {
5777 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5778 return Result;
5779 }
5780
5781 if (!isScalarEpilogueAllowed()) {
5782 LLVM_DEBUG(
5783 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5784 "allowed.\n";);
5785 return Result;
5786 }
5787
5788 // Not really a cost consideration, but check for unsupported cases here to
5789 // simplify the logic.
5790 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5791 LLVM_DEBUG(
5792 dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5793 "not a supported candidate.\n";);
5794 return Result;
5795 }
5796
5797 if (EpilogueVectorizationForceVF > 1) {
5798 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5799 if (LVP.hasPlanWithVFs(
5800 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5801 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5802 else {
5803 LLVM_DEBUG(
5804 dbgs()
5805 << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5806 return Result;
5807 }
5808 }
5809
5810 if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5811 TheLoop->getHeader()->getParent()->hasMinSize()) {
5812 LLVM_DEBUG(
5813 dbgs()
5814 << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5815 return Result;
5816 }
5817
5818 if (!isEpilogueVectorizationProfitable(MainLoopVF))
5819 return Result;
5820
5821 for (auto &NextVF : ProfitableVFs)
5822 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
5823 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
5824 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
5825 Result = NextVF;
5826
5827 if (Result != VectorizationFactor::Disabled())
5828 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5829 << Result.Width.getFixedValue() << "\n";);
5830 return Result;
5831 }
5832
5833 std::pair<unsigned, unsigned>
5834 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5835 unsigned MinWidth = -1U;
5836 unsigned MaxWidth = 8;
5837 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5838
5839 // For each block.
5840 for (BasicBlock *BB : TheLoop->blocks()) {
5841 // For each instruction in the loop.
5842 for (Instruction &I : BB->instructionsWithoutDebug()) {
5843 Type *T = I.getType();
5844
5845 // Skip ignored values.
5846 if (ValuesToIgnore.count(&I))
5847 continue;
5848
5849 // Only examine Loads, Stores and PHINodes.
5850 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5851 continue;
5852
5853 // Examine PHI nodes that are reduction variables. Update the type to
5854 // account for the recurrence type.
5855 if (auto *PN = dyn_cast<PHINode>(&I)) {
5856 if (!Legal->isReductionVariable(PN))
5857 continue;
5858 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5859 T = RdxDesc.getRecurrenceType();
5860 }
5861
5862 // Examine the stored values.
5863 if (auto *ST = dyn_cast<StoreInst>(&I))
5864 T = ST->getValueOperand()->getType();
5865
5866 // Ignore loaded pointer types and stored pointer types that are not
5867 // vectorizable.
5868 //
5869 // FIXME: The check here attempts to predict whether a load or store will
5870 // be vectorized. We only know this for certain after a VF has
5871 // been selected. Here, we assume that if an access can be
5872 // vectorized, it will be. We should also look at extending this
5873 // optimization to non-pointer types.
5874 //
5875 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5876 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5877 continue;
5878
5879 MinWidth = std::min(MinWidth,
5880 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5881 MaxWidth = std::max(MaxWidth,
5882 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5883 }
5884 }
5885
5886 return {MinWidth, MaxWidth};
5887 }
5888
5889 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5890 unsigned LoopCost) {
5891 // -- The interleave heuristics --
5892 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5893 // There are many micro-architectural considerations that we can't predict
5894 // at this level. For example, frontend pressure (on decode or fetch) due to
5895 // code size, or the number and capabilities of the execution ports.
5896 //
5897 // We use the following heuristics to select the interleave count:
5898 // 1. If the code has reductions, then we interleave to break the cross
5899 // iteration dependency.
5900 // 2. If the loop is really small, then we interleave to reduce the loop
5901 // overhead.
5902 // 3. We don't interleave if we think that we will spill registers to memory
5903 // due to the increased register pressure.
5904
5905 if (!isScalarEpilogueAllowed())
5906 return 1;
5907
5908 // We already used the maximum safe dependence distance to limit the VF, so
5908 // do not interleave.
5909 if (Legal->getMaxSafeDepDistBytes() != -1U)
5910 return 1;
5911
5912 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5913 const bool HasReductions = !Legal->getReductionVars().empty();
5914 // Do not interleave loops with a relatively small known or estimated trip
5915 // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5916 // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5917 // because under those conditions interleaving can expose ILP and break
5918 // cross-iteration dependences for reductions.
5919 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5920 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5921 return 1;
5922
5923 RegisterUsage R = calculateRegisterUsage({VF})[0];
5924 // We divide by these constants so assume that we have at least one
5925 // instruction that uses at least one register.
5926 for (auto& pair : R.MaxLocalUsers) {
5927 pair.second = std::max(pair.second, 1U);
5928 }
5929
5930 // We calculate the interleave count using the following formula.
5931 // Subtract the number of loop invariants from the number of available
5932 // registers. These registers are used by all of the interleaved instances.
5933 // Next, divide the remaining registers by the number of registers that is
5934 // required by the loop, in order to estimate how many parallel instances
5935 // fit without causing spills. All of this is rounded down if necessary to be
5936 // a power of two. We want power of two interleave count to simplify any
5937 // addressing operations or alignment considerations.
5938 // We also want power of two interleave counts to ensure that the induction
5939 // variable of the vector loop wraps to zero, when tail is folded by masking;
5940 // this currently happens when OptForSize, in which case IC is set to 1 above.
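// Illustrative example (register counts are hypothetical, not taken from any
// real target): with 32 registers in a class, 2 loop-invariant values, a
// maximum of 7 local users and the induction-variable heuristic enabled,
// IC = PowerOf2Floor((32 - 2 - 1) / (7 - 1)) = PowerOf2Floor(4) = 4.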
5941 unsigned IC = UINT_MAX;
5942
5943 for (auto& pair : R.MaxLocalUsers) {
5944 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5945 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5946 << " registers of "
5947 << TTI.getRegisterClassName(pair.first) << " register class\n");
5948 if (VF.isScalar()) {
5949 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5950 TargetNumRegisters = ForceTargetNumScalarRegs;
5951 } else {
5952 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5953 TargetNumRegisters = ForceTargetNumVectorRegs;
5954 }
5955 unsigned MaxLocalUsers = pair.second;
5956 unsigned LoopInvariantRegs = 0;
5957 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5958 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5959
5960 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5961 // Don't count the induction variable as interleaved.
5962 if (EnableIndVarRegisterHeur) {
5963 TmpIC =
5964 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5965 std::max(1U, (MaxLocalUsers - 1)));
5966 }
5967
5968 IC = std::min(IC, TmpIC);
5969 }
5970
5971 // Clamp the interleave ranges to reasonable counts.
5972 unsigned MaxInterleaveCount =
5973 TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5974
5975 // Check if the user has overridden the max.
5976 if (VF.isScalar()) {
5977 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5978 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5979 } else {
5980 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5981 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5982 }
5983
5984 // If the trip count is a known or estimated compile-time constant, limit the
5985 // interleave count to at most the trip count divided by VF, provided the
5986 // result is at least 1.
5987 //
5988 // For scalable vectors we can't know if interleaving is beneficial. It may
5989 // not be beneficial for small loops if none of the lanes in the second vector
5990 // iteration is enabled. However, for larger loops, there is likely to be a
5991 // similar benefit as for fixed-width vectors. For now, we choose to leave
5992 // the InterleaveCount as if vscale is '1', although if some information about
5993 // the vector is known (e.g. min vector size), we can make a better decision.
5994 if (BestKnownTC) {
5995 MaxInterleaveCount =
5996 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5997 // Make sure MaxInterleaveCount is greater than 0.
5998 MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
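// Illustrative example: with an estimated trip count of 24 and VF = 8,
// MaxInterleaveCount becomes min(24 / 8, MaxInterleaveCount), i.e. at most
// 3, and is then raised back to at least 1 if the division rounded to 0.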
5999 }
6000
6001 assert(MaxInterleaveCount > 0 &&
6002 "Maximum interleave count must be greater than 0");
6003
6004 // Clamp the calculated IC to be between the 1 and the max interleave count
6005 // that the target and trip count allows.
6006 if (IC > MaxInterleaveCount)
6007 IC = MaxInterleaveCount;
6008 else
6009 // Make sure IC is greater than 0.
6010 IC = std::max(1u, IC);
6011
6012 assert(IC > 0 && "Interleave count must be greater than 0.");
6013
6014 // If we did not calculate the cost for VF (because the user selected the VF)
6015 // then we calculate the cost of VF here.
6016 if (LoopCost == 0)
6017 LoopCost = expectedCost(VF).first;
6018
6019 assert(LoopCost && "Non-zero loop cost expected");
6020
6021 // Interleave if we vectorized this loop and there is a reduction that could
6022 // benefit from interleaving.
6023 if (VF.isVector() && HasReductions) {
6024 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6025 return IC;
6026 }
6027
6028 // Note that if we've already vectorized the loop we will have done the
6029 // runtime check and so interleaving won't require further checks.
6030 bool InterleavingRequiresRuntimePointerCheck =
6031 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6032
6033 // We want to interleave small loops in order to reduce the loop overhead and
6034 // potentially expose ILP opportunities.
6035 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6036 << "LV: IC is " << IC << '\n'
6037 << "LV: VF is " << VF << '\n');
6038 const bool AggressivelyInterleaveReductions =
6039 TTI.enableAggressiveInterleaving(HasReductions);
6040 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6041 // We assume that the cost overhead is 1 and we use the cost model
6042 // to estimate the cost of the loop and interleave until the cost of the
6043 // loop overhead is about 5% of the cost of the loop.
6044 unsigned SmallIC =
6045 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
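// Illustrative example (assuming the default SmallLoopCost threshold of 20):
// with LoopCost = 5 and IC = 8, SmallIC = min(8, PowerOf2Floor(20 / 5)) = 4.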
6046
6047 // Interleave until store/load ports (estimated by max interleave count) are
6048 // saturated.
6049 unsigned NumStores = Legal->getNumStores();
6050 unsigned NumLoads = Legal->getNumLoads();
6051 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6052 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
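// Illustrative example: with IC = 8, 2 stores and 4 loads in the loop,
// StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 4 = 2, so by this estimate an
// interleave count of 4 already saturates the store ports.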
6053
6054 // If we have a scalar reduction (vector reductions are already dealt with
6055 // by this point), we can increase the critical path length if the loop
6056 // we're interleaving is inside another loop. Limit, by default, to 2, so the
6057 // critical path only gets increased by one reduction operation.
6058 if (HasReductions && TheLoop->getLoopDepth() > 1) {
6059 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6060 SmallIC = std::min(SmallIC, F);
6061 StoresIC = std::min(StoresIC, F);
6062 LoadsIC = std::min(LoadsIC, F);
6063 }
6064
6065 if (EnableLoadStoreRuntimeInterleave &&
6066 std::max(StoresIC, LoadsIC) > SmallIC) {
6067 LLVM_DEBUG(
6068 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6069 return std::max(StoresIC, LoadsIC);
6070 }
6071
6072 // If there are scalar reductions and TTI has enabled aggressive
6073 // interleaving for reductions, we will interleave to expose ILP.
6074 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6075 AggressivelyInterleaveReductions) {
6076 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6077 // Interleave no less than SmallIC but not as aggressive as the normal IC
6078 // to satisfy the rare situation when resources are too limited.
6079 return std::max(IC / 2, SmallIC);
6080 } else {
6081 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6082 return SmallIC;
6083 }
6084 }
6085
6086 // Interleave if this is a large loop (small loops are already dealt with by
6087 // this point) that could benefit from interleaving.
6088 if (AggressivelyInterleaveReductions) {
6089 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6090 return IC;
6091 }
6092
6093 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6094 return 1;
6095 }
6096
6097 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6098 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6099 // This function calculates the register usage by measuring the highest number
6100 // of values that are alive at a single location. Obviously, this is a very
6101 // rough estimation. We scan the loop in topological order and
6102 // assign a number to each instruction. We use RPO to ensure that defs are
6103 // met before their users. We assume that each instruction that has in-loop
6104 // users starts an interval. We record every time that an in-loop value is
6105 // used, so we have a list of the first and last occurrences of each
6106 // instruction. Next, we transpose this data structure into a multi map that
6107 // holds the list of intervals that *end* at a specific location. This multi
6108 // map allows us to perform a linear search. We scan the instructions linearly
6109 // and record each time that a new interval starts, by placing it in a set.
6110 // If we find this value in the multi-map then we remove it from the set.
6111 // The max register usage is the maximum size of the set.
6112 // We also search for instructions that are defined outside the loop, but are
6113 // used inside the loop. We need this number separately from the max-interval
6114 // usage number because when we unroll, loop-invariant values do not take
6115 // more registers.
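// Illustrative example: for a body of the form a = load; b = add a, 1;
// c = mul a, b; store c, both 'a' and 'b' are still live when 'c' is
// visited, so the estimated usage for that register class peaks at 2.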
6116 LoopBlocksDFS DFS(TheLoop);
6117 DFS.perform(LI);
6118
6119 RegisterUsage RU;
6120
6121 // Each 'key' in the map opens a new interval. The values
6122 // of the map are the index of the 'last seen' usage of the
6123 // instruction that is the key.
6124 using IntervalMap = DenseMap<Instruction *, unsigned>;
6125
6126 // Maps instruction to its index.
6127 SmallVector<Instruction *, 64> IdxToInstr;
6128 // Marks the end of each interval.
6129 IntervalMap EndPoint;
6130 // Saves the set of instructions that are used in the loop.
6131 SmallPtrSet<Instruction *, 8> Ends;
6132 // Saves the list of values that are used in the loop but are
6133 // defined outside the loop, such as arguments and constants.
6134 SmallPtrSet<Value *, 8> LoopInvariants;
6135
6136 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6137 for (Instruction &I : BB->instructionsWithoutDebug()) {
6138 IdxToInstr.push_back(&I);
6139
6140 // Save the end location of each USE.
6141 for (Value *U : I.operands()) {
6142 auto *Instr = dyn_cast<Instruction>(U);
6143
6144 // Ignore non-instruction values such as arguments, constants, etc.
6145 if (!Instr)
6146 continue;
6147
6148 // If this instruction is outside the loop then record it and continue.
6149 if (!TheLoop->contains(Instr)) {
6150 LoopInvariants.insert(Instr);
6151 continue;
6152 }
6153
6154 // Overwrite previous end points.
6155 EndPoint[Instr] = IdxToInstr.size();
6156 Ends.insert(Instr);
6157 }
6158 }
6159 }
6160
6161 // Saves the list of intervals that end with the index in 'key'.
6162 using InstrList = SmallVector<Instruction *, 2>;
6163 DenseMap<unsigned, InstrList> TransposeEnds;
6164
6165 // Transpose the EndPoints to a list of values that end at each index.
6166 for (auto &Interval : EndPoint)
6167 TransposeEnds[Interval.second].push_back(Interval.first);
6168
6169 SmallPtrSet<Instruction *, 8> OpenIntervals;
6170 SmallVector<RegisterUsage, 8> RUs(VFs.size());
6171 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6172
6173 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6174
6175 // A lambda that gets the register usage for the given type and VF.
6176 const auto &TTICapture = TTI;
6177 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6178 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6179 return 0U;
6180 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6181 };
6182
6183 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6184 Instruction *I = IdxToInstr[i];
6185
6186 // Remove all of the instructions that end at this location.
6187 InstrList &List = TransposeEnds[i];
6188 for (Instruction *ToRemove : List)
6189 OpenIntervals.erase(ToRemove);
6190
6191 // Ignore instructions that are never used within the loop.
6192 if (!Ends.count(I))
6193 continue;
6194
6195 // Skip ignored values.
6196 if (ValuesToIgnore.count(I))
6197 continue;
6198
6199 // For each VF find the maximum usage of registers.
6200 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6201 // Count the number of live intervals.
6202 SmallMapVector<unsigned, unsigned, 4> RegUsage;
6203
6204 if (VFs[j].isScalar()) {
6205 for (auto Inst : OpenIntervals) {
6206 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6207 if (RegUsage.find(ClassID) == RegUsage.end())
6208 RegUsage[ClassID] = 1;
6209 else
6210 RegUsage[ClassID] += 1;
6211 }
6212 } else {
6213 collectUniformsAndScalars(VFs[j]);
6214 for (auto Inst : OpenIntervals) {
6215 // Skip ignored values for VF > 1.
6216 if (VecValuesToIgnore.count(Inst))
6217 continue;
6218 if (isScalarAfterVectorization(Inst, VFs[j])) {
6219 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6220 if (RegUsage.find(ClassID) == RegUsage.end())
6221 RegUsage[ClassID] = 1;
6222 else
6223 RegUsage[ClassID] += 1;
6224 } else {
6225 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6226 if (RegUsage.find(ClassID) == RegUsage.end())
6227 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6228 else
6229 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6230 }
6231 }
6232 }
6233
6234 for (auto& pair : RegUsage) {
6235 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6236 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6237 else
6238 MaxUsages[j][pair.first] = pair.second;
6239 }
6240 }
6241
6242 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6243 << OpenIntervals.size() << '\n');
6244
6245 // Add the current instruction to the list of open intervals.
6246 OpenIntervals.insert(I);
6247 }
6248
6249 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6250 SmallMapVector<unsigned, unsigned, 4> Invariant;
6251
6252 for (auto Inst : LoopInvariants) {
6253 unsigned Usage =
6254 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6255 unsigned ClassID =
6256 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6257 if (Invariant.find(ClassID) == Invariant.end())
6258 Invariant[ClassID] = Usage;
6259 else
6260 Invariant[ClassID] += Usage;
6261 }
6262
6263 LLVM_DEBUG({
6264 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6265 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6266 << " item\n";
6267 for (const auto &pair : MaxUsages[i]) {
6268 dbgs() << "LV(REG): RegisterClass: "
6269 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6270 << " registers\n";
6271 }
6272 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6273 << " item\n";
6274 for (const auto &pair : Invariant) {
6275 dbgs() << "LV(REG): RegisterClass: "
6276 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6277 << " registers\n";
6278 }
6279 });
6280
6281 RU.LoopInvariantRegs = Invariant;
6282 RU.MaxLocalUsers = MaxUsages[i];
6283 RUs[i] = RU;
6284 }
6285
6286 return RUs;
6287 }
6288
6289 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6290 // TODO: Cost model for emulated masked load/store is completely
6291 // broken. This hack guides the cost model to use an artificially
6292 // high enough value to practically disable vectorization with such
6293 // operations, except where previously deployed legality hack allowed
6294 // using very low cost values. This is to avoid regressions coming simply
6295 // from moving "masked load/store" check from legality to cost model.
6296 // Masked Load/Gather emulation was previously never allowed.
6297 // Limited number of Masked Store/Scatter emulation was allowed.
6298 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6299 return isa<LoadInst>(I) ||
6300 (isa<StoreInst>(I) &&
6301 NumPredStores > NumberOfStoresToPredicate);
6302 }
6303
6304 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6305 // If we aren't vectorizing the loop, or if we've already collected the
6306 // instructions to scalarize, there's nothing to do. Collection may already
6307 // have occurred if we have a user-selected VF and are now computing the
6308 // expected cost for interleaving.
6309 if (VF.isScalar() || VF.isZero() ||
6310 InstsToScalarize.find(VF) != InstsToScalarize.end())
6311 return;
6312
6313 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6314 // not profitable to scalarize any instructions, the presence of VF in the
6315 // map will indicate that we've analyzed it already.
6316 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6317
6318 // Find all the instructions that are scalar with predication in the loop and
6319 // determine if it would be better to not if-convert the blocks they are in.
6320 // If so, we also record the instructions to scalarize.
6321 for (BasicBlock *BB : TheLoop->blocks()) {
6322 if (!blockNeedsPredication(BB))
6323 continue;
6324 for (Instruction &I : *BB)
6325 if (isScalarWithPredication(&I)) {
6326 ScalarCostsTy ScalarCosts;
6327 // Do not apply discount logic if hacked cost is needed
6328 // for emulated masked memrefs.
6329 if (!useEmulatedMaskMemRefHack(&I) &&
6330 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6331 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6332 // Remember that BB will remain after vectorization.
6333 PredicatedBBsAfterVectorization.insert(BB);
6334 }
6335 }
6336 }
6337
6338 int LoopVectorizationCostModel::computePredInstDiscount(
6339 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6340 ElementCount VF) {
6341 assert(!isUniformAfterVectorization(PredInst, VF) &&
6342 "Instruction marked uniform-after-vectorization will be predicated");
6343
6344 // Initialize the discount to zero, meaning that the scalar version and the
6345 // vector version cost the same.
6346 int Discount = 0;
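// Illustrative example: if the vectorized form of the chain feeding PredInst
// costs 10 and the scalarized form (scaled by block probability) costs 6,
// the discount is 4; a non-negative discount tells the caller that
// scalarizing the chain is expected to pay off.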
6347
6348 // Holds instructions to analyze. The instructions we visit are mapped in
6349 // ScalarCosts. Those instructions are the ones that would be scalarized if
6350 // we find that the scalar version costs less.
6351 SmallVector<Instruction *, 8> Worklist;
6352
6353 // Returns true if the given instruction can be scalarized.
6354 auto canBeScalarized = [&](Instruction *I) -> bool {
6355 // We only attempt to scalarize instructions forming a single-use chain
6356 // from the original predicated block that would otherwise be vectorized.
6357 // Although not strictly necessary, we give up on instructions we know will
6358 // already be scalar to avoid traversing chains that are unlikely to be
6359 // beneficial.
6360 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6361 isScalarAfterVectorization(I, VF))
6362 return false;
6363
6364 // If the instruction is scalar with predication, it will be analyzed
6365 // separately. We ignore it within the context of PredInst.
6366 if (isScalarWithPredication(I))
6367 return false;
6368
6369 // If any of the instruction's operands are uniform after vectorization,
6370 // the instruction cannot be scalarized. This prevents, for example, a
6371 // masked load from being scalarized.
6372 //
6373 // We assume we will only emit a value for lane zero of an instruction
6374 // marked uniform after vectorization, rather than VF identical values.
6375 // Thus, if we scalarize an instruction that uses a uniform, we would
6376 // create uses of values corresponding to the lanes we aren't emitting code
6377 // for. This behavior can be changed by allowing getScalarValue to clone
6378 // the lane zero values for uniforms rather than asserting.
6379 for (Use &U : I->operands())
6380 if (auto *J = dyn_cast<Instruction>(U.get()))
6381 if (isUniformAfterVectorization(J, VF))
6382 return false;
6383
6384 // Otherwise, we can scalarize the instruction.
6385 return true;
6386 };
6387
6388 // Compute the expected cost discount from scalarizing the entire expression
6389 // feeding the predicated instruction. We currently only consider expressions
6390 // that are single-use instruction chains.
6391 Worklist.push_back(PredInst);
6392 while (!Worklist.empty()) {
6393 Instruction *I = Worklist.pop_back_val();
6394
6395 // If we've already analyzed the instruction, there's nothing to do.
6396 if (ScalarCosts.find(I) != ScalarCosts.end())
6397 continue;
6398
6399 // Compute the cost of the vector instruction. Note that this cost already
6400 // includes the scalarization overhead of the predicated instruction.
6401 unsigned VectorCost = getInstructionCost(I, VF).first;
6402
6403 // Compute the cost of the scalarized instruction. This cost is the cost of
6404 // the instruction as if it wasn't if-converted and instead remained in the
6405 // predicated block. We will scale this cost by block probability after
6406 // computing the scalarization overhead.
6407 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6408 unsigned ScalarCost =
6409 VF.getKnownMinValue() *
6410 getInstructionCost(I, ElementCount::getFixed(1)).first;
6411
6412 // Compute the scalarization overhead of needed insertelement instructions
6413 // and phi nodes.
6414 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6415 ScalarCost += TTI.getScalarizationOverhead(
6416 cast<VectorType>(ToVectorTy(I->getType(), VF)),
6417 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6418 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6419 ScalarCost +=
6420 VF.getKnownMinValue() *
6421 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6422 }
6423
6424 // Compute the scalarization overhead of needed extractelement
6425 // instructions. For each of the instruction's operands, if the operand can
6426 // be scalarized, add it to the worklist; otherwise, account for the
6427 // overhead.
6428 for (Use &U : I->operands())
6429 if (auto *J = dyn_cast<Instruction>(U.get())) {
6430 assert(VectorType::isValidElementType(J->getType()) &&
6431 "Instruction has non-scalar type");
6432 if (canBeScalarized(J))
6433 Worklist.push_back(J);
6434 else if (needsExtract(J, VF)) {
6435 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6436 ScalarCost += TTI.getScalarizationOverhead(
6437 cast<VectorType>(ToVectorTy(J->getType(), VF)),
6438 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6439 }
6440 }
6441
6442 // Scale the total scalar cost by block probability.
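// getReciprocalPredBlockProb() is the assumed reciprocal of the block's
// execution probability (2 in this model, i.e. a 50% chance that the
// predicated block runs), so the accumulated scalar cost is halved here.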
6443 ScalarCost /= getReciprocalPredBlockProb();
6444
6445 // Compute the discount. A non-negative discount means the vector version
6446 // of the instruction costs more, and scalarizing would be beneficial.
6447 Discount += VectorCost - ScalarCost;
6448 ScalarCosts[I] = ScalarCost;
6449 }
6450
6451 return Discount;
6452 }
6453
6454 LoopVectorizationCostModel::VectorizationCostTy
6455 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6456 VectorizationCostTy Cost;
6457
6458 // For each block.
6459 for (BasicBlock *BB : TheLoop->blocks()) {
6460 VectorizationCostTy BlockCost;
6461
6462 // For each instruction in the old loop.
6463 for (Instruction &I : BB->instructionsWithoutDebug()) {
6464 // Skip ignored values.
6465 if (ValuesToIgnore.count(&I) ||
6466 (VF.isVector() && VecValuesToIgnore.count(&I)))
6467 continue;
6468
6469 VectorizationCostTy C = getInstructionCost(&I, VF);
6470
6471 // Check if we should override the cost.
6472 if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6473 C.first = ForceTargetInstructionCost;
6474
6475 BlockCost.first += C.first;
6476 BlockCost.second |= C.second;
6477 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6478 << " for VF " << VF << " For instruction: " << I
6479 << '\n');
6480 }
6481
6482 // If we are vectorizing a predicated block, it will have been
6483 // if-converted. This means that the block's instructions (aside from
6484 // stores and instructions that may divide by zero) will now be
6485 // unconditionally executed. For the scalar case, we may not always execute
6486 // the predicated block. Thus, scale the block's cost by the probability of
6487 // executing it.
6488 if (VF.isScalar() && blockNeedsPredication(BB))
6489 BlockCost.first /= getReciprocalPredBlockProb();
6490
6491 Cost.first += BlockCost.first;
6492 Cost.second |= BlockCost.second;
6493 }
6494
6495 return Cost;
6496 }
6497
6498 /// Gets Address Access SCEV after verifying that the access pattern
6499 /// is loop invariant except the induction variable dependence.
6500 ///
6501 /// This SCEV can be sent to the Target in order to estimate the address
6502 /// calculation cost.
6503 static const SCEV *getAddressAccessSCEV(
6504 Value *Ptr,
6505 LoopVectorizationLegality *Legal,
6506 PredicatedScalarEvolution &PSE,
6507 const Loop *TheLoop) {
6508
6509 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6510 if (!Gep)
6511 return nullptr;
6512
6513 // We are looking for a gep with all loop invariant indices except for one
6514 // which should be an induction variable.
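// For example (hypothetical IR), for
//   %gep = getelementptr [100 x i32], [100 x i32]* %A, i64 %inv, i64 %iv
// where %inv is loop invariant and %iv is the induction variable, the SCEV
// of %gep is returned so the target can estimate the address computation.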
6515 auto SE = PSE.getSE();
6516 unsigned NumOperands = Gep->getNumOperands();
6517 for (unsigned i = 1; i < NumOperands; ++i) {
6518 Value *Opd = Gep->getOperand(i);
6519 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6520 !Legal->isInductionVariable(Opd))
6521 return nullptr;
6522 }
6523
6524 // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the Ptr SCEV.
6525 return PSE.getSCEV(Ptr);
6526 }
6527
6528 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6529 return Legal->hasStride(I->getOperand(0)) ||
6530 Legal->hasStride(I->getOperand(1));
6531 }
6532
6533 unsigned
6534 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6535 ElementCount VF) {
6536 assert(VF.isVector() &&
6537 "Scalarization cost of instruction implies vectorization.");
6538 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6539 Type *ValTy = getMemInstValueType(I);
6540 auto SE = PSE.getSE();
6541
6542 unsigned AS = getLoadStoreAddressSpace(I);
6543 Value *Ptr = getLoadStorePointerOperand(I);
6544 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6545
6546 // Figure out whether the access is strided and get the stride value
6547 // if it's known at compile time.
6548 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6549
6550 // Get the cost of the scalar memory instruction and address computation.
6551 unsigned Cost =
6552 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6553
6554 // Don't pass *I here, since it is scalar but will actually be part of a
6555 // vectorized loop where the user of it is a vectorized instruction.
6556 const Align Alignment = getLoadStoreAlignment(I);
6557 Cost += VF.getKnownMinValue() *
6558 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6559 AS, TTI::TCK_RecipThroughput);
6560
6561 // Get the overhead of the extractelement and insertelement instructions
6562 // we might create due to scalarization.
6563 Cost += getScalarizationOverhead(I, VF);
6564
6565 // If we have a predicated store, it may not be executed for each vector
6566 // lane. Scale the cost by the probability of executing the predicated
6567 // block.
6568 if (isPredicatedInst(I)) {
6569 Cost /= getReciprocalPredBlockProb();
6570
6571 if (useEmulatedMaskMemRefHack(I))
6572 // Artificially set the cost to a value high enough to practically disable
6573 // vectorization with such operations.
6574 Cost = 3000000;
6575 }
6576
6577 return Cost;
6578 }
6579
6580 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6581 ElementCount VF) {
6582 Type *ValTy = getMemInstValueType(I);
6583 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6584 Value *Ptr = getLoadStorePointerOperand(I);
6585 unsigned AS = getLoadStoreAddressSpace(I);
6586 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6587 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6588
6589 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6590 "Stride should be 1 or -1 for consecutive memory access");
6591 const Align Alignment = getLoadStoreAlignment(I);
6592 unsigned Cost = 0;
6593 if (Legal->isMaskRequired(I))
6594 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6595 CostKind);
6596 else
6597 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6598 CostKind, I);
6599
6600 bool Reverse = ConsecutiveStride < 0;
6601 if (Reverse)
6602 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6603 return Cost;
6604 }
6605
6606 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6607 ElementCount VF) {
6608 assert(Legal->isUniformMemOp(*I));
6609
6610 Type *ValTy = getMemInstValueType(I);
6611 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6612 const Align Alignment = getLoadStoreAlignment(I);
6613 unsigned AS = getLoadStoreAddressSpace(I);
6614 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6615 if (isa<LoadInst>(I)) {
6616 return TTI.getAddressComputationCost(ValTy) +
6617 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6618 CostKind) +
6619 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6620 }
6621 StoreInst *SI = cast<StoreInst>(I);
6622
6623 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
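// If the stored value varies per iteration, a uniform (scalarized) store
// only needs the value of the final vector lane, hence the single
// extractelement of element VF-1 accounted for below.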
6624 return TTI.getAddressComputationCost(ValTy) +
6625 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6626 CostKind) +
6627 (isLoopInvariantStoreValue
6628 ? 0
6629 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6630 VF.getKnownMinValue() - 1));
6631 }
6632
6633 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6634 ElementCount VF) {
6635 Type *ValTy = getMemInstValueType(I);
6636 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6637 const Align Alignment = getLoadStoreAlignment(I);
6638 const Value *Ptr = getLoadStorePointerOperand(I);
6639
6640 return TTI.getAddressComputationCost(VectorTy) +
6641 TTI.getGatherScatterOpCost(
6642 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6643 TargetTransformInfo::TCK_RecipThroughput, I);
6644 }
6645
6646 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6647 ElementCount VF) {
6648 Type *ValTy = getMemInstValueType(I);
6649 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6650 unsigned AS = getLoadStoreAddressSpace(I);
6651
6652 auto Group = getInterleavedAccessGroup(I);
6653 assert(Group && "Fail to get an interleaved access group.");
6654
6655 unsigned InterleaveFactor = Group->getFactor();
6656 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6657 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6658
6659 // Holds the indices of existing members in an interleaved load group.
6660 // An interleaved store group doesn't need this as it doesn't allow gaps.
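// For example (hypothetical), a load group with factor 4 and a missing
// member at position 2 yields Indices = {0, 1, 3}.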
6661 SmallVector<unsigned, 4> Indices;
6662 if (isa<LoadInst>(I)) {
6663 for (unsigned i = 0; i < InterleaveFactor; i++)
6664 if (Group->getMember(i))
6665 Indices.push_back(i);
6666 }
6667
6668 // Calculate the cost of the whole interleaved group.
6669 bool UseMaskForGaps =
6670 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6671 unsigned Cost = TTI.getInterleavedMemoryOpCost(
6672 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6673 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6674
6675 if (Group->isReverse()) {
6676 // TODO: Add support for reversed masked interleaved access.
6677 assert(!Legal->isMaskRequired(I) &&
6678 "Reverse masked interleaved access not supported.");
6679 Cost += Group->getNumMembers() *
6680 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6681 }
6682 return Cost;
6683 }
6684
6685 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6686 ElementCount VF) {
6687 // Calculate scalar cost only. Vectorization cost should be ready at this
6688 // moment.
6689 if (VF.isScalar()) {
6690 Type *ValTy = getMemInstValueType(I);
6691 const Align Alignment = getLoadStoreAlignment(I);
6692 unsigned AS = getLoadStoreAddressSpace(I);
6693
6694 return TTI.getAddressComputationCost(ValTy) +
6695 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6696 TTI::TCK_RecipThroughput, I);
6697 }
6698 return getWideningCost(I, VF);
6699 }
6700
6701 LoopVectorizationCostModel::VectorizationCostTy
6702 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6703 ElementCount VF) {
6704 // If we know that this instruction will remain uniform, check the cost of
6705 // the scalar version.
6706 if (isUniformAfterVectorization(I, VF))
6707 VF = ElementCount::getFixed(1);
6708
6709 if (VF.isVector() && isProfitableToScalarize(I, VF))
6710 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6711
6712 // Forced scalars do not have any scalarization overhead.
6713 auto ForcedScalar = ForcedScalars.find(VF);
6714 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6715 auto InstSet = ForcedScalar->second;
6716 if (InstSet.count(I))
6717 return VectorizationCostTy(
6718 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6719 VF.getKnownMinValue()),
6720 false);
6721 }
6722
6723 Type *VectorTy;
6724 unsigned C = getInstructionCost(I, VF, VectorTy);
6725
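// TypeNotScalarized is true when the target legalizes the vector type into
// fewer than VF parts, i.e. the type is genuinely vectorized rather than
// being split back into VF scalar pieces.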
6726 bool TypeNotScalarized =
6727 VF.isVector() && VectorTy->isVectorTy() &&
6728 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6729 return VectorizationCostTy(C, TypeNotScalarized);
6730 }
6731
6732 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6733 ElementCount VF) {
6734
6735 assert(!VF.isScalable() &&
6736 "cannot compute scalarization overhead for scalable vectorization");
6737 if (VF.isScalar())
6738 return 0;
6739
6740 unsigned Cost = 0;
6741 Type *RetTy = ToVectorTy(I->getType(), VF);
6742 if (!RetTy->isVoidTy() &&
6743 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6744 Cost += TTI.getScalarizationOverhead(
6745 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6746 true, false);
6747
6748 // Some targets keep addresses scalar.
6749 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6750 return Cost;
6751
6752 // Some targets support efficient element stores.
6753 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6754 return Cost;
6755
6756 // Collect operands to consider.
6757 CallInst *CI = dyn_cast<CallInst>(I);
6758 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6759
6760 // Skip operands that do not require extraction/scalarization and do not incur
6761 // any overhead.
6762 return Cost + TTI.getOperandsScalarizationOverhead(
6763 filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6764 }
6765
6766 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6767 if (VF.isScalar())
6768 return;
6769 NumPredStores = 0;
6770 for (BasicBlock *BB : TheLoop->blocks()) {
6771 // For each instruction in the old loop.
6772 for (Instruction &I : *BB) {
6773 Value *Ptr = getLoadStorePointerOperand(&I);
6774 if (!Ptr)
6775 continue;
6776
6777 // TODO: We should generate better code and update the cost model for
6778 // predicated uniform stores. Today they are treated as any other
6779 // predicated store (see added test cases in
6780 // invariant-store-vectorization.ll).
6781 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6782 NumPredStores++;
6783
6784 if (Legal->isUniformMemOp(I)) {
6785 // TODO: Avoid replicating loads and stores instead of
6786 // relying on instcombine to remove them.
6787 // Load: Scalar load + broadcast
6788 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6789 unsigned Cost = getUniformMemOpCost(&I, VF);
6790 setWideningDecision(&I, VF, CM_Scalarize, Cost);
6791 continue;
6792 }
6793
6794 // We assume that widening is the best solution when possible.
6795 if (memoryInstructionCanBeWidened(&I, VF)) {
6796 unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6797 int ConsecutiveStride =
6798 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6799 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6800 "Expected consecutive stride.");
6801 InstWidening Decision =
6802 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6803 setWideningDecision(&I, VF, Decision, Cost);
6804 continue;
6805 }
6806
6807 // Choose between Interleaving, Gather/Scatter or Scalarization.
6808 unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6809 unsigned NumAccesses = 1;
6810 if (isAccessInterleaved(&I)) {
6811 auto Group = getInterleavedAccessGroup(&I);
6812 assert(Group && "Fail to get an interleaved access group.");
6813
6814 // Make one decision for the whole group.
6815 if (getWideningDecision(&I, VF) != CM_Unknown)
6816 continue;
6817
6818 NumAccesses = Group->getNumMembers();
6819 if (interleavedAccessCanBeWidened(&I, VF))
6820 InterleaveCost = getInterleaveGroupCost(&I, VF);
6821 }
6822
6823 unsigned GatherScatterCost =
6824 isLegalGatherOrScatter(&I)
6825 ? getGatherScatterCost(&I, VF) * NumAccesses
6826 : std::numeric_limits<unsigned>::max();
6827
6828 unsigned ScalarizationCost =
6829 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6830
6831 // Choose better solution for the current VF,
6832 // write down this decision and use it during vectorization.
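// For example (illustrative costs only): with InterleaveCost = 6,
// GatherScatterCost = 10 and ScalarizationCost = 12, the decision below is
// CM_Interleave with Cost = 6.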
6833 unsigned Cost;
6834 InstWidening Decision;
6835 if (InterleaveCost <= GatherScatterCost &&
6836 InterleaveCost < ScalarizationCost) {
6837 Decision = CM_Interleave;
6838 Cost = InterleaveCost;
6839 } else if (GatherScatterCost < ScalarizationCost) {
6840 Decision = CM_GatherScatter;
6841 Cost = GatherScatterCost;
6842 } else {
6843 Decision = CM_Scalarize;
6844 Cost = ScalarizationCost;
6845 }
6846 // If the instruction belongs to an interleave group, the whole group
6847 // receives the same decision. The whole group receives the cost, but
6848 // the cost will actually be assigned to one instruction.
6849 if (auto Group = getInterleavedAccessGroup(&I))
6850 setWideningDecision(Group, VF, Decision, Cost);
6851 else
6852 setWideningDecision(&I, VF, Decision, Cost);
6853 }
6854 }
6855
6856 // Make sure that any load of address and any other address computation
6857 // remains scalar unless there is gather/scatter support. This avoids
6858 // inevitable extracts into address registers, and also has the benefit of
6859 // activating LSR more, since that pass can't optimize vectorized
6860 // addresses.
6861 if (TTI.prefersVectorizedAddressing())
6862 return;
6863
6864 // Start with all scalar pointer uses.
6865 SmallPtrSet<Instruction *, 8> AddrDefs;
6866 for (BasicBlock *BB : TheLoop->blocks())
6867 for (Instruction &I : *BB) {
6868 Instruction *PtrDef =
6869 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6870 if (PtrDef && TheLoop->contains(PtrDef) &&
6871 getWideningDecision(&I, VF) != CM_GatherScatter)
6872 AddrDefs.insert(PtrDef);
6873 }
6874
6875 // Add all instructions used to generate the addresses.
6876 SmallVector<Instruction *, 4> Worklist;
6877 for (auto *I : AddrDefs)
6878 Worklist.push_back(I);
6879 while (!Worklist.empty()) {
6880 Instruction *I = Worklist.pop_back_val();
6881 for (auto &Op : I->operands())
6882 if (auto *InstOp = dyn_cast<Instruction>(Op))
6883 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6884 AddrDefs.insert(InstOp).second)
6885 Worklist.push_back(InstOp);
6886 }
6887
6888 for (auto *I : AddrDefs) {
6889 if (isa<LoadInst>(I)) {
6890 // Setting the desired widening decision should ideally be handled by
6891 // cost functions, but since this involves the task of finding out
6892 // if the loaded register is involved in an address computation, it is
6893 // instead changed here when we know this is the case.
6894 InstWidening Decision = getWideningDecision(I, VF);
6895 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6896 // Scalarize a widened load of address.
6897 setWideningDecision(
6898 I, VF, CM_Scalarize,
6899 (VF.getKnownMinValue() *
6900 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6901 else if (auto Group = getInterleavedAccessGroup(I)) {
6902 // Scalarize an interleave group of address loads.
6903 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6904 if (Instruction *Member = Group->getMember(I))
6905 setWideningDecision(
6906 Member, VF, CM_Scalarize,
6907 (VF.getKnownMinValue() *
6908 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6909 }
6910 }
6911 } else
6912 // Make sure I gets scalarized and a cost estimate without
6913 // scalarization overhead.
6914 ForcedScalars[VF].insert(I);
6915 }
6916 }
6917
6918 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6919 ElementCount VF,
6920 Type *&VectorTy) {
6921 Type *RetTy = I->getType();
6922 if (canTruncateToMinimalBitwidth(I, VF))
6923 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6924 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6925 auto SE = PSE.getSE();
6926 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6927
6928 // TODO: We need to estimate the cost of intrinsic calls.
6929 switch (I->getOpcode()) {
6930 case Instruction::GetElementPtr:
6931 // We mark this instruction as zero-cost because the cost of GEPs in
6932 // vectorized code depends on whether the corresponding memory instruction
6933 // is scalarized or not. Therefore, we handle GEPs with the memory
6934 // instruction cost.
6935 return 0;
6936 case Instruction::Br: {
6937 // In cases of scalarized and predicated instructions, there will be VF
6938 // predicated blocks in the vectorized loop. Each branch around these
6939 // blocks also requires an extract of its vector compare i1 element.
6940 bool ScalarPredicatedBB = false;
6941 BranchInst *BI = cast<BranchInst>(I);
6942 if (VF.isVector() && BI->isConditional() &&
6943 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6944 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6945 ScalarPredicatedBB = true;
6946
6947 if (ScalarPredicatedBB) {
6948 // Return cost for branches around scalarized and predicated blocks.
6949 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6950 auto *Vec_i1Ty =
6951 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6952 return (TTI.getScalarizationOverhead(
6953 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6954 false, true) +
6955 (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6956 VF.getKnownMinValue()));
6957 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6958 // The back-edge branch will remain, as will all scalar branches.
6959 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6960 else
6961 // This branch will be eliminated by if-conversion.
6962 return 0;
6963 // Note: We currently assume zero cost for an unconditional branch inside
6964 // a predicated block since it will become a fall-through, although we
6965 // may decide in the future to call TTI for all branches.
6966 }
6967 case Instruction::PHI: {
6968 auto *Phi = cast<PHINode>(I);
6969
6970 // First-order recurrences are replaced by vector shuffles inside the loop.
6971 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6972 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6973 return TTI.getShuffleCost(
6974 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6975 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6976
6977 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6978 // converted into select instructions. We require N - 1 selects per phi
6979 // node, where N is the number of incoming values.
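// For example, a phi merging three incoming values is lowered to two
// vector selects.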
6980 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6981 return (Phi->getNumIncomingValues() - 1) *
6982 TTI.getCmpSelInstrCost(
6983 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6984 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6985 CmpInst::BAD_ICMP_PREDICATE, CostKind);
6986
6987 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6988 }
6989 case Instruction::UDiv:
6990 case Instruction::SDiv:
6991 case Instruction::URem:
6992 case Instruction::SRem:
6993 // If we have a predicated instruction, it may not be executed for each
6994 // vector lane. Get the scalarization cost and scale this amount by the
6995 // probability of executing the predicated block. If the instruction is not
6996 // predicated, we fall through to the next case.
6997 if (VF.isVector() && isScalarWithPredication(I)) {
6998 unsigned Cost = 0;
6999
7000 // These instructions have a non-void type, so account for the phi nodes
7001 // that we will create. This cost is likely to be zero. The phi node
7002 // cost, if any, should be scaled by the block probability because it
7003 // models a copy at the end of each predicated block.
7004 Cost += VF.getKnownMinValue() *
7005 TTI.getCFInstrCost(Instruction::PHI, CostKind);
7006
7007 // The cost of the non-predicated instruction.
7008 Cost += VF.getKnownMinValue() *
7009 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7010
7011 // The cost of insertelement and extractelement instructions needed for
7012 // scalarization.
7013 Cost += getScalarizationOverhead(I, VF);
7014
7015 // Scale the cost by the probability of executing the predicated blocks.
7016 // This assumes the predicated block for each vector lane is equally
7017 // likely.
7018 return Cost / getReciprocalPredBlockProb();
7019 }
7020 LLVM_FALLTHROUGH;
7021 case Instruction::Add:
7022 case Instruction::FAdd:
7023 case Instruction::Sub:
7024 case Instruction::FSub:
7025 case Instruction::Mul:
7026 case Instruction::FMul:
7027 case Instruction::FDiv:
7028 case Instruction::FRem:
7029 case Instruction::Shl:
7030 case Instruction::LShr:
7031 case Instruction::AShr:
7032 case Instruction::And:
7033 case Instruction::Or:
7034 case Instruction::Xor: {
7035 // Since we will replace the stride by 1 the multiplication should go away.
7036 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7037 return 0;
7038 // Certain instructions can be cheaper to vectorize if they have a constant
7039 // second vector operand. One example of this are shifts on x86.
7040 Value *Op2 = I->getOperand(1);
7041 TargetTransformInfo::OperandValueProperties Op2VP;
7042 TargetTransformInfo::OperandValueKind Op2VK =
7043 TTI.getOperandInfo(Op2, Op2VP);
7044 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7045 Op2VK = TargetTransformInfo::OK_UniformValue;
7046
7047 SmallVector<const Value *, 4> Operands(I->operand_values());
7048 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7049 return N * TTI.getArithmeticInstrCost(
7050 I->getOpcode(), VectorTy, CostKind,
7051 TargetTransformInfo::OK_AnyValue,
7052 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7053 }
7054 case Instruction::FNeg: {
7055 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7056 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7057 return N * TTI.getArithmeticInstrCost(
7058 I->getOpcode(), VectorTy, CostKind,
7059 TargetTransformInfo::OK_AnyValue,
7060 TargetTransformInfo::OK_AnyValue,
7061 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7062 I->getOperand(0), I);
7063 }
7064 case Instruction::Select: {
7065 SelectInst *SI = cast<SelectInst>(I);
7066 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7067 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7068 Type *CondTy = SI->getCondition()->getType();
7069 if (!ScalarCond) {
7070 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7071 CondTy = VectorType::get(CondTy, VF);
7072 }
7073 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7074 CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7075 }
7076 case Instruction::ICmp:
7077 case Instruction::FCmp: {
7078 Type *ValTy = I->getOperand(0)->getType();
7079 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7080 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7081 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7082 VectorTy = ToVectorTy(ValTy, VF);
7083 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7084 CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7085 }
7086 case Instruction::Store:
7087 case Instruction::Load: {
7088 ElementCount Width = VF;
7089 if (Width.isVector()) {
7090 InstWidening Decision = getWideningDecision(I, Width);
7091 assert(Decision != CM_Unknown &&
7092 "CM decision should be taken at this point");
7093 if (Decision == CM_Scalarize)
7094 Width = ElementCount::getFixed(1);
7095 }
7096 VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7097 return getMemoryInstructionCost(I, VF);
7098 }
7099 case Instruction::ZExt:
7100 case Instruction::SExt:
7101 case Instruction::FPToUI:
7102 case Instruction::FPToSI:
7103 case Instruction::FPExt:
7104 case Instruction::PtrToInt:
7105 case Instruction::IntToPtr:
7106 case Instruction::SIToFP:
7107 case Instruction::UIToFP:
7108 case Instruction::Trunc:
7109 case Instruction::FPTrunc:
7110 case Instruction::BitCast: {
7111 // Computes the CastContextHint from a Load/Store instruction.
7112 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7113 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7114 "Expected a load or a store!");
7115
7116 if (VF.isScalar() || !TheLoop->contains(I))
7117 return TTI::CastContextHint::Normal;
7118
7119 switch (getWideningDecision(I, VF)) {
7120 case LoopVectorizationCostModel::CM_GatherScatter:
7121 return TTI::CastContextHint::GatherScatter;
7122 case LoopVectorizationCostModel::CM_Interleave:
7123 return TTI::CastContextHint::Interleave;
7124 case LoopVectorizationCostModel::CM_Scalarize:
7125 case LoopVectorizationCostModel::CM_Widen:
7126 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7127 : TTI::CastContextHint::Normal;
7128 case LoopVectorizationCostModel::CM_Widen_Reverse:
7129 return TTI::CastContextHint::Reversed;
7130 case LoopVectorizationCostModel::CM_Unknown:
7131 llvm_unreachable("Instr did not go through cost modelling?");
7132 }
7133
7134 llvm_unreachable("Unhandled case!");
7135 };
7136
7137 unsigned Opcode = I->getOpcode();
7138 TTI::CastContextHint CCH = TTI::CastContextHint::None;
7139 // For Trunc, the context is the only user, which must be a StoreInst.
7140 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7141 if (I->hasOneUse())
7142 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7143 CCH = ComputeCCH(Store);
7144 }
7145 // For Z/Sext, the context is the operand, which must be a LoadInst.
7146 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7147 Opcode == Instruction::FPExt) {
7148 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7149 CCH = ComputeCCH(Load);
7150 }
7151
7152 // We optimize the truncation of induction variables having constant
7153 // integer steps. The cost of these truncations is the same as the scalar
7154 // operation.
7155 if (isOptimizableIVTruncate(I, VF)) {
7156 auto *Trunc = cast<TruncInst>(I);
7157 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7158 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7159 }
7160
7161 Type *SrcScalarTy = I->getOperand(0)->getType();
7162 Type *SrcVecTy =
7163 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7164 if (canTruncateToMinimalBitwidth(I, VF)) {
7165 // This cast is going to be shrunk. This may remove the cast or it might
7166 // turn it into a slightly different cast. For example, if MinBW == 16,
7167 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7168 //
7169 // Calculate the modified src and dest types.
7170 Type *MinVecTy = VectorTy;
7171 if (Opcode == Instruction::Trunc) {
7172 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7173 VectorTy =
7174 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7175 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7176 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7177 VectorTy =
7178 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7179 }
7180 }
7181
7182 assert(!VF.isScalable() && "VF is assumed to be non scalable");
7183 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7184 return N *
7185 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7186 }
7187 case Instruction::Call: {
7188 bool NeedToScalarize;
7189 CallInst *CI = cast<CallInst>(I);
7190 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7191 if (getVectorIntrinsicIDForCall(CI, TLI))
7192 return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
7193 return CallCost;
7194 }
7195 case Instruction::ExtractValue:
7196 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7197 default:
7198 // The cost of executing VF copies of the scalar instruction. This opcode
7199 // is unknown. Assume that it is the same as 'mul'.
7200 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7201 Instruction::Mul, VectorTy, CostKind) +
7202 getScalarizationOverhead(I, VF);
7203 } // end of switch.
7204 }
7205
7206 char LoopVectorize::ID = 0;
7207
7208 static const char lv_name[] = "Loop Vectorization";
7209
7210 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7211 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7212 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7213 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7214 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7215 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7216 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7217 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7218 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7219 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7220 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7221 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7222 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7223 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7224 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7225 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7226
7227 namespace llvm {
7228
7229 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7230
7231 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7232 bool VectorizeOnlyWhenForced) {
7233 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7234 }
7235
7236 } // end namespace llvm
7237
7238 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7239 // Check if the pointer operand of a load or store instruction is
7240 // consecutive.
7241 if (auto *Ptr = getLoadStorePointerOperand(Inst))
7242 return Legal->isConsecutivePtr(Ptr);
7243 return false;
7244 }
7245
7246 void LoopVectorizationCostModel::collectValuesToIgnore() {
7247 // Ignore ephemeral values.
7248 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7249
7250 // Ignore type-promoting instructions we identified during reduction
7251 // detection.
7252 for (auto &Reduction : Legal->getReductionVars()) {
7253 RecurrenceDescriptor &RedDes = Reduction.second;
7254 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7255 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7256 }
7257 // Ignore type-casting instructions we identified during induction
7258 // detection.
7259 for (auto &Induction : Legal->getInductionVars()) {
7260 InductionDescriptor &IndDes = Induction.second;
7261 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7262 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7263 }
7264 }
7265
7266 void LoopVectorizationCostModel::collectInLoopReductions() {
7267 for (auto &Reduction : Legal->getReductionVars()) {
7268 PHINode *Phi = Reduction.first;
7269 RecurrenceDescriptor &RdxDesc = Reduction.second;
7270
7271 // We don't collect reductions that are type promoted (yet).
7272 if (RdxDesc.getRecurrenceType() != Phi->getType())
7273 continue;
7274
7275 // If the target would prefer this reduction to happen "in-loop", then we
7276 // want to record it as such.
7277 unsigned Opcode = RdxDesc.getRecurrenceBinOp();
7278 if (!PreferInLoopReductions &&
7279 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7280 TargetTransformInfo::ReductionFlags()))
7281 continue;
7282
7283 // Check that we can correctly put the reductions into the loop, by
7284 // finding the chain of operations that leads from the phi to the loop
7285 // exit value.
7286 SmallVector<Instruction *, 4> ReductionOperations =
7287 RdxDesc.getReductionOpChain(Phi, TheLoop);
7288 bool InLoop = !ReductionOperations.empty();
7289 if (InLoop)
7290 InLoopReductionChains[Phi] = ReductionOperations;
7291 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7292 << " reduction for phi: " << *Phi << "\n");
7293 }
7294 }
7295
7296 // TODO: we could return a pair of values that specify the max VF and
7297 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7298 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7299 // doesn't have a cost model that can choose which plan to execute if
7300 // more than one is generated.
7301 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7302 LoopVectorizationCostModel &CM) {
7303 unsigned WidestType;
7304 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
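// For example (hypothetical), with 256-bit wide vector registers and a
// widest scalar type of 32 bits this returns a VF of 8.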
7305 return WidestVectorRegBits / WidestType;
7306 }
7307
7308 VectorizationFactor
7309 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7310 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7311 ElementCount VF = UserVF;
7312 // Outer loop handling: They may require CFG and instruction level
7313 // transformations before even evaluating whether vectorization is profitable.
7314 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7315 // the vectorization pipeline.
7316 if (!OrigLoop->isInnermost()) {
7317 // If the user doesn't provide a vectorization factor, determine a
7318 // reasonable one.
7319 if (UserVF.isZero()) {
7320 VF = ElementCount::getFixed(
7321 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7322 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7323
7324 // Make sure we have a VF > 1 for stress testing.
7325 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7326 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7327 << "overriding computed VF.\n");
7328 VF = ElementCount::getFixed(4);
7329 }
7330 }
7331 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7332 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7333 "VF needs to be a power of two");
7334 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7335 << "VF " << VF << " to build VPlans.\n");
7336 buildVPlans(VF, VF);
7337
7338 // For VPlan build stress testing, we bail out after VPlan construction.
7339 if (VPlanBuildStressTest)
7340 return VectorizationFactor::Disabled();
7341
7342 return {VF, 0 /*Cost*/};
7343 }
7344
7345 LLVM_DEBUG(
7346 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7347 "VPlan-native path.\n");
7348 return VectorizationFactor::Disabled();
7349 }
7350
7351 Optional<VectorizationFactor>
7352 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7353 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7354 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7355 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
7356 return None;
7357
7358 // Invalidate interleave groups if all blocks of loop will be predicated.
7359 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7360 !useMaskedInterleavedAccesses(*TTI)) {
7361 LLVM_DEBUG(
7362 dbgs()
7363 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7364 "which requires masked-interleaved support.\n");
7365 if (CM.InterleaveInfo.invalidateGroups())
7366 // Invalidating interleave groups also requires invalidating all decisions
7367 // based on them, which includes widening decisions and uniform and scalar
7368 // values.
7369 CM.invalidateCostModelingDecisions();
7370 }
7371
7372 ElementCount MaxVF = MaybeMaxVF.getValue();
7373 assert(MaxVF.isNonZero() && "MaxVF is zero.");
7374
7375 if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
7376 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7377 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7378 "VF needs to be a power of two");
7379 // Collect the instructions (and their associated costs) that will be more
7380 // profitable to scalarize.
7381 CM.selectUserVectorizationFactor(UserVF);
7382 CM.collectInLoopReductions();
7383 buildVPlansWithVPRecipes(UserVF, UserVF);
7384 LLVM_DEBUG(printPlans(dbgs()));
7385 return {{UserVF, 0}};
7386 }
7387
7388 assert(!MaxVF.isScalable() &&
7389 "Scalable vectors not yet supported beyond this point");
7390
7391 for (ElementCount VF = ElementCount::getFixed(1);
7392 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7393 // Collect Uniform and Scalar instructions after vectorization with VF.
7394 CM.collectUniformsAndScalars(VF);
7395
7396 // Collect the instructions (and their associated costs) that will be more
7397 // profitable to scalarize.
7398 if (VF.isVector())
7399 CM.collectInstsToScalarize(VF);
7400 }
7401
7402 CM.collectInLoopReductions();
7403
7404 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7405 LLVM_DEBUG(printPlans(dbgs()));
7406 if (MaxVF.isScalar())
7407 return VectorizationFactor::Disabled();
7408
7409 // Select the optimal vectorization factor.
7410 return CM.selectVectorizationFactor(MaxVF);
7411 }
7412
7413 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7414 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7415 << '\n');
7416 BestVF = VF;
7417 BestUF = UF;
7418
7419 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7420 return !Plan->hasVF(VF);
7421 });
7422 assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
7423 }
7424
7425 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7426 DominatorTree *DT) {
7427 // Perform the actual loop transformation.
7428
7429 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7430 VPCallbackILV CallbackILV(ILV);
7431
7432 assert(BestVF.hasValue() && "Vectorization Factor is missing");
7433
7434 VPTransformState State{*BestVF, BestUF, LI,
7435 DT, ILV.Builder, ILV.VectorLoopValueMap,
7436 &ILV, CallbackILV};
7437 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7438 State.TripCount = ILV.getOrCreateTripCount(nullptr);
7439 State.CanonicalIV = ILV.Induction;
7440
7441 ILV.printDebugTracesAtStart();
7442
7443 //===------------------------------------------------===//
7444 //
7445 // Notice: any optimization or new instruction that goes
7446 // into the code below should also be implemented in
7447 // the cost-model.
7448 //
7449 //===------------------------------------------------===//
7450
7451 // 2. Copy and widen instructions from the old loop into the new loop.
7452 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7453 VPlans.front()->execute(&State);
7454
7455 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7456 // predication, updating analyses.
7457 ILV.fixVectorizedLoop();
7458
7459 ILV.printDebugTracesAtEnd();
7460 }
7461
7462 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7463 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7464 BasicBlock *Latch = OrigLoop->getLoopLatch();
7465
7466 // We create new control-flow for the vectorized loop, so the original
7467 // condition will be dead after vectorization if it's only used by the
7468 // branch.
7469 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7470 if (Cmp && Cmp->hasOneUse()) {
7471 DeadInstructions.insert(Cmp);
7472
7473 // The operands of the icmp are often dead truncs, used by IndUpdate.
7474 for (Value *Op : Cmp->operands()) {
7475 if (isa<TruncInst>(Op) && Op->hasOneUse())
7476 DeadInstructions.insert(cast<Instruction>(Op));
7477 }
7478 }
7479
7480 // We create new "steps" for induction variable updates to which the original
7481 // induction variables map. An original update instruction will be dead if
7482 // all its users except the induction variable are dead.
7483 for (auto &Induction : Legal->getInductionVars()) {
7484 PHINode *Ind = Induction.first;
7485 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7486
7487 // If the tail is to be folded by masking, the primary induction variable,
7488 // if it exists, isn't dead: it will be used for masking. Don't kill it.
7489 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7490 continue;
7491
7492 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7493 return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7494 }))
7495 DeadInstructions.insert(IndUpdate);
7496
7497 // We record as "Dead" also the type-casting instructions we had identified
7498 // during induction analysis. We don't need any handling for them in the
7499 // vectorized loop because we have proven that, under a proper runtime
7500 // test guarding the vectorized loop, the value of the phi, and the casted
7501 // value of the phi, are the same. The last instruction in this casting chain
7502 // will get its scalar/vector/widened def from the scalar/vector/widened def
7503 // of the respective phi node. Any other casts in the induction def-use chain
7504 // have no other uses outside the phi update chain, and will be ignored.
7505 InductionDescriptor &IndDes = Induction.second;
7506 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7507 DeadInstructions.insert(Casts.begin(), Casts.end());
7508 }
7509 }
7510
7511 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7512
7513 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7514
7515 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7516 Instruction::BinaryOps BinOp) {
7517 // When unrolling and the VF is 1, we only need to add a simple scalar.
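// For example (illustrative), with StartIdx = 2 this returns the scalar
// value Val + 2 * Step, i.e. the induction value of the third unrolled part.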
7518 Type *Ty = Val->getType();
7519 assert(!Ty->isVectorTy() && "Val must be a scalar");
7520
7521 if (Ty->isFloatingPointTy()) {
7522 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7523
7524 // Floating point operations had to be 'fast' to enable the unrolling.
7525 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7526 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7527 }
7528 Constant *C = ConstantInt::get(Ty, StartIdx);
7529 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7530 }
7531
7532 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7533 SmallVector<Metadata *, 4> MDs;
7534 // Reserve first location for self reference to the LoopID metadata node.
7535 MDs.push_back(nullptr);
7536 bool IsUnrollMetadata = false;
7537 MDNode *LoopID = L->getLoopID();
7538 if (LoopID) {
7539 // First find existing loop unrolling disable metadata.
7540 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7541 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7542 if (MD) {
7543 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7544 IsUnrollMetadata =
7545 S && S->getString().startswith("llvm.loop.unroll.disable");
7546 }
7547 MDs.push_back(LoopID->getOperand(i));
7548 }
7549 }
7550
7551 if (!IsUnrollMetadata) {
7552 // Add runtime unroll disable metadata.
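// The resulting loop ID looks (schematically) like:
//   !N = distinct !{!N, <existing operands>, !M}
//   !M = !{!"llvm.loop.unroll.runtime.disable"}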
7553 LLVMContext &Context = L->getHeader()->getContext();
7554 SmallVector<Metadata *, 1> DisableOperands;
7555 DisableOperands.push_back(
7556 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7557 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7558 MDs.push_back(DisableNode);
7559 MDNode *NewLoopID = MDNode::get(Context, MDs);
7560 // Set operand 0 to refer to the loop id itself.
7561 NewLoopID->replaceOperandWith(0, NewLoopID);
7562 L->setLoopID(NewLoopID);
7563 }
7564 }
7565
7566 //===--------------------------------------------------------------------===//
7567 // EpilogueVectorizerMainLoop
7568 //===--------------------------------------------------------------------===//
7569
7570 /// This function is partially responsible for generating the control flow
7571 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7572 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7573 MDNode *OrigLoopID = OrigLoop->getLoopID();
7574 Loop *Lp = createVectorLoopSkeleton("");
7575
7576 // Generate the code to check the minimum iteration count of the vector
7577 // epilogue (see below).
7578 EPI.EpilogueIterationCountCheck =
7579 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7580 EPI.EpilogueIterationCountCheck->setName("iter.check");
7581
7582 // Generate the code to check any assumptions that we've made for SCEV
7583 // expressions.
7584 BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7585 emitSCEVChecks(Lp, LoopScalarPreHeader);
7586
7587 // If a safety check was generated, save it.
7588 if (SavedPreHeader != LoopVectorPreHeader)
7589 EPI.SCEVSafetyCheck = SavedPreHeader;
7590
7591 // Generate the code that checks at runtime if arrays overlap. We put the
7592 // checks into a separate block to make the more common case of few elements
7593 // faster.
7594 SavedPreHeader = LoopVectorPreHeader;
7595 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7596
7597 // If a safety check was generated, save/overwrite it.
7598 if (SavedPreHeader != LoopVectorPreHeader)
7599 EPI.MemSafetyCheck = SavedPreHeader;
7600
7601 // Generate the iteration count check for the main loop, *after* the check
7602 // for the epilogue loop, so that the path-length is shorter for the case
7603 // that goes directly through the vector epilogue. The longer-path length for
7604 // the main loop is compensated for by the gain from vectorizing the larger
7605 // trip count. Note: the branch will get updated later on when we vectorize
7606 // the epilogue.
7607 EPI.MainLoopIterationCountCheck =
7608 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7609
7610 // Generate the induction variable.
7611 OldInduction = Legal->getPrimaryInduction();
7612 Type *IdxTy = Legal->getWidestInductionType();
7613 Value *StartIdx = ConstantInt::get(IdxTy, 0);
7614 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7615 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7616 EPI.VectorTripCount = CountRoundDown;
7617 Induction =
7618 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7619 getDebugLocFromInstOrOperands(OldInduction));
7620
7621 // Skip induction resume value creation here because they will be created in
7622 // the second pass. If we created them here, they wouldn't be used anyway,
7623 // because the vplan in the second pass still contains the inductions from the
7624 // original loop.
7625
7626 return completeLoopSkeleton(Lp, OrigLoopID);
7627 }
7628
7629 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7630 LLVM_DEBUG({
7631 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7632 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7633 << ", Main Loop UF:" << EPI.MainLoopUF
7634 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7635 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7636 });
7637 }
7638
7639 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7640 DEBUG_WITH_TYPE(VerboseDebug, {
7641 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7642 });
7643 }
7644
7645 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7646 Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7647 assert(L && "Expected valid Loop.");
7648 assert(Bypass && "Expected valid bypass basic block.");
7649 unsigned VFactor =
7650 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7651 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7652 Value *Count = getOrCreateTripCount(L);
7653 // Reuse existing vector loop preheader for TC checks.
7654 // Note that new preheader block is generated for vector loop.
7655 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7656 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7657
7658 // Generate code to check if the loop's trip count is less than VF * UF of the
7659 // main vector loop.
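// For example (hypothetical), with a main loop VF of 8 and UF of 2 the
// bypass is taken when the trip count is below 16 (or not above 16 when a
// scalar epilogue is required, hence the ULE predicate below).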
7660 auto P =
7661 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7662
7663 Value *CheckMinIters = Builder.CreateICmp(
7664 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
7665 "min.iters.check");
7666
7667 if (!ForEpilogue)
7668 TCCheckBlock->setName("vector.main.loop.iter.check");
7669
7670 // Create new preheader for vector loop.
7671 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7672 DT, LI, nullptr, "vector.ph");
7673
7674 if (ForEpilogue) {
7675 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7676 DT->getNode(Bypass)->getIDom()) &&
7677 "TC check is expected to dominate Bypass");
7678
7679 // Update dominator for Bypass & LoopExit.
7680 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7681 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7682
7683 LoopBypassBlocks.push_back(TCCheckBlock);
7684
7685 // Save the trip count so we don't have to regenerate it in the
7686 // vec.epilog.iter.check. This is safe to do because the trip count
7687 // generated here dominates the vector epilog iter check.
7688 EPI.TripCount = Count;
7689 }
7690
7691 ReplaceInstWithInst(
7692 TCCheckBlock->getTerminator(),
7693 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7694
7695 return TCCheckBlock;
7696 }
7697
7698 //===--------------------------------------------------------------------===//
7699 // EpilogueVectorizerEpilogueLoop
7700 //===--------------------------------------------------------------------===//
7701
7702 /// This function is partially responsible for generating the control flow
7703 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
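/// A rough textual sketch of that control flow (simplified; the linked docs
/// have the authoritative diagram): the first pass emits a minimum iteration
/// count check and the main vector loop; the second pass emits another
/// minimum iteration count check and the epilogue vector loop. Either check
/// may branch past its loop, and all paths ultimately join at the scalar
/// (remainder) loop, which handles any iterations left over.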
7704 BasicBlock *
7705 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7706 MDNode *OrigLoopID = OrigLoop->getLoopID();
7707 Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7708
7709   // Now, compare the remaining count and, if there aren't enough iterations to
7710   // execute the vectorized epilogue, skip to the scalar part.
7711 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7712 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7713 LoopVectorPreHeader =
7714 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7715 LI, nullptr, "vec.epilog.ph");
7716 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7717 VecEpilogueIterationCountCheck);
7718
7719 // Adjust the control flow taking the state info from the main loop
7720 // vectorization into account.
7721 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7722 "expected this to be saved from the previous pass.");
7723 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7724 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7725
7726 DT->changeImmediateDominator(LoopVectorPreHeader,
7727 EPI.MainLoopIterationCountCheck);
7728
7729 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7730 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7731
7732 if (EPI.SCEVSafetyCheck)
7733 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7734 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7735 if (EPI.MemSafetyCheck)
7736 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7737 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7738
7739 DT->changeImmediateDominator(
7740 VecEpilogueIterationCountCheck,
7741 VecEpilogueIterationCountCheck->getSinglePredecessor());
7742
7743 DT->changeImmediateDominator(LoopScalarPreHeader,
7744 EPI.EpilogueIterationCountCheck);
7745 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
7746
7747 // Keep track of bypass blocks, as they feed start values to the induction
7748 // phis in the scalar loop preheader.
7749 if (EPI.SCEVSafetyCheck)
7750 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7751 if (EPI.MemSafetyCheck)
7752 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7753 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7754
7755 // Generate a resume induction for the vector epilogue and put it in the
7756   // vector epilogue preheader.
7757 Type *IdxTy = Legal->getWidestInductionType();
7758 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7759 LoopVectorPreHeader->getFirstNonPHI());
7760 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7761 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7762 EPI.MainLoopIterationCountCheck);
7763
7764 // Generate the induction variable.
7765 OldInduction = Legal->getPrimaryInduction();
7766 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7767 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7768 Value *StartIdx = EPResumeVal;
7769 Induction =
7770 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7771 getDebugLocFromInstOrOperands(OldInduction));
7772
7773 // Generate induction resume values. These variables save the new starting
7774 // indexes for the scalar loop. They are used to test if there are any tail
7775 // iterations left once the vector loop has completed.
7776 // Note that when the vectorized epilogue is skipped due to iteration count
7777 // check, then the resume value for the induction variable comes from
7778 // the trip count of the main vector loop, hence passing the AdditionalBypass
7779 // argument.
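  // For example (hypothetical numbers): with a trip count of 23 and a main
  // loop VF*UF of 8, the main vector loop covers 16 iterations. If the
  // epilogue is then skipped by its iteration count check, the scalar loop has
  // to resume at iteration 16, i.e. at the main loop's vector trip count.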
7780 createInductionResumeValues(Lp, CountRoundDown,
7781 {VecEpilogueIterationCountCheck,
7782 EPI.VectorTripCount} /* AdditionalBypass */);
7783
7784 AddRuntimeUnrollDisableMetaData(Lp);
7785 return completeLoopSkeleton(Lp, OrigLoopID);
7786 }
7787
7788 BasicBlock *
7789 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7790 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7791
7792 assert(EPI.TripCount &&
7793 "Expected trip count to have been safed in the first pass.");
7794 assert(
7795 (!isa<Instruction>(EPI.TripCount) ||
7796 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7797 "saved trip count does not dominate insertion point.");
7798 Value *TC = EPI.TripCount;
7799 IRBuilder<> Builder(Insert->getTerminator());
7800 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7801
7802 // Generate code to check if the loop's trip count is less than VF * UF of the
7803 // vector epilogue loop.
7804 auto P =
7805 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7806
7807 Value *CheckMinIters = Builder.CreateICmp(
7808 P, Count,
7809 ConstantInt::get(Count->getType(),
7810 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7811 "min.epilog.iters.check");
7812
7813 ReplaceInstWithInst(
7814 Insert->getTerminator(),
7815 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7816
7817 LoopBypassBlocks.push_back(Insert);
7818 return Insert;
7819 }
7820
7821 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7822 LLVM_DEBUG({
7823 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7824 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7825 << ", Main Loop UF:" << EPI.MainLoopUF
7826 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7827 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7828 });
7829 }
7830
7831 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7832 DEBUG_WITH_TYPE(VerboseDebug, {
7833 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
7834 });
7835 }
7836
7837 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7838 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7839 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7840 bool PredicateAtRangeStart = Predicate(Range.Start);
7841
7842 for (ElementCount TmpVF = Range.Start * 2;
7843 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7844 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7845 Range.End = TmpVF;
7846 break;
7847 }
7848
7849 return PredicateAtRangeStart;
7850 }
7851
7852 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7853 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7854 /// of VF's starting at a given VF and extending it as much as possible. Each
7855 /// vectorization decision can potentially shorten this sub-range during
7856 /// buildVPlan().
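/// For example (hypothetical numbers): with MinVF = 2 and MaxVF = 16 the loop
/// below starts from the candidate range [2, 17). If a decision taken while
/// building the first VPlan only holds up to VF = 4, getDecisionAndClampRange
/// clamps the (exclusive) end of the range to 8, yielding one plan for VFs
/// {2, 4}; the next iteration then starts at VF = 8 and builds a plan for the
/// remaining VFs {8, 16}.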
7857 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7858 ElementCount MaxVF) {
7859 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7860 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7861 VFRange SubRange = {VF, MaxVFPlusOne};
7862 VPlans.push_back(buildVPlan(SubRange));
7863 VF = SubRange.End;
7864 }
7865 }
7866
7867 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7868 VPlanPtr &Plan) {
7869 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7870
7871 // Look for cached value.
7872 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7873 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7874 if (ECEntryIt != EdgeMaskCache.end())
7875 return ECEntryIt->second;
7876
7877 VPValue *SrcMask = createBlockInMask(Src, Plan);
7878
7879 // The terminator has to be a branch inst!
7880 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7881 assert(BI && "Unexpected terminator found");
7882
7883 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7884 return EdgeMaskCache[Edge] = SrcMask;
7885
7886 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7887 assert(EdgeMask && "No Edge Mask found for condition");
7888
7889 if (BI->getSuccessor(0) != Dst)
7890 EdgeMask = Builder.createNot(EdgeMask);
7891
7892 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7893 EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7894
7895 return EdgeMaskCache[Edge] = EdgeMask;
7896 }
7897
7898 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7899 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7900
7901 // Look for cached value.
7902 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7903 if (BCEntryIt != BlockMaskCache.end())
7904 return BCEntryIt->second;
7905
7906 // All-one mask is modelled as no-mask following the convention for masked
7907 // load/store/gather/scatter. Initialize BlockMask to no-mask.
7908 VPValue *BlockMask = nullptr;
7909
7910 if (OrigLoop->getHeader() == BB) {
7911 if (!CM.blockNeedsPredication(BB))
7912 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7913
7914 // Create the block in mask as the first non-phi instruction in the block.
7915 VPBuilder::InsertPointGuard Guard(Builder);
7916 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7917 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7918
7919 // Introduce the early-exit compare IV <= BTC to form header block mask.
7920 // This is used instead of IV < TC because TC may wrap, unlike BTC.
7921 // Start by constructing the desired canonical IV.
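    // For example, with a trip count of 10 and VF = 4, BTC is 9 and the last
    // vector iteration has lane IVs {8, 9, 10, 11}; comparing IV <= 9 yields
    // the mask {1, 1, 0, 0}. If the trip count were 2^N for an N-bit IV it
    // would wrap to 0, whereas BTC = 2^N - 1 is still representable, hence the
    // comparison against BTC.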
7922 VPValue *IV = nullptr;
7923 if (Legal->getPrimaryInduction())
7924 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7925 else {
7926 auto IVRecipe = new VPWidenCanonicalIVRecipe();
7927 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7928 IV = IVRecipe->getVPValue();
7929 }
7930 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7931 bool TailFolded = !CM.isScalarEpilogueAllowed();
7932
7933 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7934 // While ActiveLaneMask is a binary op that consumes the loop tripcount
7935 // as a second argument, we only pass the IV here and extract the
7936 // tripcount from the transform state where codegen of the VP instructions
7937       // happens.
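      // Conceptually the resulting mask behaves like the
      // llvm.get.active.lane.mask intrinsic: lane i is (IV + i < trip count),
      // evaluated without overflow, so e.g. IV = 8, trip count = 10, VF = 4
      // gives {1, 1, 0, 0}.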
7938 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7939 } else {
7940 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7941 }
7942 return BlockMaskCache[BB] = BlockMask;
7943 }
7944
7945 // This is the block mask. We OR all incoming edges.
7946 for (auto *Predecessor : predecessors(BB)) {
7947 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7948 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7949 return BlockMaskCache[BB] = EdgeMask;
7950
7951 if (!BlockMask) { // BlockMask has its initialized nullptr value.
7952 BlockMask = EdgeMask;
7953 continue;
7954 }
7955
7956 BlockMask = Builder.createOr(BlockMask, EdgeMask);
7957 }
7958
7959 return BlockMaskCache[BB] = BlockMask;
7960 }
7961
7962 VPWidenMemoryInstructionRecipe *
7963 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7964 VPlanPtr &Plan) {
7965 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7966 "Must be called with either a load or store");
7967
7968 auto willWiden = [&](ElementCount VF) -> bool {
7969 if (VF.isScalar())
7970 return false;
7971 LoopVectorizationCostModel::InstWidening Decision =
7972 CM.getWideningDecision(I, VF);
7973 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7974 "CM decision should be taken at this point.");
7975 if (Decision == LoopVectorizationCostModel::CM_Interleave)
7976 return true;
7977 if (CM.isScalarAfterVectorization(I, VF) ||
7978 CM.isProfitableToScalarize(I, VF))
7979 return false;
7980 return Decision != LoopVectorizationCostModel::CM_Scalarize;
7981 };
7982
7983 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7984 return nullptr;
7985
7986 VPValue *Mask = nullptr;
7987 if (Legal->isMaskRequired(I))
7988 Mask = createBlockInMask(I->getParent(), Plan);
7989
7990 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7991 if (LoadInst *Load = dyn_cast<LoadInst>(I))
7992 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7993
7994 StoreInst *Store = cast<StoreInst>(I);
7995 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7996 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7997 }
7998
7999 VPWidenIntOrFpInductionRecipe *
8000 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
8001 // Check if this is an integer or fp induction. If so, build the recipe that
8002 // produces its scalar and vector values.
8003 InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8004 if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8005 II.getKind() == InductionDescriptor::IK_FpInduction)
8006 return new VPWidenIntOrFpInductionRecipe(Phi);
8007
8008 return nullptr;
8009 }
8010
8011 VPWidenIntOrFpInductionRecipe *
8012 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
8013 VFRange &Range) const {
8014 // Optimize the special case where the source is a constant integer
8015 // induction variable. Notice that we can only optimize the 'trunc' case
8016 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8017 // (c) other casts depend on pointer size.
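  // For example (illustrative IR), in
  //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
  //   %t  = trunc i64 %iv to i32
  // the truncation can be widened directly as a narrow i32 induction (e.g.
  // starting at <i32 0, 1, 2, 3> and stepping by VF), rather than widening the
  // i64 induction and truncating every element afterwards.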
8018
8019 // Determine whether \p K is a truncation based on an induction variable that
8020 // can be optimized.
8021 auto isOptimizableIVTruncate =
8022 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8023 return [=](ElementCount VF) -> bool {
8024 return CM.isOptimizableIVTruncate(K, VF);
8025 };
8026 };
8027
8028 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8029 isOptimizableIVTruncate(I), Range))
8030 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8031 I);
8032 return nullptr;
8033 }
8034
8035 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8036 // We know that all PHIs in non-header blocks are converted into selects, so
8037 // we don't have to worry about the insertion order and we can just use the
8038 // builder. At this point we generate the predication tree. There may be
8039 // duplications since this is a simple recursive scan, but future
8040 // optimizations will clean it up.
8041
8042 SmallVector<VPValue *, 2> Operands;
8043 unsigned NumIncoming = Phi->getNumIncomingValues();
8044 for (unsigned In = 0; In < NumIncoming; In++) {
8045 VPValue *EdgeMask =
8046 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8047 assert((EdgeMask || NumIncoming == 1) &&
8048 "Multiple predecessors with one having a full mask");
8049 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8050 if (EdgeMask)
8051 Operands.push_back(EdgeMask);
8052 }
8053 return new VPBlendRecipe(Phi, Operands);
8054 }
8055
8056 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8057 VPlan &Plan) const {
8058
8059 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8060 [this, CI](ElementCount VF) {
8061 return CM.isScalarWithPredication(CI, VF);
8062 },
8063 Range);
8064
8065 if (IsPredicated)
8066 return nullptr;
8067
8068 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8069 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8070 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8071 ID == Intrinsic::pseudoprobe))
8072 return nullptr;
8073
8074 auto willWiden = [&](ElementCount VF) -> bool {
8075 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8076 // The following case may be scalarized depending on the VF.
8077     // The flag shows whether we use an intrinsic or a regular call for the
8078     // vectorized version of the instruction.
8079     // Is it beneficial to perform an intrinsic call compared to a lib call?
8080 bool NeedToScalarize = false;
8081 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8082 bool UseVectorIntrinsic =
8083 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
8084 return UseVectorIntrinsic || !NeedToScalarize;
8085 };
8086
8087 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8088 return nullptr;
8089
8090 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8091 }
8092
8093 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8094 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8095 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8096 // Instruction should be widened, unless it is scalar after vectorization,
8097 // scalarization is profitable or it is predicated.
8098 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8099 return CM.isScalarAfterVectorization(I, VF) ||
8100 CM.isProfitableToScalarize(I, VF) ||
8101 CM.isScalarWithPredication(I, VF);
8102 };
8103 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8104 Range);
8105 }
8106
8107 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8108 auto IsVectorizableOpcode = [](unsigned Opcode) {
8109 switch (Opcode) {
8110 case Instruction::Add:
8111 case Instruction::And:
8112 case Instruction::AShr:
8113 case Instruction::BitCast:
8114 case Instruction::FAdd:
8115 case Instruction::FCmp:
8116 case Instruction::FDiv:
8117 case Instruction::FMul:
8118 case Instruction::FNeg:
8119 case Instruction::FPExt:
8120 case Instruction::FPToSI:
8121 case Instruction::FPToUI:
8122 case Instruction::FPTrunc:
8123 case Instruction::FRem:
8124 case Instruction::FSub:
8125 case Instruction::ICmp:
8126 case Instruction::IntToPtr:
8127 case Instruction::LShr:
8128 case Instruction::Mul:
8129 case Instruction::Or:
8130 case Instruction::PtrToInt:
8131 case Instruction::SDiv:
8132 case Instruction::Select:
8133 case Instruction::SExt:
8134 case Instruction::Shl:
8135 case Instruction::SIToFP:
8136 case Instruction::SRem:
8137 case Instruction::Sub:
8138 case Instruction::Trunc:
8139 case Instruction::UDiv:
8140 case Instruction::UIToFP:
8141 case Instruction::URem:
8142 case Instruction::Xor:
8143 case Instruction::ZExt:
8144 return true;
8145 }
8146 return false;
8147 };
8148
8149 if (!IsVectorizableOpcode(I->getOpcode()))
8150 return nullptr;
8151
8152 // Success: widen this instruction.
8153 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8154 }
8155
8156 VPBasicBlock *VPRecipeBuilder::handleReplication(
8157 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8158 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8159 VPlanPtr &Plan) {
8160 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8161 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8162 Range);
8163
8164 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8165 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8166 Range);
8167
8168 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8169 IsUniform, IsPredicated);
8170 setRecipe(I, Recipe);
8171 Plan->addVPValue(I, Recipe);
8172
8173 // Find if I uses a predicated instruction. If so, it will use its scalar
8174 // value. Avoid hoisting the insert-element which packs the scalar value into
8175 // a vector value, as that happens iff all users use the vector value.
8176 for (auto &Op : I->operands())
8177 if (auto *PredInst = dyn_cast<Instruction>(Op))
8178 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8179 PredInst2Recipe[PredInst]->setAlsoPack(false);
8180
8181 // Finalize the recipe for Instr, first if it is not predicated.
8182 if (!IsPredicated) {
8183 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8184 VPBB->appendRecipe(Recipe);
8185 return VPBB;
8186 }
8187 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8188 assert(VPBB->getSuccessors().empty() &&
8189 "VPBB has successors when handling predicated replication.");
8190 // Record predicated instructions for above packing optimizations.
8191 PredInst2Recipe[I] = Recipe;
8192 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8193 VPBlockUtils::insertBlockAfter(Region, VPBB);
8194 auto *RegSucc = new VPBasicBlock();
8195 VPBlockUtils::insertBlockAfter(RegSucc, Region);
8196 return RegSucc;
8197 }
8198
8199 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8200 VPRecipeBase *PredRecipe,
8201 VPlanPtr &Plan) {
8202 // Instructions marked for predication are replicated and placed under an
8203 // if-then construct to prevent side-effects.
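  // The region built below looks roughly like this, e.g. for a predicated
  // store:
  //   pred.store.entry:    branch-on-mask
  //   pred.store.if:       the replicated scalar instruction
  //   pred.store.continue: an optional phi merging the scalar result back in
  // where the entry either branches into the ".if" block or falls through
  // straight to ".continue".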
8204
8205 // Generate recipes to compute the block mask for this region.
8206 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8207
8208 // Build the triangular if-then region.
8209 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8210 assert(Instr->getParent() && "Predicated instruction not in any basic block");
8211 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8212 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8213 auto *PHIRecipe = Instr->getType()->isVoidTy()
8214 ? nullptr
8215 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8216 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8217 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8218 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8219
8220 // Note: first set Entry as region entry and then connect successors starting
8221 // from it in order, to propagate the "parent" of each VPBasicBlock.
8222 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8223 VPBlockUtils::connectBlocks(Pred, Exit);
8224
8225 return Region;
8226 }
8227
8228 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8229 VFRange &Range,
8230 VPlanPtr &Plan) {
8231 // First, check for specific widening recipes that deal with calls, memory
8232 // operations, inductions and Phi nodes.
8233 if (auto *CI = dyn_cast<CallInst>(Instr))
8234 return tryToWidenCall(CI, Range, *Plan);
8235
8236 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8237 return tryToWidenMemory(Instr, Range, Plan);
8238
8239 VPRecipeBase *Recipe;
8240 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8241 if (Phi->getParent() != OrigLoop->getHeader())
8242 return tryToBlend(Phi, Plan);
8243 if ((Recipe = tryToOptimizeInductionPHI(Phi)))
8244 return Recipe;
8245 return new VPWidenPHIRecipe(Phi);
8246 }
8247
8248 if (isa<TruncInst>(Instr) &&
8249 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
8250 return Recipe;
8251
8252 if (!shouldWiden(Instr, Range))
8253 return nullptr;
8254
8255 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8256 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8257 OrigLoop);
8258
8259 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8260 bool InvariantCond =
8261 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8262 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8263 InvariantCond);
8264 }
8265
8266 return tryToWiden(Instr, *Plan);
8267 }
8268
8269 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8270 ElementCount MaxVF) {
8271 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8272
8273 // Collect instructions from the original loop that will become trivially dead
8274 // in the vectorized loop. We don't need to vectorize these instructions. For
8275 // example, original induction update instructions can become dead because we
8276 // separately emit induction "steps" when generating code for the new loop.
8277 // Similarly, we create a new latch condition when setting up the structure
8278 // of the new loop, so the old one can become dead.
8279 SmallPtrSet<Instruction *, 4> DeadInstructions;
8280 collectTriviallyDeadInstructions(DeadInstructions);
8281
8282 // Add assume instructions we need to drop to DeadInstructions, to prevent
8283 // them from being added to the VPlan.
8284   // TODO: We only need to drop assumes in blocks that get flattened. If the
8285 // control flow is preserved, we should keep them.
8286 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8287 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8288
8289 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8290 // Dead instructions do not need sinking. Remove them from SinkAfter.
8291 for (Instruction *I : DeadInstructions)
8292 SinkAfter.erase(I);
8293
8294 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8295 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8296 VFRange SubRange = {VF, MaxVFPlusOne};
8297 VPlans.push_back(
8298 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8299 VF = SubRange.End;
8300 }
8301 }
8302
8303 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8304 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8305 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8306
8307 // Hold a mapping from predicated instructions to their recipes, in order to
8308 // fix their AlsoPack behavior if a user is determined to replicate and use a
8309 // scalar instead of vector value.
8310 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8311
8312 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8313
8314 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8315
8316 // ---------------------------------------------------------------------------
8317 // Pre-construction: record ingredients whose recipes we'll need to further
8318 // process after constructing the initial VPlan.
8319 // ---------------------------------------------------------------------------
8320
8321 // Mark instructions we'll need to sink later and their targets as
8322 // ingredients whose recipe we'll need to record.
8323 for (auto &Entry : SinkAfter) {
8324 RecipeBuilder.recordRecipeOf(Entry.first);
8325 RecipeBuilder.recordRecipeOf(Entry.second);
8326 }
8327 for (auto &Reduction : CM.getInLoopReductionChains()) {
8328 PHINode *Phi = Reduction.first;
8329 RecurrenceDescriptor::RecurrenceKind Kind =
8330 Legal->getReductionVars()[Phi].getRecurrenceKind();
8331 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8332
8333 RecipeBuilder.recordRecipeOf(Phi);
8334 for (auto &R : ReductionOperations) {
8335 RecipeBuilder.recordRecipeOf(R);
8336       // For min/max reductions, where we have a pair of icmp/select, we also
8337 // need to record the ICmp recipe, so it can be removed later.
8338 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8339 Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8340 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8341 }
8342 }
8343 }
8344
8345 // For each interleave group which is relevant for this (possibly trimmed)
8346 // Range, add it to the set of groups to be later applied to the VPlan and add
8347 // placeholders for its members' Recipes which we'll be replacing with a
8348 // single VPInterleaveRecipe.
8349 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8350 auto applyIG = [IG, this](ElementCount VF) -> bool {
8351 return (VF.isVector() && // Query is illegal for VF == 1
8352 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8353 LoopVectorizationCostModel::CM_Interleave);
8354 };
8355 if (!getDecisionAndClampRange(applyIG, Range))
8356 continue;
8357 InterleaveGroups.insert(IG);
8358 for (unsigned i = 0; i < IG->getFactor(); i++)
8359 if (Instruction *Member = IG->getMember(i))
8360 RecipeBuilder.recordRecipeOf(Member);
8361   }
8362
8363 // ---------------------------------------------------------------------------
8364 // Build initial VPlan: Scan the body of the loop in a topological order to
8365 // visit each basic block after having visited its predecessor basic blocks.
8366 // ---------------------------------------------------------------------------
8367
8368 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8369 auto Plan = std::make_unique<VPlan>();
8370 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8371 Plan->setEntry(VPBB);
8372
8373 // Scan the body of the loop in a topological order to visit each basic block
8374 // after having visited its predecessor basic blocks.
8375 LoopBlocksDFS DFS(OrigLoop);
8376 DFS.perform(LI);
8377
8378 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8379 // Relevant instructions from basic block BB will be grouped into VPRecipe
8380 // ingredients and fill a new VPBasicBlock.
8381 unsigned VPBBsForBB = 0;
8382 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8383 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8384 VPBB = FirstVPBBForBB;
8385 Builder.setInsertPoint(VPBB);
8386
8387 // Introduce each ingredient into VPlan.
8388     // TODO: Model and preserve debug intrinsics in VPlan.
8389 for (Instruction &I : BB->instructionsWithoutDebug()) {
8390 Instruction *Instr = &I;
8391
8392 // First filter out irrelevant instructions, to ensure no recipes are
8393 // built for them.
8394 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8395 continue;
8396
8397 if (auto Recipe =
8398 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8399 // Check if the recipe can be converted to a VPValue. We need the extra
8400 // down-casting step until VPRecipeBase inherits from VPValue.
8401 VPValue *MaybeVPValue = Recipe->toVPValue();
8402 if (!Instr->getType()->isVoidTy() && MaybeVPValue)
8403 Plan->addVPValue(Instr, MaybeVPValue);
8404
8405 RecipeBuilder.setRecipe(Instr, Recipe);
8406 VPBB->appendRecipe(Recipe);
8407 continue;
8408 }
8409
8410       // Otherwise, if all widening options failed, the instruction is to be
8411 // replicated. This may create a successor for VPBB.
8412 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8413 Instr, Range, VPBB, PredInst2Recipe, Plan);
8414 if (NextVPBB != VPBB) {
8415 VPBB = NextVPBB;
8416 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8417 : "");
8418 }
8419 }
8420 }
8421
8422 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8423   // may also be empty, such as the last one (VPBB), reflecting original
8424   // basic blocks with no recipes.
8425 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8426 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8427 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8428 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8429 delete PreEntry;
8430
8431 // ---------------------------------------------------------------------------
8432 // Transform initial VPlan: Apply previously taken decisions, in order, to
8433 // bring the VPlan to its final state.
8434 // ---------------------------------------------------------------------------
8435
8436 // Apply Sink-After legal constraints.
8437 for (auto &Entry : SinkAfter) {
8438 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8439 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8440 Sink->moveAfter(Target);
8441 }
8442
8443 // Interleave memory: for each Interleave Group we marked earlier as relevant
8444 // for this VPlan, replace the Recipes widening its memory instructions with a
8445 // single VPInterleaveRecipe at its insertion point.
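  // For example (conceptually), two strided loads a[2*i] and a[2*i+1] that
  // form a factor-2 group become a single wide load of 2*VF consecutive
  // elements followed by shuffles that de-interleave the even and odd lanes.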
8446 for (auto IG : InterleaveGroups) {
8447 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8448 RecipeBuilder.getRecipe(IG->getInsertPos()));
8449 SmallVector<VPValue *, 4> StoredValues;
8450 for (unsigned i = 0; i < IG->getFactor(); ++i)
8451 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8452 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8453
8454 (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8455 Recipe->getMask()))
8456 ->insertBefore(Recipe);
8457
8458 for (unsigned i = 0; i < IG->getFactor(); ++i)
8459 if (Instruction *Member = IG->getMember(i)) {
8460 if (!Member->getType()->isVoidTy()) {
8461 VPValue *OriginalV = Plan->getVPValue(Member);
8462 Plan->removeVPValueFor(Member);
8463 OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member));
8464 }
8465 RecipeBuilder.getRecipe(Member)->eraseFromParent();
8466 }
8467 }
8468
8469 // Adjust the recipes for any inloop reductions.
8470 if (Range.Start.isVector())
8471 adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8472
8473 // Finally, if tail is folded by masking, introduce selects between the phi
8474 // and the live-out instruction of each reduction, at the end of the latch.
8475 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8476 Builder.setInsertPoint(VPBB);
8477 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8478 for (auto &Reduction : Legal->getReductionVars()) {
8479 if (CM.isInLoopReduction(Reduction.first))
8480 continue;
8481 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8482 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8483 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8484 }
8485 }
8486
8487 std::string PlanName;
8488 raw_string_ostream RSO(PlanName);
8489 ElementCount VF = Range.Start;
8490 Plan->addVF(VF);
8491 RSO << "Initial VPlan for VF={" << VF;
8492 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8493 Plan->addVF(VF);
8494 RSO << "," << VF;
8495 }
8496 RSO << "},UF>=1";
8497 RSO.flush();
8498 Plan->setName(PlanName);
8499
8500 return Plan;
8501 }
8502
8503 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8504 // Outer loop handling: They may require CFG and instruction level
8505 // transformations before even evaluating whether vectorization is profitable.
8506 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8507 // the vectorization pipeline.
8508 assert(!OrigLoop->isInnermost());
8509 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8510
8511 // Create new empty VPlan
8512 auto Plan = std::make_unique<VPlan>();
8513
8514 // Build hierarchical CFG
8515 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8516 HCFGBuilder.buildHierarchicalCFG();
8517
8518 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8519 VF *= 2)
8520 Plan->addVF(VF);
8521
8522 if (EnableVPlanPredication) {
8523 VPlanPredicator VPP(*Plan);
8524 VPP.predicate();
8525
8526 // Avoid running transformation to recipes until masked code generation in
8527 // VPlan-native path is in place.
8528 return Plan;
8529 }
8530
8531 SmallPtrSet<Instruction *, 1> DeadInstructions;
8532 VPlanTransforms::VPInstructionsToVPRecipes(
8533 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8534 return Plan;
8535 }
8536
8537 // Adjust the recipes for any inloop reductions. The chain of instructions
8538 // leading from the loop exit instr to the phi needs to be converted to
8539 // reductions, with one operand being vector and the other being the scalar
8540 // reduction chain.
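// For example (conceptually), for an in-loop integer add reduction the widened
//   %add = add <4 x i32> %vec.phi, %vec.x
// is replaced by a VPReductionRecipe that computes roughly
//   %r   = reduce.add(<4 x i32> %vec.x)
//   %add = add i32 %phi, %r
// i.e. the vector operand is reduced to a scalar on each iteration and folded
// into the scalar chain.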
8541 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8542 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8543 for (auto &Reduction : CM.getInLoopReductionChains()) {
8544 PHINode *Phi = Reduction.first;
8545 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8546 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8547
8548     // ReductionOperations are ordered top-down from the phi's use to the
8549     // LoopExitValue. We keep track of the previous item (the Chain) to tell
8550 // which of the two operands will remain scalar and which will be reduced.
8551 // For minmax the chain will be the select instructions.
8552 Instruction *Chain = Phi;
8553 for (Instruction *R : ReductionOperations) {
8554 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8555 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
8556
8557 VPValue *ChainOp = Plan->getVPValue(Chain);
8558 unsigned FirstOpId;
8559 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8560 Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8561 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8562 "Expected to replace a VPWidenSelectSC");
8563 FirstOpId = 1;
8564 } else {
8565 assert(isa<VPWidenRecipe>(WidenRecipe) &&
8566 "Expected to replace a VPWidenSC");
8567 FirstOpId = 0;
8568 }
8569 unsigned VecOpId =
8570 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8571 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8572
8573 auto *CondOp = CM.foldTailByMasking()
8574 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8575 : nullptr;
8576 VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8577 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
8578 WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe);
8579 Plan->removeVPValueFor(R);
8580 Plan->addVPValue(R, RedRecipe);
8581 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8582 WidenRecipe->eraseFromParent();
8583
8584 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8585 Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8586 VPRecipeBase *CompareRecipe =
8587 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8588 assert(isa<VPWidenRecipe>(CompareRecipe) &&
8589 "Expected to replace a VPWidenSC");
8590 assert(CompareRecipe->toVPValue()->getNumUsers() == 0 &&
8591 "Expected no remaining users");
8592 CompareRecipe->eraseFromParent();
8593 }
8594 Chain = R;
8595 }
8596 }
8597 }
8598
8599 Value* LoopVectorizationPlanner::VPCallbackILV::
8600 getOrCreateVectorValues(Value *V, unsigned Part) {
8601 return ILV.getOrCreateVectorValue(V, Part);
8602 }
8603
8604 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8605 Value *V, const VPIteration &Instance) {
8606 return ILV.getOrCreateScalarValue(V, Instance);
8607 }
8608
8609 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8610 VPSlotTracker &SlotTracker) const {
8611 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8612 IG->getInsertPos()->printAsOperand(O, false);
8613 O << ", ";
8614 getAddr()->printAsOperand(O, SlotTracker);
8615 VPValue *Mask = getMask();
8616 if (Mask) {
8617 O << ", ";
8618 Mask->printAsOperand(O, SlotTracker);
8619 }
8620 for (unsigned i = 0; i < IG->getFactor(); ++i)
8621 if (Instruction *I = IG->getMember(i))
8622 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i;
8623 }
8624
8625 void VPWidenCallRecipe::execute(VPTransformState &State) {
8626 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8627 *this, State);
8628 }
8629
8630 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8631 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8632 this, *this, InvariantCond, State);
8633 }
8634
8635 void VPWidenRecipe::execute(VPTransformState &State) {
8636 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8637 }
8638
8639 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8640 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8641 *this, State.UF, State.VF, IsPtrLoopInvariant,
8642 IsIndexLoopInvariant, State);
8643 }
8644
8645 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8646 assert(!State.Instance && "Int or FP induction being replicated.");
8647 State.ILV->widenIntOrFpInduction(IV, Trunc);
8648 }
8649
8650 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8651 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8652 }
8653
8654 void VPBlendRecipe::execute(VPTransformState &State) {
8655 State.ILV->setDebugLocFromInst(State.Builder, Phi);
8656 // We know that all PHIs in non-header blocks are converted into
8657 // selects, so we don't have to worry about the insertion order and we
8658 // can just use the builder.
8659 // At this point we generate the predication tree. There may be
8660 // duplications since this is a simple recursive scan, but future
8661 // optimizations will clean it up.
8662
8663 unsigned NumIncoming = getNumIncomingValues();
8664
8665 // Generate a sequence of selects of the form:
8666 // SELECT(Mask3, In3,
8667 // SELECT(Mask2, In2,
8668 // SELECT(Mask1, In1,
8669 // In0)))
8670   // Note that Mask0 is never used: lanes for which no path reaches this phi
8671   // are essentially undef and are taken from In0.
8672 InnerLoopVectorizer::VectorParts Entry(State.UF);
8673 for (unsigned In = 0; In < NumIncoming; ++In) {
8674 for (unsigned Part = 0; Part < State.UF; ++Part) {
8675 // We might have single edge PHIs (blocks) - use an identity
8676 // 'select' for the first PHI operand.
8677 Value *In0 = State.get(getIncomingValue(In), Part);
8678 if (In == 0)
8679 Entry[Part] = In0; // Initialize with the first incoming value.
8680 else {
8681 // Select between the current value and the previous incoming edge
8682 // based on the incoming mask.
8683 Value *Cond = State.get(getMask(In), Part);
8684 Entry[Part] =
8685 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8686 }
8687 }
8688 }
8689 for (unsigned Part = 0; Part < State.UF; ++Part)
8690 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8691 }
8692
8693 void VPInterleaveRecipe::execute(VPTransformState &State) {
8694 assert(!State.Instance && "Interleave group being replicated.");
8695 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(),
8696 getMask());
8697 }
8698
8699 void VPReductionRecipe::execute(VPTransformState &State) {
8700 assert(!State.Instance && "Reduction being replicated.");
8701 for (unsigned Part = 0; Part < State.UF; ++Part) {
8702 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8703 Value *NewVecOp = State.get(getVecOp(), Part);
8704 if (VPValue *Cond = getCondOp()) {
8705 Value *NewCond = State.get(Cond, Part);
8706 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8707 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8708 Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8709 Constant *IdenVec =
8710 ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8711 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8712 NewVecOp = Select;
8713 }
8714 Value *NewRed =
8715 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8716 Value *PrevInChain = State.get(getChainOp(), Part);
8717 Value *NextInChain;
8718 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8719 Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8720 NextInChain =
8721 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8722 NewRed, PrevInChain);
8723 } else {
8724 NextInChain = State.Builder.CreateBinOp(
8725 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8726 PrevInChain);
8727 }
8728 State.set(this, getUnderlyingInstr(), NextInChain, Part);
8729 }
8730 }
8731
8732 void VPReplicateRecipe::execute(VPTransformState &State) {
8733 if (State.Instance) { // Generate a single instance.
8734 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
8735 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8736 *State.Instance, IsPredicated, State);
8737 // Insert scalar instance packing it into a vector.
8738 if (AlsoPack && State.VF.isVector()) {
8739 // If we're constructing lane 0, initialize to start from undef.
8740 if (State.Instance->Lane == 0) {
8741 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8742 Value *Undef = UndefValue::get(
8743 VectorType::get(getUnderlyingValue()->getType(), State.VF));
8744 State.ValueMap.setVectorValue(getUnderlyingInstr(),
8745 State.Instance->Part, Undef);
8746 }
8747 State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8748 *State.Instance);
8749 }
8750 return;
8751 }
8752
8753 // Generate scalar instances for all VF lanes of all UF parts, unless the
8754   // instruction is uniform, in which case generate only the first lane for each
8755 // of the UF parts.
8756 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8757 assert((!State.VF.isScalable() || IsUniform) &&
8758 "Can't scalarize a scalable vector");
8759 for (unsigned Part = 0; Part < State.UF; ++Part)
8760 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8761 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8762 IsPredicated, State);
8763 }
8764
8765 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8766 assert(State.Instance && "Branch on Mask works only on single instance.");
8767
8768 unsigned Part = State.Instance->Part;
8769 unsigned Lane = State.Instance->Lane;
8770
8771 Value *ConditionBit = nullptr;
8772 VPValue *BlockInMask = getMask();
8773 if (BlockInMask) {
8774 ConditionBit = State.get(BlockInMask, Part);
8775 if (ConditionBit->getType()->isVectorTy())
8776 ConditionBit = State.Builder.CreateExtractElement(
8777 ConditionBit, State.Builder.getInt32(Lane));
8778 } else // Block in mask is all-one.
8779 ConditionBit = State.Builder.getTrue();
8780
8781 // Replace the temporary unreachable terminator with a new conditional branch,
8782 // whose two destinations will be set later when they are created.
8783 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8784 assert(isa<UnreachableInst>(CurrentTerminator) &&
8785 "Expected to replace unreachable terminator with conditional branch.");
8786 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8787 CondBr->setSuccessor(0, nullptr);
8788 ReplaceInstWithInst(CurrentTerminator, CondBr);
8789 }
8790
8791 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8792 assert(State.Instance && "Predicated instruction PHI works per instance.");
8793 Instruction *ScalarPredInst =
8794 cast<Instruction>(State.get(getOperand(0), *State.Instance));
8795 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8796 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8797 assert(PredicatingBB && "Predicated block has no single predecessor.");
8798
8799 // By current pack/unpack logic we need to generate only a single phi node: if
8800 // a vector value for the predicated instruction exists at this point it means
8801 // the instruction has vector users only, and a phi for the vector value is
8802 // needed. In this case the recipe of the predicated instruction is marked to
8803 // also do that packing, thereby "hoisting" the insert-element sequence.
8804 // Otherwise, a phi node for the scalar value is needed.
8805 unsigned Part = State.Instance->Part;
8806 Instruction *PredInst =
8807 cast<Instruction>(getOperand(0)->getUnderlyingValue());
8808 if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8809 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8810 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8811 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8812 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8813 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8814 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8815 } else {
8816 Type *PredInstType = PredInst->getType();
8817 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8818 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8819 Phi->addIncoming(ScalarPredInst, PredicatedBB);
8820 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8821 }
8822 }
8823
8824 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8825 Instruction *Instr = getUnderlyingInstr();
8826 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
8827 State.ILV->vectorizeMemoryInstruction(Instr, State,
8828 StoredValue ? nullptr : this, getAddr(),
8829 StoredValue, getMask());
8830 }
8831
8832 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8833 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8834 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8835 // for predication.
8836 static ScalarEpilogueLowering getScalarEpilogueLowering(
8837 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8838 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8839 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8840 LoopVectorizationLegality &LVL) {
8841 // 1) OptSize takes precedence over all other options, i.e. if this is set,
8842 // don't look at hints or options, and don't request a scalar epilogue.
8843 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8844 // LoopAccessInfo (due to code dependency and not being able to reliably get
8845 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8846 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8847 // versioning when the vectorization is forced, unlike hasOptSize. So revert
8848 // back to the old way and vectorize with versioning when forced. See D81345.)
8849 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8850 PGSOQueryType::IRPass) &&
8851 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8852 return CM_ScalarEpilogueNotAllowedOptSize;
8853
8854 bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8855 !PreferPredicateOverEpilogue;
8856
8857 // 2) Next, if disabling predication is requested on the command line, honour
8858 // this and request a scalar epilogue.
8859 if (PredicateOptDisabled)
8860 return CM_ScalarEpilogueAllowed;
8861
8862 // 3) and 4) look if enabling predication is requested on the command line,
8863 // with a loop hint, or if the TTI hook indicates this is profitable, request
8864 // predication.
8865 if (PreferPredicateOverEpilogue ||
8866 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8867 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8868 LVL.getLAI()) &&
8869 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8870 return CM_ScalarEpilogueNotNeededUsePredicate;
8871
8872 return CM_ScalarEpilogueAllowed;
8873 }
8874
8875 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8876 unsigned Part) {
8877 set(Def, V, Part);
8878 ILV->setVectorValue(IRDef, Part, V);
8879 }
8880
8881 // Process the loop in the VPlan-native vectorization path. This path builds
8882 // VPlan upfront in the vectorization pipeline, which allows to apply
8883 // VPlan-to-VPlan transformations from the very beginning without modifying the
8884 // input LLVM IR.
8885 static bool processLoopInVPlanNativePath(
8886 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8887 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8888 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8889 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8890 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8891
8892 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8893 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8894 return false;
8895 }
8896 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8897 Function *F = L->getHeader()->getParent();
8898 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8899
8900 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8901 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8902
8903 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8904 &Hints, IAI);
8905 // Use the planner for outer loop vectorization.
8906 // TODO: CM is not used at this point inside the planner. Turn CM into an
8907 // optional argument if we don't need it in the future.
8908 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8909
8910 // Get user vectorization factor.
8911 ElementCount UserVF = Hints.getWidth();
8912
8913 // Plan how to best vectorize, return the best VF and its cost.
8914 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8915
8916 // If we are stress testing VPlan builds, do not attempt to generate vector
8917 // code. Masked vector code generation support will follow soon.
8918 // Also, do not attempt to vectorize if no vector code will be produced.
8919 if (VPlanBuildStressTest || EnableVPlanPredication ||
8920 VectorizationFactor::Disabled() == VF)
8921 return false;
8922
8923 LVP.setBestPlan(VF.Width, 1);
8924
8925 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8926 &CM, BFI, PSI);
8927 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8928 << L->getHeader()->getParent()->getName() << "\"\n");
8929 LVP.executePlan(LB, DT);
8930
8931 // Mark the loop as already vectorized to avoid vectorizing again.
8932 Hints.setAlreadyVectorized();
8933
8934 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8935 return true;
8936 }
8937
8938 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8939 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8940 !EnableLoopInterleaving),
8941 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8942 !EnableLoopVectorization) {}
8943
8944 bool LoopVectorizePass::processLoop(Loop *L) {
8945 assert((EnableVPlanNativePath || L->isInnermost()) &&
8946 "VPlan-native path is not enabled. Only process inner loops.");
8947
8948 #ifndef NDEBUG
8949 const std::string DebugLocStr = getDebugLocString(L);
8950 #endif /* NDEBUG */
8951
8952 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8953 << L->getHeader()->getParent()->getName() << "\" from "
8954 << DebugLocStr << "\n");
8955
8956 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8957
8958 LLVM_DEBUG(
8959 dbgs() << "LV: Loop hints:"
8960 << " force="
8961 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8962 ? "disabled"
8963 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8964 ? "enabled"
8965 : "?"))
8966 << " width=" << Hints.getWidth()
8967 << " unroll=" << Hints.getInterleave() << "\n");
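
  // For illustration (hypothetical source, not from this file): hints like
  // these usually originate from frontend pragmas such as
  //
  //   #pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
  //   for (int i = 0; i < n; ++i)
  //     a[i] = b[i] + c[i];
  //
  // which would typically be reported above as force=enabled width=4 unroll=2.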

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }
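
  // For illustration (hypothetical loop): with the small-trip-count threshold
  // in effect, a loop like
  //
  //   for (int i = 0; i < 3; ++i)
  //     sum += a[i];
  //
  // is generally only worth vectorizing if that can be done without a scalar
  // epilogue and without runtime-check overhead, which is what the
  // CM_ScalarEpilogueNotAllowedLowTripLoop setting above asks the cost model
  // to respect.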

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem like it can be correct -- what if the loop
  // is an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }
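
  // For illustration (hypothetical loop): an interleave group is a set of
  // strided accesses that can be widened together, e.g. in
  //
  //   for (int i = 0; i < n; ++i) {
  //     sum += a[2 * i];      // even elements
  //     sum += a[2 * i + 1];  // odd elements
  //   }
  //
  // the two loads form a group with interleave factor 2 that can be lowered to
  // a single wide load plus shuffles; identifying such groups is what the
  // analysis above does.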

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not profitable to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *profitable* to vectorize the loop, then do it.

    // Consider vectorizing the epilogue too if it's profitable.
    VectorizationFactor EpilogueVF =
        CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
    if (EpilogueVF.Width.isVector()) {

      // The first pass vectorizes the main loop and creates a scalar epilogue
      // to be vectorized by executing the plan (potentially with a different
      // factor) again shortly afterwards.
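      // For illustration (numbers are hypothetical): with a main-loop VF of 8
      // and an epilogue VF of 4, the main vector loop consumes 8 elements per
      // iteration and most of the remaining iterations are handled by a 4-wide
      // epilogue loop, leaving at most a small scalar remainder instead of a
      // full scalar epilogue.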
      EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                        EpilogueVF.Width.getKnownMinValue(), 1);
      EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
                                         &LVL, &CM, BFI, PSI);

      LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
      LVP.executePlan(MainILV, DT);
      ++LoopsVectorized;

      simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
      formLCSSARecursively(*L, *DT, LI, SE);

      // Second pass vectorizes the epilogue and adjusts the control flow
      // edges from the first pass.
      LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
      EPI.MainLoopVF = EPI.EpilogueVF;
      EPI.MainLoopUF = EPI.EpilogueUF;
      EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                               ORE, EPI, &LVL, &CM, BFI, PSI);
      LVP.executePlan(EpilogILV, DT);
      ++LoopsEpilogueVectorized;

      if (!MainILV.areSafetyChecksAdded())
        DisableRuntimeUnroll = true;
    } else {
      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                             &LVL, &CM, BFI, PSI);
      LVP.executePlan(LB, DT);
      ++LoopsVectorized;

      // Add metadata to disable runtime unrolling a scalar loop when there are
      // no runtime checks about strides and memory. A scalar loop that is
      // rarely used is not worth unrolling.
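      // (The metadata eventually added via AddRuntimeUnrollDisableMetaData is
      // expected to be the "llvm.loop.unroll.runtime.disable" loop property,
      // i.e. roughly:
      //
      //   !0 = distinct !{!0, !1}
      //   !1 = !{!"llvm.loop.unroll.runtime.disable"}
      //
      // Sketch only; see the LLVM loop metadata documentation for details.)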
      if (!LB.areSafetyChecksAdded())
        DisableRuntimeUnroll = true;
    }

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

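  // For illustration (sketch only, names per the loop transformation metadata
  // docs): a follow-up property on the original loop along the lines of
  //
  //   !{!"llvm.loop.vectorize.followup_epilogue", <new remainder-loop props>}
  //
  // is propagated here onto the remainder loop in place of the default
  // unroll-disable / already-vectorized markers set below.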
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Report whether any loops were modified and whether the CFG changed.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve the LoopInfo/dominator analyses with outer
  // loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}