1 //===- TargetTransformInfo.h ------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This pass exposes codegen information to IR-level passes. Every
10 /// transformation that uses codegen information is broken into three parts:
11 /// 1. The IR-level analysis pass.
12 /// 2. The IR-level transformation interface which provides the needed
13 /// information.
14 /// 3. Codegen-level implementation which uses target-specific hooks.
15 ///
16 /// This file defines #2, which is the interface that IR-level transformations
17 /// use for querying the codegen.
18 ///
19 //===----------------------------------------------------------------------===//
20
21 #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
22 #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
23
24 #include "llvm/IR/InstrTypes.h"
25 #include "llvm/IR/Operator.h"
26 #include "llvm/IR/PassManager.h"
27 #include "llvm/Pass.h"
28 #include "llvm/Support/AtomicOrdering.h"
29 #include "llvm/Support/DataTypes.h"
30 #include <functional>
31
32 namespace llvm {
33
34 namespace Intrinsic {
35 typedef unsigned ID;
36 }
37
38 class AssumptionCache;
39 class BlockFrequencyInfo;
40 class DominatorTree;
41 class BranchInst;
42 class CallBase;
43 class ExtractElementInst;
44 class Function;
45 class GlobalValue;
46 class InstCombiner;
47 class IntrinsicInst;
48 class LoadInst;
49 class LoopAccessInfo;
50 class Loop;
51 class LoopInfo;
52 class ProfileSummaryInfo;
53 class SCEV;
54 class ScalarEvolution;
55 class StoreInst;
56 class SwitchInst;
57 class TargetLibraryInfo;
58 class Type;
59 class User;
60 class Value;
61 struct KnownBits;
62 template <typename T> class Optional;
63
64 /// Information about a load/store intrinsic defined by the target.
65 struct MemIntrinsicInfo {
66 /// This is the pointer that the intrinsic is loading from or storing to.
67 /// If this is non-null, then analysis/optimization passes can assume that
68 /// this intrinsic is functionally equivalent to a load/store from this
69 /// pointer.
70 Value *PtrVal = nullptr;
71
72 // Ordering for atomic operations.
73 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
74
75 // The same Id is set by the target for corresponding load/store intrinsics.
76 unsigned short MatchingId = 0;
77
78 bool ReadMem = false;
79 bool WriteMem = false;
80 bool IsVolatile = false;
81
82 bool isUnordered() const {
83 return (Ordering == AtomicOrdering::NotAtomic ||
84 Ordering == AtomicOrdering::Unordered) &&
85 !IsVolatile;
86 }
87 };
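/// Illustrative sketch only: a client pass that already has a
/// TargetTransformInfo reference (here called TTI) and an IntrinsicInst
/// pointer (here called II) could use this struct together with
/// getTgtMemIntrinsic() roughly as follows.
/// \code
///   MemIntrinsicInfo Info;
///   if (TTI.getTgtMemIntrinsic(II, Info) && Info.PtrVal && Info.isUnordered())
///     ; // II can be treated like a plain load/store of Info.PtrVal.
/// \endcode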
88
89 /// Attributes of a target dependent hardware loop.
90 struct HardwareLoopInfo {
91 HardwareLoopInfo() = delete;
92 HardwareLoopInfo(Loop *L) : L(L) {}
93 Loop *L = nullptr;
94 BasicBlock *ExitBlock = nullptr;
95 BranchInst *ExitBranch = nullptr;
96 const SCEV *TripCount = nullptr;
97 IntegerType *CountType = nullptr;
98 Value *LoopDecrement = nullptr; // Decrement the loop counter by this
99 // value in every iteration.
100 bool IsNestingLegal = false; // Can a hardware loop be a parent to
101 // another hardware loop?
102 bool CounterInReg = false; // Should loop counter be updated in
103 // the loop via a phi?
104 bool PerformEntryTest = false; // Generate the intrinsic which also performs
105 // icmp ne zero on the loop counter value and
106 // produces an i1 to guard the loop entry.
107 bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI,
108 DominatorTree &DT, bool ForceNestedLoop = false,
109 bool ForceHardwareLoopPHI = false);
110 bool canAnalyze(LoopInfo &LI);
111 };
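/// A minimal usage sketch (assuming a Loop *L, LoopInfo &LI, ScalarEvolution
/// &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo and a
/// TargetTransformInfo reference TTI are already available in the caller):
/// \code
///   HardwareLoopInfo HWLoopInfo(L);
///   if (HWLoopInfo.canAnalyze(LI) &&
///       TTI.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo))
///     ; // The loop is a candidate for hardware-loop lowering.
/// \endcode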
112
113 class IntrinsicCostAttributes {
114 const IntrinsicInst *II = nullptr;
115 Type *RetTy = nullptr;
116 Intrinsic::ID IID;
117 SmallVector<Type *, 4> ParamTys;
118 SmallVector<const Value *, 4> Arguments;
119 FastMathFlags FMF;
120 ElementCount VF = ElementCount::getFixed(1);
121 // If ScalarizationCost is UINT_MAX, the cost of scalarizing the
122 // arguments and the return value will be computed based on types.
123 unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
124
125 public:
126 IntrinsicCostAttributes(const IntrinsicInst &I);
127
128 IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI);
129
130 IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
131 ElementCount Factor);
132
133 IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
134 ElementCount Factor, unsigned ScalarCost);
135
136 IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
137 ArrayRef<Type *> Tys, FastMathFlags Flags);
138
139 IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
140 ArrayRef<Type *> Tys, FastMathFlags Flags,
141 unsigned ScalarCost);
142
143 IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
144 ArrayRef<Type *> Tys, FastMathFlags Flags,
145 unsigned ScalarCost,
146 const IntrinsicInst *I);
147
148 IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
149 ArrayRef<Type *> Tys);
150
151 IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
152 ArrayRef<const Value *> Args);
153
154 Intrinsic::ID getID() const { return IID; }
155 const IntrinsicInst *getInst() const { return II; }
156 Type *getReturnType() const { return RetTy; }
157 ElementCount getVectorFactor() const { return VF; }
158 FastMathFlags getFlags() const { return FMF; }
159 unsigned getScalarizationCost() const { return ScalarizationCost; }
160 const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
161 const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }
162
163 bool isTypeBasedOnly() const {
164 return Arguments.empty();
165 }
166
167 bool skipScalarizationCost() const {
168 return ScalarizationCost != std::numeric_limits<unsigned>::max();
169 }
170 };
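/// As a sketch (assuming an IntrinsicInst reference II and a
/// TargetTransformInfo reference TTI are in scope), the attributes are
/// typically built straight from the call and handed to
/// getIntrinsicInstrCost():
/// \code
///   IntrinsicCostAttributes ICA(II);
///   int Cost = TTI.getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
/// \endcode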
171
172 class TargetTransformInfo;
173 typedef TargetTransformInfo TTI;
174
175 /// This pass provides access to the codegen interfaces that are needed
176 /// for IR-level transformations.
177 class TargetTransformInfo {
178 public:
179 /// Construct a TTI object using a type implementing the \c Concept
180 /// API below.
181 ///
182 /// This is used by targets to construct a TTI wrapping their target-specific
183 /// implementation that encodes appropriate costs for their target.
184 template <typename T> TargetTransformInfo(T Impl);
185
186 /// Construct a baseline TTI object using a minimal implementation of
187 /// the \c Concept API below.
188 ///
189 /// The TTI implementation will reflect the information in the DataLayout
190 /// provided if non-null.
191 explicit TargetTransformInfo(const DataLayout &DL);
192
193 // Provide move semantics.
194 TargetTransformInfo(TargetTransformInfo &&Arg);
195 TargetTransformInfo &operator=(TargetTransformInfo &&RHS);
196
197 // We need to define the destructor out-of-line to define our sub-classes
198 // out-of-line.
199 ~TargetTransformInfo();
200
201 /// Handle the invalidation of this information.
202 ///
203 /// When used as a result of \c TargetIRAnalysis this method will be called
204 /// when the function this was computed for changes. When it returns false,
205 /// the information is preserved across those changes.
206 bool invalidate(Function &, const PreservedAnalyses &,
207 FunctionAnalysisManager::Invalidator &) {
208 // FIXME: We should probably in some way ensure that the subtarget
209 // information for a function hasn't changed.
210 return false;
211 }
212
213 /// \name Generic Target Information
214 /// @{
215
216 /// The kind of cost model.
217 ///
218 /// There are several different cost models that can be customized by the
219 /// target. The normalization of each cost model may be target specific.
220 enum TargetCostKind {
221 TCK_RecipThroughput, ///< Reciprocal throughput.
222 TCK_Latency, ///< The latency of an instruction.
223 TCK_CodeSize, ///< Instruction code size.
224 TCK_SizeAndLatency ///< The weighted sum of size and latency.
225 };
226
227 /// Query the cost of a specified instruction.
228 ///
229 /// Clients should use this interface to query the cost of an existing
230 /// instruction. The instruction must have a valid parent (basic block).
231 ///
232 /// Note, this method does not cache the cost calculation and it
233 /// can be expensive in some cases.
234 int getInstructionCost(const Instruction *I, enum TargetCostKind kind) const {
235 switch (kind) {
236 case TCK_RecipThroughput:
237 return getInstructionThroughput(I);
238
239 case TCK_Latency:
240 return getInstructionLatency(I);
241
242 case TCK_CodeSize:
243 case TCK_SizeAndLatency:
244 return getUserCost(I, kind);
245 }
246 llvm_unreachable("Unknown instruction cost kind");
247 }
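/// For instance (sketch only), a client holding a TargetTransformInfo
/// reference TTI and an instruction I that has a parent block could ask for
/// its latency-based cost:
/// \code
///   int Latency = TTI.getInstructionCost(&I, TTI::TCK_Latency);
/// \endcode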
248
249 /// Underlying constants for 'cost' values in this interface.
250 ///
251 /// Many APIs in this interface return a cost. This enum defines the
252 /// fundamental values that should be used to interpret (and produce) those
253 /// costs. The costs are returned as an int rather than a member of this
254 /// enumeration because it is expected that the cost of one IR instruction
255 /// may have a multiplicative factor to it or otherwise won't fit directly
256 /// into the enum. Moreover, it is common to sum or average costs which works
257 /// better as simple integral values. Thus this enum only provides constants.
258 /// Also note that the returned costs are signed integers to make it natural
259 /// to add, subtract, and test with zero (a common boundary condition). It is
260 /// not expected that 2^32 is a realistic cost to be modeling at any point.
261 ///
262 /// Note that these costs should usually reflect the intersection of code-size
263 /// cost and execution cost. A free instruction is typically one that folds
264 /// into another instruction. For example, reg-to-reg moves can often be
265 /// skipped by renaming the registers in the CPU, but they still are encoded
266 /// and thus wouldn't be considered 'free' here.
267 enum TargetCostConstants {
268 TCC_Free = 0, ///< Expected to fold away in lowering.
269 TCC_Basic = 1, ///< The cost of a typical 'add' instruction.
270 TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86.
271 };
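/// Because costs are plain signed integers, clients commonly accumulate them
/// and compare the sum against a budget expressed in these constants. A
/// minimal sketch (assuming TTI, a BasicBlock &BB, and a caller-chosen
/// Budget value):
/// \code
///   int Total = 0;
///   for (const Instruction &I : BB)
///     Total += TTI.getUserCost(&I, TTI::TCK_SizeAndLatency);
///   bool CheapEnough = Total <= Budget;
/// \endcode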
272
273 /// Estimate the cost of a GEP operation when lowered.
274 int getGEPCost(Type *PointeeType, const Value *Ptr,
275 ArrayRef<const Value *> Operands,
276 TargetCostKind CostKind = TCK_SizeAndLatency) const;
277
278 /// \returns A value by which our inlining threshold should be multiplied.
279 /// This is primarily used to bump up the inlining threshold wholesale on
280 /// targets where calls are unusually expensive.
281 ///
282 /// TODO: This is a rather blunt instrument. Perhaps altering the costs of
283 /// individual classes of instructions would be better.
284 unsigned getInliningThresholdMultiplier() const;
285
286 /// \returns Vector bonus in percent.
287 ///
288 /// Vector bonuses: We want to more aggressively inline vector-dense kernels
289 /// and apply this bonus based on the percentage of vector instructions. A
290 /// bonus is applied if the vector instructions exceed 50% and half that
291 /// amount is applied if it exceeds 10%. Note that these bonuses are somewhat
292 /// arbitrary and evolved over time by accident as much as because they are
293 /// principled bonuses.
294 /// FIXME: It would be nice to base the bonus values on something more
295 /// scientific. A target may have no bonus on vector instructions.
296 int getInlinerVectorBonusPercent() const;
297
298 /// \return the expected cost of a memcpy, which could e.g. depend on the
299 /// source/destination type and alignment and the number of bytes copied.
300 int getMemcpyCost(const Instruction *I) const;
301
302 /// \return The estimated number of case clusters when lowering \p 'SI'.
303 /// \p JTSize Set to the jump table size only when \p SI is suitable for a
304 /// jump table.
305 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
306 unsigned &JTSize,
307 ProfileSummaryInfo *PSI,
308 BlockFrequencyInfo *BFI) const;
309
310 /// Estimate the cost of a given IR user when lowered.
311 ///
312 /// This can estimate the cost of either a ConstantExpr or Instruction when
313 /// lowered.
314 ///
315 /// \p Operands is a list of operands which can be a result of transformations
316 /// of the current operands. The number of the operands on the list must equal
317 /// the number of the current operands the IR user has. Their order on the
318 /// list must be the same as the order of the current operands the IR user
319 /// has.
320 ///
321 /// The returned cost is defined in terms of \c TargetCostConstants, see its
322 /// comments for a detailed explanation of the cost values.
323 int getUserCost(const User *U, ArrayRef<const Value *> Operands,
324 TargetCostKind CostKind) const;
325
326 /// This is a helper function which calls the two-argument getUserCost
327 /// with \p Operands which are the current operands U has.
328 int getUserCost(const User *U, TargetCostKind CostKind) const {
329 SmallVector<const Value *, 4> Operands(U->value_op_begin(),
330 U->value_op_end());
331 return getUserCost(U, Operands, CostKind);
332 }
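/// For example (sketch only; GEP and NewIdx are hypothetical names for an
/// existing GEP instruction and a replacement operand computed by the caller),
/// a pass can cost a user as if one of its operands had already been
/// simplified, keeping the operand order unchanged:
/// \code
///   SmallVector<const Value *, 4> Ops(GEP->value_op_begin(),
///                                     GEP->value_op_end());
///   Ops[1] = NewIdx; // substitute the transformed first index
///   int Cost = TTI.getUserCost(GEP, Ops, TTI::TCK_SizeAndLatency);
/// \endcode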
333
334 /// Return true if branch divergence exists.
335 ///
336 /// Branch divergence has a significantly negative impact on GPU performance
337 /// when threads in the same wavefront take different paths due to conditional
338 /// branches.
339 bool hasBranchDivergence() const;
340
341 /// Return true if the target prefers to use GPU divergence analysis to
342 /// replace the legacy version.
343 bool useGPUDivergenceAnalysis() const;
344
345 /// Returns whether V is a source of divergence.
346 ///
347 /// This function provides the target-dependent information for
348 /// the target-independent LegacyDivergenceAnalysis. LegacyDivergenceAnalysis
349 /// first builds the dependency graph, and then runs the reachability
350 /// algorithm starting with the sources of divergence.
351 bool isSourceOfDivergence(const Value *V) const;
352
353 // Returns true for the target specific
354 // set of operations which produce uniform result
355 // even taking non-uniform arguments
356 bool isAlwaysUniform(const Value *V) const;
357
358 /// Returns the address space ID for a target's 'flat' address space. Note
359 /// this is not necessarily the same as addrspace(0), which LLVM sometimes
360 /// refers to as the generic address space. The flat address space is a
361 /// generic address space that can be used to access multiple segments of memory
362 /// with different address spaces. Access of a memory location through a
363 /// pointer with this address space is expected to be legal but slower
364 /// compared to the same memory location accessed through a pointer with a
365 /// different address space.
366 ///
367 /// This is for targets with different pointer representations which can
368 /// be converted with the addrspacecast instruction. If a pointer is converted
369 /// to this address space, optimizations should attempt to replace the access
370 /// with the source address space.
371 ///
372 /// \returns ~0u if the target does not have such a flat address space to
373 /// optimize away.
374 unsigned getFlatAddressSpace() const;
375
376 /// Return any intrinsic address operand indexes which may be rewritten if
377 /// they use a flat address space pointer.
378 ///
379 /// \returns true if the intrinsic was handled.
380 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
381 Intrinsic::ID IID) const;
382
383 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
384
385 unsigned getAssumedAddrSpace(const Value *V) const;
386
387 /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
388 /// NewV, which has a different address space. This should happen for every
389 /// operand index that collectFlatAddressOperands returned for the intrinsic.
390 /// \returns nullptr if the intrinsic was not handled. Otherwise, returns the
391 /// new value (which may be the original \p II with modified operands).
392 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
393 Value *NewV) const;
394
395 /// Test whether calls to a function lower to actual program function
396 /// calls.
397 ///
398 /// The idea is to test whether the program is likely to require a 'call'
399 /// instruction or equivalent in order to call the given function.
400 ///
401 /// FIXME: It's not clear that this is a good or useful query API. Clients
402 /// should probably move to simpler cost metrics using the above.
403 /// Alternatively, we could split the cost interface into distinct code-size
404 /// and execution-speed costs. This would allow modelling the core of this
405 /// query more accurately as a call is a single small instruction, but
406 /// incurs significant execution cost.
407 bool isLoweredToCall(const Function *F) const;
408
409 struct LSRCost {
410 /// TODO: Some of these could be merged. Also, a lexical ordering
411 /// isn't always optimal.
412 unsigned Insns;
413 unsigned NumRegs;
414 unsigned AddRecCost;
415 unsigned NumIVMuls;
416 unsigned NumBaseAdds;
417 unsigned ImmCost;
418 unsigned SetupCost;
419 unsigned ScaleCost;
420 };
421
422 /// Parameters that control the generic loop unrolling transformation.
423 struct UnrollingPreferences {
424 /// The cost threshold for the unrolled loop. Should be relative to the
425 /// getUserCost values returned by this API, and the expectation is that
426 /// the unrolled loop's instructions when run through that interface should
427 /// not exceed this cost. However, this is only an estimate. Also, specific
428 /// loops may be unrolled even with a cost above this threshold if deemed
429 /// profitable. Set this to UINT_MAX to disable the loop body cost
430 /// restriction.
431 unsigned Threshold;
432 /// If complete unrolling will reduce the cost of the loop, we will boost
433 /// the Threshold by a certain percent to allow more aggressive complete
434 /// unrolling. This value provides the maximum boost percentage that we
435 /// can apply to Threshold (The value should be no less than 100).
436 /// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost,
437 /// MaxPercentThresholdBoost / 100)
438 /// E.g. if complete unrolling reduces the loop execution time by 50%
439 /// then we boost the threshold by the factor of 2x. If unrolling is not
440 /// expected to reduce the running time, then we do not increase the
441 /// threshold.
442 unsigned MaxPercentThresholdBoost;
443 /// The cost threshold for the unrolled loop when optimizing for size (set
444 /// to UINT_MAX to disable).
445 unsigned OptSizeThreshold;
446 /// The cost threshold for the unrolled loop, like Threshold, but used
447 /// for partial/runtime unrolling (set to UINT_MAX to disable).
448 unsigned PartialThreshold;
449 /// The cost threshold for the unrolled loop when optimizing for size, like
450 /// OptSizeThreshold, but used for partial/runtime unrolling (set to
451 /// UINT_MAX to disable).
452 unsigned PartialOptSizeThreshold;
453 /// A forced unrolling factor (the number of concatenated bodies of the
454 /// original loop in the unrolled loop body). When set to 0, the unrolling
455 /// transformation will select an unrolling factor based on the current cost
456 /// threshold and other factors.
457 unsigned Count;
458 /// Default unroll count for loops with run-time trip count.
459 unsigned DefaultUnrollRuntimeCount;
460 // Set the maximum unrolling factor. The unrolling factor may be selected
461 // using the appropriate cost threshold, but may not exceed this number
462 // (set to UINT_MAX to disable). This does not apply in cases where the
463 // loop is being fully unrolled.
464 unsigned MaxCount;
465 /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
466 /// applies even if full unrolling is selected. This allows a target to fall
467 /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount.
468 unsigned FullUnrollMaxCount;
469 // Represents number of instructions optimized when "back edge"
470 // becomes "fall through" in unrolled loop.
471 // For now we count a conditional branch on a backedge and a comparison
472 // feeding it.
473 unsigned BEInsns;
474 /// Allow partial unrolling (unrolling of loops to expand the size of the
475 /// loop body, not only to eliminate small constant-trip-count loops).
476 bool Partial;
477 /// Allow runtime unrolling (unrolling of loops to expand the size of the
478 /// loop body even when the number of loop iterations is not known at
479 /// compile time).
480 bool Runtime;
481 /// Allow generation of a loop remainder (extra iterations after unroll).
482 bool AllowRemainder;
483 /// Allow emitting expensive instructions (such as divisions) when computing
484 /// the trip count of a loop for runtime unrolling.
485 bool AllowExpensiveTripCount;
486 /// Apply loop unroll on any kind of loop
487 /// (mainly to loops that fail runtime unrolling).
488 bool Force;
489 /// Allow using trip count upper bound to unroll loops.
490 bool UpperBound;
491 /// Allow unrolling of all the iterations of the runtime loop remainder.
492 bool UnrollRemainder;
493 /// Allow unroll and jam. Used to enable unroll and jam for the target.
494 bool UnrollAndJam;
495 /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
496 /// value above is used during unroll and jam for the outer loop size.
497 /// This value is used in the same manner to limit the size of the inner
498 /// loop.
499 unsigned UnrollAndJamInnerLoopThreshold;
500 /// Don't allow loop unrolling to simulate more than this number of
501 /// iterations when checking full unroll profitability
502 unsigned MaxIterationsCountToAnalyze;
503 };
504
505 /// Get target-customized preferences for the generic loop unrolling
506 /// transformation. The caller will initialize UP with the current
507 /// target-independent defaults.
508 void getUnrollingPreferences(Loop *L, ScalarEvolution &,
509 UnrollingPreferences &UP) const;
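/// A sketch only (MyTargetTTIImpl is a placeholder; real targets implement
/// this hook in their own TTI implementation class, which is not part of this
/// header). Typically only a few of the defaulted fields are adjusted:
/// \code
///   void MyTargetTTIImpl::getUnrollingPreferences(
///       Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) {
///     UP.Partial = true; // allow partial unrolling
///     UP.Runtime = true; // allow runtime unrolling
///     UP.MaxCount = 4;   // but never unroll by more than a factor of 4
///   }
/// \endcode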
510
511 /// Query the target whether it would be profitable to convert the given loop
512 /// into a hardware loop.
513 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
514 AssumptionCache &AC, TargetLibraryInfo *LibInfo,
515 HardwareLoopInfo &HWLoopInfo) const;
516
517 /// Query the target whether it would be preferred to create a predicated
518 /// vector loop, which can avoid the need to emit a scalar epilogue loop.
519 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
520 AssumptionCache &AC, TargetLibraryInfo *TLI,
521 DominatorTree *DT,
522 const LoopAccessInfo *LAI) const;
523
524 /// Query the target whether lowering of the llvm.get.active.lane.mask
525 /// intrinsic is supported.
526 bool emitGetActiveLaneMask() const;
527
528 // Parameters that control the loop peeling transformation
529 struct PeelingPreferences {
530 /// A forced peeling factor (the number of bodies of the original loop
531 /// that should be peeled off before the loop body). When set to 0, a
532 /// peeling factor based on profile information and other factors is used.
533 unsigned PeelCount;
534 /// Allow peeling off loop iterations.
535 bool AllowPeeling;
536 /// Allow peeling off loop iterations for loop nests.
537 bool AllowLoopNestsPeeling;
538 /// Allow peeling based on profile. Used to enable peeling off all
539 /// iterations based on the provided profile.
540 /// If the value is true the peeling cost model can decide to peel only
541 /// some iterations and in this case it will set this to false.
542 bool PeelProfiledIterations;
543 };
544
545 /// Get target-customized preferences for the generic loop peeling
546 /// transformation. The caller will initialize \p PP with the current
547 /// target-independent defaults with information from \p L and \p SE.
548 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
549 PeelingPreferences &PP) const;
550
551 /// Targets can implement their own combinations for target-specific
552 /// intrinsics. This function will be called from the InstCombine pass every
553 /// time a target-specific intrinsic is encountered.
554 ///
555 /// \returns None to not do anything target specific or a value that will be
556 /// returned from the InstCombiner. It is possible to stop further processing
557 /// of the intrinsic by returning nullptr.
558 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
559 IntrinsicInst &II) const;
560 /// Can be used to implement target-specific instruction combining.
561 /// \see instCombineIntrinsic
562 Optional<Value *>
563 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
564 APInt DemandedMask, KnownBits &Known,
565 bool &KnownBitsComputed) const;
566 /// Can be used to implement target-specific instruction combining.
567 /// \see instCombineIntrinsic
568 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
569 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
570 APInt &UndefElts2, APInt &UndefElts3,
571 std::function<void(Instruction *, unsigned, APInt, APInt &)>
572 SimplifyAndSetOp) const;
573 /// @}
574
575 /// \name Scalar Target Information
576 /// @{
577
578 /// Flags indicating the kind of support for population count.
579 ///
580 /// Compared to the SW implementation, HW support is supposed to
581 /// significantly boost the performance when the population is dense, and it
582 /// may or may not degrade performance if the population is sparse. A HW
583 /// support is considered as "Fast" if it can outperform, or is on a par
584 /// with, SW implementation when the population is sparse; otherwise, it is
585 /// considered as "Slow".
586 enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };
587
588 /// Return true if the specified immediate is a legal add immediate, that
589 /// is, the target has add instructions which can add a register with the
590 /// immediate without having to materialize the immediate into a register.
591 bool isLegalAddImmediate(int64_t Imm) const;
592
593 /// Return true if the specified immediate is a legal icmp immediate,
594 /// that is, the target has icmp instructions which can compare a register
595 /// against the immediate without having to materialize the immediate into a
596 /// register.
597 bool isLegalICmpImmediate(int64_t Imm) const;
598
599 /// Return true if the addressing mode represented by AM is legal for
600 /// this target, for a load/store of the specified type.
601 /// The type may be VoidTy, in which case only return true if the addressing
602 /// mode is legal for a load/store of any legal type.
603 /// If the target returns true in LSRWithInstrQueries(), \p I may be valid.
604 /// TODO: Handle pre/postinc as well.
605 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
606 bool HasBaseReg, int64_t Scale,
607 unsigned AddrSpace = 0,
608 Instruction *I = nullptr) const;
609
610 /// Return true if the LSR cost of C1 is lower than the cost of C2.
611 bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
612 TargetTransformInfo::LSRCost &C2) const;
613
614 /// Return true if LSR major cost is number of registers. Targets which
615 /// implement their own isLSRCostLess and unset number of registers as major
616 /// cost should return false, otherwise return true.
617 bool isNumRegsMajorCostOfLSR() const;
618
619 /// \returns true if LSR should not optimize a chain that includes \p I.
620 bool isProfitableLSRChainElement(Instruction *I) const;
621
622 /// Return true if the target can fuse a compare and branch.
623 /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
624 /// calculation for the instructions in a loop.
625 bool canMacroFuseCmp() const;
626
627 /// Return true if the target can save a compare for loop count, for example
628 /// a hardware loop saves a compare.
629 bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
630 DominatorTree *DT, AssumptionCache *AC,
631 TargetLibraryInfo *LibInfo) const;
632
633 /// \return True if LSR should make efforts to create/preserve post-inc
634 /// addressing mode expressions.
635 bool shouldFavorPostInc() const;
636
637 /// Return true if LSR should make efforts to generate indexed addressing
638 /// modes that operate across loop iterations.
639 bool shouldFavorBackedgeIndex(const Loop *L) const;
640
641 /// Return true if the target supports masked store.
642 bool isLegalMaskedStore(Type *DataType, Align Alignment) const;
643 /// Return true if the target supports masked load.
644 bool isLegalMaskedLoad(Type *DataType, Align Alignment) const;
645
646 /// Return true if the target supports nontemporal store.
647 bool isLegalNTStore(Type *DataType, Align Alignment) const;
648 /// Return true if the target supports nontemporal load.
649 bool isLegalNTLoad(Type *DataType, Align Alignment) const;
650
651 /// Return true if the target supports masked scatter.
652 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
653 /// Return true if the target supports masked gather.
654 bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
655
656 /// Return true if the target supports masked compress store.
657 bool isLegalMaskedCompressStore(Type *DataType) const;
658 /// Return true if the target supports masked expand load.
659 bool isLegalMaskedExpandLoad(Type *DataType) const;
660
661 /// Return true if the target has a unified operation to calculate division
662 /// and remainder. If so, the additional implicit multiplication and
663 /// subtraction required to calculate a remainder from division are free. This
664 /// can enable more aggressive transformations for division and remainder than
665 /// would typically be allowed using throughput or size cost models.
666 bool hasDivRemOp(Type *DataType, bool IsSigned) const;
667
668 /// Return true if the given instruction (assumed to be a memory access
669 /// instruction) has a volatile variant. If that's the case then we can avoid
670 /// addrspacecast to generic AS for volatile loads/stores. Default
671 /// implementation returns false, which prevents address space inference for
672 /// volatile loads/stores.
673 bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const;
674
675 /// Return true if target doesn't mind addresses in vectors.
676 bool prefersVectorizedAddressing() const;
677
678 /// Return the cost of the scaling factor used in the addressing
679 /// mode represented by AM for this target, for a load/store
680 /// of the specified type.
681 /// If the AM is supported, the return value must be >= 0.
682 /// If the AM is not supported, it returns a negative value.
683 /// TODO: Handle pre/postinc as well.
684 int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
685 bool HasBaseReg, int64_t Scale,
686 unsigned AddrSpace = 0) const;
687
688 /// Return true if the loop strength reduce pass should make
689 /// Instruction* based TTI queries to isLegalAddressingMode(). This is
690 /// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned
691 /// immediate offset and no index register.
692 bool LSRWithInstrQueries() const;
693
694 /// Return true if it's free to truncate a value of type Ty1 to type
695 /// Ty2. e.g. On x86 it's free to truncate an i32 value in register EAX to i16
696 /// by referencing its sub-register AX.
697 bool isTruncateFree(Type *Ty1, Type *Ty2) const;
698
699 /// Return true if it is profitable to hoist an instruction in the
700 /// then/else blocks to before the if.
701 bool isProfitableToHoist(Instruction *I) const;
702
703 bool useAA() const;
704
705 /// Return true if this type is legal.
706 bool isTypeLegal(Type *Ty) const;
707
708 /// Returns the estimated number of registers required to represent \p Ty.
709 unsigned getRegUsageForType(Type *Ty) const;
710
711 /// Return true if switches should be turned into lookup tables for the
712 /// target.
713 bool shouldBuildLookupTables() const;
714
715 /// Return true if switches should be turned into lookup tables
716 /// containing this constant value for the target.
717 bool shouldBuildLookupTablesForConstant(Constant *C) const;
718
719 /// Return true if the input function, which is cold at all call sites,
720 /// should use the coldcc calling convention.
721 bool useColdCCForColdCall(Function &F) const;
722
723 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
724 /// are set if the demanded result elements need to be inserted and/or
725 /// extracted from vectors.
726 unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
727 bool Insert, bool Extract) const;
728
729 /// Estimate the overhead of scalarizing an instruction's unique
730 /// non-constant operands. The types of the arguments are ordinarily
731 /// scalar, in which case the costs are multiplied with VF.
732 unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
733 unsigned VF) const;
734
735 /// If target has efficient vector element load/store instructions, it can
736 /// return true here so that insertion/extraction costs are not added to
737 /// the scalarization cost of a load/store.
738 bool supportsEfficientVectorElementLoadStore() const;
739
740 /// Don't restrict interleaved unrolling to small loops.
741 bool enableAggressiveInterleaving(bool LoopHasReductions) const;
742
743 /// Returns options for expansion of memcmp. IsZeroCmp is
744 /// true if this is the expansion of memcmp(p1, p2, s) == 0.
745 struct MemCmpExpansionOptions {
746 // Return true if memcmp expansion is enabled.
747 operator bool() const { return MaxNumLoads > 0; }
748
749 // Maximum number of load operations.
750 unsigned MaxNumLoads = 0;
751
752 // The list of available load sizes (in bytes), sorted in decreasing order.
753 SmallVector<unsigned, 8> LoadSizes;
754
755 // For memcmp expansion when the memcmp result is only compared equal or
756 // not-equal to 0, allow up to this number of load pairs per block. As an
757 // example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
758 // a0 = load2bytes &a[0]
759 // b0 = load2bytes &b[0]
760 // a2 = load1byte &a[2]
761 // b2 = load1byte &b[2]
762 // r = cmp eq (a0 ^ b0 | a2 ^ b2), 0
763 unsigned NumLoadsPerBlock = 1;
764
765 // Set to true to allow overlapping loads. For example, 7-byte compares can
766 // be done with two 4-byte compares instead of 4+2+1-byte compares. This
767 // requires all loads in LoadSizes to be doable in an unaligned way.
768 bool AllowOverlappingLoads = false;
769 };
770 MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
771 bool IsZeroCmp) const;
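/// A usage sketch (OptSize is the caller's own flag, CI the memcmp call being
/// considered, and expandMemCmp() a hypothetical helper in the client pass,
/// not part of this interface):
/// \code
///   TTI::MemCmpExpansionOptions Options =
///       TTI.enableMemCmpExpansion(OptSize, /*IsZeroCmp=*/true);
///   if (Options)
///     expandMemCmp(CI, Options);
/// \endcode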
772
773 /// Enable matching of interleaved access groups.
774 bool enableInterleavedAccessVectorization() const;
775
776 /// Enable matching of interleaved access groups that contain predicated
777 /// accesses or gaps and are therefore vectorized using masked
778 /// vector loads/stores.
779 bool enableMaskedInterleavedAccessVectorization() const;
780
781 /// Indicate that it is potentially unsafe to automatically vectorize
782 /// floating-point operations because the semantics of vector and scalar
783 /// floating-point operations may differ. For example, ARM NEON v7 SIMD math
784 /// does not support IEEE-754 denormal numbers, while depending on the
785 /// platform, scalar floating-point math does.
786 /// This applies to floating-point math operations and calls, not memory
787 /// operations, shuffles, or casts.
788 bool isFPVectorizationPotentiallyUnsafe() const;
789
790 /// Determine if the target supports unaligned memory accesses.
791 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
792 unsigned AddressSpace = 0,
793 unsigned Alignment = 1,
794 bool *Fast = nullptr) const;
795
796 /// Return hardware support for population count.
797 PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
798
799 /// Return true if the hardware has a fast square-root instruction.
800 bool haveFastSqrt(Type *Ty) const;
801
802 /// Return true if it is faster to check if a floating-point value is NaN
803 /// (or not-NaN) versus a comparison against a constant FP zero value.
804 /// Targets should override this if materializing a 0.0 for comparison is
805 /// generally as cheap as checking for ordered/unordered.
806 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;
807
808 /// Return the expected cost of supporting the floating point operation
809 /// of the specified type.
810 int getFPOpCost(Type *Ty) const;
811
812 /// Return the expected cost of materializing for the given integer
813 /// immediate of the specified type.
814 int getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const;
815
816 /// Return the expected cost of materialization for the given integer
817 /// immediate of the specified type for a given instruction. The cost can be
818 /// zero if the immediate can be folded into the specified instruction.
819 int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm, Type *Ty,
820 TargetCostKind CostKind,
821 Instruction *Inst = nullptr) const;
822 int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
823 Type *Ty, TargetCostKind CostKind) const;
824
825 /// Return the expected cost for the given integer when optimising
826 /// for size. This is different than the other integer immediate cost
827 /// functions in that it is subtarget agnostic. This is useful when you e.g.
828 /// target one ISA such as Aarch32 but smaller encodings could be possible
829 /// with another such as Thumb. This return value is used as a penalty when
830 /// the total cost for a constant is calculated (the bigger the cost, the
831 /// more beneficial constant hoisting is).
832 int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
833 Type *Ty) const;
834 /// @}
835
836 /// \name Vector Target Information
837 /// @{
838
839 /// The various kinds of shuffle patterns for vector queries.
840 enum ShuffleKind {
841 SK_Broadcast, ///< Broadcast element 0 to all other elements.
842 SK_Reverse, ///< Reverse the order of the vector.
843 SK_Select, ///< Selects elements from the corresponding lane of
844 ///< either source operand. This is equivalent to a
845 ///< vector select with a constant condition operand.
846 SK_Transpose, ///< Transpose two vectors.
847 SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
848 SK_ExtractSubvector, ///< ExtractSubvector. Index indicates start offset.
849 SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one
850 ///< with any shuffle mask.
851 SK_PermuteSingleSrc ///< Shuffle elements of single source vector with any
852 ///< shuffle mask.
853 };
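/// For instance (sketch only, with VecTy a VectorType * chosen by the caller),
/// the cost of splatting lane 0 across a vector maps to SK_Broadcast:
/// \code
///   int SplatCost = TTI.getShuffleCost(TTI::SK_Broadcast, VecTy);
/// \endcode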
854
855 /// Kind of the reduction data.
856 enum ReductionKind {
857 RK_None, /// Not a reduction.
858 RK_Arithmetic, /// Binary reduction data.
859 RK_MinMax, /// Min/max reduction data.
860 RK_UnsignedMinMax, /// Unsigned min/max reduction data.
861 };
862
863 /// Contains opcode + LHS/RHS parts of the reduction operations.
864 struct ReductionData {
865 ReductionData() = delete;
866 ReductionData(ReductionKind Kind, unsigned Opcode, Value *LHS, Value *RHS)
867 : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind) {
868 assert(Kind != RK_None && "expected binary or min/max reduction only.");
869 }
870 unsigned Opcode = 0;
871 Value *LHS = nullptr;
872 Value *RHS = nullptr;
873 ReductionKind Kind = RK_None;
874 bool hasSameData(ReductionData &RD) const {
875 return Kind == RD.Kind && Opcode == RD.Opcode;
876 }
877 };
878
879 static ReductionKind matchPairwiseReduction(
880 const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty);
881
882 static ReductionKind matchVectorSplittingReduction(
883 const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty);
884
885 static ReductionKind matchVectorReduction(const ExtractElementInst *ReduxRoot,
886 unsigned &Opcode, VectorType *&Ty,
887 bool &IsPairwise);
888
889 /// Additional information about an operand's possible values.
890 enum OperandValueKind {
891 OK_AnyValue, // Operand can have any value.
892 OK_UniformValue, // Operand is uniform (splat of a value).
893 OK_UniformConstantValue, // Operand is uniform constant.
894 OK_NonUniformConstantValue // Operand is a non uniform constant value.
895 };
896
897 /// Additional properties of an operand's values.
898 enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
899
900 /// \return the number of registers in the target-provided register class.
901 unsigned getNumberOfRegisters(unsigned ClassID) const;
902
903 /// \return the target-provided register class ID for the provided type,
904 /// accounting for type promotion and other type-legalization techniques that
905 /// the target might apply. However, it specifically does not account for the
906 /// scalarization or splitting of vector types. Should a vector type require
907 /// scalarization or splitting into multiple underlying vector registers, that
908 /// type should be mapped to a register class containing no registers.
909 /// Specifically, this is designed to provide a simple, high-level view of the
910 /// register allocation later performed by the backend. These register classes
911 /// don't necessarily map onto the register classes used by the backend.
912 /// FIXME: It's not currently possible to determine how many registers
913 /// are used by the provided type.
914 unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const;
915
916 /// \return the target-provided register class name
917 const char *getRegisterClassName(unsigned ClassID) const;
918
919 /// \return The width of the largest scalar or vector register type.
920 unsigned getRegisterBitWidth(bool Vector) const;
921
922 /// \return The width of the smallest vector register type.
923 unsigned getMinVectorRegisterBitWidth() const;
924
925 /// \return True if the vectorization factor should be chosen to
926 /// make the vector of the smallest element type match the size of a
927 /// vector register. For wider element types, this could result in
928 /// creating vectors that span multiple vector registers.
929 /// If false, the vectorization factor will be chosen based on the
930 /// size of the widest element type.
931 bool shouldMaximizeVectorBandwidth(bool OptSize) const;
932
933 /// \return The minimum vectorization factor for types of given element
934 /// bit width, or 0 if there is no minimum VF. The returned value only
935 /// applies when shouldMaximizeVectorBandwidth returns true.
936 unsigned getMinimumVF(unsigned ElemWidth) const;
937
938 /// \return True if it should be considered for address type promotion.
939 /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
940 /// profitable without finding other extensions fed by the same input.
941 bool shouldConsiderAddressTypePromotion(
942 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;
943
944 /// \return The size of a cache line in bytes.
945 unsigned getCacheLineSize() const;
946
947 /// The possible cache levels
948 enum class CacheLevel {
949 L1D, // The L1 data cache
950 L2D, // The L2 data cache
951
952 // We currently do not model L3 caches, as their sizes differ widely between
953 // microarchitectures. Also, we currently do not have a use for L3 cache
954 // size modeling yet.
955 };
956
957 /// \return The size of the cache level in bytes, if available.
958 Optional<unsigned> getCacheSize(CacheLevel Level) const;
959
960 /// \return The associativity of the cache level, if available.
961 Optional<unsigned> getCacheAssociativity(CacheLevel Level) const;
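/// For example (sketch only, WorkingSetBytes being the client's own estimate),
/// a loop transformation might check whether its footprint fits in L1:
/// \code
///   Optional<unsigned> L1Size = TTI.getCacheSize(TTI::CacheLevel::L1D);
///   bool FitsInL1 = L1Size && WorkingSetBytes <= *L1Size;
/// \endcode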
962
963 /// \return How much before a load we should place the prefetch
964 /// instruction. This is currently measured in number of
965 /// instructions.
966 unsigned getPrefetchDistance() const;
967
968 /// Some HW prefetchers can handle accesses up to a certain constant stride.
969 /// Sometimes prefetching is beneficial even below the HW prefetcher limit,
970 /// and the arguments provided are meant to serve as a basis for deciding this
971 /// for a particular loop.
972 ///
973 /// \param NumMemAccesses Number of memory accesses in the loop.
974 /// \param NumStridedMemAccesses Number of the memory accesses that
975 /// ScalarEvolution could find a known stride
976 /// for.
977 /// \param NumPrefetches Number of software prefetches that will be
978 /// emitted as determined by the addresses
979 /// involved and the cache line size.
980 /// \param HasCall True if the loop contains a call.
981 ///
982 /// \return This is the minimum stride in bytes where it makes sense to start
983 /// adding SW prefetches. The default is 1, i.e. prefetch with any
984 /// stride.
985 unsigned getMinPrefetchStride(unsigned NumMemAccesses,
986 unsigned NumStridedMemAccesses,
987 unsigned NumPrefetches, bool HasCall) const;
988
989 /// \return The maximum number of iterations to prefetch ahead. If
990 /// the required number of iterations is more than this number, no
991 /// prefetching is performed.
992 unsigned getMaxPrefetchIterationsAhead() const;
993
994 /// \return True if prefetching should also be done for writes.
995 bool enableWritePrefetching() const;
996
997 /// \return The maximum interleave factor that any transform should try to
998 /// perform for this target. This number depends on the level of parallelism
999 /// and the number of execution units in the CPU.
1000 unsigned getMaxInterleaveFactor(unsigned VF) const;
1001
1002 /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
1003 static OperandValueKind getOperandInfo(const Value *V,
1004 OperandValueProperties &OpProps);
1005
1006 /// This is an approximation of reciprocal throughput of a math/logic op.
1007 /// A higher cost indicates less expected throughput.
1008 /// From Agner Fog's guides, reciprocal throughput is "the average number of
1009 /// clock cycles per instruction when the instructions are not part of a
1010 /// limiting dependency chain."
1011 /// Therefore, costs should be scaled to account for multiple execution units
1012 /// on the target that can process this type of instruction. For example, if
1013 /// there are 5 scalar integer units and 2 vector integer units that can
1014 /// calculate an 'add' in a single cycle, this model should indicate that the
1015 /// cost of the vector add instruction is 2.5 times the cost of the scalar
1016 /// add instruction.
1017 /// \p Args is an optional argument which holds the instruction operands
1018 /// values so the TTI can analyze those values searching for special
1019 /// cases or optimizations based on those values.
1020 /// \p CxtI is the optional original context instruction, if one exists, to
1021 /// provide even more information.
1022 int getArithmeticInstrCost(
1023 unsigned Opcode, Type *Ty,
1024 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
1025 OperandValueKind Opd1Info = OK_AnyValue,
1026 OperandValueKind Opd2Info = OK_AnyValue,
1027 OperandValueProperties Opd1PropInfo = OP_None,
1028 OperandValueProperties Opd2PropInfo = OP_None,
1029 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
1030 const Instruction *CxtI = nullptr) const;
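/// For example (sketch only, VecTy chosen by the caller), the throughput cost
/// of a vector multiply whose second operand is a uniform constant splat could
/// be queried as:
/// \code
///   int MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VecTy,
///                                            TTI::TCK_RecipThroughput,
///                                            TTI::OK_AnyValue,
///                                            TTI::OK_UniformConstantValue);
/// \endcode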
1031
1032 /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
1033 /// The index and subtype parameters are used by the subvector insertion and
1034 /// extraction shuffle kinds to show the insert/extract point and the type of
1035 /// the subvector being inserted/extracted.
1036 /// NOTE: For subvector extractions Tp represents the source type.
1037 int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index = 0,
1038 VectorType *SubTp = nullptr) const;
1039
1040 /// Represents a hint about the context in which a cast is used.
1041 ///
1042 /// For zext/sext, the context of the cast is the operand, which must be a
1043 /// load of some kind. For trunc, the context of the cast is the single
1044 /// user of the instruction, which must be a store of some kind.
1045 ///
1046 /// This enum allows the vectorizer to give getCastInstrCost an idea of the
1047 /// type of cast it's dealing with, as not every cast is equal. For instance,
1048 /// the zext of a load may be free, but the zext of an interleaving load can
1049 /// be (very) expensive!
1050 ///
1051 /// See \c getCastContextHint to compute a CastContextHint from a cast
1052 /// Instruction*. Callers can use it if they don't need to override the
1053 /// context and just want it to be calculated from the instruction.
1054 ///
1055 /// FIXME: This handles the types of load/store that the vectorizer can
1056 /// produce, which are the cases where the context instruction is most
1057 /// likely to be incorrect. There are other situations where that can happen
1058 /// too, which might be handled here but in the long run a more general
1059 /// solution of costing multiple instructions at the same time may be better.
1060 enum class CastContextHint : uint8_t {
1061 None, ///< The cast is not used with a load/store of any kind.
1062 Normal, ///< The cast is used with a normal load/store.
1063 Masked, ///< The cast is used with a masked load/store.
1064 GatherScatter, ///< The cast is used with a gather/scatter.
1065 Interleave, ///< The cast is used with an interleaved load/store.
1066 Reversed, ///< The cast is used with a reversed load/store.
1067 };
1068
1069 /// Calculates a CastContextHint from \p I.
1070 /// This should be used by callers of getCastInstrCost if they wish to
1071 /// determine the context from some instruction.
1072 /// \returns the CastContextHint for ZExt/SExt/Trunc, None if \p I is nullptr,
1073 /// or if it's another type of cast.
1074 static CastContextHint getCastContextHint(const Instruction *I);
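/// A typical pairing (sketch only, Cast being an existing cast instruction in
/// the caller's hands) derives the hint from the instruction and forwards it:
/// \code
///   TTI::CastContextHint CCH = TTI::getCastContextHint(Cast);
///   int Cost = TTI.getCastInstrCost(Cast->getOpcode(), Cast->getType(),
///                                   Cast->getOperand(0)->getType(), CCH,
///                                   TTI::TCK_RecipThroughput, Cast);
/// \endcode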
1075
1076 /// \return The expected cost of cast instructions, such as bitcast, trunc,
1077 /// zext, etc. If there is an existing instruction that holds Opcode, it
1078 /// may be passed in the 'I' parameter.
1079 int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1080 TTI::CastContextHint CCH,
1081 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
1082 const Instruction *I = nullptr) const;
1083
1084 /// \return The expected cost of a sign- or zero-extended vector extract. Use
1085 /// -1 to indicate that there is no information about the index value.
1086 int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
1087 unsigned Index = -1) const;
1088
1089 /// \return The expected cost of control-flow related instructions such as
1090 /// Phi, Ret, Br.
1091 int getCFInstrCost(unsigned Opcode,
1092 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;
1093
1094 /// \returns The expected cost of compare and select instructions. If there
1095 /// is an existing instruction that holds Opcode, it may be passed in the
1096 /// 'I' parameter. The \p VecPred parameter can be used to indicate the select
1097 /// is using a compare with the specified predicate as condition. When vector
1098 /// types are passed, \p VecPred must be used for all lanes.
1099 int getCmpSelInstrCost(
1100 unsigned Opcode, Type *ValTy, Type *CondTy = nullptr,
1101 CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE,
1102 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
1103 const Instruction *I = nullptr) const;
1104
1105 /// \return The expected cost of vector Insert and Extract.
1106 /// Use -1 to indicate that there is no information on the index value.
1107 int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index = -1) const;
1108
1109 /// \return The cost of Load and Store instructions.
1110 int getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1111 unsigned AddressSpace,
1112 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
1113 const Instruction *I = nullptr) const;
1114
1115 /// \return The cost of masked Load and Store instructions.
1116 int getMaskedMemoryOpCost(
1117 unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
1118 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
1119
1120 /// \return The cost of Gather or Scatter operation
1121 /// \p Opcode - is a type of memory access Load or Store
1122 /// \p DataTy - a vector type of the data to be loaded or stored
1123 /// \p Ptr - pointer [or vector of pointers] - address[es] in memory
1124 /// \p VariableMask - true when the memory access is predicated with a mask
1125 /// that is not a compile-time constant
1126 /// \p Alignment - alignment of single element
1127 /// \p I - the optional original context instruction, if one exists, e.g. the
1128 /// load/store to transform or the call to the gather/scatter intrinsic
1129 int getGatherScatterOpCost(
1130 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1131 Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
1132 const Instruction *I = nullptr) const;
1133
1134 /// \return The cost of the interleaved memory operation.
1135 /// \p Opcode is the memory operation code
1136 /// \p VecTy is the vector type of the interleaved access.
1137 /// \p Factor is the interleave factor
1138 /// \p Indices is the indices for interleaved load members (as interleaved
1139 /// load allows gaps)
1140 /// \p Alignment is the alignment of the memory operation
1141 /// \p AddressSpace is address space of the pointer.
1142 /// \p UseMaskForCond indicates if the memory access is predicated.
1143 /// \p UseMaskForGaps indicates if gaps should be masked.
1144 int getInterleavedMemoryOpCost(
1145 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1146 Align Alignment, unsigned AddressSpace,
1147 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
1148 bool UseMaskForCond = false, bool UseMaskForGaps = false) const;
1149
1150 /// Calculate the cost of performing a vector reduction.
1151 ///
1152 /// This is the cost of reducing the vector value of type \p Ty to a scalar
1153 /// value using the operation denoted by \p Opcode. The form of the reduction
1154 /// can either be a pairwise reduction or a reduction that splits the vector
1155 /// at every reduction level.
1156 ///
1157 /// Pairwise:
1158 /// (v0, v1, v2, v3)
1159 /// ((v0+v1), (v2+v3), undef, undef)
1160 /// Split:
1161 /// (v0, v1, v2, v3)
1162 /// ((v0+v2), (v1+v3), undef, undef)
1163 int getArithmeticReductionCost(
1164 unsigned Opcode, VectorType *Ty, bool IsPairwiseForm,
1165 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
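/// For instance (sketch only, VecTy chosen by the caller), the cost of a
/// non-pairwise add reduction of a vector to a scalar could be queried as:
/// \code
///   int ReduceCost = TTI.getArithmeticReductionCost(Instruction::Add, VecTy,
///                                                   /*IsPairwiseForm=*/false);
/// \endcode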
1166
1167 int getMinMaxReductionCost(
1168 VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
1169 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
1170
1171 /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
1172 /// Three cases are handled: 1. scalar instruction 2. vector instruction
1173 /// 3. scalar instruction which is to be vectorized.
1174 int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1175 TTI::TargetCostKind CostKind) const;
1176
1177 /// \returns The cost of Call instructions.
1178 int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1179 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;
1180
1181 /// \returns The number of pieces into which the provided type must be
1182 /// split during legalization. Zero is returned when the answer is unknown.
1183 unsigned getNumberOfParts(Type *Tp) const;
1184
1185 /// \returns The cost of the address computation. For most targets this can be
1186 /// merged into the instruction indexing mode. Some targets might want to
1187 /// distinguish between address computation for memory operations on vector
1188 /// types and scalar types. Such targets should override this function.
1189 /// The 'SE' parameter holds pointer for the scalar evolution object which
1190 /// is used in order to get the Ptr step value in case of constant stride.
1191 /// The 'Ptr' parameter holds SCEV of the access pointer.
1192 int getAddressComputationCost(Type *Ty, ScalarEvolution *SE = nullptr,
1193 const SCEV *Ptr = nullptr) const;
1194
1195 /// \returns The cost, if any, of keeping values of the given types alive
1196 /// over a callsite.
1197 ///
1198 /// Some types may require the use of register classes that do not have
1199 /// any callee-saved registers, so would require a spill and fill.
1200 unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const;
1201
1202 /// \returns True if the intrinsic is a supported memory intrinsic. Info
1203 /// will contain additional information - whether the intrinsic may read
1204 /// or write memory, its volatility, and the pointer. Info is undefined
1205 /// if false is returned.
1206 bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
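//
// Usage sketch (illustrative only): a pass inspecting a target memory
// intrinsic might query this hook as follows, where II is assumed to be an
// IntrinsicInst* and TTI a TargetTransformInfo reference.
//
//   MemIntrinsicInfo Info;
//   if (TTI.getTgtMemIntrinsic(II, Info) && Info.ReadMem && Info.isUnordered()) {
//     // Safe to treat II roughly like an unordered load from Info.PtrVal.
//   }
//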
1207
1208 /// \returns The maximum element size, in bytes, for an element
1209 /// unordered-atomic memory intrinsic.
1210 unsigned getAtomicMemIntrinsicMaxElementSize() const;
1211
1212 /// \returns A value which is the result of the given memory intrinsic. New
1213 /// instructions may be created to extract the result from the given intrinsic
1214 /// memory operation. Returns nullptr if the target cannot create a result
1215 /// from the given intrinsic.
1216 Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1217 Type *ExpectedType) const;
1218
1219 /// \returns The type to use in a loop expansion of a memcpy call.
1220 Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
1221 unsigned SrcAddrSpace, unsigned DestAddrSpace,
1222 unsigned SrcAlign, unsigned DestAlign) const;
1223
1224 /// \param[out] OpsOut The operand types to copy RemainingBytes of memory.
1225 /// \param RemainingBytes The number of bytes to copy.
1226 ///
1227 /// Calculates the operand types to use when copying \p RemainingBytes of
1228 /// memory, where source and destination alignments are \p SrcAlign and
1229 /// \p DestAlign respectively.
1230 void getMemcpyLoopResidualLoweringType(
1231 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
1232 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
1233 unsigned SrcAlign, unsigned DestAlign) const;
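//
// Sketch of how an expansion pass might combine the two memcpy hooks above
// (illustration only; the values Ctx, Len, SrcAS, DstAS, SrcAlign, DstAlign
// and RemainingBytes are placeholders assumed to be in scope).
//
//   Type *LoopOpTy = TTI.getMemcpyLoopLoweringType(Ctx, Len, SrcAS, DstAS,
//                                                  SrcAlign, DstAlign);
//   SmallVector<Type *, 4> ResidualTys;
//   TTI.getMemcpyLoopResidualLoweringType(ResidualTys, Ctx, RemainingBytes,
//                                         SrcAS, DstAS, SrcAlign, DstAlign);
//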
1234
1235 /// \returns True if the two functions have compatible attributes for inlining
1236 /// purposes.
1237 bool areInlineCompatible(const Function *Caller,
1238 const Function *Callee) const;
1239
1240 /// \returns True if the caller and callee agree on how \p Args will be passed
1241 /// to the callee.
1242 /// \param[out] Args The list of compatible arguments. The implementation may
1243 /// filter out any incompatible args from this list.
1244 bool areFunctionArgsABICompatible(const Function *Caller,
1245 const Function *Callee,
1246 SmallPtrSetImpl<Argument *> &Args) const;
1247
1248 /// The type of load/store indexing.
1249 enum MemIndexedMode {
1250 MIM_Unindexed, ///< No indexing.
1251 MIM_PreInc, ///< Pre-incrementing.
1252 MIM_PreDec, ///< Pre-decrementing.
1253 MIM_PostInc, ///< Post-incrementing.
1254 MIM_PostDec ///< Post-decrementing.
1255 };
1256
1257 /// \returns True if the specified indexed load for the given type is legal.
1258 bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const;
1259
1260 /// \returns True if the specified indexed store for the given type is legal.
1261 bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const;
1262
1263 /// \returns The bitwidth of the largest vector type that should be used to
1264 /// load/store in the given address space.
1265 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
1266
1267 /// \returns True if the load instruction is legal to vectorize.
1268 bool isLegalToVectorizeLoad(LoadInst *LI) const;
1269
1270 /// \returns True if the store instruction is legal to vectorize.
1271 bool isLegalToVectorizeStore(StoreInst *SI) const;
1272
1273 /// \returns True if it is legal to vectorize the given load chain.
1274 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
1275 unsigned AddrSpace) const;
1276
1277 /// \returns True if it is legal to vectorize the given store chain.
1278 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
1279 unsigned AddrSpace) const;
1280
1281 /// \returns The new vector factor value if the target doesn't support \p
1282 /// SizeInBytes loads or has a better vector factor.
1283 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1284 unsigned ChainSizeInBytes,
1285 VectorType *VecTy) const;
1286
1287 /// \returns The new vector factor value if the target doesn't support \p
1288 /// SizeInBytes stores or has a better vector factor.
1289 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1290 unsigned ChainSizeInBytes,
1291 VectorType *VecTy) const;
1292
1293 /// Flags describing the kind of vector reduction.
1294 struct ReductionFlags {
1295 ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {}
1296 bool IsMaxOp; ///< If the op is a min/max kind, true if it's a max operation.
1297 bool IsSigned; ///< Whether the operation is a signed int reduction.
1298 bool NoNaN; ///< If op is an fp min/max, whether NaNs may be present.
1299 };
1300
1301 /// \returns True if the target wants to handle the given reduction idiom in
1302 /// the intrinsics form instead of the shuffle form.
1303 bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
1304 ReductionFlags Flags) const;
1305
1306 /// \returns True if the target prefers reductions to be performed in the loop.
1307 bool preferInLoopReduction(unsigned Opcode, Type *Ty,
1308 ReductionFlags Flags) const;
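//
// Illustrative example: asking whether an i32 add reduction should be kept
// in-loop (TTI and Ctx assumed in scope; default-constructed flags suit a
// plain integer add).
//
//   TargetTransformInfo::ReductionFlags Flags;
//   bool KeepInLoop = TTI.preferInLoopReduction(Instruction::Add,
//                                               Type::getInt32Ty(Ctx), Flags);
//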
1309
1310 /// \returns True if the target prefers the reduction select to be kept in the
1311 /// loop when tail folding, i.e.
1312 /// loop:
1313 /// p = phi (0, s)
1314 /// a = add (p, x)
1315 /// s = select (mask, a, p)
1316 /// vecreduce.add(s)
1317 ///
1318 /// This is in contrast to the normal scheme of p = phi (0, a), which allows
1319 /// the select to be pulled out of the loop. If the select(.., add, ..) can be
1320 /// predicated by the target, this can lead to cleaner code generation.
1321 bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
1322 ReductionFlags Flags) const;
1323
1324 /// \returns True if the target wants to expand the given reduction intrinsic
1325 /// into a shuffle sequence.
1326 bool shouldExpandReduction(const IntrinsicInst *II) const;
1327
1328 /// \returns the size cost of rematerializing a GlobalValue address relative
1329 /// to a stack reload.
1330 unsigned getGISelRematGlobalCost() const;
1331
1332 /// \name Vector Predication Information
1333 /// @{
1334 /// Whether the target supports the %evl parameter of VP intrinsics efficiently
1335 /// in hardware (see LLVM Language Reference - "Vector Predication
1336 /// Intrinsics"). Use of %evl is discouraged when that is not the case.
1337 bool hasActiveVectorLength() const;
1338
1339 /// @}
1340
1341 /// @}
1342
1343 private:
1344 /// Estimate the latency of the specified instruction.
1345 /// Returns 1 as the default value.
1346 int getInstructionLatency(const Instruction *I) const;
1347
1348 /// Returns the expected throughput cost of the instruction.
1349 /// Returns -1 if the cost is unknown.
1350 int getInstructionThroughput(const Instruction *I) const;
1351
1352 /// The abstract base class used to type erase specific TTI
1353 /// implementations.
1354 class Concept;
1355
1356 /// The template model for the base class which wraps a concrete
1357 /// implementation in a type erased interface.
1358 template <typename T> class Model;
1359
1360 std::unique_ptr<Concept> TTIImpl;
1361 };
1362
1363 class TargetTransformInfo::Concept {
1364 public:
1365 virtual ~Concept() = 0;
1366 virtual const DataLayout &getDataLayout() const = 0;
1367 virtual int getGEPCost(Type *PointeeType, const Value *Ptr,
1368 ArrayRef<const Value *> Operands,
1369 TTI::TargetCostKind CostKind) = 0;
1370 virtual unsigned getInliningThresholdMultiplier() = 0;
1371 virtual int getInlinerVectorBonusPercent() = 0;
1372 virtual int getMemcpyCost(const Instruction *I) = 0;
1373 virtual unsigned
1374 getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
1375 ProfileSummaryInfo *PSI,
1376 BlockFrequencyInfo *BFI) = 0;
1377 virtual int getUserCost(const User *U, ArrayRef<const Value *> Operands,
1378 TargetCostKind CostKind) = 0;
1379 virtual bool hasBranchDivergence() = 0;
1380 virtual bool useGPUDivergenceAnalysis() = 0;
1381 virtual bool isSourceOfDivergence(const Value *V) = 0;
1382 virtual bool isAlwaysUniform(const Value *V) = 0;
1383 virtual unsigned getFlatAddressSpace() = 0;
1384 virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1385 Intrinsic::ID IID) const = 0;
1386 virtual bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const = 0;
1387 virtual unsigned getAssumedAddrSpace(const Value *V) const = 0;
1388 virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1389 Value *OldV,
1390 Value *NewV) const = 0;
1391 virtual bool isLoweredToCall(const Function *F) = 0;
1392 virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
1393 UnrollingPreferences &UP) = 0;
1394 virtual void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1395 PeelingPreferences &PP) = 0;
1396 virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1397 AssumptionCache &AC,
1398 TargetLibraryInfo *LibInfo,
1399 HardwareLoopInfo &HWLoopInfo) = 0;
1400 virtual bool
1401 preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1402 AssumptionCache &AC, TargetLibraryInfo *TLI,
1403 DominatorTree *DT, const LoopAccessInfo *LAI) = 0;
1404 virtual bool emitGetActiveLaneMask() = 0;
1405 virtual Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
1406 IntrinsicInst &II) = 0;
1407 virtual Optional<Value *>
1408 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1409 APInt DemandedMask, KnownBits &Known,
1410 bool &KnownBitsComputed) = 0;
1411 virtual Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
1412 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1413 APInt &UndefElts2, APInt &UndefElts3,
1414 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1415 SimplifyAndSetOp) = 0;
1416 virtual bool isLegalAddImmediate(int64_t Imm) = 0;
1417 virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
1418 virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
1419 int64_t BaseOffset, bool HasBaseReg,
1420 int64_t Scale, unsigned AddrSpace,
1421 Instruction *I) = 0;
1422 virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
1423 TargetTransformInfo::LSRCost &C2) = 0;
1424 virtual bool isNumRegsMajorCostOfLSR() = 0;
1425 virtual bool isProfitableLSRChainElement(Instruction *I) = 0;
1426 virtual bool canMacroFuseCmp() = 0;
1427 virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
1428 LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
1429 TargetLibraryInfo *LibInfo) = 0;
1430 virtual bool shouldFavorPostInc() const = 0;
1431 virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0;
1432 virtual bool isLegalMaskedStore(Type *DataType, Align Alignment) = 0;
1433 virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0;
1434 virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0;
1435 virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
1436 virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
1437 virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
1438 virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
1439 virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
1440 virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
1441 virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
1442 virtual bool prefersVectorizedAddressing() = 0;
1443 virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
1444 int64_t BaseOffset, bool HasBaseReg,
1445 int64_t Scale, unsigned AddrSpace) = 0;
1446 virtual bool LSRWithInstrQueries() = 0;
1447 virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
1448 virtual bool isProfitableToHoist(Instruction *I) = 0;
1449 virtual bool useAA() = 0;
1450 virtual bool isTypeLegal(Type *Ty) = 0;
1451 virtual unsigned getRegUsageForType(Type *Ty) = 0;
1452 virtual bool shouldBuildLookupTables() = 0;
1453 virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
1454 virtual bool useColdCCForColdCall(Function &F) = 0;
1455 virtual unsigned getScalarizationOverhead(VectorType *Ty,
1456 const APInt &DemandedElts,
1457 bool Insert, bool Extract) = 0;
1458 virtual unsigned
1459 getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
1460 unsigned VF) = 0;
1461 virtual bool supportsEfficientVectorElementLoadStore() = 0;
1462 virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
1463 virtual MemCmpExpansionOptions
1464 enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0;
1465 virtual bool enableInterleavedAccessVectorization() = 0;
1466 virtual bool enableMaskedInterleavedAccessVectorization() = 0;
1467 virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
1468 virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
1469 unsigned BitWidth,
1470 unsigned AddressSpace,
1471 unsigned Alignment,
1472 bool *Fast) = 0;
1473 virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
1474 virtual bool haveFastSqrt(Type *Ty) = 0;
1475 virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
1476 virtual int getFPOpCost(Type *Ty) = 0;
1477 virtual int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx,
1478 const APInt &Imm, Type *Ty) = 0;
1479 virtual int getIntImmCost(const APInt &Imm, Type *Ty,
1480 TargetCostKind CostKind) = 0;
1481 virtual int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm,
1482 Type *Ty, TargetCostKind CostKind,
1483 Instruction *Inst = nullptr) = 0;
1484 virtual int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
1485 const APInt &Imm, Type *Ty,
1486 TargetCostKind CostKind) = 0;
1487 virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0;
1488 virtual unsigned getRegisterClassForType(bool Vector,
1489 Type *Ty = nullptr) const = 0;
1490 virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
1491 virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
1492 virtual unsigned getMinVectorRegisterBitWidth() = 0;
1493 virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
1494 virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
1495 virtual bool shouldConsiderAddressTypePromotion(
1496 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
1497 virtual unsigned getCacheLineSize() const = 0;
1498 virtual Optional<unsigned> getCacheSize(CacheLevel Level) const = 0;
1499 virtual Optional<unsigned> getCacheAssociativity(CacheLevel Level) const = 0;
1500
1501 /// \return How far ahead of a load the prefetch instruction should be
1502 /// placed. This is currently measured in number of
1503 /// instructions.
1504 virtual unsigned getPrefetchDistance() const = 0;
1505
1506 /// \return Some HW prefetchers can handle accesses up to a certain
1507 /// constant stride. This is the minimum stride in bytes where it
1508 /// makes sense to start adding SW prefetches. The default is 1,
1509 /// i.e. prefetch with any stride. Sometimes prefetching is beneficial
1510 /// even below the HW prefetcher limit, and the arguments provided are
1511 /// meant to serve as a basis for deciding this for a particular loop.
1512 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
1513 unsigned NumStridedMemAccesses,
1514 unsigned NumPrefetches,
1515 bool HasCall) const = 0;
1516
1517 /// \return The maximum number of iterations to prefetch ahead. If
1518 /// the required number of iterations is more than this number, no
1519 /// prefetching is performed.
1520 virtual unsigned getMaxPrefetchIterationsAhead() const = 0;
1521
1522 /// \return True if prefetching should also be done for writes.
1523 virtual bool enableWritePrefetching() const = 0;
1524
1525 virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
1526 virtual unsigned getArithmeticInstrCost(
1527 unsigned Opcode, Type *Ty,
1528 TTI::TargetCostKind CostKind,
1529 OperandValueKind Opd1Info,
1530 OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
1531 OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1532 const Instruction *CxtI = nullptr) = 0;
1533 virtual int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index,
1534 VectorType *SubTp) = 0;
1535 virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1536 CastContextHint CCH,
1537 TTI::TargetCostKind CostKind,
1538 const Instruction *I) = 0;
1539 virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1540 VectorType *VecTy, unsigned Index) = 0;
1541 virtual int getCFInstrCost(unsigned Opcode,
1542 TTI::TargetCostKind CostKind) = 0;
1543 virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1544 CmpInst::Predicate VecPred,
1545 TTI::TargetCostKind CostKind,
1546 const Instruction *I) = 0;
1547 virtual int getVectorInstrCost(unsigned Opcode, Type *Val,
1548 unsigned Index) = 0;
1549 virtual int getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1550 unsigned AddressSpace,
1551 TTI::TargetCostKind CostKind,
1552 const Instruction *I) = 0;
1553 virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1554 unsigned AddressSpace,
1555 TTI::TargetCostKind CostKind) = 0;
1556 virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1557 const Value *Ptr, bool VariableMask,
1558 Align Alignment,
1559 TTI::TargetCostKind CostKind,
1560 const Instruction *I = nullptr) = 0;
1561
1562 virtual int getInterleavedMemoryOpCost(
1563 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1564 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1565 bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0;
1566 virtual int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1567 bool IsPairwiseForm,
1568 TTI::TargetCostKind CostKind) = 0;
1569 virtual int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
1570 bool IsPairwiseForm, bool IsUnsigned,
1571 TTI::TargetCostKind CostKind) = 0;
1572 virtual int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1573 TTI::TargetCostKind CostKind) = 0;
1574 virtual int getCallInstrCost(Function *F, Type *RetTy,
1575 ArrayRef<Type *> Tys,
1576 TTI::TargetCostKind CostKind) = 0;
1577 virtual unsigned getNumberOfParts(Type *Tp) = 0;
1578 virtual int getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1579 const SCEV *Ptr) = 0;
1580 virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
1581 virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
1582 MemIntrinsicInfo &Info) = 0;
1583 virtual unsigned getAtomicMemIntrinsicMaxElementSize() const = 0;
1584 virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1585 Type *ExpectedType) = 0;
1586 virtual Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
1587 unsigned SrcAddrSpace,
1588 unsigned DestAddrSpace,
1589 unsigned SrcAlign,
1590 unsigned DestAlign) const = 0;
1591 virtual void getMemcpyLoopResidualLoweringType(
1592 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
1593 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
1594 unsigned SrcAlign, unsigned DestAlign) const = 0;
1595 virtual bool areInlineCompatible(const Function *Caller,
1596 const Function *Callee) const = 0;
1597 virtual bool
1598 areFunctionArgsABICompatible(const Function *Caller, const Function *Callee,
1599 SmallPtrSetImpl<Argument *> &Args) const = 0;
1600 virtual bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const = 0;
1601 virtual bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const = 0;
1602 virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0;
1603 virtual bool isLegalToVectorizeLoad(LoadInst *LI) const = 0;
1604 virtual bool isLegalToVectorizeStore(StoreInst *SI) const = 0;
1605 virtual bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1606 Align Alignment,
1607 unsigned AddrSpace) const = 0;
1608 virtual bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1609 Align Alignment,
1610 unsigned AddrSpace) const = 0;
1611 virtual unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1612 unsigned ChainSizeInBytes,
1613 VectorType *VecTy) const = 0;
1614 virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1615 unsigned ChainSizeInBytes,
1616 VectorType *VecTy) const = 0;
1617 virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
1618 ReductionFlags) const = 0;
1619 virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty,
1620 ReductionFlags) const = 0;
1621 virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
1622 ReductionFlags) const = 0;
1623 virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
1624 virtual unsigned getGISelRematGlobalCost() const = 0;
1625 virtual bool hasActiveVectorLength() const = 0;
1626 virtual int getInstructionLatency(const Instruction *I) = 0;
1627 };
1628
1629 template <typename T>
1630 class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
1631 T Impl;
1632
1633 public:
1634 Model(T Impl) : Impl(std::move(Impl)) {}
1635 ~Model() override {}
1636
1637 const DataLayout &getDataLayout() const override {
1638 return Impl.getDataLayout();
1639 }
1640
1641 int getGEPCost(Type *PointeeType, const Value *Ptr,
1642 ArrayRef<const Value *> Operands,
1643 enum TargetTransformInfo::TargetCostKind CostKind) override {
1644 return Impl.getGEPCost(PointeeType, Ptr, Operands);
1645 }
1646 unsigned getInliningThresholdMultiplier() override {
1647 return Impl.getInliningThresholdMultiplier();
1648 }
1649 int getInlinerVectorBonusPercent() override {
1650 return Impl.getInlinerVectorBonusPercent();
1651 }
1652 int getMemcpyCost(const Instruction *I) override {
1653 return Impl.getMemcpyCost(I);
1654 }
1655 int getUserCost(const User *U, ArrayRef<const Value *> Operands,
1656 TargetCostKind CostKind) override {
1657 return Impl.getUserCost(U, Operands, CostKind);
1658 }
1659 bool hasBranchDivergence() override { return Impl.hasBranchDivergence(); }
1660 bool useGPUDivergenceAnalysis() override {
1661 return Impl.useGPUDivergenceAnalysis();
1662 }
1663 bool isSourceOfDivergence(const Value *V) override {
1664 return Impl.isSourceOfDivergence(V);
1665 }
1666
1667 bool isAlwaysUniform(const Value *V) override {
1668 return Impl.isAlwaysUniform(V);
1669 }
1670
1671 unsigned getFlatAddressSpace() override { return Impl.getFlatAddressSpace(); }
1672
1673 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1674 Intrinsic::ID IID) const override {
1675 return Impl.collectFlatAddressOperands(OpIndexes, IID);
1676 }
1677
1678 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
1679 return Impl.isNoopAddrSpaceCast(FromAS, ToAS);
1680 }
1681
1682 unsigned getAssumedAddrSpace(const Value *V) const override {
1683 return Impl.getAssumedAddrSpace(V);
1684 }
1685
1686 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
1687 Value *NewV) const override {
1688 return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
1689 }
1690
1691 bool isLoweredToCall(const Function *F) override {
1692 return Impl.isLoweredToCall(F);
1693 }
1694 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1695 UnrollingPreferences &UP) override {
1696 return Impl.getUnrollingPreferences(L, SE, UP);
1697 }
1698 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1699 PeelingPreferences &PP) override {
1700 return Impl.getPeelingPreferences(L, SE, PP);
1701 }
1702 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1703 AssumptionCache &AC, TargetLibraryInfo *LibInfo,
1704 HardwareLoopInfo &HWLoopInfo) override {
1705 return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
1706 }
1707 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1708 AssumptionCache &AC, TargetLibraryInfo *TLI,
1709 DominatorTree *DT,
1710 const LoopAccessInfo *LAI) override {
1711 return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
1712 }
1713 bool emitGetActiveLaneMask() override {
1714 return Impl.emitGetActiveLaneMask();
1715 }
1716 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
1717 IntrinsicInst &II) override {
1718 return Impl.instCombineIntrinsic(IC, II);
1719 }
1720 Optional<Value *>
1721 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1722 APInt DemandedMask, KnownBits &Known,
1723 bool &KnownBitsComputed) override {
1724 return Impl.simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
1725 KnownBitsComputed);
1726 }
1727 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
1728 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1729 APInt &UndefElts2, APInt &UndefElts3,
1730 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1731 SimplifyAndSetOp) override {
1732 return Impl.simplifyDemandedVectorEltsIntrinsic(
1733 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
1734 SimplifyAndSetOp);
1735 }
1736 bool isLegalAddImmediate(int64_t Imm) override {
1737 return Impl.isLegalAddImmediate(Imm);
1738 }
1739 bool isLegalICmpImmediate(int64_t Imm) override {
1740 return Impl.isLegalICmpImmediate(Imm);
1741 }
1742 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
1743 bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
1744 Instruction *I) override {
1745 return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1746 AddrSpace, I);
1747 }
1748 bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
1749 TargetTransformInfo::LSRCost &C2) override {
1750 return Impl.isLSRCostLess(C1, C2);
1751 }
1752 bool isNumRegsMajorCostOfLSR() override {
1753 return Impl.isNumRegsMajorCostOfLSR();
1754 }
1755 bool isProfitableLSRChainElement(Instruction *I) override {
1756 return Impl.isProfitableLSRChainElement(I);
1757 }
1758 bool canMacroFuseCmp() override { return Impl.canMacroFuseCmp(); }
1759 bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
1760 DominatorTree *DT, AssumptionCache *AC,
1761 TargetLibraryInfo *LibInfo) override {
1762 return Impl.canSaveCmp(L, BI, SE, LI, DT, AC, LibInfo);
1763 }
1764 bool shouldFavorPostInc() const override { return Impl.shouldFavorPostInc(); }
1765 bool shouldFavorBackedgeIndex(const Loop *L) const override {
1766 return Impl.shouldFavorBackedgeIndex(L);
1767 }
1768 bool isLegalMaskedStore(Type *DataType, Align Alignment) override {
1769 return Impl.isLegalMaskedStore(DataType, Alignment);
1770 }
1771 bool isLegalMaskedLoad(Type *DataType, Align Alignment) override {
1772 return Impl.isLegalMaskedLoad(DataType, Alignment);
1773 }
1774 bool isLegalNTStore(Type *DataType, Align Alignment) override {
1775 return Impl.isLegalNTStore(DataType, Alignment);
1776 }
1777 bool isLegalNTLoad(Type *DataType, Align Alignment) override {
1778 return Impl.isLegalNTLoad(DataType, Alignment);
1779 }
1780 bool isLegalMaskedScatter(Type *DataType, Align Alignment) override {
1781 return Impl.isLegalMaskedScatter(DataType, Alignment);
1782 }
1783 bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
1784 return Impl.isLegalMaskedGather(DataType, Alignment);
1785 }
1786 bool isLegalMaskedCompressStore(Type *DataType) override {
1787 return Impl.isLegalMaskedCompressStore(DataType);
1788 }
1789 bool isLegalMaskedExpandLoad(Type *DataType) override {
1790 return Impl.isLegalMaskedExpandLoad(DataType);
1791 }
1792 bool hasDivRemOp(Type *DataType, bool IsSigned) override {
1793 return Impl.hasDivRemOp(DataType, IsSigned);
1794 }
1795 bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) override {
1796 return Impl.hasVolatileVariant(I, AddrSpace);
1797 }
1798 bool prefersVectorizedAddressing() override {
1799 return Impl.prefersVectorizedAddressing();
1800 }
1801 int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
1802 bool HasBaseReg, int64_t Scale,
1803 unsigned AddrSpace) override {
1804 return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1805 AddrSpace);
1806 }
1807 bool LSRWithInstrQueries() override { return Impl.LSRWithInstrQueries(); }
1808 bool isTruncateFree(Type *Ty1, Type *Ty2) override {
1809 return Impl.isTruncateFree(Ty1, Ty2);
1810 }
1811 bool isProfitableToHoist(Instruction *I) override {
1812 return Impl.isProfitableToHoist(I);
1813 }
1814 bool useAA() override { return Impl.useAA(); }
1815 bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
1816 unsigned getRegUsageForType(Type *Ty) override {
1817 return Impl.getRegUsageForType(Ty);
1818 }
1819 bool shouldBuildLookupTables() override {
1820 return Impl.shouldBuildLookupTables();
1821 }
1822 bool shouldBuildLookupTablesForConstant(Constant *C) override {
1823 return Impl.shouldBuildLookupTablesForConstant(C);
1824 }
1825 bool useColdCCForColdCall(Function &F) override {
1826 return Impl.useColdCCForColdCall(F);
1827 }
1828
1829 unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
1830 bool Insert, bool Extract) override {
1831 return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
1832 }
1833 unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
1834 unsigned VF) override {
1835 return Impl.getOperandsScalarizationOverhead(Args, VF);
1836 }
1837
1838 bool supportsEfficientVectorElementLoadStore() override {
1839 return Impl.supportsEfficientVectorElementLoadStore();
1840 }
1841
1842 bool enableAggressiveInterleaving(bool LoopHasReductions) override {
1843 return Impl.enableAggressiveInterleaving(LoopHasReductions);
1844 }
1845 MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
1846 bool IsZeroCmp) const override {
1847 return Impl.enableMemCmpExpansion(OptSize, IsZeroCmp);
1848 }
1849 bool enableInterleavedAccessVectorization() override {
1850 return Impl.enableInterleavedAccessVectorization();
1851 }
1852 bool enableMaskedInterleavedAccessVectorization() override {
1853 return Impl.enableMaskedInterleavedAccessVectorization();
1854 }
1855 bool isFPVectorizationPotentiallyUnsafe() override {
1856 return Impl.isFPVectorizationPotentiallyUnsafe();
1857 }
1858 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
1859 unsigned AddressSpace, unsigned Alignment,
1860 bool *Fast) override {
1861 return Impl.allowsMisalignedMemoryAccesses(Context, BitWidth, AddressSpace,
1862 Alignment, Fast);
1863 }
1864 PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
1865 return Impl.getPopcntSupport(IntTyWidthInBit);
1866 }
1867 bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
1868
1869 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) override {
1870 return Impl.isFCmpOrdCheaperThanFCmpZero(Ty);
1871 }
1872
1873 int getFPOpCost(Type *Ty) override { return Impl.getFPOpCost(Ty); }
1874
1875 int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
1876 Type *Ty) override {
1877 return Impl.getIntImmCodeSizeCost(Opc, Idx, Imm, Ty);
1878 }
1879 int getIntImmCost(const APInt &Imm, Type *Ty,
1880 TargetCostKind CostKind) override {
1881 return Impl.getIntImmCost(Imm, Ty, CostKind);
1882 }
1883 int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm, Type *Ty,
1884 TargetCostKind CostKind,
1885 Instruction *Inst = nullptr) override {
1886 return Impl.getIntImmCostInst(Opc, Idx, Imm, Ty, CostKind, Inst);
1887 }
1888 int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
1889 Type *Ty, TargetCostKind CostKind) override {
1890 return Impl.getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);
1891 }
1892 unsigned getNumberOfRegisters(unsigned ClassID) const override {
1893 return Impl.getNumberOfRegisters(ClassID);
1894 }
1895 unsigned getRegisterClassForType(bool Vector,
1896 Type *Ty = nullptr) const override {
1897 return Impl.getRegisterClassForType(Vector, Ty);
1898 }
1899 const char *getRegisterClassName(unsigned ClassID) const override {
1900 return Impl.getRegisterClassName(ClassID);
1901 }
1902 unsigned getRegisterBitWidth(bool Vector) const override {
1903 return Impl.getRegisterBitWidth(Vector);
1904 }
1905 unsigned getMinVectorRegisterBitWidth() override {
1906 return Impl.getMinVectorRegisterBitWidth();
1907 }
1908 bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
1909 return Impl.shouldMaximizeVectorBandwidth(OptSize);
1910 }
1911 unsigned getMinimumVF(unsigned ElemWidth) const override {
1912 return Impl.getMinimumVF(ElemWidth);
1913 }
1914 bool shouldConsiderAddressTypePromotion(
1915 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
1916 return Impl.shouldConsiderAddressTypePromotion(
1917 I, AllowPromotionWithoutCommonHeader);
1918 }
1919 unsigned getCacheLineSize() const override { return Impl.getCacheLineSize(); }
1920 Optional<unsigned> getCacheSize(CacheLevel Level) const override {
1921 return Impl.getCacheSize(Level);
1922 }
1923 Optional<unsigned> getCacheAssociativity(CacheLevel Level) const override {
1924 return Impl.getCacheAssociativity(Level);
1925 }
1926
1927 /// Return the preferred prefetch distance in terms of instructions.
1928 ///
1929 unsigned getPrefetchDistance() const override {
1930 return Impl.getPrefetchDistance();
1931 }
1932
1933 /// Return the minimum stride necessary to trigger software
1934 /// prefetching.
1935 ///
1936 unsigned getMinPrefetchStride(unsigned NumMemAccesses,
1937 unsigned NumStridedMemAccesses,
1938 unsigned NumPrefetches,
1939 bool HasCall) const override {
1940 return Impl.getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
1941 NumPrefetches, HasCall);
1942 }
1943
1944 /// Return the maximum prefetch distance in terms of loop
1945 /// iterations.
1946 ///
1947 unsigned getMaxPrefetchIterationsAhead() const override {
1948 return Impl.getMaxPrefetchIterationsAhead();
1949 }
1950
1951 /// \return True if prefetching should also be done for writes.
1952 bool enableWritePrefetching() const override {
1953 return Impl.enableWritePrefetching();
1954 }
1955
1956 unsigned getMaxInterleaveFactor(unsigned VF) override {
1957 return Impl.getMaxInterleaveFactor(VF);
1958 }
1959 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
1960 unsigned &JTSize,
1961 ProfileSummaryInfo *PSI,
1962 BlockFrequencyInfo *BFI) override {
1963 return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI);
1964 }
1965 unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
1966 TTI::TargetCostKind CostKind,
1967 OperandValueKind Opd1Info,
1968 OperandValueKind Opd2Info,
1969 OperandValueProperties Opd1PropInfo,
1970 OperandValueProperties Opd2PropInfo,
1971 ArrayRef<const Value *> Args,
1972 const Instruction *CxtI = nullptr) override {
1973 return Impl.getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
1974 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
1975 }
1976 int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index,
1977 VectorType *SubTp) override {
1978 return Impl.getShuffleCost(Kind, Tp, Index, SubTp);
1979 }
1980 int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1981 CastContextHint CCH, TTI::TargetCostKind CostKind,
1982 const Instruction *I) override {
1983 return Impl.getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1984 }
1985 int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
1986 unsigned Index) override {
1987 return Impl.getExtractWithExtendCost(Opcode, Dst, VecTy, Index);
1988 }
1989 int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) override {
1990 return Impl.getCFInstrCost(Opcode, CostKind);
1991 }
1992 int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1993 CmpInst::Predicate VecPred,
1994 TTI::TargetCostKind CostKind,
1995 const Instruction *I) override {
1996 return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1997 }
1998 int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) override {
1999 return Impl.getVectorInstrCost(Opcode, Val, Index);
2000 }
2001 int getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
2002 unsigned AddressSpace, TTI::TargetCostKind CostKind,
2003 const Instruction *I) override {
2004 return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2005 CostKind, I);
2006 }
2007 int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
2008 unsigned AddressSpace,
2009 TTI::TargetCostKind CostKind) override {
2010 return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2011 CostKind);
2012 }
2013 int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
2014 bool VariableMask, Align Alignment,
2015 TTI::TargetCostKind CostKind,
2016 const Instruction *I = nullptr) override {
2017 return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
2018 Alignment, CostKind, I);
2019 }
2020 int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
2021 ArrayRef<unsigned> Indices, Align Alignment,
2022 unsigned AddressSpace,
2023 TTI::TargetCostKind CostKind,
2024 bool UseMaskForCond,
2025 bool UseMaskForGaps) override {
2026 return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2027 Alignment, AddressSpace, CostKind,
2028 UseMaskForCond, UseMaskForGaps);
2029 }
2030 int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2031 bool IsPairwiseForm,
2032 TTI::TargetCostKind CostKind) override {
2033 return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm,
2034 CostKind);
2035 }
2036 int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2037 bool IsPairwiseForm, bool IsUnsigned,
2038 TTI::TargetCostKind CostKind) override {
2039 return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned,
2040 CostKind);
2041 }
2042 int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2043 TTI::TargetCostKind CostKind) override {
2044 return Impl.getIntrinsicInstrCost(ICA, CostKind);
2045 }
2046 int getCallInstrCost(Function *F, Type *RetTy,
2047 ArrayRef<Type *> Tys,
2048 TTI::TargetCostKind CostKind) override {
2049 return Impl.getCallInstrCost(F, RetTy, Tys, CostKind);
2050 }
2051 unsigned getNumberOfParts(Type *Tp) override {
2052 return Impl.getNumberOfParts(Tp);
2053 }
2054 int getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
2055 const SCEV *Ptr) override {
2056 return Impl.getAddressComputationCost(Ty, SE, Ptr);
2057 }
2058 unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override {
2059 return Impl.getCostOfKeepingLiveOverCall(Tys);
2060 }
2061 bool getTgtMemIntrinsic(IntrinsicInst *Inst,
2062 MemIntrinsicInfo &Info) override {
2063 return Impl.getTgtMemIntrinsic(Inst, Info);
2064 }
2065 unsigned getAtomicMemIntrinsicMaxElementSize() const override {
2066 return Impl.getAtomicMemIntrinsicMaxElementSize();
2067 }
2068 Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
2069 Type *ExpectedType) override {
2070 return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
2071 }
2072 Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
2073 unsigned SrcAddrSpace, unsigned DestAddrSpace,
2074 unsigned SrcAlign,
2075 unsigned DestAlign) const override {
2076 return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace,
2077 DestAddrSpace, SrcAlign, DestAlign);
2078 }
2079 void getMemcpyLoopResidualLoweringType(
2080 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
2081 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
2082 unsigned SrcAlign, unsigned DestAlign) const override {
2083 Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
2084 SrcAddrSpace, DestAddrSpace,
2085 SrcAlign, DestAlign);
2086 }
2087 bool areInlineCompatible(const Function *Caller,
2088 const Function *Callee) const override {
2089 return Impl.areInlineCompatible(Caller, Callee);
2090 }
2091 bool areFunctionArgsABICompatible(
2092 const Function *Caller, const Function *Callee,
2093 SmallPtrSetImpl<Argument *> &Args) const override {
2094 return Impl.areFunctionArgsABICompatible(Caller, Callee, Args);
2095 }
2096 bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const override {
2097 return Impl.isIndexedLoadLegal(Mode, Ty, getDataLayout());
2098 }
2099 bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const override {
2100 return Impl.isIndexedStoreLegal(Mode, Ty, getDataLayout());
2101 }
2102 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override {
2103 return Impl.getLoadStoreVecRegBitWidth(AddrSpace);
2104 }
2105 bool isLegalToVectorizeLoad(LoadInst *LI) const override {
2106 return Impl.isLegalToVectorizeLoad(LI);
2107 }
2108 bool isLegalToVectorizeStore(StoreInst *SI) const override {
2109 return Impl.isLegalToVectorizeStore(SI);
2110 }
2111 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
2112 unsigned AddrSpace) const override {
2113 return Impl.isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment,
2114 AddrSpace);
2115 }
2116 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
2117 unsigned AddrSpace) const override {
2118 return Impl.isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,
2119 AddrSpace);
2120 }
2121 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
2122 unsigned ChainSizeInBytes,
2123 VectorType *VecTy) const override {
2124 return Impl.getLoadVectorFactor(VF, LoadSize, ChainSizeInBytes, VecTy);
2125 }
2126 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
2127 unsigned ChainSizeInBytes,
2128 VectorType *VecTy) const override {
2129 return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
2130 }
2131 bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
2132 ReductionFlags Flags) const override {
2133 return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
2134 }
2135 bool preferInLoopReduction(unsigned Opcode, Type *Ty,
2136 ReductionFlags Flags) const override {
2137 return Impl.preferInLoopReduction(Opcode, Ty, Flags);
2138 }
2139 bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
2140 ReductionFlags Flags) const override {
2141 return Impl.preferPredicatedReductionSelect(Opcode, Ty, Flags);
2142 }
2143 bool shouldExpandReduction(const IntrinsicInst *II) const override {
2144 return Impl.shouldExpandReduction(II);
2145 }
2146
2147 unsigned getGISelRematGlobalCost() const override {
2148 return Impl.getGISelRematGlobalCost();
2149 }
2150
2151 bool hasActiveVectorLength() const override {
2152 return Impl.hasActiveVectorLength();
2153 }
2154
2155 int getInstructionLatency(const Instruction *I) override {
2156 return Impl.getInstructionLatency(I);
2157 }
2158 };
2159
2160 template <typename T>
2161 TargetTransformInfo::TargetTransformInfo(T Impl)
2162 : TTIImpl(new Model<T>(Impl)) {}
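//
// The templated constructor above is how a concrete, target-specific TTI
// implementation gets type-erased behind the Concept/Model pair. A purely
// hypothetical implementation class could be wrapped like this:
//
//   MyTargetTTIImpl Impl(/*target-specific arguments*/); // hypothetical type
//   TargetTransformInfo TTI(std::move(Impl));
//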
2163
2164 /// Analysis pass providing the \c TargetTransformInfo.
2165 ///
2166 /// The core idea of the TargetIRAnalysis is to expose an interface through
2167 /// which LLVM targets can analyze and provide information about the middle
2168 /// end's target-independent IR. This supports use cases such as target-aware
2169 /// cost modeling of IR constructs.
2170 ///
2171 /// This is a function analysis because much of the cost modeling for targets
2172 /// is done in a subtarget specific way and LLVM supports compiling different
2173 /// functions targeting different subtargets in order to support runtime
2174 /// dispatch according to the observed subtarget.
2175 class TargetIRAnalysis : public AnalysisInfoMixin<TargetIRAnalysis> {
2176 public:
2177 typedef TargetTransformInfo Result;
2178
2179 /// Default construct a target IR analysis.
2180 ///
2181 /// This will use the module's datalayout to construct a baseline
2182 /// conservative TTI result.
2183 TargetIRAnalysis();
2184
2185 /// Construct an IR analysis pass around a target-provided callback.
2186 ///
2187 /// The callback will be called with a particular function for which the TTI
2188 /// is needed and must return a TTI object for that function.
2189 TargetIRAnalysis(std::function<Result(const Function &)> TTICallback);
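//
// Sketch (not prescriptive): a driver that owns a TargetMachine can register
// a target-aware TargetIRAnalysis with the new pass manager roughly like so,
// where TM is assumed to be a valid TargetMachine*.
//
//   FunctionAnalysisManager FAM;
//   FAM.registerPass([&] {
//     return TargetIRAnalysis(
//         [TM](const Function &F) { return TM->getTargetTransformInfo(F); });
//   });
//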
2190
2191 // Value semantics. We spell out the constructors for MSVC.
2192 TargetIRAnalysis(const TargetIRAnalysis &Arg)
2193 : TTICallback(Arg.TTICallback) {}
2194 TargetIRAnalysis(TargetIRAnalysis &&Arg)
2195 : TTICallback(std::move(Arg.TTICallback)) {}
2196 TargetIRAnalysis &operator=(const TargetIRAnalysis &RHS) {
2197 TTICallback = RHS.TTICallback;
2198 return *this;
2199 }
2200 TargetIRAnalysis &operator=(TargetIRAnalysis &&RHS) {
2201 TTICallback = std::move(RHS.TTICallback);
2202 return *this;
2203 }
2204
2205 Result run(const Function &F, FunctionAnalysisManager &);
2206
2207 private:
2208 friend AnalysisInfoMixin<TargetIRAnalysis>;
2209 static AnalysisKey Key;
2210
2211 /// The callback used to produce a result.
2212 ///
2213 /// We use a completely opaque callback so that targets can provide whatever
2214 /// mechanism they desire for constructing the TTI for a given function.
2215 ///
2216 /// FIXME: Should we really use std::function? It's relatively inefficient.
2217 /// It might be possible to arrange for even stateful callbacks to outlive
2218 /// the analysis and thus use a function_ref which would be lighter weight.
2219 /// This may also be less error prone as the callback is likely to reference
2220 /// the external TargetMachine, and that reference needs to never dangle.
2221 std::function<Result(const Function &)> TTICallback;
2222
2223 /// Helper function used as the callback in the default constructor.
2224 static Result getDefaultTTI(const Function &F);
2225 };
2226
2227 /// Wrapper pass for TargetTransformInfo.
2228 ///
2229 /// This pass can be constructed from a TTI object which it stores internally
2230 /// and is queried by passes.
2231 class TargetTransformInfoWrapperPass : public ImmutablePass {
2232 TargetIRAnalysis TIRA;
2233 Optional<TargetTransformInfo> TTI;
2234
2235 virtual void anchor();
2236
2237 public:
2238 static char ID;
2239
2240 /// We must provide a default constructor for the pass but it should
2241 /// never be used.
2242 ///
2243 /// Use the constructor below or call one of the creation routines.
2244 TargetTransformInfoWrapperPass();
2245
2246 explicit TargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
2247
2248 TargetTransformInfo &getTTI(const Function &F);
2249 };
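//
// Legacy pass manager sketch (illustrative; MyPass is hypothetical): declare
// the dependency and fetch the per-function TTI inside runOnFunction.
//
//   void MyPass::getAnalysisUsage(AnalysisUsage &AU) const {
//     AU.addRequired<TargetTransformInfoWrapperPass>();
//   }
//   bool MyPass::runOnFunction(Function &F) {
//     TargetTransformInfo &TTI =
//         getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
//     (void)TTI; // ...query the TTI hooks here...
//     return false;
//   }
//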
2250
2251 /// Create an analysis pass wrapper around a TTI object.
2252 ///
2253 /// This analysis pass just holds the TTI instance and makes it available to
2254 /// clients.
2255 ImmutablePass *createTargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
2256
2257 } // namespace llvm
2258
2259 #endif
2260