//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
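  // popcntd/popcntw (introduced in Power ISA v2.06, e.g. POWER7 and the A2)
  // count bits directly; without them, popcount is expanded in software.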
  if (ST->hasPOPCNTD() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
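      // For example, 0x12340000 has zero low 16 bits and loads with a single
      // lis; other 32-bit constants need a lis/ori pair, hence 2 * TCC_Basic
      // below.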
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
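    // A "run" is a contiguous mask of one bits (e.g. 0x00FFFF00) that the
    // rlwinm-family rotate-and-mask instructions can apply directly.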
    // Fallthrough...
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    // Fallthrough...
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Fallthrough... (zero comparisons can use record-form instructions)
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

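    // Unsigned 16-bit immediates are handled directly by cmplwi/cmpldi.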
    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

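    // Immediates with zero low 16 bits fit the shifted-immediate forms
    // (addis/oris/xoris/andis.), so they are free as well.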
    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, UP);
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always interleave aggressively. For QPX unaligned loads, we
  // depend on combining the loads generated for consecutive accesses, and
  // failure to do so is particularly expensive. Aggressive interleaving makes
  // that combining much more likely (compared to only using concatenation
  // unrolling).
  if (ST->getDarwinDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
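  // VSX doubles the register count: its 64 registers overlay the 32 FPRs
  // and the 32 Altivec VRs.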
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
  if (Vector) {
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  if (Directive == PPC::DIR_PWR7 ||
      Directive == PPC::DIR_PWR8)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

int PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fall back to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first;
}

int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;
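  // (Inserts are charged more because the scalar has to be pushed through
  // memory into the vector register; the exact value is an estimate.)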

  // Vector element inserts/extracts with Altivec are very expensive, because
  // they require a store and reload, with the attendant processor stall for
  // load-hit-store. Until VSX is available, these need to be estimated as
  // very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
    return Cost;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  bool IsQPXType = ST->hasQPX() &&
                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we should do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
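  // (For Altivec types, the permutation-based sequence here is the classic
  // lvsl + lvx + vperm idiom.)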
  if (Opcode == Instruction::Load &&
      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
      Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  Cost += LT.first * (SrcBytes / Alignment - 1);
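  // For example, a 16-byte vector with 4-byte alignment is split into four
  // 4-byte pieces, adding three operations per legalized register.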

  // For a vector type, there is also scalarization overhead (only for
  // stores; loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}

int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);

  // First, add the cost of the load/store operation.
  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor * (LT.first - 1);
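  // For example, with Factor == 2 and a type that legalizes to two registers
  // (LT.first == 2), this adds two shuffles on top of the memory-op cost.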

  return Cost;
}