//===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//

#include "AMDILDevices.h"
#include "AMDGPUInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Constants.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"

#include <sstream>

#if 0
STATISTIC(PointerAssignments, "Number of dynamic pointer "
                              "assignments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
#endif

using namespace llvm;
// The Peephole optimization pass is used to do simple last-minute optimizations
// that are required for correct code or to remove redundant functions.
namespace {

class OpaqueType;

class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
public:
  TargetMachine &TM;
  static char ID;
  AMDGPUPeepholeOpt(TargetMachine &tm);
  ~AMDGPUPeepholeOpt();
  const char *getPassName() const;
  bool runOnFunction(Function &F);
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);
  void getAnalysisUsage(AnalysisUsage &AU) const;
protected:
private:
  // Function to initiate all of the instruction level optimizations.
  bool instLevelOptimizations(BasicBlock::iterator *inst);
  // Quick check to see if we need to dump all of the pointers into the
  // arena. If so, then we set all pointers to exist in the arena. This
  // is a workaround for aliasing of pointers in a struct/union.
  bool dumpAllIntoArena(Function &F);
  // Because we don't want to invalidate any pointers while in the
  // safeNestedForEach function, we push atomic conversions to a vector and
  // handle them later. This function does the conversions if required.
  void doAtomicConversionIfNeeded(Function &F);
  // Because __amdil_is_constant cannot be properly evaluated if
  // optimizations are disabled, the calls are placed in a vector
  // and evaluated after the __amdil_image* functions are evaluated,
  // which should allow the __amdil_is_constant function to be
  // evaluated correctly.
  void doIsConstCallConversionIfNeeded();
  bool mChanged;
  bool mDebug;
  bool mConvertAtomics;
  CodeGenOpt::Level optLevel;
  // Run a series of tests to see if we can optimize a CALL instruction.
  bool optimizeCallInst(BasicBlock::iterator *bbb);
  // A peephole optimization to optimize bit extract sequences.
  bool optimizeBitExtract(Instruction *inst);
  // A peephole optimization to optimize bit insert sequences.
  bool optimizeBitInsert(Instruction *inst);
  bool setupBitInsert(Instruction *base,
                      Instruction *&src,
                      Constant *&mask,
                      Constant *&shift);
  // Expand the bit field insert instruction on versions of OpenCL that
  // don't support it.
  bool expandBFI(CallInst *CI);
  // Expand the bit field mask instruction on versions of OpenCL that
  // don't support it.
  bool expandBFM(CallInst *CI);
  // On 7XX and 8XX hardware we do not have 24 bit signed operations, so in
  // this case we need to expand them. These functions check for 24 bit
  // functions and then expand them.
  bool isSigned24BitOps(CallInst *CI);
  void expandSigned24BitOps(CallInst *CI);
  // One optimization that can occur is that if the required workgroup size is
  // specified, then the result of get_local_size is known at compile time and
  // can be returned accordingly.
  bool isRWGLocalOpt(CallInst *CI);
  // On Northern Islands cards the division is slightly less accurate than on
  // previous generations, so we need to utilize a more accurate division. On
  // all other cards we can translate the accurate divide to a normal divide.
  bool convertAccurateDivide(CallInst *CI);
  void expandAccurateDivide(CallInst *CI);
  // If the alignment is set incorrectly, it can produce really inefficient
  // code. This checks for this scenario and fixes it if possible.
  bool correctMisalignedMemOp(Instruction *inst);

  // If we are in no-opt mode, then we need to make sure that
  // local samplers are properly propagated, as constant propagation
  // doesn't occur and we need to know the value of kernel-defined
  // samplers at compile time.
  bool propagateSamplerInst(CallInst *CI);

  // Helper functions

  // Group of functions that recursively calculate the size of a structure
  // based on its sub-types.
  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
  size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);

  LLVMContext *mCTX;
  Function *mF;
  const AMDGPUSubtarget *mSTM;
  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
  SmallVector<CallInst *, 16> isConstVec;
}; // class AMDGPUPeepholeOpt
char AMDGPUPeepholeOpt::ID = 0;

// A template function that has two levels of looping before calling the
// function with a pointer to the current iterator.
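// The functor F is expected to return true when it has already advanced (or
// invalidated) the inner iterator itself, for example by erasing the current
// instruction; the iterator is only advanced here when F returns false.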
template<class InputIterator, class SecondIterator, class Function>
Function safeNestedForEach(InputIterator First, InputIterator Last,
                           SecondIterator S, Function F)
{
  for ( ; First != Last; ++First) {
    SecondIterator sf, sl;
    for (sf = First->begin(), sl = First->end();
         sf != sl; ) {
      if (!F(&sf)) {
        ++sf;
      }
    }
  }
  return F;
}

} // anonymous namespace

namespace llvm {
FunctionPass *
createAMDGPUPeepholeOpt(TargetMachine &tm)
{
  return new AMDGPUPeepholeOpt(tm);
}
} // llvm namespace

AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
  : FunctionPass(ID), TM(tm)
{
  mDebug = false;
  optLevel = TM.getOptLevel();

}

AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()
{
}

const char *
AMDGPUPeepholeOpt::getPassName() const
{
  return "AMDGPU PeepHole Optimization Pass";
}

bool
containsPointerType(Type *Ty)
{
  if (!Ty) {
    return false;
  }
  switch(Ty->getTypeID()) {
  default:
    return false;
  case Type::StructTyID: {
    const StructType *ST = dyn_cast<StructType>(Ty);
    for (StructType::element_iterator stb = ST->element_begin(),
           ste = ST->element_end(); stb != ste; ++stb) {
      if (!containsPointerType(*stb)) {
        continue;
      }
      return true;
    }
    break;
  }
  case Type::VectorTyID:
  case Type::ArrayTyID:
    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
  case Type::PointerTyID:
    return true;
  };
  return false;
}

bool
AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)
{
  bool dumpAll = false;
  for (Function::const_arg_iterator cab = F.arg_begin(),
       cae = F.arg_end(); cab != cae; ++cab) {
    const Argument *arg = cab;
    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    if (!PT) {
      continue;
    }
    Type *DereferencedType = PT->getElementType();
    if (!dyn_cast<StructType>(DereferencedType)) {
      continue;
    }
    if (!containsPointerType(DereferencedType)) {
      continue;
    }
    // FIXME: Because a pointer inside of a struct/union may be aliased to
    // another pointer, we need to take the conservative approach and place all
    // pointers into the arena until more advanced detection is implemented.
    dumpAll = true;
  }
  return dumpAll;
}
void
AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded()
{
  if (isConstVec.empty()) {
    return;
  }
  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    CallInst *CI = isConstVec[x];
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                              : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();
  }
  isConstVec.clear();
}
void
AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
{
  // Don't do anything if we don't have any atomic operations.
  if (atomicFuncs.empty()) {
    return;
  }
  // Change the function name for the atomic if it is required.
  uint32_t size = atomicFuncs.size();
  for (uint32_t x = 0; x < size; ++x) {
    atomicFuncs[x].first->setOperand(
        atomicFuncs[x].first->getNumOperands()-1,
        atomicFuncs[x].second);

  }
  mChanged = true;
  if (mConvertAtomics) {
    return;
  }
}

bool
AMDGPUPeepholeOpt::runOnFunction(Function &MF)
{
  mChanged = false;
  mF = &MF;
  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
  if (mDebug) {
    MF.dump();
  }
  mCTX = &MF.getType()->getContext();
  mConvertAtomics = true;
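  // Walk every instruction of every basic block in the function and apply the
  // instruction-level peepholes; instLevelOptimizations returns true when it
  // has erased the current instruction and already advanced the iterator.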
  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
     std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
                  this));

  doAtomicConversionIfNeeded(MF);
  doIsConstCallConversionIfNeeded();

  if (mDebug) {
    MF.dump();
  }
  return mChanged;
}

bool
AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
{
  Instruction *inst = (*bbb);
  CallInst *CI = dyn_cast<CallInst>(inst);
  if (!CI) {
    return false;
  }
  if (isSigned24BitOps(CI)) {
    expandSigned24BitOps(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (propagateSamplerInst(CI)) {
    return false;
  }
  if (expandBFI(CI) || expandBFM(CI)) {
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (convertAccurateDivide(CI)) {
    expandAccurateDivide(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }

  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
  if (calleeName.startswith("__amdil_is_constant")) {
    // If we do not have optimizations, then this
    // cannot be properly evaluated, so we add the
    // call instruction to a vector and process
    // the vector at the end of processing, after the
    // samplers have been correctly handled.
    if (optLevel == CodeGenOpt::None) {
      isConstVec.push_back(CI);
      return false;
    } else {
      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
      Type *aType = Type::getInt32Ty(*mCTX);
      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                                : ConstantInt::get(aType, 0);
      CI->replaceAllUsesWith(Val);
      ++(*bbb);
      CI->eraseFromParent();
      return true;
    }
  }

  if (calleeName.equals("__amdil_is_asic_id_i32")) {
    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = CV;
    if (Val) {
      Val = ConstantInt::get(aType,
          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    } else {
      Val = ConstantInt::get(aType, 0);
    }
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
  if (!F) {
    return false;
  }
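  // Atomic calls whose results are never used (and that are not exchange
  // operations) are redirected to the corresponding "_noret" variants so the
  // backend can emit the no-return form of the atomic.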
  if (F->getName().startswith("__atom") && !CI->getNumUses()
      && F->getName().find("_xchg") == StringRef::npos) {
    std::string buffer(F->getName().str() + "_noret");
    F = dyn_cast<Function>(
          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
  }

  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
    return false;
  }
  if (!mConvertAtomics) {
    return false;
  }
  StringRef name = F->getName();
  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    mConvertAtomics = false;
  }
  return false;
}

bool
AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
                                  Instruction *&src,
                                  Constant *&mask,
                                  Constant *&shift)
{
  if (!base) {
    if (mDebug) {
      dbgs() << "Null pointer passed into function.\n";
    }
    return false;
  }
  bool andOp = false;
  if (base->getOpcode() == Instruction::Shl) {
    shift = dyn_cast<Constant>(base->getOperand(1));
  } else if (base->getOpcode() == Instruction::And) {
    mask = dyn_cast<Constant>(base->getOperand(1));
    andOp = true;
  } else {
    if (mDebug) {
      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    }
    // If the base is neither a Shl nor an And, we don't fit any of the
    // patterns above.
    return false;
  }
  src = dyn_cast<Instruction>(base->getOperand(0));
  if (!src) {
    if (mDebug) {
      dbgs() << "Failed setup since the base operand is not an instruction!\n";
    }
    return false;
  }
  // If we find an 'and' operation, then we don't need to
  // find the next operation as we already know the
  // bits that are valid at this point.
  if (andOp) {
    return true;
  }
  if (src->getOpcode() == Instruction::Shl && !shift) {
    shift = dyn_cast<Constant>(src->getOperand(1));
    src = dyn_cast<Instruction>(src->getOperand(0));
  } else if (src->getOpcode() == Instruction::And && !mask) {
    mask = dyn_cast<Constant>(src->getOperand(1));
  }
  if (!mask && !shift) {
    if (mDebug) {
      dbgs() << "Failed setup since both mask and shift are NULL!\n";
    }
    // Did not find a constant mask or a shift.
    return false;
  }
  return true;
}
bool
AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)
{
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::Or) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do an optimization on a sequence of ops that in the end equals
  // a single ISA instruction.
  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
  // Some simplified versions of this pattern are as follows:
  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
  // (A & B) | (D << F) when (1 << F) >= B
  // (A << C) | (D & E) when (1 << C) >= E
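  // For example, (A & 0xFF) | (B << 8) matches the fourth simplified form;
  // in that case the code below selects offset 8 and width 32 - 8 = 24, with
  // B supplying the inserted bits and A & 0xFF supplying the base value.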
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // The HD4XXX hardware doesn't support the ubit_insert instruction.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();
  int numEle = 1;
  // This optimization only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  // TODO: Handle vectors.
  if (isVector) {
    if (mDebug) {
      dbgs() << "!!! Vectors are not supported yet!\n";
    }
    return false;
  }
  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
  Constant *LHSMask = NULL, *RHSMask = NULL;
  Constant *LHSShift = NULL, *RHSShift = NULL;
  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (LHS) { LHS->dump(); }
      if (LHSSrc) { LHSSrc->dump(); }
      if (LHSMask) { LHSMask->dump(); }
      if (LHSShift) { LHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (RHS) { RHS->dump(); }
      if (RHSSrc) { RHSSrc->dump(); }
      if (RHSMask) { RHSMask->dump(); }
      if (RHSShift) { RHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (mDebug) {
    dbgs() << "Found an OR operation that can possibly be optimized to ubit insert!\n";
    dbgs() << "Op: "; inst->dump();
    dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
  }
  Constant *offset = NULL;
  Constant *width = NULL;
  int32_t lhsMaskVal = 0, rhsMaskVal = 0;
  int32_t lhsShiftVal = 0, rhsShiftVal = 0;
  int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
  int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
  lhsMaskVal = (int32_t)(LHSMask
      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
  rhsMaskVal = (int32_t)(RHSMask
      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
  lhsShiftVal = (int32_t)(LHSShift
      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
  rhsShiftVal = (int32_t)(RHSShift
      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
  if (mDebug) {
    dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
    dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
    dbgs() << (RHSMask ? " & E)" : ")");
    dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
    dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
    dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
    dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
    dbgs() << "width(B) = " << lhsMaskWidth;
    dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
    dbgs() << "offset(B) = " << lhsMaskOffset;
    dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
    dbgs() << "Constraints: \n";
    dbgs() << "\t(1) B ^ E == 0\n";
    dbgs() << "\t(2-LHS) B is a mask\n";
    dbgs() << "\t(2-RHS) E is a mask\n";
    dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
    dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
  }
  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    if (mDebug) {
      dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
      dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
      dbgs() << "Failed constraint 1!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "LHS = " << lhsMaskOffset << "";
    dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
    dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
    dbgs() << "\nRHS = " << rhsMaskOffset << "";
    dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
    dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
    dbgs() << "\n";
  }
  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    offset = ConstantInt::get(aType, lhsMaskOffset, false);
    width = ConstantInt::get(aType, lhsMaskWidth, false);
    RHSSrc = RHS;
    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
      if (mDebug) {
        dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
        dbgs() << "Failed constraint 2!\n";
      }
      return false;
    }
    if (!LHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", LHS);
    } else if (lhsShiftVal != lhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", LHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing LHS!\n";
    }
  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    offset = ConstantInt::get(aType, rhsMaskOffset, false);
    width = ConstantInt::get(aType, rhsMaskWidth, false);
    LHSSrc = RHSSrc;
    RHSSrc = LHS;
    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
      if (mDebug) {
        dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
        dbgs() << "Failed constraint 2!\n";
      }
      return false;
    }
    if (!RHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", RHS);
    } else if (rhsShiftVal != rhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", RHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing RHS!\n";
    }
  } else {
    if (mDebug) {
      dbgs() << "Failed constraint 3!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
  }
  if (!offset || !width) {
    if (mDebug) {
      dbgs() << "Either width or offset are NULL, failed detection!\n";
    }
    return false;
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "__amdil_ubit_insert";
  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[4] = {
    width,
    offset,
    LHSSrc,
    RHSSrc
  };
  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
  if (mDebug) {
    dbgs() << "Old Inst: ";
    inst->dump();
    dbgs() << "New Inst: ";
    CI->dump();
    dbgs() << "\n\n";
  }
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)
{
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::And) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do some simple optimizations on Shift right/And patterns. The
  // basic optimization is to turn (A >> B) & C, where A is a 32bit type, B is
  // a value smaller than 32 and C is a constant mask, into a bit-extract call.
  // For signed integers, it turns into the function call
  //   dst = __amdil_ibit_extract(log2(C), B, A)
  // For unsigned integers, it turns into the function call
  //   dst = __amdil_ubit_extract(log2(C), B, A)
  // The function __amdil_[u|i]bit_extract can be found in Section 7.9 of the
  // ATI IL spec of the stream SDK for Evergreen hardware.
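  // For example, (A >> 8) & 0xFF becomes an unsigned bit-extract of the 8-bit
  // field that starts at bit 8 of A.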
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // This does not work on HD4XXX hardware.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();

  // XXX Support vector types
  if (isVector) {
    return false;
  }
  int numEle = 1;
  // This only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
  // If the first operand is not a shift instruction, then we can return as it
  // doesn't match this pattern.
  if (!ShiftInst || !ShiftInst->isShift()) {
    return false;
  }
  // If it is a shift left, then it doesn't match this pattern.
  if (ShiftInst->getOpcode() == Instruction::Shl) {
    return false;
  }
  bool isSigned = ShiftInst->isArithmeticShift();
  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
  // Let's make sure that the shift value and the and mask are constant integers.
  if (!AndMask || !ShrVal) {
    return false;
  }
  Constant *newMaskConst;
  Constant *shiftValConst;
  if (isVector) {
    // Handle the vector case
    std::vector<Constant *> maskVals;
    std::vector<Constant *> shiftVals;
    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
    Type *scalarType = AndMaskVec->getType()->getScalarType();
    assert(AndMaskVec->getNumOperands() ==
           ShrValVec->getNumOperands() && "cannot have a "
           "combination where the number of elements to a "
           "shift and an and are different!");
    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
      if (!AndCI || !ShiftIC) {
        return false;
      }
      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
      if (!isMask_32(maskVal)) {
        return false;
      }
      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
      // If the mask or shift val is greater than the bitcount, then break out.
      if (maskVal >= 32 || shiftVal >= 32) {
        return false;
      }
      // If the mask val is greater than the number of original bits left
      // then this optimization is invalid.
      if (maskVal > (32 - shiftVal)) {
        return false;
      }
      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
    }
    newMaskConst = ConstantVector::get(maskVals);
    shiftValConst = ConstantVector::get(shiftVals);
  } else {
    // Handle the scalar case
    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
    // This must be a mask value where all lower bits are set to 1 and then any
    // bit higher is set to 0.
    if (!isMask_32(maskVal)) {
      return false;
    }
    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    // Count the number of bits set in the mask; this is the width of the
    // resulting bit set that is extracted from the source value.
    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
    // If the mask or shift val is greater than the bitcount, then break out.
    if (maskVal >= 32 || shiftVal >= 32) {
      return false;
    }
    // If the mask val is greater than the number of original bits left then
    // this optimization is invalid.
    if (maskVal > (32 - shiftVal)) {
      return false;
    }
    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "llvm.AMDIL.bit.extract.u32";
  if (isVector) {
    name += ".v" + itostr(numEle) + "i32";
  } else {
    name += ".";
  }
  // Let's create the function.
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[3] = {
    ShiftInst->getOperand(0),
    shiftValConst,
    newMaskConst
  };
  // Let's create the Call with the operands.
  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
  CI->setDoesNotAccessMemory();
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFI(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfi")) {
    return false;
  }
  Type* type = CI->getOperand(0)->getType();
  Constant *negOneConst = NULL;
  if (type->isVectorTy()) {
    std::vector<Constant *> negOneVals;
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
    for (size_t x = 0,
        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      negOneVals.push_back(negOneConst);
    }
    negOneConst = ConstantVector::get(negOneVals);
  } else {
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
  }
  // __amdil_bfi => (A & B) | (~A & C)
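  // Here A, B and C are operands 0, 1 and 2 of the call; ~A is built below by
  // XORing A with -1 since the IR has no dedicated 'not' instruction.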
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           CI->getOperand(1), "bfi_and", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
                           "bfi_not", CI);
  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
                               "bfi_and", CI);
  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFM(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfm")) {
    return false;
  }
  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
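  // The sequence built below first computes the mask (1 << (src0 & 0x1F)) - 1
  // and then shifts it left by (src1 & 0x1F), where src0 and src1 are
  // operands 0 and 1 of the call.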
  Constant *newMaskConst = NULL;
  Constant *newShiftConst = NULL;
  Type* type = CI->getOperand(0)->getType();
  if (type->isVectorTy()) {
    std::vector<Constant*> newMaskVals, newShiftVals;
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    for (size_t x = 0,
        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      newMaskVals.push_back(newMaskConst);
      newShiftVals.push_back(newShiftConst);
    }
    newMaskConst = ConstantVector::get(newMaskVals);
    newShiftConst = ConstantVector::get(newShiftVals);
  } else {
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
  }
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
                               lhs, "bfm_shl", CI);
  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
                               newShiftConst, "bfm_sub", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
                           newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
{
  Instruction *inst = (*bbb);
  if (optimizeCallInst(bbb)) {
    return true;
  }
  if (optimizeBitExtract(inst)) {
    return false;
  }
  if (optimizeBitInsert(inst)) {
    return false;
  }
  if (correctMisalignedMemOp(inst)) {
    return false;
  }
  return false;
}
bool
AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
{
  LoadInst *linst = dyn_cast<LoadInst>(inst);
  StoreInst *sinst = dyn_cast<StoreInst>(inst);
  unsigned alignment;
  Type* Ty = inst->getType();
  if (linst) {
    alignment = linst->getAlignment();
    Ty = inst->getType();
  } else if (sinst) {
    alignment = sinst->getAlignment();
    Ty = sinst->getValueOperand()->getType();
  } else {
    return false;
  }
  unsigned size = getTypeSize(Ty);
  if (size == alignment || size < alignment) {
    return false;
  }
  if (!Ty->isStructTy()) {
    return false;
  }
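  // Resetting the alignment to 0 tells the rest of the code generator to use
  // the natural (ABI) alignment for the type instead of the under-specified
  // alignment that would otherwise produce really inefficient code.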
  if (alignment < 4) {
    if (linst) {
      linst->setAlignment(0);
      return true;
    } else if (sinst) {
      sinst->setAlignment(0);
      return true;
    }
  }
  return false;
}
bool
AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  std::string namePrefix = LHS->getName().substr(0, 14);
  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
      && namePrefix != "__amdil__imul24_high") {
    return false;
  }
  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
    return false;
  }
  return true;
}

void
AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)
{
  assert(isSigned24BitOps(CI) && "Must be a "
         "signed 24 bit operation to call this function!");
  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
  // On 7XX and 8XX we do not have signed 24bit, so we need to
  // expand it to the following:
  // imul24 turns into 32bit imul
  // imad24 turns into 32bit imad
  // imul24_high turns into 32bit imulhigh
  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    callTypes.push_back(CI->getOperand(2)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imad";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[3] = {
      CI->getOperand(0),
      CI->getOperand(1),
      CI->getOperand(2)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
    BinaryOperator *mulOp =
      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
                             CI->getOperand(1), "imul24", CI);
    CI->replaceAllUsesWith(mulOp);
  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
    Type *aType = CI->getOperand(0)->getType();

    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imul_high";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[2] = {
      CI->getOperand(0),
      CI->getOperand(1)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  }
}

bool
AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)
{
  return (CI != NULL
          && CI->getOperand(CI->getNumOperands() - 1)->getName()
             == "__amdil_get_local_size_int");
}

bool
AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
      && (mSTM->getDeviceName() == "cayman")) {
    return false;
  }
  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
         == "__amdil_improved_div";
}

void
AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)
{
  assert(convertAccurateDivide(CI)
         && "expanding accurate divide can only happen if it is expandable!");
  BinaryOperator *divOp =
    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
                           CI->getOperand(1), "fdiv32", CI);
  CI->replaceAllUsesWith(divOp);
}

bool
AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI)
{
  if (optLevel != CodeGenOpt::None) {
    return false;
  }

  if (!CI) {
    return false;
  }

  unsigned funcNameIdx = CI->getNumOperands() - 1;
  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
  if (calleeName != "__amdil_image2d_read_norm"
      && calleeName != "__amdil_image2d_read_unnorm"
      && calleeName != "__amdil_image3d_read_norm"
      && calleeName != "__amdil_image3d_read_unnorm") {
    return false;
  }

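  // Operand 1 of the image read call is the sampler argument; we only handle
  // the case where it is loaded from a private-address-space global whose
  // initializer is a 32bit integer constant.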
  unsigned samplerIdx = 1;
  Value *sampler = CI->getOperand(samplerIdx);
  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
  if (!lInst) {
    return false;
  }

  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return false;
  }

  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
  // If we are loading from what is not a global value, then we
  // fail and return.
  if (!gv) {
    return false;
  }

  // If we don't have an initializer, or we have an initializer and
  // the initializer is not a 32bit integer, we fail.
  if (!gv->hasInitializer()
      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
    return false;
  }

  // Now that we have the global variable initializer, let's replace
  // all uses of the load instruction with the samplerVal and
  // reparse the __amdil_is_constant() function.
  Constant *samplerVal = gv->getInitializer();
  lInst->replaceAllUsesWith(samplerVal);
  return true;
}

bool
AMDGPUPeepholeOpt::doInitialization(Module &M)
{
  return false;
}

bool
AMDGPUPeepholeOpt::doFinalization(Module &M)
{
  return false;
}

void
AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
{
  AU.addRequired<MachineFunctionAnalysis>();
  FunctionPass::getAnalysisUsage(AU);
  AU.setPreservesAll();
}

size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
  size_t size = 0;
  if (!T) {
    return size;
  }
  switch (T->getTypeID()) {
  case Type::X86_FP80TyID:
  case Type::FP128TyID:
  case Type::PPC_FP128TyID:
  case Type::LabelTyID:
    assert(0 && "These types are not supported by this backend");
  default:
  case Type::FloatTyID:
  case Type::DoubleTyID:
    size = T->getPrimitiveSizeInBits() >> 3;
    break;
  case Type::PointerTyID:
    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
    break;
  case Type::IntegerTyID:
    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
    break;
  case Type::StructTyID:
    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
    break;
  case Type::ArrayTyID:
    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
    break;
  case Type::FunctionTyID:
    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
    break;
  case Type::VectorTyID:
    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
    break;
  };
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
                                      bool dereferencePtr) {
  size_t size = 0;
  if (!ST) {
    return size;
  }
  Type *curType;
  StructType::element_iterator eib;
  StructType::element_iterator eie;
  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
    curType = *eib;
    size += getTypeSize(curType, dereferencePtr);
  }
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
                                      bool dereferencePtr) {
  return IT ? (IT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
                                      bool dereferencePtr) {
  assert(0 && "Should not be able to calculate the size of a function type");
  return 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
                                      bool dereferencePtr) {
  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
                                    dereferencePtr) * AT->getNumElements())
                     : 0);
}

size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
                                      bool dereferencePtr) {
  return VT ? (VT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
                                      bool dereferencePtr) {
  if (!PT) {
    return 0;
  }
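  // Pointers to private-address-space structs are sized by their pointee;
  // when dereferencePtr is set we sum the sizes of the contained types, and
  // otherwise we fall back to the 4-byte pointer size this backend assumes.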
  Type *CT = PT->getElementType();
  if (CT->getTypeID() == Type::StructTyID &&
      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    return getTypeSize(dyn_cast<StructType>(CT));
  } else if (dereferencePtr) {
    size_t size = 0;
    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
    }
    return size;
  } else {
    return 4;
  }
}

size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
                                      bool dereferencePtr) {
  //assert(0 && "Should not be able to calculate the size of an opaque type");
  return 4;
}
