/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_misc.cpp
*
* @brief Implementation for miscellaneous builder functions
*
* Notes:
*
******************************************************************************/
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cmath>    // std::isnan / std::isinf
#include <cstdarg>
#include <cstdint>  // fixed-width integer types
#include <cstring>  // memcpy, used for bit-level float <-> int conversion

namespace SwrJit
{
    void __cdecl CallPrint(const char* fmt, ...);

    //////////////////////////////////////////////////////////////////////////
    /// @brief Convert an IEEE 754 32-bit single precision float to an
    ///        16 bit float with 5 exponent bits and a variable
    ///        number of mantissa bits.
    /// @param val - 32-bit float
    /// @todo Maybe move this outside of this file into a header?
Convert32To16Float(float val)45 static uint16_t Convert32To16Float(float val) 46 { 47 uint32_t sign, exp, mant; 48 uint32_t roundBits; 49 50 // Extract the sign, exponent, and mantissa 51 uint32_t uf = *(uint32_t*)&val; 52 sign = (uf & 0x80000000) >> 31; 53 exp = (uf & 0x7F800000) >> 23; 54 mant = uf & 0x007FFFFF; 55 56 // Check for out of range 57 if (std::isnan(val)) 58 { 59 exp = 0x1F; 60 mant = 0x200; 61 sign = 1; // set the sign bit for NANs 62 } 63 else if (std::isinf(val)) 64 { 65 exp = 0x1f; 66 mant = 0x0; 67 } 68 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value 69 { 70 exp = 0x1E; 71 mant = 0x3FF; 72 } 73 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm 74 { 75 mant |= 0x00800000; 76 for (; exp <= 0x70; mant >>= 1, exp++) 77 ; 78 exp = 0; 79 mant = mant >> 13; 80 } 81 else if (exp < 0x66) // Too small to represent -> Zero 82 { 83 exp = 0; 84 mant = 0; 85 } 86 else 87 { 88 // Saves bits that will be shifted off for rounding 89 roundBits = mant & 0x1FFFu; 90 // convert exponent and mantissa to 16 bit format 91 exp = exp - 0x70; 92 mant = mant >> 13; 93 94 // Essentially RTZ, but round up if off by only 1 lsb 95 if (roundBits == 0x1FFFu) 96 { 97 mant++; 98 // check for overflow 99 if ((mant & 0xC00u) != 0) 100 exp++; 101 // make sure only the needed bits are used 102 mant &= 0x3FF; 103 } 104 } 105 106 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant; 107 return (uint16_t)tmpVal; 108 } 109 110 ////////////////////////////////////////////////////////////////////////// 111 /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision 112 /// float 113 /// @param val - 16-bit float 114 /// @todo Maybe move this outside of this file into a header? 
ConvertSmallFloatTo32(UINT val)115 static float ConvertSmallFloatTo32(UINT val) 116 { 117 UINT result; 118 if ((val & 0x7fff) == 0) 119 { 120 result = ((uint32_t)(val & 0x8000)) << 16; 121 } 122 else if ((val & 0x7c00) == 0x7c00) 123 { 124 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000; 125 result |= ((uint32_t)val & 0x8000) << 16; 126 } 127 else 128 { 129 uint32_t sign = (val & 0x8000) << 16; 130 uint32_t mant = (val & 0x3ff) << 13; 131 uint32_t exp = (val >> 10) & 0x1f; 132 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals 133 { 134 mant <<= 1; 135 while (mant < (0x400 << 13)) 136 { 137 exp--; 138 mant <<= 1; 139 } 140 mant &= (0x3ff << 13); 141 } 142 exp = ((exp - 15 + 127) & 0xff) << 23; 143 result = sign | exp | mant; 144 } 145 146 return *(float*)&result; 147 } 148 C(bool i)149 Constant *Builder::C(bool i) 150 { 151 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); 152 } 153 C(char i)154 Constant *Builder::C(char i) 155 { 156 return ConstantInt::get(IRB()->getInt8Ty(), i); 157 } 158 C(uint8_t i)159 Constant *Builder::C(uint8_t i) 160 { 161 return ConstantInt::get(IRB()->getInt8Ty(), i); 162 } 163 C(int i)164 Constant *Builder::C(int i) 165 { 166 return ConstantInt::get(IRB()->getInt32Ty(), i); 167 } 168 C(int64_t i)169 Constant *Builder::C(int64_t i) 170 { 171 return ConstantInt::get(IRB()->getInt64Ty(), i); 172 } 173 C(uint16_t i)174 Constant *Builder::C(uint16_t i) 175 { 176 return ConstantInt::get(mInt16Ty,i); 177 } 178 C(uint32_t i)179 Constant *Builder::C(uint32_t i) 180 { 181 return ConstantInt::get(IRB()->getInt32Ty(), i); 182 } 183 C(float i)184 Constant *Builder::C(float i) 185 { 186 return ConstantFP::get(IRB()->getFloatTy(), i); 187 } 188 PRED(bool pred)189 Constant *Builder::PRED(bool pred) 190 { 191 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 
1 : 0)); 192 } 193 VIMMED1(int i)194 Value *Builder::VIMMED1(int i) 195 { 196 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); 197 } 198 VIMMED1(uint32_t i)199 Value *Builder::VIMMED1(uint32_t i) 200 { 201 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); 202 } 203 VIMMED1(float i)204 Value *Builder::VIMMED1(float i) 205 { 206 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); 207 } 208 VIMMED1(bool i)209 Value *Builder::VIMMED1(bool i) 210 { 211 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); 212 } 213 VUNDEF_IPTR()214 Value *Builder::VUNDEF_IPTR() 215 { 216 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); 217 } 218 VUNDEF_I()219 Value *Builder::VUNDEF_I() 220 { 221 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); 222 } 223 VUNDEF(Type * ty,uint32_t size)224 Value *Builder::VUNDEF(Type *ty, uint32_t size) 225 { 226 return UndefValue::get(VectorType::get(ty, size)); 227 } 228 VUNDEF_F()229 Value *Builder::VUNDEF_F() 230 { 231 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); 232 } 233 VUNDEF(Type * t)234 Value *Builder::VUNDEF(Type* t) 235 { 236 return UndefValue::get(VectorType::get(t, mVWidth)); 237 } 238 239 #if HAVE_LLVM == 0x306 VINSERT(Value * vec,Value * val,uint64_t index)240 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index) 241 { 242 return VINSERT(vec, val, C((int64_t)index)); 243 } 244 #endif 245 VBROADCAST(Value * src)246 Value *Builder::VBROADCAST(Value *src) 247 { 248 // check if src is already a vector 249 if (src->getType()->isVectorTy()) 250 { 251 return src; 252 } 253 254 return VECTOR_SPLAT(mVWidth, src); 255 } 256 IMMED(Value * v)257 uint32_t Builder::IMMED(Value* v) 258 { 259 SWR_ASSERT(isa<ConstantInt>(v)); 260 ConstantInt *pValConst = cast<ConstantInt>(v); 261 return pValConst->getZExtValue(); 262 } 263 S_IMMED(Value * v)264 int32_t Builder::S_IMMED(Value* v) 265 { 266 SWR_ASSERT(isa<ConstantInt>(v)); 267 ConstantInt *pValConst 
= cast<ConstantInt>(v); 268 return pValConst->getSExtValue(); 269 } 270 GEP(Value * ptr,const std::initializer_list<Value * > & indexList)271 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList) 272 { 273 std::vector<Value*> indices; 274 for (auto i : indexList) 275 indices.push_back(i); 276 return GEPA(ptr, indices); 277 } 278 GEP(Value * ptr,const std::initializer_list<uint32_t> & indexList)279 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList) 280 { 281 std::vector<Value*> indices; 282 for (auto i : indexList) 283 indices.push_back(C(i)); 284 return GEPA(ptr, indices); 285 } 286 LOAD(Value * basePtr,const std::initializer_list<uint32_t> & indices,const llvm::Twine & name)287 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name) 288 { 289 std::vector<Value*> valIndices; 290 for (auto i : indices) 291 valIndices.push_back(C(i)); 292 return LOAD(GEPA(basePtr, valIndices), name); 293 } 294 LOADV(Value * basePtr,const std::initializer_list<Value * > & indices,const llvm::Twine & name)295 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name) 296 { 297 std::vector<Value*> valIndices; 298 for (auto i : indices) 299 valIndices.push_back(i); 300 return LOAD(GEPA(basePtr, valIndices), name); 301 } 302 STORE(Value * val,Value * basePtr,const std::initializer_list<uint32_t> & indices)303 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices) 304 { 305 std::vector<Value*> valIndices; 306 for (auto i : indices) 307 valIndices.push_back(C(i)); 308 return STORE(val, GEPA(basePtr, valIndices)); 309 } 310 STOREV(Value * val,Value * basePtr,const std::initializer_list<Value * > & indices)311 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices) 312 { 313 std::vector<Value*> valIndices; 314 for (auto i : indices) 315 
valIndices.push_back(i); 316 return STORE(val, GEPA(basePtr, valIndices)); 317 } 318 CALL(Value * Callee,const std::initializer_list<Value * > & argsList)319 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList) 320 { 321 std::vector<Value*> args; 322 for (auto arg : argsList) 323 args.push_back(arg); 324 return CALLA(Callee, args); 325 } 326 327 #if HAVE_LLVM > 0x306 CALL(Value * Callee,Value * arg)328 CallInst *Builder::CALL(Value *Callee, Value* arg) 329 { 330 std::vector<Value*> args; 331 args.push_back(arg); 332 return CALLA(Callee, args); 333 } 334 CALL2(Value * Callee,Value * arg1,Value * arg2)335 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2) 336 { 337 std::vector<Value*> args; 338 args.push_back(arg1); 339 args.push_back(arg2); 340 return CALLA(Callee, args); 341 } 342 CALL3(Value * Callee,Value * arg1,Value * arg2,Value * arg3)343 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3) 344 { 345 std::vector<Value*> args; 346 args.push_back(arg1); 347 args.push_back(arg2); 348 args.push_back(arg3); 349 return CALLA(Callee, args); 350 } 351 #endif 352 VRCP(Value * va)353 Value *Builder::VRCP(Value *va) 354 { 355 return FDIV(VIMMED1(1.0f), va); // 1 / a 356 } 357 VPLANEPS(Value * vA,Value * vB,Value * vC,Value * & vX,Value * & vY)358 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY) 359 { 360 Value* vOut = FMADDPS(vA, vX, vC); 361 vOut = FMADDPS(vB, vY, vOut); 362 return vOut; 363 } 364 365 ////////////////////////////////////////////////////////////////////////// 366 /// @brief Generate an i32 masked load operation in LLVM IR. 
    /// If not supported on the underlying platform, emulate it with float masked load
    /// @param src - base address pointer for the load
    /// @param vMask - SIMD wide mask that controls whether to access memory load 0
    Value *Builder::MASKLOADD(Value* src,Value* mask)
    {
        Value* vResult;
        // use avx2 gather instruction is available
        if(JM()->mArch.AVX2())
        {
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
            vResult = CALL(func,{src,mask});
        }
        else
        {
            // maskload intrinsic expects integer mask operand in llvm >= 3.8
#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
            mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
#else
            mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
#endif
            // fall back to the avx1 float maskload and bitcast the result back to i32
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
            vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
        }
        return vResult;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief insert a JIT call to CallPrint
    /// - outputs formatted string to both stdout and VS output window
    /// - DEBUG builds only
    /// Usage example:
    ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
    ///   where C(lane) creates a constant value to print, and pIndex is the Value*
    ///   result from a GEP, printing out the pointer to memory
    /// @param printStr - constant string to print, which includes format specifiers
    /// @param printArgs - initializer list of Value*'s to print to std out
    CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
    {
        // push the arguments to CallPrint into a vector
        std::vector<Value*> printCallArgs;
        // save room for the format string.  we still need to modify it for vectors
        printCallArgs.resize(1);

        // search through the format string for special processing
        size_t pos = 0;
        std::string tempStr(printStr);
        pos = tempStr.find('%', pos);
        auto v = printArgs.begin();

        // Walk format specifiers and arguments in lockstep, expanding vector
        // arguments into one specifier + one scalar argument per lane.
        while ((pos != std::string::npos) && (v != printArgs.end()))
        {
            Value* pArg = *v;
            Type* pType = pArg->getType();

            if (pType->isVectorTy())
            {
                Type* pContainedType = pType->getContainedType(0);

                if (toupper(tempStr[pos + 1]) == 'X')
                {
                    // rewrite "%x" -> "0x%08X" for the first lane, then append
                    // one "0x%08X " per remaining lane
                    tempStr[pos] = '0';
                    tempStr[pos + 1] = 'x';
                    tempStr.insert(pos + 2, "%08X ");
                    pos += 7;

                    printCallArgs.push_back(VEXTRACT(pArg, C(0)));

                    std::string vectorFormatStr;
                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
                    {
                        vectorFormatStr += "0x%08X ";
                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                    }

                    tempStr.insert(pos, vectorFormatStr);
                    pos += vectorFormatStr.size();
                }
                else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
                {
                    // insert one "%f " per extra lane; varargs promote float to
                    // double, so extend each extracted lane before passing
                    uint32_t i = 0;
                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                    {
                        tempStr.insert(pos, std::string("%f "));
                        pos += 3;
                        printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                    }
                    // final lane consumes the original "%f"
                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                }
                else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
                {
                    // insert one "%d " per extra lane; final lane uses original "%d"
                    uint32_t i = 0;
                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                    {
                        tempStr.insert(pos, std::string("%d "));
                        pos += 3;
                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                    }
                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                }
            }
            else
            {
                if (toupper(tempStr[pos + 1]) == 'X')
                {
                    // rewrite scalar "%x" -> "0x%08x"
                    tempStr[pos] = '0';
                    tempStr.insert(pos + 1, "x%08");
                    printCallArgs.push_back(pArg);
                    pos += 3;
                }
                // for %f we need to cast float Values to doubles so that they print out correctly
                else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
                {
                    printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
                    pos++;
                }
                else
                {
                    printCallArgs.push_back(pArg);
                }
            }

            // advance to the next argument
            v++;
            pos = tempStr.find('%', ++pos);
        }

        // create global variable constant string
        Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
        GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
        JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);

        // get a pointer to the first character in the constant string array
        std::vector<Constant*> geplist{C(0),C(0)};
#if HAVE_LLVM == 0x306
        Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
#else
        // llvm > 3.6 added a leading pointee-type parameter (nullptr = infer)
        Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
#endif

        // insert the pointer to the format string in the argument vector
        printCallArgs[0] = strGEP;

        // get pointer to CallPrint function and insert decl into the module if needed
        std::vector<Type*> args;
        args.push_back(PointerType::get(mInt8Ty,0));
        FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
        Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));

        // if we haven't yet added the symbol to the symbol table
        if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
        {
            sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
        }

        // insert a call to CallPrint
        return CALLA(callPrintFn,printCallArgs);
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Wrapper around PRINT with initializer
    /// list.
    CallInst* Builder::PRINT(const std::string &printStr)
    {
        // no-argument form: just print the literal string
        return PRINT(printStr, {});
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if(JM()->mArch.AVX2())
        {
            // force mask to <N x float>, required by vgather
            vMask = BITCAST(vMask, mSimdFP32Ty);
            vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_F();
            Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
            Value *vOffsets = MUL(vIndices,vScaleVec);
            Value *mask = MASK(vMask);
            // emulate per lane: load either from pBase+offset or from the
            // stashed vSrc lane, selected by the mask
            for(uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets,C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase,offset);
                loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
                Value *selMask = VEXTRACT(mask,C(i));
                // switch in a safe address to load if we're trying to access a vertex
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather,val,C(i));
            }
            STACKRESTORE(pStack);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if(JM()->mArch.AVX2())
        {
            vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_I();
            Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
            Value *vOffsets = MUL(vIndices, vScaleVec);
            Value *mask = MASK(vMask);
            // emulate per lane, same scheme as GATHERPS but with i32 elements
            for(uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
                Value *selMask = VEXTRACT(mask, C(i));
                // switch in a safe address to load if we're trying to access a vertex
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress, C(0));
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }
        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if(JM()->mArch.AVX2())
        {
            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            // doubles are 64-bit, so the emulated gather fills mVWidth/2 lanes
            vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
            Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
            Value *vOffsets = MUL(vIndices,vScaleVec);
            Value *mask = MASK(vMask);
            for(uint32_t i = 0; i < mVWidth/2; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets,C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase,offset);
                loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
                Value *selMask = VEXTRACT(mask,C(i));
                // switch in a safe address to load if we're trying to access a vertex
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather,val,C(i));
            }
            STACKRESTORE(pStack);
        }
        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
    Value* Builder::MASK(Value* vmask)
    {
        // x86 masks use the element sign bit; "< 0" after bitcast to i32
        // tests exactly that bit
        Value* src = BITCAST(vmask, mSimdInt32Ty);
        return ICMP_SLT(src, VIMMED1(0));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
    Value* Builder::VMASK(Value* mask)
    {
        // sign-extend i1 -> i32: true becomes all-ones (sign bit set)
        return S_EXT(mask, mSimdInt32Ty);
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it
    /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
    /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
    /// Byte masks in lower 128 lane of b selects 8 bit values from lower
    /// 128bits of a, and vice versa for the upper lanes.  If the mask
    /// value is negative, '0' is inserted.
PSHUFB(Value * a,Value * b)706 Value *Builder::PSHUFB(Value* a, Value* b) 707 { 708 Value* res; 709 // use avx2 pshufb instruction if available 710 if(JM()->mArch.AVX2()) 711 { 712 res = VPSHUFB(a, b); 713 } 714 else 715 { 716 Constant* cB = dyn_cast<Constant>(b); 717 // number of 8 bit elements in b 718 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements(); 719 // output vector 720 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms)); 721 722 // insert an 8 bit value from the high and low lanes of a per loop iteration 723 numElms /= 2; 724 for(uint32_t i = 0; i < numElms; i++) 725 { 726 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i)); 727 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms)); 728 729 // extract values from constant mask 730 char valLow128bLane = (char)(cLow128b->getSExtValue()); 731 char valHigh128bLane = (char)(cHigh128b->getSExtValue()); 732 733 Value* insertValLow128b; 734 Value* insertValHigh128b; 735 736 // if the mask value is negative, insert a '0' in the respective output position 737 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector 738 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF))); 739 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms)); 740 741 vShuf = VINSERT(vShuf, insertValLow128b, i); 742 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms)); 743 } 744 res = vShuf; 745 } 746 return res; 747 } 748 749 ////////////////////////////////////////////////////////////////////////// 750 /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 751 /// bits)in LLVM IR. If not supported on the underlying platform, emulate it 752 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only 753 /// lower 8 values are used. 
    Value *Builder::PMOVSXBD(Value* a)
    {
        // llvm-3.9 removed the pmovsxbd intrinsic
#if HAVE_LLVM < 0x309
        // use avx2 byte sign extend instruction if available
        if(JM()->mArch.AVX2())
        {
            Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
            return CALL(pmovsxbd, std::initializer_list<Value*>{a});
        }
        else
#endif
        {
            // VPMOVSXBD output type
            Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
            // Extract 8 values from 128bit lane and sign extend
            return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
    /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
    Value *Builder::PMOVSXWD(Value* a)
    {
        // llvm-3.9 removed the pmovsxwd intrinsic
#if HAVE_LLVM < 0x309
        // use avx2 word sign extend if available
        if(JM()->mArch.AVX2())
        {
            Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
            return CALL(pmovsxwd, std::initializer_list<Value*>{a});
        }
        else
#endif
        {
            // VPMOVSXWD output type
            Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
            // Extract 8 values from 128bit lane and sign extend
            return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
    /// platform, emulate it
    /// @param a - 256bit SIMD lane(8x32bit) of integer values.
    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
    Value *Builder::PERMD(Value* a, Value* idx)
    {
        Value* res;
        // use avx2 permute instruction if available
        if(JM()->mArch.AVX2())
        {
            res = VPERMD(a, idx);
        }
        else
        {
            if (isa<Constant>(idx))
            {
                // constant indices map directly onto a shufflevector
                res = VSHUFFLE(a, a, idx);
            }
            else
            {
                // variable indices: emulate with per-lane extract/insert
                res = VUNDEF_I();
                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
                {
                    Value* pIndex = VEXTRACT(idx, C(l));
                    Value* pVal = VEXTRACT(a, pIndex);
                    res = VINSERT(res, pVal, C(l));
                }
            }
        }
        return res;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
    /// platform, emulate it
    /// @param a - 256bit SIMD lane(8x32bit) of float values.
    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
    Value *Builder::PERMPS(Value* a, Value* idx)
    {
        Value* res;
        // use avx2 permute instruction if available
        if (JM()->mArch.AVX2())
        {
            // llvm 3.6.0 swapped the order of the args to vpermd
            res = VPERMPS(idx, a);
        }
        else
        {
            if (isa<Constant>(idx))
            {
                // constant indices map directly onto a shufflevector
                res = VSHUFFLE(a, a, idx);
            }
            else
            {
                // variable indices: emulate with per-lane extract/insert
                res = VUNDEF_F();
                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
                {
                    Value* pIndex = VEXTRACT(idx, C(l));
                    Value* pVal = VEXTRACT(a, pIndex);
                    res = VINSERT(res, pVal, C(l));
                }
            }
        }

        return res;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
    /// in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
    Value *Builder::CVTPH2PS(Value* a)
    {
        if (JM()->mArch.F16C())
        {
            return VCVTPH2PS(a);
        }
        else
        {
            // no F16C support: call the scalar C helper per lane through a
            // runtime symbol registered with the JIT
            FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
            Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));

            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
            }

            Value* pResult = UndefValue::get(mSimdFP32Ty);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                Value* pSrc = VEXTRACT(a, C(i));
                Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
                pResult = VINSERT(pResult, pConv, C(i));
            }

            return pResult;
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
    /// in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - SIMD wide vector of float32 values to convert.
    /// @param rounding - rounding-mode immediate forwarded to vcvtps2ph
    ///        (unused on the emulation path, which uses the helper's rounding).
    Value *Builder::CVTPS2PH(Value* a, Value* rounding)
    {
        if (JM()->mArch.F16C())
        {
            return VCVTPS2PH(a, rounding);
        }
        else
        {
            // call scalar C function for now
            FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
            Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));

            if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
            }

            Value* pResult = UndefValue::get(mSimdInt16Ty);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                Value* pSrc = VEXTRACT(a, C(i));
                Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
                pResult = VINSERT(pResult, pConv, C(i));
            }

            return pResult;
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Per-lane signed i32 max of a and b.
    Value *Builder::PMAXSD(Value* a, Value* b)
    {
        // llvm-3.9 removed the pmax intrinsics
#if HAVE_LLVM >= 0x309
        // plain compare+select; the backend pattern-matches this to pmaxsd
        Value* cmp = ICMP_SGT(a, b);
        return SELECT(cmp, a, b);
#else
        if (JM()->mArch.AVX2())
        {
            Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
            return CALL(pmaxsd, {a, b});
        }
        else
        {
            // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
            Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);

            // low 128
            Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
            Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
            Value* resLo = CALL(pmaxsd, {aLo, bLo});

            // high 128
            Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
            Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
            Value* resHi = CALL(pmaxsd, {aHi, bHi});

            // combine
            Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
            result = VINSERTI128(result, resHi, C((uint8_t)1));

            return result;
        }
#endif
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Per-lane signed i32 min of a and b.
    Value *Builder::PMINSD(Value* a, Value* b)
    {
        // llvm-3.9 removed the pmin intrinsics
#if HAVE_LLVM >= 0x309
        // plain compare+select; the backend pattern-matches this to pminsd
        Value* cmp = ICMP_SLT(a, b);
        return SELECT(cmp, a, b);
#else
        if (JM()->mArch.AVX2())
        {
            Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
            return CALL(pminsd, {a, b});
        }
        else
        {
            // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
            Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);

            // low 128
            Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
            Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
            Value* resLo = CALL(pminsd, {aLo, bLo});

            // high 128
            Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
            Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
            Value* resHi = CALL(pminsd, {aHi, bHi});

            // combine
            Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
            result = VINSERTI128(result, resHi, C((uint8_t)1));

            return result;
        }
#endif
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Dispatch a 4-component gather based on the format: float32
    ///        formats go through GATHER4PS, everything else through GATHER4DD.
    /// @param format - SWR format of the source data
    /// @param pSrcBase - base address of the source buffer
    /// @param byteOffsets - SIMD wide byte offsets into the buffer
    /// @param mask - SIMD wide gather mask (bitcast to the element type)
    /// @param vGatherComponents - output array receiving the gathered components
    /// @param bPackedOutput - whether the output stays in packed form
    void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
                          Value* mask, Value* vGatherComponents[], bool bPackedOutput)
    {
        const SWR_FORMAT_INFO &info = GetFormatInfo(format);
        if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            // ensure our mask is the correct type
            mask = BITCAST(mask, mSimdFP32Ty);
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
        else
        {
            // ensure our mask is the correct type
            mask = BITCAST(mask, mSimdInt32Ty);
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
    }

byteOffsets,Value * mask,Value * vGatherComponents[],bool bPackedOutput)1023 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, 1024 Value* mask, Value* vGatherComponents[], bool bPackedOutput) 1025 { 1026 switch(info.bpp / info.numComps) 1027 { 1028 case 16: 1029 { 1030 Value* vGatherResult[2]; 1031 Value *vMask; 1032 1033 // TODO: vGatherMaskedVal 1034 Value* vGatherMaskedVal = VIMMED1((float)0); 1035 1036 // always have at least one component out of x or y to fetch 1037 1038 // save mask as it is zero'd out after each gather 1039 vMask = mask; 1040 1041 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); 1042 // e.g. result of first 8x32bit integer gather for 16bit components 1043 // 256i - 0 1 2 3 4 5 6 7 1044 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy 1045 // 1046 1047 // if we have at least one component out of x or y to fetch 1048 if(info.numComps > 2) 1049 { 1050 // offset base to the next components(zw) in the vertex to gather 1051 pSrcBase = GEP(pSrcBase, C((char)4)); 1052 vMask = mask; 1053 1054 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); 1055 // e.g. 
result of second 8x32bit integer gather for 16bit components 1056 // 256i - 0 1 2 3 4 5 6 7 1057 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 1058 // 1059 } 1060 else 1061 { 1062 vGatherResult[1] = vGatherMaskedVal; 1063 } 1064 1065 // Shuffle gathered components into place, each row is a component 1066 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); 1067 } 1068 break; 1069 case 32: 1070 { 1071 // apply defaults 1072 for (uint32_t i = 0; i < 4; ++i) 1073 { 1074 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); 1075 } 1076 1077 for(uint32_t i = 0; i < info.numComps; i++) 1078 { 1079 uint32_t swizzleIndex = info.swizzle[i]; 1080 1081 // save mask as it is zero'd out after each gather 1082 Value *vMask = mask; 1083 1084 // Gather a SIMD of components 1085 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); 1086 1087 // offset base to the next component to gather 1088 pSrcBase = GEP(pSrcBase, C((char)4)); 1089 } 1090 } 1091 break; 1092 default: 1093 SWR_ASSERT(0, "Invalid float format"); 1094 break; 1095 } 1096 } 1097 GATHER4DD(const SWR_FORMAT_INFO & info,Value * pSrcBase,Value * byteOffsets,Value * mask,Value * vGatherComponents[],bool bPackedOutput)1098 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, 1099 Value* mask, Value* vGatherComponents[], bool bPackedOutput) 1100 { 1101 switch (info.bpp / info.numComps) 1102 { 1103 case 8: 1104 { 1105 Value* vGatherMaskedVal = VIMMED1((int32_t)0); 1106 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1)); 1107 // e.g. 
result of an 8x32bit integer gather for 8bit components 1108 // 256i - 0 1 2 3 4 5 6 7 1109 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 1110 1111 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); 1112 } 1113 break; 1114 case 16: 1115 { 1116 Value* vGatherResult[2]; 1117 Value *vMask; 1118 1119 // TODO: vGatherMaskedVal 1120 Value* vGatherMaskedVal = VIMMED1((int32_t)0); 1121 1122 // always have at least one component out of x or y to fetch 1123 1124 // save mask as it is zero'd out after each gather 1125 vMask = mask; 1126 1127 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); 1128 // e.g. result of first 8x32bit integer gather for 16bit components 1129 // 256i - 0 1 2 3 4 5 6 7 1130 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy 1131 // 1132 1133 // if we have at least one component out of x or y to fetch 1134 if(info.numComps > 2) 1135 { 1136 // offset base to the next components(zw) in the vertex to gather 1137 pSrcBase = GEP(pSrcBase, C((char)4)); 1138 vMask = mask; 1139 1140 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); 1141 // e.g. 
result of second 8x32bit integer gather for 16bit components 1142 // 256i - 0 1 2 3 4 5 6 7 1143 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 1144 // 1145 } 1146 else 1147 { 1148 vGatherResult[1] = vGatherMaskedVal; 1149 } 1150 1151 // Shuffle gathered components into place, each row is a component 1152 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); 1153 1154 } 1155 break; 1156 case 32: 1157 { 1158 // apply defaults 1159 for (uint32_t i = 0; i < 4; ++i) 1160 { 1161 vGatherComponents[i] = VIMMED1((int)info.defaults[i]); 1162 } 1163 1164 for(uint32_t i = 0; i < info.numComps; i++) 1165 { 1166 uint32_t swizzleIndex = info.swizzle[i]; 1167 1168 // save mask as it is zero'd out after each gather 1169 Value *vMask = mask; 1170 1171 // Gather a SIMD of components 1172 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); 1173 1174 // offset base to the next component to gather 1175 pSrcBase = GEP(pSrcBase, C((char)4)); 1176 } 1177 } 1178 break; 1179 default: 1180 SWR_ASSERT(0, "unsupported format"); 1181 break; 1182 } 1183 } 1184 Shuffle16bpcGather4(const SWR_FORMAT_INFO & info,Value * vGatherInput[2],Value * vGatherOutput[4],bool bPackedOutput)1185 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) 1186 { 1187 // cast types 1188 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); 1189 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits 1190 1191 // input could either be float or int vector; do shuffle work in int 1192 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); 1193 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); 1194 1195 if(bPackedOutput) 1196 { 1197 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits 1198 1199 // shuffle mask 1200 Value* 
vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 1201 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); 1202 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); 1203 // after pshufb: group components together in each 128bit lane 1204 // 256i - 0 1 2 3 4 5 6 7 1205 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy 1206 1207 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); 1208 // after PERMD: move and pack xy components into each 128bit lane 1209 // 256i - 0 1 2 3 4 5 6 7 1210 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy 1211 1212 // do the same for zw components 1213 Value* vi128ZW = nullptr; 1214 if(info.numComps > 2) 1215 { 1216 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); 1217 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); 1218 } 1219 1220 for(uint32_t i = 0; i < 4; i++) 1221 { 1222 uint32_t swizzleIndex = info.swizzle[i]; 1223 // todo: fixed for packed 1224 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); 1225 if(i >= info.numComps) 1226 { 1227 // set the default component val 1228 vGatherOutput[swizzleIndex] = vGatherMaskedVal; 1229 continue; 1230 } 1231 1232 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 1233 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; 1234 // if x or y, use vi128XY permute result, else use vi128ZW 1235 Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; 1236 1237 // extract packed component 128 bit lanes 1238 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); 1239 } 1240 1241 } 1242 else 1243 { 1244 // pshufb masks for each component 1245 Value* vConstMask[2]; 1246 // x/z shuffle mask 1247 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, 1248 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); 1249 1250 // y/w shuffle mask 1251 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, 1252 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); 1253 1254 1255 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits 1256 // apply defaults 1257 for (uint32_t i = 0; i < 4; ++i) 1258 { 1259 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); 1260 } 1261 1262 for(uint32_t i = 0; i < info.numComps; i++) 1263 { 1264 uint32_t swizzleIndex = info.swizzle[i]; 1265 1266 // select correct constMask for x/z or y/w pshufb 1267 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; 1268 // if x or y, use vi128XY permute result, else use vi128ZW 1269 uint32_t selectedGather = (i < 2) ? 
0 : 1; 1270 1271 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); 1272 // after pshufb mask for x channel; z uses the same shuffle from the second gather 1273 // 256i - 0 1 2 3 4 5 6 7 1274 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 1275 } 1276 } 1277 } 1278 Shuffle8bpcGather4(const SWR_FORMAT_INFO & info,Value * vGatherInput,Value * vGatherOutput[],bool bPackedOutput)1279 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) 1280 { 1281 // cast types 1282 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); 1283 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits 1284 1285 if(bPackedOutput) 1286 { 1287 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits 1288 // shuffle mask 1289 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 1290 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); 1291 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); 1292 // after pshufb: group components together in each 128bit lane 1293 // 256i - 0 1 2 3 4 5 6 7 1294 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww 1295 1296 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); 1297 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane 1298 // 256i - 0 1 2 3 4 5 6 7 1299 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) 1300 1301 // do the same for zw components 1302 Value* vi128ZW = nullptr; 1303 if(info.numComps > 2) 1304 { 1305 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); 1306 } 1307 1308 // sign extend all enabled components. 
If we have a fill vVertexElements, output to current simdvertex 1309 for(uint32_t i = 0; i < 4; i++) 1310 { 1311 uint32_t swizzleIndex = info.swizzle[i]; 1312 // todo: fix for packed 1313 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); 1314 if(i >= info.numComps) 1315 { 1316 // set the default component val 1317 vGatherOutput[swizzleIndex] = vGatherMaskedVal; 1318 continue; 1319 } 1320 1321 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 1322 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; 1323 // if x or y, use vi128XY permute result, else use vi128ZW 1324 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; 1325 1326 // sign extend 1327 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); 1328 } 1329 } 1330 // else zero extend 1331 else{ 1332 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits 1333 // apply defaults 1334 for (uint32_t i = 0; i < 4; ++i) 1335 { 1336 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); 1337 } 1338 1339 for(uint32_t i = 0; i < info.numComps; i++){ 1340 uint32_t swizzleIndex = info.swizzle[i]; 1341 1342 // pshufb masks for each component 1343 Value* vConstMask; 1344 switch(i) 1345 { 1346 case 0: 1347 // x shuffle mask 1348 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, 1349 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); 1350 break; 1351 case 1: 1352 // y shuffle mask 1353 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, 1354 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); 1355 break; 1356 case 2: 1357 // z shuffle mask 1358 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, 1359 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); 1360 break; 1361 case 3: 1362 // w shuffle mask 1363 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, 1364 3, -1, -1, -1, 7, -1, 
-1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); 1365 break; 1366 default: 1367 vConstMask = nullptr; 1368 break; 1369 } 1370 1371 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); 1372 // after pshufb for x channel 1373 // 256i - 0 1 2 3 4 5 6 7 1374 // x000 x000 x000 x000 x000 x000 x000 x000 1375 } 1376 } 1377 } 1378 1379 // Helper function to create alloca in entry block of function CreateEntryAlloca(Function * pFunc,Type * pType)1380 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType) 1381 { 1382 auto saveIP = IRB()->saveIP(); 1383 IRB()->SetInsertPoint(&pFunc->getEntryBlock(), 1384 pFunc->getEntryBlock().begin()); 1385 Value* pAlloca = ALLOCA(pType); 1386 IRB()->restoreIP(saveIP); 1387 return pAlloca; 1388 } 1389 1390 ////////////////////////////////////////////////////////////////////////// 1391 /// @brief emulates a scatter operation. 1392 /// @param pDst - pointer to destination 1393 /// @param vSrc - vector of src data to scatter 1394 /// @param vOffsets - vector of byte offsets from pDst 1395 /// @param vMask - mask of valid lanes SCATTERPS(Value * pDst,Value * vSrc,Value * vOffsets,Value * vMask)1396 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) 1397 { 1398 /* Scatter algorithm 1399 1400 while(Index = BitScanForward(mask)) 1401 srcElem = srcVector[Index] 1402 offsetElem = offsetVector[Index] 1403 *(pDst + offsetElem) = srcElem 1404 Update mask (&= ~(1<<Index) 1405 1406 */ 1407 1408 BasicBlock* pCurBB = IRB()->GetInsertBlock(); 1409 Function* pFunc = pCurBB->getParent(); 1410 Type* pSrcTy = vSrc->getType()->getVectorElementType(); 1411 1412 // Store vectors on stack 1413 if (pScatterStackSrc == nullptr) 1414 { 1415 // Save off stack allocations and reuse per scatter. Significantly reduces stack 1416 // requirements for shaders with a lot of scatters. 
1417 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty); 1418 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty); 1419 } 1420 1421 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0)); 1422 Value* pOffsetsArrayPtr = pScatterStackOffsets; 1423 STORE(vSrc, pSrcArrayPtr); 1424 STORE(vOffsets, pOffsetsArrayPtr); 1425 1426 // Cast to pointers for random access 1427 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0)); 1428 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0)); 1429 1430 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty)); 1431 1432 // Get cttz function 1433 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty }); 1434 1435 // Setup loop basic block 1436 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc); 1437 1438 // compute first set bit 1439 Value* pIndex = CALL(pfnCttz, { pMask, C(false) }); 1440 1441 Value* pIsUndef = ICMP_EQ(pIndex, C(32)); 1442 1443 // Split current block 1444 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode()); 1445 1446 // Remove unconditional jump created by splitBasicBlock 1447 pCurBB->getTerminator()->eraseFromParent(); 1448 1449 // Add terminator to end of original block 1450 IRB()->SetInsertPoint(pCurBB); 1451 1452 // Add conditional branch 1453 COND_BR(pIsUndef, pPostLoop, pLoop); 1454 1455 // Add loop basic block contents 1456 IRB()->SetInsertPoint(pLoop); 1457 PHINode* pIndexPhi = PHI(mInt32Ty, 2); 1458 PHINode* pMaskPhi = PHI(mInt32Ty, 2); 1459 1460 pIndexPhi->addIncoming(pIndex, pCurBB); 1461 pMaskPhi->addIncoming(pMask, pCurBB); 1462 1463 // Extract elements for this index 1464 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi }); 1465 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi }); 1466 1467 // GEP to this offset in dst 1468 Value* pCurDst = GEP(pDst, pOffsetElem); 1469 pCurDst = 
POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0)); 1470 STORE(pSrcElem, pCurDst); 1471 1472 // Update the mask 1473 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi))); 1474 1475 // Terminator 1476 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) }); 1477 1478 pIsUndef = ICMP_EQ(pNewIndex, C(32)); 1479 COND_BR(pIsUndef, pPostLoop, pLoop); 1480 1481 // Update phi edges 1482 pIndexPhi->addIncoming(pNewIndex, pLoop); 1483 pMaskPhi->addIncoming(pNewMask, pLoop); 1484 1485 // Move builder to beginning of post loop 1486 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin()); 1487 } 1488 VABSPS(Value * a)1489 Value* Builder::VABSPS(Value* a) 1490 { 1491 Value* asInt = BITCAST(a, mSimdInt32Ty); 1492 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); 1493 return result; 1494 } 1495 ICLAMP(Value * src,Value * low,Value * high)1496 Value *Builder::ICLAMP(Value* src, Value* low, Value* high) 1497 { 1498 Value *lowCmp = ICMP_SLT(src, low); 1499 Value *ret = SELECT(lowCmp, low, src); 1500 1501 Value *highCmp = ICMP_SGT(ret, high); 1502 ret = SELECT(highCmp, high, ret); 1503 1504 return ret; 1505 } 1506 FCLAMP(Value * src,Value * low,Value * high)1507 Value *Builder::FCLAMP(Value* src, Value* low, Value* high) 1508 { 1509 Value *lowCmp = FCMP_OLT(src, low); 1510 Value *ret = SELECT(lowCmp, low, src); 1511 1512 Value *highCmp = FCMP_OGT(ret, high); 1513 ret = SELECT(highCmp, high, ret); 1514 1515 return ret; 1516 } 1517 FCLAMP(Value * src,float low,float high)1518 Value *Builder::FCLAMP(Value* src, float low, float high) 1519 { 1520 Value* result = VMAXPS(src, VIMMED1(low)); 1521 result = VMINPS(result, VIMMED1(high)); 1522 1523 return result; 1524 } 1525 1526 ////////////////////////////////////////////////////////////////////////// 1527 /// @brief save/restore stack, providing ability to push/pop the stack and 1528 /// reduce overall stack requirements for temporary stack use STACKSAVE()1529 Value* Builder::STACKSAVE() 1530 { 1531 Function* 
pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); 1532 #if HAVE_LLVM == 0x306 1533 return CALL(pfnStackSave); 1534 #else 1535 return CALLA(pfnStackSave); 1536 #endif 1537 } 1538 STACKRESTORE(Value * pSaved)1539 void Builder::STACKRESTORE(Value* pSaved) 1540 { 1541 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore); 1542 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved}); 1543 } 1544 FMADDPS(Value * a,Value * b,Value * c)1545 Value *Builder::FMADDPS(Value* a, Value* b, Value* c) 1546 { 1547 Value* vOut; 1548 // use FMADs if available 1549 if(JM()->mArch.AVX2()) 1550 { 1551 vOut = VFMADDPS(a, b, c); 1552 } 1553 else 1554 { 1555 vOut = FADD(FMUL(a, b), c); 1556 } 1557 return vOut; 1558 } 1559 POPCNT(Value * a)1560 Value* Builder::POPCNT(Value* a) 1561 { 1562 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() }); 1563 return CALL(pCtPop, std::initializer_list<Value*>{a}); 1564 } 1565 1566 ////////////////////////////////////////////////////////////////////////// 1567 /// @brief C functions called by LLVM IR 1568 ////////////////////////////////////////////////////////////////////////// 1569 1570 ////////////////////////////////////////////////////////////////////////// 1571 /// @brief called in JIT code, inserted by PRINT 1572 /// output to both stdout and visual studio debug console CallPrint(const char * fmt,...)1573 void __cdecl CallPrint(const char* fmt, ...) 
1574 { 1575 va_list args; 1576 va_start(args, fmt); 1577 vprintf(fmt, args); 1578 1579 #if defined( _WIN32 ) 1580 char strBuf[1024]; 1581 vsnprintf_s(strBuf, _TRUNCATE, fmt, args); 1582 OutputDebugString(strBuf); 1583 #endif 1584 1585 va_end(args); 1586 } 1587 VEXTRACTI128(Value * a,Constant * imm8)1588 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) 1589 { 1590 #if HAVE_LLVM == 0x306 1591 Function *func = 1592 Intrinsic::getDeclaration(JM()->mpCurrentModule, 1593 Intrinsic::x86_avx_vextractf128_si_256); 1594 return CALL(func, {a, imm8}); 1595 #else 1596 bool flag = !imm8->isZeroValue(); 1597 SmallVector<Constant*,8> idx; 1598 for (unsigned i = 0; i < mVWidth / 2; i++) { 1599 idx.push_back(C(flag ? i + mVWidth / 2 : i)); 1600 } 1601 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); 1602 #endif 1603 } 1604 VINSERTI128(Value * a,Value * b,Constant * imm8)1605 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) 1606 { 1607 #if HAVE_LLVM == 0x306 1608 Function *func = 1609 Intrinsic::getDeclaration(JM()->mpCurrentModule, 1610 Intrinsic::x86_avx_vinsertf128_si_256); 1611 return CALL(func, {a, b, imm8}); 1612 #else 1613 bool flag = !imm8->isZeroValue(); 1614 SmallVector<Constant*,8> idx; 1615 for (unsigned i = 0; i < mVWidth; i++) { 1616 idx.push_back(C(i)); 1617 } 1618 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); 1619 1620 SmallVector<Constant*,8> idx2; 1621 for (unsigned i = 0; i < mVWidth / 2; i++) { 1622 idx2.push_back(C(flag ? i : i + mVWidth)); 1623 } 1624 for (unsigned i = mVWidth / 2; i < mVWidth; i++) { 1625 idx2.push_back(C(flag ? 
i + mVWidth / 2 : i)); 1626 } 1627 return VSHUFFLE(a, inter, ConstantVector::get(idx2)); 1628 #endif 1629 } 1630 1631 // rdtsc buckets macros RDTSC_START(Value * pBucketMgr,Value * pId)1632 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId) 1633 { 1634 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into 1635 // buckets framework when single threaded 1636 if (KNOB_SINGLE_THREADED) 1637 { 1638 std::vector<Type*> args{ 1639 PointerType::get(mInt32Ty, 0), // pBucketMgr 1640 mInt32Ty // id 1641 }; 1642 1643 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); 1644 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy)); 1645 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr) 1646 { 1647 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket); 1648 } 1649 1650 CALL(pFunc, { pBucketMgr, pId }); 1651 } 1652 } 1653 RDTSC_STOP(Value * pBucketMgr,Value * pId)1654 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId) 1655 { 1656 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into 1657 // buckets framework when single threaded 1658 if (KNOB_SINGLE_THREADED) 1659 { 1660 std::vector<Type*> args{ 1661 PointerType::get(mInt32Ty, 0), // pBucketMgr 1662 mInt32Ty // id 1663 }; 1664 1665 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); 1666 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy)); 1667 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr) 1668 { 1669 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket); 1670 } 1671 1672 CALL(pFunc, { pBucketMgr, pId }); 1673 } 1674 } 1675 1676 } 1677