/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file builder_misc.cpp
 *
 * @brief Implementation for miscellaneous builder functions
 *
 * Notes:
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>

namespace SwrJit
{
    void __cdecl CallPrint(const char* fmt, ...);

    //////////////////////////////////////////////////////////////////////////
    /// @brief Convert an IEEE 754 32-bit single precision float to a
    ///        16-bit float with 5 exponent bits and a variable
    ///        number of mantissa bits.
    /// @param val - 32-bit float
    /// @todo Maybe move this outside of this file into a header?
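    /// Worked example (added for illustration): 1.0f has bit pattern 0x3F800000,
    /// i.e. sign 0, exponent 0x7F, mantissa 0. It takes the normal-range path below,
    /// so exp becomes 0x7F - 0x70 = 0xF and mant stays 0, packing to
    /// (0xF << 10) = 0x3C00, the half-precision encoding of 1.0. The re-bias by 0x70
    /// works because the fp16 exponent bias (15) is the fp32 bias (127) minus 112.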
    static uint16_t ConvertFloat32ToFloat16(float val)
    {
        uint32_t sign, exp, mant;
        uint32_t roundBits;

        // Extract the sign, exponent, and mantissa
        uint32_t uf = *(uint32_t*)&val;
        sign = (uf & 0x80000000) >> 31;
        exp = (uf & 0x7F800000) >> 23;
        mant = uf & 0x007FFFFF;

        // Check for out of range
        if (std::isnan(val))
        {
            exp = 0x1F;
            mant = 0x200;
            sign = 1;  // set the sign bit for NANs
        }
        else if (std::isinf(val))
        {
            exp = 0x1f;
            mant = 0x0;
        }
        else if (exp > (0x70 + 0x1E))  // Too big to represent -> max representable value
        {
            exp = 0x1E;
            mant = 0x3FF;
        }
        else if ((exp <= 0x70) && (exp >= 0x66))  // It's a denorm
        {
            mant |= 0x00800000;
            for (; exp <= 0x70; mant >>= 1, exp++)
                ;
            exp = 0;
            mant = mant >> 13;
        }
        else if (exp < 0x66)  // Too small to represent -> Zero
        {
            exp = 0;
            mant = 0;
        }
        else
        {
            // Saves bits that will be shifted off for rounding
            roundBits = mant & 0x1FFFu;
            // convert exponent and mantissa to 16 bit format
            exp = exp - 0x70;
            mant = mant >> 13;

            // Essentially RTZ, but round up if off by only 1 lsb
            if (roundBits == 0x1FFFu)
            {
                mant++;
                // check for overflow
                if ((mant & 0xC00u) != 0)
                    exp++;
                // make sure only the needed bits are used
                mant &= 0x3FF;
            }
        }

        uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
        return (uint16_t)tmpVal;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
    ///        float
    /// @param val - 16-bit float
    /// @todo Maybe move this outside of this file into a header?
    static float ConvertFloat16ToFloat32(uint32_t val)
    {
        uint32_t result;
        if ((val & 0x7fff) == 0)
        {
            result = ((uint32_t)(val & 0x8000)) << 16;
        }
        else if ((val & 0x7c00) == 0x7c00)
        {
            result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
            result |= ((uint32_t)val & 0x8000) << 16;
        }
        else
        {
            uint32_t sign = (val & 0x8000) << 16;
            uint32_t mant = (val & 0x3ff) << 13;
            uint32_t exp = (val >> 10) & 0x1f;
            if ((exp == 0) && (mant != 0))  // Adjust exponent and mantissa for denormals
            {
                mant <<= 1;
                while (mant < (0x400 << 13))
                {
                    exp--;
                    mant <<= 1;
                }
                mant &= (0x3ff << 13);
            }
            exp = ((exp - 15 + 127) & 0xff) << 23;
            result = sign | exp | mant;
        }

        return *(float*)&result;
    }

    Constant *Builder::C(bool i)
    {
        return ConstantInt::get(IRB()->getInt1Ty(), (i ?
1 : 0)); 153 } 154 C(char i)155 Constant *Builder::C(char i) 156 { 157 return ConstantInt::get(IRB()->getInt8Ty(), i); 158 } 159 C(uint8_t i)160 Constant *Builder::C(uint8_t i) 161 { 162 return ConstantInt::get(IRB()->getInt8Ty(), i); 163 } 164 C(int i)165 Constant *Builder::C(int i) 166 { 167 return ConstantInt::get(IRB()->getInt32Ty(), i); 168 } 169 C(int64_t i)170 Constant *Builder::C(int64_t i) 171 { 172 return ConstantInt::get(IRB()->getInt64Ty(), i); 173 } 174 C(uint16_t i)175 Constant *Builder::C(uint16_t i) 176 { 177 return ConstantInt::get(mInt16Ty,i); 178 } 179 C(uint32_t i)180 Constant *Builder::C(uint32_t i) 181 { 182 return ConstantInt::get(IRB()->getInt32Ty(), i); 183 } 184 C(float i)185 Constant *Builder::C(float i) 186 { 187 return ConstantFP::get(IRB()->getFloatTy(), i); 188 } 189 PRED(bool pred)190 Constant *Builder::PRED(bool pred) 191 { 192 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0)); 193 } 194 VIMMED1(int i)195 Value *Builder::VIMMED1(int i) 196 { 197 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); 198 } 199 VIMMED1_16(int i)200 Value *Builder::VIMMED1_16(int i) 201 { 202 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); 203 } 204 VIMMED1(uint32_t i)205 Value *Builder::VIMMED1(uint32_t i) 206 { 207 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); 208 } 209 VIMMED1_16(uint32_t i)210 Value *Builder::VIMMED1_16(uint32_t i) 211 { 212 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); 213 } 214 VIMMED1(float i)215 Value *Builder::VIMMED1(float i) 216 { 217 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); 218 } 219 VIMMED1_16(float i)220 Value *Builder::VIMMED1_16(float i) 221 { 222 return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i))); 223 } 224 VIMMED1(bool i)225 Value *Builder::VIMMED1(bool i) 226 { 227 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); 228 } 229 VIMMED1_16(bool i)230 Value *Builder::VIMMED1_16(bool i) 231 { 232 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i))); 233 } 234 VUNDEF_IPTR()235 Value *Builder::VUNDEF_IPTR() 236 { 237 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); 238 } 239 VUNDEF(Type * t)240 Value *Builder::VUNDEF(Type* t) 241 { 242 return UndefValue::get(VectorType::get(t, mVWidth)); 243 } 244 VUNDEF_I()245 Value *Builder::VUNDEF_I() 246 { 247 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); 248 } 249 VUNDEF_I_16()250 Value *Builder::VUNDEF_I_16() 251 { 252 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16)); 253 } 254 VUNDEF_F()255 Value *Builder::VUNDEF_F() 256 { 257 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); 258 } 259 VUNDEF_F_16()260 Value *Builder::VUNDEF_F_16() 261 { 262 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16)); 263 } 264 VUNDEF(Type * ty,uint32_t size)265 Value *Builder::VUNDEF(Type *ty, uint32_t size) 266 { 267 return UndefValue::get(VectorType::get(ty, size)); 268 } 269 VBROADCAST(Value * src,const llvm::Twine & name)270 Value *Builder::VBROADCAST(Value *src, const llvm::Twine& name) 271 { 272 // check if src is already a vector 273 if (src->getType()->isVectorTy()) 274 { 275 return src; 276 } 277 278 return VECTOR_SPLAT(mVWidth, src, name); 279 } 280 VBROADCAST_16(Value * src)281 Value *Builder::VBROADCAST_16(Value *src) 282 { 283 // check if src is already a vector 284 if (src->getType()->isVectorTy()) 285 { 286 return src; 287 } 288 289 return VECTOR_SPLAT(mVWidth16, src); 290 } 291 IMMED(Value * v)292 uint32_t 
Builder::IMMED(Value* v)
    {
        SWR_ASSERT(isa<ConstantInt>(v));
        ConstantInt *pValConst = cast<ConstantInt>(v);
        return pValConst->getZExtValue();
    }

    int32_t Builder::S_IMMED(Value* v)
    {
        SWR_ASSERT(isa<ConstantInt>(v));
        ConstantInt *pValConst = cast<ConstantInt>(v);
        return pValConst->getSExtValue();
    }

    Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList, const llvm::Twine& name)
    {
        std::vector<Value*> args;
        for (auto arg : argsList)
            args.push_back(arg);
        return CALLA(Callee, args, name);
    }

    CallInst *Builder::CALL(Value *Callee, Value* arg)
    {
        std::vector<Value*> args;
        args.push_back(arg);
        return CALLA(Callee, args);
    }
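    // Usage sketch for the initializer-list helpers above (illustrative only; the
    // names pVertexStruct and laneIdx are hypothetical and not part of this file):
    //
    //     Value*    pElem = GEP(pVertexStruct, {C(0), C(2), laneIdx});   // Value* overload
    //     LoadInst* pVal  = LOAD(pVertexStruct, {0, 2}, "member2");      // uint32_t overload wraps each index in C()
    //     STORE(pVal, pVertexStruct, {0, 3});
    //
    // Both overload families simply build the index vector and defer to GEPA()/LOAD()/STORE().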
    CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
    {
        std::vector<Value*> args;
        args.push_back(arg1);
        args.push_back(arg2);
        return CALLA(Callee, args);
    }

    CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
    {
        std::vector<Value*> args;
        args.push_back(arg1);
        args.push_back(arg2);
        args.push_back(arg3);
        return CALLA(Callee, args);
    }

    //////////////////////////////////////////////////////////////////////////
    Value *Builder::DEBUGTRAP()
    {
        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
        return CALL(func);
    }

    Value *Builder::VRCP(Value *va, const llvm::Twine& name)
    {
        return FDIV(VIMMED1(1.0f), va, name);  // 1 / a
    }

    Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
    {
        Value* vOut = FMADDPS(vA, vX, vC);
        vOut = FMADDPS(vB, vY, vOut);
        return vOut;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate an i32 masked load operation in LLVM IR. If not
    ///        supported on the underlying platform, emulate it with a float masked load
    /// @param src - base address pointer for the load
    /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
    Value *Builder::MASKLOADD(Value* src, Value* mask)
    {
        Value* vResult;
        // use avx2 masked load instruction if available
        if(JM()->mArch.AVX2())
        {
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
            vResult = CALL(func, {src, mask});
        }
        else
        {
            // maskload intrinsic expects integer mask operand in llvm >= 3.8
#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
            mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
#else
            mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
#endif
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
            vResult = BITCAST(CALL(func, {src, mask}), VectorType::get(mInt32Ty, mVWidth));
        }
        return vResult;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief insert a JIT call to CallPrint
    /// - outputs formatted string to both stdout and VS output window
    /// - DEBUG builds only
    /// Usage example:
    ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
    /// where C(lane) creates a constant value to print, and pIndex is the Value*
    /// result from a GEP, printing out the pointer to memory
    /// @param printStr - constant string to print, which includes format specifiers
    /// @param printArgs - initializer list of Value*'s to print to std out
    CallInst *Builder::PRINT(const std::string &printStr, const std::initializer_list<Value*> &printArgs)
    {
        // push the arguments to CallPrint into a vector
        std::vector<Value*> printCallArgs;
        // save room for the format string.
we still need to modify it for vectors 464 printCallArgs.resize(1); 465 466 // search through the format string for special processing 467 size_t pos = 0; 468 std::string tempStr(printStr); 469 pos = tempStr.find('%', pos); 470 auto v = printArgs.begin(); 471 472 while ((pos != std::string::npos) && (v != printArgs.end())) 473 { 474 Value* pArg = *v; 475 Type* pType = pArg->getType(); 476 477 if (pType->isVectorTy()) 478 { 479 Type* pContainedType = pType->getContainedType(0); 480 481 if (toupper(tempStr[pos + 1]) == 'X') 482 { 483 tempStr[pos] = '0'; 484 tempStr[pos + 1] = 'x'; 485 tempStr.insert(pos + 2, "%08X "); 486 pos += 7; 487 488 printCallArgs.push_back(VEXTRACT(pArg, C(0))); 489 490 std::string vectorFormatStr; 491 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i) 492 { 493 vectorFormatStr += "0x%08X "; 494 printCallArgs.push_back(VEXTRACT(pArg, C(i))); 495 } 496 497 tempStr.insert(pos, vectorFormatStr); 498 pos += vectorFormatStr.size(); 499 } 500 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy())) 501 { 502 uint32_t i = 0; 503 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++) 504 { 505 tempStr.insert(pos, std::string("%f ")); 506 pos += 3; 507 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); 508 } 509 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); 510 } 511 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy())) 512 { 513 uint32_t i = 0; 514 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++) 515 { 516 tempStr.insert(pos, std::string("%d ")); 517 pos += 3; 518 printCallArgs.push_back(VEXTRACT(pArg, C(i))); 519 } 520 printCallArgs.push_back(VEXTRACT(pArg, C(i))); 521 } 522 } 523 else 524 { 525 if (toupper(tempStr[pos + 1]) == 'X') 526 { 527 tempStr[pos] = '0'; 528 tempStr.insert(pos + 1, "x%08"); 529 printCallArgs.push_back(pArg); 530 pos += 3; 531 } 532 // for %f we need to cast float Values to doubles so that they print out correctly 533 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy())) 534 { 535 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext))); 536 pos++; 537 } 538 else 539 { 540 printCallArgs.push_back(pArg); 541 } 542 } 543 544 // advance to the next arguement 545 v++; 546 pos = tempStr.find('%', ++pos); 547 } 548 549 // create global variable constant string 550 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true); 551 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr"); 552 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr); 553 554 // get a pointer to the first character in the constant string array 555 std::vector<Constant*> geplist{C(0),C(0)}; 556 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false); 557 558 // insert the pointer to the format string in the argument vector 559 printCallArgs[0] = strGEP; 560 561 // get pointer to CallPrint function and insert decl into the module if needed 562 std::vector<Type*> args; 563 args.push_back(PointerType::get(mInt8Ty,0)); 564 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true); 565 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); 566 567 // if we haven't yet added the symbol to the symbol table 568 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) 569 { 570 
sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint); 571 } 572 573 // insert a call to CallPrint 574 return CALLA(callPrintFn,printCallArgs); 575 } 576 577 ////////////////////////////////////////////////////////////////////////// 578 /// @brief Wrapper around PRINT with initializer list. PRINT(const std::string & printStr)579 CallInst* Builder::PRINT(const std::string &printStr) 580 { 581 return PRINT(printStr, {}); 582 } 583 584 ////////////////////////////////////////////////////////////////////////// 585 /// @brief Generate a masked gather operation in LLVM IR. If not 586 /// supported on the underlying platform, emulate it with loads 587 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid 588 /// @param pBase - Int8* base VB address pointer value 589 /// @param vIndices - SIMD wide value of VB byte offsets 590 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values 591 /// @param scale - value to scale indices by GATHERPS(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale)592 Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) 593 { 594 Value *vGather; 595 596 // use avx2 gather instruction if available 597 if(JM()->mArch.AVX2()) 598 { 599 // force mask to <N x float>, required by vgather 600 Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty); 601 602 vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale)); 603 } 604 else 605 { 606 Value* pStack = STACKSAVE(); 607 608 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address 609 Value* vSrcPtr = ALLOCA(vSrc->getType()); 610 STORE(vSrc, vSrcPtr); 611 612 vGather = VUNDEF_F(); 613 Value *vScaleVec = VIMMED1((uint32_t)scale); 614 Value *vOffsets = MUL(vIndices,vScaleVec); 615 for(uint32_t i = 0; i < mVWidth; ++i) 616 { 617 // single component byte index 618 Value *offset = VEXTRACT(vOffsets,C(i)); 619 // byte pointer to component 620 Value *loadAddress = GEP(pBase,offset); 621 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0)); 622 // pointer to the value to load if we're masking off a component 623 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); 624 Value *selMask = VEXTRACT(vMask,C(i)); 625 // switch in a safe address to load if we're trying to access a vertex 626 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); 627 Value *val = LOAD(validAddress); 628 vGather = VINSERT(vGather,val,C(i)); 629 } 630 631 STACKRESTORE(pStack); 632 } 633 634 return vGather; 635 } 636 GATHERPS_16(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale)637 Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) 638 { 639 Value *vGather = VUNDEF_F_16(); 640 641 // use AVX512F gather instruction if available 642 if (JM()->mArch.AVX512F()) 643 { 644 // force mask to <N-bit Integer>, required by vgather2 645 Value *mask = BITCAST(vMask, mInt16Ty); 646 647 vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); 648 } 649 else 650 { 651 Value *src0 = EXTRACT_16(vSrc, 0); 652 Value *src1 = EXTRACT_16(vSrc, 1); 653 654 Value *indices0 = EXTRACT_16(vIndices, 0); 655 Value *indices1 = EXTRACT_16(vIndices, 1); 656 657 Value *mask0 = EXTRACT_16(vMask, 0); 658 Value *mask1 = EXTRACT_16(vMask, 1); 659 660 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale); 661 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale); 662 663 vGather = JOIN_16(gather0, gather1); 664 } 
665 666 return vGather; 667 } 668 669 ////////////////////////////////////////////////////////////////////////// 670 /// @brief Generate a masked gather operation in LLVM IR. If not 671 /// supported on the underlying platform, emulate it with loads 672 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid 673 /// @param pBase - Int8* base VB address pointer value 674 /// @param vIndices - SIMD wide value of VB byte offsets 675 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values 676 /// @param scale - value to scale indices by GATHERDD(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale)677 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) 678 { 679 Value* vGather; 680 681 // use avx2 gather instruction if available 682 if(JM()->mArch.AVX2()) 683 { 684 vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale)); 685 } 686 else 687 { 688 Value* pStack = STACKSAVE(); 689 690 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address 691 Value* vSrcPtr = ALLOCA(vSrc->getType()); 692 STORE(vSrc, vSrcPtr); 693 694 vGather = VUNDEF_I(); 695 Value *vScaleVec = VIMMED1((uint32_t)scale); 696 Value *vOffsets = MUL(vIndices, vScaleVec); 697 for(uint32_t i = 0; i < mVWidth; ++i) 698 { 699 // single component byte index 700 Value *offset = VEXTRACT(vOffsets, C(i)); 701 // byte pointer to component 702 Value *loadAddress = GEP(pBase, offset); 703 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0)); 704 // pointer to the value to load if we're masking off a component 705 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)}); 706 Value *selMask = VEXTRACT(vMask, C(i)); 707 // switch in a safe address to load if we're trying to access a vertex 708 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); 709 Value *val = LOAD(validAddress, C(0)); 710 vGather = VINSERT(vGather, val, C(i)); 711 } 712 713 STACKRESTORE(pStack); 714 } 715 716 return vGather; 717 } 718 GATHERDD_16(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale)719 Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) 720 { 721 Value *vGather = VUNDEF_I_16(); 722 723 // use AVX512F gather instruction if available 724 if (JM()->mArch.AVX512F()) 725 { 726 // force mask to <N-bit Integer>, required by vgather2 727 Value *mask = BITCAST(vMask, mInt16Ty); 728 729 vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); 730 } 731 else 732 { 733 Value *src0 = EXTRACT_16(vSrc, 0); 734 Value *src1 = EXTRACT_16(vSrc, 1); 735 736 Value *indices0 = EXTRACT_16(vIndices, 0); 737 Value *indices1 = EXTRACT_16(vIndices, 1); 738 739 Value *mask0 = EXTRACT_16(vMask, 0); 740 Value *mask1 = EXTRACT_16(vMask, 1); 741 742 Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale); 743 Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale); 744 745 vGather = JOIN_16(gather0, gather1); 746 } 747 748 return vGather; 749 } 750 751 ////////////////////////////////////////////////////////////////////////// 752 /// @brief Generate a masked gather operation in LLVM IR. 
If not 753 /// supported on the underlying platform, emulate it with loads 754 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid 755 /// @param pBase - Int8* base VB address pointer value 756 /// @param vIndices - SIMD wide value of VB byte offsets 757 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values 758 /// @param scale - value to scale indices by GATHERPD(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale)759 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) 760 { 761 Value* vGather; 762 763 // use avx2 gather instruction if available 764 if(JM()->mArch.AVX2()) 765 { 766 vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2)); 767 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale)); 768 } 769 else 770 { 771 Value* pStack = STACKSAVE(); 772 773 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address 774 Value* vSrcPtr = ALLOCA(vSrc->getType()); 775 STORE(vSrc, vSrcPtr); 776 777 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4)); 778 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale)); 779 Value *vOffsets = MUL(vIndices,vScaleVec); 780 for(uint32_t i = 0; i < mVWidth/2; ++i) 781 { 782 // single component byte index 783 Value *offset = VEXTRACT(vOffsets,C(i)); 784 // byte pointer to component 785 Value *loadAddress = GEP(pBase,offset); 786 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0)); 787 // pointer to the value to load if we're masking off a component 788 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); 789 Value *selMask = VEXTRACT(vMask,C(i)); 790 // switch in a safe address to load if we're trying to access a vertex 791 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); 792 Value *val = LOAD(validAddress); 793 vGather = VINSERT(vGather,val,C(i)); 794 } 795 STACKRESTORE(pStack); 796 } 797 return vGather; 798 } 799 EXTRACT_16(Value * x,uint32_t imm)800 Value *Builder::EXTRACT_16(Value *x, uint32_t imm) 801 { 802 if (imm == 0) 803 { 804 return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 }); 805 } 806 else 807 { 808 return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 }); 809 } 810 } 811 JOIN_16(Value * a,Value * b)812 Value *Builder::JOIN_16(Value *a, Value *b) 813 { 814 return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }); 815 } 816 817 ////////////////////////////////////////////////////////////////////////// 818 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask MASK(Value * vmask)819 Value *Builder::MASK(Value *vmask) 820 { 821 Value *src = BITCAST(vmask, mSimdInt32Ty); 822 return ICMP_SLT(src, VIMMED1(0)); 823 } 824 MASK_16(Value * vmask)825 Value *Builder::MASK_16(Value *vmask) 826 { 827 Value *src = BITCAST(vmask, mSimd16Int32Ty); 828 return ICMP_SLT(src, VIMMED1_16(0)); 829 } 830 831 ////////////////////////////////////////////////////////////////////////// 832 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask VMASK(Value * mask)833 Value *Builder::VMASK(Value *mask) 834 { 835 return S_EXT(mask, mSimdInt32Ty); 836 } 837 VMASK_16(Value * mask)838 Value *Builder::VMASK_16(Value *mask) 839 { 840 return S_EXT(mask, mSimd16Int32Ty); 841 } 842 843 ////////////////////////////////////////////////////////////////////////// 844 /// @brief Generate a VPSHUFB operation in LLVM IR. 
If not 845 /// supported on the underlying platform, emulate it 846 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values 847 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values 848 /// Byte masks in lower 128 lane of b selects 8 bit values from lower 849 /// 128bits of a, and vice versa for the upper lanes. If the mask 850 /// value is negative, '0' is inserted. PSHUFB(Value * a,Value * b)851 Value *Builder::PSHUFB(Value* a, Value* b) 852 { 853 Value* res; 854 // use avx2 pshufb instruction if available 855 if(JM()->mArch.AVX2()) 856 { 857 res = VPSHUFB(a, b); 858 } 859 else 860 { 861 Constant* cB = dyn_cast<Constant>(b); 862 // number of 8 bit elements in b 863 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements(); 864 // output vector 865 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms)); 866 867 // insert an 8 bit value from the high and low lanes of a per loop iteration 868 numElms /= 2; 869 for(uint32_t i = 0; i < numElms; i++) 870 { 871 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i)); 872 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms)); 873 874 // extract values from constant mask 875 char valLow128bLane = (char)(cLow128b->getSExtValue()); 876 char valHigh128bLane = (char)(cHigh128b->getSExtValue()); 877 878 Value* insertValLow128b; 879 Value* insertValHigh128b; 880 881 // if the mask value is negative, insert a '0' in the respective output position 882 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector 883 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF))); 884 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms)); 885 886 vShuf = VINSERT(vShuf, insertValLow128b, i); 887 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms)); 888 } 889 res = vShuf; 890 } 891 return res; 892 } 893 894 ////////////////////////////////////////////////////////////////////////// 895 /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 896 /// bits)in LLVM IR. If not supported on the underlying platform, emulate it 897 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only 898 /// lower 8 values are used. PMOVSXBD(Value * a)899 Value *Builder::PMOVSXBD(Value* a) 900 { 901 // VPMOVSXBD output type 902 Type* v8x32Ty = VectorType::get(mInt32Ty, 8); 903 // Extract 8 values from 128bit lane and sign extend 904 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); 905 } 906 907 ////////////////////////////////////////////////////////////////////////// 908 /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32 909 /// bits)in LLVM IR. If not supported on the underlying platform, emulate it 910 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values. PMOVSXWD(Value * a)911 Value *Builder::PMOVSXWD(Value* a) 912 { 913 // VPMOVSXWD output type 914 Type* v8x32Ty = VectorType::get(mInt32Ty, 8); 915 // Extract 8 values from 128bit lane and sign extend 916 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); 917 } 918 919 ////////////////////////////////////////////////////////////////////////// 920 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values 921 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying 922 /// platform, emulate it 923 /// @param a - 256bit SIMD lane(8x32bit) of integer values. 
924 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values PERMD(Value * a,Value * idx)925 Value *Builder::PERMD(Value* a, Value* idx) 926 { 927 Value* res; 928 // use avx2 permute instruction if available 929 if(JM()->mArch.AVX2()) 930 { 931 res = VPERMD(a, idx); 932 } 933 else 934 { 935 if (isa<Constant>(idx)) 936 { 937 res = VSHUFFLE(a, a, idx); 938 } 939 else 940 { 941 res = VUNDEF_I(); 942 for (uint32_t l = 0; l < JM()->mVWidth; ++l) 943 { 944 Value* pIndex = VEXTRACT(idx, C(l)); 945 Value* pVal = VEXTRACT(a, pIndex); 946 res = VINSERT(res, pVal, C(l)); 947 } 948 } 949 } 950 return res; 951 } 952 953 ////////////////////////////////////////////////////////////////////////// 954 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values 955 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying 956 /// platform, emulate it 957 /// @param a - 256bit SIMD lane(8x32bit) of float values. 958 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values PERMPS(Value * a,Value * idx)959 Value *Builder::PERMPS(Value* a, Value* idx) 960 { 961 Value* res; 962 // use avx2 permute instruction if available 963 if (JM()->mArch.AVX2()) 964 { 965 // llvm 3.6.0 swapped the order of the args to vpermd 966 res = VPERMPS(idx, a); 967 } 968 else 969 { 970 if (isa<Constant>(idx)) 971 { 972 res = VSHUFFLE(a, a, idx); 973 } 974 else 975 { 976 res = VUNDEF_F(); 977 for (uint32_t l = 0; l < JM()->mVWidth; ++l) 978 { 979 Value* pIndex = VEXTRACT(idx, C(l)); 980 Value* pVal = VEXTRACT(a, pIndex); 981 res = VINSERT(res, pVal, C(l)); 982 } 983 } 984 } 985 986 return res; 987 } 988 989 ////////////////////////////////////////////////////////////////////////// 990 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) 991 /// in LLVM IR. If not supported on the underlying platform, emulate it 992 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. CVTPH2PS(Value * a,const llvm::Twine & name)993 Value *Builder::CVTPH2PS(Value* a, const llvm::Twine& name) 994 { 995 if (JM()->mArch.F16C()) 996 { 997 return VCVTPH2PS(a, name); 998 } 999 else 1000 { 1001 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty); 1002 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy)); 1003 1004 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr) 1005 { 1006 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32); 1007 } 1008 1009 Value* pResult = UndefValue::get(mSimdFP32Ty); 1010 for (uint32_t i = 0; i < mVWidth; ++i) 1011 { 1012 Value* pSrc = VEXTRACT(a, C(i)); 1013 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc}); 1014 pResult = VINSERT(pResult, pConv, C(i)); 1015 } 1016 1017 pResult->setName(name); 1018 return pResult; 1019 } 1020 } 1021 1022 ////////////////////////////////////////////////////////////////////////// 1023 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion) 1024 /// in LLVM IR. If not supported on the underlying platform, emulate it 1025 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. 
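    /// @param rounding - rounding-mode immediate used by the hardware VCVTPS2PH path
    ///
    /// Added note: on the non-F16C fallback below, each lane is converted through the
    /// scalar ConvertFloat32ToFloat16() helper (e.g. a lane holding 1.0f becomes
    /// 0x3C00), and the rounding operand is not consulted on that path.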
CVTPS2PH(Value * a,Value * rounding)1026 Value *Builder::CVTPS2PH(Value* a, Value* rounding) 1027 { 1028 if (JM()->mArch.F16C()) 1029 { 1030 return VCVTPS2PH(a, rounding); 1031 } 1032 else 1033 { 1034 // call scalar C function for now 1035 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty); 1036 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy)); 1037 1038 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr) 1039 { 1040 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16); 1041 } 1042 1043 Value* pResult = UndefValue::get(mSimdInt16Ty); 1044 for (uint32_t i = 0; i < mVWidth; ++i) 1045 { 1046 Value* pSrc = VEXTRACT(a, C(i)); 1047 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc}); 1048 pResult = VINSERT(pResult, pConv, C(i)); 1049 } 1050 1051 return pResult; 1052 } 1053 } 1054 PMAXSD(Value * a,Value * b)1055 Value *Builder::PMAXSD(Value* a, Value* b) 1056 { 1057 Value* cmp = ICMP_SGT(a, b); 1058 return SELECT(cmp, a, b); 1059 } 1060 PMINSD(Value * a,Value * b)1061 Value *Builder::PMINSD(Value* a, Value* b) 1062 { 1063 Value* cmp = ICMP_SLT(a, b); 1064 return SELECT(cmp, a, b); 1065 } 1066 Gather4(const SWR_FORMAT format,Value * pSrcBase,Value * byteOffsets,Value * mask,Value * vGatherComponents[],bool bPackedOutput)1067 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, 1068 Value* mask, Value* vGatherComponents[], bool bPackedOutput) 1069 { 1070 const SWR_FORMAT_INFO &info = GetFormatInfo(format); 1071 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) 1072 { 1073 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); 1074 } 1075 else 1076 { 1077 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); 1078 } 1079 } 1080 GATHER4PS(const SWR_FORMAT_INFO & info,Value * pSrcBase,Value * byteOffsets,Value * vMask,Value * vGatherComponents[],bool bPackedOutput)1081 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, 1082 Value* vMask, Value* vGatherComponents[], bool bPackedOutput) 1083 { 1084 switch(info.bpp / info.numComps) 1085 { 1086 case 16: 1087 { 1088 Value* vGatherResult[2]; 1089 1090 // TODO: vGatherMaskedVal 1091 Value* vGatherMaskedVal = VIMMED1((float)0); 1092 1093 // always have at least one component out of x or y to fetch 1094 1095 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); 1096 // e.g. result of first 8x32bit integer gather for 16bit components 1097 // 256i - 0 1 2 3 4 5 6 7 1098 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy 1099 // 1100 1101 // if we have at least one component out of x or y to fetch 1102 if(info.numComps > 2) 1103 { 1104 // offset base to the next components(zw) in the vertex to gather 1105 pSrcBase = GEP(pSrcBase, C((char)4)); 1106 1107 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); 1108 // e.g. 
result of second 8x32bit integer gather for 16bit components 1109 // 256i - 0 1 2 3 4 5 6 7 1110 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 1111 // 1112 } 1113 else 1114 { 1115 vGatherResult[1] = vGatherMaskedVal; 1116 } 1117 1118 // Shuffle gathered components into place, each row is a component 1119 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); 1120 } 1121 break; 1122 case 32: 1123 { 1124 // apply defaults 1125 for (uint32_t i = 0; i < 4; ++i) 1126 { 1127 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); 1128 } 1129 1130 for(uint32_t i = 0; i < info.numComps; i++) 1131 { 1132 uint32_t swizzleIndex = info.swizzle[i]; 1133 1134 // Gather a SIMD of components 1135 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); 1136 1137 // offset base to the next component to gather 1138 pSrcBase = GEP(pSrcBase, C((char)4)); 1139 } 1140 } 1141 break; 1142 default: 1143 SWR_INVALID("Invalid float format"); 1144 break; 1145 } 1146 } 1147 GATHER4DD(const SWR_FORMAT_INFO & info,Value * pSrcBase,Value * byteOffsets,Value * vMask,Value * vGatherComponents[],bool bPackedOutput)1148 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, 1149 Value* vMask, Value* vGatherComponents[], bool bPackedOutput) 1150 { 1151 switch (info.bpp / info.numComps) 1152 { 1153 case 8: 1154 { 1155 Value* vGatherMaskedVal = VIMMED1((int32_t)0); 1156 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); 1157 // e.g. result of an 8x32bit integer gather for 8bit components 1158 // 256i - 0 1 2 3 4 5 6 7 1159 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 1160 1161 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); 1162 } 1163 break; 1164 case 16: 1165 { 1166 Value* vGatherResult[2]; 1167 1168 // TODO: vGatherMaskedVal 1169 Value* vGatherMaskedVal = VIMMED1((int32_t)0); 1170 1171 // always have at least one component out of x or y to fetch 1172 1173 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); 1174 // e.g. result of first 8x32bit integer gather for 16bit components 1175 // 256i - 0 1 2 3 4 5 6 7 1176 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy 1177 // 1178 1179 // if we have at least one component out of x or y to fetch 1180 if(info.numComps > 2) 1181 { 1182 // offset base to the next components(zw) in the vertex to gather 1183 pSrcBase = GEP(pSrcBase, C((char)4)); 1184 1185 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); 1186 // e.g. 
result of second 8x32bit integer gather for 16bit components 1187 // 256i - 0 1 2 3 4 5 6 7 1188 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 1189 // 1190 } 1191 else 1192 { 1193 vGatherResult[1] = vGatherMaskedVal; 1194 } 1195 1196 // Shuffle gathered components into place, each row is a component 1197 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); 1198 1199 } 1200 break; 1201 case 32: 1202 { 1203 // apply defaults 1204 for (uint32_t i = 0; i < 4; ++i) 1205 { 1206 vGatherComponents[i] = VIMMED1((int)info.defaults[i]); 1207 } 1208 1209 for(uint32_t i = 0; i < info.numComps; i++) 1210 { 1211 uint32_t swizzleIndex = info.swizzle[i]; 1212 1213 // Gather a SIMD of components 1214 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); 1215 1216 // offset base to the next component to gather 1217 pSrcBase = GEP(pSrcBase, C((char)4)); 1218 } 1219 } 1220 break; 1221 default: 1222 SWR_INVALID("unsupported format"); 1223 break; 1224 } 1225 } 1226 Shuffle16bpcGather4(const SWR_FORMAT_INFO & info,Value * vGatherInput[2],Value * vGatherOutput[4],bool bPackedOutput)1227 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) 1228 { 1229 // cast types 1230 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); 1231 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits 1232 1233 // input could either be float or int vector; do shuffle work in int 1234 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); 1235 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); 1236 1237 if(bPackedOutput) 1238 { 1239 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits 1240 1241 // shuffle mask 1242 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 1243 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); 1244 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); 1245 // after pshufb: group components together in each 128bit lane 1246 // 256i - 0 1 2 3 4 5 6 7 1247 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy 1248 1249 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); 1250 // after PERMD: move and pack xy components into each 128bit lane 1251 // 256i - 0 1 2 3 4 5 6 7 1252 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy 1253 1254 // do the same for zw components 1255 Value* vi128ZW = nullptr; 1256 if(info.numComps > 2) 1257 { 1258 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); 1259 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); 1260 } 1261 1262 for(uint32_t i = 0; i < 4; i++) 1263 { 1264 uint32_t swizzleIndex = info.swizzle[i]; 1265 // todo: fixed for packed 1266 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); 1267 if(i >= info.numComps) 1268 { 1269 // set the default component val 1270 vGatherOutput[swizzleIndex] = vGatherMaskedVal; 1271 continue; 1272 } 1273 1274 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 1275 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; 1276 // if x or y, use vi128XY permute result, else use vi128ZW 1277 Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; 1278 1279 // extract packed component 128 bit lanes 1280 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); 1281 } 1282 1283 } 1284 else 1285 { 1286 // pshufb masks for each component 1287 Value* vConstMask[2]; 1288 // x/z shuffle mask 1289 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, 1290 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); 1291 1292 // y/w shuffle mask 1293 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, 1294 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); 1295 1296 1297 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits 1298 // apply defaults 1299 for (uint32_t i = 0; i < 4; ++i) 1300 { 1301 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); 1302 } 1303 1304 for(uint32_t i = 0; i < info.numComps; i++) 1305 { 1306 uint32_t swizzleIndex = info.swizzle[i]; 1307 1308 // select correct constMask for x/z or y/w pshufb 1309 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; 1310 // if x or y, use vi128XY permute result, else use vi128ZW 1311 uint32_t selectedGather = (i < 2) ? 0 : 1; 1312 1313 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); 1314 // after pshufb mask for x channel; z uses the same shuffle from the second gather 1315 // 256i - 0 1 2 3 4 5 6 7 1316 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 1317 } 1318 } 1319 } 1320 Shuffle8bpcGather4(const SWR_FORMAT_INFO & info,Value * vGatherInput,Value * vGatherOutput[],bool bPackedOutput)1321 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) 1322 { 1323 // cast types 1324 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); 1325 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits 1326 1327 if(bPackedOutput) 1328 { 1329 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits 1330 // shuffle mask 1331 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 1332 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); 1333 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); 1334 // after pshufb: group components together in each 128bit lane 1335 // 256i - 0 1 2 3 4 5 6 7 1336 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww 1337 1338 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); 1339 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane 1340 // 256i - 0 1 2 3 4 5 6 7 1341 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) 1342 1343 // do the same for zw components 1344 Value* vi128ZW = nullptr; 1345 if(info.numComps > 2) 1346 { 1347 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); 1348 } 1349 1350 // sign extend all enabled components. 
If we have a fill vVertexElements, output to current simdvertex 1351 for(uint32_t i = 0; i < 4; i++) 1352 { 1353 uint32_t swizzleIndex = info.swizzle[i]; 1354 // todo: fix for packed 1355 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); 1356 if(i >= info.numComps) 1357 { 1358 // set the default component val 1359 vGatherOutput[swizzleIndex] = vGatherMaskedVal; 1360 continue; 1361 } 1362 1363 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 1364 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; 1365 // if x or y, use vi128XY permute result, else use vi128ZW 1366 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; 1367 1368 // sign extend 1369 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); 1370 } 1371 } 1372 // else zero extend 1373 else{ 1374 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits 1375 // apply defaults 1376 for (uint32_t i = 0; i < 4; ++i) 1377 { 1378 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); 1379 } 1380 1381 for(uint32_t i = 0; i < info.numComps; i++){ 1382 uint32_t swizzleIndex = info.swizzle[i]; 1383 1384 // pshufb masks for each component 1385 Value* vConstMask; 1386 switch(i) 1387 { 1388 case 0: 1389 // x shuffle mask 1390 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, 1391 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); 1392 break; 1393 case 1: 1394 // y shuffle mask 1395 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, 1396 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); 1397 break; 1398 case 2: 1399 // z shuffle mask 1400 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, 1401 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); 1402 break; 1403 case 3: 1404 // w shuffle mask 1405 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, 1406 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); 1407 break; 1408 default: 1409 vConstMask = nullptr; 1410 break; 1411 } 1412 1413 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); 1414 // after pshufb for x channel 1415 // 256i - 0 1 2 3 4 5 6 7 1416 // x000 x000 x000 x000 x000 x000 x000 x000 1417 } 1418 } 1419 } 1420 1421 // Helper function to create alloca in entry block of function CreateEntryAlloca(Function * pFunc,Type * pType)1422 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType) 1423 { 1424 auto saveIP = IRB()->saveIP(); 1425 IRB()->SetInsertPoint(&pFunc->getEntryBlock(), 1426 pFunc->getEntryBlock().begin()); 1427 Value* pAlloca = ALLOCA(pType); 1428 if (saveIP.isSet()) IRB()->restoreIP(saveIP); 1429 return pAlloca; 1430 } 1431 CreateEntryAlloca(Function * pFunc,Type * pType,Value * pArraySize)1432 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize) 1433 { 1434 auto saveIP = IRB()->saveIP(); 1435 IRB()->SetInsertPoint(&pFunc->getEntryBlock(), 1436 pFunc->getEntryBlock().begin()); 1437 Value* pAlloca = ALLOCA(pType, pArraySize); 1438 if (saveIP.isSet()) IRB()->restoreIP(saveIP); 1439 return pAlloca; 1440 } 1441 1442 ////////////////////////////////////////////////////////////////////////// 1443 /// @brief emulates a scatter operation. 
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
    {
        /* Scatter algorithm

           while(Index = BitScanForward(mask))
               srcElem = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (&= ~(1<<Index)

        */

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function* pFunc = pCurBB->getParent();
        Type* pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));

        // Get cttz function
        Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CALL(pfnCttz, { pMask, C(false) });

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block
        BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

        // Remove unconditional jump created by splitBasicBlock
        pCurBB->getTerminator()->eraseFromParent();

        // Add terminator to end of original block
        IRB()->SetInsertPoint(pCurBB);

        // Add conditional branch
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem);
        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
    }
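    // Reference behavior of the IR emitted by SCATTERPS above, written as plain C
    // for clarity (an illustrative sketch only; it is not compiled into the driver):
    //
    //     void ScatterPS_Ref(void* pDst, const float* src, const int32_t* byteOffsets, uint32_t mask)
    //     {
    //         while (mask != 0)
    //         {
    //             uint32_t lane = __builtin_ctz(mask);                    // same role as the cttz intrinsic
    //             *(float*)((char*)pDst + byteOffsets[lane]) = src[lane]; // store one live lane
    //             mask &= ~(1u << lane);                                  // retire the scattered lane
    //         }
    //     }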
1540 VABSPS(Value * a)1541 Value* Builder::VABSPS(Value* a) 1542 { 1543 Value* asInt = BITCAST(a, mSimdInt32Ty); 1544 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); 1545 return result; 1546 } 1547 ICLAMP(Value * src,Value * low,Value * high,const llvm::Twine & name)1548 Value *Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name) 1549 { 1550 Value *lowCmp = ICMP_SLT(src, low); 1551 Value *ret = SELECT(lowCmp, low, src); 1552 1553 Value *highCmp = ICMP_SGT(ret, high); 1554 ret = SELECT(highCmp, high, ret, name); 1555 1556 return ret; 1557 } 1558 FCLAMP(Value * src,Value * low,Value * high)1559 Value *Builder::FCLAMP(Value* src, Value* low, Value* high) 1560 { 1561 Value *lowCmp = FCMP_OLT(src, low); 1562 Value *ret = SELECT(lowCmp, low, src); 1563 1564 Value *highCmp = FCMP_OGT(ret, high); 1565 ret = SELECT(highCmp, high, ret); 1566 1567 return ret; 1568 } 1569 FCLAMP(Value * src,float low,float high)1570 Value *Builder::FCLAMP(Value* src, float low, float high) 1571 { 1572 Value* result = VMAXPS(src, VIMMED1(low)); 1573 result = VMINPS(result, VIMMED1(high)); 1574 1575 return result; 1576 } 1577 1578 ////////////////////////////////////////////////////////////////////////// 1579 /// @brief save/restore stack, providing ability to push/pop the stack and 1580 /// reduce overall stack requirements for temporary stack use STACKSAVE()1581 Value* Builder::STACKSAVE() 1582 { 1583 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); 1584 return CALLA(pfnStackSave); 1585 } 1586 STACKRESTORE(Value * pSaved)1587 void Builder::STACKRESTORE(Value* pSaved) 1588 { 1589 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore); 1590 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved}); 1591 } 1592 FMADDPS(Value * a,Value * b,Value * c)1593 Value *Builder::FMADDPS(Value* a, Value* b, Value* c) 1594 { 1595 Value* vOut; 1596 // use FMADs if available 1597 if(JM()->mArch.AVX2()) 1598 { 1599 vOut = VFMADDPS(a, b, c); 1600 } 1601 else 1602 { 1603 vOut = FADD(FMUL(a, b), c); 1604 } 1605 return vOut; 1606 } 1607 POPCNT(Value * a)1608 Value* Builder::POPCNT(Value* a) 1609 { 1610 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() }); 1611 return CALL(pCtPop, std::initializer_list<Value*>{a}); 1612 } 1613 1614 ////////////////////////////////////////////////////////////////////////// 1615 /// @brief C functions called by LLVM IR 1616 ////////////////////////////////////////////////////////////////////////// 1617 1618 ////////////////////////////////////////////////////////////////////////// 1619 /// @brief called in JIT code, inserted by PRINT 1620 /// output to both stdout and visual studio debug console CallPrint(const char * fmt,...)1621 void __cdecl CallPrint(const char* fmt, ...) 1622 { 1623 va_list args; 1624 va_start(args, fmt); 1625 vprintf(fmt, args); 1626 1627 #if defined( _WIN32 ) 1628 char strBuf[1024]; 1629 vsnprintf_s(strBuf, _TRUNCATE, fmt, args); 1630 OutputDebugStringA(strBuf); 1631 #endif 1632 1633 va_end(args); 1634 } 1635 VEXTRACTI128(Value * a,Constant * imm8)1636 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) 1637 { 1638 bool flag = !imm8->isZeroValue(); 1639 SmallVector<Constant*,8> idx; 1640 for (unsigned i = 0; i < mVWidth / 2; i++) { 1641 idx.push_back(C(flag ? 
i + mVWidth / 2 : i));
        }
        return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
    }

    Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
    {
        bool flag = !imm8->isZeroValue();
        SmallVector<Constant*,8> idx;
        for (unsigned i = 0; i < mVWidth; i++) {
            idx.push_back(C(i));
        }
        Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));

        SmallVector<Constant*,8> idx2;
        for (unsigned i = 0; i < mVWidth / 2; i++) {
            idx2.push_back(C(flag ? i : i + mVWidth));
        }
        for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
            idx2.push_back(C(flag ? i + mVWidth / 2 : i));
        }
        return VSHUFFLE(a, inter, ConstantVector::get(idx2));
    }

    // rdtsc buckets macros
    void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
    {
        // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
        // buckets framework when single threaded
        if (KNOB_SINGLE_THREADED)
        {
            std::vector<Type*> args{
                PointerType::get(mInt32Ty, 0),   // pBucketMgr
                mInt32Ty                         // id
            };

            FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
            Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
            }

            CALL(pFunc, { pBucketMgr, pId });
        }
    }

    void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
    {
        // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
        // buckets framework when single threaded
        if (KNOB_SINGLE_THREADED)
        {
            std::vector<Type*> args{
                PointerType::get(mInt32Ty, 0),   // pBucketMgr
                mInt32Ty                         // id
            };

            FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
            Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
            }

            CALL(pFunc, { pBucketMgr, pId });
        }
    }

    uint32_t Builder::GetTypeSize(Type* pType)
    {
        if (pType->isStructTy())
        {
            uint32_t numElems = pType->getStructNumElements();
            Type* pElemTy = pType->getStructElementType(0);
            return numElems * GetTypeSize(pElemTy);
        }

        if (pType->isArrayTy())
        {
            uint32_t numElems = pType->getArrayNumElements();
            Type* pElemTy = pType->getArrayElementType();
            return numElems * GetTypeSize(pElemTy);
        }

        if (pType->isIntegerTy())
        {
            uint32_t bitSize = pType->getIntegerBitWidth();
            return bitSize / 8;
        }

        if (pType->isFloatTy())
        {
            return 4;
        }

        if (pType->isHalfTy())
        {
            return 2;
        }

        if (pType->isDoubleTy())
        {
            return 8;
        }

        SWR_ASSERT(false, "Unimplemented type.");
        return 0;
    }
}
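// Added observation on GetTypeSize() above: struct sizes are computed as the
// element count times the size of element 0, so the result is exact only for
// homogeneous, padding-free structs. For example, an [8 x i32] array reports
// 8 * 4 = 32 bytes and a struct of four floats reports 4 * 4 = 16 bytes.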