1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32 
33 #include <cstdarg>
34 
35 namespace SwrJit
36 {
37     void __cdecl CallPrint(const char* fmt, ...);
38 
39     //////////////////////////////////////////////////////////////////////////
40     /// @brief Convert an IEEE 754 32-bit single precision float to a
41     ///        16-bit half precision float with 5 exponent bits and
42     ///        10 mantissa bits.
43     /// @param val - 32-bit float
44     /// @todo Maybe move this outside of this file into a header?
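    /// e.g. Convert32To16Float(1.0f) returns 0x3C00 and Convert32To16Float(-2.0f) returns 0xC000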
45     static uint16_t Convert32To16Float(float val)
46     {
47         uint32_t sign, exp, mant;
48         uint32_t roundBits;
49 
50         // Extract the sign, exponent, and mantissa
51         uint32_t uf = *(uint32_t*)&val;
52         sign = (uf & 0x80000000) >> 31;
53         exp = (uf & 0x7F800000) >> 23;
54         mant = uf & 0x007FFFFF;
55 
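        // Single precision uses an exponent bias of 127, half precision uses 15, so the rebias
        // below subtracts 0x70 (112).  The 0x70 + 0x1E and 0x66 checks bound the largest and
        // smallest (denormal) biased exponents representable in half precision.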
56         // Check for out of range
57         if (std::isnan(val))
58         {
59             exp = 0x1F;
60             mant = 0x200;
61             sign = 1;                     // set the sign bit for NANs
62         }
63         else if (std::isinf(val))
64         {
65             exp = 0x1f;
66             mant = 0x0;
67         }
68         else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
69         {
70             exp = 0x1E;
71             mant = 0x3FF;
72         }
73         else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
74         {
75             mant |= 0x00800000;
76             for (; exp <= 0x70; mant >>= 1, exp++)
77                 ;
78             exp = 0;
79             mant = mant >> 13;
80         }
81         else if (exp < 0x66) // Too small to represent -> Zero
82         {
83             exp = 0;
84             mant = 0;
85         }
86         else
87         {
88             // Saves bits that will be shifted off for rounding
89             roundBits = mant & 0x1FFFu;
90             // convert exponent and mantissa to 16 bit format
91             exp = exp - 0x70;
92             mant = mant >> 13;
93 
94             // Essentially RTZ, but round up if off by only 1 lsb
95             if (roundBits == 0x1FFFu)
96             {
97                 mant++;
98                 // check for overflow
99                 if ((mant & 0xC00u) != 0)
100                     exp++;
101                 // make sure only the needed bits are used
102                 mant &= 0x3FF;
103             }
104         }
105 
106         uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
107         return (uint16_t)tmpVal;
108     }
109 
110     //////////////////////////////////////////////////////////////////////////
111     /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
112     ///        float
113     /// @param val - 16-bit float
114     /// @todo Maybe move this outside of this file into a header?
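    /// e.g. ConvertSmallFloatTo32(0x3C00) returns 1.0f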
115     static float ConvertSmallFloatTo32(UINT val)
116     {
117         UINT result;
118         if ((val & 0x7fff) == 0)
119         {
120             result = ((uint32_t)(val & 0x8000)) << 16;
121         }
122         else if ((val & 0x7c00) == 0x7c00)
123         {
124             result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
125             result |= ((uint32_t)val & 0x8000) << 16;
126         }
127         else
128         {
129             uint32_t sign = (val & 0x8000) << 16;
130             uint32_t mant = (val & 0x3ff) << 13;
131             uint32_t exp = (val >> 10) & 0x1f;
132             if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
133             {
134                 mant <<= 1;
135                 while (mant < (0x400 << 13))
136                 {
137                     exp--;
138                     mant <<= 1;
139                 }
140                 mant &= (0x3ff << 13);
141             }
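            // rebias the exponent from half precision (bias 15) to single precision (bias 127)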
142             exp = ((exp - 15 + 127) & 0xff) << 23;
143             result = sign | exp | mant;
144         }
145 
146         return *(float*)&result;
147     }
148 
149     Constant *Builder::C(bool i)
150     {
151         return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
152     }
153 
154     Constant *Builder::C(char i)
155     {
156         return ConstantInt::get(IRB()->getInt8Ty(), i);
157     }
158 
159     Constant *Builder::C(uint8_t i)
160     {
161         return ConstantInt::get(IRB()->getInt8Ty(), i);
162     }
163 
164     Constant *Builder::C(int i)
165     {
166         return ConstantInt::get(IRB()->getInt32Ty(), i);
167     }
168 
169     Constant *Builder::C(int64_t i)
170     {
171         return ConstantInt::get(IRB()->getInt64Ty(), i);
172     }
173 
174     Constant *Builder::C(uint16_t i)
175     {
176         return ConstantInt::get(mInt16Ty,i);
177     }
178 
179     Constant *Builder::C(uint32_t i)
180     {
181         return ConstantInt::get(IRB()->getInt32Ty(), i);
182     }
183 
184     Constant *Builder::C(float i)
185     {
186         return ConstantFP::get(IRB()->getFloatTy(), i);
187     }
188 
189     Constant *Builder::PRED(bool pred)
190     {
191         return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
192     }
193 
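    //////////////////////////////////////////////////////////////////////////
    /// @brief Splat an immediate constant across all mVWidth SIMD lanes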
194     Value *Builder::VIMMED1(int i)
195     {
196         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
197     }
198 
199     Value *Builder::VIMMED1(uint32_t i)
200     {
201         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
202     }
203 
204     Value *Builder::VIMMED1(float i)
205     {
206         return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
207     }
208 
209     Value *Builder::VIMMED1(bool i)
210     {
211         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
212     }
213 
214     Value *Builder::VUNDEF_IPTR()
215     {
216         return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
217     }
218 
219     Value *Builder::VUNDEF_I()
220     {
221         return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
222     }
223 
224     Value *Builder::VUNDEF(Type *ty, uint32_t size)
225     {
226         return UndefValue::get(VectorType::get(ty, size));
227     }
228 
229     Value *Builder::VUNDEF_F()
230     {
231         return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
232     }
233 
234     Value *Builder::VUNDEF(Type* t)
235     {
236         return UndefValue::get(VectorType::get(t, mVWidth));
237     }
238 
239     #if HAVE_LLVM == 0x306
240     Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
241     {
242         return VINSERT(vec, val, C((int64_t)index));
243     }
244     #endif
245 
246     Value *Builder::VBROADCAST(Value *src)
247     {
248         // check if src is already a vector
249         if (src->getType()->isVectorTy())
250         {
251             return src;
252         }
253 
254         return VECTOR_SPLAT(mVWidth, src);
255     }
256 
257     uint32_t Builder::IMMED(Value* v)
258     {
259         SWR_ASSERT(isa<ConstantInt>(v));
260         ConstantInt *pValConst = cast<ConstantInt>(v);
261         return pValConst->getZExtValue();
262     }
263 
264     int32_t Builder::S_IMMED(Value* v)
265     {
266         SWR_ASSERT(isa<ConstantInt>(v));
267         ConstantInt *pValConst = cast<ConstantInt>(v);
268         return pValConst->getSExtValue();
269     }
270 
271     Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
272     {
273         std::vector<Value*> indices;
274         for (auto i : indexList)
275             indices.push_back(i);
276         return GEPA(ptr, indices);
277     }
278 
279     Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
280     {
281         std::vector<Value*> indices;
282         for (auto i : indexList)
283             indices.push_back(C(i));
284         return GEPA(ptr, indices);
285     }
286 
287     LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
288     {
289         std::vector<Value*> valIndices;
290         for (auto i : indices)
291             valIndices.push_back(C(i));
292         return LOAD(GEPA(basePtr, valIndices), name);
293     }
294 
295     LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
296     {
297         std::vector<Value*> valIndices;
298         for (auto i : indices)
299             valIndices.push_back(i);
300         return LOAD(GEPA(basePtr, valIndices), name);
301     }
302 
303     StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
304     {
305         std::vector<Value*> valIndices;
306         for (auto i : indices)
307             valIndices.push_back(C(i));
308         return STORE(val, GEPA(basePtr, valIndices));
309     }
310 
311     StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
312     {
313         std::vector<Value*> valIndices;
314         for (auto i : indices)
315             valIndices.push_back(i);
316         return STORE(val, GEPA(basePtr, valIndices));
317     }
318 
319     CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
320     {
321         std::vector<Value*> args;
322         for (auto arg : argsList)
323             args.push_back(arg);
324         return CALLA(Callee, args);
325     }
326 
327     #if HAVE_LLVM > 0x306
328     CallInst *Builder::CALL(Value *Callee, Value* arg)
329     {
330         std::vector<Value*> args;
331         args.push_back(arg);
332         return CALLA(Callee, args);
333     }
334 
335     CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
336     {
337         std::vector<Value*> args;
338         args.push_back(arg1);
339         args.push_back(arg2);
340         return CALLA(Callee, args);
341     }
342 
343     CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
344     {
345         std::vector<Value*> args;
346         args.push_back(arg1);
347         args.push_back(arg2);
348         args.push_back(arg3);
349         return CALLA(Callee, args);
350     }
351     #endif
352 
353     Value *Builder::VRCP(Value *va)
354     {
355         return FDIV(VIMMED1(1.0f), va);  // 1 / a
356     }
357 
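    //////////////////////////////////////////////////////////////////////////
    /// @brief Evaluate a plane equation per SIMD lane: result = vA * vX + vB * vY + vC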
358     Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
359     {
360         Value* vOut = FMADDPS(vA, vX, vC);
361         vOut = FMADDPS(vB, vY, vOut);
362         return vOut;
363     }
364 
365     //////////////////////////////////////////////////////////////////////////
366     /// @brief Generate an i32 masked load operation in LLVM IR.  If not
367     /// supported on the underlying platform, emulate it with float masked load
368     /// @param src - base address pointer for the load
369     /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
370     Value *Builder::MASKLOADD(Value* src,Value* mask)
371     {
372         Value* vResult;
373         // use avx2 masked load instruction if available
374         if(JM()->mArch.AVX2())
375         {
376             Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
377             vResult = CALL(func,{src,mask});
378         }
379         else
380         {
381             // maskload intrinsic expects integer mask operand in llvm >= 3.8
382     #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
383             mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
384     #else
385             mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
386     #endif
387             Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
388             vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
389         }
390         return vResult;
391     }
392 
393     //////////////////////////////////////////////////////////////////////////
394     /// @brief insert a JIT call to CallPrint
395     /// - outputs formatted string to both stdout and VS output window
396     /// - DEBUG builds only
397     /// Usage example:
398     ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
399     ///   where C(lane) creates a constant value to print, and pIndex is the Value*
400     ///   result from a GEP, printing out the pointer to memory
401     /// @param printStr - constant string to print, which includes format specifiers
402     /// @param printArgs - initializer list of Value*'s to print to std out
403     CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
404     {
405         // push the arguments to CallPrint into a vector
406         std::vector<Value*> printCallArgs;
407         // save room for the format string.  we still need to modify it for vectors
408         printCallArgs.resize(1);
409 
410         // search through the format string for special processing
411         size_t pos = 0;
412         std::string tempStr(printStr);
413         pos = tempStr.find('%', pos);
414         auto v = printArgs.begin();
415 
416         while ((pos != std::string::npos) && (v != printArgs.end()))
417         {
418             Value* pArg = *v;
419             Type* pType = pArg->getType();
420 
421             if (pType->isVectorTy())
422             {
423                 Type* pContainedType = pType->getContainedType(0);
424 
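                // a vector argument expands the single format specifier into one specifier
                // (and one extracted lane value) per element of the vector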
425                 if (toupper(tempStr[pos + 1]) == 'X')
426                 {
427                     tempStr[pos] = '0';
428                     tempStr[pos + 1] = 'x';
429                     tempStr.insert(pos + 2, "%08X ");
430                     pos += 7;
431 
432                     printCallArgs.push_back(VEXTRACT(pArg, C(0)));
433 
434                     std::string vectorFormatStr;
435                     for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
436                     {
437                         vectorFormatStr += "0x%08X ";
438                         printCallArgs.push_back(VEXTRACT(pArg, C(i)));
439                     }
440 
441                     tempStr.insert(pos, vectorFormatStr);
442                     pos += vectorFormatStr.size();
443                 }
444                 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
445                 {
446                     uint32_t i = 0;
447                     for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
448                     {
449                         tempStr.insert(pos, std::string("%f "));
450                         pos += 3;
451                         printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
452                     }
453                     printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
454                 }
455                 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
456                 {
457                     uint32_t i = 0;
458                     for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
459                     {
460                         tempStr.insert(pos, std::string("%d "));
461                         pos += 3;
462                         printCallArgs.push_back(VEXTRACT(pArg, C(i)));
463                     }
464                     printCallArgs.push_back(VEXTRACT(pArg, C(i)));
465                 }
466             }
467             else
468             {
469                 if (toupper(tempStr[pos + 1]) == 'X')
470                 {
471                     tempStr[pos] = '0';
472                     tempStr.insert(pos + 1, "x%08");
473                     printCallArgs.push_back(pArg);
474                     pos += 3;
475                 }
476                 // for %f we need to cast float Values to doubles so that they print out correctly
477                 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
478                 {
479                     printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
480                     pos++;
481                 }
482                 else
483                 {
484                     printCallArgs.push_back(pArg);
485                 }
486             }
487 
488             // advance to the next argument
489             v++;
490             pos = tempStr.find('%', ++pos);
491         }
492 
493         // create global variable constant string
494         Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
495         GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
496         JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
497 
498         // get a pointer to the first character in the constant string array
499         std::vector<Constant*> geplist{C(0),C(0)};
500     #if HAVE_LLVM == 0x306
501         Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
502     #else
503         Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
504     #endif
505 
506         // insert the pointer to the format string in the argument vector
507         printCallArgs[0] = strGEP;
508 
509         // get pointer to CallPrint function and insert decl into the module if needed
510         std::vector<Type*> args;
511         args.push_back(PointerType::get(mInt8Ty,0));
512         FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
513         Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
514 
515         // if we haven't yet added the symbol to the symbol table
516         if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
517         {
518             sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
519         }
520 
521         // insert a call to CallPrint
522         return CALLA(callPrintFn,printCallArgs);
523     }
524 
525     //////////////////////////////////////////////////////////////////////////
526     /// @brief Wrapper around PRINT for format strings with no arguments.
527     CallInst* Builder::PRINT(const std::string &printStr)
528     {
529         return PRINT(printStr, {});
530     }
531 
532     //////////////////////////////////////////////////////////////////////////
533     /// @brief Generate a masked gather operation in LLVM IR.  If not
534     /// supported on the underlying platform, emulate it with loads
535     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
536     /// @param pBase - Int8* base VB address pointer value
537     /// @param vIndices - SIMD wide value of VB byte offsets
538     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
539     /// @param scale - value to scale indices by
540     Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
541     {
542         Value* vGather;
543 
544         // use avx2 gather instruction if available
545         if(JM()->mArch.AVX2())
546         {
547             // force mask to <N x float>, required by vgather
548             vMask = BITCAST(vMask, mSimdFP32Ty);
549             vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
550         }
551         else
552         {
553             Value* pStack = STACKSAVE();
554 
555             // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
556             Value* vSrcPtr = ALLOCA(vSrc->getType());
557             STORE(vSrc, vSrcPtr);
558 
559             vGather = VUNDEF_F();
560             Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
561             Value *vOffsets = MUL(vIndices,vScaleVec);
562             Value *mask = MASK(vMask);
563             for(uint32_t i = 0; i < mVWidth; ++i)
564             {
565                 // single component byte index
566                 Value *offset = VEXTRACT(vOffsets,C(i));
567                 // byte pointer to component
568                 Value *loadAddress = GEP(pBase,offset);
569                 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
570                 // pointer to the value to load if we're masking off a component
571                 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
572                 Value *selMask = VEXTRACT(mask,C(i));
573                 // switch in a safe address to load if we're trying to access a vertex
574                 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
575                 Value *val = LOAD(validAddress);
576                 vGather = VINSERT(vGather,val,C(i));
577             }
578             STACKRESTORE(pStack);
579         }
580 
581         return vGather;
582     }
583 
584     //////////////////////////////////////////////////////////////////////////
585     /// @brief Generate a masked gather operation in LLVM IR.  If not
586     /// supported on the underlying platform, emulate it with loads
587     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
588     /// @param pBase - Int8* base VB address pointer value
589     /// @param vIndices - SIMD wide value of VB byte offsets
590     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
591     /// @param scale - value to scale indices by
592     Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
593     {
594         Value* vGather;
595 
596         // use avx2 gather instruction if available
597         if(JM()->mArch.AVX2())
598         {
599             vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
600         }
601         else
602         {
603             Value* pStack = STACKSAVE();
604 
605             // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
606             Value* vSrcPtr = ALLOCA(vSrc->getType());
607             STORE(vSrc, vSrcPtr);
608 
609             vGather = VUNDEF_I();
610             Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
611             Value *vOffsets = MUL(vIndices, vScaleVec);
612             Value *mask = MASK(vMask);
613             for(uint32_t i = 0; i < mVWidth; ++i)
614             {
615                 // single component byte index
616                 Value *offset = VEXTRACT(vOffsets, C(i));
617                 // byte pointer to component
618                 Value *loadAddress = GEP(pBase, offset);
619                 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
620                 // pointer to the value to load if we're masking off a component
621                 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
622                 Value *selMask = VEXTRACT(mask, C(i));
623                 // switch in a safe address to load if we're trying to access a vertex
624                 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
625                 Value *val = LOAD(validAddress, C(0));
626                 vGather = VINSERT(vGather, val, C(i));
627             }
628 
629             STACKRESTORE(pStack);
630         }
631         return vGather;
632     }
633 
634     //////////////////////////////////////////////////////////////////////////
635     /// @brief Generate a masked gather operation in LLVM IR.  If not
636     /// supported on the underlying platform, emulate it with loads
637     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
638     /// @param pBase - Int8* base VB address pointer value
639     /// @param vIndices - SIMD wide value of VB byte offsets
640     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
641     /// @param scale - value to scale indices by
642     Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
643     {
644         Value* vGather;
645 
646         // use avx2 gather instruction if available
647         if(JM()->mArch.AVX2())
648         {
649             vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
650         }
651         else
652         {
653             Value* pStack = STACKSAVE();
654 
655             // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
656             Value* vSrcPtr = ALLOCA(vSrc->getType());
657             STORE(vSrc, vSrcPtr);
658 
659             vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
660             Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
661             Value *vOffsets = MUL(vIndices,vScaleVec);
662             Value *mask = MASK(vMask);
663             for(uint32_t i = 0; i < mVWidth/2; ++i)
664             {
665                 // single component byte index
666                 Value *offset = VEXTRACT(vOffsets,C(i));
667                 // byte pointer to component
668                 Value *loadAddress = GEP(pBase,offset);
669                 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
670                 // pointer to the value to load if we're masking off a component
671                 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
672                 Value *selMask = VEXTRACT(mask,C(i));
673                 // switch in a safe address to load if we're trying to access a vertex
674                 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
675                 Value *val = LOAD(validAddress);
676                 vGather = VINSERT(vGather,val,C(i));
677             }
678             STACKRESTORE(pStack);
679         }
680         return vGather;
681     }
682 
683     //////////////////////////////////////////////////////////////////////////
684     /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
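    /// x86 vector masks carry the mask in the sign bit of each lane, so a lane is "on" when its i32 value is negative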
685     Value* Builder::MASK(Value* vmask)
686     {
687         Value* src = BITCAST(vmask, mSimdInt32Ty);
688         return ICMP_SLT(src, VIMMED1(0));
689     }
690 
691     //////////////////////////////////////////////////////////////////////////
692     /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
693     Value* Builder::VMASK(Value* mask)
694     {
695         return S_EXT(mask, mSimdInt32Ty);
696     }
697 
698     //////////////////////////////////////////////////////////////////////////
699     /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
700     /// supported on the underlying platform, emulate it
701     /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
702     /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
703     /// Byte masks in the lower 128-bit lane of b select 8-bit values from the lower
704     /// 128 bits of a, and likewise for the upper lanes.  If a mask
705     /// value is negative, '0' is inserted.
706     Value *Builder::PSHUFB(Value* a, Value* b)
707     {
708         Value* res;
709         // use avx2 pshufb instruction if available
710         if(JM()->mArch.AVX2())
711         {
712             res = VPSHUFB(a, b);
713         }
714         else
715         {
716             Constant* cB = dyn_cast<Constant>(b);
717             // number of 8 bit elements in b
718             uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
719             // output vector
720             Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
721 
722             // insert an 8 bit value from the high and low lanes of a per loop iteration
723             numElms /= 2;
724             for(uint32_t i = 0; i < numElms; i++)
725             {
726                 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
727                 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
728 
729                 // extract values from constant mask
730                 char valLow128bLane =  (char)(cLow128b->getSExtValue());
731                 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
732 
733                 Value* insertValLow128b;
734                 Value* insertValHigh128b;
735 
736                 // if the mask value is negative, insert a '0' in the respective output position
737                 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
738                 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
739                 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
740 
741                 vShuf = VINSERT(vShuf, insertValLow128b, i);
742                 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
743             }
744             res = vShuf;
745         }
746         return res;
747     }
748 
749     //////////////////////////////////////////////////////////////////////////
750     /// @brief Generate a VPMOVSXBD operation (sign extend 8 8-bit values to 32
751     /// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
752     /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
753     /// lower 8 values are used.
754     Value *Builder::PMOVSXBD(Value* a)
755     {
756         // llvm-3.9 removed the pmovsxbd intrinsic
757     #if HAVE_LLVM < 0x309
758         // use avx2 byte sign extend instruction if available
759         if(JM()->mArch.AVX2())
760         {
761             Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
762             return CALL(pmovsxbd, std::initializer_list<Value*>{a});
763         }
764         else
765     #endif
766         {
767             // VPMOVSXBD output type
768             Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
769             // Extract 8 values from 128bit lane and sign extend
770             return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
771         }
772     }
773 
774     //////////////////////////////////////////////////////////////////////////
775     /// @brief Generate a VPMOVSXWD operation (sign extend 8 16-bit values to 32
776     /// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
777     /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
778     Value *Builder::PMOVSXWD(Value* a)
779     {
780         // llvm-3.9 removed the pmovsxwd intrinsic
781     #if HAVE_LLVM < 0x309
782         // use avx2 word sign extend if available
783         if(JM()->mArch.AVX2())
784         {
785             Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
786             return CALL(pmovsxwd, std::initializer_list<Value*>{a});
787         }
788         else
789     #endif
790         {
791             // VPMOVSXWD output type
792             Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
793             // Extract 8 values from 128bit lane and sign extend
794             return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
795         }
796     }
797 
798     //////////////////////////////////////////////////////////////////////////
799     /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
800     /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
801     /// platform, emulate it
802     /// @param a - 256bit SIMD lane(8x32bit) of integer values.
803     /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
804     Value *Builder::PERMD(Value* a, Value* idx)
805     {
806         Value* res;
807         // use avx2 permute instruction if available
808         if(JM()->mArch.AVX2())
809         {
810             res = VPERMD(a, idx);
811         }
812         else
813         {
814             if (isa<Constant>(idx))
815             {
816                 res = VSHUFFLE(a, a, idx);
817             }
818             else
819             {
820                 res = VUNDEF_I();
821                 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
822                 {
823                     Value* pIndex = VEXTRACT(idx, C(l));
824                     Value* pVal = VEXTRACT(a, pIndex);
825                     res = VINSERT(res, pVal, C(l));
826                 }
827             }
828         }
829         return res;
830     }
831 
832     //////////////////////////////////////////////////////////////////////////
833     /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
834     /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
835     /// platform, emulate it
836     /// @param a - 256bit SIMD lane(8x32bit) of float values.
837     /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
838     Value *Builder::PERMPS(Value* a, Value* idx)
839     {
840         Value* res;
841         // use avx2 permute instruction if available
842         if (JM()->mArch.AVX2())
843         {
844             // llvm 3.6.0 swapped the order of the args to vpermd
845             res = VPERMPS(idx, a);
846         }
847         else
848         {
849             if (isa<Constant>(idx))
850             {
851                 res = VSHUFFLE(a, a, idx);
852             }
853             else
854             {
855                 res = VUNDEF_F();
856                 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
857                 {
858                     Value* pIndex = VEXTRACT(idx, C(l));
859                     Value* pVal = VEXTRACT(a, pIndex);
860                     res = VINSERT(res, pVal, C(l));
861                 }
862             }
863         }
864 
865         return res;
866     }
867 
868     //////////////////////////////////////////////////////////////////////////
869     /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
870     /// in LLVM IR.  If not supported on the underlying platform, emulate it
871     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
872     Value *Builder::CVTPH2PS(Value* a)
873     {
874         if (JM()->mArch.F16C())
875         {
876             return VCVTPH2PS(a);
877         }
878         else
879         {
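            // no F16C support; call the scalar ConvertSmallFloatTo32 helper once per lane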
880             FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
881             Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
882 
883             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
884             {
885                 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
886             }
887 
888             Value* pResult = UndefValue::get(mSimdFP32Ty);
889             for (uint32_t i = 0; i < mVWidth; ++i)
890             {
891                 Value* pSrc = VEXTRACT(a, C(i));
892                 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
893                 pResult = VINSERT(pResult, pConv, C(i));
894             }
895 
896             return pResult;
897         }
898     }
899 
900     //////////////////////////////////////////////////////////////////////////
901     /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
902     /// in LLVM IR.  If not supported on the underlying platform, emulate it
903     /// @param a - 256bit SIMD(8x32bit) of float32 values to convert.
904     Value *Builder::CVTPS2PH(Value* a, Value* rounding)
905     {
906         if (JM()->mArch.F16C())
907         {
908             return VCVTPS2PH(a, rounding);
909         }
910         else
911         {
912             // call scalar C function for now
913             FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
914             Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
915 
916             if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
917             {
918                 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
919             }
920 
921             Value* pResult = UndefValue::get(mSimdInt16Ty);
922             for (uint32_t i = 0; i < mVWidth; ++i)
923             {
924                 Value* pSrc = VEXTRACT(a, C(i));
925                 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
926                 pResult = VINSERT(pResult, pConv, C(i));
927             }
928 
929             return pResult;
930         }
931     }
932 
933     Value *Builder::PMAXSD(Value* a, Value* b)
934     {
935         // llvm-3.9 removed the pmax intrinsics
936     #if HAVE_LLVM >= 0x309
937         Value* cmp = ICMP_SGT(a, b);
938         return SELECT(cmp, a, b);
939     #else
940         if (JM()->mArch.AVX2())
941         {
942             Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
943             return CALL(pmaxsd, {a, b});
944         }
945         else
946         {
947             // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
948             Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
949 
950             // low 128
951             Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
952             Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
953             Value* resLo = CALL(pmaxsd, {aLo, bLo});
954 
955             // high 128
956             Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
957             Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
958             Value* resHi = CALL(pmaxsd, {aHi, bHi});
959 
960             // combine
961             Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
962             result = VINSERTI128(result, resHi, C((uint8_t)1));
963 
964             return result;
965         }
966     #endif
967     }
968 
969     Value *Builder::PMINSD(Value* a, Value* b)
970     {
971         // llvm-3.9 removed the pmin intrinsics
972     #if HAVE_LLVM >= 0x309
973         Value* cmp = ICMP_SLT(a, b);
974         return SELECT(cmp, a, b);
975     #else
976         if (JM()->mArch.AVX2())
977         {
978             Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
979             return CALL(pminsd, {a, b});
980         }
981         else
982         {
983             // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
984             Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
985 
986             // low 128
987             Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
988             Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
989             Value* resLo = CALL(pminsd, {aLo, bLo});
990 
991             // high 128
992             Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
993             Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
994             Value* resHi = CALL(pminsd, {aHi, bHi});
995 
996             // combine
997             Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
998             result = VINSERTI128(result, resHi, C((uint8_t)1));
999 
1000             return result;
1001         }
1002     #endif
1003     }
1004 
1005     void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1006                           Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1007     {
1008         const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1009         if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1010         {
1011             // ensure our mask is the correct type
1012             mask = BITCAST(mask, mSimdFP32Ty);
1013             GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1014         }
1015         else
1016         {
1017             // ensure our mask is the correct type
1018             mask = BITCAST(mask, mSimdInt32Ty);
1019             GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1020         }
1021     }
1022 
1023     void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1024                             Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1025     {
1026         switch(info.bpp / info.numComps)
1027         {
1028             case 16:
1029             {
1030                     Value* vGatherResult[2];
1031                     Value *vMask;
1032 
1033                     // TODO: vGatherMaskedVal
1034                     Value* vGatherMaskedVal = VIMMED1((float)0);
1035 
1036                     // always have at least one component out of x or y to fetch
1037 
1038                     // save mask as it is zero'd out after each gather
1039                     vMask = mask;
1040 
1041                     vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1042                     // e.g. result of first 8x32bit integer gather for 16bit components
1043                     // 256i - 0    1    2    3    4    5    6    7
1044                     //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1045                     //
1046 
1047                     // if we have at least one component out of z or w to fetch
1048                     if(info.numComps > 2)
1049                     {
1050                         // offset base to the next components(zw) in the vertex to gather
1051                         pSrcBase = GEP(pSrcBase, C((char)4));
1052                         vMask = mask;
1053 
1054                         vGatherResult[1] =  GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1055                         // e.g. result of second 8x32bit integer gather for 16bit components
1056                         // 256i - 0    1    2    3    4    5    6    7
1057                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1058                         //
1059                     }
1060                     else
1061                     {
1062                         vGatherResult[1] =  vGatherMaskedVal;
1063                     }
1064 
1065                     // Shuffle gathered components into place, each row is a component
1066                     Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1067             }
1068                 break;
1069             case 32:
1070             {
1071                 // apply defaults
1072                 for (uint32_t i = 0; i < 4; ++i)
1073                 {
1074                     vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1075                 }
1076 
1077                 for(uint32_t i = 0; i < info.numComps; i++)
1078                 {
1079                     uint32_t swizzleIndex = info.swizzle[i];
1080 
1081                     // save mask as it is zero'd out after each gather
1082                     Value *vMask = mask;
1083 
1084                     // Gather a SIMD of components
1085                     vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1086 
1087                     // offset base to the next component to gather
1088                     pSrcBase = GEP(pSrcBase, C((char)4));
1089                 }
1090             }
1091                 break;
1092             default:
1093                 SWR_ASSERT(0, "Invalid float format");
1094                 break;
1095         }
1096     }
1097 
1098     void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1099                             Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1100     {
1101         switch (info.bpp / info.numComps)
1102         {
1103             case 8:
1104             {
1105                 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1106                 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1107                 // e.g. result of an 8x32bit integer gather for 8bit components
1108                 // 256i - 0    1    2    3    4    5    6    7
1109                 //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1110 
1111                 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1112             }
1113                 break;
1114             case 16:
1115             {
1116                 Value* vGatherResult[2];
1117                 Value *vMask;
1118 
1119                 // TODO: vGatherMaskedVal
1120                 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1121 
1122                 // always have at least one component out of x or y to fetch
1123 
1124                 // save mask as it is zero'd out after each gather
1125                 vMask = mask;
1126 
1127                 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1128                 // e.g. result of first 8x32bit integer gather for 16bit components
1129                 // 256i - 0    1    2    3    4    5    6    7
1130                 //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1131                 //
1132 
1133                 // if we have at least one component out of z or w to fetch
1134                 if(info.numComps > 2)
1135                 {
1136                     // offset base to the next components(zw) in the vertex to gather
1137                     pSrcBase = GEP(pSrcBase, C((char)4));
1138                     vMask = mask;
1139 
1140                     vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1141                     // e.g. result of second 8x32bit integer gather for 16bit components
1142                     // 256i - 0    1    2    3    4    5    6    7
1143                     //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1144                     //
1145                 }
1146                 else
1147                 {
1148                     vGatherResult[1] = vGatherMaskedVal;
1149                 }
1150 
1151                 // Shuffle gathered components into place, each row is a component
1152                 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1153 
1154             }
1155                 break;
1156             case 32:
1157             {
1158                 // apply defaults
1159                 for (uint32_t i = 0; i < 4; ++i)
1160                 {
1161                     vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1162                 }
1163 
1164                 for(uint32_t i = 0; i < info.numComps; i++)
1165                 {
1166                     uint32_t swizzleIndex = info.swizzle[i];
1167 
1168                     // save mask as it is zero'd out after each gather
1169                     Value *vMask = mask;
1170 
1171                     // Gather a SIMD of components
1172                     vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1173 
1174                     // offset base to the next component to gather
1175                     pSrcBase = GEP(pSrcBase, C((char)4));
1176                 }
1177             }
1178                 break;
1179             default:
1180                 SWR_ASSERT(0, "unsupported format");
1181             break;
1182         }
1183     }
1184 
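    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle gathered 16bpc components into one SIMD register per component;
    /// packed into 128bit lanes if bPackedOutput, otherwise zero extended to 32 bits per lane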
1185     void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1186     {
1187         // cast types
1188         Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1189         Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1190 
1191         // input could either be float or int vector; do shuffle work in int
1192         vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1193         vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1194 
1195         if(bPackedOutput)
1196         {
1197             Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1198 
1199             // shuffle mask
1200             Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1201                                          0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1202             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1203             // after pshufb: group components together in each 128bit lane
1204             // 256i - 0    1    2    3    4    5    6    7
1205             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1206 
1207             Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1208             // after PERMD: move and pack xy components into each 128bit lane
1209             // 256i - 0    1    2    3    4    5    6    7
1210             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1211 
1212             // do the same for zw components
1213             Value* vi128ZW = nullptr;
1214             if(info.numComps > 2)
1215             {
1216                 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1217                 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1218             }
1219 
1220             for(uint32_t i = 0; i < 4; i++)
1221             {
1222                 uint32_t swizzleIndex = info.swizzle[i];
1223                 // todo: fix for packed
1224                 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1225                 if(i >= info.numComps)
1226                 {
1227                     // set the default component val
1228                     vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1229                     continue;
1230                 }
1231 
1232                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1233                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1234                 // if x or y, use vi128XY permute result, else use vi128ZW
1235                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1236 
1237                 // extract packed component 128 bit lanes
1238                 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1239             }
1240 
1241         }
1242         else
1243         {
1244             // pshufb masks for each component
1245             Value* vConstMask[2];
1246             // x/z shuffle mask
1247             vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1248                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1249 
1250             // y/w shuffle mask
1251             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1252                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1253 
1254 
1255             // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1256             // apply defaults
1257             for (uint32_t i = 0; i < 4; ++i)
1258             {
1259                 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1260             }
1261 
1262             for(uint32_t i = 0; i < info.numComps; i++)
1263             {
1264                 uint32_t swizzleIndex = info.swizzle[i];
1265 
1266                 // select correct constMask for x/z or y/w pshufb
1267                 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1268                 // if x or y, use vi128XY permute result, else use vi128ZW
1269                 uint32_t selectedGather = (i < 2) ? 0 : 1;
1270 
1271                 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1272                 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1273                 // 256i - 0    1    2    3    4    5    6    7
1274                 //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1275             }
1276         }
1277     }
1278 
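    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle gathered 8bpc components into one SIMD register per component;
    /// packed into 128bit lanes if bPackedOutput, otherwise zero extended to 32 bits per lane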
1279     void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1280     {
1281         // cast types
1282         Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1283         Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1284 
1285         if(bPackedOutput)
1286         {
1287             Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1288             // shuffle mask
1289             Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1290                                          0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1291             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1292             // after pshufb: group components together in each 128bit lane
1293             // 256i - 0    1    2    3    4    5    6    7
1294             //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1295 
1296             Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1297             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1298             // 256i - 0    1    2    3    4    5    6    7
1299             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1300 
1301             // do the same for zw components
1302             Value* vi128ZW = nullptr;
1303             if(info.numComps > 2)
1304             {
1305                 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1306             }
1307 
1308             // extract the packed 128-bit lanes for each enabled component; components beyond numComps get their default value
1309             for(uint32_t i = 0; i < 4; i++)
1310             {
1311                 uint32_t swizzleIndex = info.swizzle[i];
1312                 // todo: fix for packed
1313                 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1314                 if(i >= info.numComps)
1315                 {
1316                     // set the default component val
1317                     vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1318                     continue;
1319                 }
1320 
1321                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1322                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1323                 // if x or y, use vi128XY permute result, else use vi128ZW
1324                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1325 
1326                 // extract the packed component's 128-bit lanes
1327                 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1328             }
1329         }
1330         // else zero extend
1331         else{
1332             // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1333             // apply defaults
1334             for (uint32_t i = 0; i < 4; ++i)
1335             {
1336                 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1337             }
1338 
1339             for(uint32_t i = 0; i < info.numComps; i++){
1340                 uint32_t swizzleIndex = info.swizzle[i];
1341 
1342                 // pshufb masks for each component
1343                 Value* vConstMask;
1344                 switch(i)
1345                 {
1346                     case 0:
1347                         // x shuffle mask
1348                         vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1349                                               0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1350                         break;
1351                     case 1:
1352                         // y shuffle mask
1353                         vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1354                                               1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1355                         break;
1356                     case 2:
1357                         // z shuffle mask
1358                         vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1359                                               2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1360                         break;
1361                     case 3:
1362                         // w shuffle mask
1363                         vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1364                                               3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1365                         break;
1366                     default:
1367                         vConstMask = nullptr;
1368                         break;
1369                 }
1370 
1371                 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1372                 // after pshufb for x channel
1373                 // 256i - 0    1    2    3    4    5    6    7
1374                 //        x000 x000 x000 x000 x000 x000 x000 x000
1375             }
1376         }
1377     }
1378 
1379     // Helper function to create alloca in entry block of function
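         // Creating the alloca at the top of the entry block means it is allocated once
         // per function invocation rather than every time the current block executes,
         // and keeps it visible to SROA/mem2reg. A sketch of the pattern this enables
         // (vSomeValue is a placeholder):
         //
         //     Value* pTmp = CreateEntryAlloca(pFunc, mSimdInt32Ty);  // hoisted scratch slot
         //     STORE(vSomeValue, pTmp);                               // safe to reuse inside loops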
1380     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1381     {
1382         auto saveIP = IRB()->saveIP();
1383         IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1384                               pFunc->getEntryBlock().begin());
1385         Value* pAlloca = ALLOCA(pType);
1386         IRB()->restoreIP(saveIP);
1387         return pAlloca;
1388     }
1389 
1390     //////////////////////////////////////////////////////////////////////////
1391     /// @brief emulates a scatter operation.
1392     /// @param pDst - pointer to destination
1393     /// @param vSrc - vector of src data to scatter
1394     /// @param vOffsets - vector of byte offsets from pDst
1395     /// @param vMask - mask of valid lanes
1396     void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1397     {
1398         /* Scatter algorithm
1399 
1400            while(Index = BitScanForward(mask))
1401                 srcElem = srcVector[Index]
1402                 offsetElem = offsetVector[Index]
1403                 *(pDst + offsetElem) = srcElem
1404                 mask &= ~(1 << Index)
1405 
1406         */
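             // The IR emitted below follows that pseudocode: vSrc and vOffsets are spilled
             // to entry-block allocas so individual lanes can be loaded by dynamic index,
             // VMOVMSKPS turns the lane mask into a scalar bitmask, and cttz walks the set
             // bits one lane at a time until the mask is exhausted.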
1407 
1408         BasicBlock* pCurBB = IRB()->GetInsertBlock();
1409         Function* pFunc = pCurBB->getParent();
1410         Type* pSrcTy = vSrc->getType()->getVectorElementType();
1411 
1412         // Store vectors on stack
1413         if (pScatterStackSrc == nullptr)
1414         {
1415             // Save off stack allocations and reuse per scatter. Significantly reduces stack
1416             // requirements for shaders with a lot of scatters.
1417             pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1418             pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1419         }
1420 
1421         Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1422         Value* pOffsetsArrayPtr = pScatterStackOffsets;
1423         STORE(vSrc, pSrcArrayPtr);
1424         STORE(vOffsets, pOffsetsArrayPtr);
1425 
1426         // Cast to pointers for random access
1427         pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1428         pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1429 
1430         Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1431 
1432         // Get cttz function
1433         Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1434 
1435         // Setup loop basic block
1436         BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1437 
1438         // compute first set bit
1439         Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1440 
1441         Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1442 
1443         // Split current block
1444         BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1445 
1446         // Remove unconditional jump created by splitBasicBlock
1447         pCurBB->getTerminator()->eraseFromParent();
1448 
1449         // Add terminator to end of original block
1450         IRB()->SetInsertPoint(pCurBB);
1451 
1452         // Add conditional branch
1453         COND_BR(pIsUndef, pPostLoop, pLoop);
1454 
1455         // Add loop basic block contents
1456         IRB()->SetInsertPoint(pLoop);
1457         PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1458         PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1459 
1460         pIndexPhi->addIncoming(pIndex, pCurBB);
1461         pMaskPhi->addIncoming(pMask, pCurBB);
1462 
1463         // Extract elements for this index
1464         Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1465         Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1466 
1467         // GEP to this offset in dst
1468         Value* pCurDst = GEP(pDst, pOffsetElem);
1469         pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1470         STORE(pSrcElem, pCurDst);
1471 
1472         // Update the mask
1473         Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1474 
1475         // Terminator
1476         Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1477 
1478         pIsUndef = ICMP_EQ(pNewIndex, C(32));
1479         COND_BR(pIsUndef, pPostLoop, pLoop);
1480 
1481         // Update phi edges
1482         pIndexPhi->addIncoming(pNewIndex, pLoop);
1483         pMaskPhi->addIncoming(pNewMask, pLoop);
1484 
1485         // Move builder to beginning of post loop
1486         IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1487     }
1488 
1489     Value* Builder::VABSPS(Value* a)
1490     {
1491         Value* asInt = BITCAST(a, mSimdInt32Ty);
1492         Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);   // clear the IEEE-754 sign bit
1493         return result;
1494     }
1495 
1496     Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1497     {
1498         Value *lowCmp = ICMP_SLT(src, low);
1499         Value *ret = SELECT(lowCmp, low, src);
1500 
1501         Value *highCmp = ICMP_SGT(ret, high);
1502         ret = SELECT(highCmp, high, ret);
1503 
1504         return ret;
1505     }
1506 
1507     Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1508     {
1509         Value *lowCmp = FCMP_OLT(src, low);
1510         Value *ret = SELECT(lowCmp, low, src);
1511 
1512         Value *highCmp = FCMP_OGT(ret, high);
1513         ret = SELECT(highCmp, high, ret);
1514 
1515         return ret;
1516     }
1517 
1518     Value *Builder::FCLAMP(Value* src, float low, float high)
1519     {
1520         Value* result = VMAXPS(src, VIMMED1(low));
1521         result = VMINPS(result, VIMMED1(high));
1522 
1523         return result;
1524     }
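         // Note: the Value*-based FCLAMP above lowers to ordered compares plus selects and
         // therefore passes NaN inputs through, while this immediate variant maps to
         // vmaxps/vminps, which on x86 return the second source operand when an input is
         // NaN, so a NaN src generally ends up clamped to 'low' rather than staying NaN.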
1525 
1526     //////////////////////////////////////////////////////////////////////////
1527     /// @brief save/restore stack, providing ability to push/pop the stack and
1528     ///        reduce overall stack requirements for temporary stack use
1529     Value* Builder::STACKSAVE()
1530     {
1531         Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1532     #if HAVE_LLVM == 0x306
1533         return CALL(pfnStackSave);
1534     #else
1535         return CALLA(pfnStackSave);
1536     #endif
1537     }
1538 
1539     void Builder::STACKRESTORE(Value* pSaved)
1540     {
1541         Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1542         CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1543     }
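         // Illustrative pairing (a sketch, not taken from an actual caller): wrap temporary
         // allocas in a save/restore pair so their stack space is reclaimed immediately.
         //
         //     Value* pSaved = STACKSAVE();
         //     Value* pTmp   = ALLOCA(someScratchTy);   // someScratchTy is hypothetical
         //     /* ... use pTmp ... */
         //     STACKRESTORE(pSaved);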
1544 
1545     Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1546     {
1547         Value* vOut;
1548         // use FMADs if available
1549         if(JM()->mArch.AVX2())
1550         {
1551             vOut = VFMADDPS(a, b, c);
1552         }
1553         else
1554         {
1555             vOut = FADD(FMUL(a, b), c);
1556         }
1557         return vOut;
1558     }
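         // Note: the non-AVX2 fallback performs two roundings (multiply, then add), while
         // VFMADDPS fuses them with a single rounding, so the two paths can differ in the
         // last ULP for identical inputs.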
1559 
1560     Value* Builder::POPCNT(Value* a)
1561     {
1562         Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1563         return CALL(pCtPop, std::initializer_list<Value*>{a});
1564     }
1565 
1566     //////////////////////////////////////////////////////////////////////////
1567     /// @brief C functions called by LLVM IR
1568     //////////////////////////////////////////////////////////////////////////
1569 
1570     //////////////////////////////////////////////////////////////////////////
1571     /// @brief called in JIT code, inserted by PRINT
1572     /// output to both stdout and visual studio debug console
1573     void __cdecl CallPrint(const char* fmt, ...)
1574     {
1575         va_list args;
1576         va_start(args, fmt);
1577         vprintf(fmt, args);
1578 
1579     #if defined( _WIN32 )
1580         char strBuf[1024];
1581         vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1582         OutputDebugString(strBuf);
1583     #endif
1584 
1585         va_end(args);
1586     }
1587 
1588     Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1589     {
1590     #if HAVE_LLVM == 0x306
1591         Function *func =
1592             Intrinsic::getDeclaration(JM()->mpCurrentModule,
1593                                       Intrinsic::x86_avx_vextractf128_si_256);
1594         return CALL(func, {a, imm8});
1595     #else
1596         bool flag = !imm8->isZeroValue();
1597         SmallVector<Constant*,8> idx;
1598         for (unsigned i = 0; i < mVWidth / 2; i++) {
1599             idx.push_back(C(flag ? i + mVWidth / 2 : i));
1600         }
1601         return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1602     #endif
1603     }
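         // Example of the shufflevector emulation (assuming mVWidth == 8): imm8 == 0 builds
         // the index vector {0,1,2,3} (low 128-bit half), imm8 != 0 builds {4,5,6,7} (high
         // half), reproducing vextractf128 without the x86-specific intrinsic.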
1604 
1605     Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1606     {
1607     #if HAVE_LLVM == 0x306
1608         Function *func =
1609             Intrinsic::getDeclaration(JM()->mpCurrentModule,
1610                                       Intrinsic::x86_avx_vinsertf128_si_256);
1611         return CALL(func, {a, b, imm8});
1612     #else
1613         bool flag = !imm8->isZeroValue();
1614         SmallVector<Constant*,8> idx;
1615         for (unsigned i = 0; i < mVWidth; i++) {
1616             idx.push_back(C(i));
1617         }
1618         Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1619 
1620         SmallVector<Constant*,8> idx2;
1621         for (unsigned i = 0; i < mVWidth / 2; i++) {
1622             idx2.push_back(C(flag ? i : i + mVWidth));
1623         }
1624         for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1625             idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1626         }
1627         return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1628     #endif
1629     }
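         // Example (assuming mVWidth == 8): with imm8 == 0 the final shuffle selects
         // {b[0..3], a[4..7]}, and with imm8 != 0 it selects {a[0..3], b[0..3]}; in both
         // cases the low 128 bits of b replace the chosen half of a, matching vinsertf128.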
1630 
1631     // rdtsc buckets macros
1632     void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1633     {
1634         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1635         // buckets framework when single threaded
1636         if (KNOB_SINGLE_THREADED)
1637         {
1638             std::vector<Type*> args{
1639                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
1640                 mInt32Ty                        // id
1641             };
1642 
1643             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1644             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1645             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1646             {
1647                 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1648             }
1649 
1650             CALL(pFunc, { pBucketMgr, pId });
1651         }
1652     }
1653 
1654     void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1655     {
1656         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1657         // buckets framework when single threaded
1658         if (KNOB_SINGLE_THREADED)
1659         {
1660             std::vector<Type*> args{
1661                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
1662                 mInt32Ty                        // id
1663             };
1664 
1665             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1666             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1667             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1668             {
1669                 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1670             }
1671 
1672             CALL(pFunc, { pBucketMgr, pId });
1673         }
1674     }
1675 
1676 }
1677