/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_misc.cpp
*
* @brief Implementation for miscellaneous builder functions
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>

namespace SwrJit
{
    void __cdecl CallPrint(const char* fmt, ...);

    //////////////////////////////////////////////////////////////////////////
    /// @brief Convert an IEEE 754 32-bit single precision float to an
    ///        16 bit float with 5 exponent bits and a variable
    ///        number of mantissa bits.
    /// @param val - 32-bit float
    /// @todo Maybe move this outside of this file into a header?
    static uint16_t ConvertFloat32ToFloat16(float val)
    {
        uint32_t sign, exp, mant;
        uint32_t roundBits;

        // Extract the sign, exponent, and mantissa
        uint32_t uf = *(uint32_t*)&val;
        sign = (uf & 0x80000000) >> 31;
        exp = (uf & 0x7F800000) >> 23;
        mant = uf & 0x007FFFFF;

        // Check for out of range
        if (std::isnan(val))
        {
            exp = 0x1F;
            mant = 0x200;
            sign = 1;                     // set the sign bit for NaNs
        }
        else if (std::isinf(val))
        {
            exp = 0x1f;
            mant = 0x0;
        }
        else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
        {
            exp = 0x1E;
            mant = 0x3FF;
        }
        else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
        {
            mant |= 0x00800000;
            for (; exp <= 0x70; mant >>= 1, exp++)
                ;
            exp = 0;
            mant = mant >> 13;
        }
        else if (exp < 0x66) // Too small to represent -> Zero
        {
            exp = 0;
            mant = 0;
        }
        else
        {
            // Save bits that will be shifted off for rounding
            roundBits = mant & 0x1FFFu;
            // convert exponent and mantissa to 16 bit format
            exp = exp - 0x70;
            mant = mant >> 13;

            // Essentially RTZ, but round up if off by only 1 lsb
            if (roundBits == 0x1FFFu)
            {
                mant++;
                // check for overflow
                if ((mant & 0xC00u) != 0)
                    exp++;
                // make sure only the needed bits are used
                mant &= 0x3FF;
            }
        }

        uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
        return (uint16_t)tmpVal;
    }
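
    // Illustrative spot checks (values follow the standard IEEE 754
    // half-precision encoding):
    //   ConvertFloat32ToFloat16(1.0f)  -> 0x3C00  (sign 0, exp 15, mantissa 0)
    //   ConvertFloat32ToFloat16(-2.0f) -> 0xC000
    //   ConvertFloat32ToFloat16(1e-8f) -> 0x0000  (too small; flushed to zero)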

    //////////////////////////////////////////////////////////////////////////
    /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
    ///        float
    /// @param val - 16-bit float
    /// @todo Maybe move this outside of this file into a header?
    static float ConvertFloat16ToFloat32(uint32_t val)
    {
        uint32_t result;
        if ((val & 0x7fff) == 0)
        {
            result = ((uint32_t)(val & 0x8000)) << 16;
        }
        else if ((val & 0x7c00) == 0x7c00)
        {
            result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
            result |= ((uint32_t)val & 0x8000) << 16;
        }
        else
        {
            uint32_t sign = (val & 0x8000) << 16;
            uint32_t mant = (val & 0x3ff) << 13;
            uint32_t exp = (val >> 10) & 0x1f;
            if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
            {
                mant <<= 1;
                while (mant < (0x400 << 13))
                {
                    exp--;
                    mant <<= 1;
                }
                mant &= (0x3ff << 13);
            }
            exp = ((exp - 15 + 127) & 0xff) << 23;
            result = sign | exp | mant;
        }

        return *(float*)&result;
    }
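
    // Illustrative spot checks (inverse of the examples above):
    //   ConvertFloat16ToFloat32(0x3C00) -> 1.0f
    //   ConvertFloat16ToFloat32(0xC000) -> -2.0f
    //   ConvertFloat16ToFloat32(0x7C00) -> +infinity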

    Constant *Builder::C(bool i)
    {
        return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
    }

    Constant *Builder::C(char i)
    {
        return ConstantInt::get(IRB()->getInt8Ty(), i);
    }

    Constant *Builder::C(uint8_t i)
    {
        return ConstantInt::get(IRB()->getInt8Ty(), i);
    }

    Constant *Builder::C(int i)
    {
        return ConstantInt::get(IRB()->getInt32Ty(), i);
    }

    Constant *Builder::C(int64_t i)
    {
        return ConstantInt::get(IRB()->getInt64Ty(), i);
    }

    Constant *Builder::C(uint16_t i)
    {
        return ConstantInt::get(mInt16Ty, i);
    }

    Constant *Builder::C(uint32_t i)
    {
        return ConstantInt::get(IRB()->getInt32Ty(), i);
    }

    Constant *Builder::C(float i)
    {
        return ConstantFP::get(IRB()->getFloatTy(), i);
    }

    Constant *Builder::PRED(bool pred)
    {
        return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
    }

    Value *Builder::VIMMED1(int i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1_16(int i)
    {
        return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1(uint32_t i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1_16(uint32_t i)
    {
        return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1(float i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
    }

    Value *Builder::VIMMED1_16(float i)
    {
        return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
    }

    Value *Builder::VIMMED1(bool i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1_16(bool i)
    {
        return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VUNDEF_IPTR()
    {
        return UndefValue::get(VectorType::get(mInt32PtrTy, mVWidth));
    }

    Value *Builder::VUNDEF(Type* t)
    {
        return UndefValue::get(VectorType::get(t, mVWidth));
    }

    Value *Builder::VUNDEF_I()
    {
        return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
    }

    Value *Builder::VUNDEF_I_16()
    {
        return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16));
    }

    Value *Builder::VUNDEF_F()
    {
        return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
    }

    Value *Builder::VUNDEF_F_16()
    {
        return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16));
    }

    Value *Builder::VUNDEF(Type *ty, uint32_t size)
    {
        return UndefValue::get(VectorType::get(ty, size));
    }

    Value *Builder::VBROADCAST(Value *src, const llvm::Twine& name)
    {
        // check if src is already a vector
        if (src->getType()->isVectorTy())
        {
            return src;
        }

        return VECTOR_SPLAT(mVWidth, src, name);
    }

    Value *Builder::VBROADCAST_16(Value *src)
    {
        // check if src is already a vector
        if (src->getType()->isVectorTy())
        {
            return src;
        }

        return VECTOR_SPLAT(mVWidth16, src);
    }

    uint32_t Builder::IMMED(Value* v)
    {
        SWR_ASSERT(isa<ConstantInt>(v));
        ConstantInt *pValConst = cast<ConstantInt>(v);
        return pValConst->getZExtValue();
    }

    int32_t Builder::S_IMMED(Value* v)
    {
        SWR_ASSERT(isa<ConstantInt>(v));
        ConstantInt *pValConst = cast<ConstantInt>(v);
        return pValConst->getSExtValue();
    }

    Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList, const llvm::Twine& name)
    {
        std::vector<Value*> args;
        for (auto arg : argsList)
            args.push_back(arg);
        return CALLA(Callee, args, name);
    }

    CallInst *Builder::CALL(Value *Callee, Value* arg)
    {
        std::vector<Value*> args;
        args.push_back(arg);
        return CALLA(Callee, args);
    }

    CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
    {
        std::vector<Value*> args;
        args.push_back(arg1);
        args.push_back(arg2);
        return CALLA(Callee, args);
    }

    CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
    {
        std::vector<Value*> args;
        args.push_back(arg1);
        args.push_back(arg2);
        args.push_back(arg3);
        return CALLA(Callee, args);
    }

    //////////////////////////////////////////////////////////////////////////
    Value *Builder::DEBUGTRAP()
    {
        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
        return CALL(func);
    }

    Value *Builder::VRCP(Value *va, const llvm::Twine& name)
    {
        return FDIV(VIMMED1(1.0f), va, name);  // 1 / a
    }

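    //////////////////////////////////////////////////////////////////////////
    /// @brief Evaluate the plane equation A*x + B*y + C per SIMD lane using
    ///        two fused multiply-adds: vOut = vA*vX + vB*vY + vC.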
    Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
    {
        Value* vOut = FMADDPS(vA, vX, vC);
        vOut = FMADDPS(vB, vY, vOut);
        return vOut;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate an i32 masked load operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with a float masked load
    /// @param src - base address pointer for the load
    /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
    Value *Builder::MASKLOADD(Value* src, Value* mask)
    {
        Value* vResult;
        // use avx2 maskload instruction if available
        if(JM()->mArch.AVX2())
        {
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
            vResult = CALL(func, {src, mask});
        }
        else
        {
            // maskload intrinsic expects integer mask operand in llvm >= 3.8
    #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
            mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
    #else
            mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
    #endif
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
            vResult = BITCAST(CALL(func, {src, mask}), VectorType::get(mInt32Ty, mVWidth));
        }
        return vResult;
    }
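
    // Usage sketch (illustrative; pAddr and vMask are hypothetical values):
    // lanes whose 32-bit mask element has its sign bit set load a dword from
    // memory, all other lanes yield 0:
    //   Value* vLoaded = MASKLOADD(pAddr, vMask);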

    //////////////////////////////////////////////////////////////////////////
    /// @brief insert a JIT call to CallPrint
    /// - outputs formatted string to both stdout and the VS output window
    /// - DEBUG builds only
    /// Usage example:
    ///   PRINT("index %d = 0x%p\n", {C(lane), pIndex});
    ///   where C(lane) creates a constant value to print, and pIndex is the Value*
    ///   result from a GEP, printing out the pointer to memory
    /// @param printStr - constant string to print, which includes format specifiers
    /// @param printArgs - initializer list of Value*'s to print to stdout
    CallInst *Builder::PRINT(const std::string &printStr, const std::initializer_list<Value*> &printArgs)
    {
        // push the arguments to CallPrint into a vector
        std::vector<Value*> printCallArgs;
        // save room for the format string.  we still need to modify it for vectors
        printCallArgs.resize(1);

        // search through the format string for special processing
        size_t pos = 0;
        std::string tempStr(printStr);
        pos = tempStr.find('%', pos);
        auto v = printArgs.begin();

        while ((pos != std::string::npos) && (v != printArgs.end()))
        {
            Value* pArg = *v;
            Type* pType = pArg->getType();

            if (pType->isVectorTy())
            {
                Type* pContainedType = pType->getContainedType(0);

                if (toupper(tempStr[pos + 1]) == 'X')
                {
                    tempStr[pos] = '0';
                    tempStr[pos + 1] = 'x';
                    tempStr.insert(pos + 2, "%08X ");
                    pos += 7;

                    printCallArgs.push_back(VEXTRACT(pArg, C(0)));

                    std::string vectorFormatStr;
                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
                    {
                        vectorFormatStr += "0x%08X ";
                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                    }

                    tempStr.insert(pos, vectorFormatStr);
                    pos += vectorFormatStr.size();
                }
                else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
                {
                    uint32_t i = 0;
                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                    {
                        tempStr.insert(pos, std::string("%f "));
                        pos += 3;
                        printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                    }
                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                }
                else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
                {
                    uint32_t i = 0;
                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                    {
                        tempStr.insert(pos, std::string("%d "));
                        pos += 3;
                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                    }
                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                }
            }
            else
            {
                if (toupper(tempStr[pos + 1]) == 'X')
                {
                    tempStr[pos] = '0';
                    tempStr.insert(pos + 1, "x%08");
                    printCallArgs.push_back(pArg);
                    pos += 3;
                }
                // for %f we need to cast float Values to doubles so that they print out correctly
                else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
                {
                    printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
                    pos++;
                }
                else
                {
                    printCallArgs.push_back(pArg);
                }
            }

            // advance to the next argument
            v++;
            pos = tempStr.find('%', ++pos);
        }

        // create global variable constant string
        Constant *constString = ConstantDataArray::getString(JM()->mContext, tempStr, true);
        GlobalVariable *gvPtr = new GlobalVariable(constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr");
        JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);

        // get a pointer to the first character in the constant string array
        std::vector<Constant*> geplist{C(0), C(0)};
        Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false);

        // insert the pointer to the format string in the argument vector
        printCallArgs[0] = strGEP;

        // get pointer to CallPrint function and insert decl into the module if needed
        std::vector<Type*> args;
        args.push_back(PointerType::get(mInt8Ty, 0));
        FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true);
        Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));

        // if we haven't yet added the symbol to the symbol table
        if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
        {
            sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
        }

        // insert a call to CallPrint
        return CALLA(callPrintFn, printCallArgs);
    }
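
    // Illustrative expansion: for a SIMD float argument, PRINT("x: %f\n", {vX})
    // rewrites the single %f into one specifier per lane and pushes each lane,
    // FP-extended to double, as a separate CallPrint argument.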

    //////////////////////////////////////////////////////////////////////////
    /// @brief Wrapper around the PRINT overload that takes an initializer list.
    CallInst* Builder::PRINT(const std::string &printStr)
    {
        return PRINT(printStr, {});
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather;

        // use avx2 gather instruction if available
        if(JM()->mArch.AVX2())
        {
            // force mask to <N x float>, required by vgather
            Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);

            vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_F();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for(uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
                Value *selMask = VEXTRACT(vMask, C(i));
                // if the lane is masked off, switch in the safe on-stack address to load from
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }
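
    // Per-lane semantics of the gather, common to both paths above:
    //   vGather[i] = vMask[i] ? *(float*)(pBase + vIndices[i] * scale) : vSrc[i]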

    Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_F_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit Integer>, required by vgather2
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            // otherwise, split into two half-width gathers and rejoin the results
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if(JM()->mArch.AVX2())
        {
            vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_I();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for(uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
                Value *selMask = VEXTRACT(vMask, C(i));
                // if the lane is masked off, switch in the safe on-stack address to load from
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress, C(0));
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }

    Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_I_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit Integer>, required by vgather2
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            // otherwise, split into two half-width gathers and rejoin the results
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if(JM()->mArch.AVX2())
        {
            vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
            Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for(uint32_t i = 0; i < mVWidth/2; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
                Value *selMask = VEXTRACT(vMask, C(i));
                // if the lane is masked off, switch in the safe on-stack address to load from
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }
            STACKRESTORE(pStack);
        }
        return vGather;
    }

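    //////////////////////////////////////////////////////////////////////////
    /// @brief Extract the lower (imm == 0) or upper (imm != 0) 8-wide half of
    ///        a 16-wide vector.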
    Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
    {
        if (imm == 0)
        {
            return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 });
        }
        else
        {
            return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 });
        }
    }

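    //////////////////////////////////////////////////////////////////////////
    /// @brief Concatenate two 8-wide vectors into a single 16-wide vector.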
    Value *Builder::JOIN_16(Value *a, Value *b)
    {
        return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
    Value *Builder::MASK(Value *vmask)
    {
        Value *src = BITCAST(vmask, mSimdInt32Ty);
        return ICMP_SLT(src, VIMMED1(0));
    }

    Value *Builder::MASK_16(Value *vmask)
    {
        Value *src = BITCAST(vmask, mSimd16Int32Ty);
        return ICMP_SLT(src, VIMMED1_16(0));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
    Value *Builder::VMASK(Value *mask)
    {
        return S_EXT(mask, mSimdInt32Ty);
    }

    Value *Builder::VMASK_16(Value *mask)
    {
        return S_EXT(mask, mSimd16Int32Ty);
    }
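
    // At a glance: the x86 mask form encodes "true" as the sign bit of each
    // 32-bit lane, so MASK tests lane < 0 while VMASK sign-extends an i1 true
    // to 0xFFFFFFFF. VMASK(MASK(v)) == v holds for lanes that are all ones or
    // all zeros.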

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it
    /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
    /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
    /// Byte masks in the lower 128bit lane of b select 8 bit values from the
    /// lower 128bits of a, and vice versa for the upper lanes.  If the mask
    /// value is negative, '0' is inserted.
    Value *Builder::PSHUFB(Value* a, Value* b)
    {
        Value* res;
        // use avx2 pshufb instruction if available
        if(JM()->mArch.AVX2())
        {
            res = VPSHUFB(a, b);
        }
        else
        {
            Constant* cB = dyn_cast<Constant>(b);
            // number of 8 bit elements in b
            uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
            // output vector
            Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));

            // insert an 8 bit value from the high and low lanes of a per loop iteration
            numElms /= 2;
            for(uint32_t i = 0; i < numElms; i++)
            {
                ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
                ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));

                // extract values from constant mask
                char valLow128bLane =  (char)(cLow128b->getSExtValue());
                char valHigh128bLane = (char)(cHigh128b->getSExtValue());

                Value* insertValLow128b;
                Value* insertValHigh128b;

                // if the mask value is negative, insert a '0' in the respective output position
                // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
                insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
                insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));

                vShuf = VINSERT(vShuf, insertValLow128b, i);
                vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
            }
            res = vShuf;
        }
        return res;
    }
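
    // Worked example for one byte position: a mask byte of 0x03 selects byte 3
    // of a from the same 128bit lane; a mask byte of 0x80 (negative) produces 0.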

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
    /// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
    /// lower 8 values are used.
    Value *Builder::PMOVSXBD(Value* a)
    {
        // VPMOVSXBD output type
        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
        // Extract 8 values from 128bit lane and sign extend
        return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
    /// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
    Value *Builder::PMOVSXWD(Value* a)
    {
        // VPMOVSXWD output type
        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
        // Extract 8 values from 128bit lane and sign extend
        return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
    }
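
    // Example lanes: an input byte 0xFF sign-extends to 0xFFFFFFFF (-1), while
    // 0x7F extends to 0x0000007F (127); PMOVSXWD behaves the same on 16bit lanes.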

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
    /// platform, emulate it
    /// @param a - 256bit SIMD lane(8x32bit) of integer values.
    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
    Value *Builder::PERMD(Value* a, Value* idx)
    {
        Value* res;
        // use avx2 permute instruction if available
        if(JM()->mArch.AVX2())
        {
            res = VPERMD(a, idx);
        }
        else
        {
            if (isa<Constant>(idx))
            {
                res = VSHUFFLE(a, a, idx);
            }
            else
            {
                res = VUNDEF_I();
                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
                {
                    Value* pIndex = VEXTRACT(idx, C(l));
                    Value* pVal = VEXTRACT(a, pIndex);
                    res = VINSERT(res, pVal, C(l));
                }
            }
        }
        return res;
    }
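
    // Per-lane semantics of the permute: res[l] = a[idx[l]], with each idx lane
    // expected in the range 0..7.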

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
    /// platform, emulate it
    /// @param a - 256bit SIMD lane(8x32bit) of float values.
    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
    Value *Builder::PERMPS(Value* a, Value* idx)
    {
        Value* res;
        // use avx2 permute instruction if available
        if (JM()->mArch.AVX2())
        {
            // llvm 3.6.0 swapped the order of the args to vpermd
            res = VPERMPS(idx, a);
        }
        else
        {
            if (isa<Constant>(idx))
            {
                res = VSHUFFLE(a, a, idx);
            }
            else
            {
                res = VUNDEF_F();
                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
                {
                    Value* pIndex = VEXTRACT(idx, C(l));
                    Value* pVal = VEXTRACT(a, pIndex);
                    res = VINSERT(res, pVal, C(l));
                }
            }
        }

        return res;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
    /// in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
    Value *Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
    {
        if (JM()->mArch.F16C())
        {
            return VCVTPH2PS(a, name);
        }
        else
        {
            FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
            Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));

            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
            }

            Value* pResult = UndefValue::get(mSimdFP32Ty);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                Value* pSrc = VEXTRACT(a, C(i));
                Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
                pResult = VINSERT(pResult, pConv, C(i));
            }

            pResult->setName(name);
            return pResult;
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
    /// in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 256bit SIMD lane(8x32bit) of float32 values.
    Value *Builder::CVTPS2PH(Value* a, Value* rounding)
    {
        if (JM()->mArch.F16C())
        {
            return VCVTPS2PH(a, rounding);
        }
        else
        {
            // call scalar C function for now
            FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
            Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));

            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
            }

            Value* pResult = UndefValue::get(mSimdInt16Ty);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                Value* pSrc = VEXTRACT(a, C(i));
                Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
                pResult = VINSERT(pResult, pConv, C(i));
            }

            return pResult;
        }
    }

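    //////////////////////////////////////////////////////////////////////////
    /// @brief Per-lane signed integer max (PMAXSD) and min (PMINSD)
    ///        equivalents, expressed as a compare followed by a select.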
    Value *Builder::PMAXSD(Value* a, Value* b)
    {
        Value* cmp = ICMP_SGT(a, b);
        return SELECT(cmp, a, b);
    }

    Value *Builder::PMINSD(Value* a, Value* b)
    {
        Value* cmp = ICMP_SLT(a, b);
        return SELECT(cmp, a, b);
    }

    void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
                          Value* mask, Value* vGatherComponents[], bool bPackedOutput)
    {
        const SWR_FORMAT_INFO &info = GetFormatInfo(format);
        if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
    }

    void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                            Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch(info.bpp / info.numComps)
        {
            case 16:
            {
                Value* vGatherResult[2];

                // TODO: vGatherMaskedVal
                Value* vGatherMaskedVal = VIMMED1((float)0);

                // always have at least one component out of x or y to fetch

                vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of first 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                //

                // if we have at least one component out of z or w to fetch
                if(info.numComps > 2)
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pSrcBase = GEP(pSrcBase, C((char)4));

                    vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                    //
                }
                else
                {
                    vGatherResult[1] = vGatherMaskedVal;
                }

                // Shuffle gathered components into place, each row is a component
                Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
            }
                break;
            case 32:
            {
                // apply defaults
                for (uint32_t i = 0; i < 4; ++i)
                {
                    vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
                }

                for(uint32_t i = 0; i < info.numComps; i++)
                {
                    uint32_t swizzleIndex = info.swizzle[i];

                    // Gather a SIMD of components
                    vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                    // offset base to the next component to gather
                    pSrcBase = GEP(pSrcBase, C((char)4));
                }
            }
                break;
            default:
                SWR_INVALID("Invalid float format");
                break;
        }
    }

    void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                            Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
            case 8:
            {
                Value* vGatherMaskedVal = VIMMED1((int32_t)0);
                Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of an 8x32bit integer gather for 8bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

                Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
            }
                break;
            case 16:
            {
                Value* vGatherResult[2];

                // TODO: vGatherMaskedVal
                Value* vGatherMaskedVal = VIMMED1((int32_t)0);

                // always have at least one component out of x or y to fetch

                vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of first 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                //

                // if we have at least one component out of z or w to fetch
                if(info.numComps > 2)
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pSrcBase = GEP(pSrcBase, C((char)4));

                    vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                    //
                }
                else
                {
                    vGatherResult[1] = vGatherMaskedVal;
                }

                // Shuffle gathered components into place, each row is a component
                Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
            }
                break;
            case 32:
            {
                // apply defaults
                for (uint32_t i = 0; i < 4; ++i)
                {
                    vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
                }

                for(uint32_t i = 0; i < info.numComps; i++)
                {
                    uint32_t swizzleIndex = info.swizzle[i];

                    // Gather a SIMD of components
                    vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                    // offset base to the next component to gather
                    pSrcBase = GEP(pSrcBase, C((char)4));
                }
            }
                break;
            default:
                SWR_INVALID("unsupported format");
                break;
        }
    }
1226 
Shuffle16bpcGather4(const SWR_FORMAT_INFO & info,Value * vGatherInput[2],Value * vGatherOutput[4],bool bPackedOutput)1227     void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1228     {
1229         // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if(bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if(info.numComps > 2)
            {
                Value* vShufResultZW = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW = BITCAST(PERMD(vShufResultZW, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for(uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if(i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                     0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1});

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for(uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result, else use the second
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

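    // Illustrative sketch (not compiled): a scalar model of the 256-bit
    // PSHUFB semantics the 16bpc shuffle above relies on. Each 128-bit lane
    // is shuffled independently, and a mask byte with its high bit set (the
    // -1 entries) zeroes the destination byte, which is what provides the
    // zero extension. RefPshufb256 is a hypothetical name used only here.
#if 0
    static void RefPshufb256(const int8_t src[32], const int8_t mask[32], int8_t dst[32])
    {
        for (uint32_t lane = 0; lane < 2; ++lane)   // two independent 128-bit lanes
        {
            const int8_t* pSrc = &src[lane * 16];
            for (uint32_t i = 0; i < 16; ++i)
            {
                int8_t m = mask[lane * 16 + i];
                dst[lane * 16 + i] = (m < 0) ? 0 : pSrc[m & 0x0F];
            }
        }
    }
#endif
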
    //////////////////////////////////////////////////////////////////////////
    /// @brief Unpack 8-bit-per-component gather results into 32-bit lanes,
    ///        applying the format swizzle and per-component defaults.
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if(bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if(info.numComps > 2)
            {
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
            for(uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if(i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for(uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch(i)
                {
                    case 0:
                        // x shuffle mask
                        vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                              0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                        break;
                    case 1:
                        // y shuffle mask
                        vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                              1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                        break;
                    case 2:
                        // z shuffle mask
                        vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                              2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                        break;
                    case 3:
                        // w shuffle mask
                        vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                              3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                        break;
                    default:
                        vConstMask = nullptr;
                        break;
                }

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

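    // Illustrative sketch (not compiled): a scalar model of PERMD (vpermd),
    // which both gather shuffles above use to regroup 128-bit chunks. Unlike
    // PSHUFB, it permutes 32-bit elements across the full 256-bit register.
    // RefPermd is a hypothetical name used only for illustration.
#if 0
    static void RefPermd(const uint32_t src[8], const uint32_t idx[8], uint32_t dst[8])
    {
        for (uint32_t i = 0; i < 8; ++i)
        {
            dst[i] = src[idx[i] & 0x7]; // only the low 3 bits of each index matter
        }
    }
#endif
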
    //////////////////////////////////////////////////////////////////////////
    /// @brief Helper function to create an alloca in the entry block of a
    ///        function; mem2reg only promotes entry-block allocas.
    Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
    {
        auto saveIP = IRB()->saveIP();
        IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
                              pFunc->getEntryBlock().begin());
        Value* pAlloca = ALLOCA(pType);
        if (saveIP.isSet()) IRB()->restoreIP(saveIP);
        return pAlloca;
    }

    Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
    {
        auto saveIP = IRB()->saveIP();
        IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
                              pFunc->getEntryBlock().begin());
        Value* pAlloca = ALLOCA(pType, pArraySize);
        if (saveIP.isSet()) IRB()->restoreIP(saveIP);
        return pAlloca;
    }

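    // Usage note (illustrative, not compiled): placing allocas in the entry
    // block lets LLVM's mem2reg pass promote them to SSA values, and keeps an
    // alloca emitted inside a loop from allocating fresh stack every
    // iteration. A minimal sketch, assuming the builder's LOAD/STORE helpers:
#if 0
    // Value* pScratch = CreateEntryAlloca(pFunc, mSimdFP32Ty); // simd scratch slot
    // STORE(vValue, pScratch);
    // ...
    // Value* vReloaded = LOAD(pScratch);  // promotable to a register by mem2reg
#endif
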
    //////////////////////////////////////////////////////////////////////////
    /// @brief emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
    {
        /* Scatter algorithm

           while (Index = BitScanForward(mask))
                srcElem = srcVector[Index]
                offsetElem = offsetVector[Index]
                *(pDst + offsetElem) = srcElem
                Update mask (mask &= ~(1 << Index))

        */

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function* pFunc = pCurBB->getParent();
        Type* pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));

        // Get cttz function
        Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CALL(pfnCttz, { pMask, C(false) });

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block
        BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

        // Remove unconditional jump created by splitBasicBlock
        pCurBB->getTerminator()->eraseFromParent();

        // Add terminator to end of original block
        IRB()->SetInsertPoint(pCurBB);

        // Add conditional branch
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem);
        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
    }

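    // Illustrative sketch (not compiled): the scalar equivalent of the loop
    // SCATTERPS emits, using tzcnt so a zero mask yields 32, matching the
    // ICMP_EQ(pIndex, C(32)) exit test above. RefScatterPS is a hypothetical
    // name; requires <immintrin.h>.
#if 0
    static void RefScatterPS(uint8_t* pDst, const float src[8], const uint32_t offsets[8], uint32_t mask)
    {
        while (mask != 0)
        {
            uint32_t index = _tzcnt_u32(mask);              // first valid lane
            *(float*)(pDst + offsets[index]) = src[index];  // scatter one element
            mask &= ~(1u << index);                         // retire that lane
        }
    }
#endif
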
    //////////////////////////////////////////////////////////////////////////
    /// @brief Absolute value of packed floats by clearing the sign bit.
    Value* Builder::VABSPS(Value* a)
    {
        Value* asInt = BITCAST(a, mSimdInt32Ty);
        Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
        return result;
    }

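    // Illustrative sketch (not compiled): the same sign-bit trick in scalar
    // form. Clearing bit 31 is branchless and, unlike compare-and-negate,
    // also maps -0.0f to +0.0f; requires <cstring> for memcpy.
#if 0
    static float RefAbsF32(float x)
    {
        uint32_t bits;
        memcpy(&bits, &x, sizeof(bits)); // bit-exact view of the float
        bits &= 0x7FFFFFFFu;             // clear the IEEE-754 sign bit
        memcpy(&x, &bits, sizeof(x));
        return x;
    }
#endif
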
    //////////////////////////////////////////////////////////////////////////
    /// @brief Clamp a signed integer vector to [low, high].
    Value *Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
    {
        Value *lowCmp = ICMP_SLT(src, low);
        Value *ret = SELECT(lowCmp, low, src);

        Value *highCmp = ICMP_SGT(ret, high);
        ret = SELECT(highCmp, high, ret, name);

        return ret;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Clamp a float vector to [low, high] using ordered compares.
    Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
    {
        Value *lowCmp = FCMP_OLT(src, low);
        Value *ret = SELECT(lowCmp, low, src);

        Value *highCmp = FCMP_OGT(ret, high);
        ret = SELECT(highCmp, high, ret);

        return ret;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Clamp a float vector to immediate [low, high] via min/max.
    Value *Builder::FCLAMP(Value* src, float low, float high)
    {
        Value* result = VMAXPS(src, VIMMED1(low));
        result = VMINPS(result, VIMMED1(high));

        return result;
    }

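    // Note: the compare/select FCLAMP and the min/max FCLAMP are not
    // interchangeable for NaN inputs, since x86 maxps/minps return their
    // second source operand on an unordered compare. A scalar model of the
    // min/max form, assuming VMAXPS(src, bound) maps src to the first
    // operand (illustrative, not compiled):
#if 0
    static float RefClampF32(float x, float low, float high)
    {
        float r = (x > low) ? x : low;  // maxps(x, low): NaN x yields low
        return (r < high) ? r : high;   // minps(r, high): NaN r yields high
    }
#endif
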
    //////////////////////////////////////////////////////////////////////////
    /// @brief save/restore stack, providing ability to push/pop the stack and
    ///        reduce overall stack requirements for temporary stack use
    Value* Builder::STACKSAVE()
    {
        Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
        return CALLA(pfnStackSave);
    }

    void Builder::STACKRESTORE(Value* pSaved)
    {
        Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
        CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
    }

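    // Usage note (illustrative, not compiled): bracketing short-lived allocas
    // between STACKSAVE and STACKRESTORE releases their storage at the
    // restore point, so repeated temporaries don't accumulate stack space.
#if 0
    // Value* pSP  = STACKSAVE();
    // Value* pTmp = ALLOCA(mSimdFP32Ty);   // short-lived scratch
    // ... use pTmp ...
    // STACKRESTORE(pSP);                   // pTmp's stack slot is dead here
#endif
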
    //////////////////////////////////////////////////////////////////////////
    /// @brief Fused multiply-add: a * b + c.
    Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
    {
        Value* vOut;
        // use FMADs if available
        if(JM()->mArch.AVX2())
        {
            vOut = VFMADDPS(a, b, c);
        }
        else
        {
            vOut = FADD(FMUL(a, b), c);
        }
        return vOut;
    }

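    // Note: the two paths round differently; a fused multiply-add rounds
    // once, while FADD(FMUL(a, b), c) rounds after the multiply and again
    // after the add. Scalar illustration (not compiled, requires <cmath>):
#if 0
    static float RefFmadd(float a, float b, float c, bool hasFma)
    {
        return hasFma ? std::fma(a, b, c) // single rounding, as on the FMA path
                      : (a * b) + c;      // two roundings, as on the fallback path
    }
#endif
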
    //////////////////////////////////////////////////////////////////////////
    /// @brief Population count via the llvm.ctpop intrinsic.
    Value* Builder::POPCNT(Value* a)
    {
        Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
        return CALL(pCtPop, std::initializer_list<Value*>{a});
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief C functions called by LLVM IR
    //////////////////////////////////////////////////////////////////////////

    //////////////////////////////////////////////////////////////////////////
    /// @brief called in JIT code, inserted by PRINT
    /// output to both stdout and visual studio debug console
    void __cdecl CallPrint(const char* fmt, ...)
    {
        va_list args;
        va_start(args, fmt);

    #if defined( _WIN32 )
        // vprintf below consumes the va_list, so format the debug-console
        // string from a copy
        va_list argsCopy;
        va_copy(argsCopy, args);
        char strBuf[1024];
        vsnprintf_s(strBuf, _TRUNCATE, fmt, argsCopy);
        OutputDebugStringA(strBuf);
        va_end(argsCopy);
    #endif

        vprintf(fmt, args);
        va_end(args);
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Emulate vextracti128: select the low or high 128-bit half of a.
    Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
    {
        bool flag = !imm8->isZeroValue();
        SmallVector<Constant*, 8> idx;
        for (unsigned i = 0; i < mVWidth / 2; i++)
        {
            idx.push_back(C(flag ? i + mVWidth / 2 : i));
        }
        return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Emulate vinserti128: replace the low or high 128-bit half of a
    ///        with b.
    Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
    {
        bool flag = !imm8->isZeroValue();
        SmallVector<Constant*, 8> idx;
        for (unsigned i = 0; i < mVWidth; i++)
        {
            idx.push_back(C(i));
        }
        Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));

        SmallVector<Constant*, 8> idx2;
        for (unsigned i = 0; i < mVWidth / 2; i++)
        {
            idx2.push_back(C(flag ? i : i + mVWidth));
        }
        for (unsigned i = mVWidth / 2; i < mVWidth; i++)
        {
            idx2.push_back(C(flag ? i + mVWidth / 2 : i));
        }
        return VSHUFFLE(a, inter, ConstantVector::get(idx2));
    }

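    // Worked example (assuming mVWidth == 8): VEXTRACTI128 with imm8 == 1
    // builds the shuffle indices {4,5,6,7} and so selects the upper 128-bit
    // half of a. VINSERTI128 first widens b to 8 lanes (indices {0..7}, the
    // upper four undefined), then with imm8 == 1 builds {0,1,2,3,8,9,10,11}:
    // lanes 0-3 come from a and lanes 8-11 are b's four elements, replacing
    // the upper half of a.
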
    // rdtsc buckets macros
    void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
    {
        // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
        // buckets framework when single threaded
        if (KNOB_SINGLE_THREADED)
        {
            std::vector<Type*> args{
                PointerType::get(mInt32Ty, 0),  // pBucketMgr
                mInt32Ty                        // id
            };

            FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
            Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
            }

            CALL(pFunc, { pBucketMgr, pId });
        }
    }

    void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
    {
        // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
        // buckets framework when single threaded
        if (KNOB_SINGLE_THREADED)
        {
            std::vector<Type*> args{
                PointerType::get(mInt32Ty, 0),  // pBucketMgr
                mInt32Ty                        // id
            };

            FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
            Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
            }

            CALL(pFunc, { pBucketMgr, pId });
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Compute the size in bytes of a type.
    uint32_t Builder::GetTypeSize(Type* pType)
    {
        if (pType->isStructTy())
        {
            // note: assumes a homogeneous struct and ignores alignment padding
            uint32_t numElems = pType->getStructNumElements();
            Type* pElemTy = pType->getStructElementType(0);
            return numElems * GetTypeSize(pElemTy);
        }

        if (pType->isArrayTy())
        {
            uint32_t numElems = pType->getArrayNumElements();
            Type* pElemTy = pType->getArrayElementType();
            return numElems * GetTypeSize(pElemTy);
        }

        if (pType->isIntegerTy())
        {
            uint32_t bitSize = pType->getIntegerBitWidth();
            return bitSize / 8;
        }

        if (pType->isFloatTy())
        {
            return 4;
        }

        if (pType->isHalfTy())
        {
            return 2;
        }

        if (pType->isDoubleTy())
        {
            return 8;
        }

        SWR_ASSERT(false, "Unimplemented type.");
        return 0;
    }
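
    // Where an ABI-exact size (including padding) is needed, LLVM's
    // DataLayout could be consulted instead of the recursive walk above;
    // a minimal sketch, assuming the current module's layout matches the
    // JIT target (not compiled):
#if 0
    // uint32_t allocSize = (uint32_t)JM()->mpCurrentModule->getDataLayout().getTypeAllocSize(pType);
#endif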
} // namespace SwrJit