/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file builder_misc.cpp
 *
 * @brief Implementation for miscellaneous builder functions
 *
 * Notes:
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"

#include <cstdarg>

namespace SwrJit
{
    void Builder::AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }

    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst*
    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name,
                            Type*                                  Ty,
                            MEM_CLIENT                             usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }
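
    // Usage sketch for the index-list overload above (illustrative; 'pState',
    // the field index, and the reliance on the header's default arguments for
    // Ty/usage are assumptions, not taken from this file):
    //
    //     Value* pVal = LOAD(pState, {0, 1}, "memberVal");
    //
    // builds GEP(pState, {C(0), C(1)}) and loads through the resulting
    // pointer, i.e. it reads the second member of the struct that pState
    // points to.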

    LoadInst* Builder::LOADV(Value*                               basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine&                   name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst* Builder::STORE(Value*                                 val,
                              Value*                                 basePtr,
                              const std::initializer_list<uint32_t>& indices,
                              Type*                                  Ty,
                              MEM_CLIENT                             usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst*
    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }

    Value* Builder::MEM_ADD(Value*                                 i32Incr,
                            Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name)
    {
        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }
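
    // MEM_ADD above is a scalar read-modify-write helper: it loads the 32-bit
    // value at GEP(basePtr, indices), adds i32Incr, and stores the sum back to
    // the same address, returning the store. In effect:
    //
    //     *pField += i32Incr;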

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPS(Value*     vSrc,
                             Value*     pBase,
                             Value*     vIndices,
                             Value*     vMask,
                             uint8_t    scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value*     vSrc,
                             Value*     pBase,
                             Value*     vIndices,
                             Value*     vMask,
                             uint8_t    scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value*
    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, AlignType(4), pVecMask, pVecPassthru);
    }

    void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
    {
        MASKED_SCATTER(pVecSrc, pVecDstPtr, AlignType(4), pVecMask);
    }

    void Builder::Gather4(const SWR_FORMAT format,
                          Value*           pSrcBase,
                          Value*           byteOffsets,
                          Value*           mask,
                          Value*           vGatherComponents[],
                          bool             bPackedOutput,
                          MEM_CLIENT       usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }
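
    // Usage sketch for Gather4 (illustrative; the format value and variable
    // names are assumptions, and the trailing MEM_CLIENT argument is assumed
    // to be defaulted in the header): gathering a 4-component 32-bit float
    // attribute for every SIMD lane from a vertex buffer might look like
    //
    //     Value* comps[4];
    //     Gather4(R32G32B32A32_FLOAT, pVB, vByteOffsets, vLaneMask, comps, false);
    //
    // Gather4 dispatches to GATHER4PS for 32-bit float formats and to
    // GATHER4DD for everything else; results land in comps[] indexed by the
    // format swizzle, and components missing from the format are filled with
    // the format defaults.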

    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            MEM_CLIENT             usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }
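
    // Summary of the float paths above: 32bpc formats issue one GATHERPS per
    // enabled component, advancing the base pointer by 4 bytes between
    // components; 16bpc formats issue at most two 32-bit gathers (the xy pair
    // and, when present, the zw pair) and rely on Shuffle16bpcGather4 to split
    // the packed halves into per-component vectors. GATHER4DD below mirrors
    // this for integer data and additionally handles 8bpc formats with a
    // single gather.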

    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            MEM_CLIENT             usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }
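
    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle two 16bpc gather results (packed xy pairs and zw pairs)
    ///        into per-component SIMD vectors. The callers above select this
    ///        path when info.bpp / info.numComps == 16, e.g. a 4-component
    ///        64bpp format.
    /// @param info - format info for the gathered data
    /// @param vGatherInput - the xy gather result and the zw gather result
    /// @param vGatherOutput - output vectors, one per component
    /// @param bPackedOutput - keep components packed into 128-bit lanes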
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value*                 vGatherInput[2],
                                      Value*                 vGatherOutput[4],
                                      bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                            mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first (xy) gather result, else the second (zw) gather result
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }
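
    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle an 8bpc gather result into per-component SIMD vectors.
    ///        A single 32-bit gather brings in all four bytes per lane; PSHUFB
    ///        then either packs each component into 128-bit lanes (packed
    ///        output) or isolates one byte per 32-bit lane, zero extended
    ///        (unpacked output).
    /// @param info - format info for the gathered data
    /// @param vGatherInput - result of the 8bpc gather
    /// @param vGatherOutput - output vectors, one per component
    /// @param bPackedOutput - keep components packed into 128-bit lanes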
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value*                 vGatherInput,
                                     Value*                 vGatherOutput[],
                                     bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                         mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // sign extend all enabled components. If we have a full vVertexElements, output to
            // current simdvertex
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                assert(vConstMask && "Invalid info.numComps value");
                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);
#if LLVM_VERSION_MAJOR >= 11
        SWR_ASSERT(cast<VectorType>(vSrc->getType())->getElementType()->isFloatTy());
#else
        SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
#endif
        VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
        return;

        /* Scatter algorithm

           while (Index = BitScanForward(mask))
                srcElem    = srcVector[Index]
                offsetElem = offsetVector[Index]
                *(pDst + offsetElem) = srcElem
                Update mask (mask &= ~(1 << Index))

        */
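
        // Worked example of the mask update in the algorithm above: with
        // mask = 0b1011, BitScanForward yields index 0, that lane is written,
        // and mask &= ~(1 << 0) leaves 0b1010. The loop exits once the mask is
        // zero, which the reference code below detects via CTTZ(0) == 32.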

        /*

        // Reference implementation kept around for reference

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function*   pFunc  = pCurBB->getParent();
        Type*       pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSK(vMask);

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CTTZ(pMask, C(false));

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block or create new one if building inline
        BasicBlock* pPostLoop;
        if (pCurBB->getTerminator())
        {
            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

            // Remove unconditional jump created by splitBasicBlock
            pCurBB->getTerminator()->eraseFromParent();

            // Add terminator to end of original block
            IRB()->SetInsertPoint(pCurBB);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }
        else
        {
            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi  = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
        pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CTTZ(pNewMask, C(false));

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());

        */
    }
} // namespace SwrJit