1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "LLVMReactor.hpp"
16
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "LLVMReactorDebugInfo.hpp"
20 #include "Print.hpp"
21 #include "Reactor.hpp"
22 #include "x86.hpp"
23
24 #include "llvm/IR/Intrinsics.h"
25 #include "llvm/IR/IntrinsicsX86.h"
26 #include "llvm/Support/Alignment.h"
27 #include "llvm/Support/Error.h"
28 #include "llvm/Support/ManagedStatic.h"
29
30 #include <fstream>
31 #include <iostream>
32 #include <mutex>
33 #include <numeric>
34 #include <thread>
35 #include <unordered_map>
36
37 #if defined(__i386__) || defined(__x86_64__)
38 # include <xmmintrin.h>
39 #endif
40
41 #include <math.h>
42
43 #if defined(__x86_64__) && defined(_WIN32)
X86CompilationCallback()44 extern "C" void X86CompilationCallback()
45 {
46 UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
47 }
48 #endif
49
50 #if !LLVM_ENABLE_THREADS
51 # error "LLVM_ENABLE_THREADS needs to be enabled"
52 #endif
53
54 #if LLVM_VERSION_MAJOR < 11
55 namespace llvm {
56 using FixedVectorType = VectorType;
57 } // namespace llvm
58 #endif
59
60 namespace {
61
62 // Used to automatically invoke llvm_shutdown() when driver is unloaded
63 llvm::llvm_shutdown_obj llvmShutdownObj;
64
65 // This has to be a raw pointer because glibc 2.17 doesn't support __cxa_thread_atexit_impl
66 // for destructing objects at exit. See crbug.com/1074222
67 thread_local rr::JITBuilder *jit = nullptr;
68
69 // Default configuration settings. Must be accessed under mutex lock.
70 std::mutex defaultConfigLock;
defaultConfig()71 rr::Config &defaultConfig()
72 {
73 // This uses a static in a function to avoid the cost of a global static
74 // initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
75 static rr::Config config = rr::Config::Edit()
76 .add(rr::Optimization::Pass::ScalarReplAggregates)
77 .add(rr::Optimization::Pass::InstructionCombining)
78 .apply({});
79 return config;
80 }
81
lowerPAVG(llvm::Value * x,llvm::Value * y)82 llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
83 {
84 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
85
86 llvm::VectorType *extTy =
87 llvm::VectorType::getExtendedElementVectorType(ty);
88 x = jit->builder->CreateZExt(x, extTy);
89 y = jit->builder->CreateZExt(y, extTy);
90
91 // (x + y + 1) >> 1
92 llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
93 llvm::Value *res = jit->builder->CreateAdd(x, y);
94 res = jit->builder->CreateAdd(res, one);
95 res = jit->builder->CreateLShr(res, one);
96 return jit->builder->CreateTrunc(res, ty);
97 }
98
lowerPMINMAX(llvm::Value * x,llvm::Value * y,llvm::ICmpInst::Predicate pred)99 llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
100 llvm::ICmpInst::Predicate pred)
101 {
102 return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
103 }
104
lowerPCMP(llvm::ICmpInst::Predicate pred,llvm::Value * x,llvm::Value * y,llvm::Type * dstTy)105 llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
106 llvm::Value *y, llvm::Type *dstTy)
107 {
108 return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
109 }
110
lowerPFMINMAX(llvm::Value * x,llvm::Value * y,llvm::FCmpInst::Predicate pred)111 [[maybe_unused]] llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
112 llvm::FCmpInst::Predicate pred)
113 {
114 return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
115 }
116
lowerRound(llvm::Value * x)117 [[maybe_unused]] llvm::Value *lowerRound(llvm::Value *x)
118 {
119 llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
120 jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
121 return jit->builder->CreateCall(nearbyint, { x });
122 }
123
lowerRoundInt(llvm::Value * x,llvm::Type * ty)124 [[maybe_unused]] llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
125 {
126 return jit->builder->CreateFPToSI(lowerRound(x), ty);
127 }
128
lowerFloor(llvm::Value * x)129 [[maybe_unused]] llvm::Value *lowerFloor(llvm::Value *x)
130 {
131 llvm::Function *floor = llvm::Intrinsic::getDeclaration(
132 jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
133 return jit->builder->CreateCall(floor, { x });
134 }
135
lowerTrunc(llvm::Value * x)136 [[maybe_unused]] llvm::Value *lowerTrunc(llvm::Value *x)
137 {
138 llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
139 jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
140 return jit->builder->CreateCall(trunc, { x });
141 }
142
lowerSQRT(llvm::Value * x)143 [[maybe_unused]] llvm::Value *lowerSQRT(llvm::Value *x)
144 {
145 llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
146 jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
147 return jit->builder->CreateCall(sqrt, { x });
148 }
149
lowerRCP(llvm::Value * x)150 [[maybe_unused]] llvm::Value *lowerRCP(llvm::Value *x)
151 {
152 llvm::Type *ty = x->getType();
153 llvm::Constant *one;
154 if(llvm::FixedVectorType *vectorTy = llvm::dyn_cast<llvm::FixedVectorType>(ty))
155 {
156 one = llvm::ConstantVector::getSplat(
157 #if LLVM_VERSION_MAJOR >= 11
158 vectorTy->getElementCount(),
159 #else
160 vectorTy->getNumElements(),
161 #endif
162 llvm::ConstantFP::get(vectorTy->getElementType(), 1));
163 }
164 else
165 {
166 one = llvm::ConstantFP::get(ty, 1);
167 }
168 return jit->builder->CreateFDiv(one, x);
169 }
170
lowerRSQRT(llvm::Value * x)171 [[maybe_unused]] llvm::Value *lowerRSQRT(llvm::Value *x)
172 {
173 return lowerRCP(lowerSQRT(x));
174 }
175
lowerVectorShl(llvm::Value * x,uint64_t scalarY)176 [[maybe_unused]] llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
177 {
178 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
179 llvm::Value *y = llvm::ConstantVector::getSplat(
180 #if LLVM_VERSION_MAJOR >= 11
181 ty->getElementCount(),
182 #else
183 ty->getNumElements(),
184 #endif
185 llvm::ConstantInt::get(ty->getElementType(), scalarY));
186 return jit->builder->CreateShl(x, y);
187 }
188
lowerVectorAShr(llvm::Value * x,uint64_t scalarY)189 [[maybe_unused]] llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
190 {
191 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
192 llvm::Value *y = llvm::ConstantVector::getSplat(
193 #if LLVM_VERSION_MAJOR >= 11
194 ty->getElementCount(),
195 #else
196 ty->getNumElements(),
197 #endif
198 llvm::ConstantInt::get(ty->getElementType(), scalarY));
199 return jit->builder->CreateAShr(x, y);
200 }
201
lowerVectorLShr(llvm::Value * x,uint64_t scalarY)202 [[maybe_unused]] llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
203 {
204 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
205 llvm::Value *y = llvm::ConstantVector::getSplat(
206 #if LLVM_VERSION_MAJOR >= 11
207 ty->getElementCount(),
208 #else
209 ty->getNumElements(),
210 #endif
211 llvm::ConstantInt::get(ty->getElementType(), scalarY));
212 return jit->builder->CreateLShr(x, y);
213 }
214
lowerShuffleVector(llvm::Value * v1,llvm::Value * v2,llvm::ArrayRef<int> select)215 llvm::Value *lowerShuffleVector(llvm::Value *v1, llvm::Value *v2, llvm::ArrayRef<int> select)
216 {
217 int size = select.size();
218 const int maxSize = 16;
219 llvm::Constant *swizzle[maxSize];
220 ASSERT(size <= maxSize);
221
222 for(int i = 0; i < size; i++)
223 {
224 swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), select[i]);
225 }
226
227 llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
228
229 return jit->builder->CreateShuffleVector(v1, v2, shuffle);
230 }
231
lowerMulAdd(llvm::Value * x,llvm::Value * y)232 [[maybe_unused]] llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
233 {
234 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
235 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
236
237 llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
238 llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
239 llvm::Value *mult = jit->builder->CreateMul(extX, extY);
240
241 llvm::Value *undef = llvm::UndefValue::get(extTy);
242
243 llvm::SmallVector<int, 16> evenIdx;
244 llvm::SmallVector<int, 16> oddIdx;
245 for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
246 {
247 evenIdx.push_back(i);
248 oddIdx.push_back(i + 1);
249 }
250
251 llvm::Value *lhs = lowerShuffleVector(mult, undef, evenIdx);
252 llvm::Value *rhs = lowerShuffleVector(mult, undef, oddIdx);
253 return jit->builder->CreateAdd(lhs, rhs);
254 }
255
lowerPack(llvm::Value * x,llvm::Value * y,bool isSigned)256 [[maybe_unused]] llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
257 {
258 llvm::FixedVectorType *srcTy = llvm::cast<llvm::FixedVectorType>(x->getType());
259 llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
260
261 llvm::IntegerType *dstElemTy =
262 llvm::cast<llvm::IntegerType>(dstTy->getElementType());
263
264 uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
265 ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
266 llvm::Constant *max, *min;
267 if(isSigned)
268 {
269 max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
270 min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
271 }
272 else
273 {
274 max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
275 min = llvm::ConstantInt::get(srcTy, 0, false);
276 }
277
278 x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
279 x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
280 y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
281 y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
282
283 x = jit->builder->CreateTrunc(x, dstTy);
284 y = jit->builder->CreateTrunc(y, dstTy);
285
286 llvm::SmallVector<int, 16> index(srcTy->getNumElements() * 2);
287 std::iota(index.begin(), index.end(), 0);
288
289 return lowerShuffleVector(x, y, index);
290 }
291
lowerSignMask(llvm::Value * x,llvm::Type * retTy)292 [[maybe_unused]] llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
293 {
294 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
295 llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
296 llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
297
298 llvm::Value *ret = jit->builder->CreateZExt(
299 jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
300 for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
301 {
302 llvm::Value *elem = jit->builder->CreateZExt(
303 jit->builder->CreateExtractElement(cmp, i), retTy);
304 ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
305 }
306 return ret;
307 }
308
lowerFPSignMask(llvm::Value * x,llvm::Type * retTy)309 [[maybe_unused]] llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
310 {
311 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
312 llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
313 llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
314
315 llvm::Value *ret = jit->builder->CreateZExt(
316 jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
317 for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
318 {
319 llvm::Value *elem = jit->builder->CreateZExt(
320 jit->builder->CreateExtractElement(cmp, i), retTy);
321 ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
322 }
323 return ret;
324 }
325
lowerPUADDSAT(llvm::Value * x,llvm::Value * y)326 llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
327 {
328 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
329 }
330
lowerPSADDSAT(llvm::Value * x,llvm::Value * y)331 llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
332 {
333 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
334 }
335
lowerPUSUBSAT(llvm::Value * x,llvm::Value * y)336 llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
337 {
338 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
339 }
340
lowerPSSUBSAT(llvm::Value * x,llvm::Value * y)341 llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
342 {
343 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
344 }
345
lowerMulHigh(llvm::Value * x,llvm::Value * y,bool sext)346 llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
347 {
348 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
349 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
350
351 llvm::Value *extX, *extY;
352 if(sext)
353 {
354 extX = jit->builder->CreateSExt(x, extTy);
355 extY = jit->builder->CreateSExt(y, extTy);
356 }
357 else
358 {
359 extX = jit->builder->CreateZExt(x, extTy);
360 extY = jit->builder->CreateZExt(y, extTy);
361 }
362
363 llvm::Value *mult = jit->builder->CreateMul(extX, extY);
364
365 llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
366 llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
367 return jit->builder->CreateTrunc(mulh, ty);
368 }
369
370 } // namespace
371
372 namespace rr {
373
backendName()374 std::string Caps::backendName()
375 {
376 return std::string("LLVM ") + LLVM_VERSION_STRING;
377 }
378
coroutinesSupported()379 bool Caps::coroutinesSupported()
380 {
381 return true;
382 }
383
fmaIsFast()384 bool Caps::fmaIsFast()
385 {
386 static bool AVX2 = CPUID::supportsAVX2(); // Also checks for FMA support
387
388 // If x86 FMA instructions are supported, assume LLVM will emit them instead of making calls to std::fma().
389 return AVX2;
390 }
391
392 // The abstract Type* types are implemented as LLVM types, except that
393 // 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
394 // and VFP in ARM, and eliminate the overhead of converting them to explicit
395 // 128-bit ones. LLVM types are pointers, so we can represent emulated types
396 // as abstract pointers with small enum values.
397 enum InternalType : uintptr_t
398 {
399 // Emulated types:
400 Type_v2i32,
401 Type_v4i16,
402 Type_v2i16,
403 Type_v8i8,
404 Type_v4i8,
405 Type_v2f32,
406 EmulatedTypeCount,
407 // Returned by asInternalType() to indicate that the abstract Type*
408 // should be interpreted as LLVM type pointer:
409 Type_LLVM
410 };
411
asInternalType(Type * type)412 inline InternalType asInternalType(Type *type)
413 {
414 InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
415 return (t < EmulatedTypeCount) ? t : Type_LLVM;
416 }
417
T(Type * t)418 llvm::Type *T(Type *t)
419 {
420 // Use 128-bit vectors to implement logically shorter ones.
421 switch(asInternalType(t))
422 {
423 case Type_v2i32: return T(Int4::type());
424 case Type_v4i16: return T(Short8::type());
425 case Type_v2i16: return T(Short8::type());
426 case Type_v8i8: return T(Byte16::type());
427 case Type_v4i8: return T(Byte16::type());
428 case Type_v2f32: return T(Float4::type());
429 case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
430 default:
431 UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
432 return nullptr;
433 }
434 }
435
T(InternalType t)436 Type *T(InternalType t)
437 {
438 return reinterpret_cast<Type *>(t);
439 }
440
T(const std::vector<Type * > & t)441 inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
442 {
443 return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
444 }
445
B(BasicBlock * t)446 inline llvm::BasicBlock *B(BasicBlock *t)
447 {
448 return reinterpret_cast<llvm::BasicBlock *>(t);
449 }
450
B(llvm::BasicBlock * t)451 inline BasicBlock *B(llvm::BasicBlock *t)
452 {
453 return reinterpret_cast<BasicBlock *>(t);
454 }
455
typeSize(Type * type)456 static size_t typeSize(Type *type)
457 {
458 switch(asInternalType(type))
459 {
460 case Type_v2i32: return 8;
461 case Type_v4i16: return 8;
462 case Type_v2i16: return 4;
463 case Type_v8i8: return 8;
464 case Type_v4i8: return 4;
465 case Type_v2f32: return 8;
466 case Type_LLVM:
467 {
468 llvm::Type *t = T(type);
469
470 if(t->isPointerTy())
471 {
472 return sizeof(void *);
473 }
474
475 // At this point we should only have LLVM 'primitive' types.
476 unsigned int bits = t->getPrimitiveSizeInBits();
477 ASSERT_MSG(bits != 0, "bits: %d", int(bits));
478
479 // TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
480 // but are typically stored as one byte. The DataLayout structure should
481 // be used here and many other places if this assumption fails.
482 return (bits + 7) / 8;
483 }
484 break;
485 default:
486 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
487 return 0;
488 }
489 }
490
elementCount(Type * type)491 static unsigned int elementCount(Type *type)
492 {
493 switch(asInternalType(type))
494 {
495 case Type_v2i32: return 2;
496 case Type_v4i16: return 4;
497 case Type_v2i16: return 2;
498 case Type_v8i8: return 8;
499 case Type_v4i8: return 4;
500 case Type_v2f32: return 2;
501 case Type_LLVM: return llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();
502 default:
503 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
504 return 0;
505 }
506 }
507
createFunction(const char * name,llvm::Type * retTy,const std::vector<llvm::Type * > & params)508 static llvm::Function *createFunction(const char *name, llvm::Type *retTy, const std::vector<llvm::Type *> ¶ms)
509 {
510 llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
511 auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
512
513 func->setLinkage(llvm::GlobalValue::ExternalLinkage);
514 func->setDoesNotThrow();
515 func->setCallingConv(llvm::CallingConv::C);
516
517 if(__has_feature(memory_sanitizer))
518 {
519 func->addFnAttr(llvm::Attribute::SanitizeMemory);
520 }
521
522 func->addFnAttr("warn-stack-size", "524288"); // Warn when a function uses more than 512 KiB of stack memory
523
524 return func;
525 }
526
Nucleus()527 Nucleus::Nucleus()
528 {
529 #if !__has_feature(memory_sanitizer)
530 // thread_local variables in shared libraries are initialized at load-time,
531 // but this is not observed by MemorySanitizer if the loader itself was not
532 // instrumented, leading to false-positive uninitialized variable errors.
533 ASSERT(jit == nullptr);
534 ASSERT(Variable::unmaterializedVariables == nullptr);
535 #endif
536
537 jit = new JITBuilder(Nucleus::getDefaultConfig());
538 Variable::unmaterializedVariables = new Variable::UnmaterializedVariables();
539 }
540
~Nucleus()541 Nucleus::~Nucleus()
542 {
543 delete Variable::unmaterializedVariables;
544 Variable::unmaterializedVariables = nullptr;
545
546 delete jit;
547 jit = nullptr;
548 }
549
setDefaultConfig(const Config & cfg)550 void Nucleus::setDefaultConfig(const Config &cfg)
551 {
552 std::unique_lock<std::mutex> lock(::defaultConfigLock);
553 ::defaultConfig() = cfg;
554 }
555
adjustDefaultConfig(const Config::Edit & cfgEdit)556 void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
557 {
558 std::unique_lock<std::mutex> lock(::defaultConfigLock);
559 auto &config = ::defaultConfig();
560 config = cfgEdit.apply(config);
561 }
562
getDefaultConfig()563 Config Nucleus::getDefaultConfig()
564 {
565 std::unique_lock<std::mutex> lock(::defaultConfigLock);
566 return ::defaultConfig();
567 }
568
acquireRoutine(const char * name,const Config::Edit * cfgEdit)569 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit *cfgEdit /* = nullptr */)
570 {
571 if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
572 {
573 llvm::Type *type = jit->function->getReturnType();
574
575 if(type->isVoidTy())
576 {
577 createRetVoid();
578 }
579 else
580 {
581 createRet(V(llvm::UndefValue::get(type)));
582 }
583 }
584
585 std::shared_ptr<Routine> routine;
586
587 auto acquire = [&](rr::JITBuilder *jit) {
588 // ::jit is thread-local, so when this is executed on a separate thread (see JIT_IN_SEPARATE_THREAD)
589 // it needs to only use the jit variable passed in as an argument.
590
591 Config cfg = jit->config;
592 if(cfgEdit)
593 {
594 cfg = cfgEdit->apply(jit->config);
595 }
596
597 #ifdef ENABLE_RR_DEBUG_INFO
598 if(jit->debugInfo != nullptr)
599 {
600 jit->debugInfo->Finalize();
601 }
602 #endif // ENABLE_RR_DEBUG_INFO
603
604 if(false)
605 {
606 std::error_code error;
607 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
608 jit->module->print(file, 0);
609 }
610
611 jit->runPasses(cfg);
612
613 if(false)
614 {
615 std::error_code error;
616 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
617 jit->module->print(file, 0);
618 }
619
620 routine = jit->acquireRoutine(name, &jit->function, 1, cfg);
621 };
622
623 #ifdef JIT_IN_SEPARATE_THREAD
624 // Perform optimizations and codegen in a separate thread to avoid stack overflow.
625 // FIXME(b/149829034): This is not a long-term solution. Reactor has no control
626 // over the threading and stack sizes of its users, so this should be addressed
627 // at a higher level instead.
628 std::thread thread(acquire, jit);
629 thread.join();
630 #else
631 acquire(jit);
632 #endif
633
634 return routine;
635 }
636
allocateStackVariable(Type * type,int arraySize)637 Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
638 {
639 // Need to allocate it in the entry block for mem2reg to work
640 llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
641
642 llvm::Instruction *declaration;
643
644 #if LLVM_VERSION_MAJOR >= 11
645 auto align = jit->module->getDataLayout().getPrefTypeAlign(T(type));
646 #else
647 auto align = llvm::MaybeAlign(jit->module->getDataLayout().getPrefTypeAlignment(T(type)));
648 #endif
649
650 if(arraySize)
651 {
652 Value *size = (sizeof(size_t) == 8) ? Nucleus::createConstantLong(arraySize) : Nucleus::createConstantInt(arraySize);
653 declaration = new llvm::AllocaInst(T(type), 0, V(size), align);
654 }
655 else
656 {
657 declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr, align);
658 }
659
660 entryBlock.getInstList().push_front(declaration);
661
662 return V(declaration);
663 }
664
createBasicBlock()665 BasicBlock *Nucleus::createBasicBlock()
666 {
667 return B(llvm::BasicBlock::Create(*jit->context, "", jit->function));
668 }
669
getInsertBlock()670 BasicBlock *Nucleus::getInsertBlock()
671 {
672 return B(jit->builder->GetInsertBlock());
673 }
674
setInsertBlock(BasicBlock * basicBlock)675 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
676 {
677 // assert(jit->builder->GetInsertBlock()->back().isTerminator());
678
679 jit->builder->SetInsertPoint(B(basicBlock));
680 }
681
createFunction(Type * ReturnType,const std::vector<Type * > & Params)682 void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
683 {
684 jit->function = rr::createFunction("", T(ReturnType), T(Params));
685
686 #ifdef ENABLE_RR_DEBUG_INFO
687 jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
688 #endif // ENABLE_RR_DEBUG_INFO
689
690 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->function));
691 }
692
getArgument(unsigned int index)693 Value *Nucleus::getArgument(unsigned int index)
694 {
695 llvm::Function::arg_iterator args = jit->function->arg_begin();
696
697 while(index)
698 {
699 args++;
700 index--;
701 }
702
703 return V(&*args);
704 }
705
createRetVoid()706 void Nucleus::createRetVoid()
707 {
708 RR_DEBUG_INFO_UPDATE_LOC();
709
710 ASSERT_MSG(jit->function->getReturnType() == T(Void::type()), "Return type mismatch");
711
712 // Code generated after this point is unreachable, so any variables
713 // being read can safely return an undefined value. We have to avoid
714 // materializing variables after the terminator ret instruction.
715 Variable::killUnmaterialized();
716
717 jit->builder->CreateRetVoid();
718 }
719
createRet(Value * v)720 void Nucleus::createRet(Value *v)
721 {
722 RR_DEBUG_INFO_UPDATE_LOC();
723
724 ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
725
726 // Code generated after this point is unreachable, so any variables
727 // being read can safely return an undefined value. We have to avoid
728 // materializing variables after the terminator ret instruction.
729 Variable::killUnmaterialized();
730
731 jit->builder->CreateRet(V(v));
732 }
733
createBr(BasicBlock * dest)734 void Nucleus::createBr(BasicBlock *dest)
735 {
736 RR_DEBUG_INFO_UPDATE_LOC();
737 Variable::materializeAll();
738
739 jit->builder->CreateBr(B(dest));
740 }
741
createCondBr(Value * cond,BasicBlock * ifTrue,BasicBlock * ifFalse)742 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
743 {
744 RR_DEBUG_INFO_UPDATE_LOC();
745 Variable::materializeAll();
746 jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
747 }
748
createAdd(Value * lhs,Value * rhs)749 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
750 {
751 RR_DEBUG_INFO_UPDATE_LOC();
752 return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
753 }
754
createSub(Value * lhs,Value * rhs)755 Value *Nucleus::createSub(Value *lhs, Value *rhs)
756 {
757 RR_DEBUG_INFO_UPDATE_LOC();
758 return V(jit->builder->CreateSub(V(lhs), V(rhs)));
759 }
760
createMul(Value * lhs,Value * rhs)761 Value *Nucleus::createMul(Value *lhs, Value *rhs)
762 {
763 RR_DEBUG_INFO_UPDATE_LOC();
764 return V(jit->builder->CreateMul(V(lhs), V(rhs)));
765 }
766
createUDiv(Value * lhs,Value * rhs)767 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
768 {
769 RR_DEBUG_INFO_UPDATE_LOC();
770 return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
771 }
772
createSDiv(Value * lhs,Value * rhs)773 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
774 {
775 RR_DEBUG_INFO_UPDATE_LOC();
776 return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
777 }
778
createFAdd(Value * lhs,Value * rhs)779 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
780 {
781 RR_DEBUG_INFO_UPDATE_LOC();
782 return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
783 }
784
createFSub(Value * lhs,Value * rhs)785 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
786 {
787 RR_DEBUG_INFO_UPDATE_LOC();
788 return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
789 }
790
createFMul(Value * lhs,Value * rhs)791 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
792 {
793 RR_DEBUG_INFO_UPDATE_LOC();
794 return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
795 }
796
createFDiv(Value * lhs,Value * rhs)797 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
798 {
799 RR_DEBUG_INFO_UPDATE_LOC();
800 return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
801 }
802
createURem(Value * lhs,Value * rhs)803 Value *Nucleus::createURem(Value *lhs, Value *rhs)
804 {
805 RR_DEBUG_INFO_UPDATE_LOC();
806 return V(jit->builder->CreateURem(V(lhs), V(rhs)));
807 }
808
createSRem(Value * lhs,Value * rhs)809 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
810 {
811 RR_DEBUG_INFO_UPDATE_LOC();
812 return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
813 }
814
createFRem(Value * lhs,Value * rhs)815 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
816 {
817 RR_DEBUG_INFO_UPDATE_LOC();
818 return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
819 }
820
operator %(RValue<Float4> lhs,RValue<Float4> rhs)821 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
822 {
823 return RValue<Float4>(Nucleus::createFRem(lhs.value(), rhs.value()));
824 }
825
createShl(Value * lhs,Value * rhs)826 Value *Nucleus::createShl(Value *lhs, Value *rhs)
827 {
828 RR_DEBUG_INFO_UPDATE_LOC();
829 return V(jit->builder->CreateShl(V(lhs), V(rhs)));
830 }
831
createLShr(Value * lhs,Value * rhs)832 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
833 {
834 RR_DEBUG_INFO_UPDATE_LOC();
835 return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
836 }
837
createAShr(Value * lhs,Value * rhs)838 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
839 {
840 RR_DEBUG_INFO_UPDATE_LOC();
841 return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
842 }
843
createAnd(Value * lhs,Value * rhs)844 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
845 {
846 RR_DEBUG_INFO_UPDATE_LOC();
847 return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
848 }
849
createOr(Value * lhs,Value * rhs)850 Value *Nucleus::createOr(Value *lhs, Value *rhs)
851 {
852 RR_DEBUG_INFO_UPDATE_LOC();
853 return V(jit->builder->CreateOr(V(lhs), V(rhs)));
854 }
855
createXor(Value * lhs,Value * rhs)856 Value *Nucleus::createXor(Value *lhs, Value *rhs)
857 {
858 RR_DEBUG_INFO_UPDATE_LOC();
859 return V(jit->builder->CreateXor(V(lhs), V(rhs)));
860 }
861
createNeg(Value * v)862 Value *Nucleus::createNeg(Value *v)
863 {
864 RR_DEBUG_INFO_UPDATE_LOC();
865 return V(jit->builder->CreateNeg(V(v)));
866 }
867
createFNeg(Value * v)868 Value *Nucleus::createFNeg(Value *v)
869 {
870 RR_DEBUG_INFO_UPDATE_LOC();
871 return V(jit->builder->CreateFNeg(V(v)));
872 }
873
createNot(Value * v)874 Value *Nucleus::createNot(Value *v)
875 {
876 RR_DEBUG_INFO_UPDATE_LOC();
877 return V(jit->builder->CreateNot(V(v)));
878 }
879
createLoad(Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)880 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
881 {
882 RR_DEBUG_INFO_UPDATE_LOC();
883 switch(asInternalType(type))
884 {
885 case Type_v2i32:
886 case Type_v4i16:
887 case Type_v8i8:
888 case Type_v2f32:
889 return createBitCast(
890 createInsertElement(
891 V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false))),
892 createLoad(createBitCast(ptr, Pointer<Long>::type()), Long::type(), isVolatile, alignment, atomic, memoryOrder),
893 0),
894 type);
895 case Type_v2i16:
896 case Type_v4i8:
897 if(alignment != 0) // Not a local variable (all vectors are 128-bit).
898 {
899 Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false)));
900 Value *i = createLoad(createBitCast(ptr, Pointer<Int>::type()), Int::type(), isVolatile, alignment, atomic, memoryOrder);
901 i = createZExt(i, Long::type());
902 Value *v = createInsertElement(u, i, 0);
903 return createBitCast(v, type);
904 }
905 // Fallthrough to non-emulated case.
906 case Type_LLVM:
907 {
908 auto elTy = T(type);
909 ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
910
911 if(!atomic)
912 {
913 return V(jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile));
914 }
915 else if(elTy->isIntegerTy() || elTy->isPointerTy())
916 {
917 // Integers and pointers can be atomically loaded by setting
918 // the ordering constraint on the load instruction.
919 auto load = jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile);
920 load->setAtomic(atomicOrdering(atomic, memoryOrder));
921 return V(load);
922 }
923 else if(elTy->isFloatTy() || elTy->isDoubleTy())
924 {
925 // LLVM claims to support atomic loads of float types as
926 // above, but certain backends cannot deal with this.
927 // Load as an integer and bitcast. See b/136037244.
928 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
929 auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
930 auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
931 auto load = jit->builder->CreateAlignedLoad(elAsIntTy, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
932 load->setAtomic(atomicOrdering(atomic, memoryOrder));
933 auto loadCast = jit->builder->CreateBitCast(load, elTy);
934 return V(loadCast);
935 }
936 else
937 {
938 // More exotic types require falling back to the extern:
939 // void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
940 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
941 auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
942 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
943 auto i8PtrTy = i8Ty->getPointerTo();
944 auto voidTy = llvm::Type::getVoidTy(*jit->context);
945 auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
946 auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
947 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
948 auto out = allocateStackVariable(type);
949 jit->builder->CreateCall(func, {
950 llvm::ConstantInt::get(sizetTy, size),
951 jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
952 jit->builder->CreatePointerCast(V(out), i8PtrTy),
953 llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
954 });
955 return V(jit->builder->CreateLoad(T(type), V(out)));
956 }
957 }
958 default:
959 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
960 return nullptr;
961 }
962 }
963
createStore(Value * value,Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)964 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
965 {
966 RR_DEBUG_INFO_UPDATE_LOC();
967 switch(asInternalType(type))
968 {
969 case Type_v2i32:
970 case Type_v4i16:
971 case Type_v8i8:
972 case Type_v2f32:
973 createStore(
974 createExtractElement(
975 createBitCast(value, T(llvm::VectorType::get(T(Long::type()), 2, false))), Long::type(), 0),
976 createBitCast(ptr, Pointer<Long>::type()),
977 Long::type(), isVolatile, alignment, atomic, memoryOrder);
978 return value;
979 case Type_v2i16:
980 case Type_v4i8:
981 if(alignment != 0) // Not a local variable (all vectors are 128-bit).
982 {
983 createStore(
984 createExtractElement(createBitCast(value, Int4::type()), Int::type(), 0),
985 createBitCast(ptr, Pointer<Int>::type()),
986 Int::type(), isVolatile, alignment, atomic, memoryOrder);
987 return value;
988 }
989 // Fallthrough to non-emulated case.
990 case Type_LLVM:
991 {
992 auto elTy = T(type);
993 ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
994
995 if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
996 {
997 // Mark all memory writes as initialized by calling __msan_unpoison
998 // void __msan_unpoison(const volatile void *a, size_t size)
999 auto voidTy = llvm::Type::getVoidTy(*jit->context);
1000 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1001 auto voidPtrTy = i8Ty->getPointerTo();
1002 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1003 auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1004 auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1005 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1006
1007 jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
1008 llvm::ConstantInt::get(sizetTy, size) });
1009 }
1010
1011 if(!atomic)
1012 {
1013 jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1014 }
1015 else if(elTy->isIntegerTy() || elTy->isPointerTy())
1016 {
1017 // Integers and pointers can be atomically stored by setting
1018 // the ordering constraint on the store instruction.
1019 auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1020 store->setAtomic(atomicOrdering(atomic, memoryOrder));
1021 }
1022 else if(elTy->isFloatTy() || elTy->isDoubleTy())
1023 {
1024 // LLVM claims to support atomic stores of float types as
1025 // above, but certain backends cannot deal with this.
1026 // Store as an bitcast integer. See b/136037244.
1027 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1028 auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
1029 auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
1030 auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
1031 auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
1032 store->setAtomic(atomicOrdering(atomic, memoryOrder));
1033 }
1034 else
1035 {
1036 // More exotic types require falling back to the extern:
1037 // void __atomic_store(size_t size, void *ptr, void *val, int ordering)
1038 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1039 auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
1040 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1041 auto i8PtrTy = i8Ty->getPointerTo();
1042 auto voidTy = llvm::Type::getVoidTy(*jit->context);
1043 auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1044 auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
1045 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1046 auto copy = allocateStackVariable(type);
1047 jit->builder->CreateStore(V(value), V(copy));
1048 jit->builder->CreateCall(func, {
1049 llvm::ConstantInt::get(sizetTy, size),
1050 jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1051 jit->builder->CreatePointerCast(V(copy), i8PtrTy),
1052 llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1053 });
1054 }
1055
1056 return value;
1057 }
1058 default:
1059 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1060 return nullptr;
1061 }
1062 }
1063
createMaskedLoad(Value * ptr,Type * elTy,Value * mask,unsigned int alignment,bool zeroMaskedLanes)1064 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1065 {
1066 RR_DEBUG_INFO_UPDATE_LOC();
1067
1068 ASSERT(V(ptr)->getType()->isPointerTy());
1069 ASSERT(V(mask)->getType()->isVectorTy());
1070
1071 auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1072 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1073 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1074 auto elVecTy = llvm::VectorType::get(T(elTy), numEls, false);
1075 auto elVecPtrTy = elVecTy->getPointerTo();
1076 auto i8Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1077 auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1078 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1079 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1080 return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1081 }
1082
createMaskedStore(Value * ptr,Value * val,Value * mask,unsigned int alignment)1083 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
1084 {
1085 RR_DEBUG_INFO_UPDATE_LOC();
1086
1087 ASSERT(V(ptr)->getType()->isPointerTy());
1088 ASSERT(V(val)->getType()->isVectorTy());
1089 ASSERT(V(mask)->getType()->isVectorTy());
1090
1091 auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1092 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1093 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1094 auto elVecTy = V(val)->getType();
1095 auto elVecPtrTy = elVecTy->getPointerTo();
1096 auto i1Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1097 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1098 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
1099 jit->builder->CreateCall(func, { V(val), V(ptr), align, i1Mask });
1100
1101 if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
1102 {
1103 // Mark memory writes as initialized by calling __msan_unpoison
1104 // void __msan_unpoison(const volatile void *a, size_t size)
1105 auto voidTy = llvm::Type::getVoidTy(*jit->context);
1106 auto voidPtrTy = voidTy->getPointerTo();
1107 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1108 auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1109 auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1110 auto size = jit->module->getDataLayout().getTypeStoreSize(llvm::cast<llvm::VectorType>(elVecTy)->getElementType());
1111
1112 for(unsigned i = 0; i < numEls; i++)
1113 {
1114 // Check mask for this element
1115 auto idx = llvm::ConstantInt::get(i32Ty, i);
1116 auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1117 auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1118 jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1119 jit->builder->SetInsertPoint(thenBlock);
1120
1121 // Insert __msan_unpoison call in conditional block
1122 auto elPtr = jit->builder->CreateGEP(elVecTy, V(ptr), idx);
1123 jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(elPtr, voidPtrTy),
1124 llvm::ConstantInt::get(sizetTy, size) });
1125
1126 jit->builder->CreateBr(mergeBlock);
1127 jit->builder->SetInsertPoint(mergeBlock);
1128 }
1129 }
1130 }
1131
createGather(llvm::Value * base,llvm::Type * elTy,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment,bool zeroMaskedLanes)1132 static llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1133 {
1134 ASSERT(base->getType()->isPointerTy());
1135 ASSERT(offsets->getType()->isVectorTy());
1136 ASSERT(mask->getType()->isVectorTy());
1137
1138 auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1139 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1140 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1141 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1142 auto i8PtrTy = i8Ty->getPointerTo();
1143 auto elPtrTy = elTy->getPointerTo();
1144 auto elVecTy = llvm::VectorType::get(elTy, numEls, false);
1145 auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1146 auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1147 auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1148 auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1149 auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1150 auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1151
1152 if(!__has_feature(memory_sanitizer))
1153 {
1154 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1155 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
1156 return jit->builder->CreateCall(func, { elPtrs, align, i1Mask, passthrough });
1157 }
1158 else // __has_feature(memory_sanitizer)
1159 {
1160 // MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_gather
1161 // Work around it by emulating gather with element-wise loads.
1162 // TODO(b/172238865): Remove when supported by MemorySanitizer.
1163
1164 Value *result = Nucleus::allocateStackVariable(T(elVecTy));
1165 Nucleus::createStore(V(passthrough), result, T(elVecTy));
1166
1167 for(unsigned i = 0; i < numEls; i++)
1168 {
1169 // Check mask for this element
1170 Value *elementMask = Nucleus::createExtractElement(V(i1Mask), T(i1Ty), i);
1171
1172 If(RValue<Bool>(elementMask))
1173 {
1174 Value *elPtr = Nucleus::createExtractElement(V(elPtrs), T(elPtrTy), i);
1175 Value *el = Nucleus::createLoad(elPtr, T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1176
1177 Value *v = Nucleus::createLoad(result, T(elVecTy));
1178 v = Nucleus::createInsertElement(v, el, i);
1179 Nucleus::createStore(v, result, T(elVecTy));
1180 }
1181 }
1182
1183 return V(Nucleus::createLoad(result, T(elVecTy)));
1184 }
1185 }
1186
Gather(RValue<Pointer<Float>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)1187 RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1188 {
1189 return As<Float4>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1190 }
1191
Gather(RValue<Pointer<Int>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)1192 RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1193 {
1194 return As<Int4>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1195 }
1196
createScatter(llvm::Value * base,llvm::Value * val,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment)1197 static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
1198 {
1199 ASSERT(base->getType()->isPointerTy());
1200 ASSERT(val->getType()->isVectorTy());
1201 ASSERT(offsets->getType()->isVectorTy());
1202 ASSERT(mask->getType()->isVectorTy());
1203
1204 auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1205 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1206 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1207 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1208 auto i8PtrTy = i8Ty->getPointerTo();
1209 auto elVecTy = val->getType();
1210 auto elTy = llvm::cast<llvm::VectorType>(elVecTy)->getElementType();
1211 auto elPtrTy = elTy->getPointerTo();
1212 auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1213
1214 auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1215 auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1216 auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1217 auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1218
1219 if(!__has_feature(memory_sanitizer))
1220 {
1221 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1222 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
1223 jit->builder->CreateCall(func, { val, elPtrs, align, i1Mask });
1224 }
1225 else // __has_feature(memory_sanitizer)
1226 {
1227 // MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_scatter
1228 // Work around it by emulating scatter with element-wise stores.
1229 // TODO(b/172238865): Remove when supported by MemorySanitizer.
1230
1231 for(unsigned i = 0; i < numEls; i++)
1232 {
1233 // Check mask for this element
1234 auto idx = llvm::ConstantInt::get(i32Ty, i);
1235 auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1236 auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1237 jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1238 jit->builder->SetInsertPoint(thenBlock);
1239
1240 auto el = jit->builder->CreateExtractElement(val, idx);
1241 auto elPtr = jit->builder->CreateExtractElement(elPtrs, idx);
1242 Nucleus::createStore(V(el), V(elPtr), T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1243
1244 jit->builder->CreateBr(mergeBlock);
1245 jit->builder->SetInsertPoint(mergeBlock);
1246 }
1247 }
1248 }
1249
Scatter(RValue<Pointer<Float>> base,RValue<Float4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)1250 void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1251 {
1252 return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1253 }
1254
Scatter(RValue<Pointer<Int>> base,RValue<Int4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)1255 void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1256 {
1257 return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1258 }
1259
createFence(std::memory_order memoryOrder)1260 void Nucleus::createFence(std::memory_order memoryOrder)
1261 {
1262 RR_DEBUG_INFO_UPDATE_LOC();
1263 jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1264 }
1265
createGEP(Value * ptr,Type * type,Value * index,bool unsignedIndex)1266 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1267 {
1268 RR_DEBUG_INFO_UPDATE_LOC();
1269 ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
1270 if(sizeof(void *) == 8)
1271 {
1272 // LLVM manual: "When indexing into an array, pointer or vector,
1273 // integers of any width are allowed, and they are not required to
1274 // be constant. These integers are treated as signed values where
1275 // relevant."
1276 //
1277 // Thus if we want indexes to be treated as unsigned we have to
1278 // zero-extend them ourselves.
1279 //
1280 // Note that this is not because we want to address anywhere near
1281 // 4 GB of data. Instead this is important for performance because
1282 // x86 supports automatic zero-extending of 32-bit registers to
1283 // 64-bit. Thus when indexing into an array using a uint32 is
1284 // actually faster than an int32.
1285 index = unsignedIndex ? createZExt(index, Long::type()) : createSExt(index, Long::type());
1286 }
1287
1288 // For non-emulated types we can rely on LLVM's GEP to calculate the
1289 // effective address correctly.
1290 if(asInternalType(type) == Type_LLVM)
1291 {
1292 return V(jit->builder->CreateGEP(T(type), V(ptr), V(index)));
1293 }
1294
1295 // For emulated types we have to multiply the index by the intended
1296 // type size ourselves to obain the byte offset.
1297 index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));
1298
1299 // Cast to a byte pointer, apply the byte offset, and cast back to the
1300 // original pointer type.
1301 return createBitCast(
1302 V(jit->builder->CreateGEP(T(Byte::type()), V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::type()), 0)))), V(index))),
1303 T(llvm::PointerType::get(T(type), 0)));
1304 }
1305
createAtomicAdd(Value * ptr,Value * value,std::memory_order memoryOrder)1306 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1307 {
1308 RR_DEBUG_INFO_UPDATE_LOC();
1309 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value),
1310 #if LLVM_VERSION_MAJOR >= 11
1311 llvm::MaybeAlign(),
1312 #endif
1313 atomicOrdering(true, memoryOrder)));
1314 }
1315
createAtomicSub(Value * ptr,Value * value,std::memory_order memoryOrder)1316 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1317 {
1318 RR_DEBUG_INFO_UPDATE_LOC();
1319 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value),
1320 #if LLVM_VERSION_MAJOR >= 11
1321 llvm::MaybeAlign(),
1322 #endif
1323 atomicOrdering(true, memoryOrder)));
1324 }
1325
createAtomicAnd(Value * ptr,Value * value,std::memory_order memoryOrder)1326 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1327 {
1328 RR_DEBUG_INFO_UPDATE_LOC();
1329 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value),
1330 #if LLVM_VERSION_MAJOR >= 11
1331 llvm::MaybeAlign(),
1332 #endif
1333 atomicOrdering(true, memoryOrder)));
1334 }
1335
createAtomicOr(Value * ptr,Value * value,std::memory_order memoryOrder)1336 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1337 {
1338 RR_DEBUG_INFO_UPDATE_LOC();
1339 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value),
1340 #if LLVM_VERSION_MAJOR >= 11
1341 llvm::MaybeAlign(),
1342 #endif
1343 atomicOrdering(true, memoryOrder)));
1344 }
1345
createAtomicXor(Value * ptr,Value * value,std::memory_order memoryOrder)1346 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1347 {
1348 RR_DEBUG_INFO_UPDATE_LOC();
1349 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value),
1350 #if LLVM_VERSION_MAJOR >= 11
1351 llvm::MaybeAlign(),
1352 #endif
1353 atomicOrdering(true, memoryOrder)));
1354 }
1355
createAtomicMin(Value * ptr,Value * value,std::memory_order memoryOrder)1356 Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1357 {
1358 RR_DEBUG_INFO_UPDATE_LOC();
1359 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value),
1360 #if LLVM_VERSION_MAJOR >= 11
1361 llvm::MaybeAlign(),
1362 #endif
1363 atomicOrdering(true, memoryOrder)));
1364 }
1365
createAtomicMax(Value * ptr,Value * value,std::memory_order memoryOrder)1366 Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1367 {
1368 RR_DEBUG_INFO_UPDATE_LOC();
1369 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value),
1370 #if LLVM_VERSION_MAJOR >= 11
1371 llvm::MaybeAlign(),
1372 #endif
1373 atomicOrdering(true, memoryOrder)));
1374 }
1375
createAtomicUMin(Value * ptr,Value * value,std::memory_order memoryOrder)1376 Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1377 {
1378 RR_DEBUG_INFO_UPDATE_LOC();
1379 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value),
1380 #if LLVM_VERSION_MAJOR >= 11
1381 llvm::MaybeAlign(),
1382 #endif
1383 atomicOrdering(true, memoryOrder)));
1384 }
1385
createAtomicUMax(Value * ptr,Value * value,std::memory_order memoryOrder)1386 Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1387 {
1388 RR_DEBUG_INFO_UPDATE_LOC();
1389 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value),
1390 #if LLVM_VERSION_MAJOR >= 11
1391 llvm::MaybeAlign(),
1392 #endif
1393 atomicOrdering(true, memoryOrder)));
1394 }
1395
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1396 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1397 {
1398 RR_DEBUG_INFO_UPDATE_LOC();
1399 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value),
1400 #if LLVM_VERSION_MAJOR >= 11
1401 llvm::MaybeAlign(),
1402 #endif
1403 atomicOrdering(true, memoryOrder)));
1404 }
1405
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1406 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1407 {
1408 RR_DEBUG_INFO_UPDATE_LOC();
1409 // Note: AtomicCmpXchgInstruction returns a 2-member struct containing {result, success-flag}, not the result directly.
1410 return V(jit->builder->CreateExtractValue(
1411 jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value),
1412 #if LLVM_VERSION_MAJOR >= 11
1413 llvm::MaybeAlign(),
1414 #endif
1415 atomicOrdering(true, memoryOrderEqual),
1416 atomicOrdering(true, memoryOrderUnequal)),
1417 llvm::ArrayRef<unsigned>(0u)));
1418 }
1419
createTrunc(Value * v,Type * destType)1420 Value *Nucleus::createTrunc(Value *v, Type *destType)
1421 {
1422 RR_DEBUG_INFO_UPDATE_LOC();
1423 return V(jit->builder->CreateTrunc(V(v), T(destType)));
1424 }
1425
createZExt(Value * v,Type * destType)1426 Value *Nucleus::createZExt(Value *v, Type *destType)
1427 {
1428 RR_DEBUG_INFO_UPDATE_LOC();
1429 return V(jit->builder->CreateZExt(V(v), T(destType)));
1430 }
1431
createSExt(Value * v,Type * destType)1432 Value *Nucleus::createSExt(Value *v, Type *destType)
1433 {
1434 RR_DEBUG_INFO_UPDATE_LOC();
1435 return V(jit->builder->CreateSExt(V(v), T(destType)));
1436 }
1437
createFPToUI(Value * v,Type * destType)1438 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1439 {
1440 RR_DEBUG_INFO_UPDATE_LOC();
1441 return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1442 }
1443
1444 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1445 {
1446 RR_DEBUG_INFO_UPDATE_LOC();
1447 return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1448 }
1449
1450 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1451 {
1452 RR_DEBUG_INFO_UPDATE_LOC();
1453 return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1454 }
1455
1456 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1457 {
1458 RR_DEBUG_INFO_UPDATE_LOC();
1459 return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1460 }
1461
1462 Value *Nucleus::createFPExt(Value *v, Type *destType)
1463 {
1464 RR_DEBUG_INFO_UPDATE_LOC();
1465 return V(jit->builder->CreateFPExt(V(v), T(destType)));
1466 }
1467
1468 Value *Nucleus::createBitCast(Value *v, Type *destType)
1469 {
1470 RR_DEBUG_INFO_UPDATE_LOC();
1471 // Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1472 // support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1473 // reading back as the destination type.
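	// For example, a bitcast from a scalar Int to an emulated short vector stores the Int to a stack
	// slot and reloads it as the vector type; the optimizer is expected to remove the memory round trip.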
1474 if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1475 {
1476 Value *readAddress = allocateStackVariable(destType);
1477 Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1478 createStore(v, writeAddress, T(V(v)->getType()));
1479 return createLoad(readAddress, destType);
1480 }
1481 else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1482 {
1483 Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1484 createStore(v, writeAddress, T(V(v)->getType()));
1485 Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1486 return createLoad(readAddress, destType);
1487 }
1488
1489 return V(jit->builder->CreateBitCast(V(v), T(destType)));
1490 }
1491
1492 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1493 {
1494 RR_DEBUG_INFO_UPDATE_LOC();
1495 return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1496 }
1497
1498 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1499 {
1500 RR_DEBUG_INFO_UPDATE_LOC();
1501 return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1502 }
1503
1504 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1505 {
1506 RR_DEBUG_INFO_UPDATE_LOC();
1507 return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1508 }
1509
1510 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1511 {
1512 RR_DEBUG_INFO_UPDATE_LOC();
1513 return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1514 }
1515
1516 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1517 {
1518 RR_DEBUG_INFO_UPDATE_LOC();
1519 return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1520 }
1521
1522 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1523 {
1524 RR_DEBUG_INFO_UPDATE_LOC();
1525 return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1526 }
1527
1528 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1529 {
1530 RR_DEBUG_INFO_UPDATE_LOC();
1531 return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1532 }
1533
1534 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1535 {
1536 RR_DEBUG_INFO_UPDATE_LOC();
1537 return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1538 }
1539
1540 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1541 {
1542 RR_DEBUG_INFO_UPDATE_LOC();
1543 return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1544 }
1545
1546 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1547 {
1548 RR_DEBUG_INFO_UPDATE_LOC();
1549 return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1550 }
1551
1552 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1553 {
1554 RR_DEBUG_INFO_UPDATE_LOC();
1555 return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1556 }
1557
1558 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1559 {
1560 RR_DEBUG_INFO_UPDATE_LOC();
1561 return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1562 }
1563
1564 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1565 {
1566 RR_DEBUG_INFO_UPDATE_LOC();
1567 return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1568 }
1569
1570 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1571 {
1572 RR_DEBUG_INFO_UPDATE_LOC();
1573 return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1574 }
1575
1576 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1577 {
1578 RR_DEBUG_INFO_UPDATE_LOC();
1579 return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1580 }
1581
1582 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1583 {
1584 RR_DEBUG_INFO_UPDATE_LOC();
1585 return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1586 }
1587
1588 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1589 {
1590 RR_DEBUG_INFO_UPDATE_LOC();
1591 return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1592 }
1593
1594 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1595 {
1596 RR_DEBUG_INFO_UPDATE_LOC();
1597 return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1598 }
1599
1600 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1601 {
1602 RR_DEBUG_INFO_UPDATE_LOC();
1603 return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1604 }
1605
1606 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1607 {
1608 RR_DEBUG_INFO_UPDATE_LOC();
1609 return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1610 }
1611
1612 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1613 {
1614 RR_DEBUG_INFO_UPDATE_LOC();
1615 return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1616 }
1617
1618 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1619 {
1620 RR_DEBUG_INFO_UPDATE_LOC();
1621 return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1622 }
1623
1624 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1625 {
1626 RR_DEBUG_INFO_UPDATE_LOC();
1627 return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1628 }
1629
1630 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1631 {
1632 RR_DEBUG_INFO_UPDATE_LOC();
1633 return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1634 }
1635
1636 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1637 {
1638 RR_DEBUG_INFO_UPDATE_LOC();
1639 ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
1640 return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1641 }
1642
1643 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1644 {
1645 RR_DEBUG_INFO_UPDATE_LOC();
1646 return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1647 }
1648
1649 Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
1650 {
1651 RR_DEBUG_INFO_UPDATE_LOC();
1652
1653 int size = llvm::cast<llvm::FixedVectorType>(V(v1)->getType())->getNumElements();
1654 llvm::SmallVector<int, 16> mask;
1655 for(int i = 0; i < size; i++)
1656 {
1657 mask.push_back(select[i]);
1658 }
1659
1660 return V(lowerShuffleVector(V(v1), V(v2), mask));
1661 }
1662
1663 Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1664 {
1665 RR_DEBUG_INFO_UPDATE_LOC();
1666 return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1667 }
1668
1669 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1670 {
1671 RR_DEBUG_INFO_UPDATE_LOC();
1672 return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1673 }
1674
1675 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1676 {
1677 RR_DEBUG_INFO_UPDATE_LOC();
1678 llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1679 sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), label, true), B(branch));
1680 }
1681
1682 void Nucleus::createUnreachable()
1683 {
1684 RR_DEBUG_INFO_UPDATE_LOC();
1685 jit->builder->CreateUnreachable();
1686 }
1687
1688 Type *Nucleus::getType(Value *value)
1689 {
1690 return T(V(value)->getType());
1691 }
1692
1693 Type *Nucleus::getContainedType(Type *vectorType)
1694 {
1695 return T(T(vectorType)->getContainedType(0));
1696 }
1697
1698 Type *Nucleus::getPointerType(Type *ElementType)
1699 {
1700 return T(llvm::PointerType::get(T(ElementType), 0));
1701 }
1702
1703 static llvm::Type *getNaturalIntType()
1704 {
1705 return llvm::Type::getIntNTy(*jit->context, sizeof(int) * 8);
1706 }
1707
1708 Type *Nucleus::getPrintfStorageType(Type *valueType)
1709 {
1710 llvm::Type *valueTy = T(valueType);
1711 if(valueTy->isIntegerTy())
1712 {
1713 return T(getNaturalIntType());
1714 }
1715 if(valueTy->isFloatTy())
1716 {
1717 return T(llvm::Type::getDoubleTy(*jit->context));
1718 }
1719
1720 UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
1721 return {};
1722 }
1723
1724 Value *Nucleus::createNullValue(Type *Ty)
1725 {
1726 RR_DEBUG_INFO_UPDATE_LOC();
1727 return V(llvm::Constant::getNullValue(T(Ty)));
1728 }
1729
1730 Value *Nucleus::createConstantLong(int64_t i)
1731 {
1732 RR_DEBUG_INFO_UPDATE_LOC();
1733 return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), i, true));
1734 }
1735
1736 Value *Nucleus::createConstantInt(int i)
1737 {
1738 RR_DEBUG_INFO_UPDATE_LOC();
1739 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, true));
1740 }
1741
1742 Value *Nucleus::createConstantInt(unsigned int i)
1743 {
1744 RR_DEBUG_INFO_UPDATE_LOC();
1745 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, false));
1746 }
1747
1748 Value *Nucleus::createConstantBool(bool b)
1749 {
1750 RR_DEBUG_INFO_UPDATE_LOC();
1751 return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*jit->context), b));
1752 }
1753
1754 Value *Nucleus::createConstantByte(signed char i)
1755 {
1756 RR_DEBUG_INFO_UPDATE_LOC();
1757 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, true));
1758 }
1759
1760 Value *Nucleus::createConstantByte(unsigned char i)
1761 {
1762 RR_DEBUG_INFO_UPDATE_LOC();
1763 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, false));
1764 }
1765
1766 Value *Nucleus::createConstantShort(short i)
1767 {
1768 RR_DEBUG_INFO_UPDATE_LOC();
1769 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, true));
1770 }
1771
1772 Value *Nucleus::createConstantShort(unsigned short i)
1773 {
1774 RR_DEBUG_INFO_UPDATE_LOC();
1775 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, false));
1776 }
1777
1778 Value *Nucleus::createConstantFloat(float x)
1779 {
1780 RR_DEBUG_INFO_UPDATE_LOC();
1781 return V(llvm::ConstantFP::get(T(Float::type()), x));
1782 }
1783
1784 Value *Nucleus::createNullPointer(Type *Ty)
1785 {
1786 RR_DEBUG_INFO_UPDATE_LOC();
1787 return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1788 }
1789
1790 Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
1791 {
1792 RR_DEBUG_INFO_UPDATE_LOC();
1793 ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1794 const int numConstants = elementCount(type); // Number of provided constants for the (emulated) type.
1795 const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements(); // Number of elements of the underlying vector type.
1796 ASSERT(numElements <= 16 && numConstants <= numElements);
1797 llvm::Constant *constantVector[16];
1798
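	// The provided constants are repeated (i % numConstants) to fill out the wider underlying vector
	// used to emulate narrow vector types.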
1799 for(int i = 0; i < numElements; i++)
1800 {
1801 constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
1802 }
1803
1804 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1805 }
1806
1807 Value *Nucleus::createConstantVector(const double *constants, Type *type)
1808 {
1809 RR_DEBUG_INFO_UPDATE_LOC();
1810 ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1811 const int numConstants = elementCount(type); // Number of provided constants for the (emulated) type.
1812 const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements(); // Number of elements of the underlying vector type.
1813 ASSERT(numElements <= 8 && numConstants <= numElements);
1814 llvm::Constant *constantVector[8];
1815
1816 for(int i = 0; i < numElements; i++)
1817 {
1818 constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
1819 }
1820
1821 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1822 }
1823
1824 Value *Nucleus::createConstantString(const char *v)
1825 {
1826 // NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
1827 auto ptr = jit->builder->CreateGlobalStringPtr(v);
1828 return V(ptr);
1829 }
1830
1831 void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
1832 {
1833 // The LLVM backend does not produce optimizer reports.
1834 (void)callback;
1835 }
1836
1837 Type *Void::type()
1838 {
1839 return T(llvm::Type::getVoidTy(*jit->context));
1840 }
1841
1842 Type *Bool::type()
1843 {
1844 return T(llvm::Type::getInt1Ty(*jit->context));
1845 }
1846
1847 Type *Byte::type()
1848 {
1849 return T(llvm::Type::getInt8Ty(*jit->context));
1850 }
1851
1852 Type *SByte::type()
1853 {
1854 return T(llvm::Type::getInt8Ty(*jit->context));
1855 }
1856
1857 Type *Short::type()
1858 {
1859 return T(llvm::Type::getInt16Ty(*jit->context));
1860 }
1861
1862 Type *UShort::type()
1863 {
1864 return T(llvm::Type::getInt16Ty(*jit->context));
1865 }
1866
1867 Type *Byte4::type()
1868 {
1869 return T(Type_v4i8);
1870 }
1871
1872 Type *SByte4::type()
1873 {
1874 return T(Type_v4i8);
1875 }
1876
AddSat(RValue<Byte8> x,RValue<Byte8> y)1877 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1878 {
1879 RR_DEBUG_INFO_UPDATE_LOC();
1880 #if defined(__i386__) || defined(__x86_64__)
1881 return x86::paddusb(x, y);
1882 #else
1883 return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
1884 #endif
1885 }
1886
SubSat(RValue<Byte8> x,RValue<Byte8> y)1887 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1888 {
1889 RR_DEBUG_INFO_UPDATE_LOC();
1890 #if defined(__i386__) || defined(__x86_64__)
1891 return x86::psubusb(x, y);
1892 #else
1893 return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
1894 #endif
1895 }
1896
SignMask(RValue<Byte8> x)1897 RValue<Int> SignMask(RValue<Byte8> x)
1898 {
1899 RR_DEBUG_INFO_UPDATE_LOC();
1900 #if defined(__i386__) || defined(__x86_64__)
1901 return x86::pmovmskb(x);
1902 #else
1903 return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1904 #endif
1905 }
1906
1907 // RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1908 // {
1909 //#if defined(__i386__) || defined(__x86_64__)
1910 // return x86::pcmpgtb(x, y); // FIXME: Signedness
1911 //#else
1912 // return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1913 //#endif
1914 // }
1915
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)1916 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1917 {
1918 RR_DEBUG_INFO_UPDATE_LOC();
1919 #if defined(__i386__) || defined(__x86_64__)
1920 return x86::pcmpeqb(x, y);
1921 #else
1922 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1923 #endif
1924 }
1925
type()1926 Type *Byte8::type()
1927 {
1928 return T(Type_v8i8);
1929 }
1930
AddSat(RValue<SByte8> x,RValue<SByte8> y)1931 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1932 {
1933 RR_DEBUG_INFO_UPDATE_LOC();
1934 #if defined(__i386__) || defined(__x86_64__)
1935 return x86::paddsb(x, y);
1936 #else
1937 return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
1938 #endif
1939 }
1940
SubSat(RValue<SByte8> x,RValue<SByte8> y)1941 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1942 {
1943 RR_DEBUG_INFO_UPDATE_LOC();
1944 #if defined(__i386__) || defined(__x86_64__)
1945 return x86::psubsb(x, y);
1946 #else
1947 return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
1948 #endif
1949 }
1950
SignMask(RValue<SByte8> x)1951 RValue<Int> SignMask(RValue<SByte8> x)
1952 {
1953 RR_DEBUG_INFO_UPDATE_LOC();
1954 #if defined(__i386__) || defined(__x86_64__)
1955 return x86::pmovmskb(As<Byte8>(x));
1956 #else
1957 return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1958 #endif
1959 }
1960
CmpGT(RValue<SByte8> x,RValue<SByte8> y)1961 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1962 {
1963 RR_DEBUG_INFO_UPDATE_LOC();
1964 #if defined(__i386__) || defined(__x86_64__)
1965 return x86::pcmpgtb(x, y);
1966 #else
1967 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1968 #endif
1969 }
1970
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)1971 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1972 {
1973 RR_DEBUG_INFO_UPDATE_LOC();
1974 #if defined(__i386__) || defined(__x86_64__)
1975 return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1976 #else
1977 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1978 #endif
1979 }
1980
type()1981 Type *SByte8::type()
1982 {
1983 return T(Type_v8i8);
1984 }
1985
type()1986 Type *Byte16::type()
1987 {
1988 return T(llvm::VectorType::get(T(Byte::type()), 16, false));
1989 }
1990
type()1991 Type *SByte16::type()
1992 {
1993 return T(llvm::VectorType::get(T(SByte::type()), 16, false));
1994 }
1995
type()1996 Type *Short2::type()
1997 {
1998 return T(Type_v2i16);
1999 }
2000
type()2001 Type *UShort2::type()
2002 {
2003 return T(Type_v2i16);
2004 }
2005
2006 Short4::Short4(RValue<Int4> cast)
2007 {
2008 RR_DEBUG_INFO_UPDATE_LOC();
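	// Truncate each 32-bit lane to 16 bits by selecting its low half (the even-numbered shorts on the
	// little-endian targets this backend generates code for); the Int2 conversion below keeps the low four shorts.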
2009 int select[8] = { 0, 2, 4, 6, 0, 2, 4, 6 };
2010 Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
2011
2012 Value *packed = Nucleus::createShuffleVector(short8, short8, select);
2013 Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value();
2014
2015 storeValue(short4);
2016 }
2017
2018 // Short4::Short4(RValue<Float> cast)
2019 // {
2020 // }
2021
Short4(RValue<Float4> cast)2022 Short4::Short4(RValue<Float4> cast)
2023 {
2024 RR_DEBUG_INFO_UPDATE_LOC();
2025 Int4 v4i32 = Int4(cast);
2026 #if defined(__i386__) || defined(__x86_64__)
2027 v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2028 #else
2029 Value *v = v4i32.loadValue();
2030 v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
2031 #endif
2032
2033 storeValue(As<Short4>(Int2(v4i32)).value());
2034 }
2035
operator <<(RValue<Short4> lhs,unsigned char rhs)2036 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2037 {
2038 RR_DEBUG_INFO_UPDATE_LOC();
2039 #if defined(__i386__) || defined(__x86_64__)
2040 // return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2041
2042 return x86::psllw(lhs, rhs);
2043 #else
2044 return As<Short4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2045 #endif
2046 }
2047
operator >>(RValue<Short4> lhs,unsigned char rhs)2048 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2049 {
2050 RR_DEBUG_INFO_UPDATE_LOC();
2051 #if defined(__i386__) || defined(__x86_64__)
2052 return x86::psraw(lhs, rhs);
2053 #else
2054 return As<Short4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2055 #endif
2056 }
2057
Max(RValue<Short4> x,RValue<Short4> y)2058 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2059 {
2060 RR_DEBUG_INFO_UPDATE_LOC();
2061 #if defined(__i386__) || defined(__x86_64__)
2062 return x86::pmaxsw(x, y);
2063 #else
2064 return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
2065 #endif
2066 }
2067
Min(RValue<Short4> x,RValue<Short4> y)2068 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2069 {
2070 RR_DEBUG_INFO_UPDATE_LOC();
2071 #if defined(__i386__) || defined(__x86_64__)
2072 return x86::pminsw(x, y);
2073 #else
2074 return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
2075 #endif
2076 }
2077
AddSat(RValue<Short4> x,RValue<Short4> y)2078 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2079 {
2080 RR_DEBUG_INFO_UPDATE_LOC();
2081 #if defined(__i386__) || defined(__x86_64__)
2082 return x86::paddsw(x, y);
2083 #else
2084 return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
2085 #endif
2086 }
2087
SubSat(RValue<Short4> x,RValue<Short4> y)2088 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2089 {
2090 RR_DEBUG_INFO_UPDATE_LOC();
2091 #if defined(__i386__) || defined(__x86_64__)
2092 return x86::psubsw(x, y);
2093 #else
2094 return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
2095 #endif
2096 }
2097
MulHigh(RValue<Short4> x,RValue<Short4> y)2098 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2099 {
2100 RR_DEBUG_INFO_UPDATE_LOC();
2101 #if defined(__i386__) || defined(__x86_64__)
2102 return x86::pmulhw(x, y);
2103 #else
2104 return As<Short4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2105 #endif
2106 }
2107
MulAdd(RValue<Short4> x,RValue<Short4> y)2108 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2109 {
2110 RR_DEBUG_INFO_UPDATE_LOC();
2111 #if defined(__i386__) || defined(__x86_64__)
2112 return x86::pmaddwd(x, y);
2113 #else
2114 return As<Int2>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2115 #endif
2116 }
2117
PackSigned(RValue<Short4> x,RValue<Short4> y)2118 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2119 {
2120 RR_DEBUG_INFO_UPDATE_LOC();
2121 #if defined(__i386__) || defined(__x86_64__)
2122 auto result = x86::packsswb(x, y);
2123 #else
2124 auto result = V(lowerPack(V(x.value()), V(y.value()), true));
2125 #endif
2126 return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
2127 }
2128
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2129 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2130 {
2131 RR_DEBUG_INFO_UPDATE_LOC();
2132 #if defined(__i386__) || defined(__x86_64__)
2133 auto result = x86::packuswb(x, y);
2134 #else
2135 auto result = V(lowerPack(V(x.value()), V(y.value()), false));
2136 #endif
2137 return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
2138 }
2139
CmpGT(RValue<Short4> x,RValue<Short4> y)2140 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2141 {
2142 RR_DEBUG_INFO_UPDATE_LOC();
2143 #if defined(__i386__) || defined(__x86_64__)
2144 return x86::pcmpgtw(x, y);
2145 #else
2146 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
2147 #endif
2148 }
2149
CmpEQ(RValue<Short4> x,RValue<Short4> y)2150 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2151 {
2152 RR_DEBUG_INFO_UPDATE_LOC();
2153 #if defined(__i386__) || defined(__x86_64__)
2154 return x86::pcmpeqw(x, y);
2155 #else
2156 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
2157 #endif
2158 }
2159
type()2160 Type *Short4::type()
2161 {
2162 return T(Type_v4i16);
2163 }
2164
UShort4(RValue<Float4> cast,bool saturate)2165 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2166 {
2167 RR_DEBUG_INFO_UPDATE_LOC();
2168 if(saturate)
2169 {
2170 #if defined(__i386__) || defined(__x86_64__)
2171 if(CPUID::supportsSSE4_1())
2172 {
2173 Int4 int4(Min(cast, Float4(0xFFFF))); // packusdw takes care of 0x0000 saturation
2174 *this = As<Short4>(PackUnsigned(int4, int4));
2175 }
2176 else
2177 #endif
2178 {
2179 *this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2180 }
2181 }
2182 else
2183 {
2184 *this = Short4(Int4(cast));
2185 }
2186 }
2187
operator <<(RValue<UShort4> lhs,unsigned char rhs)2188 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2189 {
2190 RR_DEBUG_INFO_UPDATE_LOC();
2191 #if defined(__i386__) || defined(__x86_64__)
2192 // return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2193
2194 return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
2195 #else
2196 return As<UShort4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2197 #endif
2198 }
2199
operator >>(RValue<UShort4> lhs,unsigned char rhs)2200 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2201 {
2202 RR_DEBUG_INFO_UPDATE_LOC();
2203 #if defined(__i386__) || defined(__x86_64__)
2204 // return RValue<Short4>(Nucleus::createLShr(lhs.value(), rhs.value()));
2205
2206 return x86::psrlw(lhs, rhs);
2207 #else
2208 return As<UShort4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2209 #endif
2210 }
2211
2212 RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2213 {
2214 RR_DEBUG_INFO_UPDATE_LOC();
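	// Unsigned max implemented via signed Max: bias both operands by 0x8000 into the signed range,
	// take the signed Max, then undo the bias on the result.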
2215 return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2216 }
2217
Min(RValue<UShort4> x,RValue<UShort4> y)2218 RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2219 {
2220 RR_DEBUG_INFO_UPDATE_LOC();
2221 return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2222 }
2223
AddSat(RValue<UShort4> x,RValue<UShort4> y)2224 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2225 {
2226 RR_DEBUG_INFO_UPDATE_LOC();
2227 #if defined(__i386__) || defined(__x86_64__)
2228 return x86::paddusw(x, y);
2229 #else
2230 return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
2231 #endif
2232 }
2233
SubSat(RValue<UShort4> x,RValue<UShort4> y)2234 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2235 {
2236 RR_DEBUG_INFO_UPDATE_LOC();
2237 #if defined(__i386__) || defined(__x86_64__)
2238 return x86::psubusw(x, y);
2239 #else
2240 return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
2241 #endif
2242 }
2243
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2244 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2245 {
2246 RR_DEBUG_INFO_UPDATE_LOC();
2247 #if defined(__i386__) || defined(__x86_64__)
2248 return x86::pmulhuw(x, y);
2249 #else
2250 return As<UShort4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2251 #endif
2252 }
2253
Average(RValue<UShort4> x,RValue<UShort4> y)2254 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2255 {
2256 RR_DEBUG_INFO_UPDATE_LOC();
2257 #if defined(__i386__) || defined(__x86_64__)
2258 return x86::pavgw(x, y);
2259 #else
2260 return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
2261 #endif
2262 }
2263
type()2264 Type *UShort4::type()
2265 {
2266 return T(Type_v4i16);
2267 }
2268
operator <<(RValue<Short8> lhs,unsigned char rhs)2269 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2270 {
2271 RR_DEBUG_INFO_UPDATE_LOC();
2272 #if defined(__i386__) || defined(__x86_64__)
2273 return x86::psllw(lhs, rhs);
2274 #else
2275 return As<Short8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2276 #endif
2277 }
2278
operator >>(RValue<Short8> lhs,unsigned char rhs)2279 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2280 {
2281 RR_DEBUG_INFO_UPDATE_LOC();
2282 #if defined(__i386__) || defined(__x86_64__)
2283 return x86::psraw(lhs, rhs);
2284 #else
2285 return As<Short8>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2286 #endif
2287 }
2288
MulAdd(RValue<Short8> x,RValue<Short8> y)2289 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2290 {
2291 RR_DEBUG_INFO_UPDATE_LOC();
2292 #if defined(__i386__) || defined(__x86_64__)
2293 return x86::pmaddwd(x, y);
2294 #else
2295 return As<Int4>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2296 #endif
2297 }
2298
MulHigh(RValue<Short8> x,RValue<Short8> y)2299 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2300 {
2301 RR_DEBUG_INFO_UPDATE_LOC();
2302 #if defined(__i386__) || defined(__x86_64__)
2303 return x86::pmulhw(x, y);
2304 #else
2305 return As<Short8>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2306 #endif
2307 }
2308
type()2309 Type *Short8::type()
2310 {
2311 return T(llvm::VectorType::get(T(Short::type()), 8, false));
2312 }
2313
operator <<(RValue<UShort8> lhs,unsigned char rhs)2314 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2315 {
2316 RR_DEBUG_INFO_UPDATE_LOC();
2317 #if defined(__i386__) || defined(__x86_64__)
2318 return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2319 #else
2320 return As<UShort8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2321 #endif
2322 }
2323
operator >>(RValue<UShort8> lhs,unsigned char rhs)2324 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2325 {
2326 RR_DEBUG_INFO_UPDATE_LOC();
2327 #if defined(__i386__) || defined(__x86_64__)
2328 return x86::psrlw(lhs, rhs); // FIXME: Fallback required
2329 #else
2330 return As<UShort8>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2331 #endif
2332 }
2333
MulHigh(RValue<UShort8> x,RValue<UShort8> y)2334 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2335 {
2336 RR_DEBUG_INFO_UPDATE_LOC();
2337 #if defined(__i386__) || defined(__x86_64__)
2338 return x86::pmulhuw(x, y);
2339 #else
2340 return As<UShort8>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2341 #endif
2342 }
2343
type()2344 Type *UShort8::type()
2345 {
2346 return T(llvm::VectorType::get(T(UShort::type()), 8, false));
2347 }
2348
operator ++(Int & val,int)2349 RValue<Int> operator++(Int &val, int) // Post-increment
2350 {
2351 RR_DEBUG_INFO_UPDATE_LOC();
2352 RValue<Int> res = val;
2353
2354 Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2355 val.storeValue(inc);
2356
2357 return res;
2358 }
2359
operator ++(Int & val)2360 const Int &operator++(Int &val) // Pre-increment
2361 {
2362 RR_DEBUG_INFO_UPDATE_LOC();
2363 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2364 val.storeValue(inc);
2365
2366 return val;
2367 }
2368
operator --(Int & val,int)2369 RValue<Int> operator--(Int &val, int) // Post-decrement
2370 {
2371 RR_DEBUG_INFO_UPDATE_LOC();
2372 RValue<Int> res = val;
2373
2374 Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2375 val.storeValue(inc);
2376
2377 return res;
2378 }
2379
operator --(Int & val)2380 const Int &operator--(Int &val) // Pre-decrement
2381 {
2382 RR_DEBUG_INFO_UPDATE_LOC();
2383 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2384 val.storeValue(inc);
2385
2386 return val;
2387 }
2388
RoundInt(RValue<Float> cast)2389 RValue<Int> RoundInt(RValue<Float> cast)
2390 {
2391 RR_DEBUG_INFO_UPDATE_LOC();
2392 #if defined(__i386__) || defined(__x86_64__)
2393 return x86::cvtss2si(cast);
2394 #else
2395 return RValue<Int>(V(lowerRoundInt(V(cast.value()), T(Int::type()))));
2396 #endif
2397 }
2398
type()2399 Type *Int::type()
2400 {
2401 return T(llvm::Type::getInt32Ty(*jit->context));
2402 }
2403
type()2404 Type *Long::type()
2405 {
2406 return T(llvm::Type::getInt64Ty(*jit->context));
2407 }
2408
UInt(RValue<Float> cast)2409 UInt::UInt(RValue<Float> cast)
2410 {
2411 RR_DEBUG_INFO_UPDATE_LOC();
2412 Value *integer = Nucleus::createFPToUI(cast.value(), UInt::type());
2413 storeValue(integer);
2414 }
2415
operator ++(UInt & val,int)2416 RValue<UInt> operator++(UInt &val, int) // Post-increment
2417 {
2418 RR_DEBUG_INFO_UPDATE_LOC();
2419 RValue<UInt> res = val;
2420
2421 Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2422 val.storeValue(inc);
2423
2424 return res;
2425 }
2426
operator ++(UInt & val)2427 const UInt &operator++(UInt &val) // Pre-increment
2428 {
2429 RR_DEBUG_INFO_UPDATE_LOC();
2430 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2431 val.storeValue(inc);
2432
2433 return val;
2434 }
2435
operator --(UInt & val,int)2436 RValue<UInt> operator--(UInt &val, int) // Post-decrement
2437 {
2438 RR_DEBUG_INFO_UPDATE_LOC();
2439 RValue<UInt> res = val;
2440
2441 Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2442 val.storeValue(inc);
2443
2444 return res;
2445 }
2446
operator --(UInt & val)2447 const UInt &operator--(UInt &val) // Pre-decrement
2448 {
2449 RR_DEBUG_INFO_UPDATE_LOC();
2450 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2451 val.storeValue(inc);
2452
2453 return val;
2454 }
2455
2456 // RValue<UInt> RoundUInt(RValue<Float> cast)
2457 // {
2458 //#if defined(__i386__) || defined(__x86_64__)
2459 // return x86::cvtss2si(val); // FIXME: Unsigned
2460 //#else
2461 // return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2462 //#endif
2463 // }
2464
type()2465 Type *UInt::type()
2466 {
2467 return T(llvm::Type::getInt32Ty(*jit->context));
2468 }
2469
2470 // Int2::Int2(RValue<Int> cast)
2471 // {
2472 // Value *extend = Nucleus::createZExt(cast.value(), Long::type());
2473 // Value *vector = Nucleus::createBitCast(extend, Int2::type());
2474 //
2475 // int shuffle[2] = {0, 0};
2476 // Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2477 //
2478 // storeValue(replicate);
2479 // }
2480
operator <<(RValue<Int2> lhs,unsigned char rhs)2481 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2482 {
2483 RR_DEBUG_INFO_UPDATE_LOC();
2484 #if defined(__i386__) || defined(__x86_64__)
2485 // return RValue<Int2>(Nucleus::createShl(lhs.value(), rhs.value()));
2486
2487 return x86::pslld(lhs, rhs);
2488 #else
2489 return As<Int2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2490 #endif
2491 }
2492
operator >>(RValue<Int2> lhs,unsigned char rhs)2493 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2494 {
2495 RR_DEBUG_INFO_UPDATE_LOC();
2496 #if defined(__i386__) || defined(__x86_64__)
2497 // return RValue<Int2>(Nucleus::createAShr(lhs.value(), rhs.value()));
2498
2499 return x86::psrad(lhs, rhs);
2500 #else
2501 return As<Int2>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2502 #endif
2503 }
2504
type()2505 Type *Int2::type()
2506 {
2507 return T(Type_v2i32);
2508 }
2509
operator <<(RValue<UInt2> lhs,unsigned char rhs)2510 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2511 {
2512 RR_DEBUG_INFO_UPDATE_LOC();
2513 #if defined(__i386__) || defined(__x86_64__)
2514 // return RValue<UInt2>(Nucleus::createShl(lhs.value(), rhs.value()));
2515
2516 return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2517 #else
2518 return As<UInt2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2519 #endif
2520 }
2521
operator >>(RValue<UInt2> lhs,unsigned char rhs)2522 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2523 {
2524 RR_DEBUG_INFO_UPDATE_LOC();
2525 #if defined(__i386__) || defined(__x86_64__)
2526 // return RValue<UInt2>(Nucleus::createLShr(lhs.value(), rhs.value()));
2527
2528 return x86::psrld(lhs, rhs);
2529 #else
2530 return As<UInt2>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2531 #endif
2532 }
2533
type()2534 Type *UInt2::type()
2535 {
2536 return T(Type_v2i32);
2537 }
2538
2539 Int4::Int4(RValue<Byte4> cast)
2540 : XYZW(this)
2541 {
2542 RR_DEBUG_INFO_UPDATE_LOC();
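	// Zero-extend each byte in two steps: interleave with zero bytes to form 16-bit lanes, then
	// interleave those with zero shorts to form 32-bit lanes.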
2543 int swizzle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
2544 Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2545 Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::type()), swizzle);
2546
2547 int swizzle2[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2548 Value *c = Nucleus::createBitCast(b, Short8::type());
2549 Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::type()), swizzle2);
2550
2551 *this = As<Int4>(d);
2552 }
2553
2554 Int4::Int4(RValue<SByte4> cast)
2555 : XYZW(this)
2556 {
2557 RR_DEBUG_INFO_UPDATE_LOC();
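	// Replicate each byte through the 16-bit and 32-bit lanes so the original byte ends up in the top
	// byte of each 32-bit lane; the arithmetic shift right by 24 below then sign-extends it.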
2558 int swizzle[16] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
2559 Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2560 Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2561
2562 int swizzle2[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2563 Value *c = Nucleus::createBitCast(b, Short8::type());
2564 Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2565
2566 *this = As<Int4>(d) >> 24;
2567 }
2568
2569 Int4::Int4(RValue<Short4> cast)
2570 : XYZW(this)
2571 {
2572 RR_DEBUG_INFO_UPDATE_LOC();
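	// Duplicate each 16-bit lane into a 32-bit lane (leaving the original value in the upper half) and
	// sign-extend with the arithmetic shift right by 16 below.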
2573 int swizzle[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2574 Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
2575 *this = As<Int4>(c) >> 16;
2576 }
2577
Int4(RValue<UShort4> cast)2578 Int4::Int4(RValue<UShort4> cast)
2579 : XYZW(this)
2580 {
2581 RR_DEBUG_INFO_UPDATE_LOC();
2582 int swizzle[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2583 Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2584 *this = As<Int4>(c);
2585 }
2586
Int4(RValue<Int> rhs)2587 Int4::Int4(RValue<Int> rhs)
2588 : XYZW(this)
2589 {
2590 RR_DEBUG_INFO_UPDATE_LOC();
2591 Value *vector = loadValue();
2592 Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2593
2594 int swizzle[4] = { 0, 0, 0, 0 };
2595 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2596
2597 storeValue(replicate);
2598 }
2599
operator <<(RValue<Int4> lhs,unsigned char rhs)2600 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2601 {
2602 RR_DEBUG_INFO_UPDATE_LOC();
2603 #if defined(__i386__) || defined(__x86_64__)
2604 return x86::pslld(lhs, rhs);
2605 #else
2606 return As<Int4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2607 #endif
2608 }
2609
operator >>(RValue<Int4> lhs,unsigned char rhs)2610 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2611 {
2612 RR_DEBUG_INFO_UPDATE_LOC();
2613 #if defined(__i386__) || defined(__x86_64__)
2614 return x86::psrad(lhs, rhs);
2615 #else
2616 return As<Int4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2617 #endif
2618 }
2619
CmpEQ(RValue<Int4> x,RValue<Int4> y)2620 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2621 {
2622 RR_DEBUG_INFO_UPDATE_LOC();
2623 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2624 }
2625
CmpLT(RValue<Int4> x,RValue<Int4> y)2626 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2627 {
2628 RR_DEBUG_INFO_UPDATE_LOC();
2629 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), Int4::type()));
2630 }
2631
CmpLE(RValue<Int4> x,RValue<Int4> y)2632 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2633 {
2634 RR_DEBUG_INFO_UPDATE_LOC();
2635 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), Int4::type()));
2636 }
2637
CmpNEQ(RValue<Int4> x,RValue<Int4> y)2638 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2639 {
2640 RR_DEBUG_INFO_UPDATE_LOC();
2641 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2642 }
2643
CmpNLT(RValue<Int4> x,RValue<Int4> y)2644 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2645 {
2646 RR_DEBUG_INFO_UPDATE_LOC();
2647 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), Int4::type()));
2648 }
2649
CmpNLE(RValue<Int4> x,RValue<Int4> y)2650 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2651 {
2652 RR_DEBUG_INFO_UPDATE_LOC();
2653 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), Int4::type()));
2654 }
2655
2656 RValue<Int4> Abs(RValue<Int4> x)
2657 {
2658 #if LLVM_VERSION_MAJOR >= 12
2659 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::abs, { V(x.value())->getType() });
2660 return RValue<Int4>(V(jit->builder->CreateCall(func, { V(x.value()), llvm::ConstantInt::getFalse(*jit->context) })));
2661 #else
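	// Branch-free fallback: 'negative' is all ones for negative lanes and zero otherwise, so
	// (x ^ negative) - negative negates exactly the negative lanes (two's complement).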
2662 auto negative = x >> 31;
2663 return (x ^ negative) - negative;
2664 #endif
2665 }
2666
2667 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2668 {
2669 RR_DEBUG_INFO_UPDATE_LOC();
2670 #if defined(__i386__) || defined(__x86_64__)
2671 if(CPUID::supportsSSE4_1())
2672 {
2673 return x86::pmaxsd(x, y);
2674 }
2675 else
2676 #endif
2677 {
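		// CmpNLE yields an all-ones mask where x > y, so the bitwise blend below picks x in those lanes and y elsewhere.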
2678 RValue<Int4> greater = CmpNLE(x, y);
2679 return (x & greater) | (y & ~greater);
2680 }
2681 }
2682
Min(RValue<Int4> x,RValue<Int4> y)2683 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2684 {
2685 RR_DEBUG_INFO_UPDATE_LOC();
2686 #if defined(__i386__) || defined(__x86_64__)
2687 if(CPUID::supportsSSE4_1())
2688 {
2689 return x86::pminsd(x, y);
2690 }
2691 else
2692 #endif
2693 {
2694 RValue<Int4> less = CmpLT(x, y);
2695 return (x & less) | (y & ~less);
2696 }
2697 }
2698
RoundInt(RValue<Float4> cast)2699 RValue<Int4> RoundInt(RValue<Float4> cast)
2700 {
2701 RR_DEBUG_INFO_UPDATE_LOC();
2702 #if defined(__i386__) || defined(__x86_64__)
2703 return x86::cvtps2dq(cast);
2704 #else
2705 return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2706 #endif
2707 }
2708
RoundIntClamped(RValue<Float4> cast)2709 RValue<Int4> RoundIntClamped(RValue<Float4> cast)
2710 {
2711 RR_DEBUG_INFO_UPDATE_LOC();
2712
2713 // TODO(b/165000222): Check if fptosi_sat produces optimal code for x86 and ARM.
2714 #if defined(__i386__) || defined(__x86_64__)
2715 // cvtps2dq produces 0x80000000, a negative value, for input larger than
2716 // 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
2717 // saturate to 0x80000000.
2718 return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
2719 #elif defined(__arm__) || defined(__aarch64__)
2720 // ARM saturates to the largest positive or negative integer. Unit tests
2721 // verify that lowerRoundInt() behaves as desired.
2722 return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2723 #elif LLVM_VERSION_MAJOR >= 14
2724 llvm::Value *rounded = lowerRound(V(cast.value()));
2725 llvm::Function *fptosi_sat = llvm::Intrinsic::getDeclaration(
2726 jit->module.get(), llvm::Intrinsic::fptosi_sat, { T(Int4::type()), T(Float4::type()) });
2727 return RValue<Int4>(V(jit->builder->CreateCall(fptosi_sat, { rounded })));
2728 #else
2729 RValue<Float4> clamped = Max(Min(cast, Float4(0x7FFFFF80)), Float4(0x80000000));
2730 return As<Int4>(V(lowerRoundInt(V(clamped.value()), T(Int4::type()))));
2731 #endif
2732 }
2733
MulHigh(RValue<Int4> x,RValue<Int4> y)2734 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2735 {
2736 RR_DEBUG_INFO_UPDATE_LOC();
2737 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2738 return As<Int4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2739 }
2740
MulHigh(RValue<UInt4> x,RValue<UInt4> y)2741 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2742 {
2743 RR_DEBUG_INFO_UPDATE_LOC();
2744 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2745 return As<UInt4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2746 }
2747
PackSigned(RValue<Int4> x,RValue<Int4> y)2748 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2749 {
2750 RR_DEBUG_INFO_UPDATE_LOC();
2751 #if defined(__i386__) || defined(__x86_64__)
2752 return x86::packssdw(x, y);
2753 #else
2754 return As<Short8>(V(lowerPack(V(x.value()), V(y.value()), true)));
2755 #endif
2756 }
2757
PackUnsigned(RValue<Int4> x,RValue<Int4> y)2758 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2759 {
2760 RR_DEBUG_INFO_UPDATE_LOC();
2761 #if defined(__i386__) || defined(__x86_64__)
2762 return x86::packusdw(x, y);
2763 #else
2764 return As<UShort8>(V(lowerPack(V(x.value()), V(y.value()), false)));
2765 #endif
2766 }
2767
2768 RValue<Int> SignMask(RValue<Int4> x)
2769 {
2770 RR_DEBUG_INFO_UPDATE_LOC();
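	// Only the top bit of each 32-bit lane matters, so reinterpreting as Float4 and using movmskps
	// yields the same mask as an integer sign-mask would.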
2771 #if defined(__i386__) || defined(__x86_64__)
2772 return x86::movmskps(As<Float4>(x));
2773 #else
2774 return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
2775 #endif
2776 }
2777
type()2778 Type *Int4::type()
2779 {
2780 return T(llvm::VectorType::get(T(Int::type()), 4, false));
2781 }
2782
UInt4(RValue<Float4> cast)2783 UInt4::UInt4(RValue<Float4> cast)
2784 : XYZW(this)
2785 {
2786 RR_DEBUG_INFO_UPDATE_LOC();
2787 Value *xyzw = Nucleus::createFPToUI(cast.value(), UInt4::type());
2788 storeValue(xyzw);
2789 }
2790
UInt4(RValue<UInt> rhs)2791 UInt4::UInt4(RValue<UInt> rhs)
2792 : XYZW(this)
2793 {
2794 RR_DEBUG_INFO_UPDATE_LOC();
2795 Value *vector = loadValue();
2796 Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2797
2798 int swizzle[4] = { 0, 0, 0, 0 };
2799 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2800
2801 storeValue(replicate);
2802 }
2803
operator <<(RValue<UInt4> lhs,unsigned char rhs)2804 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2805 {
2806 RR_DEBUG_INFO_UPDATE_LOC();
2807 #if defined(__i386__) || defined(__x86_64__)
2808 return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2809 #else
2810 return As<UInt4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2811 #endif
2812 }
2813
operator >>(RValue<UInt4> lhs,unsigned char rhs)2814 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2815 {
2816 RR_DEBUG_INFO_UPDATE_LOC();
2817 #if defined(__i386__) || defined(__x86_64__)
2818 return x86::psrld(lhs, rhs);
2819 #else
2820 return As<UInt4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2821 #endif
2822 }
2823
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2824 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2825 {
2826 RR_DEBUG_INFO_UPDATE_LOC();
2827 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2828 }
2829
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2830 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2831 {
2832 RR_DEBUG_INFO_UPDATE_LOC();
2833 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), Int4::type()));
2834 }
2835
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2836 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2837 {
2838 RR_DEBUG_INFO_UPDATE_LOC();
2839 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), Int4::type()));
2840 }
2841
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2842 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2843 {
2844 RR_DEBUG_INFO_UPDATE_LOC();
2845 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2846 }
2847
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2848 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2849 {
2850 RR_DEBUG_INFO_UPDATE_LOC();
2851 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), Int4::type()));
2852 }
2853
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2854 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2855 {
2856 RR_DEBUG_INFO_UPDATE_LOC();
2857 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), Int4::type()));
2858 }
2859
Max(RValue<UInt4> x,RValue<UInt4> y)2860 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2861 {
2862 RR_DEBUG_INFO_UPDATE_LOC();
2863 #if defined(__i386__) || defined(__x86_64__)
2864 if(CPUID::supportsSSE4_1())
2865 {
2866 return x86::pmaxud(x, y);
2867 }
2868 else
2869 #endif
2870 {
2871 RValue<UInt4> greater = CmpNLE(x, y);
2872 return (x & greater) | (y & ~greater);
2873 }
2874 }
2875
Min(RValue<UInt4> x,RValue<UInt4> y)2876 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2877 {
2878 RR_DEBUG_INFO_UPDATE_LOC();
2879 #if defined(__i386__) || defined(__x86_64__)
2880 if(CPUID::supportsSSE4_1())
2881 {
2882 return x86::pminud(x, y);
2883 }
2884 else
2885 #endif
2886 {
2887 RValue<UInt4> less = CmpLT(x, y);
2888 return (x & less) | (y & ~less);
2889 }
2890 }
2891
type()2892 Type *UInt4::type()
2893 {
2894 return T(llvm::VectorType::get(T(UInt::type()), 4, false));
2895 }
2896
type()2897 Type *Half::type()
2898 {
2899 return T(llvm::Type::getInt16Ty(*jit->context));
2900 }
2901
2902 RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
2903 {
2904 RR_DEBUG_INFO_UPDATE_LOC();
2905 #if defined(__i386__) || defined(__x86_64__)
2906 if(exactAtPow2)
2907 {
2908 // rcpss uses a piecewise-linear approximation which minimizes the relative error
2909 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
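		// Note: the correction factor is computed with _mm_rcp_ss on the host while building the routine;
		// since the code is JIT-compiled for the same machine, it is assumed to match the rcpss approximation used at run time.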
2910 return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2911 }
2912 return x86::rcpss(x);
2913 #else
2914 return As<Float>(V(lowerRCP(V(x.value()))));
2915 #endif
2916 }
2917
RcpSqrt_pp(RValue<Float> x)2918 RValue<Float> RcpSqrt_pp(RValue<Float> x)
2919 {
2920 RR_DEBUG_INFO_UPDATE_LOC();
2921 #if defined(__i386__) || defined(__x86_64__)
2922 return x86::rsqrtss(x);
2923 #else
2924 return As<Float>(V(lowerRSQRT(V(x.value()))));
2925 #endif
2926 }
2927
HasRcpApprox()2928 bool HasRcpApprox()
2929 {
2930 #if defined(__i386__) || defined(__x86_64__)
2931 return true;
2932 #else
2933 return false;
2934 #endif
2935 }
2936
RcpApprox(RValue<Float4> x,bool exactAtPow2)2937 RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
2938 {
2939 #if defined(__i386__) || defined(__x86_64__)
2940 if(exactAtPow2)
2941 {
2942 // rcpps uses a piecewise-linear approximation which minimizes the relative error
2943 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2944 return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2945 }
2946 return x86::rcpps(x);
2947 #else
2948 UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2949 return { 0.0f };
2950 #endif
2951 }
2952
2953 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
2954 {
2955 #if defined(__i386__) || defined(__x86_64__)
2956 if(exactAtPow2)
2957 {
2958 // rcpss uses a piecewise-linear approximation which minimizes the relative error
2959 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2960 return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2961 }
2962 return x86::rcpss(x);
2963 #else
2964 	UNREACHABLE("RValue<Float> RcpApprox() not available on this platform");
2965 return { 0.0f };
2966 #endif
2967 }
2968
HasRcpSqrtApprox()2969 bool HasRcpSqrtApprox()
2970 {
2971 #if defined(__i386__) || defined(__x86_64__)
2972 return true;
2973 #else
2974 return false;
2975 #endif
2976 }
2977
RcpSqrtApprox(RValue<Float4> x)2978 RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
2979 {
2980 #if defined(__i386__) || defined(__x86_64__)
2981 return x86::rsqrtps(x);
2982 #else
2983 UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
2984 return { 0.0f };
2985 #endif
2986 }
2987
2988 RValue<Float> RcpSqrtApprox(RValue<Float> x)
2989 {
2990 #if defined(__i386__) || defined(__x86_64__)
2991 return x86::rsqrtss(x);
2992 #else
2993 	UNREACHABLE("RValue<Float> RcpSqrtApprox() not available on this platform");
2994 return { 0.0f };
2995 #endif
2996 }
2997
Sqrt(RValue<Float> x)2998 RValue<Float> Sqrt(RValue<Float> x)
2999 {
3000 RR_DEBUG_INFO_UPDATE_LOC();
3001 #if defined(__i386__) || defined(__x86_64__)
3002 return x86::sqrtss(x);
3003 #else
3004 return As<Float>(V(lowerSQRT(V(x.value()))));
3005 #endif
3006 }
3007
Round(RValue<Float> x)3008 RValue<Float> Round(RValue<Float> x)
3009 {
3010 RR_DEBUG_INFO_UPDATE_LOC();
3011 #if defined(__i386__) || defined(__x86_64__)
3012 if(CPUID::supportsSSE4_1())
3013 {
3014 return x86::roundss(x, 0);
3015 }
3016 else
3017 {
3018 return Float4(Round(Float4(x))).x;
3019 }
3020 #else
3021 return RValue<Float>(V(lowerRound(V(x.value()))));
3022 #endif
3023 }
3024
Trunc(RValue<Float> x)3025 RValue<Float> Trunc(RValue<Float> x)
3026 {
3027 RR_DEBUG_INFO_UPDATE_LOC();
3028 #if defined(__i386__) || defined(__x86_64__)
3029 if(CPUID::supportsSSE4_1())
3030 {
3031 return x86::roundss(x, 3);
3032 }
3033 else
3034 {
3035 return Float(Int(x)); // Rounded toward zero
3036 }
3037 #else
3038 return RValue<Float>(V(lowerTrunc(V(x.value()))));
3039 #endif
3040 }
3041
Frac(RValue<Float> x)3042 RValue<Float> Frac(RValue<Float> x)
3043 {
3044 RR_DEBUG_INFO_UPDATE_LOC();
3045 #if defined(__i386__) || defined(__x86_64__)
3046 if(CPUID::supportsSSE4_1())
3047 {
3048 return x - x86::floorss(x);
3049 }
3050 else
3051 {
3052 return Float4(Frac(Float4(x))).x;
3053 }
3054 #else
3055 // x - floor(x) can be 1.0 for very small negative x.
3056 // Clamp against the value just below 1.0.
3057 return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
3058 #endif
3059 }
3060
Floor(RValue<Float> x)3061 RValue<Float> Floor(RValue<Float> x)
3062 {
3063 RR_DEBUG_INFO_UPDATE_LOC();
3064 #if defined(__i386__) || defined(__x86_64__)
3065 if(CPUID::supportsSSE4_1())
3066 {
3067 return x86::floorss(x);
3068 }
3069 else
3070 {
3071 return Float4(Floor(Float4(x))).x;
3072 }
3073 #else
3074 return RValue<Float>(V(lowerFloor(V(x.value()))));
3075 #endif
3076 }
3077
Ceil(RValue<Float> x)3078 RValue<Float> Ceil(RValue<Float> x)
3079 {
3080 RR_DEBUG_INFO_UPDATE_LOC();
3081 #if defined(__i386__) || defined(__x86_64__)
3082 if(CPUID::supportsSSE4_1())
3083 {
3084 return x86::ceilss(x);
3085 }
3086 else
3087 #endif
3088 {
3089 return Float4(Ceil(Float4(x))).x;
3090 }
3091 }
3092
type()3093 Type *Float::type()
3094 {
3095 return T(llvm::Type::getFloatTy(*jit->context));
3096 }
3097
type()3098 Type *Float2::type()
3099 {
3100 return T(Type_v2f32);
3101 }
3102
Exp2(RValue<Float> v)3103 RValue<Float> Exp2(RValue<Float> v)
3104 {
3105 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::type()) });
3106 return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3107 }
3108
Log2(RValue<Float> v)3109 RValue<Float> Log2(RValue<Float> v)
3110 {
3111 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::type()) });
3112 return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3113 }
3114
Float4(RValue<Float> rhs)3115 Float4::Float4(RValue<Float> rhs)
3116 : XYZW(this)
3117 {
3118 RR_DEBUG_INFO_UPDATE_LOC();
3119 Value *vector = loadValue();
3120 Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
3121
3122 int swizzle[4] = { 0, 0, 0, 0 };
3123 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
3124
3125 storeValue(replicate);
3126 }
3127
MulAdd(RValue<Float4> x,RValue<Float4> y,RValue<Float4> z)3128 RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3129 {
3130 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(Float4::type()) });
3131 return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3132 }
3133
FMA(RValue<Float4> x,RValue<Float4> y,RValue<Float4> z)3134 RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3135 {
3136 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fma, { T(Float4::type()) });
3137 return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3138 }
3139
Abs(RValue<Float4> x)3140 RValue<Float4> Abs(RValue<Float4> x)
3141 {
3142 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
3143 return RValue<Float4>(V(jit->builder->CreateCall(func, V(x.value()))));
3144 }
3145
Max(RValue<Float4> x,RValue<Float4> y)3146 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3147 {
3148 RR_DEBUG_INFO_UPDATE_LOC();
3149 #if defined(__i386__) || defined(__x86_64__)
3150 return x86::maxps(x, y);
3151 #else
3152 return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
3153 #endif
3154 }
3155
Min(RValue<Float4> x,RValue<Float4> y)3156 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3157 {
3158 RR_DEBUG_INFO_UPDATE_LOC();
3159 #if defined(__i386__) || defined(__x86_64__)
3160 return x86::minps(x, y);
3161 #else
3162 return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
3163 #endif
3164 }
3165
Rcp_pp(RValue<Float4> x,bool exactAtPow2)3166 RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
3167 {
3168 RR_DEBUG_INFO_UPDATE_LOC();
3169 #if defined(__i386__) || defined(__x86_64__)
3170 if(exactAtPow2)
3171 {
3172 // rcpps uses a piecewise-linear approximation which minimizes the relative error
3173 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
3174 return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
3175 }
3176 return x86::rcpps(x);
3177 #else
3178 return As<Float4>(V(lowerRCP(V(x.value()))));
3179 #endif
3180 }
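
// Sketch of the exactAtPow2 rectification above: the scale factor is computed on the host at
// routine build time. If, for example, the host's rcpss(1.0f) yields r slightly below 1.0f (the
// exact value is hardware dependent), every rcpps() lane is multiplied by 1.0f / r so that
// power-of-two inputs are intended to produce exact power-of-two reciprocals.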
3181
RcpSqrt_pp(RValue<Float4> x)3182 RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
3183 {
3184 RR_DEBUG_INFO_UPDATE_LOC();
3185 #if defined(__i386__) || defined(__x86_64__)
3186 return x86::rsqrtps(x);
3187 #else
3188 return As<Float4>(V(lowerRSQRT(V(x.value()))));
3189 #endif
3190 }
3191
Sqrt(RValue<Float4> x)3192 RValue<Float4> Sqrt(RValue<Float4> x)
3193 {
3194 RR_DEBUG_INFO_UPDATE_LOC();
3195 #if defined(__i386__) || defined(__x86_64__)
3196 return x86::sqrtps(x);
3197 #else
3198 return As<Float4>(V(lowerSQRT(V(x.value()))));
3199 #endif
3200 }
3201
SignMask(RValue<Float4> x)3202 RValue<Int> SignMask(RValue<Float4> x)
3203 {
3204 RR_DEBUG_INFO_UPDATE_LOC();
3205 #if defined(__i386__) || defined(__x86_64__)
3206 return x86::movmskps(x);
3207 #else
3208 return As<Int>(V(lowerFPSignMask(V(x.value()), T(Int::type()))));
3209 #endif
3210 }
3211
CmpEQ(RValue<Float4> x,RValue<Float4> y)3212 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3213 {
3214 RR_DEBUG_INFO_UPDATE_LOC();
3215 // return As<Int4>(x86::cmpeqps(x, y));
3216 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), Int4::type()));
3217 }
3218
CmpLT(RValue<Float4> x,RValue<Float4> y)3219 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3220 {
3221 RR_DEBUG_INFO_UPDATE_LOC();
3222 // return As<Int4>(x86::cmpltps(x, y));
3223 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), Int4::type()));
3224 }
3225
CmpLE(RValue<Float4> x,RValue<Float4> y)3226 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3227 {
3228 RR_DEBUG_INFO_UPDATE_LOC();
3229 // return As<Int4>(x86::cmpleps(x, y));
3230 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), Int4::type()));
3231 }
3232
CmpNEQ(RValue<Float4> x,RValue<Float4> y)3233 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3234 {
3235 RR_DEBUG_INFO_UPDATE_LOC();
3236 // return As<Int4>(x86::cmpneqps(x, y));
3237 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), Int4::type()));
3238 }
3239
CmpNLT(RValue<Float4> x,RValue<Float4> y)3240 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3241 {
3242 RR_DEBUG_INFO_UPDATE_LOC();
3243 // return As<Int4>(x86::cmpnltps(x, y));
3244 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), Int4::type()));
3245 }
3246
CmpNLE(RValue<Float4> x,RValue<Float4> y)3247 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3248 {
3249 RR_DEBUG_INFO_UPDATE_LOC();
3250 // return As<Int4>(x86::cmpnleps(x, y));
3251 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), Int4::type()));
3252 }
3253
CmpUEQ(RValue<Float4> x,RValue<Float4> y)3254 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3255 {
3256 RR_DEBUG_INFO_UPDATE_LOC();
3257 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), Int4::type()));
3258 }
3259
CmpULT(RValue<Float4> x,RValue<Float4> y)3260 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3261 {
3262 RR_DEBUG_INFO_UPDATE_LOC();
3263 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), Int4::type()));
3264 }
3265
CmpULE(RValue<Float4> x,RValue<Float4> y)3266 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3267 {
3268 RR_DEBUG_INFO_UPDATE_LOC();
3269 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), Int4::type()));
3270 }
3271
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)3272 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3273 {
3274 RR_DEBUG_INFO_UPDATE_LOC();
3275 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), Int4::type()));
3276 }
3277
CmpUNLT(RValue<Float4> x,RValue<Float4> y)3278 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3279 {
3280 RR_DEBUG_INFO_UPDATE_LOC();
3281 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), Int4::type()));
3282 }
3283
CmpUNLE(RValue<Float4> x,RValue<Float4> y)3284 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3285 {
3286 RR_DEBUG_INFO_UPDATE_LOC();
3287 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), Int4::type()));
3288 }
3289
Round(RValue<Float4> x)3290 RValue<Float4> Round(RValue<Float4> x)
3291 {
3292 RR_DEBUG_INFO_UPDATE_LOC();
3293 #if defined(__i386__) || defined(__x86_64__)
3294 if(CPUID::supportsSSE4_1())
3295 {
3296 return x86::roundps(x, 0);
3297 }
3298 else
3299 {
3300 return Float4(RoundInt(x));
3301 }
3302 #else
3303 return RValue<Float4>(V(lowerRound(V(x.value()))));
3304 #endif
3305 }
3306
Trunc(RValue<Float4> x)3307 RValue<Float4> Trunc(RValue<Float4> x)
3308 {
3309 RR_DEBUG_INFO_UPDATE_LOC();
3310 #if defined(__i386__) || defined(__x86_64__)
3311 if(CPUID::supportsSSE4_1())
3312 {
3313 return x86::roundps(x, 3);
3314 }
3315 else
3316 {
3317 return Float4(Int4(x));
3318 }
3319 #else
3320 return RValue<Float4>(V(lowerTrunc(V(x.value()))));
3321 #endif
3322 }
3323
Frac(RValue<Float4> x)3324 RValue<Float4> Frac(RValue<Float4> x)
3325 {
3326 RR_DEBUG_INFO_UPDATE_LOC();
3327 Float4 frc;
3328
3329 #if defined(__i386__) || defined(__x86_64__)
3330 if(CPUID::supportsSSE4_1())
3331 {
3332 frc = x - x86::floorps(x);
3333 }
3334 else
3335 {
3336 frc = x - Float4(Int4(x)); // Signed fractional part.
3337
3338 frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f))); // Add 1.0 if negative.
3339 }
3340 #else
3341 frc = x - Floor(x);
3342 #endif
3343
3344 // x - floor(x) can be 1.0 for very small negative x.
3345 // Clamp against the value just below 1.0.
3346 return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3347 }
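
// Worked example for the non-SSE4.1 path above (a sketch): for x = -0.25f, Float4(Int4(x)) is 0.0f,
// so frc starts at -0.25f; CmpNLE(0.0f, frc) is then true and 1.0f is added, giving frac(-0.25f) == 0.75f.
// The final Min() clamp covers tiny negative x for which x - floor(x) rounds up to exactly 1.0f.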
3348
Floor(RValue<Float4> x)3349 RValue<Float4> Floor(RValue<Float4> x)
3350 {
3351 RR_DEBUG_INFO_UPDATE_LOC();
3352 #if defined(__i386__) || defined(__x86_64__)
3353 if(CPUID::supportsSSE4_1())
3354 {
3355 return x86::floorps(x);
3356 }
3357 else
3358 {
3359 return x - Frac(x);
3360 }
3361 #else
3362 return RValue<Float4>(V(lowerFloor(V(x.value()))));
3363 #endif
3364 }
3365
Ceil(RValue<Float4> x)3366 RValue<Float4> Ceil(RValue<Float4> x)
3367 {
3368 RR_DEBUG_INFO_UPDATE_LOC();
3369 #if defined(__i386__) || defined(__x86_64__)
3370 if(CPUID::supportsSSE4_1())
3371 {
3372 return x86::ceilps(x);
3373 }
3374 else
3375 #endif
3376 {
3377 return -Floor(-x);
3378 }
3379 }
3380
Sin(RValue<Float4> v)3381 RValue<Float4> Sin(RValue<Float4> v)
3382 {
3383 RR_DEBUG_INFO_UPDATE_LOC();
3384 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value())->getType() });
3385 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3386 }
3387
Cos(RValue<Float4> v)3388 RValue<Float4> Cos(RValue<Float4> v)
3389 {
3390 RR_DEBUG_INFO_UPDATE_LOC();
3391 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value())->getType() });
3392 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3393 }
3394
Tan(RValue<Float4> v)3395 RValue<Float4> Tan(RValue<Float4> v)
3396 {
3397 RR_DEBUG_INFO_UPDATE_LOC();
3398 return Sin(v) / Cos(v);
3399 }
3400
TransformFloat4PerElement(RValue<Float4> v,const char * name)3401 static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char *name)
3402 {
3403 auto funcTy = llvm::FunctionType::get(T(Float::type()), llvm::ArrayRef<llvm::Type *>(T(Float::type())), false);
3404 auto func = jit->module->getOrInsertFunction(name, funcTy);
3405 llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3406 for(uint64_t i = 0; i < 4; i++)
3407 {
3408 auto el = jit->builder->CreateCall(func, V(Nucleus::createExtractElement(v.value(), Float::type(), i)));
3409 out = V(Nucleus::createInsertElement(V(out), V(el), i));
3410 }
3411 return RValue<Float4>(V(out));
3412 }
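
// TransformFloat4PerElement() above simply expands to four scalar libm calls. Conceptually
// (a sketch, not the emitted IR verbatim):
//   out.x = name(v.x); out.y = name(v.y); out.z = name(v.z); out.w = name(v.w);
// where 'name' must match the C signature float(float), e.g. "asinf" below.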
3413
Asin(RValue<Float4> v)3414 RValue<Float4> Asin(RValue<Float4> v)
3415 {
3416 RR_DEBUG_INFO_UPDATE_LOC();
3417 return TransformFloat4PerElement(v, "asinf");
3418 }
3419
Acos(RValue<Float4> v)3420 RValue<Float4> Acos(RValue<Float4> v)
3421 {
3422 RR_DEBUG_INFO_UPDATE_LOC();
3423 return TransformFloat4PerElement(v, "acosf");
3424 }
3425
Atan(RValue<Float4> v)3426 RValue<Float4> Atan(RValue<Float4> v)
3427 {
3428 RR_DEBUG_INFO_UPDATE_LOC();
3429 return TransformFloat4PerElement(v, "atanf");
3430 }
3431
Sinh(RValue<Float4> v)3432 RValue<Float4> Sinh(RValue<Float4> v)
3433 {
3434 RR_DEBUG_INFO_UPDATE_LOC();
3435 return TransformFloat4PerElement(v, "sinhf");
3436 }
3437
Cosh(RValue<Float4> v)3438 RValue<Float4> Cosh(RValue<Float4> v)
3439 {
3440 RR_DEBUG_INFO_UPDATE_LOC();
3441 return TransformFloat4PerElement(v, "coshf");
3442 }
3443
Tanh(RValue<Float4> v)3444 RValue<Float4> Tanh(RValue<Float4> v)
3445 {
3446 RR_DEBUG_INFO_UPDATE_LOC();
3447 return TransformFloat4PerElement(v, "tanhf");
3448 }
3449
Asinh(RValue<Float4> v)3450 RValue<Float4> Asinh(RValue<Float4> v)
3451 {
3452 RR_DEBUG_INFO_UPDATE_LOC();
3453 return TransformFloat4PerElement(v, "asinhf");
3454 }
3455
Acosh(RValue<Float4> v)3456 RValue<Float4> Acosh(RValue<Float4> v)
3457 {
3458 RR_DEBUG_INFO_UPDATE_LOC();
3459 return TransformFloat4PerElement(v, "acoshf");
3460 }
3461
Atanh(RValue<Float4> v)3462 RValue<Float4> Atanh(RValue<Float4> v)
3463 {
3464 RR_DEBUG_INFO_UPDATE_LOC();
3465 return TransformFloat4PerElement(v, "atanhf");
3466 }
3467
Atan2(RValue<Float4> x,RValue<Float4> y)3468 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
3469 {
3470 RR_DEBUG_INFO_UPDATE_LOC();
3471 llvm::SmallVector<llvm::Type *, 2> paramTys;
3472 paramTys.push_back(T(Float::type()));
3473 paramTys.push_back(T(Float::type()));
3474 auto funcTy = llvm::FunctionType::get(T(Float::type()), paramTys, false);
3475 auto func = jit->module->getOrInsertFunction("atan2f", funcTy);
3476 llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3477 for(uint64_t i = 0; i < 4; i++)
3478 {
3479 auto el = jit->builder->CreateCall(func, { V(Nucleus::createExtractElement(x.value(), Float::type(), i)),
3480 V(Nucleus::createExtractElement(y.value(), Float::type(), i)) });
3481 out = V(Nucleus::createInsertElement(V(out), V(el), i));
3482 }
3483 return RValue<Float4>(V(out));
3484 }
3485
Pow(RValue<Float4> x,RValue<Float4> y)3486 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
3487 {
3488 RR_DEBUG_INFO_UPDATE_LOC();
3489 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::type()) });
3490 return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()) })));
3491 }
3492
Exp(RValue<Float4> v)3493 RValue<Float4> Exp(RValue<Float4> v)
3494 {
3495 RR_DEBUG_INFO_UPDATE_LOC();
3496 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::type()) });
3497 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3498 }
3499
Log(RValue<Float4> v)3500 RValue<Float4> Log(RValue<Float4> v)
3501 {
3502 RR_DEBUG_INFO_UPDATE_LOC();
3503 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::type()) });
3504 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3505 }
3506
Exp2(RValue<Float4> v)3507 RValue<Float4> Exp2(RValue<Float4> v)
3508 {
3509 RR_DEBUG_INFO_UPDATE_LOC();
3510 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::type()) });
3511 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3512 }
3513
Log2(RValue<Float4> v)3514 RValue<Float4> Log2(RValue<Float4> v)
3515 {
3516 RR_DEBUG_INFO_UPDATE_LOC();
3517 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::type()) });
3518 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3519 }
3520
Ctlz(RValue<UInt> v,bool isZeroUndef)3521 RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
3522 {
3523 RR_DEBUG_INFO_UPDATE_LOC();
3524 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::type()) });
3525 return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3526 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3527 }
3528
Ctlz(RValue<UInt4> v,bool isZeroUndef)3529 RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
3530 {
3531 RR_DEBUG_INFO_UPDATE_LOC();
3532 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::type()) });
3533 return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3534 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3535 }
3536
Cttz(RValue<UInt> v,bool isZeroUndef)3537 RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
3538 {
3539 RR_DEBUG_INFO_UPDATE_LOC();
3540 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::type()) });
3541 return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3542 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3543 }
3544
Cttz(RValue<UInt4> v,bool isZeroUndef)3545 RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
3546 {
3547 RR_DEBUG_INFO_UPDATE_LOC();
3548 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::type()) });
3549 return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3550 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3551 }
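
// Note on the second operand passed to llvm.ctlz / llvm.cttz above: when it is true, a zero input
// produces an undefined (poison) result, which lets the backend pick the cheapest instruction.
// With isZeroUndef == false the result is fully defined, e.g. for the 32-bit case
// Ctlz(UInt(0), false) == 32 and Ctlz(UInt(1), false) == 31.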
3552
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3553 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3554 {
3555 return RValue<Int>(Nucleus::createAtomicMin(x.value(), y.value(), memoryOrder));
3556 }
3557
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3558 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3559 {
3560 return RValue<UInt>(Nucleus::createAtomicUMin(x.value(), y.value(), memoryOrder));
3561 }
3562
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3563 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3564 {
3565 return RValue<Int>(Nucleus::createAtomicMax(x.value(), y.value(), memoryOrder));
3566 }
3567
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3568 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3569 {
3570 return RValue<UInt>(Nucleus::createAtomicUMax(x.value(), y.value(), memoryOrder));
3571 }
3572
type()3573 Type *Float4::type()
3574 {
3575 return T(llvm::VectorType::get(T(Float::type()), 4, false));
3576 }
3577
Ticks()3578 RValue<Long> Ticks()
3579 {
3580 RR_DEBUG_INFO_UPDATE_LOC();
3581 llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
3582
3583 return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
3584 }
3585
ConstantPointer(void const * ptr)3586 RValue<Pointer<Byte>> ConstantPointer(void const *ptr)
3587 {
3588 RR_DEBUG_INFO_UPDATE_LOC();
3589 // Note: this should work for 32-bit pointers as well because 'inttoptr'
3590 // is defined to truncate (and zero extend) if necessary.
3591 auto ptrAsInt = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), reinterpret_cast<uintptr_t>(ptr));
3592 return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::type()))));
3593 }
3594
ConstantData(void const * data,size_t size)3595 RValue<Pointer<Byte>> ConstantData(void const *data, size_t size)
3596 {
3597 RR_DEBUG_INFO_UPDATE_LOC();
3598 auto str = ::std::string(reinterpret_cast<const char *>(data), size);
3599 auto ptr = jit->builder->CreateGlobalStringPtr(str);
3600 return RValue<Pointer<Byte>>(V(ptr));
3601 }
3602
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)3603 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
3604 {
3605 // If this is a MemorySanitizer build, but Reactor routine instrumentation is not enabled,
3606 // mark all call arguments as initialized by calling __msan_unpoison_param().
3607 if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
3608 {
3609 // void __msan_unpoison_param(size_t n)
3610 auto voidTy = llvm::Type::getVoidTy(*jit->context);
3611 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
3612 auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy }, false);
3613 auto func = jit->module->getOrInsertFunction("__msan_unpoison_param", funcTy);
3614
3615 jit->builder->CreateCall(func, { llvm::ConstantInt::get(sizetTy, args.size()) });
3616 }
3617
3618 RR_DEBUG_INFO_UPDATE_LOC();
3619 llvm::SmallVector<llvm::Type *, 8> paramTys;
3620 for(auto ty : argTys) { paramTys.push_back(T(ty)); }
3621 auto funcTy = llvm::FunctionType::get(T(retTy), paramTys, false);
3622
3623 auto funcPtrTy = funcTy->getPointerTo();
3624 auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value()), funcPtrTy);
3625
3626 llvm::SmallVector<llvm::Value *, 8> arguments;
3627 for(auto arg : args) { arguments.push_back(V(arg)); }
3628 return V(jit->builder->CreateCall(funcTy, funcPtr, arguments));
3629 }
3630
Breakpoint()3631 void Breakpoint()
3632 {
3633 RR_DEBUG_INFO_UPDATE_LOC();
3634 llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
3635
3636 jit->builder->CreateCall(debugtrap);
3637 }
3638
3639 } // namespace rr
3640
3641 namespace rr {
3642
3643 #if defined(__i386__) || defined(__x86_64__)
3644 namespace x86 {
3645
3646 // Differs from IRBuilder<>::CreateUnaryIntrinsic() in that it only accepts native instruction intrinsics which have
3647 // implicit types, such as 'x86_sse_rcp_ps' operating on v4f32, while 'sqrt' requires explicitly specifying the operand type.
createInstruction(llvm::Intrinsic::ID id,Value * x)3648 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x)
3649 {
3650 llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3651
3652 return V(jit->builder->CreateCall(intrinsic, V(x)));
3653 }
3654
3655 // Differs from IRBuilder<>::CreateBinaryIntrinsic() in that it only accepts native instruction intrinsics which have
3656 // implicit types, such as 'x86_sse_max_ps' operating on v4f32, while 'sadd_sat' requires explicitly specifying the operand types.
createInstruction(llvm::Intrinsic::ID id,Value * x,Value * y)3657 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x, Value *y)
3658 {
3659 llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3660
3661 return V(jit->builder->CreateCall(intrinsic, { V(x), V(y) }));
3662 }
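
// For contrast with the helpers above (a sketch): an intrinsic with an implicit type is emitted as
//   createInstruction(llvm::Intrinsic::x86_sse_rcp_ps, val);
// whereas an overloaded intrinsic such as sqrt needs its operand type inferred from the argument,
// as done with CreateUnaryIntrinsic() in sqrtss()/sqrtps() below.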
3663
cvtss2si(RValue<Float> val)3664 RValue<Int> cvtss2si(RValue<Float> val)
3665 {
3666 Float4 vector;
3667 vector.x = val;
3668
3669 return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_cvtss2si, RValue<Float4>(vector).value()));
3670 }
3671
cvtps2dq(RValue<Float4> val)3672 RValue<Int4> cvtps2dq(RValue<Float4> val)
3673 {
3674 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_cvtps2dq, val.value()));
3675 }
3676
rcpss(RValue<Float> val)3677 RValue<Float> rcpss(RValue<Float> val)
3678 {
3679 Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3680
3681 // TODO(b/172238865): MemorySanitizer does not support the rcpss instruction,
3682 // which makes it look at the entire 128-bit input operand for undefined bits.
3683 // Use zero-initialized values instead.
3684 if(__has_feature(memory_sanitizer))
3685 {
3686 undef = Float4(0).loadValue();
3687 }
3688
3689 Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3690
3691 return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rcp_ss, vector), Float::type(), 0));
3692 }
3693
sqrtss(RValue<Float> val)3694 RValue<Float> sqrtss(RValue<Float> val)
3695 {
3696 return RValue<Float>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3697 }
3698
rsqrtss(RValue<Float> val)3699 RValue<Float> rsqrtss(RValue<Float> val)
3700 {
3701 Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3702
3703 // TODO(b/172238865): MemorySanitizer does not support the rsqrtss instruction,
3704 // which makes it look at the entire 128-bit input operand for undefined bits.
3705 // Use zero-initialized values instead.
3706 if(__has_feature(memory_sanitizer))
3707 {
3708 undef = Float4(0).loadValue();
3709 }
3710
3711 Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3712
3713 return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ss, vector), Float::type(), 0));
3714 }
3715
rcpps(RValue<Float4> val)3716 RValue<Float4> rcpps(RValue<Float4> val)
3717 {
3718 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rcp_ps, val.value()));
3719 }
3720
sqrtps(RValue<Float4> val)3721 RValue<Float4> sqrtps(RValue<Float4> val)
3722 {
3723 return RValue<Float4>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3724 }
3725
rsqrtps(RValue<Float4> val)3726 RValue<Float4> rsqrtps(RValue<Float4> val)
3727 {
3728 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ps, val.value()));
3729 }
3730
maxps(RValue<Float4> x,RValue<Float4> y)3731 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3732 {
3733 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_max_ps, x.value(), y.value()));
3734 }
3735
minps(RValue<Float4> x,RValue<Float4> y)3736 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3737 {
3738 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_min_ps, x.value(), y.value()));
3739 }
3740
roundss(RValue<Float> val,unsigned char imm)3741 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3742 {
3743 llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3744
3745 Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3746
3747 // TODO(b/172238865): MemorySanitizer does not support the roundss instruction,
3748 // which makes it look at the entire 128-bit input operands for undefined bits.
3749 // Use zero-initialized values instead.
3750 if(__has_feature(memory_sanitizer))
3751 {
3752 undef = Float4(0).loadValue();
3753 }
3754
3755 Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3756
3757 return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(roundss, { V(undef), V(vector), V(Nucleus::createConstantInt(imm)) })), Float::type(), 0));
3758 }
3759
floorss(RValue<Float> val)3760 RValue<Float> floorss(RValue<Float> val)
3761 {
3762 return roundss(val, 1);
3763 }
3764
ceilss(RValue<Float> val)3765 RValue<Float> ceilss(RValue<Float> val)
3766 {
3767 return roundss(val, 2);
3768 }
3769
roundps(RValue<Float4> val,unsigned char imm)3770 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3771 {
3772 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse41_round_ps, val.value(), Nucleus::createConstantInt(imm)));
3773 }
3774
floorps(RValue<Float4> val)3775 RValue<Float4> floorps(RValue<Float4> val)
3776 {
3777 return roundps(val, 1);
3778 }
3779
ceilps(RValue<Float4> val)3780 RValue<Float4> ceilps(RValue<Float4> val)
3781 {
3782 return roundps(val, 2);
3783 }
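
// Rounding-control immediates used by roundss()/roundps() above (SSE4.1 ROUNDSS/ROUNDPS, bits [1:0]
// of the immediate, with the MXCSR-select bit 2 clear):
//   0 = round to nearest (even), 1 = round down (floor), 2 = round up (ceil), 3 = round toward zero (trunc)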
3784
paddsw(RValue<Short4> x,RValue<Short4> y)3785 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3786 {
3787 return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3788 }
3789
psubsw(RValue<Short4> x,RValue<Short4> y)3790 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3791 {
3792 return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3793 }
3794
paddusw(RValue<UShort4> x,RValue<UShort4> y)3795 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3796 {
3797 return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3798 }
3799
psubusw(RValue<UShort4> x,RValue<UShort4> y)3800 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3801 {
3802 return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3803 }
3804
paddsb(RValue<SByte8> x,RValue<SByte8> y)3805 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3806 {
3807 return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3808 }
3809
psubsb(RValue<SByte8> x,RValue<SByte8> y)3810 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3811 {
3812 return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3813 }
3814
paddusb(RValue<Byte8> x,RValue<Byte8> y)3815 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3816 {
3817 return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3818 }
3819
psubusb(RValue<Byte8> x,RValue<Byte8> y)3820 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3821 {
3822 return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3823 }
3824
pavgw(RValue<UShort4> x,RValue<UShort4> y)3825 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3826 {
3827 return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
3828 }
3829
pmaxsw(RValue<Short4> x,RValue<Short4> y)3830 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3831 {
3832 return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3833 }
3834
pminsw(RValue<Short4> x,RValue<Short4> y)3835 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3836 {
3837 return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3838 }
3839
pcmpgtw(RValue<Short4> x,RValue<Short4> y)3840 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3841 {
3842 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
3843 }
3844
pcmpeqw(RValue<Short4> x,RValue<Short4> y)3845 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3846 {
3847 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
3848 }
3849
pcmpgtb(RValue<SByte8> x,RValue<SByte8> y)3850 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3851 {
3852 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
3853 }
3854
pcmpeqb(RValue<Byte8> x,RValue<Byte8> y)3855 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3856 {
3857 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
3858 }
3859
packssdw(RValue<Int2> x,RValue<Int2> y)3860 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3861 {
3862 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3863 }
3864
packssdw(RValue<Int4> x,RValue<Int4> y)3865 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3866 {
3867 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3868 }
3869
packsswb(RValue<Short4> x,RValue<Short4> y)3870 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3871 {
3872 return As<SByte8>(createInstruction(llvm::Intrinsic::x86_sse2_packsswb_128, x.value(), y.value()));
3873 }
3874
packuswb(RValue<Short4> x,RValue<Short4> y)3875 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3876 {
3877 return As<Byte8>(createInstruction(llvm::Intrinsic::x86_sse2_packuswb_128, x.value(), y.value()));
3878 }
3879
packusdw(RValue<Int4> x,RValue<Int4> y)3880 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3881 {
3882 if(CPUID::supportsSSE4_1())
3883 {
3884 return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse41_packusdw, x.value(), y.value()));
3885 }
3886 else
3887 {
3888 RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3889 RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3890
3891 return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3892 }
3893 }
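
// Worked example for the non-SSE4.1 packusdw() fallback above (a sketch):
//   x = 70000 -> bx = 70000 - 0x8000 = 37232 -> packssdw saturates to 32767 -> + 0x8000 wraps to 65535
//   x = -5    -> (x & ~(x >> 31)) == 0 -> bx = -32768 -> packssdw keeps -32768 -> + 0x8000 wraps to 0
// i.e. the signed-saturating pack plus bias reproduces unsigned saturation to [0, 65535].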
3894
psrlw(RValue<UShort4> x,unsigned char y)3895 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3896 {
3897 return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3898 }
3899
psrlw(RValue<UShort8> x,unsigned char y)3900 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3901 {
3902 return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3903 }
3904
psraw(RValue<Short4> x,unsigned char y)3905 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3906 {
3907 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3908 }
3909
psraw(RValue<Short8> x,unsigned char y)3910 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3911 {
3912 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3913 }
3914
psllw(RValue<Short4> x,unsigned char y)3915 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3916 {
3917 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3918 }
3919
psllw(RValue<Short8> x,unsigned char y)3920 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3921 {
3922 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3923 }
3924
pslld(RValue<Int2> x,unsigned char y)3925 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3926 {
3927 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3928 }
3929
pslld(RValue<Int4> x,unsigned char y)3930 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3931 {
3932 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3933 }
3934
psrad(RValue<Int2> x,unsigned char y)3935 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3936 {
3937 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3938 }
3939
psrad(RValue<Int4> x,unsigned char y)3940 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3941 {
3942 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3943 }
3944
psrld(RValue<UInt2> x,unsigned char y)3945 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3946 {
3947 return As<UInt2>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3948 }
3949
psrld(RValue<UInt4> x,unsigned char y)3950 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3951 {
3952 return RValue<UInt4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3953 }
3954
pmaxsd(RValue<Int4> x,RValue<Int4> y)3955 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3956 {
3957 return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3958 }
3959
pminsd(RValue<Int4> x,RValue<Int4> y)3960 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3961 {
3962 return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3963 }
3964
pmaxud(RValue<UInt4> x,RValue<UInt4> y)3965 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3966 {
3967 return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_UGT)));
3968 }
3969
pminud(RValue<UInt4> x,RValue<UInt4> y)3970 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3971 {
3972 return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_ULT)));
3973 }
3974
pmulhw(RValue<Short4> x,RValue<Short4> y)3975 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3976 {
3977 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3978 }
3979
pmulhuw(RValue<UShort4> x,RValue<UShort4> y)3980 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3981 {
3982 return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3983 }
3984
pmaddwd(RValue<Short4> x,RValue<Short4> y)3985 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3986 {
3987 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3988 }
3989
pmulhw(RValue<Short8> x,RValue<Short8> y)3990 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3991 {
3992 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3993 }
3994
pmulhuw(RValue<UShort8> x,RValue<UShort8> y)3995 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3996 {
3997 return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3998 }
3999
pmaddwd(RValue<Short8> x,RValue<Short8> y)4000 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
4001 {
4002 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
4003 }
4004
movmskps(RValue<Float4> x)4005 RValue<Int> movmskps(RValue<Float4> x)
4006 {
4007 Value *v = x.value();
4008
4009 // TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
4010 // which makes it look at the entire 128-bit input for undefined bits. Mask off
4011 // just the sign bits to avoid false positives.
4012 if(__has_feature(memory_sanitizer))
4013 {
4014 v = As<Float4>(As<Int4>(v) & Int4(0x80000000u)).value();
4015 }
4016
4017 return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_movmsk_ps, v));
4018 }
4019
pmovmskb(RValue<Byte8> x)4020 RValue<Int> pmovmskb(RValue<Byte8> x)
4021 {
4022 Value *v = x.value();
4023
4024 // TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
4025 // which makes it look at the entire 128-bit input for undefined bits. Mask off
4026 // just the sign bits in the lower 64-bit vector to avoid false positives.
4027 if(__has_feature(memory_sanitizer))
4028 {
4029 v = As<Byte16>(As<Int4>(v) & Int4(0x80808080u, 0x80808080u, 0, 0)).value();
4030 }
4031
4032 return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse2_pmovmskb_128, v)) & 0xFF;
4033 }
4034
4035 } // namespace x86
4036 #endif // defined(__i386__) || defined(__x86_64__)
4037
4038 #ifdef ENABLE_RR_PRINT
VPrintf(const std::vector<Value * > & vals)4039 void VPrintf(const std::vector<Value *> &vals)
4040 {
4041 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
4042 auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4043 auto funcTy = llvm::FunctionType::get(i32Ty, { i8PtrTy }, true);
4044 auto func = jit->module->getOrInsertFunction("rr::DebugPrintf", funcTy);
4045 jit->builder->CreateCall(func, V(vals));
4046 }
4047 #endif // ENABLE_RR_PRINT
4048
Nop()4049 void Nop()
4050 {
4051 auto voidTy = llvm::Type::getVoidTy(*jit->context);
4052 auto funcTy = llvm::FunctionType::get(voidTy, {}, false);
4053 auto func = jit->module->getOrInsertFunction("nop", funcTy);
4054 jit->builder->CreateCall(func);
4055 }
4056
EmitDebugLocation()4057 void EmitDebugLocation()
4058 {
4059 #ifdef ENABLE_RR_DEBUG_INFO
4060 if(jit->debugInfo != nullptr)
4061 {
4062 jit->debugInfo->EmitLocation();
4063 }
4064 #endif // ENABLE_RR_DEBUG_INFO
4065 }
4066
EmitDebugVariable(Value * value)4067 void EmitDebugVariable(Value *value)
4068 {
4069 #ifdef ENABLE_RR_DEBUG_INFO
4070 if(jit->debugInfo != nullptr)
4071 {
4072 jit->debugInfo->EmitVariable(value);
4073 }
4074 #endif // ENABLE_RR_DEBUG_INFO
4075 }
4076
FlushDebug()4077 void FlushDebug()
4078 {
4079 #ifdef ENABLE_RR_DEBUG_INFO
4080 if(jit->debugInfo != nullptr)
4081 {
4082 jit->debugInfo->Flush();
4083 }
4084 #endif // ENABLE_RR_DEBUG_INFO
4085 }
4086
4087 } // namespace rr
4088
4089 // ------------------------------ Coroutines ------------------------------
4090
4091 namespace {
4092
4093 // Magic values returned by llvm.coro.suspend.
4094 // See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
4095 enum SuspendAction
4096 {
4097 SuspendActionSuspend = -1,
4098 SuspendActionResume = 0,
4099 SuspendActionDestroy = 1
4100 };
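
// llvm.coro.suspend returns an i8 holding one of these values at every suspend point; the switches
// built below branch on it. Conceptually (a sketch):
//
//   switch(llvm.coro.suspend(token none, i1 final))
//   {
//   case SuspendActionResume:   continue after the suspend point;
//   case SuspendActionDestroy:  goto destroy block;
//   default:                    /* SuspendActionSuspend */ return to the caller via the suspend block;
//   }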
4101
promoteFunctionToCoroutine()4102 void promoteFunctionToCoroutine()
4103 {
4104 ASSERT(jit->coroutine.id == nullptr);
4105
4106 // Types
4107 auto voidTy = llvm::Type::getVoidTy(*jit->context);
4108 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4109 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4110 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
4111 auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4112 auto promiseTy = jit->coroutine.yieldType;
4113 auto promisePtrTy = promiseTy->getPointerTo();
4114
4115 // LLVM intrinsics
4116 auto coro_id = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
4117 auto coro_size = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
4118 auto coro_begin = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
4119 auto coro_resume = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_resume);
4120 auto coro_end = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_end);
4121 auto coro_free = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_free);
4122 auto coro_destroy = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_destroy);
4123 auto coro_promise = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_promise);
4124 auto coro_done = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_done);
4125 auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4126
4127 auto allocFrameTy = llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
4128 auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
4129 auto freeFrameTy = llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
4130 auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);
4131
4132 auto oldInsertionPoint = jit->builder->saveIP();
4133
4134 // Build the coroutine_await() function:
4135 //
4136 // bool coroutine_await(CoroutineHandle* handle, YieldType* out)
4137 // {
4138 // if(llvm.coro.done(handle))
4139 // {
4140 // return false;
4141 // }
4142 // else
4143 // {
4144 	// 		*out = *(YieldType*)llvm.coro.promise(handle);
4145 // llvm.coro.resume(handle);
4146 // return true;
4147 // }
4148 // }
4149 //
4150 {
4151 auto args = jit->coroutine.await->arg_begin();
4152 auto handle = args++;
4153 auto outPtr = args++;
4154 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "co_await", jit->coroutine.await));
4155 auto doneBlock = llvm::BasicBlock::Create(*jit->context, "done", jit->coroutine.await);
4156 auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->coroutine.await);
4157
4158 auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
4159 jit->builder->CreateCondBr(done, doneBlock, resumeBlock);
4160
4161 jit->builder->SetInsertPoint(doneBlock);
4162 jit->builder->CreateRet(llvm::ConstantInt::getFalse(i1Ty));
4163
4164 jit->builder->SetInsertPoint(resumeBlock);
4165 auto promiseAlignment = llvm::ConstantInt::get(i32Ty, 4); // TODO: Get correct alignment.
4166 auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, llvm::ConstantInt::get(i1Ty, 0) });
4167 auto promise = jit->builder->CreateLoad(promiseTy, jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
4168 jit->builder->CreateStore(promise, outPtr);
4169 jit->builder->CreateCall(coro_resume, { handle });
4170 jit->builder->CreateRet(llvm::ConstantInt::getTrue(i1Ty));
4171 }
4172
4173 // Build the coroutine_destroy() function:
4174 //
4175 // void coroutine_destroy(CoroutineHandle* handle)
4176 // {
4177 // llvm.coro.destroy(handle);
4178 // }
4179 //
4180 {
4181 auto handle = jit->coroutine.destroy->arg_begin();
4182 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4183 jit->builder->CreateCall(coro_destroy, { handle });
4184 jit->builder->CreateRetVoid();
4185 }
4186
4187 // Begin building the main coroutine_begin() function.
4188 //
4189 // CoroutineHandle* coroutine_begin(<Arguments>)
4190 // {
4191 // YieldType promise;
4192 // auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
4193 // void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
4194 // CoroutineHandle *handle = llvm.coro.begin(id, frame);
4195 //
4196 // ... <REACTOR CODE> ...
4197 //
4198 // end:
4199 // SuspendAction action = llvm.coro.suspend(none, true /* final */); // <-- RESUME POINT
4200 // switch(action)
4201 // {
4202 // case SuspendActionResume:
4203 // UNREACHABLE(); // Illegal to resume after final suspend.
4204 // case SuspendActionDestroy:
4205 // goto destroy;
4206 // default: // (SuspendActionSuspend)
4207 // goto suspend;
4208 // }
4209 //
4210 // destroy:
4211 // coroutine_free_frame(llvm.coro.free(id, handle));
4212 // goto suspend;
4213 //
4214 // suspend:
4215 // llvm.coro.end(handle, false);
4216 // return handle;
4217 // }
4218 //
4219
4220 #ifdef ENABLE_RR_DEBUG_INFO
4221 jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
4222 #endif // ENABLE_RR_DEBUG_INFO
4223
4224 jit->coroutine.suspendBlock = llvm::BasicBlock::Create(*jit->context, "suspend", jit->function);
4225 jit->coroutine.endBlock = llvm::BasicBlock::Create(*jit->context, "end", jit->function);
4226 jit->coroutine.destroyBlock = llvm::BasicBlock::Create(*jit->context, "destroy", jit->function);
4227
4228 jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
4229 jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
4230 jit->coroutine.id = jit->builder->CreateCall(coro_id, {
4231 llvm::ConstantInt::get(i32Ty, 0),
4232 jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
4233 llvm::ConstantPointerNull::get(i8PtrTy),
4234 llvm::ConstantPointerNull::get(i8PtrTy),
4235 });
4236 auto size = jit->builder->CreateCall(coro_size, {});
4237 auto frame = jit->builder->CreateCall(allocFrame, { size });
4238 jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });
4239
4240 // Build the suspend block
4241 jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
4242 jit->builder->CreateCall(coro_end, { jit->coroutine.handle, llvm::ConstantInt::get(i1Ty, 0) });
4243 jit->builder->CreateRet(jit->coroutine.handle);
4244
4245 // Build the end block
4246 jit->builder->SetInsertPoint(jit->coroutine.endBlock);
4247 auto action = jit->builder->CreateCall(coro_suspend, {
4248 llvm::ConstantTokenNone::get(*jit->context),
4249 llvm::ConstantInt::get(i1Ty, 1), // final: true
4250 });
4251 auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4252 // switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
4253 switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4254
4255 // Build the destroy block
4256 jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
4257 auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
4258 jit->builder->CreateCall(freeFrame, { memory });
4259 jit->builder->CreateBr(jit->coroutine.suspendBlock);
4260
4261 // Switch back to original insert point to continue building the coroutine.
4262 jit->builder->restoreIP(oldInsertionPoint);
4263 }
4264
4265 } // anonymous namespace
4266
4267 namespace rr {
4268
createCoroutine(Type * YieldType,const std::vector<Type * > & Params)4269 void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
4270 {
4271 // Coroutines are initially created as a regular function.
4272 // Upon the first call to Yield(), the function is promoted to a true
4273 // coroutine.
4274 auto voidTy = llvm::Type::getVoidTy(*jit->context);
4275 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4276 auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4277 auto handleTy = i8PtrTy;
4278 auto boolTy = i1Ty;
4279 auto promiseTy = T(YieldType);
4280 auto promisePtrTy = promiseTy->getPointerTo();
4281
4282 jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
4283 jit->function->addFnAttr("coroutine.presplit", "0");
4284 jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
4285 jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
4286 jit->coroutine.yieldType = promiseTy;
4287 jit->coroutine.entryBlock = llvm::BasicBlock::Create(*jit->context, "function", jit->function);
4288
4289 jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
4290 }
4291
yield(Value * val)4292 void Nucleus::yield(Value *val)
4293 {
4294 if(jit->coroutine.id == nullptr)
4295 {
4296 // First call to yield().
4297 // Promote the function to a full coroutine.
4298 promoteFunctionToCoroutine();
4299 ASSERT(jit->coroutine.id != nullptr);
4300 }
4301
4302 // promise = val;
4303 //
4304 // auto action = llvm.coro.suspend(none, false /* final */); // <-- RESUME POINT
4305 // switch(action)
4306 // {
4307 // case SuspendActionResume:
4308 // goto resume;
4309 // case SuspendActionDestroy:
4310 // goto destroy;
4311 // default: // (SuspendActionSuspend)
4312 // goto suspend;
4313 // }
4314 // resume:
4315 //
4316
4317 RR_DEBUG_INFO_UPDATE_LOC();
4318 Variable::materializeAll();
4319
4320 // Types
4321 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4322 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4323
4324 // Intrinsics
4325 auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4326
4327 // Create a block to resume execution.
4328 auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->function);
4329
4330 // Store the promise (yield value)
4331 jit->builder->CreateStore(V(val), jit->coroutine.promise);
4332 auto action = jit->builder->CreateCall(coro_suspend, {
4333 llvm::ConstantTokenNone::get(*jit->context),
4334 	    llvm::ConstantInt::get(i1Ty, 0),  // final: false
4335 });
4336 auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4337 switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
4338 switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4339
4340 // Continue building in the resume block.
4341 jit->builder->SetInsertPoint(resumeBlock);
4342 }
4343
acquireCoroutine(const char * name,const Config::Edit * cfgEdit)4344 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit *cfgEdit /* = nullptr */)
4345 {
4346 if(jit->coroutine.id)
4347 {
4348 jit->builder->CreateBr(jit->coroutine.endBlock);
4349 }
4350 else
4351 {
4352 		// A coroutine without a Yield acts as a regular function.
4353 		// The 'coroutine_begin' function returns nullptr for the coroutine
4354 		// handle.
4355 jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4356 // The 'coroutine_await' function always returns false (coroutine done).
4357 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.await));
4358 jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
4359 		// The 'coroutine_destroy' function does nothing and returns void.
4360 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4361 jit->builder->CreateRetVoid();
4362 }
4363
4364 #ifdef ENABLE_RR_DEBUG_INFO
4365 if(jit->debugInfo != nullptr)
4366 {
4367 jit->debugInfo->Finalize();
4368 }
4369 #endif // ENABLE_RR_DEBUG_INFO
4370
4371 if(false)
4372 {
4373 std::error_code error;
4374 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4375 jit->module->print(file, 0);
4376 }
4377
4378 Config cfg = jit->config;
4379 if(cfgEdit)
4380 {
4381 cfg = cfgEdit->apply(jit->config);
4382 }
4383 jit->runPasses(cfg);
4384
4385 if(false)
4386 {
4387 std::error_code error;
4388 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4389 jit->module->print(file, 0);
4390 }
4391
4392 llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4393 funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4394 funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4395 funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4396
4397 auto routine = jit->acquireRoutine(name, funcs, Nucleus::CoroutineEntryCount, cfg);
4398
4399 delete jit;
4400 jit = nullptr;
4401
4402 return routine;
4403 }
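
// Caller-side usage sketch, based on the coroutine_await()/coroutine_destroy() pseudocode above
// (names and types are illustrative; the actual dispatch is done by Reactor's coroutine wrapper):
//
//   CoroutineHandle handle = coroutine_begin(args...);
//   YieldType value;
//   while(coroutine_await(handle, &value))
//   {
//       // consume 'value'
//   }
//   coroutine_destroy(handle);
//
// For a routine that never called Yield(), coroutine_begin() returns nullptr and coroutine_await()
// immediately returns false, so the loop body never executes.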
4404
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4405 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4406 {
4407 return func();
4408 }
4409
4410 } // namespace rr
4411