1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "LLVMReactor.hpp"
16
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "EmulatedIntrinsics.hpp"
20 #include "LLVMReactorDebugInfo.hpp"
21 #include "Print.hpp"
22 #include "Reactor.hpp"
23 #include "x86.hpp"
24
25 #include "llvm/IR/Intrinsics.h"
26 #include "llvm/IR/IntrinsicsX86.h"
27 #include "llvm/IR/LegacyPassManager.h"
28 #include "llvm/IR/Verifier.h"
29 #include "llvm/Support/Alignment.h"
30 #include "llvm/Support/ManagedStatic.h"
31 #include "llvm/Transforms/Coroutines.h"
32 #include "llvm/Transforms/IPO.h"
33 #include "llvm/Transforms/Scalar.h"
34
35 #include <fstream>
36 #include <iostream>
37 #include <mutex>
38 #include <numeric>
39 #include <thread>
40 #include <unordered_map>
41
42 #if defined(__i386__) || defined(__x86_64__)
43 # include <xmmintrin.h>
44 #endif
45
46 #include <math.h>
47
48 #if defined(__x86_64__) && defined(_WIN32)
X86CompilationCallback()49 extern "C" void X86CompilationCallback()
50 {
51 UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
52 }
53 #endif
54
55 #if !LLVM_ENABLE_THREADS
56 # error "LLVM_ENABLE_THREADS needs to be enabled"
57 #endif
58
59 #if LLVM_VERSION_MAJOR < 11
60 namespace llvm {
61 using FixedVectorType = VectorType;
62 } // namespace llvm
63 #endif
64
65 namespace {
66
67 // Used to automatically invoke llvm_shutdown() when driver is unloaded
68 llvm::llvm_shutdown_obj llvmShutdownObj;
69
70 // This has to be a raw pointer because glibc 2.17 doesn't support __cxa_thread_atexit_impl
71 // for destructing objects at exit. See crbug.com/1074222
72 thread_local rr::JITBuilder *jit = nullptr;
73
74 // Default configuration settings. Must be accessed under mutex lock.
75 std::mutex defaultConfigLock;
defaultConfig()76 rr::Config &defaultConfig()
77 {
78 // This uses a static in a function to avoid the cost of a global static
79 // initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
80 static rr::Config config = rr::Config::Edit()
81 .add(rr::Optimization::Pass::ScalarReplAggregates)
82 .add(rr::Optimization::Pass::InstructionCombining)
83 .apply({});
84 return config;
85 }
86
lowerPAVG(llvm::Value * x,llvm::Value * y)87 llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
88 {
89 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
90
91 llvm::VectorType *extTy =
92 llvm::VectorType::getExtendedElementVectorType(ty);
93 x = jit->builder->CreateZExt(x, extTy);
94 y = jit->builder->CreateZExt(y, extTy);
95
96 // (x + y + 1) >> 1
97 llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
98 llvm::Value *res = jit->builder->CreateAdd(x, y);
99 res = jit->builder->CreateAdd(res, one);
100 res = jit->builder->CreateLShr(res, one);
101 return jit->builder->CreateTrunc(res, ty);
102 }
103
lowerPMINMAX(llvm::Value * x,llvm::Value * y,llvm::ICmpInst::Predicate pred)104 llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
105 llvm::ICmpInst::Predicate pred)
106 {
107 return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
108 }
109
lowerPCMP(llvm::ICmpInst::Predicate pred,llvm::Value * x,llvm::Value * y,llvm::Type * dstTy)110 llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
111 llvm::Value *y, llvm::Type *dstTy)
112 {
113 return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
114 }
115
116 #if defined(__i386__) || defined(__x86_64__)
lowerPMOV(llvm::Value * op,llvm::Type * dstType,bool sext)117 llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
118 {
119 llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
120 llvm::FixedVectorType *dstTy = llvm::cast<llvm::FixedVectorType>(dstType);
121
122 llvm::Value *undef = llvm::UndefValue::get(srcTy);
123 llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
124 std::iota(mask.begin(), mask.end(), 0);
125 llvm::Value *v = jit->builder->CreateShuffleVector(op, undef, mask);
126
127 return sext ? jit->builder->CreateSExt(v, dstTy)
128 : jit->builder->CreateZExt(v, dstTy);
129 }
130
lowerPABS(llvm::Value * v)131 llvm::Value *lowerPABS(llvm::Value *v)
132 {
133 llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
134 llvm::Value *cmp = jit->builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero);
135 llvm::Value *neg = jit->builder->CreateNeg(v);
136 return jit->builder->CreateSelect(cmp, v, neg);
137 }
138 #endif // defined(__i386__) || defined(__x86_64__)
139
140 #if !defined(__i386__) && !defined(__x86_64__)
lowerPFMINMAX(llvm::Value * x,llvm::Value * y,llvm::FCmpInst::Predicate pred)141 llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
142 llvm::FCmpInst::Predicate pred)
143 {
144 return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
145 }
146
lowerRound(llvm::Value * x)147 llvm::Value *lowerRound(llvm::Value *x)
148 {
149 llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
150 jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
151 return jit->builder->CreateCall(nearbyint, { x });
152 }
153
lowerRoundInt(llvm::Value * x,llvm::Type * ty)154 llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
155 {
156 return jit->builder->CreateFPToSI(lowerRound(x), ty);
157 }
158
lowerFloor(llvm::Value * x)159 llvm::Value *lowerFloor(llvm::Value *x)
160 {
161 llvm::Function *floor = llvm::Intrinsic::getDeclaration(
162 jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
163 return jit->builder->CreateCall(floor, { x });
164 }
165
lowerTrunc(llvm::Value * x)166 llvm::Value *lowerTrunc(llvm::Value *x)
167 {
168 llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
169 jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
170 return jit->builder->CreateCall(trunc, { x });
171 }
172
lowerSQRT(llvm::Value * x)173 llvm::Value *lowerSQRT(llvm::Value *x)
174 {
175 llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
176 jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
177 return jit->builder->CreateCall(sqrt, { x });
178 }
179
lowerRCP(llvm::Value * x)180 llvm::Value *lowerRCP(llvm::Value *x)
181 {
182 llvm::Type *ty = x->getType();
183 llvm::Constant *one;
184 if(llvm::FixedVectorType *vectorTy = llvm::dyn_cast<llvm::FixedVectorType>(ty))
185 {
186 one = llvm::ConstantVector::getSplat(
187 vectorTy->getNumElements(),
188 llvm::ConstantFP::get(vectorTy->getElementType(), 1));
189 }
190 else
191 {
192 one = llvm::ConstantFP::get(ty, 1);
193 }
194 return jit->builder->CreateFDiv(one, x);
195 }
196
lowerRSQRT(llvm::Value * x)197 llvm::Value *lowerRSQRT(llvm::Value *x)
198 {
199 return lowerRCP(lowerSQRT(x));
200 }
201
lowerVectorShl(llvm::Value * x,uint64_t scalarY)202 llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
203 {
204 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
205 llvm::Value *y = llvm::ConstantVector::getSplat(
206 ty->getNumElements(),
207 llvm::ConstantInt::get(ty->getElementType(), scalarY));
208 return jit->builder->CreateShl(x, y);
209 }
210
lowerVectorAShr(llvm::Value * x,uint64_t scalarY)211 llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
212 {
213 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
214 llvm::Value *y = llvm::ConstantVector::getSplat(
215 ty->getNumElements(),
216 llvm::ConstantInt::get(ty->getElementType(), scalarY));
217 return jit->builder->CreateAShr(x, y);
218 }
219
lowerVectorLShr(llvm::Value * x,uint64_t scalarY)220 llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
221 {
222 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
223 llvm::Value *y = llvm::ConstantVector::getSplat(
224 ty->getNumElements(),
225 llvm::ConstantInt::get(ty->getElementType(), scalarY));
226 return jit->builder->CreateLShr(x, y);
227 }
228
lowerMulAdd(llvm::Value * x,llvm::Value * y)229 llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
230 {
231 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
232 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
233
234 llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
235 llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
236 llvm::Value *mult = jit->builder->CreateMul(extX, extY);
237
238 llvm::Value *undef = llvm::UndefValue::get(extTy);
239
240 llvm::SmallVector<uint32_t, 16> evenIdx;
241 llvm::SmallVector<uint32_t, 16> oddIdx;
242 for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
243 {
244 evenIdx.push_back(i);
245 oddIdx.push_back(i + 1);
246 }
247
248 llvm::Value *lhs = jit->builder->CreateShuffleVector(mult, undef, evenIdx);
249 llvm::Value *rhs = jit->builder->CreateShuffleVector(mult, undef, oddIdx);
250 return jit->builder->CreateAdd(lhs, rhs);
251 }
252
lowerPack(llvm::Value * x,llvm::Value * y,bool isSigned)253 llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
254 {
255 llvm::FixedVectorType *srcTy = llvm::cast<llvm::FixedVectorType>(x->getType());
256 llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
257
258 llvm::IntegerType *dstElemTy =
259 llvm::cast<llvm::IntegerType>(dstTy->getElementType());
260
261 uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
262 ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
263 llvm::Constant *max, *min;
264 if(isSigned)
265 {
266 max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
267 min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
268 }
269 else
270 {
271 max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
272 min = llvm::ConstantInt::get(srcTy, 0, false);
273 }
274
275 x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
276 x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
277 y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
278 y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
279
280 x = jit->builder->CreateTrunc(x, dstTy);
281 y = jit->builder->CreateTrunc(y, dstTy);
282
283 llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
284 std::iota(index.begin(), index.end(), 0);
285
286 return jit->builder->CreateShuffleVector(x, y, index);
287 }
288
lowerSignMask(llvm::Value * x,llvm::Type * retTy)289 llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
290 {
291 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
292 llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
293 llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
294
295 llvm::Value *ret = jit->builder->CreateZExt(
296 jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
297 for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
298 {
299 llvm::Value *elem = jit->builder->CreateZExt(
300 jit->builder->CreateExtractElement(cmp, i), retTy);
301 ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
302 }
303 return ret;
304 }
305
lowerFPSignMask(llvm::Value * x,llvm::Type * retTy)306 llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
307 {
308 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
309 llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
310 llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
311
312 llvm::Value *ret = jit->builder->CreateZExt(
313 jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
314 for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
315 {
316 llvm::Value *elem = jit->builder->CreateZExt(
317 jit->builder->CreateExtractElement(cmp, i), retTy);
318 ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
319 }
320 return ret;
321 }
322 #endif // !defined(__i386__) && !defined(__x86_64__)
323
lowerPUADDSAT(llvm::Value * x,llvm::Value * y)324 llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
325 {
326 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
327 }
328
lowerPSADDSAT(llvm::Value * x,llvm::Value * y)329 llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
330 {
331 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
332 }
333
lowerPUSUBSAT(llvm::Value * x,llvm::Value * y)334 llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
335 {
336 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
337 }
338
lowerPSSUBSAT(llvm::Value * x,llvm::Value * y)339 llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
340 {
341 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
342 }
343
lowerMulHigh(llvm::Value * x,llvm::Value * y,bool sext)344 llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
345 {
346 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
347 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
348
349 llvm::Value *extX, *extY;
350 if(sext)
351 {
352 extX = jit->builder->CreateSExt(x, extTy);
353 extY = jit->builder->CreateSExt(y, extTy);
354 }
355 else
356 {
357 extX = jit->builder->CreateZExt(x, extTy);
358 extY = jit->builder->CreateZExt(y, extTy);
359 }
360
361 llvm::Value *mult = jit->builder->CreateMul(extX, extY);
362
363 llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
364 llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
365 return jit->builder->CreateTrunc(mulh, ty);
366 }
367
368 } // namespace
369
370 namespace rr {
371
BackendName()372 std::string BackendName()
373 {
374 return std::string("LLVM ") + LLVM_VERSION_STRING;
375 }
376
377 const Capabilities Caps = {
378 true, // CoroutinesSupported
379 };
380
// Reactor's abstract Type* values are implemented as LLVM types, with one
// exception: 64-bit vectors are emulated using 128-bit ones. This avoids the
// use of MMX on x86 and VFP on ARM, and eliminates the overhead of converting
// them to explicit 128-bit vectors. Because LLVM types are pointers, the
// emulated types can be represented as abstract pointers holding the small
// enum values below.
enum InternalType : uintptr_t
{
	// Emulated types:
	Type_v2i32,
	Type_v4i16,
	Type_v2i16,
	Type_v8i8,
	Type_v4i8,
	Type_v2f32,
	EmulatedTypeCount,
	// Result of asInternalType() when the abstract Type* is not one of the
	// emulated values and must be interpreted as an LLVM type pointer:
	Type_LLVM
};
400
asInternalType(Type * type)401 inline InternalType asInternalType(Type *type)
402 {
403 InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
404 return (t < EmulatedTypeCount) ? t : Type_LLVM;
405 }
406
T(Type * t)407 llvm::Type *T(Type *t)
408 {
409 // Use 128-bit vectors to implement logically shorter ones.
410 switch(asInternalType(t))
411 {
412 case Type_v2i32: return T(Int4::type());
413 case Type_v4i16: return T(Short8::type());
414 case Type_v2i16: return T(Short8::type());
415 case Type_v8i8: return T(Byte16::type());
416 case Type_v4i8: return T(Byte16::type());
417 case Type_v2f32: return T(Float4::type());
418 case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
419 default:
420 UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
421 return nullptr;
422 }
423 }
424
T(InternalType t)425 Type *T(InternalType t)
426 {
427 return reinterpret_cast<Type *>(t);
428 }
429
T(const std::vector<Type * > & t)430 inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
431 {
432 return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
433 }
434
B(BasicBlock * t)435 inline llvm::BasicBlock *B(BasicBlock *t)
436 {
437 return reinterpret_cast<llvm::BasicBlock *>(t);
438 }
439
B(llvm::BasicBlock * t)440 inline BasicBlock *B(llvm::BasicBlock *t)
441 {
442 return reinterpret_cast<BasicBlock *>(t);
443 }
444
typeSize(Type * type)445 static size_t typeSize(Type *type)
446 {
447 switch(asInternalType(type))
448 {
449 case Type_v2i32: return 8;
450 case Type_v4i16: return 8;
451 case Type_v2i16: return 4;
452 case Type_v8i8: return 8;
453 case Type_v4i8: return 4;
454 case Type_v2f32: return 8;
455 case Type_LLVM:
456 {
457 llvm::Type *t = T(type);
458
459 if(t->isPointerTy())
460 {
461 return sizeof(void *);
462 }
463
464 // At this point we should only have LLVM 'primitive' types.
465 unsigned int bits = t->getPrimitiveSizeInBits();
466 ASSERT_MSG(bits != 0, "bits: %d", int(bits));
467
468 // TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
469 // but are typically stored as one byte. The DataLayout structure should
470 // be used here and many other places if this assumption fails.
471 return (bits + 7) / 8;
472 }
473 break;
474 default:
475 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
476 return 0;
477 }
478 }
479
elementCount(Type * type)480 static unsigned int elementCount(Type *type)
481 {
482 switch(asInternalType(type))
483 {
484 case Type_v2i32: return 2;
485 case Type_v4i16: return 4;
486 case Type_v2i16: return 2;
487 case Type_v8i8: return 8;
488 case Type_v4i8: return 4;
489 case Type_v2f32: return 2;
490 case Type_LLVM: return llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();
491 default:
492 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
493 return 0;
494 }
495 }
496
createFunction(const char * name,llvm::Type * retTy,const std::vector<llvm::Type * > & params)497 static llvm::Function *createFunction(const char *name, llvm::Type *retTy, const std::vector<llvm::Type *> ¶ms)
498 {
499 llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
500 auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
501
502 func->setLinkage(llvm::GlobalValue::ExternalLinkage);
503 func->setDoesNotThrow();
504 func->setCallingConv(llvm::CallingConv::C);
505
506 if(__has_feature(memory_sanitizer))
507 {
508 func->addFnAttr(llvm::Attribute::SanitizeMemory);
509 }
510
511 return func;
512 }
513
Nucleus()514 Nucleus::Nucleus()
515 {
516 #if !__has_feature(memory_sanitizer)
517 // thread_local variables in shared libraries are initialized at load-time,
518 // but this is not observed by MemorySanitizer if the loader itself was not
519 // instrumented, leading to false-positive unitialized variable errors.
520 ASSERT(jit == nullptr);
521 ASSERT(Variable::unmaterializedVariables == nullptr);
522 #endif
523
524 jit = new JITBuilder(Nucleus::getDefaultConfig());
525 Variable::unmaterializedVariables = new Variable::UnmaterializedVariables{};
526 }
527
~Nucleus()528 Nucleus::~Nucleus()
529 {
530 delete Variable::unmaterializedVariables;
531 Variable::unmaterializedVariables = nullptr;
532
533 delete jit;
534 jit = nullptr;
535 }
536
setDefaultConfig(const Config & cfg)537 void Nucleus::setDefaultConfig(const Config &cfg)
538 {
539 std::unique_lock<std::mutex> lock(::defaultConfigLock);
540 ::defaultConfig() = cfg;
541 }
542
adjustDefaultConfig(const Config::Edit & cfgEdit)543 void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
544 {
545 std::unique_lock<std::mutex> lock(::defaultConfigLock);
546 auto &config = ::defaultConfig();
547 config = cfgEdit.apply(config);
548 }
549
getDefaultConfig()550 Config Nucleus::getDefaultConfig()
551 {
552 std::unique_lock<std::mutex> lock(::defaultConfigLock);
553 return ::defaultConfig();
554 }
555
acquireRoutine(const char * name,const Config::Edit & cfgEdit)556 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
557 {
558 if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
559 {
560 llvm::Type *type = jit->function->getReturnType();
561
562 if(type->isVoidTy())
563 {
564 createRetVoid();
565 }
566 else
567 {
568 createRet(V(llvm::UndefValue::get(type)));
569 }
570 }
571
572 std::shared_ptr<Routine> routine;
573
574 auto acquire = [&](rr::JITBuilder *jit) {
575 // ::jit is thread-local, so when this is executed on a separate thread (see JIT_IN_SEPARATE_THREAD)
576 // it needs to only use the jit variable passed in as an argument.
577
578 auto cfg = cfgEdit.apply(jit->config);
579
580 #ifdef ENABLE_RR_DEBUG_INFO
581 if(jit->debugInfo != nullptr)
582 {
583 jit->debugInfo->Finalize();
584 }
585 #endif // ENABLE_RR_DEBUG_INFO
586
587 if(false)
588 {
589 std::error_code error;
590 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
591 jit->module->print(file, 0);
592 }
593
594 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
595 {
596 llvm::legacy::PassManager pm;
597 pm.add(llvm::createVerifierPass());
598 pm.run(*jit->module);
599 }
600 #endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
601
602 jit->optimize(cfg);
603
604 if(false)
605 {
606 std::error_code error;
607 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
608 jit->module->print(file, 0);
609 }
610
611 routine = jit->acquireRoutine(name, &jit->function, 1, cfg);
612 };
613
614 #ifdef JIT_IN_SEPARATE_THREAD
615 // Perform optimizations and codegen in a separate thread to avoid stack overflow.
616 // FIXME(b/149829034): This is not a long-term solution. Reactor has no control
617 // over the threading and stack sizes of its users, so this should be addressed
618 // at a higher level instead.
619 std::thread thread(acquire, jit);
620 thread.join();
621 #else
622 acquire(jit);
623 #endif
624
625 return routine;
626 }
627
allocateStackVariable(Type * type,int arraySize)628 Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
629 {
630 // Need to allocate it in the entry block for mem2reg to work
631 llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
632
633 llvm::Instruction *declaration;
634
635 #if LLVM_VERSION_MAJOR >= 11
636 auto align = jit->module->getDataLayout().getPrefTypeAlign(T(type));
637 #else
638 auto align = llvm::MaybeAlign(jit->module->getDataLayout().getPrefTypeAlignment(T(type)));
639 #endif
640
641 if(arraySize)
642 {
643 Value *size = (sizeof(size_t) == 8) ? Nucleus::createConstantLong(arraySize) : Nucleus::createConstantInt(arraySize);
644 declaration = new llvm::AllocaInst(T(type), 0, V(size), align);
645 }
646 else
647 {
648 declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr, align);
649 }
650
651 entryBlock.getInstList().push_front(declaration);
652
653 return V(declaration);
654 }
655
createBasicBlock()656 BasicBlock *Nucleus::createBasicBlock()
657 {
658 return B(llvm::BasicBlock::Create(*jit->context, "", jit->function));
659 }
660
getInsertBlock()661 BasicBlock *Nucleus::getInsertBlock()
662 {
663 return B(jit->builder->GetInsertBlock());
664 }
665
setInsertBlock(BasicBlock * basicBlock)666 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
667 {
668 // assert(jit->builder->GetInsertBlock()->back().isTerminator());
669
670 jit->builder->SetInsertPoint(B(basicBlock));
671 }
672
createFunction(Type * ReturnType,const std::vector<Type * > & Params)673 void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
674 {
675 jit->function = rr::createFunction("", T(ReturnType), T(Params));
676
677 #ifdef ENABLE_RR_DEBUG_INFO
678 jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
679 #endif // ENABLE_RR_DEBUG_INFO
680
681 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->function));
682 }
683
getArgument(unsigned int index)684 Value *Nucleus::getArgument(unsigned int index)
685 {
686 llvm::Function::arg_iterator args = jit->function->arg_begin();
687
688 while(index)
689 {
690 args++;
691 index--;
692 }
693
694 return V(&*args);
695 }
696
createRetVoid()697 void Nucleus::createRetVoid()
698 {
699 RR_DEBUG_INFO_UPDATE_LOC();
700
701 ASSERT_MSG(jit->function->getReturnType() == T(Void::type()), "Return type mismatch");
702
703 // Code generated after this point is unreachable, so any variables
704 // being read can safely return an undefined value. We have to avoid
705 // materializing variables after the terminator ret instruction.
706 Variable::killUnmaterialized();
707
708 jit->builder->CreateRetVoid();
709 }
710
createRet(Value * v)711 void Nucleus::createRet(Value *v)
712 {
713 RR_DEBUG_INFO_UPDATE_LOC();
714
715 ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
716
717 // Code generated after this point is unreachable, so any variables
718 // being read can safely return an undefined value. We have to avoid
719 // materializing variables after the terminator ret instruction.
720 Variable::killUnmaterialized();
721
722 jit->builder->CreateRet(V(v));
723 }
724
createBr(BasicBlock * dest)725 void Nucleus::createBr(BasicBlock *dest)
726 {
727 RR_DEBUG_INFO_UPDATE_LOC();
728 Variable::materializeAll();
729
730 jit->builder->CreateBr(B(dest));
731 }
732
createCondBr(Value * cond,BasicBlock * ifTrue,BasicBlock * ifFalse)733 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
734 {
735 RR_DEBUG_INFO_UPDATE_LOC();
736 Variable::materializeAll();
737 jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
738 }
739
createAdd(Value * lhs,Value * rhs)740 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
741 {
742 RR_DEBUG_INFO_UPDATE_LOC();
743 return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
744 }
745
createSub(Value * lhs,Value * rhs)746 Value *Nucleus::createSub(Value *lhs, Value *rhs)
747 {
748 RR_DEBUG_INFO_UPDATE_LOC();
749 return V(jit->builder->CreateSub(V(lhs), V(rhs)));
750 }
751
createMul(Value * lhs,Value * rhs)752 Value *Nucleus::createMul(Value *lhs, Value *rhs)
753 {
754 RR_DEBUG_INFO_UPDATE_LOC();
755 return V(jit->builder->CreateMul(V(lhs), V(rhs)));
756 }
757
createUDiv(Value * lhs,Value * rhs)758 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
759 {
760 RR_DEBUG_INFO_UPDATE_LOC();
761 return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
762 }
763
createSDiv(Value * lhs,Value * rhs)764 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
765 {
766 RR_DEBUG_INFO_UPDATE_LOC();
767 return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
768 }
769
createFAdd(Value * lhs,Value * rhs)770 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
771 {
772 RR_DEBUG_INFO_UPDATE_LOC();
773 return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
774 }
775
createFSub(Value * lhs,Value * rhs)776 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
777 {
778 RR_DEBUG_INFO_UPDATE_LOC();
779 return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
780 }
781
createFMul(Value * lhs,Value * rhs)782 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
783 {
784 RR_DEBUG_INFO_UPDATE_LOC();
785 return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
786 }
787
createFDiv(Value * lhs,Value * rhs)788 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
789 {
790 RR_DEBUG_INFO_UPDATE_LOC();
791 return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
792 }
793
createURem(Value * lhs,Value * rhs)794 Value *Nucleus::createURem(Value *lhs, Value *rhs)
795 {
796 RR_DEBUG_INFO_UPDATE_LOC();
797 return V(jit->builder->CreateURem(V(lhs), V(rhs)));
798 }
799
createSRem(Value * lhs,Value * rhs)800 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
801 {
802 RR_DEBUG_INFO_UPDATE_LOC();
803 return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
804 }
805
createFRem(Value * lhs,Value * rhs)806 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
807 {
808 RR_DEBUG_INFO_UPDATE_LOC();
809 return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
810 }
811
operator %(RValue<Float4> lhs,RValue<Float4> rhs)812 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
813 {
814 return RValue<Float4>(Nucleus::createFRem(lhs.value(), rhs.value()));
815 }
816
createShl(Value * lhs,Value * rhs)817 Value *Nucleus::createShl(Value *lhs, Value *rhs)
818 {
819 RR_DEBUG_INFO_UPDATE_LOC();
820 return V(jit->builder->CreateShl(V(lhs), V(rhs)));
821 }
822
createLShr(Value * lhs,Value * rhs)823 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
824 {
825 RR_DEBUG_INFO_UPDATE_LOC();
826 return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
827 }
828
createAShr(Value * lhs,Value * rhs)829 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
830 {
831 RR_DEBUG_INFO_UPDATE_LOC();
832 return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
833 }
834
createAnd(Value * lhs,Value * rhs)835 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
836 {
837 RR_DEBUG_INFO_UPDATE_LOC();
838 return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
839 }
840
createOr(Value * lhs,Value * rhs)841 Value *Nucleus::createOr(Value *lhs, Value *rhs)
842 {
843 RR_DEBUG_INFO_UPDATE_LOC();
844 return V(jit->builder->CreateOr(V(lhs), V(rhs)));
845 }
846
createXor(Value * lhs,Value * rhs)847 Value *Nucleus::createXor(Value *lhs, Value *rhs)
848 {
849 RR_DEBUG_INFO_UPDATE_LOC();
850 return V(jit->builder->CreateXor(V(lhs), V(rhs)));
851 }
852
createNeg(Value * v)853 Value *Nucleus::createNeg(Value *v)
854 {
855 RR_DEBUG_INFO_UPDATE_LOC();
856 return V(jit->builder->CreateNeg(V(v)));
857 }
858
createFNeg(Value * v)859 Value *Nucleus::createFNeg(Value *v)
860 {
861 RR_DEBUG_INFO_UPDATE_LOC();
862 return V(jit->builder->CreateFNeg(V(v)));
863 }
864
createNot(Value * v)865 Value *Nucleus::createNot(Value *v)
866 {
867 RR_DEBUG_INFO_UPDATE_LOC();
868 return V(jit->builder->CreateNot(V(v)));
869 }
870
createLoad(Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)871 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
872 {
873 RR_DEBUG_INFO_UPDATE_LOC();
874 switch(asInternalType(type))
875 {
876 case Type_v2i32:
877 case Type_v4i16:
878 case Type_v8i8:
879 case Type_v2f32:
880 return createBitCast(
881 createInsertElement(
882 V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false))),
883 createLoad(createBitCast(ptr, Pointer<Long>::type()), Long::type(), isVolatile, alignment, atomic, memoryOrder),
884 0),
885 type);
886 case Type_v2i16:
887 case Type_v4i8:
888 if(alignment != 0) // Not a local variable (all vectors are 128-bit).
889 {
890 Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false)));
891 Value *i = createLoad(createBitCast(ptr, Pointer<Int>::type()), Int::type(), isVolatile, alignment, atomic, memoryOrder);
892 i = createZExt(i, Long::type());
893 Value *v = createInsertElement(u, i, 0);
894 return createBitCast(v, type);
895 }
896 // Fallthrough to non-emulated case.
897 case Type_LLVM:
898 {
899 auto elTy = T(type);
900 ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
901
902 if(!atomic)
903 {
904 return V(jit->builder->CreateAlignedLoad(V(ptr), llvm::MaybeAlign(alignment), isVolatile));
905 }
906 else if(elTy->isIntegerTy() || elTy->isPointerTy())
907 {
908 // Integers and pointers can be atomically loaded by setting
909 // the ordering constraint on the load instruction.
910 auto load = jit->builder->CreateAlignedLoad(V(ptr), llvm::MaybeAlign(alignment), isVolatile);
911 load->setAtomic(atomicOrdering(atomic, memoryOrder));
912 return V(load);
913 }
914 else if(elTy->isFloatTy() || elTy->isDoubleTy())
915 {
916 // LLVM claims to support atomic loads of float types as
917 // above, but certain backends cannot deal with this.
918 // Load as an integer and bitcast. See b/136037244.
919 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
920 auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
921 auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
922 auto load = jit->builder->CreateAlignedLoad(ptrCast, llvm::MaybeAlign(alignment), isVolatile);
923 load->setAtomic(atomicOrdering(atomic, memoryOrder));
924 auto loadCast = jit->builder->CreateBitCast(load, elTy);
925 return V(loadCast);
926 }
927 else
928 {
929 // More exotic types require falling back to the extern:
930 // void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
931 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
932 auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
933 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
934 auto i8PtrTy = i8Ty->getPointerTo();
935 auto voidTy = llvm::Type::getVoidTy(*jit->context);
936 auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
937 auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
938 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
939 auto out = allocateStackVariable(type);
940 jit->builder->CreateCall(func, {
941 llvm::ConstantInt::get(sizetTy, size),
942 jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
943 jit->builder->CreatePointerCast(V(out), i8PtrTy),
944 llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
945 });
946 return V(jit->builder->CreateLoad(V(out)));
947 }
948 }
949 default:
950 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
951 return nullptr;
952 }
953 }
954
createStore(Value * value,Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)955 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
956 {
957 RR_DEBUG_INFO_UPDATE_LOC();
958 switch(asInternalType(type))
959 {
960 case Type_v2i32:
961 case Type_v4i16:
962 case Type_v8i8:
963 case Type_v2f32:
964 createStore(
965 createExtractElement(
966 createBitCast(value, T(llvm::VectorType::get(T(Long::type()), 2, false))), Long::type(), 0),
967 createBitCast(ptr, Pointer<Long>::type()),
968 Long::type(), isVolatile, alignment, atomic, memoryOrder);
969 return value;
970 case Type_v2i16:
971 case Type_v4i8:
972 if(alignment != 0) // Not a local variable (all vectors are 128-bit).
973 {
974 createStore(
975 createExtractElement(createBitCast(value, Int4::type()), Int::type(), 0),
976 createBitCast(ptr, Pointer<Int>::type()),
977 Int::type(), isVolatile, alignment, atomic, memoryOrder);
978 return value;
979 }
980 // Fallthrough to non-emulated case.
981 case Type_LLVM:
982 {
983 auto elTy = T(type);
984 ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
985
986 if(__has_feature(memory_sanitizer) && !REACTOR_ENABLE_MEMORY_SANITIZER_INSTRUMENTATION)
987 {
988 // Mark all memory writes as initialized by calling __msan_unpoison
989 // void __msan_unpoison(const volatile void *a, size_t size)
990 auto voidTy = llvm::Type::getVoidTy(*jit->context);
991 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
992 auto voidPtrTy = i8Ty->getPointerTo();
993 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
994 auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
995 auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
996 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
997
998 jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
999 llvm::ConstantInt::get(sizetTy, size) });
1000 }
1001
1002 if(!atomic)
1003 {
1004 jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1005 }
1006 else if(elTy->isIntegerTy() || elTy->isPointerTy())
1007 {
1008 // Integers and pointers can be atomically stored by setting
1009 // the ordering constraint on the store instruction.
1010 auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1011 store->setAtomic(atomicOrdering(atomic, memoryOrder));
1012 }
1013 else if(elTy->isFloatTy() || elTy->isDoubleTy())
1014 {
1015 // LLVM claims to support atomic stores of float types as
1016 // above, but certain backends cannot deal with this.
1017 // Store as an bitcast integer. See b/136037244.
1018 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1019 auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
1020 auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
1021 auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
1022 auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
1023 store->setAtomic(atomicOrdering(atomic, memoryOrder));
1024 }
1025 else
1026 {
1027 // More exotic types require falling back to the extern:
1028 // void __atomic_store(size_t size, void *ptr, void *val, int ordering)
1029 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1030 auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
1031 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1032 auto i8PtrTy = i8Ty->getPointerTo();
1033 auto voidTy = llvm::Type::getVoidTy(*jit->context);
1034 auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1035 auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
1036 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1037 auto copy = allocateStackVariable(type);
1038 jit->builder->CreateStore(V(value), V(copy));
1039 jit->builder->CreateCall(func, {
1040 llvm::ConstantInt::get(sizetTy, size),
1041 jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1042 jit->builder->CreatePointerCast(V(copy), i8PtrTy),
1043 llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1044 });
1045 }
1046
1047 return value;
1048 }
1049 default:
1050 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1051 return nullptr;
1052 }
1053 }
1054
createMaskedLoad(Value * ptr,Type * elTy,Value * mask,unsigned int alignment,bool zeroMaskedLanes)1055 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1056 {
1057 RR_DEBUG_INFO_UPDATE_LOC();
1058
1059 ASSERT(V(ptr)->getType()->isPointerTy());
1060 ASSERT(V(mask)->getType()->isVectorTy());
1061
1062 auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1063 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1064 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1065 auto elVecTy = llvm::VectorType::get(T(elTy), numEls, false);
1066 auto elVecPtrTy = elVecTy->getPointerTo();
1067 auto i8Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1068 auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1069 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1070 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1071 return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1072 }
1073
createMaskedStore(Value * ptr,Value * val,Value * mask,unsigned int alignment)1074 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
1075 {
1076 RR_DEBUG_INFO_UPDATE_LOC();
1077
1078 ASSERT(V(ptr)->getType()->isPointerTy());
1079 ASSERT(V(val)->getType()->isVectorTy());
1080 ASSERT(V(mask)->getType()->isVectorTy());
1081
1082 auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1083 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1084 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1085 auto elVecTy = V(val)->getType();
1086 auto elVecPtrTy = elVecTy->getPointerTo();
1087 auto i1Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1088 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1089 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
1090 jit->builder->CreateCall(func, { V(val), V(ptr), align, i1Mask });
1091
1092 if(__has_feature(memory_sanitizer) && !REACTOR_ENABLE_MEMORY_SANITIZER_INSTRUMENTATION)
1093 {
1094 // Mark memory writes as initialized by calling __msan_unpoison
1095 // void __msan_unpoison(const volatile void *a, size_t size)
1096 auto voidTy = llvm::Type::getVoidTy(*jit->context);
1097 auto voidPtrTy = voidTy->getPointerTo();
1098 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1099 auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1100 auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1101 auto size = jit->module->getDataLayout().getTypeStoreSize(llvm::cast<llvm::VectorType>(elVecTy)->getElementType());
1102
1103 for(unsigned i = 0; i < numEls; i++)
1104 {
1105 // Check mask for this element
1106 auto idx = llvm::ConstantInt::get(i32Ty, i);
1107 auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1108 auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1109 jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1110 jit->builder->SetInsertPoint(thenBlock);
1111
1112 // Insert __msan_unpoison call in conditional block
1113 auto elPtr = jit->builder->CreateGEP(V(ptr), idx);
1114 jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(elPtr, voidPtrTy),
1115 llvm::ConstantInt::get(sizetTy, size) });
1116
1117 jit->builder->CreateBr(mergeBlock);
1118 jit->builder->SetInsertPoint(mergeBlock);
1119 }
1120 }
1121 }
1122
createGather(llvm::Value * base,llvm::Type * elTy,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment,bool zeroMaskedLanes)1123 static llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1124 {
1125 ASSERT(base->getType()->isPointerTy());
1126 ASSERT(offsets->getType()->isVectorTy());
1127 ASSERT(mask->getType()->isVectorTy());
1128
1129 auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1130 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1131 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1132 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1133 auto i8PtrTy = i8Ty->getPointerTo();
1134 auto elPtrTy = elTy->getPointerTo();
1135 auto elVecTy = llvm::VectorType::get(elTy, numEls, false);
1136 auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1137 auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1138 auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
1139 auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1140 auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1141 auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1142
1143 if(!__has_feature(memory_sanitizer))
1144 {
1145 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1146 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
1147 return jit->builder->CreateCall(func, { elPtrs, align, i1Mask, passthrough });
1148 }
1149 else // __has_feature(memory_sanitizer)
1150 {
1151 // MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_gather
1152 // Work around it by emulating gather with element-wise loads.
1153 // TODO(b/172238865): Remove when supported by MemorySanitizer.
1154
1155 Value *result = Nucleus::allocateStackVariable(T(elVecTy));
1156 Nucleus::createStore(V(passthrough), result, T(elVecTy));
1157
1158 for(unsigned i = 0; i < numEls; i++)
1159 {
1160 // Check mask for this element
1161 Value *elementMask = Nucleus::createExtractElement(V(i1Mask), T(i1Ty), i);
1162
1163 If(RValue<Bool>(elementMask))
1164 {
1165 Value *elPtr = Nucleus::createExtractElement(V(elPtrs), T(elPtrTy), i);
1166 Value *el = Nucleus::createLoad(elPtr, T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1167
1168 Value *v = Nucleus::createLoad(result, T(elVecTy));
1169 v = Nucleus::createInsertElement(v, el, i);
1170 Nucleus::createStore(v, result, T(elVecTy));
1171 }
1172 }
1173
1174 return V(Nucleus::createLoad(result, T(elVecTy)));
1175 }
1176 }
1177
Gather(RValue<Pointer<Float>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)1178 RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1179 {
1180 return As<Float4>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1181 }
1182
Gather(RValue<Pointer<Int>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)1183 RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1184 {
1185 return As<Int4>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1186 }
1187
createScatter(llvm::Value * base,llvm::Value * val,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment)1188 static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
1189 {
1190 ASSERT(base->getType()->isPointerTy());
1191 ASSERT(val->getType()->isVectorTy());
1192 ASSERT(offsets->getType()->isVectorTy());
1193 ASSERT(mask->getType()->isVectorTy());
1194
1195 auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1196 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1197 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1198 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1199 auto i8PtrTy = i8Ty->getPointerTo();
1200 auto elVecTy = val->getType();
1201 auto elTy = llvm::cast<llvm::VectorType>(elVecTy)->getElementType();
1202 auto elPtrTy = elTy->getPointerTo();
1203 auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1204
1205 auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1206 auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
1207 auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1208 auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1209
1210 if(!__has_feature(memory_sanitizer))
1211 {
1212 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1213 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
1214 jit->builder->CreateCall(func, { val, elPtrs, align, i1Mask });
1215 }
1216 else // __has_feature(memory_sanitizer)
1217 {
1218 // MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_scatter
1219 // Work around it by emulating scatter with element-wise stores.
1220 // TODO(b/172238865): Remove when supported by MemorySanitizer.
1221
1222 for(unsigned i = 0; i < numEls; i++)
1223 {
1224 // Check mask for this element
1225 auto idx = llvm::ConstantInt::get(i32Ty, i);
1226 auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1227 auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1228 jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1229 jit->builder->SetInsertPoint(thenBlock);
1230
1231 auto el = jit->builder->CreateExtractElement(val, idx);
1232 auto elPtr = jit->builder->CreateExtractElement(elPtrs, idx);
1233 Nucleus::createStore(V(el), V(elPtr), T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1234
1235 jit->builder->CreateBr(mergeBlock);
1236 jit->builder->SetInsertPoint(mergeBlock);
1237 }
1238 }
1239 }
1240
Scatter(RValue<Pointer<Float>> base,RValue<Float4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)1241 void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1242 {
1243 return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1244 }
1245
Scatter(RValue<Pointer<Int>> base,RValue<Int4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)1246 void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1247 {
1248 return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1249 }
1250
createFence(std::memory_order memoryOrder)1251 void Nucleus::createFence(std::memory_order memoryOrder)
1252 {
1253 RR_DEBUG_INFO_UPDATE_LOC();
1254 jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1255 }
1256
createGEP(Value * ptr,Type * type,Value * index,bool unsignedIndex)1257 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1258 {
1259 RR_DEBUG_INFO_UPDATE_LOC();
1260 ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
1261 if(sizeof(void *) == 8)
1262 {
1263 // LLVM manual: "When indexing into an array, pointer or vector,
1264 // integers of any width are allowed, and they are not required to
1265 // be constant. These integers are treated as signed values where
1266 // relevant."
1267 //
1268 // Thus if we want indexes to be treated as unsigned we have to
1269 // zero-extend them ourselves.
1270 //
1271 // Note that this is not because we want to address anywhere near
1272 // 4 GB of data. Instead this is important for performance because
1273 // x86 supports automatic zero-extending of 32-bit registers to
1274 // 64-bit. Thus when indexing into an array using a uint32 is
1275 // actually faster than an int32.
1276 index = unsignedIndex ? createZExt(index, Long::type()) : createSExt(index, Long::type());
1277 }
1278
1279 // For non-emulated types we can rely on LLVM's GEP to calculate the
1280 // effective address correctly.
1281 if(asInternalType(type) == Type_LLVM)
1282 {
1283 return V(jit->builder->CreateGEP(V(ptr), V(index)));
1284 }
1285
1286 // For emulated types we have to multiply the index by the intended
1287 // type size ourselves to obain the byte offset.
1288 index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));
1289
1290 // Cast to a byte pointer, apply the byte offset, and cast back to the
1291 // original pointer type.
1292 return createBitCast(
1293 V(jit->builder->CreateGEP(V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::type()), 0)))), V(index))),
1294 T(llvm::PointerType::get(T(type), 0)));
1295 }
1296
createAtomicAdd(Value * ptr,Value * value,std::memory_order memoryOrder)1297 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1298 {
1299 RR_DEBUG_INFO_UPDATE_LOC();
1300 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value),
1301 #if LLVM_VERSION_MAJOR >= 11
1302 llvm::MaybeAlign(),
1303 #endif
1304 atomicOrdering(true, memoryOrder)));
1305 }
1306
createAtomicSub(Value * ptr,Value * value,std::memory_order memoryOrder)1307 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1308 {
1309 RR_DEBUG_INFO_UPDATE_LOC();
1310 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value),
1311 #if LLVM_VERSION_MAJOR >= 11
1312 llvm::MaybeAlign(),
1313 #endif
1314 atomicOrdering(true, memoryOrder)));
1315 }
1316
createAtomicAnd(Value * ptr,Value * value,std::memory_order memoryOrder)1317 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1318 {
1319 RR_DEBUG_INFO_UPDATE_LOC();
1320 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value),
1321 #if LLVM_VERSION_MAJOR >= 11
1322 llvm::MaybeAlign(),
1323 #endif
1324 atomicOrdering(true, memoryOrder)));
1325 }
1326
createAtomicOr(Value * ptr,Value * value,std::memory_order memoryOrder)1327 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1328 {
1329 RR_DEBUG_INFO_UPDATE_LOC();
1330 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value),
1331 #if LLVM_VERSION_MAJOR >= 11
1332 llvm::MaybeAlign(),
1333 #endif
1334 atomicOrdering(true, memoryOrder)));
1335 }
1336
createAtomicXor(Value * ptr,Value * value,std::memory_order memoryOrder)1337 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1338 {
1339 RR_DEBUG_INFO_UPDATE_LOC();
1340 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value),
1341 #if LLVM_VERSION_MAJOR >= 11
1342 llvm::MaybeAlign(),
1343 #endif
1344 atomicOrdering(true, memoryOrder)));
1345 }
1346
createAtomicMin(Value * ptr,Value * value,std::memory_order memoryOrder)1347 Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1348 {
1349 RR_DEBUG_INFO_UPDATE_LOC();
1350 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value),
1351 #if LLVM_VERSION_MAJOR >= 11
1352 llvm::MaybeAlign(),
1353 #endif
1354 atomicOrdering(true, memoryOrder)));
1355 }
1356
createAtomicMax(Value * ptr,Value * value,std::memory_order memoryOrder)1357 Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1358 {
1359 RR_DEBUG_INFO_UPDATE_LOC();
1360 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value),
1361 #if LLVM_VERSION_MAJOR >= 11
1362 llvm::MaybeAlign(),
1363 #endif
1364 atomicOrdering(true, memoryOrder)));
1365 }
1366
createAtomicUMin(Value * ptr,Value * value,std::memory_order memoryOrder)1367 Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1368 {
1369 RR_DEBUG_INFO_UPDATE_LOC();
1370 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value),
1371 #if LLVM_VERSION_MAJOR >= 11
1372 llvm::MaybeAlign(),
1373 #endif
1374 atomicOrdering(true, memoryOrder)));
1375 }
1376
createAtomicUMax(Value * ptr,Value * value,std::memory_order memoryOrder)1377 Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1378 {
1379 RR_DEBUG_INFO_UPDATE_LOC();
1380 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value),
1381 #if LLVM_VERSION_MAJOR >= 11
1382 llvm::MaybeAlign(),
1383 #endif
1384 atomicOrdering(true, memoryOrder)));
1385 }
1386
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1387 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1388 {
1389 RR_DEBUG_INFO_UPDATE_LOC();
1390 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value),
1391 #if LLVM_VERSION_MAJOR >= 11
1392 llvm::MaybeAlign(),
1393 #endif
1394 atomicOrdering(true, memoryOrder)));
1395 }
1396
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1397 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1398 {
1399 RR_DEBUG_INFO_UPDATE_LOC();
1400 // Note: AtomicCmpXchgInstruction returns a 2-member struct containing {result, success-flag}, not the result directly.
1401 return V(jit->builder->CreateExtractValue(
1402 jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value),
1403 #if LLVM_VERSION_MAJOR >= 11
1404 llvm::MaybeAlign(),
1405 #endif
1406 atomicOrdering(true, memoryOrderEqual),
1407 atomicOrdering(true, memoryOrderUnequal)),
1408 llvm::ArrayRef<unsigned>(0u)));
1409 }
1410
createTrunc(Value * v,Type * destType)1411 Value *Nucleus::createTrunc(Value *v, Type *destType)
1412 {
1413 RR_DEBUG_INFO_UPDATE_LOC();
1414 return V(jit->builder->CreateTrunc(V(v), T(destType)));
1415 }
1416
createZExt(Value * v,Type * destType)1417 Value *Nucleus::createZExt(Value *v, Type *destType)
1418 {
1419 RR_DEBUG_INFO_UPDATE_LOC();
1420 return V(jit->builder->CreateZExt(V(v), T(destType)));
1421 }
1422
createSExt(Value * v,Type * destType)1423 Value *Nucleus::createSExt(Value *v, Type *destType)
1424 {
1425 RR_DEBUG_INFO_UPDATE_LOC();
1426 return V(jit->builder->CreateSExt(V(v), T(destType)));
1427 }
1428
createFPToUI(Value * v,Type * destType)1429 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1430 {
1431 RR_DEBUG_INFO_UPDATE_LOC();
1432 return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1433 }
1434
createFPToSI(Value * v,Type * destType)1435 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1436 {
1437 RR_DEBUG_INFO_UPDATE_LOC();
1438 return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1439 }
1440
createSIToFP(Value * v,Type * destType)1441 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1442 {
1443 RR_DEBUG_INFO_UPDATE_LOC();
1444 return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1445 }
1446
createFPTrunc(Value * v,Type * destType)1447 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1448 {
1449 RR_DEBUG_INFO_UPDATE_LOC();
1450 return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1451 }
1452
createFPExt(Value * v,Type * destType)1453 Value *Nucleus::createFPExt(Value *v, Type *destType)
1454 {
1455 RR_DEBUG_INFO_UPDATE_LOC();
1456 return V(jit->builder->CreateFPExt(V(v), T(destType)));
1457 }
1458
createBitCast(Value * v,Type * destType)1459 Value *Nucleus::createBitCast(Value *v, Type *destType)
1460 {
1461 RR_DEBUG_INFO_UPDATE_LOC();
1462 // Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1463 // support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1464 // reading back as the destination type.
1465 if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1466 {
1467 Value *readAddress = allocateStackVariable(destType);
1468 Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1469 createStore(v, writeAddress, T(V(v)->getType()));
1470 return createLoad(readAddress, destType);
1471 }
1472 else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1473 {
1474 Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1475 createStore(v, writeAddress, T(V(v)->getType()));
1476 Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1477 return createLoad(readAddress, destType);
1478 }
1479
1480 return V(jit->builder->CreateBitCast(V(v), T(destType)));
1481 }
1482
createICmpEQ(Value * lhs,Value * rhs)1483 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1484 {
1485 RR_DEBUG_INFO_UPDATE_LOC();
1486 return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1487 }
1488
createICmpNE(Value * lhs,Value * rhs)1489 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1490 {
1491 RR_DEBUG_INFO_UPDATE_LOC();
1492 return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1493 }
1494
createICmpUGT(Value * lhs,Value * rhs)1495 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1496 {
1497 RR_DEBUG_INFO_UPDATE_LOC();
1498 return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1499 }
1500
createICmpUGE(Value * lhs,Value * rhs)1501 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1502 {
1503 RR_DEBUG_INFO_UPDATE_LOC();
1504 return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1505 }
1506
createICmpULT(Value * lhs,Value * rhs)1507 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1508 {
1509 RR_DEBUG_INFO_UPDATE_LOC();
1510 return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1511 }
1512
createICmpULE(Value * lhs,Value * rhs)1513 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1514 {
1515 RR_DEBUG_INFO_UPDATE_LOC();
1516 return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1517 }
1518
createICmpSGT(Value * lhs,Value * rhs)1519 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1520 {
1521 RR_DEBUG_INFO_UPDATE_LOC();
1522 return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1523 }
1524
createICmpSGE(Value * lhs,Value * rhs)1525 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1526 {
1527 RR_DEBUG_INFO_UPDATE_LOC();
1528 return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1529 }
1530
createICmpSLT(Value * lhs,Value * rhs)1531 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1532 {
1533 RR_DEBUG_INFO_UPDATE_LOC();
1534 return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1535 }
1536
createICmpSLE(Value * lhs,Value * rhs)1537 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1538 {
1539 RR_DEBUG_INFO_UPDATE_LOC();
1540 return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1541 }
1542
createFCmpOEQ(Value * lhs,Value * rhs)1543 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1544 {
1545 RR_DEBUG_INFO_UPDATE_LOC();
1546 return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1547 }
1548
createFCmpOGT(Value * lhs,Value * rhs)1549 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1550 {
1551 RR_DEBUG_INFO_UPDATE_LOC();
1552 return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1553 }
1554
createFCmpOGE(Value * lhs,Value * rhs)1555 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1556 {
1557 RR_DEBUG_INFO_UPDATE_LOC();
1558 return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1559 }
1560
createFCmpOLT(Value * lhs,Value * rhs)1561 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1562 {
1563 RR_DEBUG_INFO_UPDATE_LOC();
1564 return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1565 }
1566
createFCmpOLE(Value * lhs,Value * rhs)1567 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1568 {
1569 RR_DEBUG_INFO_UPDATE_LOC();
1570 return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1571 }
1572
createFCmpONE(Value * lhs,Value * rhs)1573 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1574 {
1575 RR_DEBUG_INFO_UPDATE_LOC();
1576 return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1577 }
1578
createFCmpORD(Value * lhs,Value * rhs)1579 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1580 {
1581 RR_DEBUG_INFO_UPDATE_LOC();
1582 return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1583 }
1584
createFCmpUNO(Value * lhs,Value * rhs)1585 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1586 {
1587 RR_DEBUG_INFO_UPDATE_LOC();
1588 return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1589 }
1590
createFCmpUEQ(Value * lhs,Value * rhs)1591 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1592 {
1593 RR_DEBUG_INFO_UPDATE_LOC();
1594 return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1595 }
1596
createFCmpUGT(Value * lhs,Value * rhs)1597 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1598 {
1599 RR_DEBUG_INFO_UPDATE_LOC();
1600 return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1601 }
1602
createFCmpUGE(Value * lhs,Value * rhs)1603 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1604 {
1605 RR_DEBUG_INFO_UPDATE_LOC();
1606 return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1607 }
1608
createFCmpULT(Value * lhs,Value * rhs)1609 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1610 {
1611 RR_DEBUG_INFO_UPDATE_LOC();
1612 return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1613 }
1614
createFCmpULE(Value * lhs,Value * rhs)1615 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1616 {
1617 RR_DEBUG_INFO_UPDATE_LOC();
1618 return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1619 }
1620
createFCmpUNE(Value * lhs,Value * rhs)1621 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1622 {
1623 RR_DEBUG_INFO_UPDATE_LOC();
1624 return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1625 }
1626
createExtractElement(Value * vector,Type * type,int index)1627 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1628 {
1629 RR_DEBUG_INFO_UPDATE_LOC();
1630 ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
1631 return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1632 }
1633
createInsertElement(Value * vector,Value * element,int index)1634 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1635 {
1636 RR_DEBUG_INFO_UPDATE_LOC();
1637 return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1638 }
1639
createShuffleVector(Value * v1,Value * v2,const int * select)1640 Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
1641 {
1642 RR_DEBUG_INFO_UPDATE_LOC();
1643
1644 int size = llvm::cast<llvm::FixedVectorType>(V(v1)->getType())->getNumElements();
1645 const int maxSize = 16;
1646 llvm::Constant *swizzle[maxSize];
1647 ASSERT(size <= maxSize);
1648
1649 for(int i = 0; i < size; i++)
1650 {
1651 swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), select[i]);
1652 }
1653
1654 llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
1655
1656 return V(jit->builder->CreateShuffleVector(V(v1), V(v2), shuffle));
1657 }
1658
createSelect(Value * c,Value * ifTrue,Value * ifFalse)1659 Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1660 {
1661 RR_DEBUG_INFO_UPDATE_LOC();
1662 return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1663 }
1664
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1665 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1666 {
1667 RR_DEBUG_INFO_UPDATE_LOC();
1668 return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1669 }
1670
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1671 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1672 {
1673 RR_DEBUG_INFO_UPDATE_LOC();
1674 llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1675 sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), label, true), B(branch));
1676 }
1677
createUnreachable()1678 void Nucleus::createUnreachable()
1679 {
1680 RR_DEBUG_INFO_UPDATE_LOC();
1681 jit->builder->CreateUnreachable();
1682 }
1683
getType(Value * value)1684 Type *Nucleus::getType(Value *value)
1685 {
1686 return T(V(value)->getType());
1687 }
1688
getContainedType(Type * vectorType)1689 Type *Nucleus::getContainedType(Type *vectorType)
1690 {
1691 return T(T(vectorType)->getContainedType(0));
1692 }
1693
getPointerType(Type * ElementType)1694 Type *Nucleus::getPointerType(Type *ElementType)
1695 {
1696 return T(llvm::PointerType::get(T(ElementType), 0));
1697 }
1698
getNaturalIntType()1699 static llvm::Type *getNaturalIntType()
1700 {
1701 return llvm::Type::getIntNTy(*jit->context, sizeof(int) * 8);
1702 }
1703
getPrintfStorageType(Type * valueType)1704 Type *Nucleus::getPrintfStorageType(Type *valueType)
1705 {
1706 llvm::Type *valueTy = T(valueType);
1707 if(valueTy->isIntegerTy())
1708 {
1709 return T(getNaturalIntType());
1710 }
1711 if(valueTy->isFloatTy())
1712 {
1713 return T(llvm::Type::getDoubleTy(*jit->context));
1714 }
1715
1716 UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
1717 return {};
1718 }
1719
createNullValue(Type * Ty)1720 Value *Nucleus::createNullValue(Type *Ty)
1721 {
1722 RR_DEBUG_INFO_UPDATE_LOC();
1723 return V(llvm::Constant::getNullValue(T(Ty)));
1724 }
1725
createConstantLong(int64_t i)1726 Value *Nucleus::createConstantLong(int64_t i)
1727 {
1728 RR_DEBUG_INFO_UPDATE_LOC();
1729 return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), i, true));
1730 }
1731
createConstantInt(int i)1732 Value *Nucleus::createConstantInt(int i)
1733 {
1734 RR_DEBUG_INFO_UPDATE_LOC();
1735 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, true));
1736 }
1737
createConstantInt(unsigned int i)1738 Value *Nucleus::createConstantInt(unsigned int i)
1739 {
1740 RR_DEBUG_INFO_UPDATE_LOC();
1741 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, false));
1742 }
1743
createConstantBool(bool b)1744 Value *Nucleus::createConstantBool(bool b)
1745 {
1746 RR_DEBUG_INFO_UPDATE_LOC();
1747 return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*jit->context), b));
1748 }
1749
createConstantByte(signed char i)1750 Value *Nucleus::createConstantByte(signed char i)
1751 {
1752 RR_DEBUG_INFO_UPDATE_LOC();
1753 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, true));
1754 }
1755
createConstantByte(unsigned char i)1756 Value *Nucleus::createConstantByte(unsigned char i)
1757 {
1758 RR_DEBUG_INFO_UPDATE_LOC();
1759 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, false));
1760 }
1761
createConstantShort(short i)1762 Value *Nucleus::createConstantShort(short i)
1763 {
1764 RR_DEBUG_INFO_UPDATE_LOC();
1765 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, true));
1766 }
1767
createConstantShort(unsigned short i)1768 Value *Nucleus::createConstantShort(unsigned short i)
1769 {
1770 RR_DEBUG_INFO_UPDATE_LOC();
1771 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, false));
1772 }
1773
createConstantFloat(float x)1774 Value *Nucleus::createConstantFloat(float x)
1775 {
1776 RR_DEBUG_INFO_UPDATE_LOC();
1777 return V(llvm::ConstantFP::get(T(Float::type()), x));
1778 }
1779
createNullPointer(Type * Ty)1780 Value *Nucleus::createNullPointer(Type *Ty)
1781 {
1782 RR_DEBUG_INFO_UPDATE_LOC();
1783 return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1784 }
1785
createConstantVector(const int64_t * constants,Type * type)1786 Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
1787 {
1788 RR_DEBUG_INFO_UPDATE_LOC();
1789 ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1790 const int numConstants = elementCount(type); // Number of provided constants for the (emulated) type.
1791 const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements(); // Number of elements of the underlying vector type.
1792 ASSERT(numElements <= 16 && numConstants <= numElements);
1793 llvm::Constant *constantVector[16];
1794
1795 for(int i = 0; i < numElements; i++)
1796 {
1797 constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
1798 }
1799
1800 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1801 }
1802
createConstantVector(const double * constants,Type * type)1803 Value *Nucleus::createConstantVector(const double *constants, Type *type)
1804 {
1805 RR_DEBUG_INFO_UPDATE_LOC();
1806 ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1807 const int numConstants = elementCount(type); // Number of provided constants for the (emulated) type.
1808 const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements(); // Number of elements of the underlying vector type.
1809 ASSERT(numElements <= 8 && numConstants <= numElements);
1810 llvm::Constant *constantVector[8];
1811
1812 for(int i = 0; i < numElements; i++)
1813 {
1814 constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
1815 }
1816
1817 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1818 }
1819
createConstantString(const char * v)1820 Value *Nucleus::createConstantString(const char *v)
1821 {
1822 // NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
1823 auto ptr = jit->builder->CreateGlobalStringPtr(v);
1824 return V(ptr);
1825 }
1826
setOptimizerCallback(OptimizerCallback * callback)1827 void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
1828 {
1829 // The LLVM backend does not produce optimizer reports.
1830 (void)callback;
1831 }
1832
type()1833 Type *Void::type()
1834 {
1835 return T(llvm::Type::getVoidTy(*jit->context));
1836 }
1837
type()1838 Type *Bool::type()
1839 {
1840 return T(llvm::Type::getInt1Ty(*jit->context));
1841 }
1842
type()1843 Type *Byte::type()
1844 {
1845 return T(llvm::Type::getInt8Ty(*jit->context));
1846 }
1847
type()1848 Type *SByte::type()
1849 {
1850 return T(llvm::Type::getInt8Ty(*jit->context));
1851 }
1852
type()1853 Type *Short::type()
1854 {
1855 return T(llvm::Type::getInt16Ty(*jit->context));
1856 }
1857
type()1858 Type *UShort::type()
1859 {
1860 return T(llvm::Type::getInt16Ty(*jit->context));
1861 }
1862
type()1863 Type *Byte4::type()
1864 {
1865 return T(Type_v4i8);
1866 }
1867
type()1868 Type *SByte4::type()
1869 {
1870 return T(Type_v4i8);
1871 }
1872
AddSat(RValue<Byte8> x,RValue<Byte8> y)1873 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1874 {
1875 RR_DEBUG_INFO_UPDATE_LOC();
1876 #if defined(__i386__) || defined(__x86_64__)
1877 return x86::paddusb(x, y);
1878 #else
1879 return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
1880 #endif
1881 }
1882
SubSat(RValue<Byte8> x,RValue<Byte8> y)1883 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1884 {
1885 RR_DEBUG_INFO_UPDATE_LOC();
1886 #if defined(__i386__) || defined(__x86_64__)
1887 return x86::psubusb(x, y);
1888 #else
1889 return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
1890 #endif
1891 }
1892
SignMask(RValue<Byte8> x)1893 RValue<Int> SignMask(RValue<Byte8> x)
1894 {
1895 RR_DEBUG_INFO_UPDATE_LOC();
1896 #if defined(__i386__) || defined(__x86_64__)
1897 return x86::pmovmskb(x);
1898 #else
1899 return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1900 #endif
1901 }
1902
1903 // RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1904 // {
1905 //#if defined(__i386__) || defined(__x86_64__)
1906 // return x86::pcmpgtb(x, y); // FIXME: Signedness
1907 //#else
1908 // return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1909 //#endif
1910 // }
1911
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)1912 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1913 {
1914 RR_DEBUG_INFO_UPDATE_LOC();
1915 #if defined(__i386__) || defined(__x86_64__)
1916 return x86::pcmpeqb(x, y);
1917 #else
1918 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1919 #endif
1920 }
1921
type()1922 Type *Byte8::type()
1923 {
1924 return T(Type_v8i8);
1925 }
1926
AddSat(RValue<SByte8> x,RValue<SByte8> y)1927 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1928 {
1929 RR_DEBUG_INFO_UPDATE_LOC();
1930 #if defined(__i386__) || defined(__x86_64__)
1931 return x86::paddsb(x, y);
1932 #else
1933 return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
1934 #endif
1935 }
1936
SubSat(RValue<SByte8> x,RValue<SByte8> y)1937 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1938 {
1939 RR_DEBUG_INFO_UPDATE_LOC();
1940 #if defined(__i386__) || defined(__x86_64__)
1941 return x86::psubsb(x, y);
1942 #else
1943 return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
1944 #endif
1945 }
1946
SignMask(RValue<SByte8> x)1947 RValue<Int> SignMask(RValue<SByte8> x)
1948 {
1949 RR_DEBUG_INFO_UPDATE_LOC();
1950 #if defined(__i386__) || defined(__x86_64__)
1951 return x86::pmovmskb(As<Byte8>(x));
1952 #else
1953 return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1954 #endif
1955 }
1956
CmpGT(RValue<SByte8> x,RValue<SByte8> y)1957 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1958 {
1959 RR_DEBUG_INFO_UPDATE_LOC();
1960 #if defined(__i386__) || defined(__x86_64__)
1961 return x86::pcmpgtb(x, y);
1962 #else
1963 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1964 #endif
1965 }
1966
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)1967 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1968 {
1969 RR_DEBUG_INFO_UPDATE_LOC();
1970 #if defined(__i386__) || defined(__x86_64__)
1971 return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1972 #else
1973 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1974 #endif
1975 }
1976
type()1977 Type *SByte8::type()
1978 {
1979 return T(Type_v8i8);
1980 }
1981
type()1982 Type *Byte16::type()
1983 {
1984 return T(llvm::VectorType::get(T(Byte::type()), 16, false));
1985 }
1986
type()1987 Type *SByte16::type()
1988 {
1989 return T(llvm::VectorType::get(T(SByte::type()), 16, false));
1990 }
1991
type()1992 Type *Short2::type()
1993 {
1994 return T(Type_v2i16);
1995 }
1996
type()1997 Type *UShort2::type()
1998 {
1999 return T(Type_v2i16);
2000 }
2001
Short4(RValue<Int4> cast)2002 Short4::Short4(RValue<Int4> cast)
2003 {
2004 RR_DEBUG_INFO_UPDATE_LOC();
2005 int select[8] = { 0, 2, 4, 6, 0, 2, 4, 6 };
2006 Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
2007
2008 Value *packed = Nucleus::createShuffleVector(short8, short8, select);
2009 Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value();
2010
2011 storeValue(short4);
2012 }
2013
2014 // Short4::Short4(RValue<Float> cast)
2015 // {
2016 // }
2017
Short4(RValue<Float4> cast)2018 Short4::Short4(RValue<Float4> cast)
2019 {
2020 RR_DEBUG_INFO_UPDATE_LOC();
2021 Int4 v4i32 = Int4(cast);
2022 #if defined(__i386__) || defined(__x86_64__)
2023 v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2024 #else
2025 Value *v = v4i32.loadValue();
2026 v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
2027 #endif
2028
2029 storeValue(As<Short4>(Int2(v4i32)).value());
2030 }
2031
operator <<(RValue<Short4> lhs,unsigned char rhs)2032 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2033 {
2034 RR_DEBUG_INFO_UPDATE_LOC();
2035 #if defined(__i386__) || defined(__x86_64__)
2036 // return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2037
2038 return x86::psllw(lhs, rhs);
2039 #else
2040 return As<Short4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2041 #endif
2042 }
2043
operator >>(RValue<Short4> lhs,unsigned char rhs)2044 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2045 {
2046 RR_DEBUG_INFO_UPDATE_LOC();
2047 #if defined(__i386__) || defined(__x86_64__)
2048 return x86::psraw(lhs, rhs);
2049 #else
2050 return As<Short4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2051 #endif
2052 }
2053
Max(RValue<Short4> x,RValue<Short4> y)2054 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2055 {
2056 RR_DEBUG_INFO_UPDATE_LOC();
2057 #if defined(__i386__) || defined(__x86_64__)
2058 return x86::pmaxsw(x, y);
2059 #else
2060 return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
2061 #endif
2062 }
2063
Min(RValue<Short4> x,RValue<Short4> y)2064 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2065 {
2066 RR_DEBUG_INFO_UPDATE_LOC();
2067 #if defined(__i386__) || defined(__x86_64__)
2068 return x86::pminsw(x, y);
2069 #else
2070 return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
2071 #endif
2072 }
2073
AddSat(RValue<Short4> x,RValue<Short4> y)2074 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2075 {
2076 RR_DEBUG_INFO_UPDATE_LOC();
2077 #if defined(__i386__) || defined(__x86_64__)
2078 return x86::paddsw(x, y);
2079 #else
2080 return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
2081 #endif
2082 }
2083
SubSat(RValue<Short4> x,RValue<Short4> y)2084 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2085 {
2086 RR_DEBUG_INFO_UPDATE_LOC();
2087 #if defined(__i386__) || defined(__x86_64__)
2088 return x86::psubsw(x, y);
2089 #else
2090 return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
2091 #endif
2092 }
2093
MulHigh(RValue<Short4> x,RValue<Short4> y)2094 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2095 {
2096 RR_DEBUG_INFO_UPDATE_LOC();
2097 #if defined(__i386__) || defined(__x86_64__)
2098 return x86::pmulhw(x, y);
2099 #else
2100 return As<Short4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2101 #endif
2102 }
2103
MulAdd(RValue<Short4> x,RValue<Short4> y)2104 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2105 {
2106 RR_DEBUG_INFO_UPDATE_LOC();
2107 #if defined(__i386__) || defined(__x86_64__)
2108 return x86::pmaddwd(x, y);
2109 #else
2110 return As<Int2>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2111 #endif
2112 }
2113
PackSigned(RValue<Short4> x,RValue<Short4> y)2114 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2115 {
2116 RR_DEBUG_INFO_UPDATE_LOC();
2117 #if defined(__i386__) || defined(__x86_64__)
2118 auto result = x86::packsswb(x, y);
2119 #else
2120 auto result = V(lowerPack(V(x.value()), V(y.value()), true));
2121 #endif
2122 return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
2123 }
2124
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2125 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2126 {
2127 RR_DEBUG_INFO_UPDATE_LOC();
2128 #if defined(__i386__) || defined(__x86_64__)
2129 auto result = x86::packuswb(x, y);
2130 #else
2131 auto result = V(lowerPack(V(x.value()), V(y.value()), false));
2132 #endif
2133 return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
2134 }
2135
CmpGT(RValue<Short4> x,RValue<Short4> y)2136 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2137 {
2138 RR_DEBUG_INFO_UPDATE_LOC();
2139 #if defined(__i386__) || defined(__x86_64__)
2140 return x86::pcmpgtw(x, y);
2141 #else
2142 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
2143 #endif
2144 }
2145
CmpEQ(RValue<Short4> x,RValue<Short4> y)2146 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2147 {
2148 RR_DEBUG_INFO_UPDATE_LOC();
2149 #if defined(__i386__) || defined(__x86_64__)
2150 return x86::pcmpeqw(x, y);
2151 #else
2152 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
2153 #endif
2154 }
2155
type()2156 Type *Short4::type()
2157 {
2158 return T(Type_v4i16);
2159 }
2160
UShort4(RValue<Float4> cast,bool saturate)2161 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2162 {
2163 RR_DEBUG_INFO_UPDATE_LOC();
2164 if(saturate)
2165 {
2166 #if defined(__i386__) || defined(__x86_64__)
2167 if(CPUID::supportsSSE4_1())
2168 {
2169 Int4 int4(Min(cast, Float4(0xFFFF))); // packusdw takes care of 0x0000 saturation
2170 *this = As<Short4>(PackUnsigned(int4, int4));
2171 }
2172 else
2173 #endif
2174 {
2175 *this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2176 }
2177 }
2178 else
2179 {
2180 *this = Short4(Int4(cast));
2181 }
2182 }
2183
operator <<(RValue<UShort4> lhs,unsigned char rhs)2184 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2185 {
2186 RR_DEBUG_INFO_UPDATE_LOC();
2187 #if defined(__i386__) || defined(__x86_64__)
2188 // return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2189
2190 return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
2191 #else
2192 return As<UShort4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2193 #endif
2194 }
2195
operator >>(RValue<UShort4> lhs,unsigned char rhs)2196 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2197 {
2198 RR_DEBUG_INFO_UPDATE_LOC();
2199 #if defined(__i386__) || defined(__x86_64__)
2200 // return RValue<Short4>(Nucleus::createLShr(lhs.value(), rhs.value()));
2201
2202 return x86::psrlw(lhs, rhs);
2203 #else
2204 return As<UShort4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2205 #endif
2206 }
2207
Max(RValue<UShort4> x,RValue<UShort4> y)2208 RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2209 {
2210 RR_DEBUG_INFO_UPDATE_LOC();
2211 return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2212 }
2213
Min(RValue<UShort4> x,RValue<UShort4> y)2214 RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2215 {
2216 RR_DEBUG_INFO_UPDATE_LOC();
2217 return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2218 }
2219
AddSat(RValue<UShort4> x,RValue<UShort4> y)2220 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2221 {
2222 RR_DEBUG_INFO_UPDATE_LOC();
2223 #if defined(__i386__) || defined(__x86_64__)
2224 return x86::paddusw(x, y);
2225 #else
2226 return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
2227 #endif
2228 }
2229
SubSat(RValue<UShort4> x,RValue<UShort4> y)2230 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2231 {
2232 RR_DEBUG_INFO_UPDATE_LOC();
2233 #if defined(__i386__) || defined(__x86_64__)
2234 return x86::psubusw(x, y);
2235 #else
2236 return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
2237 #endif
2238 }
2239
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2240 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2241 {
2242 RR_DEBUG_INFO_UPDATE_LOC();
2243 #if defined(__i386__) || defined(__x86_64__)
2244 return x86::pmulhuw(x, y);
2245 #else
2246 return As<UShort4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2247 #endif
2248 }
2249
Average(RValue<UShort4> x,RValue<UShort4> y)2250 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2251 {
2252 RR_DEBUG_INFO_UPDATE_LOC();
2253 #if defined(__i386__) || defined(__x86_64__)
2254 return x86::pavgw(x, y);
2255 #else
2256 return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
2257 #endif
2258 }
2259
type()2260 Type *UShort4::type()
2261 {
2262 return T(Type_v4i16);
2263 }
2264
operator <<(RValue<Short8> lhs,unsigned char rhs)2265 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2266 {
2267 RR_DEBUG_INFO_UPDATE_LOC();
2268 #if defined(__i386__) || defined(__x86_64__)
2269 return x86::psllw(lhs, rhs);
2270 #else
2271 return As<Short8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2272 #endif
2273 }
2274
operator >>(RValue<Short8> lhs,unsigned char rhs)2275 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2276 {
2277 RR_DEBUG_INFO_UPDATE_LOC();
2278 #if defined(__i386__) || defined(__x86_64__)
2279 return x86::psraw(lhs, rhs);
2280 #else
2281 return As<Short8>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2282 #endif
2283 }
2284
MulAdd(RValue<Short8> x,RValue<Short8> y)2285 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2286 {
2287 RR_DEBUG_INFO_UPDATE_LOC();
2288 #if defined(__i386__) || defined(__x86_64__)
2289 return x86::pmaddwd(x, y);
2290 #else
2291 return As<Int4>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2292 #endif
2293 }
2294
MulHigh(RValue<Short8> x,RValue<Short8> y)2295 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2296 {
2297 RR_DEBUG_INFO_UPDATE_LOC();
2298 #if defined(__i386__) || defined(__x86_64__)
2299 return x86::pmulhw(x, y);
2300 #else
2301 return As<Short8>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2302 #endif
2303 }
2304
type()2305 Type *Short8::type()
2306 {
2307 return T(llvm::VectorType::get(T(Short::type()), 8, false));
2308 }
2309
operator <<(RValue<UShort8> lhs,unsigned char rhs)2310 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2311 {
2312 RR_DEBUG_INFO_UPDATE_LOC();
2313 #if defined(__i386__) || defined(__x86_64__)
2314 return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2315 #else
2316 return As<UShort8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2317 #endif
2318 }
2319
operator >>(RValue<UShort8> lhs,unsigned char rhs)2320 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2321 {
2322 RR_DEBUG_INFO_UPDATE_LOC();
2323 #if defined(__i386__) || defined(__x86_64__)
2324 return x86::psrlw(lhs, rhs); // FIXME: Fallback required
2325 #else
2326 return As<UShort8>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2327 #endif
2328 }
2329
MulHigh(RValue<UShort8> x,RValue<UShort8> y)2330 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2331 {
2332 RR_DEBUG_INFO_UPDATE_LOC();
2333 #if defined(__i386__) || defined(__x86_64__)
2334 return x86::pmulhuw(x, y);
2335 #else
2336 return As<UShort8>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2337 #endif
2338 }
2339
type()2340 Type *UShort8::type()
2341 {
2342 return T(llvm::VectorType::get(T(UShort::type()), 8, false));
2343 }
2344
operator ++(Int & val,int)2345 RValue<Int> operator++(Int &val, int) // Post-increment
2346 {
2347 RR_DEBUG_INFO_UPDATE_LOC();
2348 RValue<Int> res = val;
2349
2350 Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2351 val.storeValue(inc);
2352
2353 return res;
2354 }
2355
operator ++(Int & val)2356 const Int &operator++(Int &val) // Pre-increment
2357 {
2358 RR_DEBUG_INFO_UPDATE_LOC();
2359 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2360 val.storeValue(inc);
2361
2362 return val;
2363 }
2364
operator --(Int & val,int)2365 RValue<Int> operator--(Int &val, int) // Post-decrement
2366 {
2367 RR_DEBUG_INFO_UPDATE_LOC();
2368 RValue<Int> res = val;
2369
2370 Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2371 val.storeValue(inc);
2372
2373 return res;
2374 }
2375
operator --(Int & val)2376 const Int &operator--(Int &val) // Pre-decrement
2377 {
2378 RR_DEBUG_INFO_UPDATE_LOC();
2379 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2380 val.storeValue(inc);
2381
2382 return val;
2383 }
2384
RoundInt(RValue<Float> cast)2385 RValue<Int> RoundInt(RValue<Float> cast)
2386 {
2387 RR_DEBUG_INFO_UPDATE_LOC();
2388 #if defined(__i386__) || defined(__x86_64__)
2389 return x86::cvtss2si(cast);
2390 #else
2391 return RValue<Int>(V(lowerRoundInt(V(cast.value()), T(Int::type()))));
2392 #endif
2393 }
2394
type()2395 Type *Int::type()
2396 {
2397 return T(llvm::Type::getInt32Ty(*jit->context));
2398 }
2399
type()2400 Type *Long::type()
2401 {
2402 return T(llvm::Type::getInt64Ty(*jit->context));
2403 }
2404
UInt(RValue<Float> cast)2405 UInt::UInt(RValue<Float> cast)
2406 {
2407 RR_DEBUG_INFO_UPDATE_LOC();
2408 Value *integer = Nucleus::createFPToUI(cast.value(), UInt::type());
2409 storeValue(integer);
2410 }
2411
operator ++(UInt & val,int)2412 RValue<UInt> operator++(UInt &val, int) // Post-increment
2413 {
2414 RR_DEBUG_INFO_UPDATE_LOC();
2415 RValue<UInt> res = val;
2416
2417 Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2418 val.storeValue(inc);
2419
2420 return res;
2421 }
2422
operator ++(UInt & val)2423 const UInt &operator++(UInt &val) // Pre-increment
2424 {
2425 RR_DEBUG_INFO_UPDATE_LOC();
2426 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2427 val.storeValue(inc);
2428
2429 return val;
2430 }
2431
operator --(UInt & val,int)2432 RValue<UInt> operator--(UInt &val, int) // Post-decrement
2433 {
2434 RR_DEBUG_INFO_UPDATE_LOC();
2435 RValue<UInt> res = val;
2436
2437 Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2438 val.storeValue(inc);
2439
2440 return res;
2441 }
2442
operator --(UInt & val)2443 const UInt &operator--(UInt &val) // Pre-decrement
2444 {
2445 RR_DEBUG_INFO_UPDATE_LOC();
2446 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2447 val.storeValue(inc);
2448
2449 return val;
2450 }
2451
2452 // RValue<UInt> RoundUInt(RValue<Float> cast)
2453 // {
2454 //#if defined(__i386__) || defined(__x86_64__)
2455 // return x86::cvtss2si(val); // FIXME: Unsigned
2456 //#else
2457 // return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2458 //#endif
2459 // }
2460
type()2461 Type *UInt::type()
2462 {
2463 return T(llvm::Type::getInt32Ty(*jit->context));
2464 }
2465
2466 // Int2::Int2(RValue<Int> cast)
2467 // {
2468 // Value *extend = Nucleus::createZExt(cast.value(), Long::type());
2469 // Value *vector = Nucleus::createBitCast(extend, Int2::type());
2470 //
2471 // int shuffle[2] = {0, 0};
2472 // Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2473 //
2474 // storeValue(replicate);
2475 // }
2476
operator <<(RValue<Int2> lhs,unsigned char rhs)2477 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2478 {
2479 RR_DEBUG_INFO_UPDATE_LOC();
2480 #if defined(__i386__) || defined(__x86_64__)
2481 // return RValue<Int2>(Nucleus::createShl(lhs.value(), rhs.value()));
2482
2483 return x86::pslld(lhs, rhs);
2484 #else
2485 return As<Int2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2486 #endif
2487 }
2488
operator >>(RValue<Int2> lhs,unsigned char rhs)2489 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2490 {
2491 RR_DEBUG_INFO_UPDATE_LOC();
2492 #if defined(__i386__) || defined(__x86_64__)
2493 // return RValue<Int2>(Nucleus::createAShr(lhs.value(), rhs.value()));
2494
2495 return x86::psrad(lhs, rhs);
2496 #else
2497 return As<Int2>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2498 #endif
2499 }
2500
type()2501 Type *Int2::type()
2502 {
2503 return T(Type_v2i32);
2504 }
2505
operator <<(RValue<UInt2> lhs,unsigned char rhs)2506 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2507 {
2508 RR_DEBUG_INFO_UPDATE_LOC();
2509 #if defined(__i386__) || defined(__x86_64__)
2510 // return RValue<UInt2>(Nucleus::createShl(lhs.value(), rhs.value()));
2511
2512 return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2513 #else
2514 return As<UInt2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2515 #endif
2516 }
2517
operator >>(RValue<UInt2> lhs,unsigned char rhs)2518 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2519 {
2520 RR_DEBUG_INFO_UPDATE_LOC();
2521 #if defined(__i386__) || defined(__x86_64__)
2522 // return RValue<UInt2>(Nucleus::createLShr(lhs.value(), rhs.value()));
2523
2524 return x86::psrld(lhs, rhs);
2525 #else
2526 return As<UInt2>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2527 #endif
2528 }
2529
type()2530 Type *UInt2::type()
2531 {
2532 return T(Type_v2i32);
2533 }
2534
Int4(RValue<Byte4> cast)2535 Int4::Int4(RValue<Byte4> cast)
2536 : XYZW(this)
2537 {
2538 RR_DEBUG_INFO_UPDATE_LOC();
2539 #if defined(__i386__) || defined(__x86_64__)
2540 if(CPUID::supportsSSE4_1())
2541 {
2542 *this = x86::pmovzxbd(As<Byte16>(cast));
2543 }
2544 else
2545 #endif
2546 {
2547 int swizzle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
2548 Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2549 Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::type()), swizzle);
2550
2551 int swizzle2[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2552 Value *c = Nucleus::createBitCast(b, Short8::type());
2553 Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::type()), swizzle2);
2554
2555 *this = As<Int4>(d);
2556 }
2557 }
2558
Int4(RValue<SByte4> cast)2559 Int4::Int4(RValue<SByte4> cast)
2560 : XYZW(this)
2561 {
2562 RR_DEBUG_INFO_UPDATE_LOC();
2563 #if defined(__i386__) || defined(__x86_64__)
2564 if(CPUID::supportsSSE4_1())
2565 {
2566 *this = x86::pmovsxbd(As<SByte16>(cast));
2567 }
2568 else
2569 #endif
2570 {
2571 int swizzle[16] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
2572 Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2573 Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2574
2575 int swizzle2[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2576 Value *c = Nucleus::createBitCast(b, Short8::type());
2577 Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2578
2579 *this = As<Int4>(d) >> 24;
2580 }
2581 }
2582
Int4(RValue<Short4> cast)2583 Int4::Int4(RValue<Short4> cast)
2584 : XYZW(this)
2585 {
2586 RR_DEBUG_INFO_UPDATE_LOC();
2587 #if defined(__i386__) || defined(__x86_64__)
2588 if(CPUID::supportsSSE4_1())
2589 {
2590 *this = x86::pmovsxwd(As<Short8>(cast));
2591 }
2592 else
2593 #endif
2594 {
2595 int swizzle[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2596 Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
2597 *this = As<Int4>(c) >> 16;
2598 }
2599 }
2600
Int4(RValue<UShort4> cast)2601 Int4::Int4(RValue<UShort4> cast)
2602 : XYZW(this)
2603 {
2604 RR_DEBUG_INFO_UPDATE_LOC();
2605 #if defined(__i386__) || defined(__x86_64__)
2606 if(CPUID::supportsSSE4_1())
2607 {
2608 *this = x86::pmovzxwd(As<UShort8>(cast));
2609 }
2610 else
2611 #endif
2612 {
2613 int swizzle[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2614 Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2615 *this = As<Int4>(c);
2616 }
2617 }
2618
Int4(RValue<Int> rhs)2619 Int4::Int4(RValue<Int> rhs)
2620 : XYZW(this)
2621 {
2622 RR_DEBUG_INFO_UPDATE_LOC();
2623 Value *vector = loadValue();
2624 Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2625
2626 int swizzle[4] = { 0, 0, 0, 0 };
2627 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2628
2629 storeValue(replicate);
2630 }
2631
operator <<(RValue<Int4> lhs,unsigned char rhs)2632 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2633 {
2634 RR_DEBUG_INFO_UPDATE_LOC();
2635 #if defined(__i386__) || defined(__x86_64__)
2636 return x86::pslld(lhs, rhs);
2637 #else
2638 return As<Int4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2639 #endif
2640 }
2641
operator >>(RValue<Int4> lhs,unsigned char rhs)2642 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2643 {
2644 RR_DEBUG_INFO_UPDATE_LOC();
2645 #if defined(__i386__) || defined(__x86_64__)
2646 return x86::psrad(lhs, rhs);
2647 #else
2648 return As<Int4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2649 #endif
2650 }
2651
CmpEQ(RValue<Int4> x,RValue<Int4> y)2652 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2653 {
2654 RR_DEBUG_INFO_UPDATE_LOC();
2655 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2656 }
2657
CmpLT(RValue<Int4> x,RValue<Int4> y)2658 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2659 {
2660 RR_DEBUG_INFO_UPDATE_LOC();
2661 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), Int4::type()));
2662 }
2663
CmpLE(RValue<Int4> x,RValue<Int4> y)2664 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2665 {
2666 RR_DEBUG_INFO_UPDATE_LOC();
2667 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), Int4::type()));
2668 }
2669
CmpNEQ(RValue<Int4> x,RValue<Int4> y)2670 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2671 {
2672 RR_DEBUG_INFO_UPDATE_LOC();
2673 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2674 }
2675
CmpNLT(RValue<Int4> x,RValue<Int4> y)2676 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2677 {
2678 RR_DEBUG_INFO_UPDATE_LOC();
2679 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), Int4::type()));
2680 }
2681
CmpNLE(RValue<Int4> x,RValue<Int4> y)2682 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2683 {
2684 RR_DEBUG_INFO_UPDATE_LOC();
2685 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), Int4::type()));
2686 }
2687
Max(RValue<Int4> x,RValue<Int4> y)2688 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2689 {
2690 RR_DEBUG_INFO_UPDATE_LOC();
2691 #if defined(__i386__) || defined(__x86_64__)
2692 if(CPUID::supportsSSE4_1())
2693 {
2694 return x86::pmaxsd(x, y);
2695 }
2696 else
2697 #endif
2698 {
2699 RValue<Int4> greater = CmpNLE(x, y);
2700 return (x & greater) | (y & ~greater);
2701 }
2702 }
2703
Min(RValue<Int4> x,RValue<Int4> y)2704 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2705 {
2706 RR_DEBUG_INFO_UPDATE_LOC();
2707 #if defined(__i386__) || defined(__x86_64__)
2708 if(CPUID::supportsSSE4_1())
2709 {
2710 return x86::pminsd(x, y);
2711 }
2712 else
2713 #endif
2714 {
2715 RValue<Int4> less = CmpLT(x, y);
2716 return (x & less) | (y & ~less);
2717 }
2718 }
2719
RoundInt(RValue<Float4> cast)2720 RValue<Int4> RoundInt(RValue<Float4> cast)
2721 {
2722 RR_DEBUG_INFO_UPDATE_LOC();
2723 #if defined(__i386__) || defined(__x86_64__)
2724 return x86::cvtps2dq(cast);
2725 #else
2726 return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2727 #endif
2728 }
2729
RoundIntClamped(RValue<Float4> cast)2730 RValue<Int4> RoundIntClamped(RValue<Float4> cast)
2731 {
2732 RR_DEBUG_INFO_UPDATE_LOC();
2733 #if defined(__i386__) || defined(__x86_64__)
2734 // cvtps2dq produces 0x80000000, a negative value, for input larger than
2735 // 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
2736 // saturate to 0x80000000.
2737 return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
2738 #else
2739 // ARM saturates to the largest positive or negative integer. Unit tests
2740 // verify that lowerRoundInt() behaves as desired.
2741 return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2742 #endif
2743 }
2744
MulHigh(RValue<Int4> x,RValue<Int4> y)2745 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2746 {
2747 RR_DEBUG_INFO_UPDATE_LOC();
2748 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2749 return As<Int4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2750 }
2751
MulHigh(RValue<UInt4> x,RValue<UInt4> y)2752 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2753 {
2754 RR_DEBUG_INFO_UPDATE_LOC();
2755 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2756 return As<UInt4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2757 }
2758
PackSigned(RValue<Int4> x,RValue<Int4> y)2759 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2760 {
2761 RR_DEBUG_INFO_UPDATE_LOC();
2762 #if defined(__i386__) || defined(__x86_64__)
2763 return x86::packssdw(x, y);
2764 #else
2765 return As<Short8>(V(lowerPack(V(x.value()), V(y.value()), true)));
2766 #endif
2767 }
2768
PackUnsigned(RValue<Int4> x,RValue<Int4> y)2769 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2770 {
2771 RR_DEBUG_INFO_UPDATE_LOC();
2772 #if defined(__i386__) || defined(__x86_64__)
2773 return x86::packusdw(x, y);
2774 #else
2775 return As<UShort8>(V(lowerPack(V(x.value()), V(y.value()), false)));
2776 #endif
2777 }
2778
SignMask(RValue<Int4> x)2779 RValue<Int> SignMask(RValue<Int4> x)
2780 {
2781 RR_DEBUG_INFO_UPDATE_LOC();
2782 #if defined(__i386__) || defined(__x86_64__)
2783 return x86::movmskps(As<Float4>(x));
2784 #else
2785 return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
2786 #endif
2787 }
2788
type()2789 Type *Int4::type()
2790 {
2791 return T(llvm::VectorType::get(T(Int::type()), 4, false));
2792 }
2793
UInt4(RValue<Float4> cast)2794 UInt4::UInt4(RValue<Float4> cast)
2795 : XYZW(this)
2796 {
2797 RR_DEBUG_INFO_UPDATE_LOC();
2798 Value *xyzw = Nucleus::createFPToUI(cast.value(), UInt4::type());
2799 storeValue(xyzw);
2800 }
2801
UInt4(RValue<UInt> rhs)2802 UInt4::UInt4(RValue<UInt> rhs)
2803 : XYZW(this)
2804 {
2805 RR_DEBUG_INFO_UPDATE_LOC();
2806 Value *vector = loadValue();
2807 Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2808
2809 int swizzle[4] = { 0, 0, 0, 0 };
2810 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2811
2812 storeValue(replicate);
2813 }
2814
operator <<(RValue<UInt4> lhs,unsigned char rhs)2815 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2816 {
2817 RR_DEBUG_INFO_UPDATE_LOC();
2818 #if defined(__i386__) || defined(__x86_64__)
2819 return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2820 #else
2821 return As<UInt4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2822 #endif
2823 }
2824
operator >>(RValue<UInt4> lhs,unsigned char rhs)2825 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2826 {
2827 RR_DEBUG_INFO_UPDATE_LOC();
2828 #if defined(__i386__) || defined(__x86_64__)
2829 return x86::psrld(lhs, rhs);
2830 #else
2831 return As<UInt4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2832 #endif
2833 }
2834
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2835 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2836 {
2837 RR_DEBUG_INFO_UPDATE_LOC();
2838 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2839 }
2840
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2841 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2842 {
2843 RR_DEBUG_INFO_UPDATE_LOC();
2844 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), Int4::type()));
2845 }
2846
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2847 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2848 {
2849 RR_DEBUG_INFO_UPDATE_LOC();
2850 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), Int4::type()));
2851 }
2852
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2853 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2854 {
2855 RR_DEBUG_INFO_UPDATE_LOC();
2856 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2857 }
2858
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2859 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2860 {
2861 RR_DEBUG_INFO_UPDATE_LOC();
2862 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), Int4::type()));
2863 }
2864
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2865 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2866 {
2867 RR_DEBUG_INFO_UPDATE_LOC();
2868 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), Int4::type()));
2869 }
2870
Max(RValue<UInt4> x,RValue<UInt4> y)2871 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2872 {
2873 RR_DEBUG_INFO_UPDATE_LOC();
2874 #if defined(__i386__) || defined(__x86_64__)
2875 if(CPUID::supportsSSE4_1())
2876 {
2877 return x86::pmaxud(x, y);
2878 }
2879 else
2880 #endif
2881 {
2882 RValue<UInt4> greater = CmpNLE(x, y);
2883 return (x & greater) | (y & ~greater);
2884 }
2885 }
2886
Min(RValue<UInt4> x,RValue<UInt4> y)2887 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2888 {
2889 RR_DEBUG_INFO_UPDATE_LOC();
2890 #if defined(__i386__) || defined(__x86_64__)
2891 if(CPUID::supportsSSE4_1())
2892 {
2893 return x86::pminud(x, y);
2894 }
2895 else
2896 #endif
2897 {
2898 RValue<UInt4> less = CmpLT(x, y);
2899 return (x & less) | (y & ~less);
2900 }
2901 }
2902
type()2903 Type *UInt4::type()
2904 {
2905 return T(llvm::VectorType::get(T(UInt::type()), 4, false));
2906 }
2907
type()2908 Type *Half::type()
2909 {
2910 return T(llvm::Type::getInt16Ty(*jit->context));
2911 }
2912
Rcp_pp(RValue<Float> x,bool exactAtPow2)2913 RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
2914 {
2915 RR_DEBUG_INFO_UPDATE_LOC();
2916 #if defined(__i386__) || defined(__x86_64__)
2917 if(exactAtPow2)
2918 {
2919 // rcpss uses a piecewise-linear approximation which minimizes the relative error
2920 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2921 return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2922 }
2923 return x86::rcpss(x);
2924 #else
2925 return As<Float>(V(lowerRCP(V(x.value()))));
2926 #endif
2927 }
2928
RcpSqrt_pp(RValue<Float> x)2929 RValue<Float> RcpSqrt_pp(RValue<Float> x)
2930 {
2931 RR_DEBUG_INFO_UPDATE_LOC();
2932 #if defined(__i386__) || defined(__x86_64__)
2933 return x86::rsqrtss(x);
2934 #else
2935 return As<Float>(V(lowerRSQRT(V(x.value()))));
2936 #endif
2937 }
2938
// Reports whether RcpApprox() is implemented for the architecture this file
// was compiled for: true on x86 / x86-64, false everywhere else.
bool HasRcpApprox()
{
#if defined(__i386__) || defined(__x86_64__)
	constexpr bool available = true;
#else
	constexpr bool available = false;
#endif
	return available;
}
2947
RcpApprox(RValue<Float4> x,bool exactAtPow2)2948 RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
2949 {
2950 #if defined(__i386__) || defined(__x86_64__)
2951 if(exactAtPow2)
2952 {
2953 // rcpps uses a piecewise-linear approximation which minimizes the relative error
2954 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2955 return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2956 }
2957 return x86::rcpps(x);
2958 #else
2959 UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2960 return { 0.0f };
2961 #endif
2962 }
2963
RcpApprox(RValue<Float> x,bool exactAtPow2)2964 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
2965 {
2966 #if defined(__i386__) || defined(__x86_64__)
2967 if(exactAtPow2)
2968 {
2969 // rcpss uses a piecewise-linear approximation which minimizes the relative error
2970 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2971 return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2972 }
2973 return x86::rcpss(x);
2974 #else
2975 UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2976 return { 0.0f };
2977 #endif
2978 }
2979
// Reports whether RcpSqrtApprox() is implemented for the architecture this
// file was compiled for: true on x86 / x86-64, false everywhere else.
bool HasRcpSqrtApprox()
{
#if defined(__i386__) || defined(__x86_64__)
	constexpr bool available = true;
#else
	constexpr bool available = false;
#endif
	return available;
}
2988
RcpSqrtApprox(RValue<Float4> x)2989 RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
2990 {
2991 #if defined(__i386__) || defined(__x86_64__)
2992 return x86::rsqrtps(x);
2993 #else
2994 UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
2995 return { 0.0f };
2996 #endif
2997 }
2998
RcpSqrtApprox(RValue<Float> x)2999 RValue<Float> RcpSqrtApprox(RValue<Float> x)
3000 {
3001 #if defined(__i386__) || defined(__x86_64__)
3002 return x86::rsqrtss(x);
3003 #else
3004 UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
3005 return { 0.0f };
3006 #endif
3007 }
3008
Sqrt(RValue<Float> x)3009 RValue<Float> Sqrt(RValue<Float> x)
3010 {
3011 RR_DEBUG_INFO_UPDATE_LOC();
3012 #if defined(__i386__) || defined(__x86_64__)
3013 return x86::sqrtss(x);
3014 #else
3015 return As<Float>(V(lowerSQRT(V(x.value()))));
3016 #endif
3017 }
3018
Round(RValue<Float> x)3019 RValue<Float> Round(RValue<Float> x)
3020 {
3021 RR_DEBUG_INFO_UPDATE_LOC();
3022 #if defined(__i386__) || defined(__x86_64__)
3023 if(CPUID::supportsSSE4_1())
3024 {
3025 return x86::roundss(x, 0);
3026 }
3027 else
3028 {
3029 return Float4(Round(Float4(x))).x;
3030 }
3031 #else
3032 return RValue<Float>(V(lowerRound(V(x.value()))));
3033 #endif
3034 }
3035
Trunc(RValue<Float> x)3036 RValue<Float> Trunc(RValue<Float> x)
3037 {
3038 RR_DEBUG_INFO_UPDATE_LOC();
3039 #if defined(__i386__) || defined(__x86_64__)
3040 if(CPUID::supportsSSE4_1())
3041 {
3042 return x86::roundss(x, 3);
3043 }
3044 else
3045 {
3046 return Float(Int(x)); // Rounded toward zero
3047 }
3048 #else
3049 return RValue<Float>(V(lowerTrunc(V(x.value()))));
3050 #endif
3051 }
3052
Frac(RValue<Float> x)3053 RValue<Float> Frac(RValue<Float> x)
3054 {
3055 RR_DEBUG_INFO_UPDATE_LOC();
3056 #if defined(__i386__) || defined(__x86_64__)
3057 if(CPUID::supportsSSE4_1())
3058 {
3059 return x - x86::floorss(x);
3060 }
3061 else
3062 {
3063 return Float4(Frac(Float4(x))).x;
3064 }
3065 #else
3066 // x - floor(x) can be 1.0 for very small negative x.
3067 // Clamp against the value just below 1.0.
3068 return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
3069 #endif
3070 }
3071
Floor(RValue<Float> x)3072 RValue<Float> Floor(RValue<Float> x)
3073 {
3074 RR_DEBUG_INFO_UPDATE_LOC();
3075 #if defined(__i386__) || defined(__x86_64__)
3076 if(CPUID::supportsSSE4_1())
3077 {
3078 return x86::floorss(x);
3079 }
3080 else
3081 {
3082 return Float4(Floor(Float4(x))).x;
3083 }
3084 #else
3085 return RValue<Float>(V(lowerFloor(V(x.value()))));
3086 #endif
3087 }
3088
Ceil(RValue<Float> x)3089 RValue<Float> Ceil(RValue<Float> x)
3090 {
3091 RR_DEBUG_INFO_UPDATE_LOC();
3092 #if defined(__i386__) || defined(__x86_64__)
3093 if(CPUID::supportsSSE4_1())
3094 {
3095 return x86::ceilss(x);
3096 }
3097 else
3098 #endif
3099 {
3100 return Float4(Ceil(Float4(x))).x;
3101 }
3102 }
3103
type()3104 Type *Float::type()
3105 {
3106 return T(llvm::Type::getFloatTy(*jit->context));
3107 }
3108
type()3109 Type *Float2::type()
3110 {
3111 return T(Type_v2f32);
3112 }
3113
Exp2(RValue<Float> v)3114 RValue<Float> Exp2(RValue<Float> v)
3115 {
3116 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::type()) });
3117 return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3118 }
3119
Log2(RValue<Float> v)3120 RValue<Float> Log2(RValue<Float> v)
3121 {
3122 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::type()) });
3123 return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3124 }
3125
Float4(RValue<Float> rhs)3126 Float4::Float4(RValue<Float> rhs)
3127 : XYZW(this)
3128 {
3129 RR_DEBUG_INFO_UPDATE_LOC();
3130 Value *vector = loadValue();
3131 Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
3132
3133 int swizzle[4] = { 0, 0, 0, 0 };
3134 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
3135
3136 storeValue(replicate);
3137 }
3138
Max(RValue<Float4> x,RValue<Float4> y)3139 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3140 {
3141 RR_DEBUG_INFO_UPDATE_LOC();
3142 #if defined(__i386__) || defined(__x86_64__)
3143 return x86::maxps(x, y);
3144 #else
3145 return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
3146 #endif
3147 }
3148
Min(RValue<Float4> x,RValue<Float4> y)3149 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3150 {
3151 RR_DEBUG_INFO_UPDATE_LOC();
3152 #if defined(__i386__) || defined(__x86_64__)
3153 return x86::minps(x, y);
3154 #else
3155 return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
3156 #endif
3157 }
3158
Rcp_pp(RValue<Float4> x,bool exactAtPow2)3159 RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
3160 {
3161 RR_DEBUG_INFO_UPDATE_LOC();
3162 #if defined(__i386__) || defined(__x86_64__)
3163 if(exactAtPow2)
3164 {
3165 // rcpps uses a piecewise-linear approximation which minimizes the relative error
3166 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
3167 return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
3168 }
3169 return x86::rcpps(x);
3170 #else
3171 return As<Float4>(V(lowerRCP(V(x.value()))));
3172 #endif
3173 }
3174
RcpSqrt_pp(RValue<Float4> x)3175 RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
3176 {
3177 RR_DEBUG_INFO_UPDATE_LOC();
3178 #if defined(__i386__) || defined(__x86_64__)
3179 return x86::rsqrtps(x);
3180 #else
3181 return As<Float4>(V(lowerRSQRT(V(x.value()))));
3182 #endif
3183 }
3184
Sqrt(RValue<Float4> x)3185 RValue<Float4> Sqrt(RValue<Float4> x)
3186 {
3187 RR_DEBUG_INFO_UPDATE_LOC();
3188 #if defined(__i386__) || defined(__x86_64__)
3189 return x86::sqrtps(x);
3190 #else
3191 return As<Float4>(V(lowerSQRT(V(x.value()))));
3192 #endif
3193 }
3194
SignMask(RValue<Float4> x)3195 RValue<Int> SignMask(RValue<Float4> x)
3196 {
3197 RR_DEBUG_INFO_UPDATE_LOC();
3198 #if defined(__i386__) || defined(__x86_64__)
3199 return x86::movmskps(x);
3200 #else
3201 return As<Int>(V(lowerFPSignMask(V(x.value()), T(Int::type()))));
3202 #endif
3203 }
3204
CmpEQ(RValue<Float4> x,RValue<Float4> y)3205 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3206 {
3207 RR_DEBUG_INFO_UPDATE_LOC();
3208 // return As<Int4>(x86::cmpeqps(x, y));
3209 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), Int4::type()));
3210 }
3211
CmpLT(RValue<Float4> x,RValue<Float4> y)3212 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3213 {
3214 RR_DEBUG_INFO_UPDATE_LOC();
3215 // return As<Int4>(x86::cmpltps(x, y));
3216 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), Int4::type()));
3217 }
3218
CmpLE(RValue<Float4> x,RValue<Float4> y)3219 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3220 {
3221 RR_DEBUG_INFO_UPDATE_LOC();
3222 // return As<Int4>(x86::cmpleps(x, y));
3223 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), Int4::type()));
3224 }
3225
CmpNEQ(RValue<Float4> x,RValue<Float4> y)3226 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3227 {
3228 RR_DEBUG_INFO_UPDATE_LOC();
3229 // return As<Int4>(x86::cmpneqps(x, y));
3230 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), Int4::type()));
3231 }
3232
CmpNLT(RValue<Float4> x,RValue<Float4> y)3233 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3234 {
3235 RR_DEBUG_INFO_UPDATE_LOC();
3236 // return As<Int4>(x86::cmpnltps(x, y));
3237 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), Int4::type()));
3238 }
3239
CmpNLE(RValue<Float4> x,RValue<Float4> y)3240 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3241 {
3242 RR_DEBUG_INFO_UPDATE_LOC();
3243 // return As<Int4>(x86::cmpnleps(x, y));
3244 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), Int4::type()));
3245 }
3246
CmpUEQ(RValue<Float4> x,RValue<Float4> y)3247 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3248 {
3249 RR_DEBUG_INFO_UPDATE_LOC();
3250 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), Int4::type()));
3251 }
3252
CmpULT(RValue<Float4> x,RValue<Float4> y)3253 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3254 {
3255 RR_DEBUG_INFO_UPDATE_LOC();
3256 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), Int4::type()));
3257 }
3258
CmpULE(RValue<Float4> x,RValue<Float4> y)3259 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3260 {
3261 RR_DEBUG_INFO_UPDATE_LOC();
3262 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), Int4::type()));
3263 }
3264
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)3265 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3266 {
3267 RR_DEBUG_INFO_UPDATE_LOC();
3268 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), Int4::type()));
3269 }
3270
CmpUNLT(RValue<Float4> x,RValue<Float4> y)3271 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3272 {
3273 RR_DEBUG_INFO_UPDATE_LOC();
3274 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), Int4::type()));
3275 }
3276
CmpUNLE(RValue<Float4> x,RValue<Float4> y)3277 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3278 {
3279 RR_DEBUG_INFO_UPDATE_LOC();
3280 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), Int4::type()));
3281 }
3282
Round(RValue<Float4> x)3283 RValue<Float4> Round(RValue<Float4> x)
3284 {
3285 RR_DEBUG_INFO_UPDATE_LOC();
3286 #if defined(__i386__) || defined(__x86_64__)
3287 if(CPUID::supportsSSE4_1())
3288 {
3289 return x86::roundps(x, 0);
3290 }
3291 else
3292 {
3293 return Float4(RoundInt(x));
3294 }
3295 #else
3296 return RValue<Float4>(V(lowerRound(V(x.value()))));
3297 #endif
3298 }
3299
Trunc(RValue<Float4> x)3300 RValue<Float4> Trunc(RValue<Float4> x)
3301 {
3302 RR_DEBUG_INFO_UPDATE_LOC();
3303 #if defined(__i386__) || defined(__x86_64__)
3304 if(CPUID::supportsSSE4_1())
3305 {
3306 return x86::roundps(x, 3);
3307 }
3308 else
3309 {
3310 return Float4(Int4(x));
3311 }
3312 #else
3313 return RValue<Float4>(V(lowerTrunc(V(x.value()))));
3314 #endif
3315 }
3316
Frac(RValue<Float4> x)3317 RValue<Float4> Frac(RValue<Float4> x)
3318 {
3319 RR_DEBUG_INFO_UPDATE_LOC();
3320 Float4 frc;
3321
3322 #if defined(__i386__) || defined(__x86_64__)
3323 if(CPUID::supportsSSE4_1())
3324 {
3325 frc = x - Floor(x);
3326 }
3327 else
3328 {
3329 frc = x - Float4(Int4(x)); // Signed fractional part.
3330
3331 frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f))); // Add 1.0 if negative.
3332 }
3333 #else
3334 frc = x - Floor(x);
3335 #endif
3336
3337 // x - floor(x) can be 1.0 for very small negative x.
3338 // Clamp against the value just below 1.0.
3339 return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3340 }
3341
Floor(RValue<Float4> x)3342 RValue<Float4> Floor(RValue<Float4> x)
3343 {
3344 RR_DEBUG_INFO_UPDATE_LOC();
3345 #if defined(__i386__) || defined(__x86_64__)
3346 if(CPUID::supportsSSE4_1())
3347 {
3348 return x86::floorps(x);
3349 }
3350 else
3351 {
3352 return x - Frac(x);
3353 }
3354 #else
3355 return RValue<Float4>(V(lowerFloor(V(x.value()))));
3356 #endif
3357 }
3358
Ceil(RValue<Float4> x)3359 RValue<Float4> Ceil(RValue<Float4> x)
3360 {
3361 RR_DEBUG_INFO_UPDATE_LOC();
3362 #if defined(__i386__) || defined(__x86_64__)
3363 if(CPUID::supportsSSE4_1())
3364 {
3365 return x86::ceilps(x);
3366 }
3367 else
3368 #endif
3369 {
3370 return -Floor(-x);
3371 }
3372 }
3373
Sin(RValue<Float4> v)3374 RValue<Float4> Sin(RValue<Float4> v)
3375 {
3376 RR_DEBUG_INFO_UPDATE_LOC();
3377 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value())->getType() });
3378 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3379 }
3380
Cos(RValue<Float4> v)3381 RValue<Float4> Cos(RValue<Float4> v)
3382 {
3383 RR_DEBUG_INFO_UPDATE_LOC();
3384 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value())->getType() });
3385 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3386 }
3387
Tan(RValue<Float4> v)3388 RValue<Float4> Tan(RValue<Float4> v)
3389 {
3390 RR_DEBUG_INFO_UPDATE_LOC();
3391 return Sin(v) / Cos(v);
3392 }
3393
TransformFloat4PerElement(RValue<Float4> v,const char * name)3394 static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char *name)
3395 {
3396 auto funcTy = llvm::FunctionType::get(T(Float::type()), llvm::ArrayRef<llvm::Type *>(T(Float::type())), false);
3397 auto func = jit->module->getOrInsertFunction(name, funcTy);
3398 llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3399 for(uint64_t i = 0; i < 4; i++)
3400 {
3401 auto el = jit->builder->CreateCall(func, V(Nucleus::createExtractElement(v.value(), Float::type(), i)));
3402 out = V(Nucleus::createInsertElement(V(out), V(el), i));
3403 }
3404 return RValue<Float4>(V(out));
3405 }
3406
Asin(RValue<Float4> v,Precision p)3407 RValue<Float4> Asin(RValue<Float4> v, Precision p)
3408 {
3409 RR_DEBUG_INFO_UPDATE_LOC();
3410 return TransformFloat4PerElement(v, "asinf");
3411 }
3412
Acos(RValue<Float4> v,Precision p)3413 RValue<Float4> Acos(RValue<Float4> v, Precision p)
3414 {
3415 RR_DEBUG_INFO_UPDATE_LOC();
3416 return TransformFloat4PerElement(v, "acosf");
3417 }
3418
Atan(RValue<Float4> v)3419 RValue<Float4> Atan(RValue<Float4> v)
3420 {
3421 RR_DEBUG_INFO_UPDATE_LOC();
3422 return TransformFloat4PerElement(v, "atanf");
3423 }
3424
Sinh(RValue<Float4> v)3425 RValue<Float4> Sinh(RValue<Float4> v)
3426 {
3427 RR_DEBUG_INFO_UPDATE_LOC();
3428 return emulated::Sinh(v);
3429 }
3430
Cosh(RValue<Float4> v)3431 RValue<Float4> Cosh(RValue<Float4> v)
3432 {
3433 RR_DEBUG_INFO_UPDATE_LOC();
3434 return emulated::Cosh(v);
3435 }
3436
Tanh(RValue<Float4> v)3437 RValue<Float4> Tanh(RValue<Float4> v)
3438 {
3439 RR_DEBUG_INFO_UPDATE_LOC();
3440 return TransformFloat4PerElement(v, "tanhf");
3441 }
3442
Asinh(RValue<Float4> v)3443 RValue<Float4> Asinh(RValue<Float4> v)
3444 {
3445 RR_DEBUG_INFO_UPDATE_LOC();
3446 return TransformFloat4PerElement(v, "asinhf");
3447 }
3448
Acosh(RValue<Float4> v)3449 RValue<Float4> Acosh(RValue<Float4> v)
3450 {
3451 RR_DEBUG_INFO_UPDATE_LOC();
3452 return TransformFloat4PerElement(v, "acoshf");
3453 }
3454
Atanh(RValue<Float4> v)3455 RValue<Float4> Atanh(RValue<Float4> v)
3456 {
3457 RR_DEBUG_INFO_UPDATE_LOC();
3458 return TransformFloat4PerElement(v, "atanhf");
3459 }
3460
Atan2(RValue<Float4> x,RValue<Float4> y)3461 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
3462 {
3463 RR_DEBUG_INFO_UPDATE_LOC();
3464 llvm::SmallVector<llvm::Type *, 2> paramTys;
3465 paramTys.push_back(T(Float::type()));
3466 paramTys.push_back(T(Float::type()));
3467 auto funcTy = llvm::FunctionType::get(T(Float::type()), paramTys, false);
3468 auto func = jit->module->getOrInsertFunction("atan2f", funcTy);
3469 llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3470 for(uint64_t i = 0; i < 4; i++)
3471 {
3472 auto el = jit->builder->CreateCall(func, { V(Nucleus::createExtractElement(x.value(), Float::type(), i)),
3473 V(Nucleus::createExtractElement(y.value(), Float::type(), i)) });
3474 out = V(Nucleus::createInsertElement(V(out), V(el), i));
3475 }
3476 return RValue<Float4>(V(out));
3477 }
3478
Pow(RValue<Float4> x,RValue<Float4> y)3479 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
3480 {
3481 RR_DEBUG_INFO_UPDATE_LOC();
3482 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::type()) });
3483 return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()) })));
3484 }
3485
Exp(RValue<Float4> v)3486 RValue<Float4> Exp(RValue<Float4> v)
3487 {
3488 RR_DEBUG_INFO_UPDATE_LOC();
3489 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::type()) });
3490 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3491 }
3492
Log(RValue<Float4> v)3493 RValue<Float4> Log(RValue<Float4> v)
3494 {
3495 RR_DEBUG_INFO_UPDATE_LOC();
3496 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::type()) });
3497 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3498 }
3499
Exp2(RValue<Float4> v)3500 RValue<Float4> Exp2(RValue<Float4> v)
3501 {
3502 RR_DEBUG_INFO_UPDATE_LOC();
3503 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::type()) });
3504 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3505 }
3506
Log2(RValue<Float4> v)3507 RValue<Float4> Log2(RValue<Float4> v)
3508 {
3509 RR_DEBUG_INFO_UPDATE_LOC();
3510 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::type()) });
3511 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3512 }
3513
Ctlz(RValue<UInt> v,bool isZeroUndef)3514 RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
3515 {
3516 RR_DEBUG_INFO_UPDATE_LOC();
3517 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::type()) });
3518 return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3519 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3520 }
3521
Ctlz(RValue<UInt4> v,bool isZeroUndef)3522 RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
3523 {
3524 RR_DEBUG_INFO_UPDATE_LOC();
3525 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::type()) });
3526 return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3527 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3528 }
3529
Cttz(RValue<UInt> v,bool isZeroUndef)3530 RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
3531 {
3532 RR_DEBUG_INFO_UPDATE_LOC();
3533 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::type()) });
3534 return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3535 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3536 }
3537
Cttz(RValue<UInt4> v,bool isZeroUndef)3538 RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
3539 {
3540 RR_DEBUG_INFO_UPDATE_LOC();
3541 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::type()) });
3542 return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3543 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3544 }
3545
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3546 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3547 {
3548 return RValue<Int>(Nucleus::createAtomicMin(x.value(), y.value(), memoryOrder));
3549 }
3550
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3551 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3552 {
3553 return RValue<UInt>(Nucleus::createAtomicUMin(x.value(), y.value(), memoryOrder));
3554 }
3555
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3556 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3557 {
3558 return RValue<Int>(Nucleus::createAtomicMax(x.value(), y.value(), memoryOrder));
3559 }
3560
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3561 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3562 {
3563 return RValue<UInt>(Nucleus::createAtomicUMax(x.value(), y.value(), memoryOrder));
3564 }
3565
type()3566 Type *Float4::type()
3567 {
3568 return T(llvm::VectorType::get(T(Float::type()), 4, false));
3569 }
3570
Ticks()3571 RValue<Long> Ticks()
3572 {
3573 RR_DEBUG_INFO_UPDATE_LOC();
3574 llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
3575
3576 return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
3577 }
3578
ConstantPointer(void const * ptr)3579 RValue<Pointer<Byte>> ConstantPointer(void const *ptr)
3580 {
3581 RR_DEBUG_INFO_UPDATE_LOC();
3582 // Note: this should work for 32-bit pointers as well because 'inttoptr'
3583 // is defined to truncate (and zero extend) if necessary.
3584 auto ptrAsInt = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), reinterpret_cast<uintptr_t>(ptr));
3585 return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::type()))));
3586 }
3587
ConstantData(void const * data,size_t size)3588 RValue<Pointer<Byte>> ConstantData(void const *data, size_t size)
3589 {
3590 RR_DEBUG_INFO_UPDATE_LOC();
3591 auto str = ::std::string(reinterpret_cast<const char *>(data), size);
3592 auto ptr = jit->builder->CreateGlobalStringPtr(str);
3593 return RValue<Pointer<Byte>>(V(ptr));
3594 }
3595
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)3596 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
3597 {
3598 RR_DEBUG_INFO_UPDATE_LOC();
3599 llvm::SmallVector<llvm::Type *, 8> paramTys;
3600 for(auto ty : argTys) { paramTys.push_back(T(ty)); }
3601 auto funcTy = llvm::FunctionType::get(T(retTy), paramTys, false);
3602
3603 auto funcPtrTy = funcTy->getPointerTo();
3604 auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value()), funcPtrTy);
3605
3606 llvm::SmallVector<llvm::Value *, 8> arguments;
3607 for(auto arg : args) { arguments.push_back(V(arg)); }
3608 return V(jit->builder->CreateCall(funcTy, funcPtr, arguments));
3609 }
3610
Breakpoint()3611 void Breakpoint()
3612 {
3613 RR_DEBUG_INFO_UPDATE_LOC();
3614 llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
3615
3616 jit->builder->CreateCall(debugtrap);
3617 }
3618
3619 } // namespace rr
3620
3621 namespace rr {
3622
3623 #if defined(__i386__) || defined(__x86_64__)
3624 namespace x86 {
3625
3626 // Differs from IRBuilder<>::CreateUnaryIntrinsic() in that it only accepts native instruction intrinsics which have
3627 // implicit types, such as 'x86_sse_rcp_ps' operating on v4f32, while 'sqrt' requires explicitly specifying the operand type.
createInstruction(llvm::Intrinsic::ID id,Value * x)3628 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x)
3629 {
3630 llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3631
3632 return V(jit->builder->CreateCall(intrinsic, V(x)));
3633 }
3634
3635 // Differs from IRBuilder<>::CreateBinaryIntrinsic() in that it only accepts native instruction intrinsics which have
3636 // implicit types, such as 'x86_sse_max_ps' operating on v4f32, while 'sadd_sat' requires explicitly specifying the operand types.
createInstruction(llvm::Intrinsic::ID id,Value * x,Value * y)3637 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x, Value *y)
3638 {
3639 llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3640
3641 return V(jit->builder->CreateCall(intrinsic, { V(x), V(y) }));
3642 }
3643
cvtss2si(RValue<Float> val)3644 RValue<Int> cvtss2si(RValue<Float> val)
3645 {
3646 Float4 vector;
3647 vector.x = val;
3648
3649 return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_cvtss2si, RValue<Float4>(vector).value()));
3650 }
3651
cvtps2dq(RValue<Float4> val)3652 RValue<Int4> cvtps2dq(RValue<Float4> val)
3653 {
3654 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_cvtps2dq, val.value()));
3655 }
3656
rcpss(RValue<Float> val)3657 RValue<Float> rcpss(RValue<Float> val)
3658 {
3659 Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);
3660
3661 return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rcp_ss, vector), Float::type(), 0));
3662 }
3663
sqrtss(RValue<Float> val)3664 RValue<Float> sqrtss(RValue<Float> val)
3665 {
3666 return RValue<Float>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3667 }
3668
rsqrtss(RValue<Float> val)3669 RValue<Float> rsqrtss(RValue<Float> val)
3670 {
3671 Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);
3672
3673 return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ss, vector), Float::type(), 0));
3674 }
3675
rcpps(RValue<Float4> val)3676 RValue<Float4> rcpps(RValue<Float4> val)
3677 {
3678 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rcp_ps, val.value()));
3679 }
3680
sqrtps(RValue<Float4> val)3681 RValue<Float4> sqrtps(RValue<Float4> val)
3682 {
3683 return RValue<Float4>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3684 }
3685
rsqrtps(RValue<Float4> val)3686 RValue<Float4> rsqrtps(RValue<Float4> val)
3687 {
3688 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ps, val.value()));
3689 }
3690
maxps(RValue<Float4> x,RValue<Float4> y)3691 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3692 {
3693 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_max_ps, x.value(), y.value()));
3694 }
3695
minps(RValue<Float4> x,RValue<Float4> y)3696 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3697 {
3698 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_min_ps, x.value(), y.value()));
3699 }
3700
roundss(RValue<Float> val,unsigned char imm)3701 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3702 {
3703 llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3704
3705 Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3706 Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3707
3708 return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(roundss, { V(undef), V(vector), V(Nucleus::createConstantInt(imm)) })), Float::type(), 0));
3709 }
3710
floorss(RValue<Float> val)3711 RValue<Float> floorss(RValue<Float> val)
3712 {
3713 return roundss(val, 1);
3714 }
3715
ceilss(RValue<Float> val)3716 RValue<Float> ceilss(RValue<Float> val)
3717 {
3718 return roundss(val, 2);
3719 }
3720
roundps(RValue<Float4> val,unsigned char imm)3721 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3722 {
3723 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse41_round_ps, val.value(), Nucleus::createConstantInt(imm)));
3724 }
3725
floorps(RValue<Float4> val)3726 RValue<Float4> floorps(RValue<Float4> val)
3727 {
3728 return roundps(val, 1);
3729 }
3730
ceilps(RValue<Float4> val)3731 RValue<Float4> ceilps(RValue<Float4> val)
3732 {
3733 return roundps(val, 2);
3734 }
3735
pabsd(RValue<Int4> x)3736 RValue<Int4> pabsd(RValue<Int4> x)
3737 {
3738 return RValue<Int4>(V(lowerPABS(V(x.value()))));
3739 }
3740
paddsw(RValue<Short4> x,RValue<Short4> y)3741 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3742 {
3743 return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3744 }
3745
psubsw(RValue<Short4> x,RValue<Short4> y)3746 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3747 {
3748 return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3749 }
3750
paddusw(RValue<UShort4> x,RValue<UShort4> y)3751 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3752 {
3753 return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3754 }
3755
psubusw(RValue<UShort4> x,RValue<UShort4> y)3756 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3757 {
3758 return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3759 }
3760
paddsb(RValue<SByte8> x,RValue<SByte8> y)3761 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3762 {
3763 return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3764 }
3765
psubsb(RValue<SByte8> x,RValue<SByte8> y)3766 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3767 {
3768 return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3769 }
3770
paddusb(RValue<Byte8> x,RValue<Byte8> y)3771 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3772 {
3773 return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3774 }
3775
psubusb(RValue<Byte8> x,RValue<Byte8> y)3776 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3777 {
3778 return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3779 }
3780
pavgw(RValue<UShort4> x,RValue<UShort4> y)3781 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3782 {
3783 return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
3784 }
3785
pmaxsw(RValue<Short4> x,RValue<Short4> y)3786 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3787 {
3788 return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3789 }
3790
pminsw(RValue<Short4> x,RValue<Short4> y)3791 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3792 {
3793 return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3794 }
3795
pcmpgtw(RValue<Short4> x,RValue<Short4> y)3796 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3797 {
3798 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
3799 }
3800
pcmpeqw(RValue<Short4> x,RValue<Short4> y)3801 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3802 {
3803 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
3804 }
3805
pcmpgtb(RValue<SByte8> x,RValue<SByte8> y)3806 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3807 {
3808 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
3809 }
3810
pcmpeqb(RValue<Byte8> x,RValue<Byte8> y)3811 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3812 {
3813 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
3814 }
3815
packssdw(RValue<Int2> x,RValue<Int2> y)3816 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3817 {
3818 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3819 }
3820
packssdw(RValue<Int4> x,RValue<Int4> y)3821 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3822 {
3823 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3824 }
3825
packsswb(RValue<Short4> x,RValue<Short4> y)3826 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3827 {
3828 return As<SByte8>(createInstruction(llvm::Intrinsic::x86_sse2_packsswb_128, x.value(), y.value()));
3829 }
3830
packuswb(RValue<Short4> x,RValue<Short4> y)3831 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3832 {
3833 return As<Byte8>(createInstruction(llvm::Intrinsic::x86_sse2_packuswb_128, x.value(), y.value()));
3834 }
3835
packusdw(RValue<Int4> x,RValue<Int4> y)3836 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3837 {
3838 if(CPUID::supportsSSE4_1())
3839 {
3840 return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse41_packusdw, x.value(), y.value()));
3841 }
3842 else
3843 {
3844 RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3845 RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3846
3847 return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3848 }
3849 }
3850
psrlw(RValue<UShort4> x,unsigned char y)3851 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3852 {
3853 return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3854 }
3855
psrlw(RValue<UShort8> x,unsigned char y)3856 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3857 {
3858 return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3859 }
3860
psraw(RValue<Short4> x,unsigned char y)3861 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3862 {
3863 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3864 }
3865
psraw(RValue<Short8> x,unsigned char y)3866 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3867 {
3868 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3869 }
3870
psllw(RValue<Short4> x,unsigned char y)3871 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3872 {
3873 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3874 }
3875
psllw(RValue<Short8> x,unsigned char y)3876 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3877 {
3878 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3879 }
3880
pslld(RValue<Int2> x,unsigned char y)3881 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3882 {
3883 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3884 }
3885
pslld(RValue<Int4> x,unsigned char y)3886 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3887 {
3888 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3889 }
3890
psrad(RValue<Int2> x,unsigned char y)3891 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3892 {
3893 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3894 }
3895
psrad(RValue<Int4> x,unsigned char y)3896 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3897 {
3898 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3899 }
3900
psrld(RValue<UInt2> x,unsigned char y)3901 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3902 {
3903 return As<UInt2>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3904 }
3905
psrld(RValue<UInt4> x,unsigned char y)3906 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3907 {
3908 return RValue<UInt4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3909 }
3910
pmaxsd(RValue<Int4> x,RValue<Int4> y)3911 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3912 {
3913 return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3914 }
3915
pminsd(RValue<Int4> x,RValue<Int4> y)3916 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3917 {
3918 return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3919 }
3920
pmaxud(RValue<UInt4> x,RValue<UInt4> y)3921 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3922 {
3923 return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_UGT)));
3924 }
3925
pminud(RValue<UInt4> x,RValue<UInt4> y)3926 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3927 {
3928 return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_ULT)));
3929 }
3930
pmulhw(RValue<Short4> x,RValue<Short4> y)3931 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3932 {
3933 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3934 }
3935
pmulhuw(RValue<UShort4> x,RValue<UShort4> y)3936 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3937 {
3938 return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3939 }
3940
pmaddwd(RValue<Short4> x,RValue<Short4> y)3941 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3942 {
3943 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3944 }
3945
pmulhw(RValue<Short8> x,RValue<Short8> y)3946 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3947 {
3948 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3949 }
3950
pmulhuw(RValue<UShort8> x,RValue<UShort8> y)3951 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3952 {
3953 return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3954 }
3955
pmaddwd(RValue<Short8> x,RValue<Short8> y)3956 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
3957 {
3958 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3959 }
3960
movmskps(RValue<Float4> x)3961 RValue<Int> movmskps(RValue<Float4> x)
3962 {
3963 return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_movmsk_ps, x.value()));
3964 }
3965
pmovmskb(RValue<Byte8> x)3966 RValue<Int> pmovmskb(RValue<Byte8> x)
3967 {
3968 return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse2_pmovmskb_128, x.value())) & 0xFF;
3969 }
3970
pmovzxbd(RValue<Byte16> x)3971 RValue<Int4> pmovzxbd(RValue<Byte16> x)
3972 {
3973 return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), false)));
3974 }
3975
pmovsxbd(RValue<SByte16> x)3976 RValue<Int4> pmovsxbd(RValue<SByte16> x)
3977 {
3978 return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), true)));
3979 }
3980
pmovzxwd(RValue<UShort8> x)3981 RValue<Int4> pmovzxwd(RValue<UShort8> x)
3982 {
3983 return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), false)));
3984 }
3985
pmovsxwd(RValue<Short8> x)3986 RValue<Int4> pmovsxwd(RValue<Short8> x)
3987 {
3988 return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), true)));
3989 }
3990
3991 } // namespace x86
3992 #endif // defined(__i386__) || defined(__x86_64__)
3993
#ifdef ENABLE_RR_PRINT
// Emits a call to the variadic function registered in the module under the
// name "rr::DebugPrintf", declared as i32(i8*, ...), passing 'vals' as the
// call arguments.
void VPrintf(const std::vector<Value *> &vals)
{
	auto &context = *jit->context;

	auto *resultTy = llvm::Type::getInt32Ty(context);
	auto *formatTy = llvm::Type::getInt8PtrTy(context);
	auto *printfTy = llvm::FunctionType::get(resultTy, { formatTy }, true);  // true: variadic

	auto debugPrintf = jit->module->getOrInsertFunction("rr::DebugPrintf", printfTy);
	jit->builder->CreateCall(debugPrintf, V(vals));
}
#endif  // ENABLE_RR_PRINT
4004
Nop()4005 void Nop()
4006 {
4007 auto voidTy = llvm::Type::getVoidTy(*jit->context);
4008 auto funcTy = llvm::FunctionType::get(voidTy, {}, false);
4009 auto func = jit->module->getOrInsertFunction("nop", funcTy);
4010 jit->builder->CreateCall(func);
4011 }
4012
// Forwards to DebugInfo::EmitLocation() when a debug info object exists.
// Compiles to an empty function unless ENABLE_RR_DEBUG_INFO is defined.
void EmitDebugLocation()
{
#ifdef ENABLE_RR_DEBUG_INFO
	if(!jit->debugInfo)
	{
		return;  // No debug info object has been created.
	}

	jit->debugInfo->EmitLocation();
#endif  // ENABLE_RR_DEBUG_INFO
}
4022
EmitDebugVariable(Value * value)4023 void EmitDebugVariable(Value *value)
4024 {
4025 #ifdef ENABLE_RR_DEBUG_INFO
4026 if(jit->debugInfo != nullptr)
4027 {
4028 jit->debugInfo->EmitVariable(value);
4029 }
4030 #endif // ENABLE_RR_DEBUG_INFO
4031 }
4032
// Forwards to DebugInfo::Flush() when a debug info object exists.
// Compiles to an empty function unless ENABLE_RR_DEBUG_INFO is defined.
void FlushDebug()
{
#ifdef ENABLE_RR_DEBUG_INFO
	if(!jit->debugInfo)
	{
		return;  // No debug info object has been created.
	}

	jit->debugInfo->Flush();
#endif  // ENABLE_RR_DEBUG_INFO
}
4042
4043 } // namespace rr
4044
4045 // ------------------------------ Coroutines ------------------------------
4046
4047 namespace {
4048
// Magic values returned by llvm.coro.suspend, telling the code that follows a
// suspend point how to proceed.
// See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
enum SuspendAction
{
	SuspendActionSuspend = -1,  // Branch to the suspend block.
	SuspendActionResume = 0,    // Continue at the resume point.
	SuspendActionDestroy = 1    // Branch to the destroy block.
};
4057
promoteFunctionToCoroutine()4058 void promoteFunctionToCoroutine()
4059 {
4060 ASSERT(jit->coroutine.id == nullptr);
4061
4062 // Types
4063 auto voidTy = llvm::Type::getVoidTy(*jit->context);
4064 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4065 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4066 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
4067 auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4068 auto promiseTy = jit->coroutine.yieldType;
4069 auto promisePtrTy = promiseTy->getPointerTo();
4070
4071 // LLVM intrinsics
4072 auto coro_id = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
4073 auto coro_size = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
4074 auto coro_begin = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
4075 auto coro_resume = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_resume);
4076 auto coro_end = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_end);
4077 auto coro_free = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_free);
4078 auto coro_destroy = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_destroy);
4079 auto coro_promise = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_promise);
4080 auto coro_done = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_done);
4081 auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4082
4083 auto allocFrameTy = llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
4084 auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
4085 auto freeFrameTy = llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
4086 auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);
4087
4088 auto oldInsertionPoint = jit->builder->saveIP();
4089
4090 // Build the coroutine_await() function:
4091 //
4092 // bool coroutine_await(CoroutineHandle* handle, YieldType* out)
4093 // {
4094 // if(llvm.coro.done(handle))
4095 // {
4096 // return false;
4097 // }
4098 // else
4099 // {
4100 // *value = (T*)llvm.coro.promise(handle);
4101 // llvm.coro.resume(handle);
4102 // return true;
4103 // }
4104 // }
4105 //
4106 {
4107 auto args = jit->coroutine.await->arg_begin();
4108 auto handle = args++;
4109 auto outPtr = args++;
4110 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "co_await", jit->coroutine.await));
4111 auto doneBlock = llvm::BasicBlock::Create(*jit->context, "done", jit->coroutine.await);
4112 auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->coroutine.await);
4113
4114 auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
4115 jit->builder->CreateCondBr(done, doneBlock, resumeBlock);
4116
4117 jit->builder->SetInsertPoint(doneBlock);
4118 jit->builder->CreateRet(llvm::ConstantInt::getFalse(i1Ty));
4119
4120 jit->builder->SetInsertPoint(resumeBlock);
4121 auto promiseAlignment = llvm::ConstantInt::get(i32Ty, 4); // TODO: Get correct alignment.
4122 auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, llvm::ConstantInt::get(i1Ty, 0) });
4123 auto promise = jit->builder->CreateLoad(jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
4124 jit->builder->CreateStore(promise, outPtr);
4125 jit->builder->CreateCall(coro_resume, { handle });
4126 jit->builder->CreateRet(llvm::ConstantInt::getTrue(i1Ty));
4127 }
4128
4129 // Build the coroutine_destroy() function:
4130 //
4131 // void coroutine_destroy(CoroutineHandle* handle)
4132 // {
4133 // llvm.coro.destroy(handle);
4134 // }
4135 //
4136 {
4137 auto handle = jit->coroutine.destroy->arg_begin();
4138 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4139 jit->builder->CreateCall(coro_destroy, { handle });
4140 jit->builder->CreateRetVoid();
4141 }
4142
4143 // Begin building the main coroutine_begin() function.
4144 //
4145 // CoroutineHandle* coroutine_begin(<Arguments>)
4146 // {
4147 // YieldType promise;
4148 // auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
4149 // void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
4150 // CoroutineHandle *handle = llvm.coro.begin(id, frame);
4151 //
4152 // ... <REACTOR CODE> ...
4153 //
4154 // end:
4155 // SuspendAction action = llvm.coro.suspend(none, true /* final */); // <-- RESUME POINT
4156 // switch(action)
4157 // {
4158 // case SuspendActionResume:
4159 // UNREACHABLE(); // Illegal to resume after final suspend.
4160 // case SuspendActionDestroy:
4161 // goto destroy;
4162 // default: // (SuspendActionSuspend)
4163 // goto suspend;
4164 // }
4165 //
4166 // destroy:
4167 // coroutine_free_frame(llvm.coro.free(id, handle));
4168 // goto suspend;
4169 //
4170 // suspend:
4171 // llvm.coro.end(handle, false);
4172 // return handle;
4173 // }
4174 //
4175
4176 #ifdef ENABLE_RR_DEBUG_INFO
4177 jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
4178 #endif // ENABLE_RR_DEBUG_INFO
4179
4180 jit->coroutine.suspendBlock = llvm::BasicBlock::Create(*jit->context, "suspend", jit->function);
4181 jit->coroutine.endBlock = llvm::BasicBlock::Create(*jit->context, "end", jit->function);
4182 jit->coroutine.destroyBlock = llvm::BasicBlock::Create(*jit->context, "destroy", jit->function);
4183
4184 jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
4185 jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
4186 jit->coroutine.id = jit->builder->CreateCall(coro_id, {
4187 llvm::ConstantInt::get(i32Ty, 0),
4188 jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
4189 llvm::ConstantPointerNull::get(i8PtrTy),
4190 llvm::ConstantPointerNull::get(i8PtrTy),
4191 });
4192 auto size = jit->builder->CreateCall(coro_size, {});
4193 auto frame = jit->builder->CreateCall(allocFrame, { size });
4194 jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });
4195
4196 // Build the suspend block
4197 jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
4198 jit->builder->CreateCall(coro_end, { jit->coroutine.handle, llvm::ConstantInt::get(i1Ty, 0) });
4199 jit->builder->CreateRet(jit->coroutine.handle);
4200
4201 // Build the end block
4202 jit->builder->SetInsertPoint(jit->coroutine.endBlock);
4203 auto action = jit->builder->CreateCall(coro_suspend, {
4204 llvm::ConstantTokenNone::get(*jit->context),
4205 llvm::ConstantInt::get(i1Ty, 1), // final: true
4206 });
4207 auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4208 // switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
4209 switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4210
4211 // Build the destroy block
4212 jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
4213 auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
4214 jit->builder->CreateCall(freeFrame, { memory });
4215 jit->builder->CreateBr(jit->coroutine.suspendBlock);
4216
4217 // Switch back to original insert point to continue building the coroutine.
4218 jit->builder->restoreIP(oldInsertionPoint);
4219 }
4220
4221 } // anonymous namespace
4222
4223 namespace rr {
4224
createCoroutine(Type * YieldType,const std::vector<Type * > & Params)4225 void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
4226 {
4227 // Coroutines are initially created as a regular function.
4228 // Upon the first call to Yield(), the function is promoted to a true
4229 // coroutine.
4230 auto voidTy = llvm::Type::getVoidTy(*jit->context);
4231 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4232 auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4233 auto handleTy = i8PtrTy;
4234 auto boolTy = i1Ty;
4235 auto promiseTy = T(YieldType);
4236 auto promisePtrTy = promiseTy->getPointerTo();
4237
4238 jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
4239 jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
4240 jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
4241 jit->coroutine.yieldType = promiseTy;
4242 jit->coroutine.entryBlock = llvm::BasicBlock::Create(*jit->context, "function", jit->function);
4243
4244 jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
4245 }
4246
yield(Value * val)4247 void Nucleus::yield(Value *val)
4248 {
4249 if(jit->coroutine.id == nullptr)
4250 {
4251 // First call to yield().
4252 // Promote the function to a full coroutine.
4253 promoteFunctionToCoroutine();
4254 ASSERT(jit->coroutine.id != nullptr);
4255 }
4256
4257 // promise = val;
4258 //
4259 // auto action = llvm.coro.suspend(none, false /* final */); // <-- RESUME POINT
4260 // switch(action)
4261 // {
4262 // case SuspendActionResume:
4263 // goto resume;
4264 // case SuspendActionDestroy:
4265 // goto destroy;
4266 // default: // (SuspendActionSuspend)
4267 // goto suspend;
4268 // }
4269 // resume:
4270 //
4271
4272 RR_DEBUG_INFO_UPDATE_LOC();
4273 Variable::materializeAll();
4274
4275 // Types
4276 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4277 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4278
4279 // Intrinsics
4280 auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4281
4282 // Create a block to resume execution.
4283 auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->function);
4284
4285 // Store the promise (yield value)
4286 jit->builder->CreateStore(V(val), jit->coroutine.promise);
4287 auto action = jit->builder->CreateCall(coro_suspend, {
4288 llvm::ConstantTokenNone::get(*jit->context),
4289 llvm::ConstantInt::get(i1Ty, 0), // final: true
4290 });
4291 auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4292 switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
4293 switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4294
4295 // Continue building in the resume block.
4296 jit->builder->SetInsertPoint(resumeBlock);
4297 }
4298
acquireCoroutine(const char * name,const Config::Edit & cfgEdit)4299 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
4300 {
4301 bool isCoroutine = jit->coroutine.id != nullptr;
4302 if(isCoroutine)
4303 {
4304 jit->builder->CreateBr(jit->coroutine.endBlock);
4305 }
4306 else
4307 {
4308 // Coroutine without a Yield acts as a regular function.
4309 // The 'coroutine_begin' function returns a nullptr for the coroutine
4310 // handle.
4311 jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4312 // The 'coroutine_await' function always returns false (coroutine done).
4313 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.await));
4314 jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
4315 // The 'coroutine_destroy' does nothing, returns void.
4316 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4317 jit->builder->CreateRetVoid();
4318 }
4319
4320 #ifdef ENABLE_RR_DEBUG_INFO
4321 if(jit->debugInfo != nullptr)
4322 {
4323 jit->debugInfo->Finalize();
4324 }
4325 #endif // ENABLE_RR_DEBUG_INFO
4326
4327 if(false)
4328 {
4329 std::error_code error;
4330 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4331 jit->module->print(file, 0);
4332 }
4333
4334 if(isCoroutine)
4335 {
4336 // Run manadory coroutine transforms.
4337 llvm::legacy::PassManager pm;
4338
4339 pm.add(llvm::createCoroEarlyLegacyPass());
4340 pm.add(llvm::createCoroSplitLegacyPass());
4341 pm.add(llvm::createCoroElideLegacyPass());
4342 pm.add(llvm::createBarrierNoopPass());
4343 pm.add(llvm::createCoroCleanupLegacyPass());
4344
4345 pm.run(*jit->module);
4346 }
4347
4348 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4349 {
4350 llvm::legacy::PassManager pm;
4351 pm.add(llvm::createVerifierPass());
4352 pm.run(*jit->module);
4353 }
4354 #endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4355
4356 auto cfg = cfgEdit.apply(jit->config);
4357 jit->optimize(cfg);
4358
4359 if(false)
4360 {
4361 std::error_code error;
4362 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4363 jit->module->print(file, 0);
4364 }
4365
4366 llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4367 funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4368 funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4369 funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4370
4371 auto routine = jit->acquireRoutine(name, funcs, Nucleus::CoroutineEntryCount, cfg);
4372
4373 delete jit;
4374 jit = nullptr;
4375
4376 return routine;
4377 }
4378
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4379 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4380 {
4381 return func();
4382 }
4383
4384 } // namespace rr
4385