1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "LLVMReactor.hpp"
16
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "EmulatedIntrinsics.hpp"
20 #include "LLVMReactorDebugInfo.hpp"
21 #include "Print.hpp"
22 #include "Reactor.hpp"
23 #include "x86.hpp"
24
25 #include "llvm/IR/Intrinsics.h"
26 #include "llvm/IR/IntrinsicsX86.h"
27 #include "llvm/IR/LegacyPassManager.h"
28 #include "llvm/IR/Verifier.h"
29 #include "llvm/Support/Alignment.h"
30 #include "llvm/Support/ManagedStatic.h"
31 #include "llvm/Transforms/Coroutines.h"
32 #include "llvm/Transforms/IPO.h"
33 #include "llvm/Transforms/Scalar.h"
34
35 #include <fstream>
36 #include <iostream>
37 #include <mutex>
38 #include <numeric>
39 #include <thread>
40 #include <unordered_map>
41
42 #if defined(__i386__) || defined(__x86_64__)
43 # include <xmmintrin.h>
44 #endif
45
46 #include <math.h>
47
48 #if defined(__x86_64__) && defined(_WIN32)
49 extern "C" void X86CompilationCallback()
50 {
51 UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
52 }
53 #endif
54
55 #if !LLVM_ENABLE_THREADS
56 # error "LLVM_ENABLE_THREADS needs to be enabled"
57 #endif
58
59 #if LLVM_VERSION_MAJOR < 11
60 namespace llvm {
61 using FixedVectorType = VectorType;
62 } // namespace llvm
63 #endif
64
65 namespace {
66
67 // Used to automatically invoke llvm_shutdown() when the driver is unloaded
68 llvm::llvm_shutdown_obj llvmShutdownObj;
69
70 // This has to be a raw pointer because glibc 2.17 doesn't support __cxa_thread_atexit_impl
71 // for destructing objects at exit. See crbug.com/1074222
72 thread_local rr::JITBuilder *jit = nullptr;
73
74 // Default configuration settings. Must be accessed under mutex lock.
75 std::mutex defaultConfigLock;
76 rr::Config &defaultConfig()
77 {
78 // This uses a static in a function to avoid the cost of a global static
79 // initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
80 static rr::Config config = rr::Config::Edit()
81 .add(rr::Optimization::Pass::ScalarReplAggregates)
82 .add(rr::Optimization::Pass::InstructionCombining)
83 .apply({});
84 return config;
85 }
86
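// Rounding average of two unsigned vectors: zero-extends to wider elements,
// computes (x + y + 1) >> 1, and truncates back (cf. x86 pavgb/pavgw).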
87 llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
88 {
89 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
90
91 llvm::VectorType *extTy =
92 llvm::VectorType::getExtendedElementVectorType(ty);
93 x = jit->builder->CreateZExt(x, extTy);
94 y = jit->builder->CreateZExt(y, extTy);
95
96 // (x + y + 1) >> 1
97 llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
98 llvm::Value *res = jit->builder->CreateAdd(x, y);
99 res = jit->builder->CreateAdd(res, one);
100 res = jit->builder->CreateLShr(res, one);
101 return jit->builder->CreateTrunc(res, ty);
102 }
103
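// Element-wise integer min/max via compare-and-select; the predicate decides
// which operand is kept (e.g. ICMP_SLT yields a signed minimum).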
104 llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
105 llvm::ICmpInst::Predicate pred)
106 {
107 return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
108 }
109
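// Integer comparison producing an all-ones/all-zeros mask per element,
// obtained by sign-extending the i1 compare result to dstTy.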
110 llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
111 llvm::Value *y, llvm::Type *dstTy)
112 {
113 return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
114 }
115
116 #if defined(__i386__) || defined(__x86_64__)
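// Takes the low N elements of 'op' (N = destination element count) and
// sign- or zero-extends them to the destination element type (cf. x86 pmovsx/pmovzx).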
117 llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
118 {
119 llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
120 llvm::FixedVectorType *dstTy = llvm::cast<llvm::FixedVectorType>(dstType);
121
122 llvm::Value *undef = llvm::UndefValue::get(srcTy);
123 llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
124 std::iota(mask.begin(), mask.end(), 0);
125 llvm::Value *v = jit->builder->CreateShuffleVector(op, undef, mask);
126
127 return sext ? jit->builder->CreateSExt(v, dstTy)
128 : jit->builder->CreateZExt(v, dstTy);
129 }
130
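// Per-element absolute value: selects v where v > 0, otherwise -v.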
131 llvm::Value *lowerPABS(llvm::Value *v)
132 {
133 llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
134 llvm::Value *cmp = jit->builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero);
135 llvm::Value *neg = jit->builder->CreateNeg(v);
136 return jit->builder->CreateSelect(cmp, v, neg);
137 }
138 #endif // defined(__i386__) || defined(__x86_64__)
139
140 #if !defined(__i386__) && !defined(__x86_64__)
141 llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
142 llvm::FCmpInst::Predicate pred)
143 {
144 return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
145 }
146
147 llvm::Value *lowerRound(llvm::Value *x)
148 {
149 llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
150 jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
151 return jit->builder->CreateCall(nearbyint, { x });
152 }
153
154 llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
155 {
156 return jit->builder->CreateFPToSI(lowerRound(x), ty);
157 }
158
159 llvm::Value *lowerFloor(llvm::Value *x)
160 {
161 llvm::Function *floor = llvm::Intrinsic::getDeclaration(
162 jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
163 return jit->builder->CreateCall(floor, { x });
164 }
165
166 llvm::Value *lowerTrunc(llvm::Value *x)
167 {
168 llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
169 jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
170 return jit->builder->CreateCall(trunc, { x });
171 }
172
173 llvm::Value *lowerSQRT(llvm::Value *x)
174 {
175 llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
176 jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
177 return jit->builder->CreateCall(sqrt, { x });
178 }
179
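// Computes the reciprocal as an exact 1.0 / x division, splatting the
// constant 1.0 when the operand is a vector.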
180 llvm::Value *lowerRCP(llvm::Value *x)
181 {
182 llvm::Type *ty = x->getType();
183 llvm::Constant *one;
184 if(llvm::FixedVectorType *vectorTy = llvm::dyn_cast<llvm::FixedVectorType>(ty))
185 {
186 one = llvm::ConstantVector::getSplat(
187 # if LLVM_VERSION_MAJOR >= 11
188 vectorTy->getElementCount(),
189 # else
190 vectorTy->getNumElements(),
191 # endif
192 llvm::ConstantFP::get(vectorTy->getElementType(), 1));
193 }
194 else
195 {
196 one = llvm::ConstantFP::get(ty, 1);
197 }
198 return jit->builder->CreateFDiv(one, x);
199 }
200
201 llvm::Value *lowerRSQRT(llvm::Value *x)
202 {
203 return lowerRCP(lowerSQRT(x));
204 }
205
206 llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
207 {
208 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
209 llvm::Value *y = llvm::ConstantVector::getSplat(
210 # if LLVM_VERSION_MAJOR >= 11
211 ty->getElementCount(),
212 # else
213 ty->getNumElements(),
214 # endif
215 llvm::ConstantInt::get(ty->getElementType(), scalarY));
216 return jit->builder->CreateShl(x, y);
217 }
218
219 llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
220 {
221 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
222 llvm::Value *y = llvm::ConstantVector::getSplat(
223 # if LLVM_VERSION_MAJOR >= 11
224 ty->getElementCount(),
225 # else
226 ty->getNumElements(),
227 # endif
228 llvm::ConstantInt::get(ty->getElementType(), scalarY));
229 return jit->builder->CreateAShr(x, y);
230 }
231
232 llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
233 {
234 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
235 llvm::Value *y = llvm::ConstantVector::getSplat(
236 # if LLVM_VERSION_MAJOR >= 11
237 ty->getElementCount(),
238 # else
239 ty->getNumElements(),
240 # endif
241 llvm::ConstantInt::get(ty->getElementType(), scalarY));
242 return jit->builder->CreateLShr(x, y);
243 }
244
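// Multiply-add in the style of x86 pmaddwd: sign-extends both operands,
// multiplies element-wise, then sums each adjacent (even, odd) pair of
// products into one wider result element.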
245 llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
246 {
247 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
248 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
249
250 llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
251 llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
252 llvm::Value *mult = jit->builder->CreateMul(extX, extY);
253
254 llvm::Value *undef = llvm::UndefValue::get(extTy);
255
256 llvm::SmallVector<uint32_t, 16> evenIdx;
257 llvm::SmallVector<uint32_t, 16> oddIdx;
258 for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
259 {
260 evenIdx.push_back(i);
261 oddIdx.push_back(i + 1);
262 }
263
264 llvm::Value *lhs = jit->builder->CreateShuffleVector(mult, undef, evenIdx);
265 llvm::Value *rhs = jit->builder->CreateShuffleVector(mult, undef, oddIdx);
266 return jit->builder->CreateAdd(lhs, rhs);
267 }
268
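// Saturating narrowing pack: clamps both operands to the destination element
// range, truncates, and concatenates them into one vector (cf. x86 packss/packus).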
269 llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
270 {
271 llvm::FixedVectorType *srcTy = llvm::cast<llvm::FixedVectorType>(x->getType());
272 llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
273
274 llvm::IntegerType *dstElemTy =
275 llvm::cast<llvm::IntegerType>(dstTy->getElementType());
276
277 uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
278 ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
279 llvm::Constant *max, *min;
280 if(isSigned)
281 {
282 max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
283 min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
284 }
285 else
286 {
287 max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
288 min = llvm::ConstantInt::get(srcTy, 0, false);
289 }
290
291 x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
292 x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
293 y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
294 y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
295
296 x = jit->builder->CreateTrunc(x, dstTy);
297 y = jit->builder->CreateTrunc(y, dstTy);
298
299 llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
300 std::iota(index.begin(), index.end(), 0);
301
302 return jit->builder->CreateShuffleVector(x, y, index);
303 }
304
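// Builds a scalar bit mask from the per-element sign bits (bit i is set when
// element i is negative), similar to x86 movmskps/pmovmskb.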
305 llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
306 {
307 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
308 llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
309 llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
310
311 llvm::Value *ret = jit->builder->CreateZExt(
312 jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
313 for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
314 {
315 llvm::Value *elem = jit->builder->CreateZExt(
316 jit->builder->CreateExtractElement(cmp, i), retTy);
317 ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
318 }
319 return ret;
320 }
321
322 llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
323 {
324 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
325 llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
326 llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
327
328 llvm::Value *ret = jit->builder->CreateZExt(
329 jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
330 for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
331 {
332 llvm::Value *elem = jit->builder->CreateZExt(
333 jit->builder->CreateExtractElement(cmp, i), retTy);
334 ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
335 }
336 return ret;
337 }
338 #endif // !defined(__i386__) && !defined(__x86_64__)
339
340 llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
341 {
342 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
343 }
344
345 llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
346 {
347 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
348 }
349
350 llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
351 {
352 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
353 }
354
355 llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
356 {
357 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
358 }
359
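// Returns the high half of each element-wise product: widens the operands
// (signed or unsigned as requested), multiplies, shifts right by the original
// element bit width, and truncates back to the source type.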
360 llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
361 {
362 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
363 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
364
365 llvm::Value *extX, *extY;
366 if(sext)
367 {
368 extX = jit->builder->CreateSExt(x, extTy);
369 extY = jit->builder->CreateSExt(y, extTy);
370 }
371 else
372 {
373 extX = jit->builder->CreateZExt(x, extTy);
374 extY = jit->builder->CreateZExt(y, extTy);
375 }
376
377 llvm::Value *mult = jit->builder->CreateMul(extX, extY);
378
379 llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
380 llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
381 return jit->builder->CreateTrunc(mulh, ty);
382 }
383
384 } // namespace
385
386 namespace rr {
387
388 std::string BackendName()
389 {
390 return std::string("LLVM ") + LLVM_VERSION_STRING;
391 }
392
393 const Capabilities Caps = {
394 true, // CoroutinesSupported
395 };
396
397 // The abstract Type* types are implemented as LLVM types, except that
398 // 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
399 // and VFP in ARM, and eliminate the overhead of converting them to explicit
400 // 128-bit ones. LLVM types are pointers, so we can represent emulated types
401 // as abstract pointers with small enum values.
402 enum InternalType : uintptr_t
403 {
404 // Emulated types:
405 Type_v2i32,
406 Type_v4i16,
407 Type_v2i16,
408 Type_v8i8,
409 Type_v4i8,
410 Type_v2f32,
411 EmulatedTypeCount,
412 // Returned by asInternalType() to indicate that the abstract Type*
413 // should be interpreted as LLVM type pointer:
414 Type_LLVM
415 };
416
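// Small pointer values encode one of the emulated vector types above;
// anything else is interpreted as a genuine llvm::Type pointer.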
417 inline InternalType asInternalType(Type *type)
418 {
419 InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
420 return (t < EmulatedTypeCount) ? t : Type_LLVM;
421 }
422
423 llvm::Type *T(Type *t)
424 {
425 // Use 128-bit vectors to implement logically shorter ones.
426 switch(asInternalType(t))
427 {
428 case Type_v2i32: return T(Int4::type());
429 case Type_v4i16: return T(Short8::type());
430 case Type_v2i16: return T(Short8::type());
431 case Type_v8i8: return T(Byte16::type());
432 case Type_v4i8: return T(Byte16::type());
433 case Type_v2f32: return T(Float4::type());
434 case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
435 default:
436 UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
437 return nullptr;
438 }
439 }
440
441 Type *T(InternalType t)
442 {
443 return reinterpret_cast<Type *>(t);
444 }
445
446 inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
447 {
448 return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
449 }
450
451 inline llvm::BasicBlock *B(BasicBlock *t)
452 {
453 return reinterpret_cast<llvm::BasicBlock *>(t);
454 }
455
456 inline BasicBlock *B(llvm::BasicBlock *t)
457 {
458 return reinterpret_cast<BasicBlock *>(t);
459 }
460
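// Returns the size in bytes of an abstract Type, using the logical
// (pre-emulation) size for the emulated narrow vector types.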
461 static size_t typeSize(Type *type)
462 {
463 switch(asInternalType(type))
464 {
465 case Type_v2i32: return 8;
466 case Type_v4i16: return 8;
467 case Type_v2i16: return 4;
468 case Type_v8i8: return 8;
469 case Type_v4i8: return 4;
470 case Type_v2f32: return 8;
471 case Type_LLVM:
472 {
473 llvm::Type *t = T(type);
474
475 if(t->isPointerTy())
476 {
477 return sizeof(void *);
478 }
479
480 // At this point we should only have LLVM 'primitive' types.
481 unsigned int bits = t->getPrimitiveSizeInBits();
482 ASSERT_MSG(bits != 0, "bits: %d", int(bits));
483
484 // TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
485 // but are typically stored as one byte. The DataLayout structure should
486 // be used here and many other places if this assumption fails.
487 return (bits + 7) / 8;
488 }
489 break;
490 default:
491 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
492 return 0;
493 }
494 }
495
496 static unsigned int elementCount(Type *type)
497 {
498 switch(asInternalType(type))
499 {
500 case Type_v2i32: return 2;
501 case Type_v4i16: return 4;
502 case Type_v2i16: return 2;
503 case Type_v8i8: return 8;
504 case Type_v4i8: return 4;
505 case Type_v2f32: return 2;
506 case Type_LLVM: return llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();
507 default:
508 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
509 return 0;
510 }
511 }
512
513 static llvm::Function *createFunction(const char *name, llvm::Type *retTy, const std::vector<llvm::Type *> &params)
514 {
515 llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
516 auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
517
518 func->setLinkage(llvm::GlobalValue::ExternalLinkage);
519 func->setDoesNotThrow();
520 func->setCallingConv(llvm::CallingConv::C);
521
522 if(__has_feature(memory_sanitizer))
523 {
524 func->addFnAttr(llvm::Attribute::SanitizeMemory);
525 }
526
527 return func;
528 }
529
530 Nucleus::Nucleus()
531 {
532 #if !__has_feature(memory_sanitizer)
533 // thread_local variables in shared libraries are initialized at load-time,
534 // but this is not observed by MemorySanitizer if the loader itself was not
535 // instrumented, leading to false-positive uninitialized variable errors.
536 ASSERT(jit == nullptr);
537 ASSERT(Variable::unmaterializedVariables == nullptr);
538 #endif
539
540 jit = new JITBuilder(Nucleus::getDefaultConfig());
541 Variable::unmaterializedVariables = new Variable::UnmaterializedVariables();
542 }
543
544 Nucleus::~Nucleus()
545 {
546 delete Variable::unmaterializedVariables;
547 Variable::unmaterializedVariables = nullptr;
548
549 delete jit;
550 jit = nullptr;
551 }
552
553 void Nucleus::setDefaultConfig(const Config &cfg)
554 {
555 std::unique_lock<std::mutex> lock(::defaultConfigLock);
556 ::defaultConfig() = cfg;
557 }
558
559 void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
560 {
561 std::unique_lock<std::mutex> lock(::defaultConfigLock);
562 auto &config = ::defaultConfig();
563 config = cfgEdit.apply(config);
564 }
565
566 Config Nucleus::getDefaultConfig()
567 {
568 std::unique_lock<std::mutex> lock(::defaultConfigLock);
569 return ::defaultConfig();
570 }
571
572 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit *cfgEdit /* = nullptr */)
573 {
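	// If the current basic block has no terminator yet, emit an implicit
	// return so the function is well-formed before verification and optimization.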
574 if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
575 {
576 llvm::Type *type = jit->function->getReturnType();
577
578 if(type->isVoidTy())
579 {
580 createRetVoid();
581 }
582 else
583 {
584 createRet(V(llvm::UndefValue::get(type)));
585 }
586 }
587
588 std::shared_ptr<Routine> routine;
589
590 auto acquire = [&](rr::JITBuilder *jit) {
591 // ::jit is thread-local, so when this is executed on a separate thread (see JIT_IN_SEPARATE_THREAD)
592 // it needs to only use the jit variable passed in as an argument.
593
594 Config cfg = jit->config;
595 if(cfgEdit)
596 {
597 cfg = cfgEdit->apply(jit->config);
598 }
599
600 #ifdef ENABLE_RR_DEBUG_INFO
601 if(jit->debugInfo != nullptr)
602 {
603 jit->debugInfo->Finalize();
604 }
605 #endif // ENABLE_RR_DEBUG_INFO
606
607 if(false)
608 {
609 std::error_code error;
610 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
611 jit->module->print(file, 0);
612 }
613
614 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
615 {
616 llvm::legacy::PassManager pm;
617 pm.add(llvm::createVerifierPass());
618 pm.run(*jit->module);
619 }
620 #endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
621
622 jit->optimize(cfg);
623
624 if(false)
625 {
626 std::error_code error;
627 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
628 jit->module->print(file, 0);
629 }
630
631 routine = jit->acquireRoutine(name, &jit->function, 1, cfg);
632 };
633
634 #ifdef JIT_IN_SEPARATE_THREAD
635 // Perform optimizations and codegen in a separate thread to avoid stack overflow.
636 // FIXME(b/149829034): This is not a long-term solution. Reactor has no control
637 // over the threading and stack sizes of its users, so this should be addressed
638 // at a higher level instead.
639 std::thread thread(acquire, jit);
640 thread.join();
641 #else
642 acquire(jit);
643 #endif
644
645 return routine;
646 }
647
648 Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
649 {
650 // Need to allocate it in the entry block for mem2reg to work
651 llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
652
653 llvm::Instruction *declaration;
654
655 #if LLVM_VERSION_MAJOR >= 11
656 auto align = jit->module->getDataLayout().getPrefTypeAlign(T(type));
657 #else
658 auto align = llvm::MaybeAlign(jit->module->getDataLayout().getPrefTypeAlignment(T(type)));
659 #endif
660
661 if(arraySize)
662 {
663 Value *size = (sizeof(size_t) == 8) ? Nucleus::createConstantLong(arraySize) : Nucleus::createConstantInt(arraySize);
664 declaration = new llvm::AllocaInst(T(type), 0, V(size), align);
665 }
666 else
667 {
668 declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr, align);
669 }
670
671 entryBlock.getInstList().push_front(declaration);
672
673 return V(declaration);
674 }
675
676 BasicBlock *Nucleus::createBasicBlock()
677 {
678 return B(llvm::BasicBlock::Create(*jit->context, "", jit->function));
679 }
680
681 BasicBlock *Nucleus::getInsertBlock()
682 {
683 return B(jit->builder->GetInsertBlock());
684 }
685
686 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
687 {
688 // assert(jit->builder->GetInsertBlock()->back().isTerminator());
689
690 jit->builder->SetInsertPoint(B(basicBlock));
691 }
692
693 void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
694 {
695 jit->function = rr::createFunction("", T(ReturnType), T(Params));
696
697 #ifdef ENABLE_RR_DEBUG_INFO
698 jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
699 #endif // ENABLE_RR_DEBUG_INFO
700
701 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->function));
702 }
703
704 Value *Nucleus::getArgument(unsigned int index)
705 {
706 llvm::Function::arg_iterator args = jit->function->arg_begin();
707
708 while(index)
709 {
710 args++;
711 index--;
712 }
713
714 return V(&*args);
715 }
716
717 void Nucleus::createRetVoid()
718 {
719 RR_DEBUG_INFO_UPDATE_LOC();
720
721 ASSERT_MSG(jit->function->getReturnType() == T(Void::type()), "Return type mismatch");
722
723 // Code generated after this point is unreachable, so any variables
724 // being read can safely return an undefined value. We have to avoid
725 // materializing variables after the terminator ret instruction.
726 Variable::killUnmaterialized();
727
728 jit->builder->CreateRetVoid();
729 }
730
731 void Nucleus::createRet(Value *v)
732 {
733 RR_DEBUG_INFO_UPDATE_LOC();
734
735 ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
736
737 // Code generated after this point is unreachable, so any variables
738 // being read can safely return an undefined value. We have to avoid
739 // materializing variables after the terminator ret instruction.
740 Variable::killUnmaterialized();
741
742 jit->builder->CreateRet(V(v));
743 }
744
745 void Nucleus::createBr(BasicBlock *dest)
746 {
747 RR_DEBUG_INFO_UPDATE_LOC();
748 Variable::materializeAll();
749
750 jit->builder->CreateBr(B(dest));
751 }
752
753 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
754 {
755 RR_DEBUG_INFO_UPDATE_LOC();
756 Variable::materializeAll();
757 jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
758 }
759
760 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
761 {
762 RR_DEBUG_INFO_UPDATE_LOC();
763 return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
764 }
765
766 Value *Nucleus::createSub(Value *lhs, Value *rhs)
767 {
768 RR_DEBUG_INFO_UPDATE_LOC();
769 return V(jit->builder->CreateSub(V(lhs), V(rhs)));
770 }
771
772 Value *Nucleus::createMul(Value *lhs, Value *rhs)
773 {
774 RR_DEBUG_INFO_UPDATE_LOC();
775 return V(jit->builder->CreateMul(V(lhs), V(rhs)));
776 }
777
778 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
779 {
780 RR_DEBUG_INFO_UPDATE_LOC();
781 return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
782 }
783
784 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
785 {
786 RR_DEBUG_INFO_UPDATE_LOC();
787 return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
788 }
789
790 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
791 {
792 RR_DEBUG_INFO_UPDATE_LOC();
793 return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
794 }
795
796 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
797 {
798 RR_DEBUG_INFO_UPDATE_LOC();
799 return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
800 }
801
802 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
803 {
804 RR_DEBUG_INFO_UPDATE_LOC();
805 return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
806 }
807
808 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
809 {
810 RR_DEBUG_INFO_UPDATE_LOC();
811 return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
812 }
813
814 Value *Nucleus::createURem(Value *lhs, Value *rhs)
815 {
816 RR_DEBUG_INFO_UPDATE_LOC();
817 return V(jit->builder->CreateURem(V(lhs), V(rhs)));
818 }
819
820 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
821 {
822 RR_DEBUG_INFO_UPDATE_LOC();
823 return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
824 }
825
826 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
827 {
828 RR_DEBUG_INFO_UPDATE_LOC();
829 return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
830 }
831
832 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
833 {
834 return RValue<Float4>(Nucleus::createFRem(lhs.value(), rhs.value()));
835 }
836
837 Value *Nucleus::createShl(Value *lhs, Value *rhs)
838 {
839 RR_DEBUG_INFO_UPDATE_LOC();
840 return V(jit->builder->CreateShl(V(lhs), V(rhs)));
841 }
842
843 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
844 {
845 RR_DEBUG_INFO_UPDATE_LOC();
846 return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
847 }
848
849 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
850 {
851 RR_DEBUG_INFO_UPDATE_LOC();
852 return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
853 }
854
855 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
856 {
857 RR_DEBUG_INFO_UPDATE_LOC();
858 return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
859 }
860
861 Value *Nucleus::createOr(Value *lhs, Value *rhs)
862 {
863 RR_DEBUG_INFO_UPDATE_LOC();
864 return V(jit->builder->CreateOr(V(lhs), V(rhs)));
865 }
866
867 Value *Nucleus::createXor(Value *lhs, Value *rhs)
868 {
869 RR_DEBUG_INFO_UPDATE_LOC();
870 return V(jit->builder->CreateXor(V(lhs), V(rhs)));
871 }
872
873 Value *Nucleus::createNeg(Value *v)
874 {
875 RR_DEBUG_INFO_UPDATE_LOC();
876 return V(jit->builder->CreateNeg(V(v)));
877 }
878
879 Value *Nucleus::createFNeg(Value *v)
880 {
881 RR_DEBUG_INFO_UPDATE_LOC();
882 return V(jit->builder->CreateFNeg(V(v)));
883 }
884
885 Value *Nucleus::createNot(Value *v)
886 {
887 RR_DEBUG_INFO_UPDATE_LOC();
888 return V(jit->builder->CreateNot(V(v)));
889 }
890
891 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
892 {
893 RR_DEBUG_INFO_UPDATE_LOC();
894 switch(asInternalType(type))
895 {
896 case Type_v2i32:
897 case Type_v4i16:
898 case Type_v8i8:
899 case Type_v2f32:
900 return createBitCast(
901 createInsertElement(
902 V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false))),
903 createLoad(createBitCast(ptr, Pointer<Long>::type()), Long::type(), isVolatile, alignment, atomic, memoryOrder),
904 0),
905 type);
906 case Type_v2i16:
907 case Type_v4i8:
908 if(alignment != 0) // Not a local variable (all vectors are 128-bit).
909 {
910 Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false)));
911 Value *i = createLoad(createBitCast(ptr, Pointer<Int>::type()), Int::type(), isVolatile, alignment, atomic, memoryOrder);
912 i = createZExt(i, Long::type());
913 Value *v = createInsertElement(u, i, 0);
914 return createBitCast(v, type);
915 }
916 // Fallthrough to non-emulated case.
917 case Type_LLVM:
918 {
919 auto elTy = T(type);
920 ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
921
922 if(!atomic)
923 {
924 return V(jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile));
925 }
926 else if(elTy->isIntegerTy() || elTy->isPointerTy())
927 {
928 // Integers and pointers can be atomically loaded by setting
929 // the ordering constraint on the load instruction.
930 auto load = jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile);
931 load->setAtomic(atomicOrdering(atomic, memoryOrder));
932 return V(load);
933 }
934 else if(elTy->isFloatTy() || elTy->isDoubleTy())
935 {
936 // LLVM claims to support atomic loads of float types as
937 // above, but certain backends cannot deal with this.
938 // Load as an integer and bitcast. See b/136037244.
939 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
940 auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
941 auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
942 auto load = jit->builder->CreateAlignedLoad(elAsIntTy, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
943 load->setAtomic(atomicOrdering(atomic, memoryOrder));
944 auto loadCast = jit->builder->CreateBitCast(load, elTy);
945 return V(loadCast);
946 }
947 else
948 {
949 // More exotic types require falling back to the extern:
950 // void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
951 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
952 auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
953 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
954 auto i8PtrTy = i8Ty->getPointerTo();
955 auto voidTy = llvm::Type::getVoidTy(*jit->context);
956 auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
957 auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
958 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
959 auto out = allocateStackVariable(type);
960 jit->builder->CreateCall(func, {
961 llvm::ConstantInt::get(sizetTy, size),
962 jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
963 jit->builder->CreatePointerCast(V(out), i8PtrTy),
964 llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
965 });
966 return V(jit->builder->CreateLoad(T(type), V(out)));
967 }
968 }
969 default:
970 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
971 return nullptr;
972 }
973 }
974
975 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
976 {
977 RR_DEBUG_INFO_UPDATE_LOC();
978 switch(asInternalType(type))
979 {
980 case Type_v2i32:
981 case Type_v4i16:
982 case Type_v8i8:
983 case Type_v2f32:
984 createStore(
985 createExtractElement(
986 createBitCast(value, T(llvm::VectorType::get(T(Long::type()), 2, false))), Long::type(), 0),
987 createBitCast(ptr, Pointer<Long>::type()),
988 Long::type(), isVolatile, alignment, atomic, memoryOrder);
989 return value;
990 case Type_v2i16:
991 case Type_v4i8:
992 if(alignment != 0) // Not a local variable (all vectors are 128-bit).
993 {
994 createStore(
995 createExtractElement(createBitCast(value, Int4::type()), Int::type(), 0),
996 createBitCast(ptr, Pointer<Int>::type()),
997 Int::type(), isVolatile, alignment, atomic, memoryOrder);
998 return value;
999 }
1000 // Fallthrough to non-emulated case.
1001 case Type_LLVM:
1002 {
1003 auto elTy = T(type);
1004 ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
1005
1006 if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
1007 {
1008 // Mark all memory writes as initialized by calling __msan_unpoison
1009 // void __msan_unpoison(const volatile void *a, size_t size)
1010 auto voidTy = llvm::Type::getVoidTy(*jit->context);
1011 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1012 auto voidPtrTy = i8Ty->getPointerTo();
1013 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1014 auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1015 auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1016 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1017
1018 jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
1019 llvm::ConstantInt::get(sizetTy, size) });
1020 }
1021
1022 if(!atomic)
1023 {
1024 jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1025 }
1026 else if(elTy->isIntegerTy() || elTy->isPointerTy())
1027 {
1028 // Integers and pointers can be atomically stored by setting
1029 // the ordering constraint on the store instruction.
1030 auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1031 store->setAtomic(atomicOrdering(atomic, memoryOrder));
1032 }
1033 else if(elTy->isFloatTy() || elTy->isDoubleTy())
1034 {
1035 // LLVM claims to support atomic stores of float types as
1036 // above, but certain backends cannot deal with this.
1037 // Store as a bitcast integer. See b/136037244.
1038 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1039 auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
1040 auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
1041 auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
1042 auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
1043 store->setAtomic(atomicOrdering(atomic, memoryOrder));
1044 }
1045 else
1046 {
1047 // More exotic types require falling back to the extern:
1048 // void __atomic_store(size_t size, void *ptr, void *val, int ordering)
1049 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1050 auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
1051 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1052 auto i8PtrTy = i8Ty->getPointerTo();
1053 auto voidTy = llvm::Type::getVoidTy(*jit->context);
1054 auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1055 auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
1056 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1057 auto copy = allocateStackVariable(type);
1058 jit->builder->CreateStore(V(value), V(copy));
1059 jit->builder->CreateCall(func, {
1060 llvm::ConstantInt::get(sizetTy, size),
1061 jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1062 jit->builder->CreatePointerCast(V(copy), i8PtrTy),
1063 llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1064 });
1065 }
1066
1067 return value;
1068 }
1069 default:
1070 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1071 return nullptr;
1072 }
1073 }
1074
1075 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1076 {
1077 RR_DEBUG_INFO_UPDATE_LOC();
1078
1079 ASSERT(V(ptr)->getType()->isPointerTy());
1080 ASSERT(V(mask)->getType()->isVectorTy());
1081
1082 auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1083 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1084 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1085 auto elVecTy = llvm::VectorType::get(T(elTy), numEls, false);
1086 auto elVecPtrTy = elVecTy->getPointerTo();
1087 auto i8Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1088 auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1089 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1090 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1091 return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1092 }
1093
1094 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
1095 {
1096 RR_DEBUG_INFO_UPDATE_LOC();
1097
1098 ASSERT(V(ptr)->getType()->isPointerTy());
1099 ASSERT(V(val)->getType()->isVectorTy());
1100 ASSERT(V(mask)->getType()->isVectorTy());
1101
1102 auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1103 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1104 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1105 auto elVecTy = V(val)->getType();
1106 auto elVecPtrTy = elVecTy->getPointerTo();
1107 auto i1Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1108 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1109 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
1110 jit->builder->CreateCall(func, { V(val), V(ptr), align, i1Mask });
1111
1112 if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
1113 {
1114 // Mark memory writes as initialized by calling __msan_unpoison
1115 // void __msan_unpoison(const volatile void *a, size_t size)
1116 auto voidTy = llvm::Type::getVoidTy(*jit->context);
1117 auto voidPtrTy = voidTy->getPointerTo();
1118 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1119 auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1120 auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1121 auto size = jit->module->getDataLayout().getTypeStoreSize(llvm::cast<llvm::VectorType>(elVecTy)->getElementType());
1122
1123 for(unsigned i = 0; i < numEls; i++)
1124 {
1125 // Check mask for this element
1126 auto idx = llvm::ConstantInt::get(i32Ty, i);
1127 auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1128 auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1129 jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1130 jit->builder->SetInsertPoint(thenBlock);
1131
1132 // Insert __msan_unpoison call in conditional block
1133 auto elPtr = jit->builder->CreateGEP(elVecTy, V(ptr), idx);
1134 jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(elPtr, voidPtrTy),
1135 llvm::ConstantInt::get(sizetTy, size) });
1136
1137 jit->builder->CreateBr(mergeBlock);
1138 jit->builder->SetInsertPoint(mergeBlock);
1139 }
1140 }
1141 }
1142
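// Emulated gather: builds a vector of per-lane pointers (base + byte offset)
// and issues llvm.masked.gather, or falls back to per-lane conditional loads
// when building under MemorySanitizer.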
1143 static llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1144 {
1145 ASSERT(base->getType()->isPointerTy());
1146 ASSERT(offsets->getType()->isVectorTy());
1147 ASSERT(mask->getType()->isVectorTy());
1148
1149 auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1150 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1151 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1152 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1153 auto i8PtrTy = i8Ty->getPointerTo();
1154 auto elPtrTy = elTy->getPointerTo();
1155 auto elVecTy = llvm::VectorType::get(elTy, numEls, false);
1156 auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1157 auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1158 auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1159 auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1160 auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1161 auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1162
1163 if(!__has_feature(memory_sanitizer))
1164 {
1165 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1166 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
1167 return jit->builder->CreateCall(func, { elPtrs, align, i1Mask, passthrough });
1168 }
1169 else // __has_feature(memory_sanitizer)
1170 {
1171 // MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_gather
1172 // Work around it by emulating gather with element-wise loads.
1173 // TODO(b/172238865): Remove when supported by MemorySanitizer.
1174
1175 Value *result = Nucleus::allocateStackVariable(T(elVecTy));
1176 Nucleus::createStore(V(passthrough), result, T(elVecTy));
1177
1178 for(unsigned i = 0; i < numEls; i++)
1179 {
1180 // Check mask for this element
1181 Value *elementMask = Nucleus::createExtractElement(V(i1Mask), T(i1Ty), i);
1182
1183 If(RValue<Bool>(elementMask))
1184 {
1185 Value *elPtr = Nucleus::createExtractElement(V(elPtrs), T(elPtrTy), i);
1186 Value *el = Nucleus::createLoad(elPtr, T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1187
1188 Value *v = Nucleus::createLoad(result, T(elVecTy));
1189 v = Nucleus::createInsertElement(v, el, i);
1190 Nucleus::createStore(v, result, T(elVecTy));
1191 }
1192 }
1193
1194 return V(Nucleus::createLoad(result, T(elVecTy)));
1195 }
1196 }
1197
1198 RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1199 {
1200 return As<Float4>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1201 }
1202
1203 RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1204 {
1205 return As<Int4>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1206 }
1207
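// Scatter counterpart of createGather: stores each unmasked lane of 'val' to
// base + offsets[lane], via llvm.masked.scatter or per-lane conditional stores
// under MemorySanitizer.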
1208 static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
1209 {
1210 ASSERT(base->getType()->isPointerTy());
1211 ASSERT(val->getType()->isVectorTy());
1212 ASSERT(offsets->getType()->isVectorTy());
1213 ASSERT(mask->getType()->isVectorTy());
1214
1215 auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1216 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1217 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1218 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1219 auto i8PtrTy = i8Ty->getPointerTo();
1220 auto elVecTy = val->getType();
1221 auto elTy = llvm::cast<llvm::VectorType>(elVecTy)->getElementType();
1222 auto elPtrTy = elTy->getPointerTo();
1223 auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1224
1225 auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1226 auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1227 auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1228 auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1229
1230 if(!__has_feature(memory_sanitizer))
1231 {
1232 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1233 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
1234 jit->builder->CreateCall(func, { val, elPtrs, align, i1Mask });
1235 }
1236 else // __has_feature(memory_sanitizer)
1237 {
1238 // MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_scatter
1239 // Work around it by emulating scatter with element-wise stores.
1240 // TODO(b/172238865): Remove when supported by MemorySanitizer.
1241
1242 for(unsigned i = 0; i < numEls; i++)
1243 {
1244 // Check mask for this element
1245 auto idx = llvm::ConstantInt::get(i32Ty, i);
1246 auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1247 auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1248 jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1249 jit->builder->SetInsertPoint(thenBlock);
1250
1251 auto el = jit->builder->CreateExtractElement(val, idx);
1252 auto elPtr = jit->builder->CreateExtractElement(elPtrs, idx);
1253 Nucleus::createStore(V(el), V(elPtr), T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1254
1255 jit->builder->CreateBr(mergeBlock);
1256 jit->builder->SetInsertPoint(mergeBlock);
1257 }
1258 }
1259 }
1260
1261 void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1262 {
1263 return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1264 }
1265
1266 void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1267 {
1268 return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1269 }
1270
1271 void Nucleus::createFence(std::memory_order memoryOrder)
1272 {
1273 RR_DEBUG_INFO_UPDATE_LOC();
1274 jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1275 }
1276
1277 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1278 {
1279 RR_DEBUG_INFO_UPDATE_LOC();
1280 ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
1281 if(sizeof(void *) == 8)
1282 {
1283 // LLVM manual: "When indexing into an array, pointer or vector,
1284 // integers of any width are allowed, and they are not required to
1285 // be constant. These integers are treated as signed values where
1286 // relevant."
1287 //
1288 // Thus if we want indexes to be treated as unsigned we have to
1289 // zero-extend them ourselves.
1290 //
1291 // Note that this is not because we want to address anywhere near
1292 // 4 GB of data. Instead this is important for performance because
1293 // x86 supports automatic zero-extending of 32-bit registers to
1294 // 64-bit. Thus indexing into an array using a uint32 is
1295 // actually faster than using an int32.
1296 index = unsignedIndex ? createZExt(index, Long::type()) : createSExt(index, Long::type());
1297 }
1298
1299 // For non-emulated types we can rely on LLVM's GEP to calculate the
1300 // effective address correctly.
1301 if(asInternalType(type) == Type_LLVM)
1302 {
1303 return V(jit->builder->CreateGEP(T(type), V(ptr), V(index)));
1304 }
1305
1306 // For emulated types we have to multiply the index by the intended
1307 // type size ourselves to obtain the byte offset.
1308 index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));
1309
1310 // Cast to a byte pointer, apply the byte offset, and cast back to the
1311 // original pointer type.
1312 return createBitCast(
1313 V(jit->builder->CreateGEP(T(Byte::type()), V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::type()), 0)))), V(index))),
1314 T(llvm::PointerType::get(T(type), 0)));
1315 }
1316
1317 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1318 {
1319 RR_DEBUG_INFO_UPDATE_LOC();
1320 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value),
1321 #if LLVM_VERSION_MAJOR >= 11
1322 llvm::MaybeAlign(),
1323 #endif
1324 atomicOrdering(true, memoryOrder)));
1325 }
1326
1327 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1328 {
1329 RR_DEBUG_INFO_UPDATE_LOC();
1330 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value),
1331 #if LLVM_VERSION_MAJOR >= 11
1332 llvm::MaybeAlign(),
1333 #endif
1334 atomicOrdering(true, memoryOrder)));
1335 }
1336
1337 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1338 {
1339 RR_DEBUG_INFO_UPDATE_LOC();
1340 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value),
1341 #if LLVM_VERSION_MAJOR >= 11
1342 llvm::MaybeAlign(),
1343 #endif
1344 atomicOrdering(true, memoryOrder)));
1345 }
1346
1347 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1348 {
1349 RR_DEBUG_INFO_UPDATE_LOC();
1350 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value),
1351 #if LLVM_VERSION_MAJOR >= 11
1352 llvm::MaybeAlign(),
1353 #endif
1354 atomicOrdering(true, memoryOrder)));
1355 }
1356
1357 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1358 {
1359 RR_DEBUG_INFO_UPDATE_LOC();
1360 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value),
1361 #if LLVM_VERSION_MAJOR >= 11
1362 llvm::MaybeAlign(),
1363 #endif
1364 atomicOrdering(true, memoryOrder)));
1365 }
1366
1367 Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1368 {
1369 RR_DEBUG_INFO_UPDATE_LOC();
1370 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value),
1371 #if LLVM_VERSION_MAJOR >= 11
1372 llvm::MaybeAlign(),
1373 #endif
1374 atomicOrdering(true, memoryOrder)));
1375 }
1376
1377 Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1378 {
1379 RR_DEBUG_INFO_UPDATE_LOC();
1380 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value),
1381 #if LLVM_VERSION_MAJOR >= 11
1382 llvm::MaybeAlign(),
1383 #endif
1384 atomicOrdering(true, memoryOrder)));
1385 }
1386
1387 Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1388 {
1389 RR_DEBUG_INFO_UPDATE_LOC();
1390 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value),
1391 #if LLVM_VERSION_MAJOR >= 11
1392 llvm::MaybeAlign(),
1393 #endif
1394 atomicOrdering(true, memoryOrder)));
1395 }
1396
1397 Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1398 {
1399 RR_DEBUG_INFO_UPDATE_LOC();
1400 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value),
1401 #if LLVM_VERSION_MAJOR >= 11
1402 llvm::MaybeAlign(),
1403 #endif
1404 atomicOrdering(true, memoryOrder)));
1405 }
1406
1407 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1408 {
1409 RR_DEBUG_INFO_UPDATE_LOC();
1410 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value),
1411 #if LLVM_VERSION_MAJOR >= 11
1412 llvm::MaybeAlign(),
1413 #endif
1414 atomicOrdering(true, memoryOrder)));
1415 }
1416
1417 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1418 {
1419 RR_DEBUG_INFO_UPDATE_LOC();
1420 // Note: AtomicCmpXchgInstruction returns a 2-member struct containing {result, success-flag}, not the result directly.
1421 return V(jit->builder->CreateExtractValue(
1422 jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value),
1423 #if LLVM_VERSION_MAJOR >= 11
1424 llvm::MaybeAlign(),
1425 #endif
1426 atomicOrdering(true, memoryOrderEqual),
1427 atomicOrdering(true, memoryOrderUnequal)),
1428 llvm::ArrayRef<unsigned>(0u)));
1429 }
1430
1431 Value *Nucleus::createTrunc(Value *v, Type *destType)
1432 {
1433 RR_DEBUG_INFO_UPDATE_LOC();
1434 return V(jit->builder->CreateTrunc(V(v), T(destType)));
1435 }
1436
createZExt(Value * v,Type * destType)1437 Value *Nucleus::createZExt(Value *v, Type *destType)
1438 {
1439 RR_DEBUG_INFO_UPDATE_LOC();
1440 return V(jit->builder->CreateZExt(V(v), T(destType)));
1441 }
1442
createSExt(Value * v,Type * destType)1443 Value *Nucleus::createSExt(Value *v, Type *destType)
1444 {
1445 RR_DEBUG_INFO_UPDATE_LOC();
1446 return V(jit->builder->CreateSExt(V(v), T(destType)));
1447 }
1448
createFPToUI(Value * v,Type * destType)1449 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1450 {
1451 RR_DEBUG_INFO_UPDATE_LOC();
1452 return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1453 }
1454
createFPToSI(Value * v,Type * destType)1455 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1456 {
1457 RR_DEBUG_INFO_UPDATE_LOC();
1458 return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1459 }
1460
createSIToFP(Value * v,Type * destType)1461 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1462 {
1463 RR_DEBUG_INFO_UPDATE_LOC();
1464 return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1465 }
1466
createFPTrunc(Value * v,Type * destType)1467 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1468 {
1469 RR_DEBUG_INFO_UPDATE_LOC();
1470 return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1471 }
1472
createFPExt(Value * v,Type * destType)1473 Value *Nucleus::createFPExt(Value *v, Type *destType)
1474 {
1475 RR_DEBUG_INFO_UPDATE_LOC();
1476 return V(jit->builder->CreateFPExt(V(v), T(destType)));
1477 }
1478
createBitCast(Value * v,Type * destType)1479 Value *Nucleus::createBitCast(Value *v, Type *destType)
1480 {
1481 RR_DEBUG_INFO_UPDATE_LOC();
1482 // Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1483 // support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1484 // reading back as the destination type.
1485 if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1486 {
1487 Value *readAddress = allocateStackVariable(destType);
1488 Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1489 createStore(v, writeAddress, T(V(v)->getType()));
1490 return createLoad(readAddress, destType);
1491 }
1492 else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1493 {
1494 Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1495 createStore(v, writeAddress, T(V(v)->getType()));
1496 Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1497 return createLoad(readAddress, destType);
1498 }
1499
1500 return V(jit->builder->CreateBitCast(V(v), T(destType)));
1501 }
1502
createICmpEQ(Value * lhs,Value * rhs)1503 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1504 {
1505 RR_DEBUG_INFO_UPDATE_LOC();
1506 return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1507 }
1508
createICmpNE(Value * lhs,Value * rhs)1509 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1510 {
1511 RR_DEBUG_INFO_UPDATE_LOC();
1512 return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1513 }
1514
createICmpUGT(Value * lhs,Value * rhs)1515 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1516 {
1517 RR_DEBUG_INFO_UPDATE_LOC();
1518 return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1519 }
1520
createICmpUGE(Value * lhs,Value * rhs)1521 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1522 {
1523 RR_DEBUG_INFO_UPDATE_LOC();
1524 return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1525 }
1526
createICmpULT(Value * lhs,Value * rhs)1527 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1528 {
1529 RR_DEBUG_INFO_UPDATE_LOC();
1530 return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1531 }
1532
createICmpULE(Value * lhs,Value * rhs)1533 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1534 {
1535 RR_DEBUG_INFO_UPDATE_LOC();
1536 return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1537 }
1538
createICmpSGT(Value * lhs,Value * rhs)1539 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1540 {
1541 RR_DEBUG_INFO_UPDATE_LOC();
1542 return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1543 }
1544
createICmpSGE(Value * lhs,Value * rhs)1545 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1546 {
1547 RR_DEBUG_INFO_UPDATE_LOC();
1548 return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1549 }
1550
createICmpSLT(Value * lhs,Value * rhs)1551 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1552 {
1553 RR_DEBUG_INFO_UPDATE_LOC();
1554 return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1555 }
1556
createICmpSLE(Value * lhs,Value * rhs)1557 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1558 {
1559 RR_DEBUG_INFO_UPDATE_LOC();
1560 return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1561 }
1562
createFCmpOEQ(Value * lhs,Value * rhs)1563 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1564 {
1565 RR_DEBUG_INFO_UPDATE_LOC();
1566 return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1567 }
1568
createFCmpOGT(Value * lhs,Value * rhs)1569 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1570 {
1571 RR_DEBUG_INFO_UPDATE_LOC();
1572 return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1573 }
1574
createFCmpOGE(Value * lhs,Value * rhs)1575 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1576 {
1577 RR_DEBUG_INFO_UPDATE_LOC();
1578 return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1579 }
1580
createFCmpOLT(Value * lhs,Value * rhs)1581 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1582 {
1583 RR_DEBUG_INFO_UPDATE_LOC();
1584 return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1585 }
1586
createFCmpOLE(Value * lhs,Value * rhs)1587 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1588 {
1589 RR_DEBUG_INFO_UPDATE_LOC();
1590 return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1591 }
1592
createFCmpONE(Value * lhs,Value * rhs)1593 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1594 {
1595 RR_DEBUG_INFO_UPDATE_LOC();
1596 return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1597 }
1598
createFCmpORD(Value * lhs,Value * rhs)1599 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1600 {
1601 RR_DEBUG_INFO_UPDATE_LOC();
1602 return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1603 }
1604
createFCmpUNO(Value * lhs,Value * rhs)1605 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1606 {
1607 RR_DEBUG_INFO_UPDATE_LOC();
1608 return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1609 }
1610
createFCmpUEQ(Value * lhs,Value * rhs)1611 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1612 {
1613 RR_DEBUG_INFO_UPDATE_LOC();
1614 return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1615 }
1616
createFCmpUGT(Value * lhs,Value * rhs)1617 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1618 {
1619 RR_DEBUG_INFO_UPDATE_LOC();
1620 return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1621 }
1622
createFCmpUGE(Value * lhs,Value * rhs)1623 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1624 {
1625 RR_DEBUG_INFO_UPDATE_LOC();
1626 return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1627 }
1628
createFCmpULT(Value * lhs,Value * rhs)1629 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1630 {
1631 RR_DEBUG_INFO_UPDATE_LOC();
1632 return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1633 }
1634
createFCmpULE(Value * lhs,Value * rhs)1635 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1636 {
1637 RR_DEBUG_INFO_UPDATE_LOC();
1638 return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1639 }
1640
createFCmpUNE(Value * lhs,Value * rhs)1641 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1642 {
1643 RR_DEBUG_INFO_UPDATE_LOC();
1644 return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1645 }
1646
createExtractElement(Value * vector,Type * type,int index)1647 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1648 {
1649 RR_DEBUG_INFO_UPDATE_LOC();
1650 ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
1651 return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1652 }
1653
createInsertElement(Value * vector,Value * element,int index)1654 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1655 {
1656 RR_DEBUG_INFO_UPDATE_LOC();
1657 return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1658 }
1659
createShuffleVector(Value * v1,Value * v2,const int * select)1660 Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
1661 {
1662 RR_DEBUG_INFO_UPDATE_LOC();
1663
1664 int size = llvm::cast<llvm::FixedVectorType>(V(v1)->getType())->getNumElements();
1665 const int maxSize = 16;
1666 llvm::Constant *swizzle[maxSize];
1667 ASSERT(size <= maxSize);
1668
1669 for(int i = 0; i < size; i++)
1670 {
1671 swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), select[i]);
1672 }
1673
1674 llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
1675
1676 return V(jit->builder->CreateShuffleVector(V(v1), V(v2), shuffle));
1677 }
1678
createSelect(Value * c,Value * ifTrue,Value * ifFalse)1679 Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1680 {
1681 RR_DEBUG_INFO_UPDATE_LOC();
1682 return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1683 }
1684
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1685 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1686 {
1687 RR_DEBUG_INFO_UPDATE_LOC();
1688 return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1689 }
1690
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1691 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1692 {
1693 RR_DEBUG_INFO_UPDATE_LOC();
1694 llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1695 sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), label, true), B(branch));
1696 }
1697
createUnreachable()1698 void Nucleus::createUnreachable()
1699 {
1700 RR_DEBUG_INFO_UPDATE_LOC();
1701 jit->builder->CreateUnreachable();
1702 }
1703
getType(Value * value)1704 Type *Nucleus::getType(Value *value)
1705 {
1706 return T(V(value)->getType());
1707 }
1708
getContainedType(Type * vectorType)1709 Type *Nucleus::getContainedType(Type *vectorType)
1710 {
1711 return T(T(vectorType)->getContainedType(0));
1712 }
1713
getPointerType(Type * ElementType)1714 Type *Nucleus::getPointerType(Type *ElementType)
1715 {
1716 return T(llvm::PointerType::get(T(ElementType), 0));
1717 }
1718
getNaturalIntType()1719 static llvm::Type *getNaturalIntType()
1720 {
1721 return llvm::Type::getIntNTy(*jit->context, sizeof(int) * 8);
1722 }
1723
getPrintfStorageType(Type * valueType)1724 Type *Nucleus::getPrintfStorageType(Type *valueType)
1725 {
1726 llvm::Type *valueTy = T(valueType);
1727 if(valueTy->isIntegerTy())
1728 {
1729 return T(getNaturalIntType());
1730 }
1731 if(valueTy->isFloatTy())
1732 {
1733 return T(llvm::Type::getDoubleTy(*jit->context));
1734 }
1735
1736 UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
1737 return {};
1738 }
1739
createNullValue(Type * Ty)1740 Value *Nucleus::createNullValue(Type *Ty)
1741 {
1742 RR_DEBUG_INFO_UPDATE_LOC();
1743 return V(llvm::Constant::getNullValue(T(Ty)));
1744 }
1745
createConstantLong(int64_t i)1746 Value *Nucleus::createConstantLong(int64_t i)
1747 {
1748 RR_DEBUG_INFO_UPDATE_LOC();
1749 return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), i, true));
1750 }
1751
createConstantInt(int i)1752 Value *Nucleus::createConstantInt(int i)
1753 {
1754 RR_DEBUG_INFO_UPDATE_LOC();
1755 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, true));
1756 }
1757
createConstantInt(unsigned int i)1758 Value *Nucleus::createConstantInt(unsigned int i)
1759 {
1760 RR_DEBUG_INFO_UPDATE_LOC();
1761 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, false));
1762 }
1763
createConstantBool(bool b)1764 Value *Nucleus::createConstantBool(bool b)
1765 {
1766 RR_DEBUG_INFO_UPDATE_LOC();
1767 return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*jit->context), b));
1768 }
1769
createConstantByte(signed char i)1770 Value *Nucleus::createConstantByte(signed char i)
1771 {
1772 RR_DEBUG_INFO_UPDATE_LOC();
1773 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, true));
1774 }
1775
createConstantByte(unsigned char i)1776 Value *Nucleus::createConstantByte(unsigned char i)
1777 {
1778 RR_DEBUG_INFO_UPDATE_LOC();
1779 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, false));
1780 }
1781
createConstantShort(short i)1782 Value *Nucleus::createConstantShort(short i)
1783 {
1784 RR_DEBUG_INFO_UPDATE_LOC();
1785 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, true));
1786 }
1787
createConstantShort(unsigned short i)1788 Value *Nucleus::createConstantShort(unsigned short i)
1789 {
1790 RR_DEBUG_INFO_UPDATE_LOC();
1791 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, false));
1792 }
1793
createConstantFloat(float x)1794 Value *Nucleus::createConstantFloat(float x)
1795 {
1796 RR_DEBUG_INFO_UPDATE_LOC();
1797 return V(llvm::ConstantFP::get(T(Float::type()), x));
1798 }
1799
createNullPointer(Type * Ty)1800 Value *Nucleus::createNullPointer(Type *Ty)
1801 {
1802 RR_DEBUG_INFO_UPDATE_LOC();
1803 return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1804 }
1805
createConstantVector(const int64_t * constants,Type * type)1806 Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
1807 {
1808 RR_DEBUG_INFO_UPDATE_LOC();
1809 ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1810 const int numConstants = elementCount(type); // Number of provided constants for the (emulated) type.
1811 const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements(); // Number of elements of the underlying vector type.
1812 ASSERT(numElements <= 16 && numConstants <= numElements);
1813 llvm::Constant *constantVector[16];
1814
1815 for(int i = 0; i < numElements; i++)
1816 {
1817 constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
1818 }
1819
1820 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1821 }
1822
createConstantVector(const double * constants,Type * type)1823 Value *Nucleus::createConstantVector(const double *constants, Type *type)
1824 {
1825 RR_DEBUG_INFO_UPDATE_LOC();
1826 ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1827 const int numConstants = elementCount(type); // Number of provided constants for the (emulated) type.
1828 const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements(); // Number of elements of the underlying vector type.
1829 ASSERT(numElements <= 8 && numConstants <= numElements);
1830 llvm::Constant *constantVector[8];
1831
1832 for(int i = 0; i < numElements; i++)
1833 {
1834 constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
1835 }
1836
1837 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1838 }
1839
createConstantString(const char * v)1840 Value *Nucleus::createConstantString(const char *v)
1841 {
1842 // NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
1843 auto ptr = jit->builder->CreateGlobalStringPtr(v);
1844 return V(ptr);
1845 }
1846
setOptimizerCallback(OptimizerCallback * callback)1847 void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
1848 {
1849 // The LLVM backend does not produce optimizer reports.
1850 (void)callback;
1851 }
1852
type()1853 Type *Void::type()
1854 {
1855 return T(llvm::Type::getVoidTy(*jit->context));
1856 }
1857
type()1858 Type *Bool::type()
1859 {
1860 return T(llvm::Type::getInt1Ty(*jit->context));
1861 }
1862
type()1863 Type *Byte::type()
1864 {
1865 return T(llvm::Type::getInt8Ty(*jit->context));
1866 }
1867
type()1868 Type *SByte::type()
1869 {
1870 return T(llvm::Type::getInt8Ty(*jit->context));
1871 }
1872
type()1873 Type *Short::type()
1874 {
1875 return T(llvm::Type::getInt16Ty(*jit->context));
1876 }
1877
type()1878 Type *UShort::type()
1879 {
1880 return T(llvm::Type::getInt16Ty(*jit->context));
1881 }
1882
type()1883 Type *Byte4::type()
1884 {
1885 return T(Type_v4i8);
1886 }
1887
type()1888 Type *SByte4::type()
1889 {
1890 return T(Type_v4i8);
1891 }
1892
AddSat(RValue<Byte8> x,RValue<Byte8> y)1893 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1894 {
1895 RR_DEBUG_INFO_UPDATE_LOC();
1896 #if defined(__i386__) || defined(__x86_64__)
1897 return x86::paddusb(x, y);
1898 #else
1899 return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
1900 #endif
1901 }
1902
SubSat(RValue<Byte8> x,RValue<Byte8> y)1903 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1904 {
1905 RR_DEBUG_INFO_UPDATE_LOC();
1906 #if defined(__i386__) || defined(__x86_64__)
1907 return x86::psubusb(x, y);
1908 #else
1909 return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
1910 #endif
1911 }
1912
SignMask(RValue<Byte8> x)1913 RValue<Int> SignMask(RValue<Byte8> x)
1914 {
1915 RR_DEBUG_INFO_UPDATE_LOC();
1916 #if defined(__i386__) || defined(__x86_64__)
1917 return x86::pmovmskb(x);
1918 #else
1919 return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1920 #endif
1921 }
1922
1923 // RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1924 // {
1925 //#if defined(__i386__) || defined(__x86_64__)
1926 // return x86::pcmpgtb(x, y); // FIXME: Signedness
1927 //#else
1928 // return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1929 //#endif
1930 // }
1931
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)1932 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1933 {
1934 RR_DEBUG_INFO_UPDATE_LOC();
1935 #if defined(__i386__) || defined(__x86_64__)
1936 return x86::pcmpeqb(x, y);
1937 #else
1938 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1939 #endif
1940 }
1941
type()1942 Type *Byte8::type()
1943 {
1944 return T(Type_v8i8);
1945 }
1946
AddSat(RValue<SByte8> x,RValue<SByte8> y)1947 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1948 {
1949 RR_DEBUG_INFO_UPDATE_LOC();
1950 #if defined(__i386__) || defined(__x86_64__)
1951 return x86::paddsb(x, y);
1952 #else
1953 return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
1954 #endif
1955 }
1956
SubSat(RValue<SByte8> x,RValue<SByte8> y)1957 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1958 {
1959 RR_DEBUG_INFO_UPDATE_LOC();
1960 #if defined(__i386__) || defined(__x86_64__)
1961 return x86::psubsb(x, y);
1962 #else
1963 return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
1964 #endif
1965 }
1966
SignMask(RValue<SByte8> x)1967 RValue<Int> SignMask(RValue<SByte8> x)
1968 {
1969 RR_DEBUG_INFO_UPDATE_LOC();
1970 #if defined(__i386__) || defined(__x86_64__)
1971 return x86::pmovmskb(As<Byte8>(x));
1972 #else
1973 return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1974 #endif
1975 }
1976
CmpGT(RValue<SByte8> x,RValue<SByte8> y)1977 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1978 {
1979 RR_DEBUG_INFO_UPDATE_LOC();
1980 #if defined(__i386__) || defined(__x86_64__)
1981 return x86::pcmpgtb(x, y);
1982 #else
1983 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1984 #endif
1985 }
1986
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)1987 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1988 {
1989 RR_DEBUG_INFO_UPDATE_LOC();
1990 #if defined(__i386__) || defined(__x86_64__)
1991 return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1992 #else
1993 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1994 #endif
1995 }
1996
type()1997 Type *SByte8::type()
1998 {
1999 return T(Type_v8i8);
2000 }
2001
type()2002 Type *Byte16::type()
2003 {
2004 return T(llvm::VectorType::get(T(Byte::type()), 16, false));
2005 }
2006
type()2007 Type *SByte16::type()
2008 {
2009 return T(llvm::VectorType::get(T(SByte::type()), 16, false));
2010 }
2011
type()2012 Type *Short2::type()
2013 {
2014 return T(Type_v2i16);
2015 }
2016
type()2017 Type *UShort2::type()
2018 {
2019 return T(Type_v2i16);
2020 }
2021
Short4(RValue<Int4> cast)2022 Short4::Short4(RValue<Int4> cast)
2023 {
2024 RR_DEBUG_INFO_UPDATE_LOC();
2025 int select[8] = { 0, 2, 4, 6, 0, 2, 4, 6 };
2026 Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
2027
2028 Value *packed = Nucleus::createShuffleVector(short8, short8, select);
2029 Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value();
2030
2031 storeValue(short4);
2032 }
2033
2034 // Short4::Short4(RValue<Float> cast)
2035 // {
2036 // }
2037
Short4(RValue<Float4> cast)2038 Short4::Short4(RValue<Float4> cast)
2039 {
2040 RR_DEBUG_INFO_UPDATE_LOC();
2041 Int4 v4i32 = Int4(cast);
2042 #if defined(__i386__) || defined(__x86_64__)
2043 v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2044 #else
2045 Value *v = v4i32.loadValue();
2046 v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
2047 #endif
2048
2049 storeValue(As<Short4>(Int2(v4i32)).value());
2050 }
2051
operator <<(RValue<Short4> lhs,unsigned char rhs)2052 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2053 {
2054 RR_DEBUG_INFO_UPDATE_LOC();
2055 #if defined(__i386__) || defined(__x86_64__)
2056 // return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2057
2058 return x86::psllw(lhs, rhs);
2059 #else
2060 return As<Short4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2061 #endif
2062 }
2063
operator >>(RValue<Short4> lhs,unsigned char rhs)2064 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2065 {
2066 RR_DEBUG_INFO_UPDATE_LOC();
2067 #if defined(__i386__) || defined(__x86_64__)
2068 return x86::psraw(lhs, rhs);
2069 #else
2070 return As<Short4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2071 #endif
2072 }
2073
Max(RValue<Short4> x,RValue<Short4> y)2074 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2075 {
2076 RR_DEBUG_INFO_UPDATE_LOC();
2077 #if defined(__i386__) || defined(__x86_64__)
2078 return x86::pmaxsw(x, y);
2079 #else
2080 return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
2081 #endif
2082 }
2083
Min(RValue<Short4> x,RValue<Short4> y)2084 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2085 {
2086 RR_DEBUG_INFO_UPDATE_LOC();
2087 #if defined(__i386__) || defined(__x86_64__)
2088 return x86::pminsw(x, y);
2089 #else
2090 return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
2091 #endif
2092 }
2093
AddSat(RValue<Short4> x,RValue<Short4> y)2094 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2095 {
2096 RR_DEBUG_INFO_UPDATE_LOC();
2097 #if defined(__i386__) || defined(__x86_64__)
2098 return x86::paddsw(x, y);
2099 #else
2100 return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
2101 #endif
2102 }
2103
SubSat(RValue<Short4> x,RValue<Short4> y)2104 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2105 {
2106 RR_DEBUG_INFO_UPDATE_LOC();
2107 #if defined(__i386__) || defined(__x86_64__)
2108 return x86::psubsw(x, y);
2109 #else
2110 return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
2111 #endif
2112 }
2113
MulHigh(RValue<Short4> x,RValue<Short4> y)2114 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2115 {
2116 RR_DEBUG_INFO_UPDATE_LOC();
2117 #if defined(__i386__) || defined(__x86_64__)
2118 return x86::pmulhw(x, y);
2119 #else
2120 return As<Short4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2121 #endif
2122 }
2123
MulAdd(RValue<Short4> x,RValue<Short4> y)2124 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2125 {
2126 RR_DEBUG_INFO_UPDATE_LOC();
2127 #if defined(__i386__) || defined(__x86_64__)
2128 return x86::pmaddwd(x, y);
2129 #else
2130 return As<Int2>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2131 #endif
2132 }
2133
PackSigned(RValue<Short4> x,RValue<Short4> y)2134 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2135 {
2136 RR_DEBUG_INFO_UPDATE_LOC();
2137 #if defined(__i386__) || defined(__x86_64__)
2138 auto result = x86::packsswb(x, y);
2139 #else
2140 auto result = V(lowerPack(V(x.value()), V(y.value()), true));
2141 #endif
2142 return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
2143 }
2144
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2145 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2146 {
2147 RR_DEBUG_INFO_UPDATE_LOC();
2148 #if defined(__i386__) || defined(__x86_64__)
2149 auto result = x86::packuswb(x, y);
2150 #else
2151 auto result = V(lowerPack(V(x.value()), V(y.value()), false));
2152 #endif
2153 return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
2154 }
2155
CmpGT(RValue<Short4> x,RValue<Short4> y)2156 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2157 {
2158 RR_DEBUG_INFO_UPDATE_LOC();
2159 #if defined(__i386__) || defined(__x86_64__)
2160 return x86::pcmpgtw(x, y);
2161 #else
2162 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
2163 #endif
2164 }
2165
CmpEQ(RValue<Short4> x,RValue<Short4> y)2166 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2167 {
2168 RR_DEBUG_INFO_UPDATE_LOC();
2169 #if defined(__i386__) || defined(__x86_64__)
2170 return x86::pcmpeqw(x, y);
2171 #else
2172 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
2173 #endif
2174 }
2175
type()2176 Type *Short4::type()
2177 {
2178 return T(Type_v4i16);
2179 }
2180
UShort4(RValue<Float4> cast,bool saturate)2181 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2182 {
2183 RR_DEBUG_INFO_UPDATE_LOC();
2184 if(saturate)
2185 {
2186 #if defined(__i386__) || defined(__x86_64__)
2187 if(CPUID::supportsSSE4_1())
2188 {
2189 Int4 int4(Min(cast, Float4(0xFFFF))); // packusdw takes care of 0x0000 saturation
2190 *this = As<Short4>(PackUnsigned(int4, int4));
2191 }
2192 else
2193 #endif
2194 {
2195 *this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2196 }
2197 }
2198 else
2199 {
2200 *this = Short4(Int4(cast));
2201 }
2202 }
2203
operator <<(RValue<UShort4> lhs,unsigned char rhs)2204 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2205 {
2206 RR_DEBUG_INFO_UPDATE_LOC();
2207 #if defined(__i386__) || defined(__x86_64__)
2208 // return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2209
2210 return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
2211 #else
2212 return As<UShort4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2213 #endif
2214 }
2215
operator >>(RValue<UShort4> lhs,unsigned char rhs)2216 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2217 {
2218 RR_DEBUG_INFO_UPDATE_LOC();
2219 #if defined(__i386__) || defined(__x86_64__)
2220 // return RValue<Short4>(Nucleus::createLShr(lhs.value(), rhs.value()));
2221
2222 return x86::psrlw(lhs, rhs);
2223 #else
2224 return As<UShort4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2225 #endif
2226 }
2227
Max(RValue<UShort4> x,RValue<UShort4> y)2228 RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2229 {
2230 RR_DEBUG_INFO_UPDATE_LOC();
2231 return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2232 }
2233
Min(RValue<UShort4> x,RValue<UShort4> y)2234 RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2235 {
2236 RR_DEBUG_INFO_UPDATE_LOC();
2237 return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2238 }
2239
AddSat(RValue<UShort4> x,RValue<UShort4> y)2240 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2241 {
2242 RR_DEBUG_INFO_UPDATE_LOC();
2243 #if defined(__i386__) || defined(__x86_64__)
2244 return x86::paddusw(x, y);
2245 #else
2246 return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
2247 #endif
2248 }
2249
SubSat(RValue<UShort4> x,RValue<UShort4> y)2250 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2251 {
2252 RR_DEBUG_INFO_UPDATE_LOC();
2253 #if defined(__i386__) || defined(__x86_64__)
2254 return x86::psubusw(x, y);
2255 #else
2256 return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
2257 #endif
2258 }
2259
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2260 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2261 {
2262 RR_DEBUG_INFO_UPDATE_LOC();
2263 #if defined(__i386__) || defined(__x86_64__)
2264 return x86::pmulhuw(x, y);
2265 #else
2266 return As<UShort4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2267 #endif
2268 }
2269
Average(RValue<UShort4> x,RValue<UShort4> y)2270 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2271 {
2272 RR_DEBUG_INFO_UPDATE_LOC();
2273 #if defined(__i386__) || defined(__x86_64__)
2274 return x86::pavgw(x, y);
2275 #else
2276 return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
2277 #endif
2278 }
2279
type()2280 Type *UShort4::type()
2281 {
2282 return T(Type_v4i16);
2283 }
2284
operator <<(RValue<Short8> lhs,unsigned char rhs)2285 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2286 {
2287 RR_DEBUG_INFO_UPDATE_LOC();
2288 #if defined(__i386__) || defined(__x86_64__)
2289 return x86::psllw(lhs, rhs);
2290 #else
2291 return As<Short8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2292 #endif
2293 }
2294
operator >>(RValue<Short8> lhs,unsigned char rhs)2295 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2296 {
2297 RR_DEBUG_INFO_UPDATE_LOC();
2298 #if defined(__i386__) || defined(__x86_64__)
2299 return x86::psraw(lhs, rhs);
2300 #else
2301 return As<Short8>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2302 #endif
2303 }
2304
MulAdd(RValue<Short8> x,RValue<Short8> y)2305 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2306 {
2307 RR_DEBUG_INFO_UPDATE_LOC();
2308 #if defined(__i386__) || defined(__x86_64__)
2309 return x86::pmaddwd(x, y);
2310 #else
2311 return As<Int4>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2312 #endif
2313 }
2314
MulHigh(RValue<Short8> x,RValue<Short8> y)2315 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2316 {
2317 RR_DEBUG_INFO_UPDATE_LOC();
2318 #if defined(__i386__) || defined(__x86_64__)
2319 return x86::pmulhw(x, y);
2320 #else
2321 return As<Short8>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2322 #endif
2323 }
2324
type()2325 Type *Short8::type()
2326 {
2327 return T(llvm::VectorType::get(T(Short::type()), 8, false));
2328 }
2329
operator <<(RValue<UShort8> lhs,unsigned char rhs)2330 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2331 {
2332 RR_DEBUG_INFO_UPDATE_LOC();
2333 #if defined(__i386__) || defined(__x86_64__)
2334 return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2335 #else
2336 return As<UShort8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2337 #endif
2338 }
2339
operator >>(RValue<UShort8> lhs,unsigned char rhs)2340 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2341 {
2342 RR_DEBUG_INFO_UPDATE_LOC();
2343 #if defined(__i386__) || defined(__x86_64__)
2344 return x86::psrlw(lhs, rhs); // FIXME: Fallback required
2345 #else
2346 return As<UShort8>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2347 #endif
2348 }
2349
MulHigh(RValue<UShort8> x,RValue<UShort8> y)2350 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2351 {
2352 RR_DEBUG_INFO_UPDATE_LOC();
2353 #if defined(__i386__) || defined(__x86_64__)
2354 return x86::pmulhuw(x, y);
2355 #else
2356 return As<UShort8>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2357 #endif
2358 }
2359
type()2360 Type *UShort8::type()
2361 {
2362 return T(llvm::VectorType::get(T(UShort::type()), 8, false));
2363 }
2364
operator ++(Int & val,int)2365 RValue<Int> operator++(Int &val, int) // Post-increment
2366 {
2367 RR_DEBUG_INFO_UPDATE_LOC();
2368 RValue<Int> res = val;
2369
2370 Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2371 val.storeValue(inc);
2372
2373 return res;
2374 }
2375
operator ++(Int & val)2376 const Int &operator++(Int &val) // Pre-increment
2377 {
2378 RR_DEBUG_INFO_UPDATE_LOC();
2379 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2380 val.storeValue(inc);
2381
2382 return val;
2383 }
2384
operator --(Int & val,int)2385 RValue<Int> operator--(Int &val, int) // Post-decrement
2386 {
2387 RR_DEBUG_INFO_UPDATE_LOC();
2388 RValue<Int> res = val;
2389
2390 Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2391 val.storeValue(inc);
2392
2393 return res;
2394 }
2395
operator --(Int & val)2396 const Int &operator--(Int &val) // Pre-decrement
2397 {
2398 RR_DEBUG_INFO_UPDATE_LOC();
2399 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2400 val.storeValue(inc);
2401
2402 return val;
2403 }
2404
RoundInt(RValue<Float> cast)2405 RValue<Int> RoundInt(RValue<Float> cast)
2406 {
2407 RR_DEBUG_INFO_UPDATE_LOC();
2408 #if defined(__i386__) || defined(__x86_64__)
2409 return x86::cvtss2si(cast);
2410 #else
2411 return RValue<Int>(V(lowerRoundInt(V(cast.value()), T(Int::type()))));
2412 #endif
2413 }
2414
type()2415 Type *Int::type()
2416 {
2417 return T(llvm::Type::getInt32Ty(*jit->context));
2418 }
2419
type()2420 Type *Long::type()
2421 {
2422 return T(llvm::Type::getInt64Ty(*jit->context));
2423 }
2424
UInt(RValue<Float> cast)2425 UInt::UInt(RValue<Float> cast)
2426 {
2427 RR_DEBUG_INFO_UPDATE_LOC();
2428 Value *integer = Nucleus::createFPToUI(cast.value(), UInt::type());
2429 storeValue(integer);
2430 }
2431
operator ++(UInt & val,int)2432 RValue<UInt> operator++(UInt &val, int) // Post-increment
2433 {
2434 RR_DEBUG_INFO_UPDATE_LOC();
2435 RValue<UInt> res = val;
2436
2437 Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2438 val.storeValue(inc);
2439
2440 return res;
2441 }
2442
operator ++(UInt & val)2443 const UInt &operator++(UInt &val) // Pre-increment
2444 {
2445 RR_DEBUG_INFO_UPDATE_LOC();
2446 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2447 val.storeValue(inc);
2448
2449 return val;
2450 }
2451
operator --(UInt & val,int)2452 RValue<UInt> operator--(UInt &val, int) // Post-decrement
2453 {
2454 RR_DEBUG_INFO_UPDATE_LOC();
2455 RValue<UInt> res = val;
2456
2457 Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2458 val.storeValue(inc);
2459
2460 return res;
2461 }
2462
operator --(UInt & val)2463 const UInt &operator--(UInt &val) // Pre-decrement
2464 {
2465 RR_DEBUG_INFO_UPDATE_LOC();
2466 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2467 val.storeValue(inc);
2468
2469 return val;
2470 }
2471
2472 // RValue<UInt> RoundUInt(RValue<Float> cast)
2473 // {
2474 //#if defined(__i386__) || defined(__x86_64__)
2475 // return x86::cvtss2si(val); // FIXME: Unsigned
2476 //#else
2477 // return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2478 //#endif
2479 // }
2480
type()2481 Type *UInt::type()
2482 {
2483 return T(llvm::Type::getInt32Ty(*jit->context));
2484 }
2485
2486 // Int2::Int2(RValue<Int> cast)
2487 // {
2488 // Value *extend = Nucleus::createZExt(cast.value(), Long::type());
2489 // Value *vector = Nucleus::createBitCast(extend, Int2::type());
2490 //
2491 // int shuffle[2] = {0, 0};
2492 // Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2493 //
2494 // storeValue(replicate);
2495 // }
2496
operator <<(RValue<Int2> lhs,unsigned char rhs)2497 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2498 {
2499 RR_DEBUG_INFO_UPDATE_LOC();
2500 #if defined(__i386__) || defined(__x86_64__)
2501 // return RValue<Int2>(Nucleus::createShl(lhs.value(), rhs.value()));
2502
2503 return x86::pslld(lhs, rhs);
2504 #else
2505 return As<Int2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2506 #endif
2507 }
2508
operator >>(RValue<Int2> lhs,unsigned char rhs)2509 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2510 {
2511 RR_DEBUG_INFO_UPDATE_LOC();
2512 #if defined(__i386__) || defined(__x86_64__)
2513 // return RValue<Int2>(Nucleus::createAShr(lhs.value(), rhs.value()));
2514
2515 return x86::psrad(lhs, rhs);
2516 #else
2517 return As<Int2>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2518 #endif
2519 }
2520
type()2521 Type *Int2::type()
2522 {
2523 return T(Type_v2i32);
2524 }
2525
operator <<(RValue<UInt2> lhs,unsigned char rhs)2526 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2527 {
2528 RR_DEBUG_INFO_UPDATE_LOC();
2529 #if defined(__i386__) || defined(__x86_64__)
2530 // return RValue<UInt2>(Nucleus::createShl(lhs.value(), rhs.value()));
2531
2532 return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2533 #else
2534 return As<UInt2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2535 #endif
2536 }
2537
operator >>(RValue<UInt2> lhs,unsigned char rhs)2538 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2539 {
2540 RR_DEBUG_INFO_UPDATE_LOC();
2541 #if defined(__i386__) || defined(__x86_64__)
2542 // return RValue<UInt2>(Nucleus::createLShr(lhs.value(), rhs.value()));
2543
2544 return x86::psrld(lhs, rhs);
2545 #else
2546 return As<UInt2>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2547 #endif
2548 }
2549
type()2550 Type *UInt2::type()
2551 {
2552 return T(Type_v2i32);
2553 }
2554
Int4(RValue<Byte4> cast)2555 Int4::Int4(RValue<Byte4> cast)
2556 : XYZW(this)
2557 {
2558 RR_DEBUG_INFO_UPDATE_LOC();
2559 #if defined(__i386__) || defined(__x86_64__)
2560 if(CPUID::supportsSSE4_1())
2561 {
2562 *this = x86::pmovzxbd(As<Byte16>(cast));
2563 }
2564 else
2565 #endif
2566 {
2567 int swizzle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
2568 Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2569 Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::type()), swizzle);
2570
2571 int swizzle2[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2572 Value *c = Nucleus::createBitCast(b, Short8::type());
2573 Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::type()), swizzle2);
2574
2575 *this = As<Int4>(d);
2576 }
2577 }
2578
Int4(RValue<SByte4> cast)2579 Int4::Int4(RValue<SByte4> cast)
2580 : XYZW(this)
2581 {
2582 RR_DEBUG_INFO_UPDATE_LOC();
2583 #if defined(__i386__) || defined(__x86_64__)
2584 if(CPUID::supportsSSE4_1())
2585 {
2586 *this = x86::pmovsxbd(As<SByte16>(cast));
2587 }
2588 else
2589 #endif
2590 {
2591 int swizzle[16] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
2592 Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2593 Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2594
2595 int swizzle2[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2596 Value *c = Nucleus::createBitCast(b, Short8::type());
2597 Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2598
2599 *this = As<Int4>(d) >> 24;
2600 }
2601 }
2602
Int4(RValue<Short4> cast)2603 Int4::Int4(RValue<Short4> cast)
2604 : XYZW(this)
2605 {
2606 RR_DEBUG_INFO_UPDATE_LOC();
2607 #if defined(__i386__) || defined(__x86_64__)
2608 if(CPUID::supportsSSE4_1())
2609 {
2610 *this = x86::pmovsxwd(As<Short8>(cast));
2611 }
2612 else
2613 #endif
2614 {
2615 int swizzle[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2616 Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
2617 *this = As<Int4>(c) >> 16;
2618 }
2619 }
2620
Int4(RValue<UShort4> cast)2621 Int4::Int4(RValue<UShort4> cast)
2622 : XYZW(this)
2623 {
2624 RR_DEBUG_INFO_UPDATE_LOC();
2625 #if defined(__i386__) || defined(__x86_64__)
2626 if(CPUID::supportsSSE4_1())
2627 {
2628 *this = x86::pmovzxwd(As<UShort8>(cast));
2629 }
2630 else
2631 #endif
2632 {
2633 int swizzle[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2634 Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2635 *this = As<Int4>(c);
2636 }
2637 }
2638
Int4(RValue<Int> rhs)2639 Int4::Int4(RValue<Int> rhs)
2640 : XYZW(this)
2641 {
2642 RR_DEBUG_INFO_UPDATE_LOC();
2643 Value *vector = loadValue();
2644 Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2645
2646 int swizzle[4] = { 0, 0, 0, 0 };
2647 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2648
2649 storeValue(replicate);
2650 }
2651
operator <<(RValue<Int4> lhs,unsigned char rhs)2652 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2653 {
2654 RR_DEBUG_INFO_UPDATE_LOC();
2655 #if defined(__i386__) || defined(__x86_64__)
2656 return x86::pslld(lhs, rhs);
2657 #else
2658 return As<Int4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2659 #endif
2660 }
2661
operator >>(RValue<Int4> lhs,unsigned char rhs)2662 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2663 {
2664 RR_DEBUG_INFO_UPDATE_LOC();
2665 #if defined(__i386__) || defined(__x86_64__)
2666 return x86::psrad(lhs, rhs);
2667 #else
2668 return As<Int4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2669 #endif
2670 }
2671
CmpEQ(RValue<Int4> x,RValue<Int4> y)2672 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2673 {
2674 RR_DEBUG_INFO_UPDATE_LOC();
2675 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2676 }
2677
CmpLT(RValue<Int4> x,RValue<Int4> y)2678 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2679 {
2680 RR_DEBUG_INFO_UPDATE_LOC();
2681 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), Int4::type()));
2682 }
2683
CmpLE(RValue<Int4> x,RValue<Int4> y)2684 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2685 {
2686 RR_DEBUG_INFO_UPDATE_LOC();
2687 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), Int4::type()));
2688 }
2689
CmpNEQ(RValue<Int4> x,RValue<Int4> y)2690 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2691 {
2692 RR_DEBUG_INFO_UPDATE_LOC();
2693 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2694 }
2695
CmpNLT(RValue<Int4> x,RValue<Int4> y)2696 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2697 {
2698 RR_DEBUG_INFO_UPDATE_LOC();
2699 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), Int4::type()));
2700 }
2701
CmpNLE(RValue<Int4> x,RValue<Int4> y)2702 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2703 {
2704 RR_DEBUG_INFO_UPDATE_LOC();
2705 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), Int4::type()));
2706 }
2707
Max(RValue<Int4> x,RValue<Int4> y)2708 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2709 {
2710 RR_DEBUG_INFO_UPDATE_LOC();
2711 #if defined(__i386__) || defined(__x86_64__)
2712 if(CPUID::supportsSSE4_1())
2713 {
2714 return x86::pmaxsd(x, y);
2715 }
2716 else
2717 #endif
2718 {
2719 RValue<Int4> greater = CmpNLE(x, y);
2720 return (x & greater) | (y & ~greater);
2721 }
2722 }
2723
Min(RValue<Int4> x,RValue<Int4> y)2724 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2725 {
2726 RR_DEBUG_INFO_UPDATE_LOC();
2727 #if defined(__i386__) || defined(__x86_64__)
2728 if(CPUID::supportsSSE4_1())
2729 {
2730 return x86::pminsd(x, y);
2731 }
2732 else
2733 #endif
2734 {
2735 RValue<Int4> less = CmpLT(x, y);
2736 return (x & less) | (y & ~less);
2737 }
2738 }
2739
RoundInt(RValue<Float4> cast)2740 RValue<Int4> RoundInt(RValue<Float4> cast)
2741 {
2742 RR_DEBUG_INFO_UPDATE_LOC();
2743 #if defined(__i386__) || defined(__x86_64__)
2744 return x86::cvtps2dq(cast);
2745 #else
2746 return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2747 #endif
2748 }
2749
RoundIntClamped(RValue<Float4> cast)2750 RValue<Int4> RoundIntClamped(RValue<Float4> cast)
2751 {
2752 RR_DEBUG_INFO_UPDATE_LOC();
2753 #if defined(__i386__) || defined(__x86_64__)
2754 // cvtps2dq produces 0x80000000, a negative value, for input larger than
2755 // 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
2756 // saturate to 0x80000000.
2757 return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
2758 #else
2759 // ARM saturates to the largest positive or negative integer. Unit tests
2760 // verify that lowerRoundInt() behaves as desired.
2761 return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2762 #endif
2763 }
2764
MulHigh(RValue<Int4> x,RValue<Int4> y)2765 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2766 {
2767 RR_DEBUG_INFO_UPDATE_LOC();
2768 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2769 return As<Int4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2770 }
2771
MulHigh(RValue<UInt4> x,RValue<UInt4> y)2772 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2773 {
2774 RR_DEBUG_INFO_UPDATE_LOC();
2775 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2776 return As<UInt4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2777 }
2778
PackSigned(RValue<Int4> x,RValue<Int4> y)2779 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2780 {
2781 RR_DEBUG_INFO_UPDATE_LOC();
2782 #if defined(__i386__) || defined(__x86_64__)
2783 return x86::packssdw(x, y);
2784 #else
2785 return As<Short8>(V(lowerPack(V(x.value()), V(y.value()), true)));
2786 #endif
2787 }
2788
PackUnsigned(RValue<Int4> x,RValue<Int4> y)2789 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2790 {
2791 RR_DEBUG_INFO_UPDATE_LOC();
2792 #if defined(__i386__) || defined(__x86_64__)
2793 return x86::packusdw(x, y);
2794 #else
2795 return As<UShort8>(V(lowerPack(V(x.value()), V(y.value()), false)));
2796 #endif
2797 }
2798
SignMask(RValue<Int4> x)2799 RValue<Int> SignMask(RValue<Int4> x)
2800 {
2801 RR_DEBUG_INFO_UPDATE_LOC();
2802 #if defined(__i386__) || defined(__x86_64__)
2803 return x86::movmskps(As<Float4>(x));
2804 #else
2805 return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
2806 #endif
2807 }
2808
type()2809 Type *Int4::type()
2810 {
2811 return T(llvm::VectorType::get(T(Int::type()), 4, false));
2812 }
2813
UInt4(RValue<Float4> cast)2814 UInt4::UInt4(RValue<Float4> cast)
2815 : XYZW(this)
2816 {
2817 RR_DEBUG_INFO_UPDATE_LOC();
2818 Value *xyzw = Nucleus::createFPToUI(cast.value(), UInt4::type());
2819 storeValue(xyzw);
2820 }
2821
UInt4(RValue<UInt> rhs)2822 UInt4::UInt4(RValue<UInt> rhs)
2823 : XYZW(this)
2824 {
2825 RR_DEBUG_INFO_UPDATE_LOC();
2826 Value *vector = loadValue();
2827 Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2828
2829 int swizzle[4] = { 0, 0, 0, 0 };
2830 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2831
2832 storeValue(replicate);
2833 }
2834
operator <<(RValue<UInt4> lhs,unsigned char rhs)2835 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2836 {
2837 RR_DEBUG_INFO_UPDATE_LOC();
2838 #if defined(__i386__) || defined(__x86_64__)
2839 return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2840 #else
2841 return As<UInt4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2842 #endif
2843 }
2844
operator >>(RValue<UInt4> lhs,unsigned char rhs)2845 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2846 {
2847 RR_DEBUG_INFO_UPDATE_LOC();
2848 #if defined(__i386__) || defined(__x86_64__)
2849 return x86::psrld(lhs, rhs);
2850 #else
2851 return As<UInt4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2852 #endif
2853 }
2854
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2855 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2856 {
2857 RR_DEBUG_INFO_UPDATE_LOC();
2858 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2859 }
2860
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2861 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2862 {
2863 RR_DEBUG_INFO_UPDATE_LOC();
2864 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), Int4::type()));
2865 }
2866
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2867 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2868 {
2869 RR_DEBUG_INFO_UPDATE_LOC();
2870 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), Int4::type()));
2871 }
2872
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2873 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2874 {
2875 RR_DEBUG_INFO_UPDATE_LOC();
2876 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2877 }
2878
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2879 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2880 {
2881 RR_DEBUG_INFO_UPDATE_LOC();
2882 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), Int4::type()));
2883 }
2884
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2885 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2886 {
2887 RR_DEBUG_INFO_UPDATE_LOC();
2888 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), Int4::type()));
2889 }
2890
Max(RValue<UInt4> x,RValue<UInt4> y)2891 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2892 {
2893 RR_DEBUG_INFO_UPDATE_LOC();
2894 #if defined(__i386__) || defined(__x86_64__)
2895 if(CPUID::supportsSSE4_1())
2896 {
2897 return x86::pmaxud(x, y);
2898 }
2899 else
2900 #endif
2901 {
2902 RValue<UInt4> greater = CmpNLE(x, y);
2903 return (x & greater) | (y & ~greater);
2904 }
2905 }
2906
Min(RValue<UInt4> x,RValue<UInt4> y)2907 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2908 {
2909 RR_DEBUG_INFO_UPDATE_LOC();
2910 #if defined(__i386__) || defined(__x86_64__)
2911 if(CPUID::supportsSSE4_1())
2912 {
2913 return x86::pminud(x, y);
2914 }
2915 else
2916 #endif
2917 {
2918 RValue<UInt4> less = CmpLT(x, y);
2919 return (x & less) | (y & ~less);
2920 }
2921 }
2922
type()2923 Type *UInt4::type()
2924 {
2925 return T(llvm::VectorType::get(T(UInt::type()), 4, false));
2926 }
2927
type()2928 Type *Half::type()
2929 {
2930 return T(llvm::Type::getInt16Ty(*jit->context));
2931 }
2932
Rcp_pp(RValue<Float> x,bool exactAtPow2)2933 RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
2934 {
2935 RR_DEBUG_INFO_UPDATE_LOC();
2936 #if defined(__i386__) || defined(__x86_64__)
2937 if(exactAtPow2)
2938 {
2939 // rcpss uses a piecewise-linear approximation which minimizes the relative error
2940 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2941 return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2942 }
2943 return x86::rcpss(x);
2944 #else
2945 return As<Float>(V(lowerRCP(V(x.value()))));
2946 #endif
2947 }
2948
RcpSqrt_pp(RValue<Float> x)2949 RValue<Float> RcpSqrt_pp(RValue<Float> x)
2950 {
2951 RR_DEBUG_INFO_UPDATE_LOC();
2952 #if defined(__i386__) || defined(__x86_64__)
2953 return x86::rsqrtss(x);
2954 #else
2955 return As<Float>(V(lowerRSQRT(V(x.value()))));
2956 #endif
2957 }
2958
HasRcpApprox()2959 bool HasRcpApprox()
2960 {
2961 #if defined(__i386__) || defined(__x86_64__)
2962 return true;
2963 #else
2964 return false;
2965 #endif
2966 }
2967
RcpApprox(RValue<Float4> x,bool exactAtPow2)2968 RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
2969 {
2970 #if defined(__i386__) || defined(__x86_64__)
2971 if(exactAtPow2)
2972 {
2973 // rcpps uses a piecewise-linear approximation which minimizes the relative error
2974 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2975 return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2976 }
2977 return x86::rcpps(x);
2978 #else
2979 UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2980 return { 0.0f };
2981 #endif
2982 }
2983
RcpApprox(RValue<Float> x,bool exactAtPow2)2984 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
2985 {
2986 #if defined(__i386__) || defined(__x86_64__)
2987 if(exactAtPow2)
2988 {
2989 // rcpss uses a piecewise-linear approximation which minimizes the relative error
2990 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2991 return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2992 }
2993 return x86::rcpss(x);
2994 #else
2995 UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2996 return { 0.0f };
2997 #endif
2998 }
2999
HasRcpSqrtApprox()3000 bool HasRcpSqrtApprox()
3001 {
3002 #if defined(__i386__) || defined(__x86_64__)
3003 return true;
3004 #else
3005 return false;
3006 #endif
3007 }
3008
RcpSqrtApprox(RValue<Float4> x)3009 RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
3010 {
3011 #if defined(__i386__) || defined(__x86_64__)
3012 return x86::rsqrtps(x);
3013 #else
3014 UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
3015 return { 0.0f };
3016 #endif
3017 }
3018
RcpSqrtApprox(RValue<Float> x)3019 RValue<Float> RcpSqrtApprox(RValue<Float> x)
3020 {
3021 #if defined(__i386__) || defined(__x86_64__)
3022 return x86::rsqrtss(x);
3023 #else
3024 UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
3025 return { 0.0f };
3026 #endif
3027 }
3028
Sqrt(RValue<Float> x)3029 RValue<Float> Sqrt(RValue<Float> x)
3030 {
3031 RR_DEBUG_INFO_UPDATE_LOC();
3032 #if defined(__i386__) || defined(__x86_64__)
3033 return x86::sqrtss(x);
3034 #else
3035 return As<Float>(V(lowerSQRT(V(x.value()))));
3036 #endif
3037 }
3038
Round(RValue<Float> x)3039 RValue<Float> Round(RValue<Float> x)
3040 {
3041 RR_DEBUG_INFO_UPDATE_LOC();
3042 #if defined(__i386__) || defined(__x86_64__)
3043 if(CPUID::supportsSSE4_1())
3044 {
3045 return x86::roundss(x, 0);
3046 }
3047 else
3048 {
3049 return Float4(Round(Float4(x))).x;
3050 }
3051 #else
3052 return RValue<Float>(V(lowerRound(V(x.value()))));
3053 #endif
3054 }
3055
Trunc(RValue<Float> x)3056 RValue<Float> Trunc(RValue<Float> x)
3057 {
3058 RR_DEBUG_INFO_UPDATE_LOC();
3059 #if defined(__i386__) || defined(__x86_64__)
3060 if(CPUID::supportsSSE4_1())
3061 {
3062 return x86::roundss(x, 3);
3063 }
3064 else
3065 {
3066 return Float(Int(x)); // Rounded toward zero
3067 }
3068 #else
3069 return RValue<Float>(V(lowerTrunc(V(x.value()))));
3070 #endif
3071 }
3072
Frac(RValue<Float> x)3073 RValue<Float> Frac(RValue<Float> x)
3074 {
3075 RR_DEBUG_INFO_UPDATE_LOC();
3076 #if defined(__i386__) || defined(__x86_64__)
3077 if(CPUID::supportsSSE4_1())
3078 {
3079 return x - x86::floorss(x);
3080 }
3081 else
3082 {
3083 return Float4(Frac(Float4(x))).x;
3084 }
3085 #else
3086 // x - floor(x) can be 1.0 for very small negative x.
3087 // Clamp against the value just below 1.0.
3088 return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
3089 #endif
3090 }
3091
Floor(RValue<Float> x)3092 RValue<Float> Floor(RValue<Float> x)
3093 {
3094 RR_DEBUG_INFO_UPDATE_LOC();
3095 #if defined(__i386__) || defined(__x86_64__)
3096 if(CPUID::supportsSSE4_1())
3097 {
3098 return x86::floorss(x);
3099 }
3100 else
3101 {
3102 return Float4(Floor(Float4(x))).x;
3103 }
3104 #else
3105 return RValue<Float>(V(lowerFloor(V(x.value()))));
3106 #endif
3107 }
3108
Ceil(RValue<Float> x)3109 RValue<Float> Ceil(RValue<Float> x)
3110 {
3111 RR_DEBUG_INFO_UPDATE_LOC();
3112 #if defined(__i386__) || defined(__x86_64__)
3113 if(CPUID::supportsSSE4_1())
3114 {
3115 return x86::ceilss(x);
3116 }
3117 else
3118 #endif
3119 {
3120 return Float4(Ceil(Float4(x))).x;
3121 }
3122 }
3123
type()3124 Type *Float::type()
3125 {
3126 return T(llvm::Type::getFloatTy(*jit->context));
3127 }
3128
type()3129 Type *Float2::type()
3130 {
3131 return T(Type_v2f32);
3132 }
3133
Exp2(RValue<Float> v)3134 RValue<Float> Exp2(RValue<Float> v)
3135 {
3136 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::type()) });
3137 return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3138 }
3139
Log2(RValue<Float> v)3140 RValue<Float> Log2(RValue<Float> v)
3141 {
3142 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::type()) });
3143 return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3144 }
3145
Float4(RValue<Float> rhs)3146 Float4::Float4(RValue<Float> rhs)
3147 : XYZW(this)
3148 {
3149 RR_DEBUG_INFO_UPDATE_LOC();
3150 Value *vector = loadValue();
3151 Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
3152
3153 int swizzle[4] = { 0, 0, 0, 0 };
3154 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
3155
3156 storeValue(replicate);
3157 }
3158
Max(RValue<Float4> x,RValue<Float4> y)3159 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3160 {
3161 RR_DEBUG_INFO_UPDATE_LOC();
3162 #if defined(__i386__) || defined(__x86_64__)
3163 return x86::maxps(x, y);
3164 #else
3165 return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
3166 #endif
3167 }
3168
3169 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3170 {
3171 RR_DEBUG_INFO_UPDATE_LOC();
3172 #if defined(__i386__) || defined(__x86_64__)
3173 return x86::minps(x, y);
3174 #else
3175 return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
3176 #endif
3177 }
3178
3179 RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
3180 {
3181 RR_DEBUG_INFO_UPDATE_LOC();
3182 #if defined(__i386__) || defined(__x86_64__)
3183 if(exactAtPow2)
3184 {
3185 // rcpps uses a piecewise-linear approximation which minimizes the relative error
3186 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
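		// The approximation presumably acts on the mantissa alone, so its relative error at 1.0 is
		// the same as at every power of two; dividing 1.0 by the hardware's own rcp(1.0) therefore
		// gives a scale factor that makes power-of-two inputs reciprocate exactly.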
3187 return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
3188 }
3189 return x86::rcpps(x);
3190 #else
3191 return As<Float4>(V(lowerRCP(V(x.value()))));
3192 #endif
3193 }
3194
3195 RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
3196 {
3197 RR_DEBUG_INFO_UPDATE_LOC();
3198 #if defined(__i386__) || defined(__x86_64__)
3199 return x86::rsqrtps(x);
3200 #else
3201 return As<Float4>(V(lowerRSQRT(V(x.value()))));
3202 #endif
3203 }
3204
3205 RValue<Float4> Sqrt(RValue<Float4> x)
3206 {
3207 RR_DEBUG_INFO_UPDATE_LOC();
3208 #if defined(__i386__) || defined(__x86_64__)
3209 return x86::sqrtps(x);
3210 #else
3211 return As<Float4>(V(lowerSQRT(V(x.value()))));
3212 #endif
3213 }
3214
3215 RValue<Int> SignMask(RValue<Float4> x)
3216 {
3217 RR_DEBUG_INFO_UPDATE_LOC();
3218 #if defined(__i386__) || defined(__x86_64__)
3219 return x86::movmskps(x);
3220 #else
3221 return As<Int>(V(lowerFPSignMask(V(x.value()), T(Int::type()))));
3222 #endif
3223 }
3224
3225 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3226 {
3227 RR_DEBUG_INFO_UPDATE_LOC();
3228 // return As<Int4>(x86::cmpeqps(x, y));
3229 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), Int4::type()));
3230 }
3231
3232 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3233 {
3234 RR_DEBUG_INFO_UPDATE_LOC();
3235 // return As<Int4>(x86::cmpltps(x, y));
3236 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), Int4::type()));
3237 }
3238
3239 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3240 {
3241 RR_DEBUG_INFO_UPDATE_LOC();
3242 // return As<Int4>(x86::cmpleps(x, y));
3243 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), Int4::type()));
3244 }
3245
3246 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3247 {
3248 RR_DEBUG_INFO_UPDATE_LOC();
3249 // return As<Int4>(x86::cmpneqps(x, y));
3250 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), Int4::type()));
3251 }
3252
3253 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3254 {
3255 RR_DEBUG_INFO_UPDATE_LOC();
3256 // return As<Int4>(x86::cmpnltps(x, y));
3257 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), Int4::type()));
3258 }
3259
3260 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3261 {
3262 RR_DEBUG_INFO_UPDATE_LOC();
3263 // return As<Int4>(x86::cmpnleps(x, y));
3264 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), Int4::type()));
3265 }
3266
3267 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3268 {
3269 RR_DEBUG_INFO_UPDATE_LOC();
3270 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), Int4::type()));
3271 }
3272
3273 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3274 {
3275 RR_DEBUG_INFO_UPDATE_LOC();
3276 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), Int4::type()));
3277 }
3278
3279 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3280 {
3281 RR_DEBUG_INFO_UPDATE_LOC();
3282 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), Int4::type()));
3283 }
3284
3285 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3286 {
3287 RR_DEBUG_INFO_UPDATE_LOC();
3288 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), Int4::type()));
3289 }
3290
3291 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3292 {
3293 RR_DEBUG_INFO_UPDATE_LOC();
3294 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), Int4::type()));
3295 }
3296
3297 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3298 {
3299 RR_DEBUG_INFO_UPDATE_LOC();
3300 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), Int4::type()));
3301 }
3302
3303 RValue<Float4> Round(RValue<Float4> x)
3304 {
3305 RR_DEBUG_INFO_UPDATE_LOC();
3306 #if defined(__i386__) || defined(__x86_64__)
3307 if(CPUID::supportsSSE4_1())
3308 {
3309 return x86::roundps(x, 0);
3310 }
3311 else
3312 {
3313 return Float4(RoundInt(x));
3314 }
3315 #else
3316 return RValue<Float4>(V(lowerRound(V(x.value()))));
3317 #endif
3318 }
3319
3320 RValue<Float4> Trunc(RValue<Float4> x)
3321 {
3322 RR_DEBUG_INFO_UPDATE_LOC();
3323 #if defined(__i386__) || defined(__x86_64__)
3324 if(CPUID::supportsSSE4_1())
3325 {
3326 return x86::roundps(x, 3);
3327 }
3328 else
3329 {
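		// Conversion to integer rounds toward zero, matching the scalar Trunc() above.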
3330 return Float4(Int4(x));
3331 }
3332 #else
3333 return RValue<Float4>(V(lowerTrunc(V(x.value()))));
3334 #endif
3335 }
3336
3337 RValue<Float4> Frac(RValue<Float4> x)
3338 {
3339 RR_DEBUG_INFO_UPDATE_LOC();
3340 Float4 frc;
3341
3342 #if defined(__i386__) || defined(__x86_64__)
3343 if(CPUID::supportsSSE4_1())
3344 {
3345 frc = x - x86::floorps(x);
3346 }
3347 else
3348 {
3349 frc = x - Float4(Int4(x)); // Signed fractional part.
3350
3351 frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f))); // Add 1.0 if negative.
3352 }
3353 #else
3354 frc = x - Floor(x);
3355 #endif
3356
3357 // x - floor(x) can be 1.0 for very small negative x.
3358 // Clamp against the value just below 1.0.
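	// 0x3F7FFFFF is the largest float below 1.0f, as in the scalar Frac() above.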
3359 return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3360 }
3361
3362 RValue<Float4> Floor(RValue<Float4> x)
3363 {
3364 RR_DEBUG_INFO_UPDATE_LOC();
3365 #if defined(__i386__) || defined(__x86_64__)
3366 if(CPUID::supportsSSE4_1())
3367 {
3368 return x86::floorps(x);
3369 }
3370 else
3371 {
3372 return x - Frac(x);
3373 }
3374 #else
3375 return RValue<Float4>(V(lowerFloor(V(x.value()))));
3376 #endif
3377 }
3378
3379 RValue<Float4> Ceil(RValue<Float4> x)
3380 {
3381 RR_DEBUG_INFO_UPDATE_LOC();
3382 #if defined(__i386__) || defined(__x86_64__)
3383 if(CPUID::supportsSSE4_1())
3384 {
3385 return x86::ceilps(x);
3386 }
3387 else
3388 #endif
3389 {
3390 return -Floor(-x);
3391 }
3392 }
3393
3394 RValue<Float4> Sin(RValue<Float4> v)
3395 {
3396 RR_DEBUG_INFO_UPDATE_LOC();
3397 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value())->getType() });
3398 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3399 }
3400
3401 RValue<Float4> Cos(RValue<Float4> v)
3402 {
3403 RR_DEBUG_INFO_UPDATE_LOC();
3404 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value())->getType() });
3405 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3406 }
3407
3408 RValue<Float4> Tan(RValue<Float4> v)
3409 {
3410 RR_DEBUG_INFO_UPDATE_LOC();
3411 return Sin(v) / Cos(v);
3412 }
3413
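// Applies the named single-precision libm function (e.g. "asinf") to each of the four lanes by
// emitting four scalar calls and reassembling the results. The symbol is only declared here via
// getOrInsertFunction; it is expected to be resolved against the host's C runtime when the
// routine is materialized.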
3414 static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char *name)
3415 {
3416 auto funcTy = llvm::FunctionType::get(T(Float::type()), llvm::ArrayRef<llvm::Type *>(T(Float::type())), false);
3417 auto func = jit->module->getOrInsertFunction(name, funcTy);
3418 llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3419 for(uint64_t i = 0; i < 4; i++)
3420 {
3421 auto el = jit->builder->CreateCall(func, V(Nucleus::createExtractElement(v.value(), Float::type(), i)));
3422 out = V(Nucleus::createInsertElement(V(out), V(el), i));
3423 }
3424 return RValue<Float4>(V(out));
3425 }
3426
3427 RValue<Float4> Asin(RValue<Float4> v, Precision p)
3428 {
3429 RR_DEBUG_INFO_UPDATE_LOC();
3430 return TransformFloat4PerElement(v, "asinf");
3431 }
3432
3433 RValue<Float4> Acos(RValue<Float4> v, Precision p)
3434 {
3435 RR_DEBUG_INFO_UPDATE_LOC();
3436 return TransformFloat4PerElement(v, "acosf");
3437 }
3438
3439 RValue<Float4> Atan(RValue<Float4> v)
3440 {
3441 RR_DEBUG_INFO_UPDATE_LOC();
3442 return TransformFloat4PerElement(v, "atanf");
3443 }
3444
3445 RValue<Float4> Sinh(RValue<Float4> v)
3446 {
3447 RR_DEBUG_INFO_UPDATE_LOC();
3448 return emulated::Sinh(v);
3449 }
3450
3451 RValue<Float4> Cosh(RValue<Float4> v)
3452 {
3453 RR_DEBUG_INFO_UPDATE_LOC();
3454 return emulated::Cosh(v);
3455 }
3456
3457 RValue<Float4> Tanh(RValue<Float4> v)
3458 {
3459 RR_DEBUG_INFO_UPDATE_LOC();
3460 return TransformFloat4PerElement(v, "tanhf");
3461 }
3462
3463 RValue<Float4> Asinh(RValue<Float4> v)
3464 {
3465 RR_DEBUG_INFO_UPDATE_LOC();
3466 return TransformFloat4PerElement(v, "asinhf");
3467 }
3468
3469 RValue<Float4> Acosh(RValue<Float4> v)
3470 {
3471 RR_DEBUG_INFO_UPDATE_LOC();
3472 return TransformFloat4PerElement(v, "acoshf");
3473 }
3474
3475 RValue<Float4> Atanh(RValue<Float4> v)
3476 {
3477 RR_DEBUG_INFO_UPDATE_LOC();
3478 return TransformFloat4PerElement(v, "atanhf");
3479 }
3480
3481 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
3482 {
3483 RR_DEBUG_INFO_UPDATE_LOC();
3484 llvm::SmallVector<llvm::Type *, 2> paramTys;
3485 paramTys.push_back(T(Float::type()));
3486 paramTys.push_back(T(Float::type()));
3487 auto funcTy = llvm::FunctionType::get(T(Float::type()), paramTys, false);
3488 auto func = jit->module->getOrInsertFunction("atan2f", funcTy);
3489 llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3490 for(uint64_t i = 0; i < 4; i++)
3491 {
3492 auto el = jit->builder->CreateCall(func, { V(Nucleus::createExtractElement(x.value(), Float::type(), i)),
3493 V(Nucleus::createExtractElement(y.value(), Float::type(), i)) });
3494 out = V(Nucleus::createInsertElement(V(out), V(el), i));
3495 }
3496 return RValue<Float4>(V(out));
3497 }
3498
3499 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
3500 {
3501 RR_DEBUG_INFO_UPDATE_LOC();
3502 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::type()) });
3503 return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()) })));
3504 }
3505
3506 RValue<Float4> Exp(RValue<Float4> v)
3507 {
3508 RR_DEBUG_INFO_UPDATE_LOC();
3509 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::type()) });
3510 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3511 }
3512
3513 RValue<Float4> Log(RValue<Float4> v)
3514 {
3515 RR_DEBUG_INFO_UPDATE_LOC();
3516 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::type()) });
3517 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3518 }
3519
3520 RValue<Float4> Exp2(RValue<Float4> v)
3521 {
3522 RR_DEBUG_INFO_UPDATE_LOC();
3523 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::type()) });
3524 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3525 }
3526
3527 RValue<Float4> Log2(RValue<Float4> v)
3528 {
3529 RR_DEBUG_INFO_UPDATE_LOC();
3530 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::type()) });
3531 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3532 }
3533
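// The second operand of llvm.ctlz/llvm.cttz is an i1 flag stating whether the result may be
// undefined for a zero input; passing false requests the well-defined result (the bit width).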
3534 RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
3535 {
3536 RR_DEBUG_INFO_UPDATE_LOC();
3537 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::type()) });
3538 return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3539 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3540 }
3541
3542 RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
3543 {
3544 RR_DEBUG_INFO_UPDATE_LOC();
3545 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::type()) });
3546 return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3547 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3548 }
3549
3550 RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
3551 {
3552 RR_DEBUG_INFO_UPDATE_LOC();
3553 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::type()) });
3554 return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3555 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3556 }
3557
3558 RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
3559 {
3560 RR_DEBUG_INFO_UPDATE_LOC();
3561 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::type()) });
3562 return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3563 isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3564 }
3565
3566 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3567 {
3568 return RValue<Int>(Nucleus::createAtomicMin(x.value(), y.value(), memoryOrder));
3569 }
3570
3571 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3572 {
3573 return RValue<UInt>(Nucleus::createAtomicUMin(x.value(), y.value(), memoryOrder));
3574 }
3575
3576 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3577 {
3578 return RValue<Int>(Nucleus::createAtomicMax(x.value(), y.value(), memoryOrder));
3579 }
3580
3581 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3582 {
3583 return RValue<UInt>(Nucleus::createAtomicUMax(x.value(), y.value(), memoryOrder));
3584 }
3585
3586 Type *Float4::type()
3587 {
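	// A fixed-width vector of four floats; the trailing 'false' is the Scalable flag of
	// llvm::VectorType::get().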
3588 return T(llvm::VectorType::get(T(Float::type()), 4, false));
3589 }
3590
3591 RValue<Long> Ticks()
3592 {
3593 RR_DEBUG_INFO_UPDATE_LOC();
3594 llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
3595
3596 return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
3597 }
3598
3599 RValue<Pointer<Byte>> ConstantPointer(void const *ptr)
3600 {
3601 RR_DEBUG_INFO_UPDATE_LOC();
3602 // Note: this should work for 32-bit pointers as well because 'inttoptr'
3603 // is defined to truncate (and zero extend) if necessary.
3604 auto ptrAsInt = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), reinterpret_cast<uintptr_t>(ptr));
3605 return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::type()))));
3606 }
3607
3608 RValue<Pointer<Byte>> ConstantData(void const *data, size_t size)
3609 {
3610 RR_DEBUG_INFO_UPDATE_LOC();
3611 auto str = ::std::string(reinterpret_cast<const char *>(data), size);
3612 auto ptr = jit->builder->CreateGlobalStringPtr(str);
3613 return RValue<Pointer<Byte>>(V(ptr));
3614 }
3615
3616 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
3617 {
3618 // If this is a MemorySanitizer build, but Reactor routine instrumentation is not enabled,
3619 // mark all call arguments as initialized by calling __msan_unpoison_param().
3620 if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
3621 {
3622 // void __msan_unpoison_param(size_t n)
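		// Note: this MSan runtime helper appears to clear the shadow of the first n outgoing call
		// parameters, so the uninstrumented callee's arguments are treated as fully initialized.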
3623 auto voidTy = llvm::Type::getVoidTy(*jit->context);
3624 auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
3625 auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy }, false);
3626 auto func = jit->module->getOrInsertFunction("__msan_unpoison_param", funcTy);
3627
3628 jit->builder->CreateCall(func, { llvm::ConstantInt::get(sizetTy, args.size()) });
3629 }
3630
3631 RR_DEBUG_INFO_UPDATE_LOC();
3632 llvm::SmallVector<llvm::Type *, 8> paramTys;
3633 for(auto ty : argTys) { paramTys.push_back(T(ty)); }
3634 auto funcTy = llvm::FunctionType::get(T(retTy), paramTys, false);
3635
3636 auto funcPtrTy = funcTy->getPointerTo();
3637 auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value()), funcPtrTy);
3638
3639 llvm::SmallVector<llvm::Value *, 8> arguments;
3640 for(auto arg : args) { arguments.push_back(V(arg)); }
3641 return V(jit->builder->CreateCall(funcTy, funcPtr, arguments));
3642 }
3643
3644 void Breakpoint()
3645 {
3646 RR_DEBUG_INFO_UPDATE_LOC();
3647 llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
3648
3649 jit->builder->CreateCall(debugtrap);
3650 }
3651
3652 } // namespace rr
3653
3654 namespace rr {
3655
3656 #if defined(__i386__) || defined(__x86_64__)
3657 namespace x86 {
3658
3659 // Differs from IRBuilder<>::CreateUnaryIntrinsic() in that it only accepts native instruction intrinsics which have
3660 // implicit types, such as 'x86_sse_rcp_ps' operating on v4f32, while 'sqrt' requires explicitly specifying the operand type.
3661 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x)
3662 {
3663 llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3664
3665 return V(jit->builder->CreateCall(intrinsic, V(x)));
3666 }
3667
3668 // Differs from IRBuilder<>::CreateBinaryIntrinsic() in that it only accepts native instruction intrinsics which have
3669 // implicit types, such as 'x86_sse_max_ps' operating on v4f32, while 'sadd_sat' requires explicitly specifying the operand types.
3670 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x, Value *y)
3671 {
3672 llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3673
3674 return V(jit->builder->CreateCall(intrinsic, { V(x), V(y) }));
3675 }
3676
3677 RValue<Int> cvtss2si(RValue<Float> val)
3678 {
3679 Float4 vector;
3680 vector.x = val;
3681
3682 return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_cvtss2si, RValue<Float4>(vector).value()));
3683 }
3684
3685 RValue<Int4> cvtps2dq(RValue<Float4> val)
3686 {
3687 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_cvtps2dq, val.value()));
3688 }
3689
3690 RValue<Float> rcpss(RValue<Float> val)
3691 {
3692 Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3693
3694 // TODO(b/172238865): MemorySanitizer does not support the rcpss instruction,
3695 // which makes it look at the entire 128-bit input operand for undefined bits.
3696 // Use zero-initialized values instead.
3697 if(__has_feature(memory_sanitizer))
3698 {
3699 undef = Float4(0).loadValue();
3700 }
3701
3702 Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3703
3704 return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rcp_ss, vector), Float::type(), 0));
3705 }
3706
3707 RValue<Float> sqrtss(RValue<Float> val)
3708 {
3709 return RValue<Float>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3710 }
3711
3712 RValue<Float> rsqrtss(RValue<Float> val)
3713 {
3714 Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3715
3716 // TODO(b/172238865): MemorySanitizer does not support the rsqrtss instruction,
3717 // which makes it look at the entire 128-bit input operand for undefined bits.
3718 // Use zero-initialized values instead.
3719 if(__has_feature(memory_sanitizer))
3720 {
3721 undef = Float4(0).loadValue();
3722 }
3723
3724 Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3725
3726 return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ss, vector), Float::type(), 0));
3727 }
3728
3729 RValue<Float4> rcpps(RValue<Float4> val)
3730 {
3731 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rcp_ps, val.value()));
3732 }
3733
3734 RValue<Float4> sqrtps(RValue<Float4> val)
3735 {
3736 return RValue<Float4>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3737 }
3738
3739 RValue<Float4> rsqrtps(RValue<Float4> val)
3740 {
3741 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ps, val.value()));
3742 }
3743
3744 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3745 {
3746 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_max_ps, x.value(), y.value()));
3747 }
3748
3749 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3750 {
3751 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_min_ps, x.value(), y.value()));
3752 }
3753
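// SSE4.1 ROUNDSS/ROUNDPS rounding-control immediates used below:
//   0 = round to nearest (even), 1 = round down (floor), 2 = round up (ceil), 3 = truncate.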
3754 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3755 {
3756 llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3757
3758 Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3759
3760 // TODO(b/172238865): MemorySanitizer does not support the roundss instruction,
3761 // which makes it look at the entire 128-bit input operands for undefined bits.
3762 // Use zero-initialized values instead.
3763 if(__has_feature(memory_sanitizer))
3764 {
3765 undef = Float4(0).loadValue();
3766 }
3767
3768 Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3769
3770 return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(roundss, { V(undef), V(vector), V(Nucleus::createConstantInt(imm)) })), Float::type(), 0));
3771 }
3772
3773 RValue<Float> floorss(RValue<Float> val)
3774 {
3775 return roundss(val, 1);
3776 }
3777
3778 RValue<Float> ceilss(RValue<Float> val)
3779 {
3780 return roundss(val, 2);
3781 }
3782
3783 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3784 {
3785 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse41_round_ps, val.value(), Nucleus::createConstantInt(imm)));
3786 }
3787
3788 RValue<Float4> floorps(RValue<Float4> val)
3789 {
3790 return roundps(val, 1);
3791 }
3792
3793 RValue<Float4> ceilps(RValue<Float4> val)
3794 {
3795 return roundps(val, 2);
3796 }
3797
3798 RValue<Int4> pabsd(RValue<Int4> x)
3799 {
3800 return RValue<Int4>(V(lowerPABS(V(x.value()))));
3801 }
3802
3803 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3804 {
3805 return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3806 }
3807
3808 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3809 {
3810 return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3811 }
3812
3813 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3814 {
3815 return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3816 }
3817
3818 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3819 {
3820 return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3821 }
3822
3823 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3824 {
3825 return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3826 }
3827
3828 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3829 {
3830 return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3831 }
3832
3833 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3834 {
3835 return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3836 }
3837
3838 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3839 {
3840 return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3841 }
3842
3843 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3844 {
3845 return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
3846 }
3847
3848 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3849 {
3850 return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3851 }
3852
3853 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3854 {
3855 return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3856 }
3857
3858 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3859 {
3860 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
3861 }
3862
3863 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3864 {
3865 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
3866 }
3867
3868 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3869 {
3870 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
3871 }
3872
3873 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3874 {
3875 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
3876 }
3877
3878 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3879 {
3880 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3881 }
3882
3883 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3884 {
3885 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3886 }
3887
3888 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3889 {
3890 return As<SByte8>(createInstruction(llvm::Intrinsic::x86_sse2_packsswb_128, x.value(), y.value()));
3891 }
3892
3893 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3894 {
3895 return As<Byte8>(createInstruction(llvm::Intrinsic::x86_sse2_packuswb_128, x.value(), y.value()));
3896 }
3897
3898 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3899 {
3900 if(CPUID::supportsSSE4_1())
3901 {
3902 return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse41_packusdw, x.value(), y.value()));
3903 }
3904 else
3905 {
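		// Emulate unsigned saturation with the signed pack: clamp negative inputs to zero with
		// x & ~(x >> 31), subtract 0x8000 to shift [0, 0xFFFF] into packssdw's signed saturation
		// range, then add 0x8000 back (with wraparound) to restore the unsigned result.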
3906 RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3907 RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3908
3909 return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3910 }
3911 }
3912
3913 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3914 {
3915 return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3916 }
3917
3918 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3919 {
3920 return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3921 }
3922
3923 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3924 {
3925 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3926 }
3927
3928 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3929 {
3930 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3931 }
3932
3933 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3934 {
3935 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3936 }
3937
3938 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3939 {
3940 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3941 }
3942
3943 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3944 {
3945 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3946 }
3947
3948 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3949 {
3950 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3951 }
3952
3953 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3954 {
3955 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3956 }
3957
3958 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3959 {
3960 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3961 }
3962
3963 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3964 {
3965 return As<UInt2>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3966 }
3967
3968 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3969 {
3970 return RValue<UInt4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3971 }
3972
3973 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3974 {
3975 return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3976 }
3977
3978 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3979 {
3980 return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3981 }
3982
3983 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3984 {
3985 return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_UGT)));
3986 }
3987
3988 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3989 {
3990 return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_ULT)));
3991 }
3992
3993 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3994 {
3995 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3996 }
3997
3998 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3999 {
4000 return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
4001 }
4002
4003 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
4004 {
4005 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
4006 }
4007
4008 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
4009 {
4010 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
4011 }
4012
4013 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
4014 {
4015 return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
4016 }
4017
4018 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
4019 {
4020 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
4021 }
4022
4023 RValue<Int> movmskps(RValue<Float4> x)
4024 {
4025 Value *v = x.value();
4026
4027 // TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
4028 // which makes it look at the entire 128-bit input for undefined bits. Mask off
4029 // just the sign bits to avoid false positives.
4030 if(__has_feature(memory_sanitizer))
4031 {
4032 v = As<Float4>(As<Int4>(v) & Int4(0x80000000u)).value();
4033 }
4034
4035 return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_movmsk_ps, v));
4036 }
4037
4038 RValue<Int> pmovmskb(RValue<Byte8> x)
4039 {
4040 Value *v = x.value();
4041
4042 // TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
4043 // which makes it look at the entire 128-bit input for undefined bits. Mask off
4044 // just the sign bits in the lower 64-bit vector to avoid false positives.
4045 if(__has_feature(memory_sanitizer))
4046 {
4047 v = As<Byte16>(As<Int4>(v) & Int4(0x80808080u, 0x80808080u, 0, 0)).value();
4048 }
4049
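	// pmovmskb produces 16 mask bits for the full 128-bit register; the & 0xFF keeps only the
	// bits of the low 8 bytes, which hold the Byte8 value.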
4050 return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse2_pmovmskb_128, v)) & 0xFF;
4051 }
4052
4053 RValue<Int4> pmovzxbd(RValue<Byte16> x)
4054 {
4055 return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), false)));
4056 }
4057
4058 RValue<Int4> pmovsxbd(RValue<SByte16> x)
4059 {
4060 return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), true)));
4061 }
4062
4063 RValue<Int4> pmovzxwd(RValue<UShort8> x)
4064 {
4065 return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), false)));
4066 }
4067
4068 RValue<Int4> pmovsxwd(RValue<Short8> x)
4069 {
4070 return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), true)));
4071 }
4072
4073 } // namespace x86
4074 #endif // defined(__i386__) || defined(__x86_64__)
4075
4076 #ifdef ENABLE_RR_PRINT
4077 void VPrintf(const std::vector<Value *> &vals)
4078 {
4079 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
4080 auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4081 auto funcTy = llvm::FunctionType::get(i32Ty, { i8PtrTy }, true);
4082 auto func = jit->module->getOrInsertFunction("rr::DebugPrintf", funcTy);
4083 jit->builder->CreateCall(func, V(vals));
4084 }
4085 #endif // ENABLE_RR_PRINT
4086
4087 void Nop()
4088 {
4089 auto voidTy = llvm::Type::getVoidTy(*jit->context);
4090 auto funcTy = llvm::FunctionType::get(voidTy, {}, false);
4091 auto func = jit->module->getOrInsertFunction("nop", funcTy);
4092 jit->builder->CreateCall(func);
4093 }
4094
4095 void EmitDebugLocation()
4096 {
4097 #ifdef ENABLE_RR_DEBUG_INFO
4098 if(jit->debugInfo != nullptr)
4099 {
4100 jit->debugInfo->EmitLocation();
4101 }
4102 #endif // ENABLE_RR_DEBUG_INFO
4103 }
4104
4105 void EmitDebugVariable(Value *value)
4106 {
4107 #ifdef ENABLE_RR_DEBUG_INFO
4108 if(jit->debugInfo != nullptr)
4109 {
4110 jit->debugInfo->EmitVariable(value);
4111 }
4112 #endif // ENABLE_RR_DEBUG_INFO
4113 }
4114
4115 void FlushDebug()
4116 {
4117 #ifdef ENABLE_RR_DEBUG_INFO
4118 if(jit->debugInfo != nullptr)
4119 {
4120 jit->debugInfo->Flush();
4121 }
4122 #endif // ENABLE_RR_DEBUG_INFO
4123 }
4124
4125 } // namespace rr
4126
4127 // ------------------------------ Coroutines ------------------------------
4128
4129 namespace {
4130
4131 // Magic values returned by llvm.coro.suspend.
4132 // See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
4133 enum SuspendAction
4134 {
4135 SuspendActionSuspend = -1,
4136 SuspendActionResume = 0,
4137 SuspendActionDestroy = 1
4138 };
4139
4140 void promoteFunctionToCoroutine()
4141 {
4142 ASSERT(jit->coroutine.id == nullptr);
4143
4144 // Types
4145 auto voidTy = llvm::Type::getVoidTy(*jit->context);
4146 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4147 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4148 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
4149 auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4150 auto promiseTy = jit->coroutine.yieldType;
4151 auto promisePtrTy = promiseTy->getPointerTo();
4152
4153 // LLVM intrinsics
4154 auto coro_id = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
4155 auto coro_size = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
4156 auto coro_begin = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
4157 auto coro_resume = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_resume);
4158 auto coro_end = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_end);
4159 auto coro_free = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_free);
4160 auto coro_destroy = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_destroy);
4161 auto coro_promise = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_promise);
4162 auto coro_done = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_done);
4163 auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4164
4165 auto allocFrameTy = llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
4166 auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
4167 auto freeFrameTy = llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
4168 auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);
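	// coroutine_alloc_frame/coroutine_free_frame are external helpers, presumably supplied by the
	// Reactor runtime and resolved at link time, which allocate and release the frame whose size
	// llvm.coro.size reports.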
4169
4170 auto oldInsertionPoint = jit->builder->saveIP();
4171
4172 // Build the coroutine_await() function:
4173 //
4174 // bool coroutine_await(CoroutineHandle* handle, YieldType* out)
4175 // {
4176 // if(llvm.coro.done(handle))
4177 // {
4178 // return false;
4179 // }
4180 // else
4181 // {
4182 	//      *out = *(YieldType*)llvm.coro.promise(handle);
4183 // llvm.coro.resume(handle);
4184 // return true;
4185 // }
4186 // }
4187 //
4188 {
4189 auto args = jit->coroutine.await->arg_begin();
4190 auto handle = args++;
4191 auto outPtr = args++;
4192 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "co_await", jit->coroutine.await));
4193 auto doneBlock = llvm::BasicBlock::Create(*jit->context, "done", jit->coroutine.await);
4194 auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->coroutine.await);
4195
4196 auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
4197 jit->builder->CreateCondBr(done, doneBlock, resumeBlock);
4198
4199 jit->builder->SetInsertPoint(doneBlock);
4200 jit->builder->CreateRet(llvm::ConstantInt::getFalse(i1Ty));
4201
4202 jit->builder->SetInsertPoint(resumeBlock);
4203 auto promiseAlignment = llvm::ConstantInt::get(i32Ty, 4); // TODO: Get correct alignment.
4204 auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, llvm::ConstantInt::get(i1Ty, 0) });
4205 auto promise = jit->builder->CreateLoad(promiseTy, jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
4206 jit->builder->CreateStore(promise, outPtr);
4207 jit->builder->CreateCall(coro_resume, { handle });
4208 jit->builder->CreateRet(llvm::ConstantInt::getTrue(i1Ty));
4209 }
4210
4211 // Build the coroutine_destroy() function:
4212 //
4213 // void coroutine_destroy(CoroutineHandle* handle)
4214 // {
4215 // llvm.coro.destroy(handle);
4216 // }
4217 //
4218 {
4219 auto handle = jit->coroutine.destroy->arg_begin();
4220 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4221 jit->builder->CreateCall(coro_destroy, { handle });
4222 jit->builder->CreateRetVoid();
4223 }
4224
4225 // Begin building the main coroutine_begin() function.
4226 //
4227 // CoroutineHandle* coroutine_begin(<Arguments>)
4228 // {
4229 // YieldType promise;
4230 // auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
4231 // void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
4232 // CoroutineHandle *handle = llvm.coro.begin(id, frame);
4233 //
4234 // ... <REACTOR CODE> ...
4235 //
4236 // end:
4237 // SuspendAction action = llvm.coro.suspend(none, true /* final */); // <-- RESUME POINT
4238 // switch(action)
4239 // {
4240 // case SuspendActionResume:
4241 // UNREACHABLE(); // Illegal to resume after final suspend.
4242 // case SuspendActionDestroy:
4243 // goto destroy;
4244 // default: // (SuspendActionSuspend)
4245 // goto suspend;
4246 // }
4247 //
4248 // destroy:
4249 // coroutine_free_frame(llvm.coro.free(id, handle));
4250 // goto suspend;
4251 //
4252 // suspend:
4253 // llvm.coro.end(handle, false);
4254 // return handle;
4255 // }
4256 //
4257
4258 #ifdef ENABLE_RR_DEBUG_INFO
4259 jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
4260 #endif // ENABLE_RR_DEBUG_INFO
4261
4262 jit->coroutine.suspendBlock = llvm::BasicBlock::Create(*jit->context, "suspend", jit->function);
4263 jit->coroutine.endBlock = llvm::BasicBlock::Create(*jit->context, "end", jit->function);
4264 jit->coroutine.destroyBlock = llvm::BasicBlock::Create(*jit->context, "destroy", jit->function);
4265
4266 jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
4267 jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
4268 jit->coroutine.id = jit->builder->CreateCall(coro_id, {
4269 llvm::ConstantInt::get(i32Ty, 0),
4270 jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
4271 llvm::ConstantPointerNull::get(i8PtrTy),
4272 llvm::ConstantPointerNull::get(i8PtrTy),
4273 });
4274 auto size = jit->builder->CreateCall(coro_size, {});
4275 auto frame = jit->builder->CreateCall(allocFrame, { size });
4276 jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });
4277
4278 // Build the suspend block
4279 jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
4280 jit->builder->CreateCall(coro_end, { jit->coroutine.handle, llvm::ConstantInt::get(i1Ty, 0) });
4281 jit->builder->CreateRet(jit->coroutine.handle);
4282
4283 // Build the end block
4284 jit->builder->SetInsertPoint(jit->coroutine.endBlock);
4285 auto action = jit->builder->CreateCall(coro_suspend, {
4286 llvm::ConstantTokenNone::get(*jit->context),
4287 llvm::ConstantInt::get(i1Ty, 1), // final: true
4288 });
4289 auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4290 // switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
4291 switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4292
4293 // Build the destroy block
4294 jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
4295 auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
4296 jit->builder->CreateCall(freeFrame, { memory });
4297 jit->builder->CreateBr(jit->coroutine.suspendBlock);
4298
4299 // Switch back to original insert point to continue building the coroutine.
4300 jit->builder->restoreIP(oldInsertionPoint);
4301 }
4302
4303 } // anonymous namespace
4304
4305 namespace rr {
4306
4307 void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
4308 {
4309 	// A coroutine is initially created as a regular function.
4310 // Upon the first call to Yield(), the function is promoted to a true
4311 // coroutine.
4312 auto voidTy = llvm::Type::getVoidTy(*jit->context);
4313 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4314 auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4315 auto handleTy = i8PtrTy;
4316 auto boolTy = i1Ty;
4317 auto promiseTy = T(YieldType);
4318 auto promisePtrTy = promiseTy->getPointerTo();
4319
4320 jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
4321 jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
4322 jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
4323 jit->coroutine.yieldType = promiseTy;
4324 jit->coroutine.entryBlock = llvm::BasicBlock::Create(*jit->context, "function", jit->function);
4325
4326 jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
4327 }
4328
4329 void Nucleus::yield(Value *val)
4330 {
4331 if(jit->coroutine.id == nullptr)
4332 {
4333 // First call to yield().
4334 // Promote the function to a full coroutine.
4335 promoteFunctionToCoroutine();
4336 ASSERT(jit->coroutine.id != nullptr);
4337 }
4338
4339 // promise = val;
4340 //
4341 // auto action = llvm.coro.suspend(none, false /* final */); // <-- RESUME POINT
4342 // switch(action)
4343 // {
4344 // case SuspendActionResume:
4345 // goto resume;
4346 // case SuspendActionDestroy:
4347 // goto destroy;
4348 // default: // (SuspendActionSuspend)
4349 // goto suspend;
4350 // }
4351 // resume:
4352 //
4353
4354 RR_DEBUG_INFO_UPDATE_LOC();
4355 Variable::materializeAll();
4356
4357 // Types
4358 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4359 auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4360
4361 // Intrinsics
4362 auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4363
4364 // Create a block to resume execution.
4365 auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->function);
4366
4367 // Store the promise (yield value)
4368 jit->builder->CreateStore(V(val), jit->coroutine.promise);
4369 auto action = jit->builder->CreateCall(coro_suspend, {
4370 llvm::ConstantTokenNone::get(*jit->context),
4371 	                                          llvm::ConstantInt::get(i1Ty, 0),  // final: false
4372 });
4373 auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4374 switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
4375 switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4376
4377 // Continue building in the resume block.
4378 jit->builder->SetInsertPoint(resumeBlock);
4379 }
4380
4381 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit *cfgEdit /* = nullptr */)
4382 {
4383 bool isCoroutine = jit->coroutine.id != nullptr;
4384 if(isCoroutine)
4385 {
4386 jit->builder->CreateBr(jit->coroutine.endBlock);
4387 }
4388 else
4389 {
4390 		// A coroutine that never Yields acts as a regular function.
4391 		// The 'coroutine_begin' function returns nullptr for the coroutine
4392 		// handle.
4393 jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4394 // The 'coroutine_await' function always returns false (coroutine done).
4395 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.await));
4396 jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
4397 		// The 'coroutine_destroy' function does nothing and returns void.
4398 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4399 jit->builder->CreateRetVoid();
4400 }
4401
4402 #ifdef ENABLE_RR_DEBUG_INFO
4403 if(jit->debugInfo != nullptr)
4404 {
4405 jit->debugInfo->Finalize();
4406 }
4407 #endif // ENABLE_RR_DEBUG_INFO
4408
4409 if(false)
4410 {
4411 std::error_code error;
4412 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4413 jit->module->print(file, 0);
4414 }
4415
4416 if(isCoroutine)
4417 {
4418 		// Run mandatory coroutine transforms.
4419 llvm::legacy::PassManager pm;
4420
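		// CoroEarly lowers intrinsics that must be handled before splitting, CoroSplit splits the
		// function into ramp/resume/destroy parts, CoroElide can avoid heap-allocating the frame
		// when the coroutine does not escape, and CoroCleanup lowers any remaining coroutine
		// intrinsics.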
4421 pm.add(llvm::createCoroEarlyLegacyPass());
4422 pm.add(llvm::createCoroSplitLegacyPass());
4423 pm.add(llvm::createCoroElideLegacyPass());
4424 pm.add(llvm::createBarrierNoopPass());
4425 pm.add(llvm::createCoroCleanupLegacyPass());
4426
4427 pm.run(*jit->module);
4428 }
4429
4430 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4431 {
4432 llvm::legacy::PassManager pm;
4433 pm.add(llvm::createVerifierPass());
4434 pm.run(*jit->module);
4435 }
4436 #endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4437
4438 Config cfg = jit->config;
4439 if(cfgEdit)
4440 {
4441 cfg = cfgEdit->apply(jit->config);
4442 }
4443 jit->optimize(cfg);
4444
4445 if(false)
4446 {
4447 std::error_code error;
4448 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4449 jit->module->print(file, 0);
4450 }
4451
4452 llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4453 funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4454 funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4455 funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4456
4457 auto routine = jit->acquireRoutine(name, funcs, Nucleus::CoroutineEntryCount, cfg);
4458
4459 delete jit;
4460 jit = nullptr;
4461
4462 return routine;
4463 }
4464
4465 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4466 {
4467 return func();
4468 }
4469
4470 } // namespace rr
4471