1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "LLVMReactor.hpp"
16 
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "LLVMReactorDebugInfo.hpp"
20 #include "PragmaInternals.hpp"
21 #include "Print.hpp"
22 #include "Reactor.hpp"
23 #include "SIMD.hpp"
24 #include "x86.hpp"
25 
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/IntrinsicsX86.h"
28 #include "llvm/Support/Alignment.h"
29 #include "llvm/Support/Error.h"
30 #include "llvm/Support/ManagedStatic.h"
31 
32 #include <fstream>
33 #include <iostream>
34 #include <mutex>
35 #include <numeric>
36 #include <thread>
37 #include <unordered_map>
38 
39 #if defined(__i386__) || defined(__x86_64__)
40 #	include <xmmintrin.h>
41 #endif
42 
43 #include <math.h>
44 
45 #if defined(__x86_64__) && defined(_WIN32)
X86CompilationCallback()46 extern "C" void X86CompilationCallback()
47 {
48 	UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
49 }
50 #endif
51 
52 #if !LLVM_ENABLE_THREADS
53 #	error "LLVM_ENABLE_THREADS needs to be enabled"
54 #endif
55 
56 #if LLVM_VERSION_MAJOR < 11
57 namespace llvm {
58 using FixedVectorType = VectorType;
59 }  // namespace llvm
60 #endif
61 
62 namespace {
63 
64 // Used to automatically invoke llvm_shutdown() when driver is unloaded
65 llvm::llvm_shutdown_obj llvmShutdownObj;
66 
67 // This has to be a raw pointer because glibc 2.17 doesn't support __cxa_thread_atexit_impl
68 // for destructing objects at exit. See crbug.com/1074222
69 thread_local rr::JITBuilder *jit = nullptr;
70 
lowerPAVG(llvm::Value * x,llvm::Value * y)71 llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
72 {
73 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
74 
75 	llvm::VectorType *extTy =
76 	    llvm::VectorType::getExtendedElementVectorType(ty);
77 	x = jit->builder->CreateZExt(x, extTy);
78 	y = jit->builder->CreateZExt(y, extTy);
79 
80 	// (x + y + 1) >> 1
81 	llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
82 	llvm::Value *res = jit->builder->CreateAdd(x, y);
83 	res = jit->builder->CreateAdd(res, one);
84 	res = jit->builder->CreateLShr(res, one);
85 	return jit->builder->CreateTrunc(res, ty);
86 }
87 
lowerPMINMAX(llvm::Value * x,llvm::Value * y,llvm::ICmpInst::Predicate pred)88 llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
89                           llvm::ICmpInst::Predicate pred)
90 {
91 	return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
92 }
93 
lowerPCMP(llvm::ICmpInst::Predicate pred,llvm::Value * x,llvm::Value * y,llvm::Type * dstTy)94 llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
95                        llvm::Value *y, llvm::Type *dstTy)
96 {
97 	return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
98 }
99 
lowerPFMINMAX(llvm::Value * x,llvm::Value * y,llvm::FCmpInst::Predicate pred)100 [[maybe_unused]] llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
101                                             llvm::FCmpInst::Predicate pred)
102 {
103 	return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
104 }
105 
lowerRound(llvm::Value * x)106 [[maybe_unused]] llvm::Value *lowerRound(llvm::Value *x)
107 {
108 	llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
109 	    jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
110 	return jit->builder->CreateCall(nearbyint, { x });
111 }
112 
lowerRoundInt(llvm::Value * x,llvm::Type * ty)113 [[maybe_unused]] llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
114 {
115 	return jit->builder->CreateFPToSI(lowerRound(x), ty);
116 }
117 
lowerFloor(llvm::Value * x)118 [[maybe_unused]] llvm::Value *lowerFloor(llvm::Value *x)
119 {
120 	llvm::Function *floor = llvm::Intrinsic::getDeclaration(
121 	    jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
122 	return jit->builder->CreateCall(floor, { x });
123 }
124 
lowerTrunc(llvm::Value * x)125 [[maybe_unused]] llvm::Value *lowerTrunc(llvm::Value *x)
126 {
127 	llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
128 	    jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
129 	return jit->builder->CreateCall(trunc, { x });
130 }
131 
lowerSQRT(llvm::Value * x)132 [[maybe_unused]] llvm::Value *lowerSQRT(llvm::Value *x)
133 {
134 	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
135 	    jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
136 	return jit->builder->CreateCall(sqrt, { x });
137 }
138 
lowerRCP(llvm::Value * x)139 [[maybe_unused]] llvm::Value *lowerRCP(llvm::Value *x)
140 {
141 	llvm::Type *ty = x->getType();
142 	llvm::Constant *one;
143 	if(llvm::FixedVectorType *vectorTy = llvm::dyn_cast<llvm::FixedVectorType>(ty))
144 	{
145 		one = llvm::ConstantVector::getSplat(
146 #if LLVM_VERSION_MAJOR >= 11
147 		    vectorTy->getElementCount(),
148 #else
149 		    vectorTy->getNumElements(),
150 #endif
151 		    llvm::ConstantFP::get(vectorTy->getElementType(), 1));
152 	}
153 	else
154 	{
155 		one = llvm::ConstantFP::get(ty, 1);
156 	}
157 	return jit->builder->CreateFDiv(one, x);
158 }
159 
lowerRSQRT(llvm::Value * x)160 [[maybe_unused]] llvm::Value *lowerRSQRT(llvm::Value *x)
161 {
162 	return lowerRCP(lowerSQRT(x));
163 }
164 
lowerVectorShl(llvm::Value * x,uint64_t scalarY)165 [[maybe_unused]] llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
166 {
167 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
168 	llvm::Value *y = llvm::ConstantVector::getSplat(
169 #if LLVM_VERSION_MAJOR >= 11
170 	    ty->getElementCount(),
171 #else
172 	    ty->getNumElements(),
173 #endif
174 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
175 	return jit->builder->CreateShl(x, y);
176 }
177 
lowerVectorAShr(llvm::Value * x,uint64_t scalarY)178 [[maybe_unused]] llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
179 {
180 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
181 	llvm::Value *y = llvm::ConstantVector::getSplat(
182 #if LLVM_VERSION_MAJOR >= 11
183 	    ty->getElementCount(),
184 #else
185 	    ty->getNumElements(),
186 #endif
187 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
188 	return jit->builder->CreateAShr(x, y);
189 }
190 
lowerVectorLShr(llvm::Value * x,uint64_t scalarY)191 [[maybe_unused]] llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
192 {
193 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
194 	llvm::Value *y = llvm::ConstantVector::getSplat(
195 #if LLVM_VERSION_MAJOR >= 11
196 	    ty->getElementCount(),
197 #else
198 	    ty->getNumElements(),
199 #endif
200 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
201 	return jit->builder->CreateLShr(x, y);
202 }
203 
lowerShuffleVector(llvm::Value * v1,llvm::Value * v2,llvm::ArrayRef<int> select)204 llvm::Value *lowerShuffleVector(llvm::Value *v1, llvm::Value *v2, llvm::ArrayRef<int> select)
205 {
206 	int size = select.size();
207 	const int maxSize = 16;
208 	llvm::Constant *swizzle[maxSize];
209 	ASSERT(size <= maxSize);
210 
211 	for(int i = 0; i < size; i++)
212 	{
213 		swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), select[i]);
214 	}
215 
216 	llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
217 
218 	return jit->builder->CreateShuffleVector(v1, v2, shuffle);
219 }
220 
lowerMulAdd(llvm::Value * x,llvm::Value * y)221 [[maybe_unused]] llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
222 {
223 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
224 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
225 
226 	llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
227 	llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
228 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
229 
230 	llvm::Value *undef = llvm::UndefValue::get(extTy);
231 
232 	llvm::SmallVector<int, 16> evenIdx;
233 	llvm::SmallVector<int, 16> oddIdx;
234 	for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
235 	{
236 		evenIdx.push_back(i);
237 		oddIdx.push_back(i + 1);
238 	}
239 
240 	llvm::Value *lhs = lowerShuffleVector(mult, undef, evenIdx);
241 	llvm::Value *rhs = lowerShuffleVector(mult, undef, oddIdx);
242 	return jit->builder->CreateAdd(lhs, rhs);
243 }
244 
lowerPack(llvm::Value * x,llvm::Value * y,bool isSigned)245 [[maybe_unused]] llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
246 {
247 	llvm::FixedVectorType *srcTy = llvm::cast<llvm::FixedVectorType>(x->getType());
248 	llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
249 
250 	llvm::IntegerType *dstElemTy =
251 	    llvm::cast<llvm::IntegerType>(dstTy->getElementType());
252 
253 	uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
254 	ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
255 	llvm::Constant *max, *min;
256 	if(isSigned)
257 	{
258 		max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
259 		min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
260 	}
261 	else
262 	{
263 		max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
264 		min = llvm::ConstantInt::get(srcTy, 0, false);
265 	}
266 
267 	x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
268 	x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
269 	y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
270 	y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
271 
272 	x = jit->builder->CreateTrunc(x, dstTy);
273 	y = jit->builder->CreateTrunc(y, dstTy);
274 
275 	llvm::SmallVector<int, 16> index(srcTy->getNumElements() * 2);
276 	std::iota(index.begin(), index.end(), 0);
277 
278 	return lowerShuffleVector(x, y, index);
279 }
280 
lowerSignMask(llvm::Value * x,llvm::Type * retTy)281 [[maybe_unused]] llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
282 {
283 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
284 	llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
285 	llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
286 
287 	llvm::Value *ret = jit->builder->CreateZExt(
288 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
289 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
290 	{
291 		llvm::Value *elem = jit->builder->CreateZExt(
292 		    jit->builder->CreateExtractElement(cmp, i), retTy);
293 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
294 	}
295 	return ret;
296 }
297 
lowerFPSignMask(llvm::Value * x,llvm::Type * retTy)298 [[maybe_unused]] llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
299 {
300 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
301 	llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
302 	llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
303 
304 	llvm::Value *ret = jit->builder->CreateZExt(
305 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
306 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
307 	{
308 		llvm::Value *elem = jit->builder->CreateZExt(
309 		    jit->builder->CreateExtractElement(cmp, i), retTy);
310 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
311 	}
312 	return ret;
313 }
314 
lowerPUADDSAT(llvm::Value * x,llvm::Value * y)315 llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
316 {
317 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
318 }
319 
lowerPSADDSAT(llvm::Value * x,llvm::Value * y)320 llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
321 {
322 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
323 }
324 
lowerPUSUBSAT(llvm::Value * x,llvm::Value * y)325 llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
326 {
327 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
328 }
329 
lowerPSSUBSAT(llvm::Value * x,llvm::Value * y)330 llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
331 {
332 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
333 }
334 
lowerMulHigh(llvm::Value * x,llvm::Value * y,bool sext)335 llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
336 {
337 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
338 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
339 
340 	llvm::Value *extX, *extY;
341 	if(sext)
342 	{
343 		extX = jit->builder->CreateSExt(x, extTy);
344 		extY = jit->builder->CreateSExt(y, extTy);
345 	}
346 	else
347 	{
348 		extX = jit->builder->CreateZExt(x, extTy);
349 		extY = jit->builder->CreateZExt(y, extTy);
350 	}
351 
352 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
353 
354 	llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
355 	llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
356 	return jit->builder->CreateTrunc(mulh, ty);
357 }
358 
359 }  // namespace
360 
361 namespace rr {
362 
363 const int SIMD::Width = 4;
364 
backendName()365 std::string Caps::backendName()
366 {
367 	return std::string("LLVM ") + LLVM_VERSION_STRING;
368 }
369 
coroutinesSupported()370 bool Caps::coroutinesSupported()
371 {
372 	return true;
373 }
374 
fmaIsFast()375 bool Caps::fmaIsFast()
376 {
377 	static bool AVX2 = CPUID::supportsAVX2();  // Also checks for FMA support
378 
379 	// If x86 FMA instructions are supported, assume LLVM will emit them instead of making calls to std::fma().
380 	return AVX2;
381 }
382 
// Reactor's abstract Type* values are implemented as LLVM type pointers, with
// one exception: 64-bit vectors are emulated using 128-bit ones. This avoids
// use of MMX in x86 and VFP in ARM, and eliminates the overhead of converting
// them to explicit 128-bit vectors. Because LLVM types are pointers, the
// emulated types can be represented as abstract pointers holding the small
// enum values below.
enum InternalType : uintptr_t
{
	// Emulated types:
	Type_v2i32 = 0,
	Type_v4i16,
	Type_v2i16,
	Type_v8i8,
	Type_v4i8,
	Type_v2f32,

	// Number of emulated types; values below it identify an emulated type.
	EmulatedTypeCount,

	// Returned by asInternalType() to indicate that the abstract Type*
	// should be interpreted as LLVM type pointer:
	Type_LLVM
};
402 
asInternalType(Type * type)403 inline InternalType asInternalType(Type *type)
404 {
405 	InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
406 	return (t < EmulatedTypeCount) ? t : Type_LLVM;
407 }
408 
T(Type * t)409 llvm::Type *T(Type *t)
410 {
411 	// Use 128-bit vectors to implement logically shorter ones.
412 	switch(asInternalType(t))
413 	{
414 	case Type_v2i32: return T(Int4::type());
415 	case Type_v4i16: return T(Short8::type());
416 	case Type_v2i16: return T(Short8::type());
417 	case Type_v8i8: return T(Byte16::type());
418 	case Type_v4i8: return T(Byte16::type());
419 	case Type_v2f32: return T(Float4::type());
420 	case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
421 	default:
422 		UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
423 		return nullptr;
424 	}
425 }
426 
T(InternalType t)427 Type *T(InternalType t)
428 {
429 	return reinterpret_cast<Type *>(t);
430 }
431 
T(const std::vector<Type * > & t)432 inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
433 {
434 	return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
435 }
436 
B(BasicBlock * t)437 inline llvm::BasicBlock *B(BasicBlock *t)
438 {
439 	return reinterpret_cast<llvm::BasicBlock *>(t);
440 }
441 
B(llvm::BasicBlock * t)442 inline BasicBlock *B(llvm::BasicBlock *t)
443 {
444 	return reinterpret_cast<BasicBlock *>(t);
445 }
446 
typeSize(Type * type)447 static size_t typeSize(Type *type)
448 {
449 	switch(asInternalType(type))
450 	{
451 	case Type_v2i32: return 8;
452 	case Type_v4i16: return 8;
453 	case Type_v2i16: return 4;
454 	case Type_v8i8: return 8;
455 	case Type_v4i8: return 4;
456 	case Type_v2f32: return 8;
457 	case Type_LLVM:
458 		{
459 			llvm::Type *t = T(type);
460 
461 			if(t->isPointerTy())
462 			{
463 				return sizeof(void *);
464 			}
465 
466 			// At this point we should only have LLVM 'primitive' types.
467 			unsigned int bits = t->getPrimitiveSizeInBits();
468 			ASSERT_MSG(bits != 0, "bits: %d", int(bits));
469 
470 			// TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
471 			// but are typically stored as one byte. The DataLayout structure should
472 			// be used here and many other places if this assumption fails.
473 			return (bits + 7) / 8;
474 		}
475 		break;
476 	default:
477 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
478 		return 0;
479 	}
480 }
481 
createFunction(const char * name,llvm::Type * retTy,const std::vector<llvm::Type * > & params)482 static llvm::Function *createFunction(const char *name, llvm::Type *retTy, const std::vector<llvm::Type *> ¶ms)
483 {
484 	llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
485 	auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
486 
487 	func->setLinkage(llvm::GlobalValue::ExternalLinkage);
488 	func->setDoesNotThrow();
489 	func->setCallingConv(llvm::CallingConv::C);
490 
491 	if(__has_feature(memory_sanitizer))
492 	{
493 		func->addFnAttr(llvm::Attribute::SanitizeMemory);
494 
495 		// Assume that when using recent versions of LLVM, MemorySanitizer enabled builds
496 		// use -fsanitize-memory-param-retval, which makes the caller not update the shadow
497 		// of function parameters. NoUndef skips generating checks for uninitialized values.
498 #if LLVM_VERSION_MAJOR >= 13
499 		for(unsigned int i = 0; i < params.size(); i++)
500 		{
501 			func->addParamAttr(i, llvm::Attribute::NoUndef);
502 		}
503 #endif
504 	}
505 
506 	if(__has_feature(address_sanitizer))
507 	{
508 		func->addFnAttr(llvm::Attribute::SanitizeAddress);
509 	}
510 
511 	func->addFnAttr("warn-stack-size", "524288");  // Warn when a function uses more than 512 KiB of stack memory
512 
513 	return func;
514 }
515 
Nucleus()516 Nucleus::Nucleus()
517 {
518 #if !__has_feature(memory_sanitizer)
519 	// thread_local variables in shared libraries are initialized at load-time,
520 	// but this is not observed by MemorySanitizer if the loader itself was not
521 	// instrumented, leading to false-positive uninitialized variable errors.
522 	ASSERT(jit == nullptr);
523 	ASSERT(Variable::unmaterializedVariables == nullptr);
524 #endif
525 
526 	jit = new JITBuilder();
527 	Variable::unmaterializedVariables = new Variable::UnmaterializedVariables();
528 }
529 
~Nucleus()530 Nucleus::~Nucleus()
531 {
532 	delete Variable::unmaterializedVariables;
533 	Variable::unmaterializedVariables = nullptr;
534 
535 	delete jit;
536 	jit = nullptr;
537 }
538 
acquireRoutine(const char * name)539 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name)
540 {
541 	if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
542 	{
543 		llvm::Type *type = jit->function->getReturnType();
544 
545 		if(type->isVoidTy())
546 		{
547 			createRetVoid();
548 		}
549 		else
550 		{
551 			createRet(V(llvm::UndefValue::get(type)));
552 		}
553 	}
554 
555 	std::shared_ptr<Routine> routine;
556 
557 	auto acquire = [&](rr::JITBuilder *jit) {
558 	// ::jit is thread-local, so when this is executed on a separate thread (see JIT_IN_SEPARATE_THREAD)
559 	// it needs to only use the jit variable passed in as an argument.
560 
561 #ifdef ENABLE_RR_DEBUG_INFO
562 		if(jit->debugInfo != nullptr)
563 		{
564 			jit->debugInfo->Finalize();
565 		}
566 #endif  // ENABLE_RR_DEBUG_INFO
567 
568 		if(false)
569 		{
570 			std::error_code error;
571 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
572 			jit->module->print(file, 0);
573 		}
574 
575 		jit->runPasses();
576 
577 		if(false)
578 		{
579 			std::error_code error;
580 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
581 			jit->module->print(file, 0);
582 		}
583 
584 		routine = jit->acquireRoutine(name, &jit->function, 1);
585 	};
586 
587 #ifdef JIT_IN_SEPARATE_THREAD
588 	// Perform optimizations and codegen in a separate thread to avoid stack overflow.
589 	// FIXME(b/149829034): This is not a long-term solution. Reactor has no control
590 	// over the threading and stack sizes of its users, so this should be addressed
591 	// at a higher level instead.
592 	std::thread thread(acquire, jit);
593 	thread.join();
594 #else
595 	acquire(jit);
596 #endif
597 
598 	return routine;
599 }
600 
allocateStackVariable(Type * type,int arraySize)601 Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
602 {
603 	// Need to allocate it in the entry block for mem2reg to work
604 	llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
605 
606 	llvm::Instruction *declaration;
607 
608 #if LLVM_VERSION_MAJOR >= 11
609 	auto align = jit->module->getDataLayout().getPrefTypeAlign(T(type));
610 #else
611 	auto align = llvm::MaybeAlign(jit->module->getDataLayout().getPrefTypeAlignment(T(type)));
612 #endif
613 
614 	if(arraySize)
615 	{
616 		Value *size = (sizeof(size_t) == 8) ? Nucleus::createConstantLong(arraySize) : Nucleus::createConstantInt(arraySize);
617 		declaration = new llvm::AllocaInst(T(type), 0, V(size), align);
618 	}
619 	else
620 	{
621 		declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr, align);
622 	}
623 
624 #if LLVM_VERSION_MAJOR >= 16
625 	declaration->insertInto(&entryBlock, entryBlock.begin());
626 #else
627 	entryBlock.getInstList().push_front(declaration);
628 #endif
629 
630 	if(getPragmaState(InitializeLocalVariables))
631 	{
632 		llvm::Type *i8PtrTy = llvm::Type::getInt8Ty(*jit->context)->getPointerTo();
633 		llvm::Type *i32Ty = llvm::Type::getInt32Ty(*jit->context);
634 		llvm::Function *memset = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::memset, { i8PtrTy, i32Ty });
635 
636 		jit->builder->CreateCall(memset, { jit->builder->CreatePointerCast(declaration, i8PtrTy),
637 		                                   V(Nucleus::createConstantByte((unsigned char)0)),
638 		                                   V(Nucleus::createConstantInt((int)typeSize(type) * (arraySize ? arraySize : 1))),
639 		                                   V(Nucleus::createConstantBool(false)) });
640 	}
641 
642 	return V(declaration);
643 }
644 
createBasicBlock()645 BasicBlock *Nucleus::createBasicBlock()
646 {
647 	return B(llvm::BasicBlock::Create(*jit->context, "", jit->function));
648 }
649 
getInsertBlock()650 BasicBlock *Nucleus::getInsertBlock()
651 {
652 	return B(jit->builder->GetInsertBlock());
653 }
654 
setInsertBlock(BasicBlock * basicBlock)655 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
656 {
657 	// assert(jit->builder->GetInsertBlock()->back().isTerminator());
658 
659 	jit->builder->SetInsertPoint(B(basicBlock));
660 }
661 
createFunction(Type * ReturnType,const std::vector<Type * > & Params)662 void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
663 {
664 	jit->function = rr::createFunction("", T(ReturnType), T(Params));
665 
666 #ifdef ENABLE_RR_DEBUG_INFO
667 	jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
668 #endif  // ENABLE_RR_DEBUG_INFO
669 
670 	jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->function));
671 }
672 
getArgument(unsigned int index)673 Value *Nucleus::getArgument(unsigned int index)
674 {
675 	llvm::Function::arg_iterator args = jit->function->arg_begin();
676 
677 	while(index)
678 	{
679 		args++;
680 		index--;
681 	}
682 
683 	return V(&*args);
684 }
685 
createRetVoid()686 void Nucleus::createRetVoid()
687 {
688 	RR_DEBUG_INFO_UPDATE_LOC();
689 
690 	ASSERT_MSG(jit->function->getReturnType() == T(Void::type()), "Return type mismatch");
691 
692 	// Code generated after this point is unreachable, so any variables
693 	// being read can safely return an undefined value. We have to avoid
694 	// materializing variables after the terminator ret instruction.
695 	Variable::killUnmaterialized();
696 
697 	jit->builder->CreateRetVoid();
698 }
699 
createRet(Value * v)700 void Nucleus::createRet(Value *v)
701 {
702 	RR_DEBUG_INFO_UPDATE_LOC();
703 
704 	ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
705 
706 	// Code generated after this point is unreachable, so any variables
707 	// being read can safely return an undefined value. We have to avoid
708 	// materializing variables after the terminator ret instruction.
709 	Variable::killUnmaterialized();
710 
711 	jit->builder->CreateRet(V(v));
712 }
713 
createBr(BasicBlock * dest)714 void Nucleus::createBr(BasicBlock *dest)
715 {
716 	RR_DEBUG_INFO_UPDATE_LOC();
717 	Variable::materializeAll();
718 
719 	jit->builder->CreateBr(B(dest));
720 }
721 
createCondBr(Value * cond,BasicBlock * ifTrue,BasicBlock * ifFalse)722 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
723 {
724 	RR_DEBUG_INFO_UPDATE_LOC();
725 	Variable::materializeAll();
726 	jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
727 }
728 
createAdd(Value * lhs,Value * rhs)729 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
730 {
731 	RR_DEBUG_INFO_UPDATE_LOC();
732 	return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
733 }
734 
createSub(Value * lhs,Value * rhs)735 Value *Nucleus::createSub(Value *lhs, Value *rhs)
736 {
737 	RR_DEBUG_INFO_UPDATE_LOC();
738 	return V(jit->builder->CreateSub(V(lhs), V(rhs)));
739 }
740 
createMul(Value * lhs,Value * rhs)741 Value *Nucleus::createMul(Value *lhs, Value *rhs)
742 {
743 	RR_DEBUG_INFO_UPDATE_LOC();
744 	return V(jit->builder->CreateMul(V(lhs), V(rhs)));
745 }
746 
createUDiv(Value * lhs,Value * rhs)747 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
748 {
749 	RR_DEBUG_INFO_UPDATE_LOC();
750 	return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
751 }
752 
createSDiv(Value * lhs,Value * rhs)753 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
754 {
755 	RR_DEBUG_INFO_UPDATE_LOC();
756 	return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
757 }
758 
createFAdd(Value * lhs,Value * rhs)759 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
760 {
761 	RR_DEBUG_INFO_UPDATE_LOC();
762 	return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
763 }
764 
createFSub(Value * lhs,Value * rhs)765 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
766 {
767 	RR_DEBUG_INFO_UPDATE_LOC();
768 	return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
769 }
770 
createFMul(Value * lhs,Value * rhs)771 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
772 {
773 	RR_DEBUG_INFO_UPDATE_LOC();
774 	return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
775 }
776 
createFDiv(Value * lhs,Value * rhs)777 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
778 {
779 	RR_DEBUG_INFO_UPDATE_LOC();
780 	return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
781 }
782 
createURem(Value * lhs,Value * rhs)783 Value *Nucleus::createURem(Value *lhs, Value *rhs)
784 {
785 	RR_DEBUG_INFO_UPDATE_LOC();
786 	return V(jit->builder->CreateURem(V(lhs), V(rhs)));
787 }
788 
createSRem(Value * lhs,Value * rhs)789 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
790 {
791 	RR_DEBUG_INFO_UPDATE_LOC();
792 	return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
793 }
794 
createFRem(Value * lhs,Value * rhs)795 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
796 {
797 	RR_DEBUG_INFO_UPDATE_LOC();
798 	return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
799 }
800 
operator %(RValue<Float4> lhs,RValue<Float4> rhs)801 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
802 {
803 	return RValue<Float4>(Nucleus::createFRem(lhs.value(), rhs.value()));
804 }
805 
createShl(Value * lhs,Value * rhs)806 Value *Nucleus::createShl(Value *lhs, Value *rhs)
807 {
808 	RR_DEBUG_INFO_UPDATE_LOC();
809 	return V(jit->builder->CreateShl(V(lhs), V(rhs)));
810 }
811 
createLShr(Value * lhs,Value * rhs)812 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
813 {
814 	RR_DEBUG_INFO_UPDATE_LOC();
815 	return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
816 }
817 
createAShr(Value * lhs,Value * rhs)818 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
819 {
820 	RR_DEBUG_INFO_UPDATE_LOC();
821 	return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
822 }
823 
createAnd(Value * lhs,Value * rhs)824 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
825 {
826 	RR_DEBUG_INFO_UPDATE_LOC();
827 	return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
828 }
829 
createOr(Value * lhs,Value * rhs)830 Value *Nucleus::createOr(Value *lhs, Value *rhs)
831 {
832 	RR_DEBUG_INFO_UPDATE_LOC();
833 	return V(jit->builder->CreateOr(V(lhs), V(rhs)));
834 }
835 
createXor(Value * lhs,Value * rhs)836 Value *Nucleus::createXor(Value *lhs, Value *rhs)
837 {
838 	RR_DEBUG_INFO_UPDATE_LOC();
839 	return V(jit->builder->CreateXor(V(lhs), V(rhs)));
840 }
841 
createNeg(Value * v)842 Value *Nucleus::createNeg(Value *v)
843 {
844 	RR_DEBUG_INFO_UPDATE_LOC();
845 	return V(jit->builder->CreateNeg(V(v)));
846 }
847 
createFNeg(Value * v)848 Value *Nucleus::createFNeg(Value *v)
849 {
850 	RR_DEBUG_INFO_UPDATE_LOC();
851 	return V(jit->builder->CreateFNeg(V(v)));
852 }
853 
createNot(Value * v)854 Value *Nucleus::createNot(Value *v)
855 {
856 	RR_DEBUG_INFO_UPDATE_LOC();
857 	return V(jit->builder->CreateNot(V(v)));
858 }
859 
createLoad(Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)860 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
861 {
862 	RR_DEBUG_INFO_UPDATE_LOC();
863 	switch(asInternalType(type))
864 	{
865 	case Type_v2i32:
866 	case Type_v4i16:
867 	case Type_v8i8:
868 	case Type_v2f32:
869 		return createBitCast(
870 		    createInsertElement(
871 		        V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false))),
872 		        createLoad(createBitCast(ptr, Pointer<Long>::type()), Long::type(), isVolatile, alignment, atomic, memoryOrder),
873 		        0),
874 		    type);
875 	case Type_v2i16:
876 	case Type_v4i8:
877 		if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
878 		{
879 			Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false)));
880 			Value *i = createLoad(createBitCast(ptr, Pointer<Int>::type()), Int::type(), isVolatile, alignment, atomic, memoryOrder);
881 			i = createZExt(i, Long::type());
882 			Value *v = createInsertElement(u, i, 0);
883 			return createBitCast(v, type);
884 		}
885 		// Fallthrough to non-emulated case.
886 	case Type_LLVM:
887 		{
888 			auto elTy = T(type);
889 
890 			if(!atomic)
891 			{
892 				return V(jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile));
893 			}
894 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
895 			{
896 				// Integers and pointers can be atomically loaded by setting
897 				// the ordering constraint on the load instruction.
898 				auto load = jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile);
899 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
900 				return V(load);
901 			}
902 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
903 			{
904 				// LLVM claims to support atomic loads of float types as
905 				// above, but certain backends cannot deal with this.
906 				// Load as an integer and bitcast. See b/136037244.
907 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
908 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
909 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
910 				auto load = jit->builder->CreateAlignedLoad(elAsIntTy, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
911 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
912 				auto loadCast = jit->builder->CreateBitCast(load, elTy);
913 				return V(loadCast);
914 			}
915 			else
916 			{
917 				// More exotic types require falling back to the extern:
918 				// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
919 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
920 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
921 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
922 				auto i8PtrTy = i8Ty->getPointerTo();
923 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
924 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
925 				auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
926 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
927 				auto out = allocateStackVariable(type);
928 				jit->builder->CreateCall(func, {
929 				                                   llvm::ConstantInt::get(sizetTy, size),
930 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
931 				                                   jit->builder->CreatePointerCast(V(out), i8PtrTy),
932 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
933 				                               });
934 				return V(jit->builder->CreateLoad(T(type), V(out)));
935 			}
936 		}
937 	default:
938 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
939 		return nullptr;
940 	}
941 }
942 
createStore(Value * value,Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)943 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
944 {
945 	RR_DEBUG_INFO_UPDATE_LOC();
946 	switch(asInternalType(type))
947 	{
948 	case Type_v2i32:
949 	case Type_v4i16:
950 	case Type_v8i8:
951 	case Type_v2f32:
952 		createStore(
953 		    createExtractElement(
954 		        createBitCast(value, T(llvm::VectorType::get(T(Long::type()), 2, false))), Long::type(), 0),
955 		    createBitCast(ptr, Pointer<Long>::type()),
956 		    Long::type(), isVolatile, alignment, atomic, memoryOrder);
957 		return value;
958 	case Type_v2i16:
959 	case Type_v4i8:
960 		if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
961 		{
962 			createStore(
963 			    createExtractElement(createBitCast(value, Int4::type()), Int::type(), 0),
964 			    createBitCast(ptr, Pointer<Int>::type()),
965 			    Int::type(), isVolatile, alignment, atomic, memoryOrder);
966 			return value;
967 		}
968 		// Fallthrough to non-emulated case.
969 	case Type_LLVM:
970 		{
971 			auto elTy = T(type);
972 
973 			if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
974 			{
975 				// Mark all memory writes as initialized by calling __msan_unpoison
976 				// void __msan_unpoison(const volatile void *a, size_t size)
977 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
978 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
979 				auto voidPtrTy = i8Ty->getPointerTo();
980 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
981 				auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
982 				auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
983 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
984 
985 				jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
986 				                                 llvm::ConstantInt::get(sizetTy, size) });
987 			}
988 
989 			if(!atomic)
990 			{
991 				jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
992 			}
993 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
994 			{
995 				// Integers and pointers can be atomically stored by setting
996 				// the ordering constraint on the store instruction.
997 				auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
998 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
999 			}
1000 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
1001 			{
1002 				// LLVM claims to support atomic stores of float types as
1003 				// above, but certain backends cannot deal with this.
1004 				// Store as an bitcast integer. See b/136037244.
1005 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1006 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
1007 				auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
1008 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
1009 				auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
1010 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1011 			}
1012 			else
1013 			{
1014 				// More exotic types require falling back to the extern:
1015 				// void __atomic_store(size_t size, void *ptr, void *val, int ordering)
1016 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1017 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
1018 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1019 				auto i8PtrTy = i8Ty->getPointerTo();
1020 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
1021 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1022 				auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
1023 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1024 				auto copy = allocateStackVariable(type);
1025 				jit->builder->CreateStore(V(value), V(copy));
1026 				jit->builder->CreateCall(func, {
1027 				                                   llvm::ConstantInt::get(sizetTy, size),
1028 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1029 				                                   jit->builder->CreatePointerCast(V(copy), i8PtrTy),
1030 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1031 				                               });
1032 			}
1033 
1034 			return value;
1035 		}
1036 	default:
1037 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1038 		return nullptr;
1039 	}
1040 }
1041 
createMaskedLoad(Value * ptr,Type * elTy,Value * mask,unsigned int alignment,bool zeroMaskedLanes)1042 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1043 {
1044 	RR_DEBUG_INFO_UPDATE_LOC();
1045 
1046 	ASSERT(V(ptr)->getType()->isPointerTy());
1047 	ASSERT(V(mask)->getType()->isVectorTy());
1048 
1049 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1050 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1051 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1052 	auto elVecTy = llvm::VectorType::get(T(elTy), numEls, false);
1053 	auto elVecPtrTy = elVecTy->getPointerTo();
1054 	auto i8Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1055 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1056 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1057 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1058 	return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1059 }
1060 
createMaskedStore(Value * ptr,Value * val,Value * mask,unsigned int alignment)1061 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
1062 {
1063 	RR_DEBUG_INFO_UPDATE_LOC();
1064 
1065 	ASSERT(V(ptr)->getType()->isPointerTy());
1066 	ASSERT(V(val)->getType()->isVectorTy());
1067 	ASSERT(V(mask)->getType()->isVectorTy());
1068 
1069 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1070 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1071 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1072 	auto elVecTy = V(val)->getType();
1073 	auto elVecPtrTy = elVecTy->getPointerTo();
1074 	auto i1Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1075 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1076 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
1077 	jit->builder->CreateCall(func, { V(val), V(ptr), align, i1Mask });
1078 
1079 	if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
1080 	{
1081 		// Mark memory writes as initialized by calling __msan_unpoison
1082 		// void __msan_unpoison(const volatile void *a, size_t size)
1083 		auto voidTy = llvm::Type::getVoidTy(*jit->context);
1084 		auto voidPtrTy = voidTy->getPointerTo();
1085 		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1086 		auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1087 		auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1088 		auto size = jit->module->getDataLayout().getTypeStoreSize(llvm::cast<llvm::VectorType>(elVecTy)->getElementType());
1089 
1090 		for(unsigned i = 0; i < numEls; i++)
1091 		{
1092 			// Check mask for this element
1093 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1094 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1095 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1096 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1097 			jit->builder->SetInsertPoint(thenBlock);
1098 
1099 			// Insert __msan_unpoison call in conditional block
1100 			auto elPtr = jit->builder->CreateGEP(elVecTy, V(ptr), idx);
1101 			jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(elPtr, voidPtrTy),
1102 			                                 llvm::ConstantInt::get(sizetTy, size) });
1103 
1104 			jit->builder->CreateBr(mergeBlock);
1105 			jit->builder->SetInsertPoint(mergeBlock);
1106 		}
1107 	}
1108 }
1109 
createGather(llvm::Value * base,llvm::Type * elTy,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment,bool zeroMaskedLanes)1110 static llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1111 {
1112 	ASSERT(base->getType()->isPointerTy());
1113 	ASSERT(offsets->getType()->isVectorTy());
1114 	ASSERT(mask->getType()->isVectorTy());
1115 
1116 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1117 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1118 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1119 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1120 	auto i8PtrTy = i8Ty->getPointerTo();
1121 	auto elPtrTy = elTy->getPointerTo();
1122 	auto elVecTy = llvm::VectorType::get(elTy, numEls, false);
1123 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1124 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1125 	auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1126 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1127 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1128 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1129 
1130 	if(!__has_feature(memory_sanitizer))
1131 	{
1132 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1133 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
1134 		return jit->builder->CreateCall(func, { elPtrs, align, i1Mask, passthrough });
1135 	}
1136 	else  // __has_feature(memory_sanitizer)
1137 	{
1138 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_gather
1139 		// Work around it by emulating gather with element-wise loads.
1140 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1141 
1142 		Value *result = Nucleus::allocateStackVariable(T(elVecTy));
1143 		Nucleus::createStore(V(passthrough), result, T(elVecTy));
1144 
1145 		for(unsigned i = 0; i < numEls; i++)
1146 		{
1147 			// Check mask for this element
1148 			Value *elementMask = Nucleus::createExtractElement(V(i1Mask), T(i1Ty), i);
1149 
1150 			If(RValue<Bool>(elementMask))
1151 			{
1152 				Value *elPtr = Nucleus::createExtractElement(V(elPtrs), T(elPtrTy), i);
1153 				Value *el = Nucleus::createLoad(elPtr, T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1154 
1155 				Value *v = Nucleus::createLoad(result, T(elVecTy));
1156 				v = Nucleus::createInsertElement(v, el, i);
1157 				Nucleus::createStore(v, result, T(elVecTy));
1158 			}
1159 		}
1160 
1161 		return V(Nucleus::createLoad(result, T(elVecTy)));
1162 	}
1163 }
1164 
Gather(RValue<Pointer<Float>> base,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment,bool zeroMaskedLanes)1165 RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1166 {
1167 	return As<SIMD::Float>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1168 }
1169 
Gather(RValue<Pointer<Int>> base,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment,bool zeroMaskedLanes)1170 RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1171 {
1172 	return As<SIMD::Int>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1173 }
1174 
createScatter(llvm::Value * base,llvm::Value * val,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment)1175 static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
1176 {
1177 	ASSERT(base->getType()->isPointerTy());
1178 	ASSERT(val->getType()->isVectorTy());
1179 	ASSERT(offsets->getType()->isVectorTy());
1180 	ASSERT(mask->getType()->isVectorTy());
1181 
1182 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1183 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1184 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1185 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1186 	auto i8PtrTy = i8Ty->getPointerTo();
1187 	auto elVecTy = val->getType();
1188 	auto elTy = llvm::cast<llvm::VectorType>(elVecTy)->getElementType();
1189 	auto elPtrTy = elTy->getPointerTo();
1190 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1191 
1192 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1193 	auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1194 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1195 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1196 
1197 	if(!__has_feature(memory_sanitizer))
1198 	{
1199 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1200 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
1201 		jit->builder->CreateCall(func, { val, elPtrs, align, i1Mask });
1202 	}
1203 	else  // __has_feature(memory_sanitizer)
1204 	{
1205 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_scatter
1206 		// Work around it by emulating scatter with element-wise stores.
1207 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1208 
1209 		for(unsigned i = 0; i < numEls; i++)
1210 		{
1211 			// Check mask for this element
1212 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1213 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1214 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1215 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1216 			jit->builder->SetInsertPoint(thenBlock);
1217 
1218 			auto el = jit->builder->CreateExtractElement(val, idx);
1219 			auto elPtr = jit->builder->CreateExtractElement(elPtrs, idx);
1220 			Nucleus::createStore(V(el), V(elPtr), T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1221 
1222 			jit->builder->CreateBr(mergeBlock);
1223 			jit->builder->SetInsertPoint(mergeBlock);
1224 		}
1225 	}
1226 }
1227 
Scatter(RValue<Pointer<Float>> base,RValue<SIMD::Float> val,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment)1228 void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
1229 {
1230 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1231 }
1232 
Scatter(RValue<Pointer<Int>> base,RValue<SIMD::Int> val,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment)1233 void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
1234 {
1235 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1236 }
1237 
createFence(std::memory_order memoryOrder)1238 void Nucleus::createFence(std::memory_order memoryOrder)
1239 {
1240 	RR_DEBUG_INFO_UPDATE_LOC();
1241 	jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1242 }
1243 
createGEP(Value * ptr,Type * type,Value * index,bool unsignedIndex)1244 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1245 {
1246 	RR_DEBUG_INFO_UPDATE_LOC();
1247 
1248 	if(sizeof(void *) == 8)
1249 	{
1250 		// LLVM manual: "When indexing into an array, pointer or vector,
1251 		// integers of any width are allowed, and they are not required to
1252 		// be constant. These integers are treated as signed values where
1253 		// relevant."
1254 		//
1255 		// Thus if we want indexes to be treated as unsigned we have to
1256 		// zero-extend them ourselves.
1257 		//
1258 		// Note that this is not because we want to address anywhere near
1259 		// 4 GB of data. Instead this is important for performance because
1260 		// x86 supports automatic zero-extending of 32-bit registers to
1261 		// 64-bit. Thus when indexing into an array using a uint32 is
1262 		// actually faster than an int32.
1263 		index = unsignedIndex ? createZExt(index, Long::type()) : createSExt(index, Long::type());
1264 	}
1265 
1266 	// For non-emulated types we can rely on LLVM's GEP to calculate the
1267 	// effective address correctly.
1268 	if(asInternalType(type) == Type_LLVM)
1269 	{
1270 		return V(jit->builder->CreateGEP(T(type), V(ptr), V(index)));
1271 	}
1272 
1273 	// For emulated types we have to multiply the index by the intended
1274 	// type size ourselves to obain the byte offset.
1275 	index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));
1276 
1277 	// Cast to a byte pointer, apply the byte offset, and cast back to the
1278 	// original pointer type.
1279 	return createBitCast(
1280 	    V(jit->builder->CreateGEP(T(Byte::type()), V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::type()), 0)))), V(index))),
1281 	    T(llvm::PointerType::get(T(type), 0)));
1282 }
1283 
createAtomicAdd(Value * ptr,Value * value,std::memory_order memoryOrder)1284 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1285 {
1286 	RR_DEBUG_INFO_UPDATE_LOC();
1287 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value),
1288 #if LLVM_VERSION_MAJOR >= 11
1289 	                                       llvm::MaybeAlign(),
1290 #endif
1291 	                                       atomicOrdering(true, memoryOrder)));
1292 }
1293 
createAtomicSub(Value * ptr,Value * value,std::memory_order memoryOrder)1294 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1295 {
1296 	RR_DEBUG_INFO_UPDATE_LOC();
1297 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value),
1298 #if LLVM_VERSION_MAJOR >= 11
1299 	                                       llvm::MaybeAlign(),
1300 #endif
1301 	                                       atomicOrdering(true, memoryOrder)));
1302 }
1303 
createAtomicAnd(Value * ptr,Value * value,std::memory_order memoryOrder)1304 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1305 {
1306 	RR_DEBUG_INFO_UPDATE_LOC();
1307 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value),
1308 #if LLVM_VERSION_MAJOR >= 11
1309 	                                       llvm::MaybeAlign(),
1310 #endif
1311 	                                       atomicOrdering(true, memoryOrder)));
1312 }
1313 
createAtomicOr(Value * ptr,Value * value,std::memory_order memoryOrder)1314 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1315 {
1316 	RR_DEBUG_INFO_UPDATE_LOC();
1317 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value),
1318 #if LLVM_VERSION_MAJOR >= 11
1319 	                                       llvm::MaybeAlign(),
1320 #endif
1321 	                                       atomicOrdering(true, memoryOrder)));
1322 }
1323 
createAtomicXor(Value * ptr,Value * value,std::memory_order memoryOrder)1324 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1325 {
1326 	RR_DEBUG_INFO_UPDATE_LOC();
1327 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value),
1328 #if LLVM_VERSION_MAJOR >= 11
1329 	                                       llvm::MaybeAlign(),
1330 #endif
1331 	                                       atomicOrdering(true, memoryOrder)));
1332 }
1333 
createAtomicMin(Value * ptr,Value * value,std::memory_order memoryOrder)1334 Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1335 {
1336 	RR_DEBUG_INFO_UPDATE_LOC();
1337 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value),
1338 #if LLVM_VERSION_MAJOR >= 11
1339 	                                       llvm::MaybeAlign(),
1340 #endif
1341 	                                       atomicOrdering(true, memoryOrder)));
1342 }
1343 
createAtomicMax(Value * ptr,Value * value,std::memory_order memoryOrder)1344 Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1345 {
1346 	RR_DEBUG_INFO_UPDATE_LOC();
1347 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value),
1348 #if LLVM_VERSION_MAJOR >= 11
1349 	                                       llvm::MaybeAlign(),
1350 #endif
1351 	                                       atomicOrdering(true, memoryOrder)));
1352 }
1353 
createAtomicUMin(Value * ptr,Value * value,std::memory_order memoryOrder)1354 Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1355 {
1356 	RR_DEBUG_INFO_UPDATE_LOC();
1357 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value),
1358 #if LLVM_VERSION_MAJOR >= 11
1359 	                                       llvm::MaybeAlign(),
1360 #endif
1361 	                                       atomicOrdering(true, memoryOrder)));
1362 }
1363 
createAtomicUMax(Value * ptr,Value * value,std::memory_order memoryOrder)1364 Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1365 {
1366 	RR_DEBUG_INFO_UPDATE_LOC();
1367 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value),
1368 #if LLVM_VERSION_MAJOR >= 11
1369 	                                       llvm::MaybeAlign(),
1370 #endif
1371 	                                       atomicOrdering(true, memoryOrder)));
1372 }
1373 
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1374 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1375 {
1376 	RR_DEBUG_INFO_UPDATE_LOC();
1377 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value),
1378 #if LLVM_VERSION_MAJOR >= 11
1379 	                                       llvm::MaybeAlign(),
1380 #endif
1381 	                                       atomicOrdering(true, memoryOrder)));
1382 }
1383 
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1384 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1385 {
1386 	RR_DEBUG_INFO_UPDATE_LOC();
1387 	// Note: AtomicCmpXchgInstruction returns a 2-member struct containing {result, success-flag}, not the result directly.
1388 	return V(jit->builder->CreateExtractValue(
1389 	    jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value),
1390 #if LLVM_VERSION_MAJOR >= 11
1391 	                                      llvm::MaybeAlign(),
1392 #endif
1393 	                                      atomicOrdering(true, memoryOrderEqual),
1394 	                                      atomicOrdering(true, memoryOrderUnequal)),
1395 	    llvm::ArrayRef<unsigned>(0u)));
1396 }
1397 
createTrunc(Value * v,Type * destType)1398 Value *Nucleus::createTrunc(Value *v, Type *destType)
1399 {
1400 	RR_DEBUG_INFO_UPDATE_LOC();
1401 	return V(jit->builder->CreateTrunc(V(v), T(destType)));
1402 }
1403 
createZExt(Value * v,Type * destType)1404 Value *Nucleus::createZExt(Value *v, Type *destType)
1405 {
1406 	RR_DEBUG_INFO_UPDATE_LOC();
1407 	return V(jit->builder->CreateZExt(V(v), T(destType)));
1408 }
1409 
createSExt(Value * v,Type * destType)1410 Value *Nucleus::createSExt(Value *v, Type *destType)
1411 {
1412 	RR_DEBUG_INFO_UPDATE_LOC();
1413 	return V(jit->builder->CreateSExt(V(v), T(destType)));
1414 }
1415 
createFPToUI(Value * v,Type * destType)1416 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1417 {
1418 	RR_DEBUG_INFO_UPDATE_LOC();
1419 	return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1420 }
1421 
createFPToSI(Value * v,Type * destType)1422 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1423 {
1424 	RR_DEBUG_INFO_UPDATE_LOC();
1425 	return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1426 }
1427 
createSIToFP(Value * v,Type * destType)1428 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1429 {
1430 	RR_DEBUG_INFO_UPDATE_LOC();
1431 	return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1432 }
1433 
createFPTrunc(Value * v,Type * destType)1434 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1435 {
1436 	RR_DEBUG_INFO_UPDATE_LOC();
1437 	return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1438 }
1439 
createFPExt(Value * v,Type * destType)1440 Value *Nucleus::createFPExt(Value *v, Type *destType)
1441 {
1442 	RR_DEBUG_INFO_UPDATE_LOC();
1443 	return V(jit->builder->CreateFPExt(V(v), T(destType)));
1444 }
1445 
createBitCast(Value * v,Type * destType)1446 Value *Nucleus::createBitCast(Value *v, Type *destType)
1447 {
1448 	RR_DEBUG_INFO_UPDATE_LOC();
1449 	// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1450 	// support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1451 	// reading back as the destination type.
1452 	if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1453 	{
1454 		Value *readAddress = allocateStackVariable(destType);
1455 		Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1456 		createStore(v, writeAddress, T(V(v)->getType()));
1457 		return createLoad(readAddress, destType);
1458 	}
1459 	else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1460 	{
1461 		Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1462 		createStore(v, writeAddress, T(V(v)->getType()));
1463 		Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1464 		return createLoad(readAddress, destType);
1465 	}
1466 
1467 	return V(jit->builder->CreateBitCast(V(v), T(destType)));
1468 }
1469 
createICmpEQ(Value * lhs,Value * rhs)1470 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1471 {
1472 	RR_DEBUG_INFO_UPDATE_LOC();
1473 	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1474 }
1475 
createICmpNE(Value * lhs,Value * rhs)1476 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1477 {
1478 	RR_DEBUG_INFO_UPDATE_LOC();
1479 	return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1480 }
1481 
createICmpUGT(Value * lhs,Value * rhs)1482 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1483 {
1484 	RR_DEBUG_INFO_UPDATE_LOC();
1485 	return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1486 }
1487 
createICmpUGE(Value * lhs,Value * rhs)1488 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1489 {
1490 	RR_DEBUG_INFO_UPDATE_LOC();
1491 	return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1492 }
1493 
createICmpULT(Value * lhs,Value * rhs)1494 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1495 {
1496 	RR_DEBUG_INFO_UPDATE_LOC();
1497 	return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1498 }
1499 
createICmpULE(Value * lhs,Value * rhs)1500 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1501 {
1502 	RR_DEBUG_INFO_UPDATE_LOC();
1503 	return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1504 }
1505 
createICmpSGT(Value * lhs,Value * rhs)1506 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1507 {
1508 	RR_DEBUG_INFO_UPDATE_LOC();
1509 	return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1510 }
1511 
createICmpSGE(Value * lhs,Value * rhs)1512 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1513 {
1514 	RR_DEBUG_INFO_UPDATE_LOC();
1515 	return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1516 }
1517 
createICmpSLT(Value * lhs,Value * rhs)1518 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1519 {
1520 	RR_DEBUG_INFO_UPDATE_LOC();
1521 	return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1522 }
1523 
createICmpSLE(Value * lhs,Value * rhs)1524 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1525 {
1526 	RR_DEBUG_INFO_UPDATE_LOC();
1527 	return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1528 }
1529 
createFCmpOEQ(Value * lhs,Value * rhs)1530 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1531 {
1532 	RR_DEBUG_INFO_UPDATE_LOC();
1533 	return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1534 }
1535 
createFCmpOGT(Value * lhs,Value * rhs)1536 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1537 {
1538 	RR_DEBUG_INFO_UPDATE_LOC();
1539 	return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1540 }
1541 
createFCmpOGE(Value * lhs,Value * rhs)1542 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1543 {
1544 	RR_DEBUG_INFO_UPDATE_LOC();
1545 	return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1546 }
1547 
createFCmpOLT(Value * lhs,Value * rhs)1548 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1549 {
1550 	RR_DEBUG_INFO_UPDATE_LOC();
1551 	return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1552 }
1553 
createFCmpOLE(Value * lhs,Value * rhs)1554 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1555 {
1556 	RR_DEBUG_INFO_UPDATE_LOC();
1557 	return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1558 }
1559 
createFCmpONE(Value * lhs,Value * rhs)1560 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1561 {
1562 	RR_DEBUG_INFO_UPDATE_LOC();
1563 	return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1564 }
1565 
createFCmpORD(Value * lhs,Value * rhs)1566 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1567 {
1568 	RR_DEBUG_INFO_UPDATE_LOC();
1569 	return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1570 }
1571 
createFCmpUNO(Value * lhs,Value * rhs)1572 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1573 {
1574 	RR_DEBUG_INFO_UPDATE_LOC();
1575 	return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1576 }
1577 
createFCmpUEQ(Value * lhs,Value * rhs)1578 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1579 {
1580 	RR_DEBUG_INFO_UPDATE_LOC();
1581 	return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1582 }
1583 
createFCmpUGT(Value * lhs,Value * rhs)1584 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1585 {
1586 	RR_DEBUG_INFO_UPDATE_LOC();
1587 	return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1588 }
1589 
createFCmpUGE(Value * lhs,Value * rhs)1590 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1591 {
1592 	RR_DEBUG_INFO_UPDATE_LOC();
1593 	return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1594 }
1595 
createFCmpULT(Value * lhs,Value * rhs)1596 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1597 {
1598 	RR_DEBUG_INFO_UPDATE_LOC();
1599 	return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1600 }
1601 
createFCmpULE(Value * lhs,Value * rhs)1602 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1603 {
1604 	RR_DEBUG_INFO_UPDATE_LOC();
1605 	return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1606 }
1607 
createFCmpUNE(Value * lhs,Value * rhs)1608 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1609 {
1610 	RR_DEBUG_INFO_UPDATE_LOC();
1611 	return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1612 }
1613 
createExtractElement(Value * vector,Type * type,int index)1614 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1615 {
1616 	RR_DEBUG_INFO_UPDATE_LOC();
1617 	ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
1618 	return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1619 }
1620 
createInsertElement(Value * vector,Value * element,int index)1621 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1622 {
1623 	RR_DEBUG_INFO_UPDATE_LOC();
1624 	return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1625 }
1626 
createShuffleVector(Value * v1,Value * v2,std::vector<int> select)1627 Value *Nucleus::createShuffleVector(Value *v1, Value *v2, std::vector<int> select)
1628 {
1629 	RR_DEBUG_INFO_UPDATE_LOC();
1630 
1631 	size_t size = llvm::cast<llvm::FixedVectorType>(V(v1)->getType())->getNumElements();
1632 	ASSERT(size == llvm::cast<llvm::FixedVectorType>(V(v2)->getType())->getNumElements());
1633 
1634 	llvm::SmallVector<int, 16> mask;
1635 	const size_t selectSize = select.size();
1636 	for(size_t i = 0; i < size; i++)
1637 	{
1638 		mask.push_back(select[i % selectSize]);
1639 	}
1640 
1641 	return V(lowerShuffleVector(V(v1), V(v2), mask));
1642 }
1643 
createSelect(Value * c,Value * ifTrue,Value * ifFalse)1644 Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1645 {
1646 	RR_DEBUG_INFO_UPDATE_LOC();
1647 	return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1648 }
1649 
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1650 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1651 {
1652 	RR_DEBUG_INFO_UPDATE_LOC();
1653 	return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1654 }
1655 
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1656 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1657 {
1658 	RR_DEBUG_INFO_UPDATE_LOC();
1659 	llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1660 	sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), label, true), B(branch));
1661 }
1662 
createUnreachable()1663 void Nucleus::createUnreachable()
1664 {
1665 	RR_DEBUG_INFO_UPDATE_LOC();
1666 	jit->builder->CreateUnreachable();
1667 }
1668 
getType(Value * value)1669 Type *Nucleus::getType(Value *value)
1670 {
1671 	return T(V(value)->getType());
1672 }
1673 
getContainedType(Type * vectorType)1674 Type *Nucleus::getContainedType(Type *vectorType)
1675 {
1676 	return T(T(vectorType)->getContainedType(0));
1677 }
1678 
getPointerType(Type * ElementType)1679 Type *Nucleus::getPointerType(Type *ElementType)
1680 {
1681 	return T(llvm::PointerType::get(T(ElementType), 0));
1682 }
1683 
getNaturalIntType()1684 static llvm::Type *getNaturalIntType()
1685 {
1686 	return llvm::Type::getIntNTy(*jit->context, sizeof(int) * 8);
1687 }
1688 
getPrintfStorageType(Type * valueType)1689 Type *Nucleus::getPrintfStorageType(Type *valueType)
1690 {
1691 	llvm::Type *valueTy = T(valueType);
1692 	if(valueTy->isIntegerTy())
1693 	{
1694 		return T(getNaturalIntType());
1695 	}
1696 	if(valueTy->isFloatTy())
1697 	{
1698 		return T(llvm::Type::getDoubleTy(*jit->context));
1699 	}
1700 
1701 	UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
1702 	return {};
1703 }
1704 
createNullValue(Type * Ty)1705 Value *Nucleus::createNullValue(Type *Ty)
1706 {
1707 	RR_DEBUG_INFO_UPDATE_LOC();
1708 	return V(llvm::Constant::getNullValue(T(Ty)));
1709 }
1710 
createConstantLong(int64_t i)1711 Value *Nucleus::createConstantLong(int64_t i)
1712 {
1713 	RR_DEBUG_INFO_UPDATE_LOC();
1714 	return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), i, true));
1715 }
1716 
createConstantInt(int i)1717 Value *Nucleus::createConstantInt(int i)
1718 {
1719 	RR_DEBUG_INFO_UPDATE_LOC();
1720 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, true));
1721 }
1722 
createConstantInt(unsigned int i)1723 Value *Nucleus::createConstantInt(unsigned int i)
1724 {
1725 	RR_DEBUG_INFO_UPDATE_LOC();
1726 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, false));
1727 }
1728 
createConstantBool(bool b)1729 Value *Nucleus::createConstantBool(bool b)
1730 {
1731 	RR_DEBUG_INFO_UPDATE_LOC();
1732 	return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*jit->context), b));
1733 }
1734 
createConstantByte(signed char i)1735 Value *Nucleus::createConstantByte(signed char i)
1736 {
1737 	RR_DEBUG_INFO_UPDATE_LOC();
1738 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, true));
1739 }
1740 
createConstantByte(unsigned char i)1741 Value *Nucleus::createConstantByte(unsigned char i)
1742 {
1743 	RR_DEBUG_INFO_UPDATE_LOC();
1744 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, false));
1745 }
1746 
createConstantShort(short i)1747 Value *Nucleus::createConstantShort(short i)
1748 {
1749 	RR_DEBUG_INFO_UPDATE_LOC();
1750 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, true));
1751 }
1752 
createConstantShort(unsigned short i)1753 Value *Nucleus::createConstantShort(unsigned short i)
1754 {
1755 	RR_DEBUG_INFO_UPDATE_LOC();
1756 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, false));
1757 }
1758 
createConstantFloat(float x)1759 Value *Nucleus::createConstantFloat(float x)
1760 {
1761 	RR_DEBUG_INFO_UPDATE_LOC();
1762 	return V(llvm::ConstantFP::get(T(Float::type()), x));
1763 }
1764 
createNullPointer(Type * Ty)1765 Value *Nucleus::createNullPointer(Type *Ty)
1766 {
1767 	RR_DEBUG_INFO_UPDATE_LOC();
1768 	return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1769 }
1770 
createConstantVector(std::vector<int64_t> constants,Type * type)1771 Value *Nucleus::createConstantVector(std::vector<int64_t> constants, Type *type)
1772 {
1773 	RR_DEBUG_INFO_UPDATE_LOC();
1774 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1775 	const size_t numConstants = constants.size();                                             // Number of provided constants for the (emulated) type.
1776 	const size_t numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1777 	llvm::SmallVector<llvm::Constant *, 16> constantVector;
1778 
1779 	for(size_t i = 0; i < numElements; i++)
1780 	{
1781 		constantVector.push_back(llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]));
1782 	}
1783 
1784 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector)));
1785 }
1786 
createConstantVector(std::vector<double> constants,Type * type)1787 Value *Nucleus::createConstantVector(std::vector<double> constants, Type *type)
1788 {
1789 	RR_DEBUG_INFO_UPDATE_LOC();
1790 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1791 	const size_t numConstants = constants.size();                                             // Number of provided constants for the (emulated) type.
1792 	const size_t numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1793 	llvm::SmallVector<llvm::Constant *, 16> constantVector;
1794 
1795 	for(size_t i = 0; i < numElements; i++)
1796 	{
1797 		constantVector.push_back(llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]));
1798 	}
1799 
1800 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector)));
1801 }
1802 
createConstantString(const char * v)1803 Value *Nucleus::createConstantString(const char *v)
1804 {
1805 	// NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
1806 	auto ptr = jit->builder->CreateGlobalStringPtr(v);
1807 	return V(ptr);
1808 }
1809 
setOptimizerCallback(OptimizerCallback * callback)1810 void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
1811 {
1812 	// The LLVM backend does not produce optimizer reports.
1813 	(void)callback;
1814 }
1815 
type()1816 Type *Void::type()
1817 {
1818 	return T(llvm::Type::getVoidTy(*jit->context));
1819 }
1820 
type()1821 Type *Bool::type()
1822 {
1823 	return T(llvm::Type::getInt1Ty(*jit->context));
1824 }
1825 
type()1826 Type *Byte::type()
1827 {
1828 	return T(llvm::Type::getInt8Ty(*jit->context));
1829 }
1830 
type()1831 Type *SByte::type()
1832 {
1833 	return T(llvm::Type::getInt8Ty(*jit->context));
1834 }
1835 
type()1836 Type *Short::type()
1837 {
1838 	return T(llvm::Type::getInt16Ty(*jit->context));
1839 }
1840 
type()1841 Type *UShort::type()
1842 {
1843 	return T(llvm::Type::getInt16Ty(*jit->context));
1844 }
1845 
type()1846 Type *Byte4::type()
1847 {
1848 	return T(Type_v4i8);
1849 }
1850 
type()1851 Type *SByte4::type()
1852 {
1853 	return T(Type_v4i8);
1854 }
1855 
AddSat(RValue<Byte8> x,RValue<Byte8> y)1856 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1857 {
1858 	RR_DEBUG_INFO_UPDATE_LOC();
1859 #if defined(__i386__) || defined(__x86_64__)
1860 	return x86::paddusb(x, y);
1861 #else
1862 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
1863 #endif
1864 }
1865 
SubSat(RValue<Byte8> x,RValue<Byte8> y)1866 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1867 {
1868 	RR_DEBUG_INFO_UPDATE_LOC();
1869 #if defined(__i386__) || defined(__x86_64__)
1870 	return x86::psubusb(x, y);
1871 #else
1872 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
1873 #endif
1874 }
1875 
SignMask(RValue<Byte8> x)1876 RValue<Int> SignMask(RValue<Byte8> x)
1877 {
1878 	RR_DEBUG_INFO_UPDATE_LOC();
1879 #if defined(__i386__) || defined(__x86_64__)
1880 	return x86::pmovmskb(x);
1881 #else
1882 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1883 #endif
1884 }
1885 
1886 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1887 //	{
1888 //#if defined(__i386__) || defined(__x86_64__)
1889 //		return x86::pcmpgtb(x, y);   // FIXME: Signedness
1890 //#else
1891 //		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1892 //#endif
1893 //	}
1894 
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)1895 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1896 {
1897 	RR_DEBUG_INFO_UPDATE_LOC();
1898 #if defined(__i386__) || defined(__x86_64__)
1899 	return x86::pcmpeqb(x, y);
1900 #else
1901 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1902 #endif
1903 }
1904 
type()1905 Type *Byte8::type()
1906 {
1907 	return T(Type_v8i8);
1908 }
1909 
AddSat(RValue<SByte8> x,RValue<SByte8> y)1910 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1911 {
1912 	RR_DEBUG_INFO_UPDATE_LOC();
1913 #if defined(__i386__) || defined(__x86_64__)
1914 	return x86::paddsb(x, y);
1915 #else
1916 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
1917 #endif
1918 }
1919 
SubSat(RValue<SByte8> x,RValue<SByte8> y)1920 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1921 {
1922 	RR_DEBUG_INFO_UPDATE_LOC();
1923 #if defined(__i386__) || defined(__x86_64__)
1924 	return x86::psubsb(x, y);
1925 #else
1926 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
1927 #endif
1928 }
1929 
SignMask(RValue<SByte8> x)1930 RValue<Int> SignMask(RValue<SByte8> x)
1931 {
1932 	RR_DEBUG_INFO_UPDATE_LOC();
1933 #if defined(__i386__) || defined(__x86_64__)
1934 	return x86::pmovmskb(As<Byte8>(x));
1935 #else
1936 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1937 #endif
1938 }
1939 
CmpGT(RValue<SByte8> x,RValue<SByte8> y)1940 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1941 {
1942 	RR_DEBUG_INFO_UPDATE_LOC();
1943 #if defined(__i386__) || defined(__x86_64__)
1944 	return x86::pcmpgtb(x, y);
1945 #else
1946 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1947 #endif
1948 }
1949 
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)1950 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1951 {
1952 	RR_DEBUG_INFO_UPDATE_LOC();
1953 #if defined(__i386__) || defined(__x86_64__)
1954 	return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1955 #else
1956 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1957 #endif
1958 }
1959 
type()1960 Type *SByte8::type()
1961 {
1962 	return T(Type_v8i8);
1963 }
1964 
type()1965 Type *Byte16::type()
1966 {
1967 	return T(llvm::VectorType::get(T(Byte::type()), 16, false));
1968 }
1969 
type()1970 Type *SByte16::type()
1971 {
1972 	return T(llvm::VectorType::get(T(SByte::type()), 16, false));
1973 }
1974 
type()1975 Type *Short2::type()
1976 {
1977 	return T(Type_v2i16);
1978 }
1979 
type()1980 Type *UShort2::type()
1981 {
1982 	return T(Type_v2i16);
1983 }
1984 
Short4(RValue<Int4> cast)1985 Short4::Short4(RValue<Int4> cast)
1986 {
1987 	RR_DEBUG_INFO_UPDATE_LOC();
1988 	std::vector<int> select = { 0, 2, 4, 6, 0, 2, 4, 6 };
1989 	Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
1990 
1991 	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
1992 	Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value();
1993 
1994 	storeValue(short4);
1995 }
1996 
1997 //	Short4::Short4(RValue<Float> cast)
1998 //	{
1999 //	}
2000 
Short4(RValue<Float4> cast)2001 Short4::Short4(RValue<Float4> cast)
2002 {
2003 	RR_DEBUG_INFO_UPDATE_LOC();
2004 	Int4 v4i32 = Int4(cast);
2005 #if defined(__i386__) || defined(__x86_64__)
2006 	v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2007 #else
2008 	Value *v = v4i32.loadValue();
2009 	v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
2010 #endif
2011 
2012 	storeValue(As<Short4>(Int2(v4i32)).value());
2013 }
2014 
operator <<(RValue<Short4> lhs,unsigned char rhs)2015 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2016 {
2017 	RR_DEBUG_INFO_UPDATE_LOC();
2018 #if defined(__i386__) || defined(__x86_64__)
2019 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2020 
2021 	return x86::psllw(lhs, rhs);
2022 #else
2023 	return As<Short4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2024 #endif
2025 }
2026 
operator >>(RValue<Short4> lhs,unsigned char rhs)2027 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2028 {
2029 	RR_DEBUG_INFO_UPDATE_LOC();
2030 #if defined(__i386__) || defined(__x86_64__)
2031 	return x86::psraw(lhs, rhs);
2032 #else
2033 	return As<Short4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2034 #endif
2035 }
2036 
Max(RValue<Short4> x,RValue<Short4> y)2037 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2038 {
2039 	RR_DEBUG_INFO_UPDATE_LOC();
2040 #if defined(__i386__) || defined(__x86_64__)
2041 	return x86::pmaxsw(x, y);
2042 #else
2043 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
2044 #endif
2045 }
2046 
Min(RValue<Short4> x,RValue<Short4> y)2047 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2048 {
2049 	RR_DEBUG_INFO_UPDATE_LOC();
2050 #if defined(__i386__) || defined(__x86_64__)
2051 	return x86::pminsw(x, y);
2052 #else
2053 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
2054 #endif
2055 }
2056 
AddSat(RValue<Short4> x,RValue<Short4> y)2057 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2058 {
2059 	RR_DEBUG_INFO_UPDATE_LOC();
2060 #if defined(__i386__) || defined(__x86_64__)
2061 	return x86::paddsw(x, y);
2062 #else
2063 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
2064 #endif
2065 }
2066 
SubSat(RValue<Short4> x,RValue<Short4> y)2067 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2068 {
2069 	RR_DEBUG_INFO_UPDATE_LOC();
2070 #if defined(__i386__) || defined(__x86_64__)
2071 	return x86::psubsw(x, y);
2072 #else
2073 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
2074 #endif
2075 }
2076 
MulHigh(RValue<Short4> x,RValue<Short4> y)2077 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2078 {
2079 	RR_DEBUG_INFO_UPDATE_LOC();
2080 #if defined(__i386__) || defined(__x86_64__)
2081 	return x86::pmulhw(x, y);
2082 #else
2083 	return As<Short4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2084 #endif
2085 }
2086 
MulAdd(RValue<Short4> x,RValue<Short4> y)2087 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2088 {
2089 	RR_DEBUG_INFO_UPDATE_LOC();
2090 #if defined(__i386__) || defined(__x86_64__)
2091 	return x86::pmaddwd(x, y);
2092 #else
2093 	return As<Int2>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2094 #endif
2095 }
2096 
PackSigned(RValue<Short4> x,RValue<Short4> y)2097 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2098 {
2099 	RR_DEBUG_INFO_UPDATE_LOC();
2100 #if defined(__i386__) || defined(__x86_64__)
2101 	auto result = x86::packsswb(x, y);
2102 #else
2103 	auto result = V(lowerPack(V(x.value()), V(y.value()), true));
2104 #endif
2105 	return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
2106 }
2107 
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2108 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2109 {
2110 	RR_DEBUG_INFO_UPDATE_LOC();
2111 #if defined(__i386__) || defined(__x86_64__)
2112 	auto result = x86::packuswb(x, y);
2113 #else
2114 	auto result = V(lowerPack(V(x.value()), V(y.value()), false));
2115 #endif
2116 	return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
2117 }
2118 
CmpGT(RValue<Short4> x,RValue<Short4> y)2119 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2120 {
2121 	RR_DEBUG_INFO_UPDATE_LOC();
2122 #if defined(__i386__) || defined(__x86_64__)
2123 	return x86::pcmpgtw(x, y);
2124 #else
2125 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
2126 #endif
2127 }
2128 
CmpEQ(RValue<Short4> x,RValue<Short4> y)2129 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2130 {
2131 	RR_DEBUG_INFO_UPDATE_LOC();
2132 #if defined(__i386__) || defined(__x86_64__)
2133 	return x86::pcmpeqw(x, y);
2134 #else
2135 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
2136 #endif
2137 }
2138 
type()2139 Type *Short4::type()
2140 {
2141 	return T(Type_v4i16);
2142 }
2143 
UShort4(RValue<Float4> cast,bool saturate)2144 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2145 {
2146 	RR_DEBUG_INFO_UPDATE_LOC();
2147 	if(saturate)
2148 	{
2149 #if defined(__i386__) || defined(__x86_64__)
2150 		if(CPUID::supportsSSE4_1())
2151 		{
2152 			Int4 int4(Min(cast, Float4(0xFFFF)));  // packusdw takes care of 0x0000 saturation
2153 			*this = As<Short4>(PackUnsigned(int4, int4));
2154 		}
2155 		else
2156 #endif
2157 		{
2158 			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2159 		}
2160 	}
2161 	else
2162 	{
2163 		*this = Short4(Int4(cast));
2164 	}
2165 }
2166 
operator <<(RValue<UShort4> lhs,unsigned char rhs)2167 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2168 {
2169 	RR_DEBUG_INFO_UPDATE_LOC();
2170 #if defined(__i386__) || defined(__x86_64__)
2171 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2172 
2173 	return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
2174 #else
2175 	return As<UShort4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2176 #endif
2177 }
2178 
operator >>(RValue<UShort4> lhs,unsigned char rhs)2179 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2180 {
2181 	RR_DEBUG_INFO_UPDATE_LOC();
2182 #if defined(__i386__) || defined(__x86_64__)
2183 	//	return RValue<Short4>(Nucleus::createLShr(lhs.value(), rhs.value()));
2184 
2185 	return x86::psrlw(lhs, rhs);
2186 #else
2187 	return As<UShort4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2188 #endif
2189 }
2190 
Max(RValue<UShort4> x,RValue<UShort4> y)2191 RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2192 {
2193 	RR_DEBUG_INFO_UPDATE_LOC();
2194 	return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2195 }
2196 
Min(RValue<UShort4> x,RValue<UShort4> y)2197 RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2198 {
2199 	RR_DEBUG_INFO_UPDATE_LOC();
2200 	return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2201 }
2202 
AddSat(RValue<UShort4> x,RValue<UShort4> y)2203 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2204 {
2205 	RR_DEBUG_INFO_UPDATE_LOC();
2206 #if defined(__i386__) || defined(__x86_64__)
2207 	return x86::paddusw(x, y);
2208 #else
2209 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
2210 #endif
2211 }
2212 
SubSat(RValue<UShort4> x,RValue<UShort4> y)2213 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2214 {
2215 	RR_DEBUG_INFO_UPDATE_LOC();
2216 #if defined(__i386__) || defined(__x86_64__)
2217 	return x86::psubusw(x, y);
2218 #else
2219 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
2220 #endif
2221 }
2222 
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2223 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2224 {
2225 	RR_DEBUG_INFO_UPDATE_LOC();
2226 #if defined(__i386__) || defined(__x86_64__)
2227 	return x86::pmulhuw(x, y);
2228 #else
2229 	return As<UShort4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2230 #endif
2231 }
2232 
Average(RValue<UShort4> x,RValue<UShort4> y)2233 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2234 {
2235 	RR_DEBUG_INFO_UPDATE_LOC();
2236 #if defined(__i386__) || defined(__x86_64__)
2237 	return x86::pavgw(x, y);
2238 #else
2239 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
2240 #endif
2241 }
2242 
type()2243 Type *UShort4::type()
2244 {
2245 	return T(Type_v4i16);
2246 }
2247 
operator <<(RValue<Short8> lhs,unsigned char rhs)2248 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2249 {
2250 	RR_DEBUG_INFO_UPDATE_LOC();
2251 #if defined(__i386__) || defined(__x86_64__)
2252 	return x86::psllw(lhs, rhs);
2253 #else
2254 	return As<Short8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2255 #endif
2256 }
2257 
operator >>(RValue<Short8> lhs,unsigned char rhs)2258 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2259 {
2260 	RR_DEBUG_INFO_UPDATE_LOC();
2261 #if defined(__i386__) || defined(__x86_64__)
2262 	return x86::psraw(lhs, rhs);
2263 #else
2264 	return As<Short8>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2265 #endif
2266 }
2267 
MulAdd(RValue<Short8> x,RValue<Short8> y)2268 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2269 {
2270 	RR_DEBUG_INFO_UPDATE_LOC();
2271 #if defined(__i386__) || defined(__x86_64__)
2272 	return x86::pmaddwd(x, y);
2273 #else
2274 	return As<Int4>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2275 #endif
2276 }
2277 
MulHigh(RValue<Short8> x,RValue<Short8> y)2278 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2279 {
2280 	RR_DEBUG_INFO_UPDATE_LOC();
2281 #if defined(__i386__) || defined(__x86_64__)
2282 	return x86::pmulhw(x, y);
2283 #else
2284 	return As<Short8>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2285 #endif
2286 }
2287 
type()2288 Type *Short8::type()
2289 {
2290 	return T(llvm::VectorType::get(T(Short::type()), 8, false));
2291 }
2292 
operator <<(RValue<UShort8> lhs,unsigned char rhs)2293 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2294 {
2295 	RR_DEBUG_INFO_UPDATE_LOC();
2296 #if defined(__i386__) || defined(__x86_64__)
2297 	return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2298 #else
2299 	return As<UShort8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2300 #endif
2301 }
2302 
operator >>(RValue<UShort8> lhs,unsigned char rhs)2303 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2304 {
2305 	RR_DEBUG_INFO_UPDATE_LOC();
2306 #if defined(__i386__) || defined(__x86_64__)
2307 	return x86::psrlw(lhs, rhs);  // FIXME: Fallback required
2308 #else
2309 	return As<UShort8>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2310 #endif
2311 }
2312 
MulHigh(RValue<UShort8> x,RValue<UShort8> y)2313 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2314 {
2315 	RR_DEBUG_INFO_UPDATE_LOC();
2316 #if defined(__i386__) || defined(__x86_64__)
2317 	return x86::pmulhuw(x, y);
2318 #else
2319 	return As<UShort8>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2320 #endif
2321 }
2322 
type()2323 Type *UShort8::type()
2324 {
2325 	return T(llvm::VectorType::get(T(UShort::type()), 8, false));
2326 }
2327 
operator ++(Int & val,int)2328 RValue<Int> operator++(Int &val, int)  // Post-increment
2329 {
2330 	RR_DEBUG_INFO_UPDATE_LOC();
2331 	RValue<Int> res = val;
2332 
2333 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2334 	val.storeValue(inc);
2335 
2336 	return res;
2337 }
2338 
operator ++(Int & val)2339 const Int &operator++(Int &val)  // Pre-increment
2340 {
2341 	RR_DEBUG_INFO_UPDATE_LOC();
2342 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2343 	val.storeValue(inc);
2344 
2345 	return val;
2346 }
2347 
operator --(Int & val,int)2348 RValue<Int> operator--(Int &val, int)  // Post-decrement
2349 {
2350 	RR_DEBUG_INFO_UPDATE_LOC();
2351 	RValue<Int> res = val;
2352 
2353 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2354 	val.storeValue(inc);
2355 
2356 	return res;
2357 }
2358 
operator --(Int & val)2359 const Int &operator--(Int &val)  // Pre-decrement
2360 {
2361 	RR_DEBUG_INFO_UPDATE_LOC();
2362 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2363 	val.storeValue(inc);
2364 
2365 	return val;
2366 }
2367 
RoundInt(RValue<Float> cast)2368 RValue<Int> RoundInt(RValue<Float> cast)
2369 {
2370 	RR_DEBUG_INFO_UPDATE_LOC();
2371 #if defined(__i386__) || defined(__x86_64__)
2372 	return x86::cvtss2si(cast);
2373 #else
2374 	return RValue<Int>(V(lowerRoundInt(V(cast.value()), T(Int::type()))));
2375 #endif
2376 }
2377 
type()2378 Type *Int::type()
2379 {
2380 	return T(llvm::Type::getInt32Ty(*jit->context));
2381 }
2382 
type()2383 Type *Long::type()
2384 {
2385 	return T(llvm::Type::getInt64Ty(*jit->context));
2386 }
2387 
UInt(RValue<Float> cast)2388 UInt::UInt(RValue<Float> cast)
2389 {
2390 	RR_DEBUG_INFO_UPDATE_LOC();
2391 	Value *integer = Nucleus::createFPToUI(cast.value(), UInt::type());
2392 	storeValue(integer);
2393 }
2394 
operator ++(UInt & val,int)2395 RValue<UInt> operator++(UInt &val, int)  // Post-increment
2396 {
2397 	RR_DEBUG_INFO_UPDATE_LOC();
2398 	RValue<UInt> res = val;
2399 
2400 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2401 	val.storeValue(inc);
2402 
2403 	return res;
2404 }
2405 
operator ++(UInt & val)2406 const UInt &operator++(UInt &val)  // Pre-increment
2407 {
2408 	RR_DEBUG_INFO_UPDATE_LOC();
2409 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2410 	val.storeValue(inc);
2411 
2412 	return val;
2413 }
2414 
operator --(UInt & val,int)2415 RValue<UInt> operator--(UInt &val, int)  // Post-decrement
2416 {
2417 	RR_DEBUG_INFO_UPDATE_LOC();
2418 	RValue<UInt> res = val;
2419 
2420 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2421 	val.storeValue(inc);
2422 
2423 	return res;
2424 }
2425 
operator --(UInt & val)2426 const UInt &operator--(UInt &val)  // Pre-decrement
2427 {
2428 	RR_DEBUG_INFO_UPDATE_LOC();
2429 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2430 	val.storeValue(inc);
2431 
2432 	return val;
2433 }
2434 
2435 //	RValue<UInt> RoundUInt(RValue<Float> cast)
2436 //	{
2437 //#if defined(__i386__) || defined(__x86_64__)
2438 //		return x86::cvtss2si(val);   // FIXME: Unsigned
2439 //#else
2440 //		return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2441 //#endif
2442 //	}
2443 
type()2444 Type *UInt::type()
2445 {
2446 	return T(llvm::Type::getInt32Ty(*jit->context));
2447 }
2448 
2449 //	Int2::Int2(RValue<Int> cast)
2450 //	{
2451 //		Value *extend = Nucleus::createZExt(cast.value(), Long::type());
2452 //		Value *vector = Nucleus::createBitCast(extend, Int2::type());
2453 //
2454 //		int shuffle[2] = {0, 0};
2455 //		Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2456 //
2457 //		storeValue(replicate);
2458 //	}
2459 
operator <<(RValue<Int2> lhs,unsigned char rhs)2460 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2461 {
2462 	RR_DEBUG_INFO_UPDATE_LOC();
2463 #if defined(__i386__) || defined(__x86_64__)
2464 	//	return RValue<Int2>(Nucleus::createShl(lhs.value(), rhs.value()));
2465 
2466 	return x86::pslld(lhs, rhs);
2467 #else
2468 	return As<Int2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2469 #endif
2470 }
2471 
operator >>(RValue<Int2> lhs,unsigned char rhs)2472 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2473 {
2474 	RR_DEBUG_INFO_UPDATE_LOC();
2475 #if defined(__i386__) || defined(__x86_64__)
2476 	//	return RValue<Int2>(Nucleus::createAShr(lhs.value(), rhs.value()));
2477 
2478 	return x86::psrad(lhs, rhs);
2479 #else
2480 	return As<Int2>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2481 #endif
2482 }
2483 
type()2484 Type *Int2::type()
2485 {
2486 	return T(Type_v2i32);
2487 }
2488 
operator <<(RValue<UInt2> lhs,unsigned char rhs)2489 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2490 {
2491 	RR_DEBUG_INFO_UPDATE_LOC();
2492 #if defined(__i386__) || defined(__x86_64__)
2493 	//	return RValue<UInt2>(Nucleus::createShl(lhs.value(), rhs.value()));
2494 
2495 	return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2496 #else
2497 	return As<UInt2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2498 #endif
2499 }
2500 
operator >>(RValue<UInt2> lhs,unsigned char rhs)2501 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2502 {
2503 	RR_DEBUG_INFO_UPDATE_LOC();
2504 #if defined(__i386__) || defined(__x86_64__)
2505 	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value(), rhs.value()));
2506 
2507 	return x86::psrld(lhs, rhs);
2508 #else
2509 	return As<UInt2>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2510 #endif
2511 }
2512 
type()2513 Type *UInt2::type()
2514 {
2515 	return T(Type_v2i32);
2516 }
2517 
Int4(RValue<Byte4> cast)2518 Int4::Int4(RValue<Byte4> cast)
2519     : XYZW(this)
2520 {
2521 	RR_DEBUG_INFO_UPDATE_LOC();
2522 	std::vector<int> swizzle = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
2523 	Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2524 	Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::type()), swizzle);
2525 
2526 	std::vector<int> swizzle2 = { 0, 8, 1, 9, 2, 10, 3, 11 };
2527 	Value *c = Nucleus::createBitCast(b, Short8::type());
2528 	Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::type()), swizzle2);
2529 
2530 	*this = As<Int4>(d);
2531 }
2532 
Int4(RValue<SByte4> cast)2533 Int4::Int4(RValue<SByte4> cast)
2534     : XYZW(this)
2535 {
2536 	RR_DEBUG_INFO_UPDATE_LOC();
2537 	std::vector<int> swizzle = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
2538 	Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2539 	Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2540 
2541 	std::vector<int> swizzle2 = { 0, 0, 1, 1, 2, 2, 3, 3 };
2542 	Value *c = Nucleus::createBitCast(b, Short8::type());
2543 	Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2544 
2545 	*this = As<Int4>(d) >> 24;
2546 }
2547 
Int4(RValue<Short4> cast)2548 Int4::Int4(RValue<Short4> cast)
2549     : XYZW(this)
2550 {
2551 	RR_DEBUG_INFO_UPDATE_LOC();
2552 	std::vector<int> swizzle = { 0, 0, 1, 1, 2, 2, 3, 3 };
2553 	Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
2554 	*this = As<Int4>(c) >> 16;
2555 }
2556 
Int4(RValue<UShort4> cast)2557 Int4::Int4(RValue<UShort4> cast)
2558     : XYZW(this)
2559 {
2560 	RR_DEBUG_INFO_UPDATE_LOC();
2561 	std::vector<int> swizzle = { 0, 8, 1, 9, 2, 10, 3, 11 };
2562 	Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2563 	*this = As<Int4>(c);
2564 }
2565 
Int4(RValue<Int> rhs)2566 Int4::Int4(RValue<Int> rhs)
2567     : XYZW(this)
2568 {
2569 	RR_DEBUG_INFO_UPDATE_LOC();
2570 	Value *vector = loadValue();
2571 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2572 
2573 	std::vector<int> swizzle = { 0, 0, 0, 0 };
2574 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2575 
2576 	storeValue(replicate);
2577 }
2578 
operator <<(RValue<Int4> lhs,unsigned char rhs)2579 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2580 {
2581 	RR_DEBUG_INFO_UPDATE_LOC();
2582 #if defined(__i386__) || defined(__x86_64__)
2583 	return x86::pslld(lhs, rhs);
2584 #else
2585 	return As<Int4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2586 #endif
2587 }
2588 
operator >>(RValue<Int4> lhs,unsigned char rhs)2589 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2590 {
2591 	RR_DEBUG_INFO_UPDATE_LOC();
2592 #if defined(__i386__) || defined(__x86_64__)
2593 	return x86::psrad(lhs, rhs);
2594 #else
2595 	return As<Int4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2596 #endif
2597 }
2598 
CmpEQ(RValue<Int4> x,RValue<Int4> y)2599 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2600 {
2601 	RR_DEBUG_INFO_UPDATE_LOC();
2602 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2603 }
2604 
CmpLT(RValue<Int4> x,RValue<Int4> y)2605 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2606 {
2607 	RR_DEBUG_INFO_UPDATE_LOC();
2608 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), Int4::type()));
2609 }
2610 
CmpLE(RValue<Int4> x,RValue<Int4> y)2611 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2612 {
2613 	RR_DEBUG_INFO_UPDATE_LOC();
2614 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), Int4::type()));
2615 }
2616 
CmpNEQ(RValue<Int4> x,RValue<Int4> y)2617 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2618 {
2619 	RR_DEBUG_INFO_UPDATE_LOC();
2620 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2621 }
2622 
CmpNLT(RValue<Int4> x,RValue<Int4> y)2623 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2624 {
2625 	RR_DEBUG_INFO_UPDATE_LOC();
2626 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), Int4::type()));
2627 }
2628 
CmpNLE(RValue<Int4> x,RValue<Int4> y)2629 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2630 {
2631 	RR_DEBUG_INFO_UPDATE_LOC();
2632 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), Int4::type()));
2633 }
2634 
Abs(RValue<Int4> x)2635 RValue<Int4> Abs(RValue<Int4> x)
2636 {
2637 #if LLVM_VERSION_MAJOR >= 12
2638 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::abs, { V(x.value())->getType() });
2639 	return RValue<Int4>(V(jit->builder->CreateCall(func, { V(x.value()), llvm::ConstantInt::getFalse(*jit->context) })));
2640 #else
2641 	auto negative = x >> 31;
2642 	return (x ^ negative) - negative;
2643 #endif
2644 }
2645 
Max(RValue<Int4> x,RValue<Int4> y)2646 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2647 {
2648 	RR_DEBUG_INFO_UPDATE_LOC();
2649 #if defined(__i386__) || defined(__x86_64__)
2650 	if(CPUID::supportsSSE4_1())
2651 	{
2652 		return x86::pmaxsd(x, y);
2653 	}
2654 	else
2655 #endif
2656 	{
2657 		RValue<Int4> greater = CmpNLE(x, y);
2658 		return (x & greater) | (y & ~greater);
2659 	}
2660 }
2661 
Min(RValue<Int4> x,RValue<Int4> y)2662 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2663 {
2664 	RR_DEBUG_INFO_UPDATE_LOC();
2665 #if defined(__i386__) || defined(__x86_64__)
2666 	if(CPUID::supportsSSE4_1())
2667 	{
2668 		return x86::pminsd(x, y);
2669 	}
2670 	else
2671 #endif
2672 	{
2673 		RValue<Int4> less = CmpLT(x, y);
2674 		return (x & less) | (y & ~less);
2675 	}
2676 }
2677 
RoundInt(RValue<Float4> cast)2678 RValue<Int4> RoundInt(RValue<Float4> cast)
2679 {
2680 	RR_DEBUG_INFO_UPDATE_LOC();
2681 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
2682 	return x86::cvtps2dq(cast);
2683 #else
2684 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2685 #endif
2686 }
2687 
RoundIntClamped(RValue<Float4> cast)2688 RValue<Int4> RoundIntClamped(RValue<Float4> cast)
2689 {
2690 	RR_DEBUG_INFO_UPDATE_LOC();
2691 
2692 // TODO(b/165000222): Check if fptosi_sat produces optimal code for x86 and ARM.
2693 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
2694 	// cvtps2dq produces 0x80000000, a negative value, for input larger than
2695 	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
2696 	// saturate to 0x80000000.
2697 	return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
2698 #elif defined(__arm__) || defined(__aarch64__)
2699 	// ARM saturates to the largest positive or negative integer. Unit tests
2700 	// verify that lowerRoundInt() behaves as desired.
2701 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2702 #elif LLVM_VERSION_MAJOR >= 14
2703 	llvm::Value *rounded = lowerRound(V(cast.value()));
2704 	llvm::Function *fptosi_sat = llvm::Intrinsic::getDeclaration(
2705 	    jit->module.get(), llvm::Intrinsic::fptosi_sat, { T(Int4::type()), T(Float4::type()) });
2706 	return RValue<Int4>(V(jit->builder->CreateCall(fptosi_sat, { rounded })));
2707 #else
2708 	RValue<Float4> clamped = Max(Min(cast, Float4(0x7FFFFF80)), Float4(static_cast<int>(0x80000000)));
2709 	return As<Int4>(V(lowerRoundInt(V(clamped.value()), T(Int4::type()))));
2710 #endif
2711 }
2712 
MulHigh(RValue<Int4> x,RValue<Int4> y)2713 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2714 {
2715 	RR_DEBUG_INFO_UPDATE_LOC();
2716 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2717 	return As<Int4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2718 }
2719 
MulHigh(RValue<UInt4> x,RValue<UInt4> y)2720 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2721 {
2722 	RR_DEBUG_INFO_UPDATE_LOC();
2723 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2724 	return As<UInt4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2725 }
2726 
PackSigned(RValue<Int4> x,RValue<Int4> y)2727 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2728 {
2729 	RR_DEBUG_INFO_UPDATE_LOC();
2730 #if defined(__i386__) || defined(__x86_64__)
2731 	return x86::packssdw(x, y);
2732 #else
2733 	return As<Short8>(V(lowerPack(V(x.value()), V(y.value()), true)));
2734 #endif
2735 }
2736 
PackUnsigned(RValue<Int4> x,RValue<Int4> y)2737 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2738 {
2739 	RR_DEBUG_INFO_UPDATE_LOC();
2740 #if defined(__i386__) || defined(__x86_64__)
2741 	return x86::packusdw(x, y);
2742 #else
2743 	return As<UShort8>(V(lowerPack(V(x.value()), V(y.value()), false)));
2744 #endif
2745 }
2746 
SignMask(RValue<Int4> x)2747 RValue<Int> SignMask(RValue<Int4> x)
2748 {
2749 	RR_DEBUG_INFO_UPDATE_LOC();
2750 #if defined(__i386__) || defined(__x86_64__)
2751 	return x86::movmskps(As<Float4>(x));
2752 #else
2753 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
2754 #endif
2755 }
2756 
type()2757 Type *Int4::type()
2758 {
2759 	return T(llvm::VectorType::get(T(Int::type()), 4, false));
2760 }
2761 
UInt4(RValue<Float4> cast)2762 UInt4::UInt4(RValue<Float4> cast)
2763     : XYZW(this)
2764 {
2765 	RR_DEBUG_INFO_UPDATE_LOC();
2766 	Value *xyzw = Nucleus::createFPToUI(cast.value(), UInt4::type());
2767 	storeValue(xyzw);
2768 }
2769 
UInt4(RValue<UInt> rhs)2770 UInt4::UInt4(RValue<UInt> rhs)
2771     : XYZW(this)
2772 {
2773 	RR_DEBUG_INFO_UPDATE_LOC();
2774 	Value *vector = loadValue();
2775 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2776 
2777 	std::vector<int> swizzle = { 0, 0, 0, 0 };
2778 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2779 
2780 	storeValue(replicate);
2781 }
2782 
operator <<(RValue<UInt4> lhs,unsigned char rhs)2783 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2784 {
2785 	RR_DEBUG_INFO_UPDATE_LOC();
2786 #if defined(__i386__) || defined(__x86_64__)
2787 	return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2788 #else
2789 	return As<UInt4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2790 #endif
2791 }
2792 
operator >>(RValue<UInt4> lhs,unsigned char rhs)2793 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2794 {
2795 	RR_DEBUG_INFO_UPDATE_LOC();
2796 #if defined(__i386__) || defined(__x86_64__)
2797 	return x86::psrld(lhs, rhs);
2798 #else
2799 	return As<UInt4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2800 #endif
2801 }
2802 
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2803 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2804 {
2805 	RR_DEBUG_INFO_UPDATE_LOC();
2806 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2807 }
2808 
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2809 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2810 {
2811 	RR_DEBUG_INFO_UPDATE_LOC();
2812 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), Int4::type()));
2813 }
2814 
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2815 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2816 {
2817 	RR_DEBUG_INFO_UPDATE_LOC();
2818 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), Int4::type()));
2819 }
2820 
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2821 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2822 {
2823 	RR_DEBUG_INFO_UPDATE_LOC();
2824 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2825 }
2826 
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2827 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2828 {
2829 	RR_DEBUG_INFO_UPDATE_LOC();
2830 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), Int4::type()));
2831 }
2832 
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2833 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2834 {
2835 	RR_DEBUG_INFO_UPDATE_LOC();
2836 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), Int4::type()));
2837 }
2838 
Max(RValue<UInt4> x,RValue<UInt4> y)2839 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2840 {
2841 	RR_DEBUG_INFO_UPDATE_LOC();
2842 #if defined(__i386__) || defined(__x86_64__)
2843 	if(CPUID::supportsSSE4_1())
2844 	{
2845 		return x86::pmaxud(x, y);
2846 	}
2847 	else
2848 #endif
2849 	{
2850 		RValue<UInt4> greater = CmpNLE(x, y);
2851 		return (x & greater) | (y & ~greater);
2852 	}
2853 }
2854 
Min(RValue<UInt4> x,RValue<UInt4> y)2855 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2856 {
2857 	RR_DEBUG_INFO_UPDATE_LOC();
2858 #if defined(__i386__) || defined(__x86_64__)
2859 	if(CPUID::supportsSSE4_1())
2860 	{
2861 		return x86::pminud(x, y);
2862 	}
2863 	else
2864 #endif
2865 	{
2866 		RValue<UInt4> less = CmpLT(x, y);
2867 		return (x & less) | (y & ~less);
2868 	}
2869 }
2870 
type()2871 Type *UInt4::type()
2872 {
2873 	return T(llvm::VectorType::get(T(UInt::type()), 4, false));
2874 }
2875 
type()2876 Type *Half::type()
2877 {
2878 	return T(llvm::Type::getInt16Ty(*jit->context));
2879 }
2880 
// Reports whether RcpApprox() is available in this build: true when compiled
// for x86 (32- or 64-bit), false otherwise.
bool HasRcpApprox()
{
#if defined(__i386__) || defined(__x86_64__)
	constexpr bool available = true;
#else
	constexpr bool available = false;
#endif
	return available;
}
2889 
RcpApprox(RValue<Float4> x,bool exactAtPow2)2890 RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
2891 {
2892 #if defined(__i386__) || defined(__x86_64__)
2893 	if(exactAtPow2)
2894 	{
2895 		// rcpps uses a piecewise-linear approximation which minimizes the relative error
2896 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2897 		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2898 	}
2899 	return x86::rcpps(x);
2900 #else
2901 	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2902 	return { 0.0f };
2903 #endif
2904 }
2905 
RcpApprox(RValue<Float> x,bool exactAtPow2)2906 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
2907 {
2908 #if defined(__i386__) || defined(__x86_64__)
2909 	if(exactAtPow2)
2910 	{
2911 		// rcpss uses a piecewise-linear approximation which minimizes the relative error
2912 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2913 		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2914 	}
2915 	return x86::rcpss(x);
2916 #else
2917 	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2918 	return { 0.0f };
2919 #endif
2920 }
2921 
// Reports whether RcpSqrtApprox() is available in this build: true when
// compiled for x86 (32- or 64-bit), false otherwise.
bool HasRcpSqrtApprox()
{
#if defined(__i386__) || defined(__x86_64__)
	constexpr bool available = true;
#else
	constexpr bool available = false;
#endif
	return available;
}
2930 
RcpSqrtApprox(RValue<Float4> x)2931 RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
2932 {
2933 #if defined(__i386__) || defined(__x86_64__)
2934 	return x86::rsqrtps(x);
2935 #else
2936 	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
2937 	return { 0.0f };
2938 #endif
2939 }
2940 
RcpSqrtApprox(RValue<Float> x)2941 RValue<Float> RcpSqrtApprox(RValue<Float> x)
2942 {
2943 #if defined(__i386__) || defined(__x86_64__)
2944 	return x86::rsqrtss(x);
2945 #else
2946 	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
2947 	return { 0.0f };
2948 #endif
2949 }
2950 
Sqrt(RValue<Float> x)2951 RValue<Float> Sqrt(RValue<Float> x)
2952 {
2953 	RR_DEBUG_INFO_UPDATE_LOC();
2954 #if defined(__i386__) || defined(__x86_64__)
2955 	return x86::sqrtss(x);
2956 #else
2957 	return As<Float>(V(lowerSQRT(V(x.value()))));
2958 #endif
2959 }
2960 
Round(RValue<Float> x)2961 RValue<Float> Round(RValue<Float> x)
2962 {
2963 	RR_DEBUG_INFO_UPDATE_LOC();
2964 #if defined(__i386__) || defined(__x86_64__)
2965 	if(CPUID::supportsSSE4_1())
2966 	{
2967 		return x86::roundss(x, 0);
2968 	}
2969 	else
2970 	{
2971 		return Float4(Round(Float4(x))).x;
2972 	}
2973 #else
2974 	return RValue<Float>(V(lowerRound(V(x.value()))));
2975 #endif
2976 }
2977 
Trunc(RValue<Float> x)2978 RValue<Float> Trunc(RValue<Float> x)
2979 {
2980 	RR_DEBUG_INFO_UPDATE_LOC();
2981 #if defined(__i386__) || defined(__x86_64__)
2982 	if(CPUID::supportsSSE4_1())
2983 	{
2984 		return x86::roundss(x, 3);
2985 	}
2986 	else
2987 	{
2988 		return Float(Int(x));  // Rounded toward zero
2989 	}
2990 #else
2991 	return RValue<Float>(V(lowerTrunc(V(x.value()))));
2992 #endif
2993 }
2994 
Frac(RValue<Float> x)2995 RValue<Float> Frac(RValue<Float> x)
2996 {
2997 	RR_DEBUG_INFO_UPDATE_LOC();
2998 #if defined(__i386__) || defined(__x86_64__)
2999 	if(CPUID::supportsSSE4_1())
3000 	{
3001 		return x - x86::floorss(x);
3002 	}
3003 	else
3004 	{
3005 		return Float4(Frac(Float4(x))).x;
3006 	}
3007 #else
3008 	// x - floor(x) can be 1.0 for very small negative x.
3009 	// Clamp against the value just below 1.0.
3010 	return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
3011 #endif
3012 }
3013 
Floor(RValue<Float> x)3014 RValue<Float> Floor(RValue<Float> x)
3015 {
3016 	RR_DEBUG_INFO_UPDATE_LOC();
3017 #if defined(__i386__) || defined(__x86_64__)
3018 	if(CPUID::supportsSSE4_1())
3019 	{
3020 		return x86::floorss(x);
3021 	}
3022 	else
3023 	{
3024 		return Float4(Floor(Float4(x))).x;
3025 	}
3026 #else
3027 	return RValue<Float>(V(lowerFloor(V(x.value()))));
3028 #endif
3029 }
3030 
Ceil(RValue<Float> x)3031 RValue<Float> Ceil(RValue<Float> x)
3032 {
3033 	RR_DEBUG_INFO_UPDATE_LOC();
3034 #if defined(__i386__) || defined(__x86_64__)
3035 	if(CPUID::supportsSSE4_1())
3036 	{
3037 		return x86::ceilss(x);
3038 	}
3039 	else
3040 #endif
3041 	{
3042 		return Float4(Ceil(Float4(x))).x;
3043 	}
3044 }
3045 
type()3046 Type *Float::type()
3047 {
3048 	return T(llvm::Type::getFloatTy(*jit->context));
3049 }
3050 
type()3051 Type *Float2::type()
3052 {
3053 	return T(Type_v2f32);
3054 }
3055 
Float4(RValue<Float> rhs)3056 Float4::Float4(RValue<Float> rhs)
3057     : XYZW(this)
3058 {
3059 	RR_DEBUG_INFO_UPDATE_LOC();
3060 	Value *vector = loadValue();
3061 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
3062 
3063 	std::vector<int> swizzle = { 0, 0, 0, 0 };
3064 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
3065 
3066 	storeValue(replicate);
3067 }
3068 
MulAdd(RValue<Float4> x,RValue<Float4> y,RValue<Float4> z)3069 RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3070 {
3071 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(Float4::type()) });
3072 	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3073 }
3074 
FMA(RValue<Float4> x,RValue<Float4> y,RValue<Float4> z)3075 RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3076 {
3077 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fma, { T(Float4::type()) });
3078 	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3079 }
3080 
Abs(RValue<Float4> x)3081 RValue<Float4> Abs(RValue<Float4> x)
3082 {
3083 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
3084 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(x.value()))));
3085 }
3086 
Max(RValue<Float4> x,RValue<Float4> y)3087 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3088 {
3089 	RR_DEBUG_INFO_UPDATE_LOC();
3090 #if defined(__i386__) || defined(__x86_64__)
3091 	return x86::maxps(x, y);
3092 #else
3093 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
3094 #endif
3095 }
3096 
Min(RValue<Float4> x,RValue<Float4> y)3097 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3098 {
3099 	RR_DEBUG_INFO_UPDATE_LOC();
3100 #if defined(__i386__) || defined(__x86_64__)
3101 	return x86::minps(x, y);
3102 #else
3103 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
3104 #endif
3105 }
3106 
Sqrt(RValue<Float4> x)3107 RValue<Float4> Sqrt(RValue<Float4> x)
3108 {
3109 	RR_DEBUG_INFO_UPDATE_LOC();
3110 #if defined(__i386__) || defined(__x86_64__)
3111 	return x86::sqrtps(x);
3112 #else
3113 	return As<Float4>(V(lowerSQRT(V(x.value()))));
3114 #endif
3115 }
3116 
SignMask(RValue<Float4> x)3117 RValue<Int> SignMask(RValue<Float4> x)
3118 {
3119 	RR_DEBUG_INFO_UPDATE_LOC();
3120 #if defined(__i386__) || defined(__x86_64__)
3121 	return x86::movmskps(x);
3122 #else
3123 	return As<Int>(V(lowerFPSignMask(V(x.value()), T(Int::type()))));
3124 #endif
3125 }
3126 
CmpEQ(RValue<Float4> x,RValue<Float4> y)3127 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3128 {
3129 	RR_DEBUG_INFO_UPDATE_LOC();
3130 	//	return As<Int4>(x86::cmpeqps(x, y));
3131 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), Int4::type()));
3132 }
3133 
CmpLT(RValue<Float4> x,RValue<Float4> y)3134 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3135 {
3136 	RR_DEBUG_INFO_UPDATE_LOC();
3137 	//	return As<Int4>(x86::cmpltps(x, y));
3138 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), Int4::type()));
3139 }
3140 
CmpLE(RValue<Float4> x,RValue<Float4> y)3141 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3142 {
3143 	RR_DEBUG_INFO_UPDATE_LOC();
3144 	//	return As<Int4>(x86::cmpleps(x, y));
3145 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), Int4::type()));
3146 }
3147 
CmpNEQ(RValue<Float4> x,RValue<Float4> y)3148 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3149 {
3150 	RR_DEBUG_INFO_UPDATE_LOC();
3151 	//	return As<Int4>(x86::cmpneqps(x, y));
3152 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), Int4::type()));
3153 }
3154 
CmpNLT(RValue<Float4> x,RValue<Float4> y)3155 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3156 {
3157 	RR_DEBUG_INFO_UPDATE_LOC();
3158 	//	return As<Int4>(x86::cmpnltps(x, y));
3159 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), Int4::type()));
3160 }
3161 
CmpNLE(RValue<Float4> x,RValue<Float4> y)3162 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3163 {
3164 	RR_DEBUG_INFO_UPDATE_LOC();
3165 	//	return As<Int4>(x86::cmpnleps(x, y));
3166 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), Int4::type()));
3167 }
3168 
CmpUEQ(RValue<Float4> x,RValue<Float4> y)3169 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3170 {
3171 	RR_DEBUG_INFO_UPDATE_LOC();
3172 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), Int4::type()));
3173 }
3174 
CmpULT(RValue<Float4> x,RValue<Float4> y)3175 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3176 {
3177 	RR_DEBUG_INFO_UPDATE_LOC();
3178 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), Int4::type()));
3179 }
3180 
CmpULE(RValue<Float4> x,RValue<Float4> y)3181 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3182 {
3183 	RR_DEBUG_INFO_UPDATE_LOC();
3184 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), Int4::type()));
3185 }
3186 
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)3187 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3188 {
3189 	RR_DEBUG_INFO_UPDATE_LOC();
3190 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), Int4::type()));
3191 }
3192 
CmpUNLT(RValue<Float4> x,RValue<Float4> y)3193 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3194 {
3195 	RR_DEBUG_INFO_UPDATE_LOC();
3196 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), Int4::type()));
3197 }
3198 
CmpUNLE(RValue<Float4> x,RValue<Float4> y)3199 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3200 {
3201 	RR_DEBUG_INFO_UPDATE_LOC();
3202 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), Int4::type()));
3203 }
3204 
Round(RValue<Float4> x)3205 RValue<Float4> Round(RValue<Float4> x)
3206 {
3207 	RR_DEBUG_INFO_UPDATE_LOC();
3208 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3209 	if(CPUID::supportsSSE4_1())
3210 	{
3211 		return x86::roundps(x, 0);
3212 	}
3213 	else
3214 	{
3215 		return Float4(RoundInt(x));
3216 	}
3217 #else
3218 	return RValue<Float4>(V(lowerRound(V(x.value()))));
3219 #endif
3220 }
3221 
Trunc(RValue<Float4> x)3222 RValue<Float4> Trunc(RValue<Float4> x)
3223 {
3224 	RR_DEBUG_INFO_UPDATE_LOC();
3225 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3226 	if(CPUID::supportsSSE4_1())
3227 	{
3228 		return x86::roundps(x, 3);
3229 	}
3230 	else
3231 	{
3232 		return Float4(Int4(x));
3233 	}
3234 #else
3235 	return RValue<Float4>(V(lowerTrunc(V(x.value()))));
3236 #endif
3237 }
3238 
Frac(RValue<Float4> x)3239 RValue<Float4> Frac(RValue<Float4> x)
3240 {
3241 	RR_DEBUG_INFO_UPDATE_LOC();
3242 	Float4 frc;
3243 
3244 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3245 	if(CPUID::supportsSSE4_1())
3246 	{
3247 		frc = x - x86::floorps(x);
3248 	}
3249 	else
3250 	{
3251 		frc = x - Float4(Int4(x));  // Signed fractional part.
3252 
3253 		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));  // Add 1.0 if negative.
3254 	}
3255 #else
3256 	frc = x - Floor(x);
3257 #endif
3258 
3259 	// x - floor(x) can be 1.0 for very small negative x.
3260 	// Clamp against the value just below 1.0.
3261 	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3262 }
3263 
Floor(RValue<Float4> x)3264 RValue<Float4> Floor(RValue<Float4> x)
3265 {
3266 	RR_DEBUG_INFO_UPDATE_LOC();
3267 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3268 	if(CPUID::supportsSSE4_1())
3269 	{
3270 		return x86::floorps(x);
3271 	}
3272 	else
3273 	{
3274 		return x - Frac(x);
3275 	}
3276 #else
3277 	return RValue<Float4>(V(lowerFloor(V(x.value()))));
3278 #endif
3279 }
3280 
Ceil(RValue<Float4> x)3281 RValue<Float4> Ceil(RValue<Float4> x)
3282 {
3283 	RR_DEBUG_INFO_UPDATE_LOC();
3284 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3285 	if(CPUID::supportsSSE4_1())
3286 	{
3287 		return x86::ceilps(x);
3288 	}
3289 	else
3290 #endif
3291 	{
3292 		return -Floor(-x);
3293 	}
3294 }
3295 
Ctlz(RValue<UInt> v,bool isZeroUndef)3296 RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
3297 {
3298 	RR_DEBUG_INFO_UPDATE_LOC();
3299 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::type()) });
3300 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3301 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3302 }
3303 
Ctlz(RValue<UInt4> v,bool isZeroUndef)3304 RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
3305 {
3306 	RR_DEBUG_INFO_UPDATE_LOC();
3307 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::type()) });
3308 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3309 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3310 }
3311 
Cttz(RValue<UInt> v,bool isZeroUndef)3312 RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
3313 {
3314 	RR_DEBUG_INFO_UPDATE_LOC();
3315 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::type()) });
3316 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3317 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3318 }
3319 
Cttz(RValue<UInt4> v,bool isZeroUndef)3320 RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
3321 {
3322 	RR_DEBUG_INFO_UPDATE_LOC();
3323 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::type()) });
3324 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3325 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3326 }
3327 
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3328 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3329 {
3330 	return RValue<Int>(Nucleus::createAtomicMin(x.value(), y.value(), memoryOrder));
3331 }
3332 
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3333 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3334 {
3335 	return RValue<UInt>(Nucleus::createAtomicUMin(x.value(), y.value(), memoryOrder));
3336 }
3337 
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3338 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3339 {
3340 	return RValue<Int>(Nucleus::createAtomicMax(x.value(), y.value(), memoryOrder));
3341 }
3342 
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3343 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3344 {
3345 	return RValue<UInt>(Nucleus::createAtomicUMax(x.value(), y.value(), memoryOrder));
3346 }
3347 
type()3348 Type *Float4::type()
3349 {
3350 	return T(llvm::VectorType::get(T(Float::type()), 4, false));
3351 }
3352 
Ticks()3353 RValue<Long> Ticks()
3354 {
3355 	RR_DEBUG_INFO_UPDATE_LOC();
3356 	llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
3357 
3358 	return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
3359 }
3360 
ConstantPointer(const void * ptr)3361 RValue<Pointer<Byte>> ConstantPointer(const void *ptr)
3362 {
3363 	RR_DEBUG_INFO_UPDATE_LOC();
3364 	// Note: this should work for 32-bit pointers as well because 'inttoptr'
3365 	// is defined to truncate (and zero extend) if necessary.
3366 	auto ptrAsInt = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), reinterpret_cast<uintptr_t>(ptr));
3367 	return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::type()))));
3368 }
3369 
ConstantData(const void * data,size_t size)3370 RValue<Pointer<Byte>> ConstantData(const void *data, size_t size)
3371 {
3372 	RR_DEBUG_INFO_UPDATE_LOC();
3373 	auto str = ::std::string(reinterpret_cast<const char *>(data), size);
3374 	auto ptr = jit->builder->CreateGlobalStringPtr(str);
3375 	return RValue<Pointer<Byte>>(V(ptr));
3376 }
3377 
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)3378 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
3379 {
3380 	// If this is a MemorySanitizer build, but Reactor routine instrumentation is not enabled,
3381 	// mark all call arguments as initialized by calling __msan_unpoison_param().
3382 	if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
3383 	{
3384 		// void __msan_unpoison_param(size_t n)
3385 		auto voidTy = llvm::Type::getVoidTy(*jit->context);
3386 		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
3387 		auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy }, false);
3388 		auto func = jit->module->getOrInsertFunction("__msan_unpoison_param", funcTy);
3389 
3390 		jit->builder->CreateCall(func, { llvm::ConstantInt::get(sizetTy, args.size()) });
3391 	}
3392 
3393 	RR_DEBUG_INFO_UPDATE_LOC();
3394 	llvm::SmallVector<llvm::Type *, 8> paramTys;
3395 	for(auto ty : argTys) { paramTys.push_back(T(ty)); }
3396 	auto funcTy = llvm::FunctionType::get(T(retTy), paramTys, false);
3397 
3398 	auto funcPtrTy = funcTy->getPointerTo();
3399 	auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value()), funcPtrTy);
3400 
3401 	llvm::SmallVector<llvm::Value *, 8> arguments;
3402 	for(auto arg : args) { arguments.push_back(V(arg)); }
3403 	return V(jit->builder->CreateCall(funcTy, funcPtr, arguments));
3404 }
3405 
Breakpoint()3406 void Breakpoint()
3407 {
3408 	RR_DEBUG_INFO_UPDATE_LOC();
3409 	llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
3410 
3411 	jit->builder->CreateCall(debugtrap);
3412 }
3413 
3414 }  // namespace rr
3415 
3416 namespace rr {
3417 
3418 #if defined(__i386__) || defined(__x86_64__)
3419 namespace x86 {
3420 
3421 // Differs from IRBuilder<>::CreateUnaryIntrinsic() in that it only accepts native instruction intrinsics which have
3422 // implicit types, such as 'x86_sse_rcp_ps' operating on v4f32, while 'sqrt' requires explicitly specifying the operand type.
createInstruction(llvm::Intrinsic::ID id,Value * x)3423 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x)
3424 {
3425 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3426 
3427 	return V(jit->builder->CreateCall(intrinsic, V(x)));
3428 }
3429 
3430 // Differs from IRBuilder<>::CreateBinaryIntrinsic() in that it only accepts native instruction intrinsics which have
3431 // implicit types, such as 'x86_sse_max_ps' operating on v4f32, while 'sadd_sat' requires explicitly specifying the operand types.
createInstruction(llvm::Intrinsic::ID id,Value * x,Value * y)3432 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x, Value *y)
3433 {
3434 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3435 
3436 	return V(jit->builder->CreateCall(intrinsic, { V(x), V(y) }));
3437 }
3438 
cvtss2si(RValue<Float> val)3439 RValue<Int> cvtss2si(RValue<Float> val)
3440 {
3441 	Float4 vector;
3442 	vector.x = val;
3443 
3444 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_cvtss2si, RValue<Float4>(vector).value()));
3445 }
3446 
cvtps2dq(RValue<Float4> val)3447 RValue<Int4> cvtps2dq(RValue<Float4> val)
3448 {
3449 	ASSERT(!__has_feature(memory_sanitizer));  // TODO(b/172238865): Not correctly instrumented by MemorySanitizer.
3450 
3451 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_cvtps2dq, val.value()));
3452 }
3453 
rcpss(RValue<Float> val)3454 RValue<Float> rcpss(RValue<Float> val)
3455 {
3456 	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);
3457 
3458 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rcp_ss, vector), Float::type(), 0));
3459 }
3460 
sqrtss(RValue<Float> val)3461 RValue<Float> sqrtss(RValue<Float> val)
3462 {
3463 	return RValue<Float>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3464 }
3465 
rsqrtss(RValue<Float> val)3466 RValue<Float> rsqrtss(RValue<Float> val)
3467 {
3468 	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);
3469 
3470 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ss, vector), Float::type(), 0));
3471 }
3472 
rcpps(RValue<Float4> val)3473 RValue<Float4> rcpps(RValue<Float4> val)
3474 {
3475 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rcp_ps, val.value()));
3476 }
3477 
sqrtps(RValue<Float4> val)3478 RValue<Float4> sqrtps(RValue<Float4> val)
3479 {
3480 	return RValue<Float4>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3481 }
3482 
rsqrtps(RValue<Float4> val)3483 RValue<Float4> rsqrtps(RValue<Float4> val)
3484 {
3485 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ps, val.value()));
3486 }
3487 
maxps(RValue<Float4> x,RValue<Float4> y)3488 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3489 {
3490 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_max_ps, x.value(), y.value()));
3491 }
3492 
minps(RValue<Float4> x,RValue<Float4> y)3493 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3494 {
3495 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_min_ps, x.value(), y.value()));
3496 }
3497 
roundss(RValue<Float> val,unsigned char imm)3498 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3499 {
3500 	llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3501 
3502 	Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3503 	Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3504 
3505 	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(roundss, { V(undef), V(vector), V(Nucleus::createConstantInt(imm)) })), Float::type(), 0));
3506 }
3507 
floorss(RValue<Float> val)3508 RValue<Float> floorss(RValue<Float> val)
3509 {
3510 	return roundss(val, 1);
3511 }
3512 
ceilss(RValue<Float> val)3513 RValue<Float> ceilss(RValue<Float> val)
3514 {
3515 	return roundss(val, 2);
3516 }
3517 
roundps(RValue<Float4> val,unsigned char imm)3518 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3519 {
3520 	ASSERT(!__has_feature(memory_sanitizer));  // TODO(b/172238865): Not correctly instrumented by MemorySanitizer.
3521 
3522 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse41_round_ps, val.value(), Nucleus::createConstantInt(imm)));
3523 }
3524 
floorps(RValue<Float4> val)3525 RValue<Float4> floorps(RValue<Float4> val)
3526 {
3527 	return roundps(val, 1);
3528 }
3529 
ceilps(RValue<Float4> val)3530 RValue<Float4> ceilps(RValue<Float4> val)
3531 {
3532 	return roundps(val, 2);
3533 }
3534 
paddsw(RValue<Short4> x,RValue<Short4> y)3535 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3536 {
3537 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3538 }
3539 
psubsw(RValue<Short4> x,RValue<Short4> y)3540 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3541 {
3542 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3543 }
3544 
paddusw(RValue<UShort4> x,RValue<UShort4> y)3545 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3546 {
3547 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3548 }
3549 
psubusw(RValue<UShort4> x,RValue<UShort4> y)3550 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3551 {
3552 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3553 }
3554 
paddsb(RValue<SByte8> x,RValue<SByte8> y)3555 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3556 {
3557 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3558 }
3559 
psubsb(RValue<SByte8> x,RValue<SByte8> y)3560 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3561 {
3562 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3563 }
3564 
paddusb(RValue<Byte8> x,RValue<Byte8> y)3565 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3566 {
3567 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3568 }
3569 
psubusb(RValue<Byte8> x,RValue<Byte8> y)3570 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3571 {
3572 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3573 }
3574 
pavgw(RValue<UShort4> x,RValue<UShort4> y)3575 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3576 {
3577 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
3578 }
3579 
pmaxsw(RValue<Short4> x,RValue<Short4> y)3580 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3581 {
3582 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3583 }
3584 
pminsw(RValue<Short4> x,RValue<Short4> y)3585 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3586 {
3587 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3588 }
3589 
pcmpgtw(RValue<Short4> x,RValue<Short4> y)3590 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3591 {
3592 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
3593 }
3594 
pcmpeqw(RValue<Short4> x,RValue<Short4> y)3595 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3596 {
3597 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
3598 }
3599 
pcmpgtb(RValue<SByte8> x,RValue<SByte8> y)3600 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3601 {
3602 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
3603 }
3604 
pcmpeqb(RValue<Byte8> x,RValue<Byte8> y)3605 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3606 {
3607 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
3608 }
3609 
packssdw(RValue<Int2> x,RValue<Int2> y)3610 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3611 {
3612 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3613 }
3614 
packssdw(RValue<Int4> x,RValue<Int4> y)3615 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3616 {
3617 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3618 }
3619 
packsswb(RValue<Short4> x,RValue<Short4> y)3620 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3621 {
3622 	return As<SByte8>(createInstruction(llvm::Intrinsic::x86_sse2_packsswb_128, x.value(), y.value()));
3623 }
3624 
packuswb(RValue<Short4> x,RValue<Short4> y)3625 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3626 {
3627 	return As<Byte8>(createInstruction(llvm::Intrinsic::x86_sse2_packuswb_128, x.value(), y.value()));
3628 }
3629 
packusdw(RValue<Int4> x,RValue<Int4> y)3630 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3631 {
3632 	if(CPUID::supportsSSE4_1())
3633 	{
3634 		return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse41_packusdw, x.value(), y.value()));
3635 	}
3636 	else
3637 	{
3638 		RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3639 		RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3640 
3641 		return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3642 	}
3643 }
3644 
psrlw(RValue<UShort4> x,unsigned char y)3645 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3646 {
3647 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3648 }
3649 
psrlw(RValue<UShort8> x,unsigned char y)3650 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3651 {
3652 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3653 }
3654 
psraw(RValue<Short4> x,unsigned char y)3655 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3656 {
3657 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3658 }
3659 
psraw(RValue<Short8> x,unsigned char y)3660 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3661 {
3662 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3663 }
3664 
psllw(RValue<Short4> x,unsigned char y)3665 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3666 {
3667 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3668 }
3669 
psllw(RValue<Short8> x,unsigned char y)3670 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3671 {
3672 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3673 }
3674 
pslld(RValue<Int2> x,unsigned char y)3675 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3676 {
3677 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3678 }
3679 
pslld(RValue<Int4> x,unsigned char y)3680 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3681 {
3682 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3683 }
3684 
psrad(RValue<Int2> x,unsigned char y)3685 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3686 {
3687 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3688 }
3689 
psrad(RValue<Int4> x,unsigned char y)3690 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3691 {
3692 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3693 }
3694 
psrld(RValue<UInt2> x,unsigned char y)3695 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3696 {
3697 	return As<UInt2>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3698 }
3699 
psrld(RValue<UInt4> x,unsigned char y)3700 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3701 {
3702 	return RValue<UInt4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3703 }
3704 
pmaxsd(RValue<Int4> x,RValue<Int4> y)3705 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3706 {
3707 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3708 }
3709 
pminsd(RValue<Int4> x,RValue<Int4> y)3710 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3711 {
3712 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3713 }
3714 
pmaxud(RValue<UInt4> x,RValue<UInt4> y)3715 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3716 {
3717 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_UGT)));
3718 }
3719 
pminud(RValue<UInt4> x,RValue<UInt4> y)3720 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3721 {
3722 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_ULT)));
3723 }
3724 
pmulhw(RValue<Short4> x,RValue<Short4> y)3725 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3726 {
3727 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3728 }
3729 
pmulhuw(RValue<UShort4> x,RValue<UShort4> y)3730 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3731 {
3732 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3733 }
3734 
pmaddwd(RValue<Short4> x,RValue<Short4> y)3735 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3736 {
3737 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3738 }
3739 
pmulhw(RValue<Short8> x,RValue<Short8> y)3740 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3741 {
3742 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3743 }
3744 
pmulhuw(RValue<UShort8> x,RValue<UShort8> y)3745 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3746 {
3747 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3748 }
3749 
pmaddwd(RValue<Short8> x,RValue<Short8> y)3750 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
3751 {
3752 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3753 }
3754 
movmskps(RValue<Float4> x)3755 RValue<Int> movmskps(RValue<Float4> x)
3756 {
3757 	Value *v = x.value();
3758 
3759 	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
3760 	// which makes it look at the entire 128-bit input for undefined bits. Mask off
3761 	// just the sign bits to avoid false positives.
3762 	if(__has_feature(memory_sanitizer))
3763 	{
3764 		v = As<Float4>(As<Int4>(v) & Int4(0x80000000u)).value();
3765 	}
3766 
3767 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_movmsk_ps, v));
3768 }
3769 
pmovmskb(RValue<Byte8> x)3770 RValue<Int> pmovmskb(RValue<Byte8> x)
3771 {
3772 	Value *v = x.value();
3773 
3774 	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
3775 	// which makes it look at the entire 128-bit input for undefined bits. Mask off
3776 	// just the sign bits in the lower 64-bit vector to avoid false positives.
3777 	if(__has_feature(memory_sanitizer))
3778 	{
3779 		v = As<Byte16>(As<Int4>(v) & Int4(0x80808080u, 0x80808080u, 0, 0)).value();
3780 	}
3781 
3782 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse2_pmovmskb_128, v)) & 0xFF;
3783 }
3784 
3785 }  // namespace x86
3786 #endif  // defined(__i386__) || defined(__x86_64__)
3787 
3788 #ifdef ENABLE_RR_PRINT
VPrintf(const std::vector<Value * > & vals)3789 void VPrintf(const std::vector<Value *> &vals)
3790 {
3791 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
3792 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
3793 	auto funcTy = llvm::FunctionType::get(i32Ty, { i8PtrTy }, true);
3794 	auto func = jit->module->getOrInsertFunction("rr::DebugPrintf", funcTy);
3795 	jit->builder->CreateCall(func, V(vals));
3796 }
3797 #endif  // ENABLE_RR_PRINT
3798 
Nop()3799 void Nop()
3800 {
3801 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
3802 	auto funcTy = llvm::FunctionType::get(voidTy, {}, false);
3803 	auto func = jit->module->getOrInsertFunction("nop", funcTy);
3804 	jit->builder->CreateCall(func);
3805 }
3806 
void EmitDebugLocation()
{
	// Compiles to an empty function unless ENABLE_RR_DEBUG_INFO is defined.
#ifdef ENABLE_RR_DEBUG_INFO
	if(jit->debugInfo == nullptr)
	{
		return;
	}

	jit->debugInfo->EmitLocation();
#endif  // ENABLE_RR_DEBUG_INFO
}
3816 
EmitDebugVariable(Value * value)3817 void EmitDebugVariable(Value *value)
3818 {
3819 #ifdef ENABLE_RR_DEBUG_INFO
3820 	if(jit->debugInfo != nullptr)
3821 	{
3822 		jit->debugInfo->EmitVariable(value);
3823 	}
3824 #endif  // ENABLE_RR_DEBUG_INFO
3825 }
3826 
void FlushDebug()
{
	// Compiles to an empty function unless ENABLE_RR_DEBUG_INFO is defined.
#ifdef ENABLE_RR_DEBUG_INFO
	if(jit->debugInfo == nullptr)
	{
		return;
	}

	jit->debugInfo->Flush();
#endif  // ENABLE_RR_DEBUG_INFO
}
3836 
3837 }  // namespace rr
3838 
3839 // ------------------------------  Coroutines ------------------------------
3840 
3841 namespace {
3842 
// Magic values returned by llvm.coro.suspend.
// See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
enum SuspendAction
{
	SuspendActionSuspend = -1,  // Handled by the 'default' switch target: branch to the suspend block.
	SuspendActionResume = 0,    // The coroutine was resumed: continue at the resume point.
	SuspendActionDestroy = 1    // The coroutine is being destroyed: branch to the destroy block.
};
3851 
// Turns the function currently being built (jit->function) into an LLVM coroutine.
// Must only be called once per routine (asserted via jit->coroutine.id being null).
// It emits the bodies of the coroutine_await() and coroutine_destroy() helpers,
// inserts the coroutine prologue at the start of the entry block, creates the
// shared 'suspend', 'end' and 'destroy' blocks, and finally restores the
// builder's original insertion point so Reactor code generation can continue.
void promoteFunctionToCoroutine()
{
	ASSERT(jit->coroutine.id == nullptr);

	// Types
	auto voidTy = llvm::Type::getVoidTy(*jit->context);
	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
	auto promiseTy = jit->coroutine.yieldType;
	auto promisePtrTy = promiseTy->getPointerTo();

	// LLVM intrinsics
	auto coro_id = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
	auto coro_size = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
	auto coro_begin = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
	auto coro_resume = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_resume);
	auto coro_end = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_end);
	auto coro_free = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_free);
	auto coro_destroy = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_destroy);
	auto coro_promise = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_promise);
	auto coro_done = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_done);
	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);

	// External frame allocation helpers, referenced by name:
	//    i8* coroutine_alloc_frame(i32 size)
	//    void coroutine_free_frame(i8* frame)
	auto allocFrameTy = llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
	auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
	auto freeFrameTy = llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
	auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);

	// Remember where the caller was emitting code; it is restored at the end.
	auto oldInsertionPoint = jit->builder->saveIP();

	// Build the coroutine_await() function:
	//
	//    bool coroutine_await(CoroutineHandle* handle, YieldType* out)
	//    {
	//        if(llvm.coro.done(handle))
	//        {
	//            return false;
	//        }
	//        else
	//        {
	//            *out = *(YieldType*)llvm.coro.promise(handle);
	//            llvm.coro.resume(handle);
	//            return true;
	//        }
	//    }
	//
	{
		auto args = jit->coroutine.await->arg_begin();
		auto handle = args++;
		auto outPtr = args++;
		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "co_await", jit->coroutine.await));
		auto doneBlock = llvm::BasicBlock::Create(*jit->context, "done", jit->coroutine.await);
		auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->coroutine.await);

		auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
		jit->builder->CreateCondBr(done, doneBlock, resumeBlock);

		jit->builder->SetInsertPoint(doneBlock);
		jit->builder->CreateRet(llvm::ConstantInt::getFalse(i1Ty));

		// The promise is copied to the output before the coroutine is resumed.
		jit->builder->SetInsertPoint(resumeBlock);
		auto promiseAlignment = llvm::ConstantInt::get(i32Ty, 4);  // TODO: Get correct alignment.
		auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, llvm::ConstantInt::get(i1Ty, 0) });
		auto promise = jit->builder->CreateLoad(promiseTy, jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
		jit->builder->CreateStore(promise, outPtr);
		jit->builder->CreateCall(coro_resume, { handle });
		jit->builder->CreateRet(llvm::ConstantInt::getTrue(i1Ty));
	}

	// Build the coroutine_destroy() function:
	//
	//    void coroutine_destroy(CoroutineHandle* handle)
	//    {
	//        llvm.coro.destroy(handle);
	//    }
	//
	{
		auto handle = jit->coroutine.destroy->arg_begin();
		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
		jit->builder->CreateCall(coro_destroy, { handle });
		jit->builder->CreateRetVoid();
	}

	// Begin building the main coroutine_begin() function.
	//
	//    CoroutineHandle* coroutine_begin(<Arguments>)
	//    {
	//        YieldType promise;
	//        auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
	//        void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
	//        CoroutineHandle *handle = llvm.coro.begin(id, frame);
	//
	//        ... <REACTOR CODE> ...
	//
	//    end:
	//        SuspendAction action = llvm.coro.suspend(none, true /* final */);  // <-- RESUME POINT
	//        switch(action)
	//        {
	//        case SuspendActionResume:
	//            UNREACHABLE(); // Illegal to resume after final suspend.
	//        case SuspendActionDestroy:
	//            goto destroy;
	//        default: // (SuspendActionSuspend)
	//            goto suspend;
	//        }
	//
	//    destroy:
	//        coroutine_free_frame(llvm.coro.free(id, handle));
	//        goto suspend;
	//
	//    suspend:
	//        llvm.coro.end(handle, false);
	//        return handle;
	//    }
	//

#ifdef ENABLE_RR_DEBUG_INFO
	jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
#endif  // ENABLE_RR_DEBUG_INFO

	jit->coroutine.suspendBlock = llvm::BasicBlock::Create(*jit->context, "suspend", jit->function);
	jit->coroutine.endBlock = llvm::BasicBlock::Create(*jit->context, "end", jit->function);
	jit->coroutine.destroyBlock = llvm::BasicBlock::Create(*jit->context, "destroy", jit->function);

	// The prologue is inserted at the very beginning of the entry block, ahead of
	// any Reactor code that was already emitted there.
	jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
	jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
	jit->coroutine.id = jit->builder->CreateCall(coro_id, {
	                                                          llvm::ConstantInt::get(i32Ty, 0),
	                                                          jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
	                                                      });
	auto size = jit->builder->CreateCall(coro_size, {});
	auto frame = jit->builder->CreateCall(allocFrame, { size });
	jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });

	// Build the suspend block
	jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
	jit->builder->CreateCall(coro_end, { jit->coroutine.handle, llvm::ConstantInt::get(i1Ty, 0) });
	jit->builder->CreateRet(jit->coroutine.handle);

	// Build the end block
	jit->builder->SetInsertPoint(jit->coroutine.endBlock);
	auto action = jit->builder->CreateCall(coro_suspend, {
	                                                         llvm::ConstantTokenNone::get(*jit->context),
	                                                         llvm::ConstantInt::get(i1Ty, 1),  // final: true
	                                                     });
	// Any action without an explicit case (including SuspendActionSuspend) goes to the suspend block.
	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
	// switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);

	// Build the destroy block
	jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
	auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
	jit->builder->CreateCall(freeFrame, { memory });
	jit->builder->CreateBr(jit->coroutine.suspendBlock);

	// Switch back to original insert point to continue building the coroutine.
	jit->builder->restoreIP(oldInsertionPoint);
}
4014 
4015 }  // anonymous namespace
4016 
4017 namespace rr {
4018 
createCoroutine(Type * YieldType,const std::vector<Type * > & Params)4019 void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
4020 {
4021 	// Coroutines are initially created as a regular function.
4022 	// Upon the first call to Yield(), the function is promoted to a true
4023 	// coroutine.
4024 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4025 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4026 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4027 	auto handleTy = i8PtrTy;
4028 	auto boolTy = i1Ty;
4029 	auto promiseTy = T(YieldType);
4030 	auto promisePtrTy = promiseTy->getPointerTo();
4031 
4032 	jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
4033 #if LLVM_VERSION_MAJOR >= 16
4034 	jit->function->setPresplitCoroutine();
4035 #else
4036 	jit->function->addFnAttr("coroutine.presplit", "0");
4037 #endif
4038 	jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
4039 	jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
4040 	jit->coroutine.yieldType = promiseTy;
4041 	jit->coroutine.entryBlock = llvm::BasicBlock::Create(*jit->context, "function", jit->function);
4042 
4043 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
4044 }
4045 
// Emits a (non-final) suspend point that publishes 'val' as the yielded value.
// The first call promotes the function being built to a full coroutine. After
// this call, code generation continues in a fresh 'resume' block.
void Nucleus::yield(Value *val)
{
	if(jit->coroutine.id == nullptr)
	{
		// First call to yield().
		// Promote the function to a full coroutine.
		promoteFunctionToCoroutine();
		ASSERT(jit->coroutine.id != nullptr);
	}

	//      promise = val;
	//
	//      auto action = llvm.coro.suspend(none, false /* final */); // <-- RESUME POINT
	//      switch(action)
	//      {
	//      case SuspendActionResume:
	//          goto resume;
	//      case SuspendActionDestroy:
	//          goto destroy;
	//      default: // (SuspendActionSuspend)
	//          goto suspend;
	//      }
	//  resume:
	//

	RR_DEBUG_INFO_UPDATE_LOC();
	Variable::materializeAll();

	// Types
	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);

	// Intrinsics
	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);

	// Create a block to resume execution.
	auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->function);

	// Store the promise (yield value)
	jit->builder->CreateStore(V(val), jit->coroutine.promise);
	auto action = jit->builder->CreateCall(coro_suspend, {
	                                                         llvm::ConstantTokenNone::get(*jit->context),
	                                                         llvm::ConstantInt::get(i1Ty, 0),  // final: false
	                                                     });
	// Any action without an explicit case (including SuspendActionSuspend) goes to the suspend block.
	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);

	// Continue building in the resume block.
	jit->builder->SetInsertPoint(resumeBlock);
}
4097 
acquireCoroutine(const char * name)4098 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name)
4099 {
4100 	if(jit->coroutine.id)
4101 	{
4102 		jit->builder->CreateBr(jit->coroutine.endBlock);
4103 	}
4104 	else
4105 	{
4106 		// Coroutine without a Yield acts as a regular function.
4107 		// The 'coroutine_begin' function returns a nullptr for the coroutine
4108 		// handle.
4109 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4110 		// The 'coroutine_await' function always returns false (coroutine done).
4111 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.await));
4112 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
4113 		// The 'coroutine_destroy' does nothing, returns void.
4114 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4115 		jit->builder->CreateRetVoid();
4116 	}
4117 
4118 #ifdef ENABLE_RR_DEBUG_INFO
4119 	if(jit->debugInfo != nullptr)
4120 	{
4121 		jit->debugInfo->Finalize();
4122 	}
4123 #endif  // ENABLE_RR_DEBUG_INFO
4124 
4125 	if(false)
4126 	{
4127 		std::error_code error;
4128 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4129 		jit->module->print(file, 0);
4130 	}
4131 
4132 	jit->runPasses();
4133 
4134 	if(false)
4135 	{
4136 		std::error_code error;
4137 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4138 		jit->module->print(file, 0);
4139 	}
4140 
4141 	llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4142 	funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4143 	funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4144 	funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4145 
4146 	auto routine = jit->acquireRoutine(name, funcs, Nucleus::CoroutineEntryCount);
4147 
4148 	delete jit;
4149 	jit = nullptr;
4150 
4151 	return routine;
4152 }
4153 
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4154 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4155 {
4156 	return func();
4157 }
4158 
Int(RValue<scalar::Int> rhs)4159 SIMD::Int::Int(RValue<scalar::Int> rhs)
4160     : XYZW(this)
4161 {
4162 	RR_DEBUG_INFO_UPDATE_LOC();
4163 	Value *vector = loadValue();
4164 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
4165 
4166 	std::vector<int> swizzle = { 0 };
4167 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
4168 
4169 	storeValue(replicate);
4170 }
4171 
operator <<(RValue<SIMD::Int> lhs,unsigned char rhs)4172 RValue<SIMD::Int> operator<<(RValue<SIMD::Int> lhs, unsigned char rhs)
4173 {
4174 	RR_DEBUG_INFO_UPDATE_LOC();
4175 	return As<SIMD::Int>(V(lowerVectorShl(V(lhs.value()), rhs)));
4176 }
4177 
operator >>(RValue<SIMD::Int> lhs,unsigned char rhs)4178 RValue<SIMD::Int> operator>>(RValue<SIMD::Int> lhs, unsigned char rhs)
4179 {
4180 	RR_DEBUG_INFO_UPDATE_LOC();
4181 	return As<SIMD::Int>(V(lowerVectorAShr(V(lhs.value()), rhs)));
4182 }
4183 
CmpEQ(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4184 RValue<SIMD::Int> CmpEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4185 {
4186 	RR_DEBUG_INFO_UPDATE_LOC();
4187 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), SIMD::Int::type()));
4188 }
4189 
CmpLT(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4190 RValue<SIMD::Int> CmpLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4191 {
4192 	RR_DEBUG_INFO_UPDATE_LOC();
4193 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), SIMD::Int::type()));
4194 }
4195 
CmpLE(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4196 RValue<SIMD::Int> CmpLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4197 {
4198 	RR_DEBUG_INFO_UPDATE_LOC();
4199 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), SIMD::Int::type()));
4200 }
4201 
CmpNEQ(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4202 RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4203 {
4204 	RR_DEBUG_INFO_UPDATE_LOC();
4205 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), SIMD::Int::type()));
4206 }
4207 
CmpNLT(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4208 RValue<SIMD::Int> CmpNLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4209 {
4210 	RR_DEBUG_INFO_UPDATE_LOC();
4211 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), SIMD::Int::type()));
4212 }
4213 
CmpNLE(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4214 RValue<SIMD::Int> CmpNLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4215 {
4216 	RR_DEBUG_INFO_UPDATE_LOC();
4217 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), SIMD::Int::type()));
4218 }
4219 
Abs(RValue<SIMD::Int> x)4220 RValue<SIMD::Int> Abs(RValue<SIMD::Int> x)
4221 {
4222 #if LLVM_VERSION_MAJOR >= 12
4223 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::abs, { V(x.value())->getType() });
4224 	return RValue<SIMD::Int>(V(jit->builder->CreateCall(func, { V(x.value()), llvm::ConstantInt::getFalse(*jit->context) })));
4225 #else
4226 	auto negative = x >> 31;
4227 	return (x ^ negative) - negative;
4228 #endif
4229 }
4230 
Max(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4231 RValue<SIMD::Int> Max(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4232 {
4233 	RR_DEBUG_INFO_UPDATE_LOC();
4234 	RValue<SIMD::Int> greater = CmpNLE(x, y);
4235 	return (x & greater) | (y & ~greater);
4236 }
4237 
Min(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4238 RValue<SIMD::Int> Min(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4239 {
4240 	RR_DEBUG_INFO_UPDATE_LOC();
4241 	RValue<SIMD::Int> less = CmpLT(x, y);
4242 	return (x & less) | (y & ~less);
4243 }
4244 
RoundInt(RValue<SIMD::Float> cast)4245 RValue<SIMD::Int> RoundInt(RValue<SIMD::Float> cast)
4246 {
4247 	RR_DEBUG_INFO_UPDATE_LOC();
4248 	return As<SIMD::Int>(V(lowerRoundInt(V(cast.value()), T(SIMD::Int::type()))));
4249 }
4250 
RoundIntClamped(RValue<SIMD::Float> cast)4251 RValue<SIMD::Int> RoundIntClamped(RValue<SIMD::Float> cast)
4252 {
4253 	RR_DEBUG_INFO_UPDATE_LOC();
4254 
4255 // TODO(b/165000222): Check if fptosi_sat produces optimal code for x86 and ARM.
4256 #if defined(__arm__) || defined(__aarch64__)
4257 	// ARM saturates to the largest positive or negative integer. Unit tests
4258 	// verify that lowerRoundInt() behaves as desired.
4259 	return As<SIMD::Int>(V(lowerRoundInt(V(cast.value()), T(SIMD::Int::type()))));
4260 #elif LLVM_VERSION_MAJOR >= 14
4261 	llvm::Value *rounded = lowerRound(V(cast.value()));
4262 	llvm::Function *fptosi_sat = llvm::Intrinsic::getDeclaration(
4263 	    jit->module.get(), llvm::Intrinsic::fptosi_sat, { T(SIMD::Int::type()), T(SIMD::Float::type()) });
4264 	return RValue<SIMD::Int>(V(jit->builder->CreateCall(fptosi_sat, { rounded })));
4265 #else
4266 	RValue<SIMD::Float> clamped = Max(Min(cast, SIMD::Float(0x7FFFFF80)), SIMD::Float(static_cast<int>(0x80000000)));
4267 	return As<SIMD::Int>(V(lowerRoundInt(V(clamped.value()), T(SIMD::Int::type()))));
4268 #endif
4269 }
4270 
Extract128(RValue<SIMD::Int> val,int i)4271 RValue<Int4> Extract128(RValue<SIMD::Int> val, int i)
4272 {
4273 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4274 
4275 	return As<Int4>(V(jit->builder->CreateExtractElement(v128, i)));
4276 }
4277 
Insert128(RValue<SIMD::Int> val,RValue<Int4> element,int i)4278 RValue<SIMD::Int> Insert128(RValue<SIMD::Int> val, RValue<Int4> element, int i)
4279 {
4280 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4281 	llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));
4282 
4283 	return As<SIMD::Int>(V(jit->builder->CreateInsertElement(v128, a, i)));
4284 }
4285 
type()4286 Type *SIMD::Int::type()
4287 {
4288 	return T(llvm::VectorType::get(T(scalar::Int::type()), SIMD::Width, false));
4289 }
4290 
UInt(RValue<SIMD::Float> cast)4291 SIMD::UInt::UInt(RValue<SIMD::Float> cast)
4292     : XYZW(this)
4293 {
4294 	RR_DEBUG_INFO_UPDATE_LOC();
4295 	Value *xyzw = Nucleus::createFPToUI(cast.value(), SIMD::UInt::type());
4296 	storeValue(xyzw);
4297 }
4298 
UInt(RValue<scalar::UInt> rhs)4299 SIMD::UInt::UInt(RValue<scalar::UInt> rhs)
4300     : XYZW(this)
4301 {
4302 	RR_DEBUG_INFO_UPDATE_LOC();
4303 	Value *vector = loadValue();
4304 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
4305 
4306 	std::vector<int> swizzle = { 0 };
4307 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
4308 
4309 	storeValue(replicate);
4310 }
4311 
operator <<(RValue<SIMD::UInt> lhs,unsigned char rhs)4312 RValue<SIMD::UInt> operator<<(RValue<SIMD::UInt> lhs, unsigned char rhs)
4313 {
4314 	RR_DEBUG_INFO_UPDATE_LOC();
4315 	return As<SIMD::UInt>(V(lowerVectorShl(V(lhs.value()), rhs)));
4316 }
4317 
operator >>(RValue<SIMD::UInt> lhs,unsigned char rhs)4318 RValue<SIMD::UInt> operator>>(RValue<SIMD::UInt> lhs, unsigned char rhs)
4319 {
4320 	RR_DEBUG_INFO_UPDATE_LOC();
4321 	return As<SIMD::UInt>(V(lowerVectorLShr(V(lhs.value()), rhs)));
4322 }
4323 
CmpEQ(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4324 RValue<SIMD::UInt> CmpEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4325 {
4326 	RR_DEBUG_INFO_UPDATE_LOC();
4327 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), SIMD::Int::type()));
4328 }
4329 
CmpLT(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4330 RValue<SIMD::UInt> CmpLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4331 {
4332 	RR_DEBUG_INFO_UPDATE_LOC();
4333 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), SIMD::Int::type()));
4334 }
4335 
CmpLE(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4336 RValue<SIMD::UInt> CmpLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4337 {
4338 	RR_DEBUG_INFO_UPDATE_LOC();
4339 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), SIMD::Int::type()));
4340 }
4341 
CmpNEQ(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4342 RValue<SIMD::UInt> CmpNEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4343 {
4344 	RR_DEBUG_INFO_UPDATE_LOC();
4345 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), SIMD::Int::type()));
4346 }
4347 
CmpNLT(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4348 RValue<SIMD::UInt> CmpNLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4349 {
4350 	RR_DEBUG_INFO_UPDATE_LOC();
4351 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), SIMD::Int::type()));
4352 }
4353 
CmpNLE(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4354 RValue<SIMD::UInt> CmpNLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4355 {
4356 	RR_DEBUG_INFO_UPDATE_LOC();
4357 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), SIMD::Int::type()));
4358 }
4359 
Max(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4360 RValue<SIMD::UInt> Max(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4361 {
4362 	RR_DEBUG_INFO_UPDATE_LOC();
4363 	RValue<SIMD::UInt> greater = CmpNLE(x, y);
4364 	return (x & greater) | (y & ~greater);
4365 }
4366 
Min(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4367 RValue<SIMD::UInt> Min(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4368 {
4369 	RR_DEBUG_INFO_UPDATE_LOC();
4370 	RValue<SIMD::UInt> less = CmpLT(x, y);
4371 	return (x & less) | (y & ~less);
4372 }
4373 
Extract128(RValue<SIMD::UInt> val,int i)4374 RValue<UInt4> Extract128(RValue<SIMD::UInt> val, int i)
4375 {
4376 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4377 
4378 	return As<UInt4>(V(jit->builder->CreateExtractElement(v128, i)));
4379 }
4380 
Insert128(RValue<SIMD::UInt> val,RValue<UInt4> element,int i)4381 RValue<SIMD::UInt> Insert128(RValue<SIMD::UInt> val, RValue<UInt4> element, int i)
4382 {
4383 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4384 	llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));
4385 
4386 	return As<SIMD::UInt>(V(jit->builder->CreateInsertElement(v128, a, i)));
4387 }
4388 
type()4389 Type *SIMD::UInt::type()
4390 {
4391 	return T(llvm::VectorType::get(T(scalar::UInt::type()), SIMD::Width, false));
4392 }
4393 
Float(RValue<scalar::Float> rhs)4394 SIMD::Float::Float(RValue<scalar::Float> rhs)
4395     : XYZW(this)
4396 {
4397 	RR_DEBUG_INFO_UPDATE_LOC();
4398 	Value *vector = loadValue();
4399 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
4400 
4401 	std::vector<int> swizzle = { 0 };
4402 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
4403 
4404 	storeValue(replicate);
4405 }
4406 
operator %(RValue<SIMD::Float> lhs,RValue<SIMD::Float> rhs)4407 RValue<SIMD::Float> operator%(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs)
4408 {
4409 	return RValue<SIMD::Float>(Nucleus::createFRem(lhs.value(), rhs.value()));
4410 }
4411 
MulAdd(RValue<SIMD::Float> x,RValue<SIMD::Float> y,RValue<SIMD::Float> z)4412 RValue<SIMD::Float> MulAdd(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z)
4413 {
4414 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(SIMD::Float::type()) });
4415 	return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
4416 }
4417 
FMA(RValue<SIMD::Float> x,RValue<SIMD::Float> y,RValue<SIMD::Float> z)4418 RValue<SIMD::Float> FMA(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z)
4419 {
4420 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fma, { T(SIMD::Float::type()) });
4421 	return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
4422 }
4423 
Abs(RValue<SIMD::Float> x)4424 RValue<SIMD::Float> Abs(RValue<SIMD::Float> x)
4425 {
4426 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
4427 	return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, V(x.value()))));
4428 }
4429 
Max(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4430 RValue<SIMD::Float> Max(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4431 {
4432 	RR_DEBUG_INFO_UPDATE_LOC();
4433 	return As<SIMD::Float>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
4434 }
4435 
Min(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4436 RValue<SIMD::Float> Min(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4437 {
4438 	RR_DEBUG_INFO_UPDATE_LOC();
4439 	return As<SIMD::Float>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
4440 }
4441 
Sqrt(RValue<SIMD::Float> x)4442 RValue<SIMD::Float> Sqrt(RValue<SIMD::Float> x)
4443 {
4444 	RR_DEBUG_INFO_UPDATE_LOC();
4445 	return As<SIMD::Float>(V(lowerSQRT(V(x.value()))));
4446 }
4447 
CmpEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4448 RValue<SIMD::Int> CmpEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4449 {
4450 	RR_DEBUG_INFO_UPDATE_LOC();
4451 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), SIMD::Int::type()));
4452 }
4453 
CmpLT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4454 RValue<SIMD::Int> CmpLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4455 {
4456 	RR_DEBUG_INFO_UPDATE_LOC();
4457 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), SIMD::Int::type()));
4458 }
4459 
CmpLE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4460 RValue<SIMD::Int> CmpLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4461 {
4462 	RR_DEBUG_INFO_UPDATE_LOC();
4463 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), SIMD::Int::type()));
4464 }
4465 
CmpNEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4466 RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4467 {
4468 	RR_DEBUG_INFO_UPDATE_LOC();
4469 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), SIMD::Int::type()));
4470 }
4471 
CmpNLT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4472 RValue<SIMD::Int> CmpNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4473 {
4474 	RR_DEBUG_INFO_UPDATE_LOC();
4475 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), SIMD::Int::type()));
4476 }
4477 
CmpNLE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4478 RValue<SIMD::Int> CmpNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4479 {
4480 	RR_DEBUG_INFO_UPDATE_LOC();
4481 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), SIMD::Int::type()));
4482 }
4483 
CmpUEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4484 RValue<SIMD::Int> CmpUEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4485 {
4486 	RR_DEBUG_INFO_UPDATE_LOC();
4487 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), SIMD::Int::type()));
4488 }
4489 
CmpULT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4490 RValue<SIMD::Int> CmpULT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4491 {
4492 	RR_DEBUG_INFO_UPDATE_LOC();
4493 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), SIMD::Int::type()));
4494 }
4495 
CmpULE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4496 RValue<SIMD::Int> CmpULE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4497 {
4498 	RR_DEBUG_INFO_UPDATE_LOC();
4499 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), SIMD::Int::type()));
4500 }
4501 
CmpUNEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4502 RValue<SIMD::Int> CmpUNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4503 {
4504 	RR_DEBUG_INFO_UPDATE_LOC();
4505 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), SIMD::Int::type()));
4506 }
4507 
CmpUNLT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4508 RValue<SIMD::Int> CmpUNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4509 {
4510 	RR_DEBUG_INFO_UPDATE_LOC();
4511 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), SIMD::Int::type()));
4512 }
4513 
CmpUNLE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4514 RValue<SIMD::Int> CmpUNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4515 {
4516 	RR_DEBUG_INFO_UPDATE_LOC();
4517 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), SIMD::Int::type()));
4518 }
4519 
Round(RValue<SIMD::Float> x)4520 RValue<SIMD::Float> Round(RValue<SIMD::Float> x)
4521 {
4522 	RR_DEBUG_INFO_UPDATE_LOC();
4523 	return RValue<SIMD::Float>(V(lowerRound(V(x.value()))));
4524 }
4525 
Trunc(RValue<SIMD::Float> x)4526 RValue<SIMD::Float> Trunc(RValue<SIMD::Float> x)
4527 {
4528 	RR_DEBUG_INFO_UPDATE_LOC();
4529 	return RValue<SIMD::Float>(V(lowerTrunc(V(x.value()))));
4530 }
4531 
Frac(RValue<SIMD::Float> x)4532 RValue<SIMD::Float> Frac(RValue<SIMD::Float> x)
4533 {
4534 	RR_DEBUG_INFO_UPDATE_LOC();
4535 	SIMD::Float frc = x - Floor(x);
4536 
4537 	// x - floor(x) can be 1.0 for very small negative x.
4538 	// Clamp against the value just below 1.0.
4539 	return Min(frc, As<SIMD::Float>(SIMD::Int(0x3F7FFFFF)));
4540 }
4541 
Floor(RValue<SIMD::Float> x)4542 RValue<SIMD::Float> Floor(RValue<SIMD::Float> x)
4543 {
4544 	RR_DEBUG_INFO_UPDATE_LOC();
4545 	return RValue<SIMD::Float>(V(lowerFloor(V(x.value()))));
4546 }
4547 
Ceil(RValue<SIMD::Float> x)4548 RValue<SIMD::Float> Ceil(RValue<SIMD::Float> x)
4549 {
4550 	RR_DEBUG_INFO_UPDATE_LOC();
4551 	return -Floor(-x);
4552 }
4553 
Extract128(RValue<SIMD::Float> val,int i)4554 RValue<Float4> Extract128(RValue<SIMD::Float> val, int i)
4555 {
4556 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4557 
4558 	return As<Float4>(V(jit->builder->CreateExtractElement(v128, i)));
4559 }
4560 
Insert128(RValue<SIMD::Float> val,RValue<Float4> element,int i)4561 RValue<SIMD::Float> Insert128(RValue<SIMD::Float> val, RValue<Float4> element, int i)
4562 {
4563 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4564 	llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));
4565 
4566 	return As<SIMD::Float>(V(jit->builder->CreateInsertElement(v128, a, i)));
4567 }
4568 
type()4569 Type *SIMD::Float::type()
4570 {
4571 	return T(llvm::VectorType::get(T(scalar::Float::type()), SIMD::Width, false));
4572 }
4573 
4574 }  // namespace rr
4575