1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "LLVMReactor.hpp"
16 
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "LLVMReactorDebugInfo.hpp"
20 #include "PragmaInternals.hpp"
21 #include "Print.hpp"
22 #include "Reactor.hpp"
23 #include "SIMD.hpp"
24 #include "x86.hpp"
25 
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/IntrinsicsX86.h"
28 #include "llvm/Support/Alignment.h"
29 #include "llvm/Support/Error.h"
30 #include "llvm/Support/ManagedStatic.h"
31 
32 #include <fstream>
33 #include <iostream>
34 #include <mutex>
35 #include <numeric>
36 #include <thread>
37 #include <unordered_map>
38 
39 #if defined(__i386__) || defined(__x86_64__)
40 #	include <xmmintrin.h>
41 #endif
42 
43 #include <math.h>
44 
45 #if defined(__x86_64__) && defined(_WIN32)
46 extern "C" void X86CompilationCallback()
47 {
48 	UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
49 }
50 #endif
51 
52 #if !LLVM_ENABLE_THREADS
53 #	error "LLVM_ENABLE_THREADS needs to be enabled"
54 #endif
55 
56 #if LLVM_VERSION_MAJOR < 11
57 namespace llvm {
58 using FixedVectorType = VectorType;
59 }  // namespace llvm
60 #endif
61 
62 namespace {
63 
64 // Used to automatically invoke llvm_shutdown() when driver is unloaded
65 llvm::llvm_shutdown_obj llvmShutdownObj;
66 
67 // This has to be a raw pointer because glibc 2.17 doesn't support __cxa_thread_atexit_impl
68 // for destructing objects at exit. See crbug.com/1074222
69 thread_local rr::JITBuilder *jit = nullptr;
70 
71 auto getNumElements(llvm::FixedVectorType *vec)
72 {
73 #if LLVM_VERSION_MAJOR >= 11
74 	return vec->getElementCount();
75 #else
76 	return vec->getNumElements();
77 #endif
78 }
79 
80 llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
81 {
82 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
83 
84 	llvm::VectorType *extTy =
85 	    llvm::VectorType::getExtendedElementVectorType(ty);
86 	x = jit->builder->CreateZExt(x, extTy);
87 	y = jit->builder->CreateZExt(y, extTy);
88 
89 	// (x + y + 1) >> 1
90 	llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
91 	llvm::Value *res = jit->builder->CreateAdd(x, y);
92 	res = jit->builder->CreateAdd(res, one);
93 	res = jit->builder->CreateLShr(res, one);
94 	return jit->builder->CreateTrunc(res, ty);
95 }
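// Editorial example: for 8-bit lanes holding 250 and 253, lowerPAVG computes
// (250 + 253 + 1) >> 1 = 252 in the widened type, so the rounding-up average
// cannot overflow before the final truncation.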
96 
97 llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
98                           llvm::ICmpInst::Predicate pred)
99 {
100 	return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
101 }
102 
103 llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
104                        llvm::Value *y, llvm::Type *dstTy)
105 {
106 	return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
107 }
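// Editorial note: the SExt in lowerPCMP turns each i1 comparison result into an
// all-zeros or all-ones lane (0 or -1), the usual SIMD comparison-mask encoding.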
108 
109 [[maybe_unused]] llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
110                                             llvm::FCmpInst::Predicate pred)
111 {
112 	return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
113 }
114 
115 [[maybe_unused]] llvm::Value *lowerRound(llvm::Value *x)
116 {
117 	llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
118 	    jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
119 	return jit->builder->CreateCall(nearbyint, { x });
120 }
121 
122 [[maybe_unused]] llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
123 {
124 	return jit->builder->CreateFPToSI(lowerRound(x), ty);
125 }
126 
127 [[maybe_unused]] llvm::Value *lowerFloor(llvm::Value *x)
128 {
129 	llvm::Function *floor = llvm::Intrinsic::getDeclaration(
130 	    jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
131 	return jit->builder->CreateCall(floor, { x });
132 }
133 
134 [[maybe_unused]] llvm::Value *lowerTrunc(llvm::Value *x)
135 {
136 	llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
137 	    jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
138 	return jit->builder->CreateCall(trunc, { x });
139 }
140 
141 [[maybe_unused]] llvm::Value *lowerSQRT(llvm::Value *x)
142 {
143 	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
144 	    jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
145 	return jit->builder->CreateCall(sqrt, { x });
146 }
147 
148 [[maybe_unused]] llvm::Value *lowerRCP(llvm::Value *x)
149 {
150 	llvm::Type *ty = x->getType();
151 	llvm::Constant *one;
152 	if(llvm::FixedVectorType *vectorTy = llvm::dyn_cast<llvm::FixedVectorType>(ty))
153 	{
154 		one = llvm::ConstantVector::getSplat(getNumElements(vectorTy),
155 		                                     llvm::ConstantFP::get(vectorTy->getElementType(), 1));
156 	}
157 	else
158 	{
159 		one = llvm::ConstantFP::get(ty, 1);
160 	}
161 	return jit->builder->CreateFDiv(one, x);
162 }
163 
164 [[maybe_unused]] llvm::Value *lowerRSQRT(llvm::Value *x)
165 {
166 	return lowerRCP(lowerSQRT(x));
167 }
168 
169 [[maybe_unused]] llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
170 {
171 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
172 	llvm::Value *y = llvm::ConstantVector::getSplat(getNumElements(ty),
173 	                                                llvm::ConstantInt::get(ty->getElementType(), scalarY));
174 	return jit->builder->CreateShl(x, y);
175 }
176 
177 [[maybe_unused]] llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
178 {
179 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
180 	llvm::Value *y = llvm::ConstantVector::getSplat(getNumElements(ty),
181 	                                                llvm::ConstantInt::get(ty->getElementType(), scalarY));
182 	return jit->builder->CreateAShr(x, y);
183 }
184 
185 [[maybe_unused]] llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
186 {
187 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
188 	llvm::Value *y = llvm::ConstantVector::getSplat(getNumElements(ty),
189 	                                                llvm::ConstantInt::get(ty->getElementType(), scalarY));
190 	return jit->builder->CreateLShr(x, y);
191 }
192 
193 llvm::Value *lowerShuffleVector(llvm::Value *v1, llvm::Value *v2, llvm::ArrayRef<int> select)
194 {
195 	int size = select.size();
196 	const int maxSize = 16;
197 	llvm::Constant *swizzle[maxSize];
198 	ASSERT(size <= maxSize);
199 
200 	for(int i = 0; i < size; i++)
201 	{
202 		swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), select[i]);
203 	}
204 
205 	llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
206 
207 	return jit->builder->CreateShuffleVector(v1, v2, shuffle);
208 }
209 
210 [[maybe_unused]] llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
211 {
212 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
213 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
214 
215 	llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
216 	llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
217 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
218 
219 	llvm::Value *undef = llvm::UndefValue::get(extTy);
220 
221 	llvm::SmallVector<int, 16> evenIdx;
222 	llvm::SmallVector<int, 16> oddIdx;
223 	for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
224 	{
225 		evenIdx.push_back(i);
226 		oddIdx.push_back(i + 1);
227 	}
228 
229 	llvm::Value *lhs = lowerShuffleVector(mult, undef, evenIdx);
230 	llvm::Value *rhs = lowerShuffleVector(mult, undef, oddIdx);
231 	return jit->builder->CreateAdd(lhs, rhs);
232 }
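// Editorial sketch of the lowering above: for <8 x i16> inputs x and y, the
// result is a <4 x i32> whose element i equals
// x[2*i] * y[2*i] + x[2*i+1] * y[2*i+1] (a pmaddwd-style multiply-add).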
233 
234 [[maybe_unused]] llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
235 {
236 	llvm::FixedVectorType *srcTy = llvm::cast<llvm::FixedVectorType>(x->getType());
237 	llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
238 
239 	llvm::IntegerType *dstElemTy =
240 	    llvm::cast<llvm::IntegerType>(dstTy->getElementType());
241 
242 	uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
243 	ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
244 	llvm::Constant *max, *min;
245 	if(isSigned)
246 	{
247 		max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
248 		min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
249 	}
250 	else
251 	{
252 		max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
253 		min = llvm::ConstantInt::get(srcTy, 0, false);
254 	}
255 
256 	x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
257 	x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
258 	y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
259 	y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
260 
261 	x = jit->builder->CreateTrunc(x, dstTy);
262 	y = jit->builder->CreateTrunc(y, dstTy);
263 
264 	llvm::SmallVector<int, 16> index(srcTy->getNumElements() * 2);
265 	std::iota(index.begin(), index.end(), 0);
266 
267 	return lowerShuffleVector(x, y, index);
268 }
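// Editorial example: packing two <4 x i32> vectors into one <8 x i16> with
// isSigned == true clamps each element to [-32768, 32767] (e.g. 70000 becomes
// 32767), truncates, and concatenates the lanes of x followed by those of y.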
269 
270 [[maybe_unused]] llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
271 {
272 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
273 	llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
274 	llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
275 
276 	llvm::Value *ret = jit->builder->CreateZExt(
277 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
278 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
279 	{
280 		llvm::Value *elem = jit->builder->CreateZExt(
281 		    jit->builder->CreateExtractElement(cmp, i), retTy);
282 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
283 	}
284 	return ret;
285 }
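// Editorial example: for x = <4 x i32> {-1, 5, -3, 7}, the comparison sets bits
// 0 and 2, so lowerSignMask returns 0b0101, a packed sign-bit mask like the one
// produced by the x86 movmsk instructions.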
286 
287 [[maybe_unused]] llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
288 {
289 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
290 	llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
291 	llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
292 
293 	llvm::Value *ret = jit->builder->CreateZExt(
294 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
295 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
296 	{
297 		llvm::Value *elem = jit->builder->CreateZExt(
298 		    jit->builder->CreateExtractElement(cmp, i), retTy);
299 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
300 	}
301 	return ret;
302 }
303 
304 llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
305 {
306 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
307 }
308 
309 llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
310 {
311 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
312 }
313 
314 llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
315 {
316 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
317 }
318 
319 llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
320 {
321 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
322 }
323 
324 llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
325 {
326 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
327 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
328 
329 	llvm::Value *extX, *extY;
330 	if(sext)
331 	{
332 		extX = jit->builder->CreateSExt(x, extTy);
333 		extY = jit->builder->CreateSExt(y, extTy);
334 	}
335 	else
336 	{
337 		extX = jit->builder->CreateZExt(x, extTy);
338 		extY = jit->builder->CreateZExt(y, extTy);
339 	}
340 
341 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
342 
343 	llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
344 	llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
345 	return jit->builder->CreateTrunc(mulh, ty);
346 }
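// Editorial example: with sext == true and i16 lanes, a lane value of 0x4000 in
// both operands widens to i32, multiplies to 0x10000000, is shifted right by 16
// and truncated, yielding 0x1000, the high half of the 32-bit product.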
347 
348 // TODO(crbug.com/swiftshader/185): A temporary workaround for failing chromium tests.
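// Editorial note: LLVM's shl/lshr/ashr yield a poison value when the shift
// amount is greater than or equal to the operand's bit width, so the clamp
// below caps the amount at bitWidth - 1 to keep the result well-defined.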
349 llvm::Value *clampForShift(llvm::Value *rhs)
350 {
351 	llvm::Value *max;
352 	if(auto *vec = llvm::dyn_cast<llvm::FixedVectorType>(rhs->getType()))
353 	{
354 		auto N = vec->getElementType()->getIntegerBitWidth() - 1;
355 		max = llvm::ConstantVector::getSplat(getNumElements(vec), llvm::ConstantInt::get(vec->getElementType(), N));
356 	}
357 	else
358 	{
359 		auto N = rhs->getType()->getIntegerBitWidth() - 1;
360 		max = llvm::ConstantInt::get(rhs->getType(), N);
361 	}
362 	return jit->builder->CreateSelect(jit->builder->CreateICmpULE(rhs, max), rhs, max);
363 }
364 
365 }  // namespace
366 
367 namespace rr {
368 
369 const int SIMD::Width = 4;
370 
371 std::string Caps::backendName()
372 {
373 	return std::string("LLVM ") + LLVM_VERSION_STRING;
374 }
375 
376 bool Caps::coroutinesSupported()
377 {
378 	return true;
379 }
380 
381 bool Caps::fmaIsFast()
382 {
383 	static bool AVX2 = CPUID::supportsAVX2();  // Also checks for FMA support
384 
385 	// If x86 FMA instructions are supported, assume LLVM will emit them instead of making calls to std::fma().
386 	return AVX2;
387 }
388 
389 // The abstract Type* types are implemented as LLVM types, except that
390 // 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
391 // and VFP in ARM, and eliminate the overhead of converting them to explicit
392 // 128-bit ones. LLVM types are pointers, so we can represent emulated types
393 // as abstract pointers with small enum values.
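// For example (editorial note): a Type_v2i32 value occupies the low 64 bits of
// a 128-bit <4 x i32> register value, and createLoad()/createStore() below only
// read or write those low 64 bits when the type is emulated.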
394 enum InternalType : uintptr_t
395 {
396 	// Emulated types:
397 	Type_v2i32,
398 	Type_v4i16,
399 	Type_v2i16,
400 	Type_v8i8,
401 	Type_v4i8,
402 	Type_v2f32,
403 	EmulatedTypeCount,
404 	// Returned by asInternalType() to indicate that the abstract Type*
405 	// should be interpreted as LLVM type pointer:
406 	Type_LLVM
407 };
408 
409 inline InternalType asInternalType(Type *type)
410 {
411 	InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
412 	return (t < EmulatedTypeCount) ? t : Type_LLVM;
413 }
414 
415 llvm::Type *T(Type *t)
416 {
417 	// Use 128-bit vectors to implement logically shorter ones.
418 	switch(asInternalType(t))
419 	{
420 	case Type_v2i32: return T(Int4::type());
421 	case Type_v4i16: return T(Short8::type());
422 	case Type_v2i16: return T(Short8::type());
423 	case Type_v8i8: return T(Byte16::type());
424 	case Type_v4i8: return T(Byte16::type());
425 	case Type_v2f32: return T(Float4::type());
426 	case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
427 	default:
428 		UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
429 		return nullptr;
430 	}
431 }
432 
433 Type *T(InternalType t)
434 {
435 	return reinterpret_cast<Type *>(t);
436 }
437 
438 inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
439 {
440 	return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
441 }
442 
443 inline llvm::BasicBlock *B(BasicBlock *t)
444 {
445 	return reinterpret_cast<llvm::BasicBlock *>(t);
446 }
447 
448 inline BasicBlock *B(llvm::BasicBlock *t)
449 {
450 	return reinterpret_cast<BasicBlock *>(t);
451 }
452 
453 static size_t typeSize(Type *type)
454 {
455 	switch(asInternalType(type))
456 	{
457 	case Type_v2i32: return 8;
458 	case Type_v4i16: return 8;
459 	case Type_v2i16: return 4;
460 	case Type_v8i8: return 8;
461 	case Type_v4i8: return 4;
462 	case Type_v2f32: return 8;
463 	case Type_LLVM:
464 		{
465 			llvm::Type *t = T(type);
466 
467 			if(t->isPointerTy())
468 			{
469 				return sizeof(void *);
470 			}
471 
472 			// At this point we should only have LLVM 'primitive' types.
473 			unsigned int bits = t->getPrimitiveSizeInBits();
474 			ASSERT_MSG(bits != 0, "bits: %d", int(bits));
475 
476 			// TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
477 			// but are typically stored as one byte. The DataLayout structure should
478 			// be used here and many other places if this assumption fails.
479 			return (bits + 7) / 8;
480 		}
481 		break;
482 	default:
483 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
484 		return 0;
485 	}
486 }
487 
488 static llvm::Function *createFunction(const char *name, llvm::Type *retTy, const std::vector<llvm::Type *> &params)
489 {
490 	llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
491 	auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
492 
493 	func->setLinkage(llvm::GlobalValue::ExternalLinkage);
494 	func->setDoesNotThrow();
495 	func->setCallingConv(llvm::CallingConv::C);
496 
497 	if(__has_feature(memory_sanitizer))
498 	{
499 		func->addFnAttr(llvm::Attribute::SanitizeMemory);
500 
501 		// Assume that when using recent versions of LLVM, MemorySanitizer enabled builds
502 		// use -fsanitize-memory-param-retval, which makes the caller not update the shadow
503 		// of function parameters. NoUndef skips generating checks for uninitialized values.
504 #if LLVM_VERSION_MAJOR >= 13
505 		for(unsigned int i = 0; i < params.size(); i++)
506 		{
507 			func->addParamAttr(i, llvm::Attribute::NoUndef);
508 		}
509 #endif
510 	}
511 
512 	if(__has_feature(address_sanitizer))
513 	{
514 		func->addFnAttr(llvm::Attribute::SanitizeAddress);
515 	}
516 
517 	func->addFnAttr("warn-stack-size", "524288");  // Warn when a function uses more than 512 KiB of stack memory
518 
519 	return func;
520 }
521 
522 Nucleus::Nucleus()
523 {
524 #if !__has_feature(memory_sanitizer)
525 	// thread_local variables in shared libraries are initialized at load-time,
526 	// but this is not observed by MemorySanitizer if the loader itself was not
527 	// instrumented, leading to false-positive uninitialized variable errors.
528 	ASSERT(jit == nullptr);
529 	ASSERT(Variable::unmaterializedVariables == nullptr);
530 #endif
531 
532 	jit = new JITBuilder();
533 	Variable::unmaterializedVariables = new Variable::UnmaterializedVariables();
534 }
535 
536 Nucleus::~Nucleus()
537 {
538 	delete Variable::unmaterializedVariables;
539 	Variable::unmaterializedVariables = nullptr;
540 
541 	delete jit;
542 	jit = nullptr;
543 }
544 
545 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name)
546 {
547 	if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
548 	{
549 		llvm::Type *type = jit->function->getReturnType();
550 
551 		if(type->isVoidTy())
552 		{
553 			createRetVoid();
554 		}
555 		else
556 		{
557 			createRet(V(llvm::UndefValue::get(type)));
558 		}
559 	}
560 
561 	std::shared_ptr<Routine> routine;
562 
563 	auto acquire = [&](rr::JITBuilder *jit) {
564 	// ::jit is thread-local, so when this is executed on a separate thread (see JIT_IN_SEPARATE_THREAD)
565 	// it needs to only use the jit variable passed in as an argument.
566 
567 #ifdef ENABLE_RR_DEBUG_INFO
568 		if(jit->debugInfo != nullptr)
569 		{
570 			jit->debugInfo->Finalize();
571 		}
572 #endif  // ENABLE_RR_DEBUG_INFO
573 
574 		if(false)
575 		{
576 			std::error_code error;
577 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
578 			jit->module->print(file, 0);
579 		}
580 
581 		jit->runPasses();
582 
583 		if(false)
584 		{
585 			std::error_code error;
586 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
587 			jit->module->print(file, 0);
588 		}
589 
590 		routine = jit->acquireRoutine(name, &jit->function, 1);
591 	};
592 
593 #ifdef JIT_IN_SEPARATE_THREAD
594 	// Perform optimizations and codegen in a separate thread to avoid stack overflow.
595 	// FIXME(b/149829034): This is not a long-term solution. Reactor has no control
596 	// over the threading and stack sizes of its users, so this should be addressed
597 	// at a higher level instead.
598 	std::thread thread(acquire, jit);
599 	thread.join();
600 #else
601 	acquire(jit);
602 #endif
603 
604 	return routine;
605 }
606 
607 Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
608 {
609 	// Need to allocate it in the entry block for mem2reg to work
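	// (Editorial note: mem2reg/SROA only promote "static" allocas, i.e. allocas
	// placed in the entry block, to SSA registers; allocas created elsewhere
	// would remain as stack memory.)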
610 	llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
611 
612 	llvm::Instruction *declaration;
613 
614 #if LLVM_VERSION_MAJOR >= 11
615 	auto align = jit->module->getDataLayout().getPrefTypeAlign(T(type));
616 #else
617 	auto align = llvm::MaybeAlign(jit->module->getDataLayout().getPrefTypeAlignment(T(type)));
618 #endif
619 
620 	if(arraySize)
621 	{
622 		Value *size = (sizeof(size_t) == 8) ? Nucleus::createConstantLong(arraySize) : Nucleus::createConstantInt(arraySize);
623 		declaration = new llvm::AllocaInst(T(type), 0, V(size), align);
624 	}
625 	else
626 	{
627 		declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr, align);
628 	}
629 
630 #if LLVM_VERSION_MAJOR >= 16
631 	declaration->insertInto(&entryBlock, entryBlock.begin());
632 #else
633 	entryBlock.getInstList().push_front(declaration);
634 #endif
635 
636 	if(getPragmaState(InitializeLocalVariables))
637 	{
638 		llvm::Type *i8PtrTy = llvm::Type::getInt8Ty(*jit->context)->getPointerTo();
639 		llvm::Type *i32Ty = llvm::Type::getInt32Ty(*jit->context);
640 		llvm::Function *memset = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::memset, { i8PtrTy, i32Ty });
641 
642 		jit->builder->CreateCall(memset, { jit->builder->CreatePointerCast(declaration, i8PtrTy),
643 		                                   V(Nucleus::createConstantByte((unsigned char)0)),
644 		                                   V(Nucleus::createConstantInt((int)typeSize(type) * (arraySize ? arraySize : 1))),
645 		                                   V(Nucleus::createConstantBool(false)) });
646 	}
647 
648 	return V(declaration);
649 }
650 
651 BasicBlock *Nucleus::createBasicBlock()
652 {
653 	return B(llvm::BasicBlock::Create(*jit->context, "", jit->function));
654 }
655 
656 BasicBlock *Nucleus::getInsertBlock()
657 {
658 	return B(jit->builder->GetInsertBlock());
659 }
660 
661 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
662 {
663 	// assert(jit->builder->GetInsertBlock()->back().isTerminator());
664 
665 	jit->builder->SetInsertPoint(B(basicBlock));
666 }
667 
668 void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
669 {
670 	jit->function = rr::createFunction("", T(ReturnType), T(Params));
671 
672 #ifdef ENABLE_RR_DEBUG_INFO
673 	jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
674 #endif  // ENABLE_RR_DEBUG_INFO
675 
676 	jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->function));
677 }
678 
679 Value *Nucleus::getArgument(unsigned int index)
680 {
681 	llvm::Function::arg_iterator args = jit->function->arg_begin();
682 
683 	while(index)
684 	{
685 		args++;
686 		index--;
687 	}
688 
689 	return V(&*args);
690 }
691 
692 void Nucleus::createRetVoid()
693 {
694 	RR_DEBUG_INFO_UPDATE_LOC();
695 
696 	ASSERT_MSG(jit->function->getReturnType() == T(Void::type()), "Return type mismatch");
697 
698 	// Code generated after this point is unreachable, so any variables
699 	// being read can safely return an undefined value. We have to avoid
700 	// materializing variables after the terminator ret instruction.
701 	Variable::killUnmaterialized();
702 
703 	jit->builder->CreateRetVoid();
704 }
705 
706 void Nucleus::createRet(Value *v)
707 {
708 	RR_DEBUG_INFO_UPDATE_LOC();
709 
710 	ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
711 
712 	// Code generated after this point is unreachable, so any variables
713 	// being read can safely return an undefined value. We have to avoid
714 	// materializing variables after the terminator ret instruction.
715 	Variable::killUnmaterialized();
716 
717 	jit->builder->CreateRet(V(v));
718 }
719 
720 void Nucleus::createBr(BasicBlock *dest)
721 {
722 	RR_DEBUG_INFO_UPDATE_LOC();
723 	Variable::materializeAll();
724 
725 	jit->builder->CreateBr(B(dest));
726 }
727 
728 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
729 {
730 	RR_DEBUG_INFO_UPDATE_LOC();
731 	Variable::materializeAll();
732 	jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
733 }
734 
735 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
736 {
737 	RR_DEBUG_INFO_UPDATE_LOC();
738 	return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
739 }
740 
741 Value *Nucleus::createSub(Value *lhs, Value *rhs)
742 {
743 	RR_DEBUG_INFO_UPDATE_LOC();
744 	return V(jit->builder->CreateSub(V(lhs), V(rhs)));
745 }
746 
747 Value *Nucleus::createMul(Value *lhs, Value *rhs)
748 {
749 	RR_DEBUG_INFO_UPDATE_LOC();
750 	return V(jit->builder->CreateMul(V(lhs), V(rhs)));
751 }
752 
753 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
754 {
755 	RR_DEBUG_INFO_UPDATE_LOC();
756 	return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
757 }
758 
759 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
760 {
761 	RR_DEBUG_INFO_UPDATE_LOC();
762 	return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
763 }
764 
765 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
766 {
767 	RR_DEBUG_INFO_UPDATE_LOC();
768 	return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
769 }
770 
771 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
772 {
773 	RR_DEBUG_INFO_UPDATE_LOC();
774 	return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
775 }
776 
777 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
778 {
779 	RR_DEBUG_INFO_UPDATE_LOC();
780 	return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
781 }
782 
783 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
784 {
785 	RR_DEBUG_INFO_UPDATE_LOC();
786 	return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
787 }
788 
789 Value *Nucleus::createURem(Value *lhs, Value *rhs)
790 {
791 	RR_DEBUG_INFO_UPDATE_LOC();
792 	return V(jit->builder->CreateURem(V(lhs), V(rhs)));
793 }
794 
795 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
796 {
797 	RR_DEBUG_INFO_UPDATE_LOC();
798 	return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
799 }
800 
801 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
802 {
803 	RR_DEBUG_INFO_UPDATE_LOC();
804 	return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
805 }
806 
807 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
808 {
809 	return RValue<Float4>(Nucleus::createFRem(lhs.value(), rhs.value()));
810 }
811 
812 Value *Nucleus::createShl(Value *lhs, Value *rhs)
813 {
814 	RR_DEBUG_INFO_UPDATE_LOC();
815 	auto *clamped_rhs = clampForShift(V(rhs));
816 	return V(jit->builder->CreateShl(V(lhs), clamped_rhs));
817 }
818 
819 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
820 {
821 	RR_DEBUG_INFO_UPDATE_LOC();
822 	auto *clamped_rhs = clampForShift(V(rhs));
823 	return V(jit->builder->CreateLShr(V(lhs), clamped_rhs));
824 }
825 
826 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
827 {
828 	RR_DEBUG_INFO_UPDATE_LOC();
829 	return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
830 }
831 
832 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
833 {
834 	RR_DEBUG_INFO_UPDATE_LOC();
835 	return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
836 }
837 
838 Value *Nucleus::createOr(Value *lhs, Value *rhs)
839 {
840 	RR_DEBUG_INFO_UPDATE_LOC();
841 	return V(jit->builder->CreateOr(V(lhs), V(rhs)));
842 }
843 
844 Value *Nucleus::createXor(Value *lhs, Value *rhs)
845 {
846 	RR_DEBUG_INFO_UPDATE_LOC();
847 	return V(jit->builder->CreateXor(V(lhs), V(rhs)));
848 }
849 
850 Value *Nucleus::createNeg(Value *v)
851 {
852 	RR_DEBUG_INFO_UPDATE_LOC();
853 	return V(jit->builder->CreateNeg(V(v)));
854 }
855 
856 Value *Nucleus::createFNeg(Value *v)
857 {
858 	RR_DEBUG_INFO_UPDATE_LOC();
859 	return V(jit->builder->CreateFNeg(V(v)));
860 }
861 
862 Value *Nucleus::createNot(Value *v)
863 {
864 	RR_DEBUG_INFO_UPDATE_LOC();
865 	return V(jit->builder->CreateNot(V(v)));
866 }
867 
868 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
869 {
870 	RR_DEBUG_INFO_UPDATE_LOC();
871 	switch(asInternalType(type))
872 	{
873 	case Type_v2i32:
874 	case Type_v4i16:
875 	case Type_v8i8:
876 	case Type_v2f32:
877 		return createBitCast(
878 		    createInsertElement(
879 		        V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false))),
880 		        createLoad(createBitCast(ptr, Pointer<Long>::type()), Long::type(), isVolatile, alignment, atomic, memoryOrder),
881 		        0),
882 		    type);
883 	case Type_v2i16:
884 	case Type_v4i8:
885 		if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
886 		{
887 			Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false)));
888 			Value *i = createLoad(createBitCast(ptr, Pointer<Int>::type()), Int::type(), isVolatile, alignment, atomic, memoryOrder);
889 			i = createZExt(i, Long::type());
890 			Value *v = createInsertElement(u, i, 0);
891 			return createBitCast(v, type);
892 		}
893 		// Fallthrough to non-emulated case.
894 	case Type_LLVM:
895 		{
896 			auto elTy = T(type);
897 
898 			if(!atomic)
899 			{
900 				return V(jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile));
901 			}
902 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
903 			{
904 				// Integers and pointers can be atomically loaded by setting
905 				// the ordering constraint on the load instruction.
906 				auto load = jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile);
907 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
908 				return V(load);
909 			}
910 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
911 			{
912 				// LLVM claims to support atomic loads of float types as
913 				// above, but certain backends cannot deal with this.
914 				// Load as an integer and bitcast. See b/136037244.
915 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
916 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
917 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
918 				auto load = jit->builder->CreateAlignedLoad(elAsIntTy, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
919 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
920 				auto loadCast = jit->builder->CreateBitCast(load, elTy);
921 				return V(loadCast);
922 			}
923 			else
924 			{
925 				// More exotic types require falling back to the extern:
926 				// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
927 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
928 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
929 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
930 				auto i8PtrTy = i8Ty->getPointerTo();
931 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
932 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
933 				auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
934 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
935 				auto out = allocateStackVariable(type);
936 				jit->builder->CreateCall(func, {
937 				                                   llvm::ConstantInt::get(sizetTy, size),
938 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
939 				                                   jit->builder->CreatePointerCast(V(out), i8PtrTy),
940 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
941 				                               });
942 				return V(jit->builder->CreateLoad(T(type), V(out)));
943 			}
944 		}
945 	default:
946 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
947 		return nullptr;
948 	}
949 }
950 
951 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
952 {
953 	RR_DEBUG_INFO_UPDATE_LOC();
954 	switch(asInternalType(type))
955 	{
956 	case Type_v2i32:
957 	case Type_v4i16:
958 	case Type_v8i8:
959 	case Type_v2f32:
960 		createStore(
961 		    createExtractElement(
962 		        createBitCast(value, T(llvm::VectorType::get(T(Long::type()), 2, false))), Long::type(), 0),
963 		    createBitCast(ptr, Pointer<Long>::type()),
964 		    Long::type(), isVolatile, alignment, atomic, memoryOrder);
965 		return value;
966 	case Type_v2i16:
967 	case Type_v4i8:
968 		if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
969 		{
970 			createStore(
971 			    createExtractElement(createBitCast(value, Int4::type()), Int::type(), 0),
972 			    createBitCast(ptr, Pointer<Int>::type()),
973 			    Int::type(), isVolatile, alignment, atomic, memoryOrder);
974 			return value;
975 		}
976 		// Fallthrough to non-emulated case.
977 	case Type_LLVM:
978 		{
979 			auto elTy = T(type);
980 
981 			if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
982 			{
983 				// Mark all memory writes as initialized by calling __msan_unpoison
984 				// void __msan_unpoison(const volatile void *a, size_t size)
985 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
986 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
987 				auto voidPtrTy = i8Ty->getPointerTo();
988 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
989 				auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
990 				auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
991 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
992 
993 				jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
994 				                                 llvm::ConstantInt::get(sizetTy, size) });
995 			}
996 
997 			if(!atomic)
998 			{
999 				jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1000 			}
1001 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
1002 			{
1003 				// Integers and pointers can be atomically stored by setting
1004 				// the ordering constraint on the store instruction.
1005 				auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1006 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1007 			}
1008 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
1009 			{
1010 				// LLVM claims to support atomic stores of float types as
1011 				// above, but certain backends cannot deal with this.
1012 				// Store as a bitcast integer. See b/136037244.
1013 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1014 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
1015 				auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
1016 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
1017 				auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
1018 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1019 			}
1020 			else
1021 			{
1022 				// More exotic types require falling back to the extern:
1023 				// void __atomic_store(size_t size, void *ptr, void *val, int ordering)
1024 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1025 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
1026 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1027 				auto i8PtrTy = i8Ty->getPointerTo();
1028 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
1029 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1030 				auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
1031 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1032 				auto copy = allocateStackVariable(type);
1033 				jit->builder->CreateStore(V(value), V(copy));
1034 				jit->builder->CreateCall(func, {
1035 				                                   llvm::ConstantInt::get(sizetTy, size),
1036 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1037 				                                   jit->builder->CreatePointerCast(V(copy), i8PtrTy),
1038 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1039 				                               });
1040 			}
1041 
1042 			return value;
1043 		}
1044 	default:
1045 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1046 		return nullptr;
1047 	}
1048 }
1049 
1050 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1051 {
1052 	RR_DEBUG_INFO_UPDATE_LOC();
1053 
1054 	ASSERT(V(ptr)->getType()->isPointerTy());
1055 	ASSERT(V(mask)->getType()->isVectorTy());
1056 
1057 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1058 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1059 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1060 	auto elVecTy = llvm::VectorType::get(T(elTy), numEls, false);
1061 	auto elVecPtrTy = elVecTy->getPointerTo();
1062 	auto i8Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1063 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1064 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1065 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1066 	return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1067 }
1068 
1069 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
1070 {
1071 	RR_DEBUG_INFO_UPDATE_LOC();
1072 
1073 	ASSERT(V(ptr)->getType()->isPointerTy());
1074 	ASSERT(V(val)->getType()->isVectorTy());
1075 	ASSERT(V(mask)->getType()->isVectorTy());
1076 
1077 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1078 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1079 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1080 	auto elVecTy = V(val)->getType();
1081 	auto elVecPtrTy = elVecTy->getPointerTo();
1082 	auto i1Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1083 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1084 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
1085 	jit->builder->CreateCall(func, { V(val), V(ptr), align, i1Mask });
1086 
1087 	if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
1088 	{
1089 		// Mark memory writes as initialized by calling __msan_unpoison
1090 		// void __msan_unpoison(const volatile void *a, size_t size)
1091 		auto voidTy = llvm::Type::getVoidTy(*jit->context);
1092 		auto voidPtrTy = voidTy->getPointerTo();
1093 		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1094 		auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1095 		auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1096 		auto size = jit->module->getDataLayout().getTypeStoreSize(llvm::cast<llvm::VectorType>(elVecTy)->getElementType());
1097 
1098 		for(unsigned i = 0; i < numEls; i++)
1099 		{
1100 			// Check mask for this element
1101 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1102 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1103 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1104 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1105 			jit->builder->SetInsertPoint(thenBlock);
1106 
1107 			// Insert __msan_unpoison call in conditional block
1108 			auto elPtr = jit->builder->CreateGEP(elVecTy, V(ptr), idx);
1109 			jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(elPtr, voidPtrTy),
1110 			                                 llvm::ConstantInt::get(sizetTy, size) });
1111 
1112 			jit->builder->CreateBr(mergeBlock);
1113 			jit->builder->SetInsertPoint(mergeBlock);
1114 		}
1115 	}
1116 }
1117 
1118 static llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1119 {
1120 	ASSERT(base->getType()->isPointerTy());
1121 	ASSERT(offsets->getType()->isVectorTy());
1122 	ASSERT(mask->getType()->isVectorTy());
1123 
1124 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1125 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1126 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1127 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1128 	auto i8PtrTy = i8Ty->getPointerTo();
1129 	auto elPtrTy = elTy->getPointerTo();
1130 	auto elVecTy = llvm::VectorType::get(elTy, numEls, false);
1131 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1132 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1133 	auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1134 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1135 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1136 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1137 
1138 	if(!__has_feature(memory_sanitizer))
1139 	{
1140 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1141 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
1142 		return jit->builder->CreateCall(func, { elPtrs, align, i1Mask, passthrough });
1143 	}
1144 	else  // __has_feature(memory_sanitizer)
1145 	{
1146 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_gather
1147 		// Work around it by emulating gather with element-wise loads.
1148 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1149 
1150 		Value *result = Nucleus::allocateStackVariable(T(elVecTy));
1151 		Nucleus::createStore(V(passthrough), result, T(elVecTy));
1152 
1153 		for(unsigned i = 0; i < numEls; i++)
1154 		{
1155 			// Check mask for this element
1156 			Value *elementMask = Nucleus::createExtractElement(V(i1Mask), T(i1Ty), i);
1157 
1158 			If(RValue<Bool>(elementMask))
1159 			{
1160 				Value *elPtr = Nucleus::createExtractElement(V(elPtrs), T(elPtrTy), i);
1161 				Value *el = Nucleus::createLoad(elPtr, T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1162 
1163 				Value *v = Nucleus::createLoad(result, T(elVecTy));
1164 				v = Nucleus::createInsertElement(v, el, i);
1165 				Nucleus::createStore(v, result, T(elVecTy));
1166 			}
1167 		}
1168 
1169 		return V(Nucleus::createLoad(result, T(elVecTy)));
1170 	}
1171 }
1172 
1173 RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1174 {
1175 	return As<SIMD::Float>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1176 }
1177 
1178 RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1179 {
1180 	return As<SIMD::Int>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1181 }
1182 
1183 static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
1184 {
1185 	ASSERT(base->getType()->isPointerTy());
1186 	ASSERT(val->getType()->isVectorTy());
1187 	ASSERT(offsets->getType()->isVectorTy());
1188 	ASSERT(mask->getType()->isVectorTy());
1189 
1190 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1191 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1192 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1193 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1194 	auto i8PtrTy = i8Ty->getPointerTo();
1195 	auto elVecTy = val->getType();
1196 	auto elTy = llvm::cast<llvm::VectorType>(elVecTy)->getElementType();
1197 	auto elPtrTy = elTy->getPointerTo();
1198 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1199 
1200 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1201 	auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1202 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1203 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1204 
1205 	if(!__has_feature(memory_sanitizer))
1206 	{
1207 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1208 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
1209 		jit->builder->CreateCall(func, { val, elPtrs, align, i1Mask });
1210 	}
1211 	else  // __has_feature(memory_sanitizer)
1212 	{
1213 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_scatter
1214 		// Work around it by emulating scatter with element-wise stores.
1215 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1216 
1217 		for(unsigned i = 0; i < numEls; i++)
1218 		{
1219 			// Check mask for this element
1220 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1221 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1222 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1223 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1224 			jit->builder->SetInsertPoint(thenBlock);
1225 
1226 			auto el = jit->builder->CreateExtractElement(val, idx);
1227 			auto elPtr = jit->builder->CreateExtractElement(elPtrs, idx);
1228 			Nucleus::createStore(V(el), V(elPtr), T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1229 
1230 			jit->builder->CreateBr(mergeBlock);
1231 			jit->builder->SetInsertPoint(mergeBlock);
1232 		}
1233 	}
1234 }
1235 
1236 void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
1237 {
1238 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1239 }
1240 
1241 void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
1242 {
1243 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1244 }
1245 
1246 void Nucleus::createFence(std::memory_order memoryOrder)
1247 {
1248 	RR_DEBUG_INFO_UPDATE_LOC();
1249 	jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1250 }
1251 
1252 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1253 {
1254 	RR_DEBUG_INFO_UPDATE_LOC();
1255 
1256 	if(sizeof(void *) == 8)
1257 	{
1258 		// LLVM manual: "When indexing into an array, pointer or vector,
1259 		// integers of any width are allowed, and they are not required to
1260 		// be constant. These integers are treated as signed values where
1261 		// relevant."
1262 		//
1263 		// Thus if we want indexes to be treated as unsigned we have to
1264 		// zero-extend them ourselves.
1265 		//
1266 		// Note that this is not because we want to address anywhere near
1267 		// 4 GB of data. Instead this is important for performance because
1268 		// x86 supports automatic zero-extending of 32-bit registers to
1269 		// 64-bit. Thus, when indexing into an array, using a uint32 is
1270 		// actually faster than an int32.
1271 		index = unsignedIndex ? createZExt(index, Long::type()) : createSExt(index, Long::type());
1272 	}
1273 
1274 	// For non-emulated types we can rely on LLVM's GEP to calculate the
1275 	// effective address correctly.
1276 	if(asInternalType(type) == Type_LLVM)
1277 	{
1278 		return V(jit->builder->CreateGEP(T(type), V(ptr), V(index)));
1279 	}
1280 
1281 	// For emulated types we have to multiply the index by the intended
1282 	// type size ourselves to obtain the byte offset.
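	// (Editorial example: for Type_v2i32, typeSize() is 8 bytes, so index 3
	// becomes a byte offset of 24 before the byte-wise GEP below.)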
1283 	index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));
1284 
1285 	// Cast to a byte pointer, apply the byte offset, and cast back to the
1286 	// original pointer type.
1287 	return createBitCast(
1288 	    V(jit->builder->CreateGEP(T(Byte::type()), V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::type()), 0)))), V(index))),
1289 	    T(llvm::PointerType::get(T(type), 0)));
1290 }
1291 
1292 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1293 {
1294 	RR_DEBUG_INFO_UPDATE_LOC();
1295 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value),
1296 #if LLVM_VERSION_MAJOR >= 11
1297 	                                       llvm::MaybeAlign(),
1298 #endif
1299 	                                       atomicOrdering(true, memoryOrder)));
1300 }
1301 
1302 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1303 {
1304 	RR_DEBUG_INFO_UPDATE_LOC();
1305 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value),
1306 #if LLVM_VERSION_MAJOR >= 11
1307 	                                       llvm::MaybeAlign(),
1308 #endif
1309 	                                       atomicOrdering(true, memoryOrder)));
1310 }
1311 
1312 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1313 {
1314 	RR_DEBUG_INFO_UPDATE_LOC();
1315 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value),
1316 #if LLVM_VERSION_MAJOR >= 11
1317 	                                       llvm::MaybeAlign(),
1318 #endif
1319 	                                       atomicOrdering(true, memoryOrder)));
1320 }
1321 
1322 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1323 {
1324 	RR_DEBUG_INFO_UPDATE_LOC();
1325 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value),
1326 #if LLVM_VERSION_MAJOR >= 11
1327 	                                       llvm::MaybeAlign(),
1328 #endif
1329 	                                       atomicOrdering(true, memoryOrder)));
1330 }
1331 
1332 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1333 {
1334 	RR_DEBUG_INFO_UPDATE_LOC();
1335 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value),
1336 #if LLVM_VERSION_MAJOR >= 11
1337 	                                       llvm::MaybeAlign(),
1338 #endif
1339 	                                       atomicOrdering(true, memoryOrder)));
1340 }
1341 
1342 Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1343 {
1344 	RR_DEBUG_INFO_UPDATE_LOC();
1345 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value),
1346 #if LLVM_VERSION_MAJOR >= 11
1347 	                                       llvm::MaybeAlign(),
1348 #endif
1349 	                                       atomicOrdering(true, memoryOrder)));
1350 }
1351 
1352 Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1353 {
1354 	RR_DEBUG_INFO_UPDATE_LOC();
1355 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value),
1356 #if LLVM_VERSION_MAJOR >= 11
1357 	                                       llvm::MaybeAlign(),
1358 #endif
1359 	                                       atomicOrdering(true, memoryOrder)));
1360 }
1361 
createAtomicUMin(Value * ptr,Value * value,std::memory_order memoryOrder)1362 Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1363 {
1364 	RR_DEBUG_INFO_UPDATE_LOC();
1365 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value),
1366 #if LLVM_VERSION_MAJOR >= 11
1367 	                                       llvm::MaybeAlign(),
1368 #endif
1369 	                                       atomicOrdering(true, memoryOrder)));
1370 }
1371 
createAtomicUMax(Value * ptr,Value * value,std::memory_order memoryOrder)1372 Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1373 {
1374 	RR_DEBUG_INFO_UPDATE_LOC();
1375 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value),
1376 #if LLVM_VERSION_MAJOR >= 11
1377 	                                       llvm::MaybeAlign(),
1378 #endif
1379 	                                       atomicOrdering(true, memoryOrder)));
1380 }
1381 
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1382 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1383 {
1384 	RR_DEBUG_INFO_UPDATE_LOC();
1385 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value),
1386 #if LLVM_VERSION_MAJOR >= 11
1387 	                                       llvm::MaybeAlign(),
1388 #endif
1389 	                                       atomicOrdering(true, memoryOrder)));
1390 }
1391 
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1392 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1393 {
1394 	RR_DEBUG_INFO_UPDATE_LOC();
1395 	// Note: LLVM's cmpxchg instruction (AtomicCmpXchgInst) returns a two-member struct {original value, success flag}, not the value directly.
1396 	return V(jit->builder->CreateExtractValue(
1397 	    jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value),
1398 #if LLVM_VERSION_MAJOR >= 11
1399 	                                      llvm::MaybeAlign(),
1400 #endif
1401 	                                      atomicOrdering(true, memoryOrderEqual),
1402 	                                      atomicOrdering(true, memoryOrderUnequal)),
1403 	    llvm::ArrayRef<unsigned>(0u)));
1404 }
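// For comparison, a minimal std::atomic sketch of the same "return the previous
// value" contract (illustrative only; the memory orders are picked arbitrarily):
//
//   int compareExchange(std::atomic<int> &object, int expected, int desired)
//   {
//       // On failure, compare_exchange_strong() overwrites 'expected' with the
//       // value observed in memory -- the same first struct member extracted above.
//       object.compare_exchange_strong(expected, desired,
//                                      std::memory_order_acq_rel,
//                                      std::memory_order_acquire);
//       return expected;
//   }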
1405 
createTrunc(Value * v,Type * destType)1406 Value *Nucleus::createTrunc(Value *v, Type *destType)
1407 {
1408 	RR_DEBUG_INFO_UPDATE_LOC();
1409 	return V(jit->builder->CreateTrunc(V(v), T(destType)));
1410 }
1411 
createZExt(Value * v,Type * destType)1412 Value *Nucleus::createZExt(Value *v, Type *destType)
1413 {
1414 	RR_DEBUG_INFO_UPDATE_LOC();
1415 	return V(jit->builder->CreateZExt(V(v), T(destType)));
1416 }
1417 
createSExt(Value * v,Type * destType)1418 Value *Nucleus::createSExt(Value *v, Type *destType)
1419 {
1420 	RR_DEBUG_INFO_UPDATE_LOC();
1421 	return V(jit->builder->CreateSExt(V(v), T(destType)));
1422 }
1423 
createFPToUI(Value * v,Type * destType)1424 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1425 {
1426 	RR_DEBUG_INFO_UPDATE_LOC();
1427 	return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1428 }
1429 
createFPToSI(Value * v,Type * destType)1430 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1431 {
1432 	RR_DEBUG_INFO_UPDATE_LOC();
1433 	return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1434 }
1435 
createSIToFP(Value * v,Type * destType)1436 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1437 {
1438 	RR_DEBUG_INFO_UPDATE_LOC();
1439 	return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1440 }
1441 
createFPTrunc(Value * v,Type * destType)1442 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1443 {
1444 	RR_DEBUG_INFO_UPDATE_LOC();
1445 	return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1446 }
1447 
createFPExt(Value * v,Type * destType)1448 Value *Nucleus::createFPExt(Value *v, Type *destType)
1449 {
1450 	RR_DEBUG_INFO_UPDATE_LOC();
1451 	return V(jit->builder->CreateFPExt(V(v), T(destType)));
1452 }
1453 
createBitCast(Value * v,Type * destType)1454 Value *Nucleus::createBitCast(Value *v, Type *destType)
1455 {
1456 	RR_DEBUG_INFO_UPDATE_LOC();
1457 	// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1458 	// support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1459 	// reading back as the destination type.
1460 	if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1461 	{
1462 		Value *readAddress = allocateStackVariable(destType);
1463 		Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1464 		createStore(v, writeAddress, T(V(v)->getType()));
1465 		return createLoad(readAddress, destType);
1466 	}
1467 	else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1468 	{
1469 		Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1470 		createStore(v, writeAddress, T(V(v)->getType()));
1471 		Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1472 		return createLoad(readAddress, destType);
1473 	}
1474 
1475 	return V(jit->builder->CreateBitCast(V(v), T(destType)));
1476 }
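// The stack round-trip above is the IR-level analogue of memcpy-based type
// punning. A hypothetical host-side sketch (illustrative only; it zero-fills
// instead of leaving the unwritten tail undefined):
//
//   template<typename To, typename From>
//   To bitCastViaMemory(const From &from)
//   {
//       To to{};  // the "stack slot" in the destination type
//       std::memcpy(&to, &from, sizeof(To) < sizeof(From) ? sizeof(To) : sizeof(From));
//       return to;  // same bits reinterpreted, no value conversion
//   }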
1477 
createICmpEQ(Value * lhs,Value * rhs)1478 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1479 {
1480 	RR_DEBUG_INFO_UPDATE_LOC();
1481 	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1482 }
1483 
createICmpNE(Value * lhs,Value * rhs)1484 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1485 {
1486 	RR_DEBUG_INFO_UPDATE_LOC();
1487 	return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1488 }
1489 
createICmpUGT(Value * lhs,Value * rhs)1490 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1491 {
1492 	RR_DEBUG_INFO_UPDATE_LOC();
1493 	return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1494 }
1495 
createICmpUGE(Value * lhs,Value * rhs)1496 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1497 {
1498 	RR_DEBUG_INFO_UPDATE_LOC();
1499 	return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1500 }
1501 
createICmpULT(Value * lhs,Value * rhs)1502 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1503 {
1504 	RR_DEBUG_INFO_UPDATE_LOC();
1505 	return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1506 }
1507 
createICmpULE(Value * lhs,Value * rhs)1508 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1509 {
1510 	RR_DEBUG_INFO_UPDATE_LOC();
1511 	return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1512 }
1513 
createICmpSGT(Value * lhs,Value * rhs)1514 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1515 {
1516 	RR_DEBUG_INFO_UPDATE_LOC();
1517 	return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1518 }
1519 
createICmpSGE(Value * lhs,Value * rhs)1520 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1521 {
1522 	RR_DEBUG_INFO_UPDATE_LOC();
1523 	return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1524 }
1525 
createICmpSLT(Value * lhs,Value * rhs)1526 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1527 {
1528 	RR_DEBUG_INFO_UPDATE_LOC();
1529 	return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1530 }
1531 
createICmpSLE(Value * lhs,Value * rhs)1532 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1533 {
1534 	RR_DEBUG_INFO_UPDATE_LOC();
1535 	return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1536 }
1537 
createFCmpOEQ(Value * lhs,Value * rhs)1538 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1539 {
1540 	RR_DEBUG_INFO_UPDATE_LOC();
1541 	return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1542 }
1543 
createFCmpOGT(Value * lhs,Value * rhs)1544 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1545 {
1546 	RR_DEBUG_INFO_UPDATE_LOC();
1547 	return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1548 }
1549 
createFCmpOGE(Value * lhs,Value * rhs)1550 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1551 {
1552 	RR_DEBUG_INFO_UPDATE_LOC();
1553 	return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1554 }
1555 
createFCmpOLT(Value * lhs,Value * rhs)1556 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1557 {
1558 	RR_DEBUG_INFO_UPDATE_LOC();
1559 	return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1560 }
1561 
createFCmpOLE(Value * lhs,Value * rhs)1562 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1563 {
1564 	RR_DEBUG_INFO_UPDATE_LOC();
1565 	return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1566 }
1567 
createFCmpONE(Value * lhs,Value * rhs)1568 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1569 {
1570 	RR_DEBUG_INFO_UPDATE_LOC();
1571 	return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1572 }
1573 
createFCmpORD(Value * lhs,Value * rhs)1574 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1575 {
1576 	RR_DEBUG_INFO_UPDATE_LOC();
1577 	return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1578 }
1579 
createFCmpUNO(Value * lhs,Value * rhs)1580 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1581 {
1582 	RR_DEBUG_INFO_UPDATE_LOC();
1583 	return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1584 }
1585 
createFCmpUEQ(Value * lhs,Value * rhs)1586 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1587 {
1588 	RR_DEBUG_INFO_UPDATE_LOC();
1589 	return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1590 }
1591 
createFCmpUGT(Value * lhs,Value * rhs)1592 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1593 {
1594 	RR_DEBUG_INFO_UPDATE_LOC();
1595 	return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1596 }
1597 
createFCmpUGE(Value * lhs,Value * rhs)1598 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1599 {
1600 	RR_DEBUG_INFO_UPDATE_LOC();
1601 	return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1602 }
1603 
createFCmpULT(Value * lhs,Value * rhs)1604 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1605 {
1606 	RR_DEBUG_INFO_UPDATE_LOC();
1607 	return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1608 }
1609 
createFCmpULE(Value * lhs,Value * rhs)1610 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1611 {
1612 	RR_DEBUG_INFO_UPDATE_LOC();
1613 	return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1614 }
1615 
createFCmpUNE(Value * lhs,Value * rhs)1616 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1617 {
1618 	RR_DEBUG_INFO_UPDATE_LOC();
1619 	return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1620 }
1621 
createExtractElement(Value * vector,Type * type,int index)1622 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1623 {
1624 	RR_DEBUG_INFO_UPDATE_LOC();
1625 	ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
1626 	return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1627 }
1628 
createInsertElement(Value * vector,Value * element,int index)1629 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1630 {
1631 	RR_DEBUG_INFO_UPDATE_LOC();
1632 	return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1633 }
1634 
createShuffleVector(Value * v1,Value * v2,std::vector<int> select)1635 Value *Nucleus::createShuffleVector(Value *v1, Value *v2, std::vector<int> select)
1636 {
1637 	RR_DEBUG_INFO_UPDATE_LOC();
1638 
1639 	size_t size = llvm::cast<llvm::FixedVectorType>(V(v1)->getType())->getNumElements();
1640 	ASSERT(size == llvm::cast<llvm::FixedVectorType>(V(v2)->getType())->getNumElements());
1641 
1642 	llvm::SmallVector<int, 16> mask;
1643 	const size_t selectSize = select.size();
1644 	for(size_t i = 0; i < size; i++)
1645 	{
1646 		mask.push_back(select[i % selectSize]);
1647 	}
1648 
1649 	return V(lowerShuffleVector(V(v1), V(v2), mask));
1650 }
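// Example of the modulo wrap-around above: a 4-entry 'select' of { 0, 2, 4, 6 }
// applied to 8-lane operands produces the mask { 0, 2, 4, 6, 0, 2, 4, 6 } -- the
// pattern simply repeats until every output lane is covered.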
1651 
createSelect(Value * c,Value * ifTrue,Value * ifFalse)1652 Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1653 {
1654 	RR_DEBUG_INFO_UPDATE_LOC();
1655 	return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1656 }
1657 
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1658 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1659 {
1660 	RR_DEBUG_INFO_UPDATE_LOC();
1661 	return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1662 }
1663 
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1664 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1665 {
1666 	RR_DEBUG_INFO_UPDATE_LOC();
1667 	llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1668 	sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), label, true), B(branch));
1669 }
1670 
createUnreachable()1671 void Nucleus::createUnreachable()
1672 {
1673 	RR_DEBUG_INFO_UPDATE_LOC();
1674 	jit->builder->CreateUnreachable();
1675 }
1676 
getType(Value * value)1677 Type *Nucleus::getType(Value *value)
1678 {
1679 	return T(V(value)->getType());
1680 }
1681 
getContainedType(Type * vectorType)1682 Type *Nucleus::getContainedType(Type *vectorType)
1683 {
1684 	return T(T(vectorType)->getContainedType(0));
1685 }
1686 
getPointerType(Type * ElementType)1687 Type *Nucleus::getPointerType(Type *ElementType)
1688 {
1689 	return T(llvm::PointerType::get(T(ElementType), 0));
1690 }
1691 
getNaturalIntType()1692 static llvm::Type *getNaturalIntType()
1693 {
1694 	return llvm::Type::getIntNTy(*jit->context, sizeof(int) * 8);
1695 }
1696 
getPrintfStorageType(Type * valueType)1697 Type *Nucleus::getPrintfStorageType(Type *valueType)
1698 {
1699 	llvm::Type *valueTy = T(valueType);
1700 	if(valueTy->isIntegerTy())
1701 	{
1702 		return T(getNaturalIntType());
1703 	}
1704 	if(valueTy->isFloatTy())
1705 	{
1706 		return T(llvm::Type::getDoubleTy(*jit->context));
1707 	}
1708 
1709 	UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
1710 	return {};
1711 }
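// These widenings mirror C's default argument promotions for variadic calls:
// integer values are stored as the natural int and float as double, which is
// what the host printf() implementation expects to read off the argument list.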
1712 
createNullValue(Type * Ty)1713 Value *Nucleus::createNullValue(Type *Ty)
1714 {
1715 	RR_DEBUG_INFO_UPDATE_LOC();
1716 	return V(llvm::Constant::getNullValue(T(Ty)));
1717 }
1718 
createConstantLong(int64_t i)1719 Value *Nucleus::createConstantLong(int64_t i)
1720 {
1721 	RR_DEBUG_INFO_UPDATE_LOC();
1722 	return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), i, true));
1723 }
1724 
createConstantInt(int i)1725 Value *Nucleus::createConstantInt(int i)
1726 {
1727 	RR_DEBUG_INFO_UPDATE_LOC();
1728 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, true));
1729 }
1730 
createConstantInt(unsigned int i)1731 Value *Nucleus::createConstantInt(unsigned int i)
1732 {
1733 	RR_DEBUG_INFO_UPDATE_LOC();
1734 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, false));
1735 }
1736 
createConstantBool(bool b)1737 Value *Nucleus::createConstantBool(bool b)
1738 {
1739 	RR_DEBUG_INFO_UPDATE_LOC();
1740 	return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*jit->context), b));
1741 }
1742 
createConstantByte(signed char i)1743 Value *Nucleus::createConstantByte(signed char i)
1744 {
1745 	RR_DEBUG_INFO_UPDATE_LOC();
1746 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, true));
1747 }
1748 
createConstantByte(unsigned char i)1749 Value *Nucleus::createConstantByte(unsigned char i)
1750 {
1751 	RR_DEBUG_INFO_UPDATE_LOC();
1752 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, false));
1753 }
1754 
createConstantShort(short i)1755 Value *Nucleus::createConstantShort(short i)
1756 {
1757 	RR_DEBUG_INFO_UPDATE_LOC();
1758 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, true));
1759 }
1760 
createConstantShort(unsigned short i)1761 Value *Nucleus::createConstantShort(unsigned short i)
1762 {
1763 	RR_DEBUG_INFO_UPDATE_LOC();
1764 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, false));
1765 }
1766 
createConstantFloat(float x)1767 Value *Nucleus::createConstantFloat(float x)
1768 {
1769 	RR_DEBUG_INFO_UPDATE_LOC();
1770 	return V(llvm::ConstantFP::get(T(Float::type()), x));
1771 }
1772 
createNullPointer(Type * Ty)1773 Value *Nucleus::createNullPointer(Type *Ty)
1774 {
1775 	RR_DEBUG_INFO_UPDATE_LOC();
1776 	return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1777 }
1778 
createConstantVector(std::vector<int64_t> constants,Type * type)1779 Value *Nucleus::createConstantVector(std::vector<int64_t> constants, Type *type)
1780 {
1781 	RR_DEBUG_INFO_UPDATE_LOC();
1782 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1783 	const size_t numConstants = constants.size();                                             // Number of provided constants for the (emulated) type.
1784 	const size_t numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1785 	llvm::SmallVector<llvm::Constant *, 16> constantVector;
1786 
1787 	for(size_t i = 0; i < numElements; i++)
1788 	{
1789 		constantVector.push_back(llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]));
1790 	}
1791 
1792 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector)));
1793 }
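// As with shuffle selects, the constants wrap around: passing { 1, 2 } for a
// four-element integer vector type produces the constant <1, 2, 1, 2>.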
1794 
createConstantVector(std::vector<double> constants,Type * type)1795 Value *Nucleus::createConstantVector(std::vector<double> constants, Type *type)
1796 {
1797 	RR_DEBUG_INFO_UPDATE_LOC();
1798 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1799 	const size_t numConstants = constants.size();                                             // Number of provided constants for the (emulated) type.
1800 	const size_t numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1801 	llvm::SmallVector<llvm::Constant *, 16> constantVector;
1802 
1803 	for(size_t i = 0; i < numElements; i++)
1804 	{
1805 		constantVector.push_back(llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]));
1806 	}
1807 
1808 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector)));
1809 }
1810 
createConstantString(const char * v)1811 Value *Nucleus::createConstantString(const char *v)
1812 {
1813 	// NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
1814 	auto ptr = jit->builder->CreateGlobalStringPtr(v);
1815 	return V(ptr);
1816 }
1817 
setOptimizerCallback(OptimizerCallback * callback)1818 void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
1819 {
1820 	// The LLVM backend does not produce optimizer reports.
1821 	(void)callback;
1822 }
1823 
type()1824 Type *Void::type()
1825 {
1826 	return T(llvm::Type::getVoidTy(*jit->context));
1827 }
1828 
type()1829 Type *Bool::type()
1830 {
1831 	return T(llvm::Type::getInt1Ty(*jit->context));
1832 }
1833 
type()1834 Type *Byte::type()
1835 {
1836 	return T(llvm::Type::getInt8Ty(*jit->context));
1837 }
1838 
type()1839 Type *SByte::type()
1840 {
1841 	return T(llvm::Type::getInt8Ty(*jit->context));
1842 }
1843 
type()1844 Type *Short::type()
1845 {
1846 	return T(llvm::Type::getInt16Ty(*jit->context));
1847 }
1848 
type()1849 Type *UShort::type()
1850 {
1851 	return T(llvm::Type::getInt16Ty(*jit->context));
1852 }
1853 
type()1854 Type *Byte4::type()
1855 {
1856 	return T(Type_v4i8);
1857 }
1858 
type()1859 Type *SByte4::type()
1860 {
1861 	return T(Type_v4i8);
1862 }
1863 
AddSat(RValue<Byte8> x,RValue<Byte8> y)1864 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1865 {
1866 	RR_DEBUG_INFO_UPDATE_LOC();
1867 #if defined(__i386__) || defined(__x86_64__)
1868 	return x86::paddusb(x, y);
1869 #else
1870 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
1871 #endif
1872 }
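// Scalar reference for the unsigned saturating add lowered above, one byte lane
// at a time (illustrative sketch only, assumes <cstdint>):
//
//   uint8_t addSat(uint8_t a, uint8_t b)
//   {
//       unsigned sum = unsigned(a) + unsigned(b);
//       return uint8_t(sum > 0xFF ? 0xFF : sum);  // clamp instead of wrapping
//   }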
1873 
SubSat(RValue<Byte8> x,RValue<Byte8> y)1874 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1875 {
1876 	RR_DEBUG_INFO_UPDATE_LOC();
1877 #if defined(__i386__) || defined(__x86_64__)
1878 	return x86::psubusb(x, y);
1879 #else
1880 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
1881 #endif
1882 }
1883 
SignMask(RValue<Byte8> x)1884 RValue<Int> SignMask(RValue<Byte8> x)
1885 {
1886 	RR_DEBUG_INFO_UPDATE_LOC();
1887 #if defined(__i386__) || defined(__x86_64__)
1888 	return x86::pmovmskb(x);
1889 #else
1890 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1891 #endif
1892 }
1893 
1894 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1895 //	{
1896 //#if defined(__i386__) || defined(__x86_64__)
1897 //		return x86::pcmpgtb(x, y);   // FIXME: Signedness
1898 //#else
1899 //		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1900 //#endif
1901 //	}
1902 
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)1903 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1904 {
1905 	RR_DEBUG_INFO_UPDATE_LOC();
1906 #if defined(__i386__) || defined(__x86_64__)
1907 	return x86::pcmpeqb(x, y);
1908 #else
1909 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1910 #endif
1911 }
1912 
type()1913 Type *Byte8::type()
1914 {
1915 	return T(Type_v8i8);
1916 }
1917 
AddSat(RValue<SByte8> x,RValue<SByte8> y)1918 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1919 {
1920 	RR_DEBUG_INFO_UPDATE_LOC();
1921 #if defined(__i386__) || defined(__x86_64__)
1922 	return x86::paddsb(x, y);
1923 #else
1924 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
1925 #endif
1926 }
1927 
SubSat(RValue<SByte8> x,RValue<SByte8> y)1928 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1929 {
1930 	RR_DEBUG_INFO_UPDATE_LOC();
1931 #if defined(__i386__) || defined(__x86_64__)
1932 	return x86::psubsb(x, y);
1933 #else
1934 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
1935 #endif
1936 }
1937 
SignMask(RValue<SByte8> x)1938 RValue<Int> SignMask(RValue<SByte8> x)
1939 {
1940 	RR_DEBUG_INFO_UPDATE_LOC();
1941 #if defined(__i386__) || defined(__x86_64__)
1942 	return x86::pmovmskb(As<Byte8>(x));
1943 #else
1944 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1945 #endif
1946 }
1947 
CmpGT(RValue<SByte8> x,RValue<SByte8> y)1948 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1949 {
1950 	RR_DEBUG_INFO_UPDATE_LOC();
1951 #if defined(__i386__) || defined(__x86_64__)
1952 	return x86::pcmpgtb(x, y);
1953 #else
1954 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1955 #endif
1956 }
1957 
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)1958 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1959 {
1960 	RR_DEBUG_INFO_UPDATE_LOC();
1961 #if defined(__i386__) || defined(__x86_64__)
1962 	return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1963 #else
1964 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1965 #endif
1966 }
1967 
type()1968 Type *SByte8::type()
1969 {
1970 	return T(Type_v8i8);
1971 }
1972 
type()1973 Type *Byte16::type()
1974 {
1975 	return T(llvm::VectorType::get(T(Byte::type()), 16, false));
1976 }
1977 
type()1978 Type *SByte16::type()
1979 {
1980 	return T(llvm::VectorType::get(T(SByte::type()), 16, false));
1981 }
1982 
type()1983 Type *Short2::type()
1984 {
1985 	return T(Type_v2i16);
1986 }
1987 
type()1988 Type *UShort2::type()
1989 {
1990 	return T(Type_v2i16);
1991 }
1992 
Short4(RValue<Int4> cast)1993 Short4::Short4(RValue<Int4> cast)
1994 {
1995 	RR_DEBUG_INFO_UPDATE_LOC();
1996 	std::vector<int> select = { 0, 2, 4, 6, 0, 2, 4, 6 };
1997 	Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
1998 
1999 	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
2000 	Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value();
2001 
2002 	storeValue(short4);
2003 }
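// The shuffle above is a truncating pack: viewing the Int4 as 16-bit lanes and
// keeping the even lanes (0, 2, 4, 6) selects the low half of each 32-bit
// element on little-endian targets, i.e. plain integer truncation to Short4.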
2004 
2005 //	Short4::Short4(RValue<Float> cast)
2006 //	{
2007 //	}
2008 
Short4(RValue<Float4> cast)2009 Short4::Short4(RValue<Float4> cast)
2010 {
2011 	RR_DEBUG_INFO_UPDATE_LOC();
2012 	Int4 v4i32 = Int4(cast);
2013 #if defined(__i386__) || defined(__x86_64__)
2014 	v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2015 #else
2016 	Value *v = v4i32.loadValue();
2017 	v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
2018 #endif
2019 
2020 	storeValue(As<Short4>(Int2(v4i32)).value());
2021 }
2022 
operator <<(RValue<Short4> lhs,unsigned char rhs)2023 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2024 {
2025 	RR_DEBUG_INFO_UPDATE_LOC();
2026 #if defined(__i386__) || defined(__x86_64__)
2027 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2028 
2029 	return x86::psllw(lhs, rhs);
2030 #else
2031 	return As<Short4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2032 #endif
2033 }
2034 
operator >>(RValue<Short4> lhs,unsigned char rhs)2035 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2036 {
2037 	RR_DEBUG_INFO_UPDATE_LOC();
2038 #if defined(__i386__) || defined(__x86_64__)
2039 	return x86::psraw(lhs, rhs);
2040 #else
2041 	return As<Short4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2042 #endif
2043 }
2044 
Max(RValue<Short4> x,RValue<Short4> y)2045 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2046 {
2047 	RR_DEBUG_INFO_UPDATE_LOC();
2048 #if defined(__i386__) || defined(__x86_64__)
2049 	return x86::pmaxsw(x, y);
2050 #else
2051 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
2052 #endif
2053 }
2054 
Min(RValue<Short4> x,RValue<Short4> y)2055 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2056 {
2057 	RR_DEBUG_INFO_UPDATE_LOC();
2058 #if defined(__i386__) || defined(__x86_64__)
2059 	return x86::pminsw(x, y);
2060 #else
2061 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
2062 #endif
2063 }
2064 
AddSat(RValue<Short4> x,RValue<Short4> y)2065 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2066 {
2067 	RR_DEBUG_INFO_UPDATE_LOC();
2068 #if defined(__i386__) || defined(__x86_64__)
2069 	return x86::paddsw(x, y);
2070 #else
2071 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
2072 #endif
2073 }
2074 
SubSat(RValue<Short4> x,RValue<Short4> y)2075 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2076 {
2077 	RR_DEBUG_INFO_UPDATE_LOC();
2078 #if defined(__i386__) || defined(__x86_64__)
2079 	return x86::psubsw(x, y);
2080 #else
2081 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
2082 #endif
2083 }
2084 
MulHigh(RValue<Short4> x,RValue<Short4> y)2085 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2086 {
2087 	RR_DEBUG_INFO_UPDATE_LOC();
2088 #if defined(__i386__) || defined(__x86_64__)
2089 	return x86::pmulhw(x, y);
2090 #else
2091 	return As<Short4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2092 #endif
2093 }
2094 
MulAdd(RValue<Short4> x,RValue<Short4> y)2095 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2096 {
2097 	RR_DEBUG_INFO_UPDATE_LOC();
2098 #if defined(__i386__) || defined(__x86_64__)
2099 	return x86::pmaddwd(x, y);
2100 #else
2101 	return As<Int2>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2102 #endif
2103 }
2104 
PackSigned(RValue<Short4> x,RValue<Short4> y)2105 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2106 {
2107 	RR_DEBUG_INFO_UPDATE_LOC();
2108 #if defined(__i386__) || defined(__x86_64__)
2109 	auto result = x86::packsswb(x, y);
2110 #else
2111 	auto result = V(lowerPack(V(x.value()), V(y.value()), true));
2112 #endif
2113 	return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
2114 }
2115 
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2116 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2117 {
2118 	RR_DEBUG_INFO_UPDATE_LOC();
2119 #if defined(__i386__) || defined(__x86_64__)
2120 	auto result = x86::packuswb(x, y);
2121 #else
2122 	auto result = V(lowerPack(V(x.value()), V(y.value()), false));
2123 #endif
2124 	return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
2125 }
2126 
CmpGT(RValue<Short4> x,RValue<Short4> y)2127 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2128 {
2129 	RR_DEBUG_INFO_UPDATE_LOC();
2130 #if defined(__i386__) || defined(__x86_64__)
2131 	return x86::pcmpgtw(x, y);
2132 #else
2133 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
2134 #endif
2135 }
2136 
CmpEQ(RValue<Short4> x,RValue<Short4> y)2137 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2138 {
2139 	RR_DEBUG_INFO_UPDATE_LOC();
2140 #if defined(__i386__) || defined(__x86_64__)
2141 	return x86::pcmpeqw(x, y);
2142 #else
2143 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
2144 #endif
2145 }
2146 
type()2147 Type *Short4::type()
2148 {
2149 	return T(Type_v4i16);
2150 }
2151 
UShort4(RValue<Float4> cast,bool saturate)2152 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2153 {
2154 	RR_DEBUG_INFO_UPDATE_LOC();
2155 	if(saturate)
2156 	{
2157 #if defined(__i386__) || defined(__x86_64__)
2158 		if(CPUID::supportsSSE4_1())
2159 		{
2160 			Int4 int4(Min(cast, Float4(0xFFFF)));  // packusdw takes care of 0x0000 saturation
2161 			*this = As<Short4>(PackUnsigned(int4, int4));
2162 		}
2163 		else
2164 #endif
2165 		{
2166 			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2167 		}
2168 	}
2169 	else
2170 	{
2171 		*this = Short4(Int4(cast));
2172 	}
2173 }
2174 
operator <<(RValue<UShort4> lhs,unsigned char rhs)2175 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2176 {
2177 	RR_DEBUG_INFO_UPDATE_LOC();
2178 #if defined(__i386__) || defined(__x86_64__)
2179 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2180 
2181 	return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
2182 #else
2183 	return As<UShort4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2184 #endif
2185 }
2186 
operator >>(RValue<UShort4> lhs,unsigned char rhs)2187 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2188 {
2189 	RR_DEBUG_INFO_UPDATE_LOC();
2190 #if defined(__i386__) || defined(__x86_64__)
2191 	//	return RValue<Short4>(Nucleus::createLShr(lhs.value(), rhs.value()));
2192 
2193 	return x86::psrlw(lhs, rhs);
2194 #else
2195 	return As<UShort4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2196 #endif
2197 }
2198 
Max(RValue<UShort4> x,RValue<UShort4> y)2199 RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2200 {
2201 	RR_DEBUG_INFO_UPDATE_LOC();
2202 	return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2203 }
2204 
Min(RValue<UShort4> x,RValue<UShort4> y)2205 RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2206 {
2207 	RR_DEBUG_INFO_UPDATE_LOC();
2208 	return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2209 }
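// Both functions above rely on the classic bias trick: subtracting 0x8000 maps
// the unsigned range [0, 0xFFFF] onto the signed range [-0x8000, 0x7FFF] while
// preserving order, so a signed Min/Max gives the unsigned answer once 0x8000 is
// added back. For example, Max(0x0001, 0xFFFF) becomes Max(-0x7FFF, 0x7FFF) =
// 0x7FFF, and 0x7FFF + 0x8000 == 0xFFFF.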
2210 
AddSat(RValue<UShort4> x,RValue<UShort4> y)2211 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2212 {
2213 	RR_DEBUG_INFO_UPDATE_LOC();
2214 #if defined(__i386__) || defined(__x86_64__)
2215 	return x86::paddusw(x, y);
2216 #else
2217 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
2218 #endif
2219 }
2220 
SubSat(RValue<UShort4> x,RValue<UShort4> y)2221 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2222 {
2223 	RR_DEBUG_INFO_UPDATE_LOC();
2224 #if defined(__i386__) || defined(__x86_64__)
2225 	return x86::psubusw(x, y);
2226 #else
2227 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
2228 #endif
2229 }
2230 
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2231 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2232 {
2233 	RR_DEBUG_INFO_UPDATE_LOC();
2234 #if defined(__i386__) || defined(__x86_64__)
2235 	return x86::pmulhuw(x, y);
2236 #else
2237 	return As<UShort4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2238 #endif
2239 }
2240 
Average(RValue<UShort4> x,RValue<UShort4> y)2241 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2242 {
2243 	RR_DEBUG_INFO_UPDATE_LOC();
2244 #if defined(__i386__) || defined(__x86_64__)
2245 	return x86::pavgw(x, y);
2246 #else
2247 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
2248 #endif
2249 }
2250 
type()2251 Type *UShort4::type()
2252 {
2253 	return T(Type_v4i16);
2254 }
2255 
operator <<(RValue<Short8> lhs,unsigned char rhs)2256 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2257 {
2258 	RR_DEBUG_INFO_UPDATE_LOC();
2259 #if defined(__i386__) || defined(__x86_64__)
2260 	return x86::psllw(lhs, rhs);
2261 #else
2262 	return As<Short8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2263 #endif
2264 }
2265 
operator >>(RValue<Short8> lhs,unsigned char rhs)2266 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2267 {
2268 	RR_DEBUG_INFO_UPDATE_LOC();
2269 #if defined(__i386__) || defined(__x86_64__)
2270 	return x86::psraw(lhs, rhs);
2271 #else
2272 	return As<Short8>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2273 #endif
2274 }
2275 
MulAdd(RValue<Short8> x,RValue<Short8> y)2276 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2277 {
2278 	RR_DEBUG_INFO_UPDATE_LOC();
2279 #if defined(__i386__) || defined(__x86_64__)
2280 	return x86::pmaddwd(x, y);
2281 #else
2282 	return As<Int4>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2283 #endif
2284 }
2285 
MulHigh(RValue<Short8> x,RValue<Short8> y)2286 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2287 {
2288 	RR_DEBUG_INFO_UPDATE_LOC();
2289 #if defined(__i386__) || defined(__x86_64__)
2290 	return x86::pmulhw(x, y);
2291 #else
2292 	return As<Short8>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2293 #endif
2294 }
2295 
type()2296 Type *Short8::type()
2297 {
2298 	return T(llvm::VectorType::get(T(Short::type()), 8, false));
2299 }
2300 
operator <<(RValue<UShort8> lhs,unsigned char rhs)2301 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2302 {
2303 	RR_DEBUG_INFO_UPDATE_LOC();
2304 #if defined(__i386__) || defined(__x86_64__)
2305 	return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2306 #else
2307 	return As<UShort8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2308 #endif
2309 }
2310 
operator >>(RValue<UShort8> lhs,unsigned char rhs)2311 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2312 {
2313 	RR_DEBUG_INFO_UPDATE_LOC();
2314 #if defined(__i386__) || defined(__x86_64__)
2315 	return x86::psrlw(lhs, rhs);  // FIXME: Fallback required
2316 #else
2317 	return As<UShort8>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2318 #endif
2319 }
2320 
MulHigh(RValue<UShort8> x,RValue<UShort8> y)2321 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2322 {
2323 	RR_DEBUG_INFO_UPDATE_LOC();
2324 #if defined(__i386__) || defined(__x86_64__)
2325 	return x86::pmulhuw(x, y);
2326 #else
2327 	return As<UShort8>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2328 #endif
2329 }
2330 
type()2331 Type *UShort8::type()
2332 {
2333 	return T(llvm::VectorType::get(T(UShort::type()), 8, false));
2334 }
2335 
operator ++(Int & val,int)2336 RValue<Int> operator++(Int &val, int)  // Post-increment
2337 {
2338 	RR_DEBUG_INFO_UPDATE_LOC();
2339 	RValue<Int> res = val;
2340 
2341 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2342 	val.storeValue(inc);
2343 
2344 	return res;
2345 }
2346 
operator ++(Int & val)2347 const Int &operator++(Int &val)  // Pre-increment
2348 {
2349 	RR_DEBUG_INFO_UPDATE_LOC();
2350 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2351 	val.storeValue(inc);
2352 
2353 	return val;
2354 }
2355 
operator --(Int & val,int)2356 RValue<Int> operator--(Int &val, int)  // Post-decrement
2357 {
2358 	RR_DEBUG_INFO_UPDATE_LOC();
2359 	RValue<Int> res = val;
2360 
2361 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2362 	val.storeValue(inc);
2363 
2364 	return res;
2365 }
2366 
operator --(Int & val)2367 const Int &operator--(Int &val)  // Pre-decrement
2368 {
2369 	RR_DEBUG_INFO_UPDATE_LOC();
2370 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2371 	val.storeValue(inc);
2372 
2373 	return val;
2374 }
2375 
RoundInt(RValue<Float> cast)2376 RValue<Int> RoundInt(RValue<Float> cast)
2377 {
2378 	RR_DEBUG_INFO_UPDATE_LOC();
2379 #if defined(__i386__) || defined(__x86_64__)
2380 	return x86::cvtss2si(cast);
2381 #else
2382 	return RValue<Int>(V(lowerRoundInt(V(cast.value()), T(Int::type()))));
2383 #endif
2384 }
2385 
type()2386 Type *Int::type()
2387 {
2388 	return T(llvm::Type::getInt32Ty(*jit->context));
2389 }
2390 
type()2391 Type *Long::type()
2392 {
2393 	return T(llvm::Type::getInt64Ty(*jit->context));
2394 }
2395 
UInt(RValue<Float> cast)2396 UInt::UInt(RValue<Float> cast)
2397 {
2398 	RR_DEBUG_INFO_UPDATE_LOC();
2399 	Value *integer = Nucleus::createFPToUI(cast.value(), UInt::type());
2400 	storeValue(integer);
2401 }
2402 
operator ++(UInt & val,int)2403 RValue<UInt> operator++(UInt &val, int)  // Post-increment
2404 {
2405 	RR_DEBUG_INFO_UPDATE_LOC();
2406 	RValue<UInt> res = val;
2407 
2408 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2409 	val.storeValue(inc);
2410 
2411 	return res;
2412 }
2413 
operator ++(UInt & val)2414 const UInt &operator++(UInt &val)  // Pre-increment
2415 {
2416 	RR_DEBUG_INFO_UPDATE_LOC();
2417 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2418 	val.storeValue(inc);
2419 
2420 	return val;
2421 }
2422 
operator --(UInt & val,int)2423 RValue<UInt> operator--(UInt &val, int)  // Post-decrement
2424 {
2425 	RR_DEBUG_INFO_UPDATE_LOC();
2426 	RValue<UInt> res = val;
2427 
2428 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2429 	val.storeValue(inc);
2430 
2431 	return res;
2432 }
2433 
operator --(UInt & val)2434 const UInt &operator--(UInt &val)  // Pre-decrement
2435 {
2436 	RR_DEBUG_INFO_UPDATE_LOC();
2437 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2438 	val.storeValue(inc);
2439 
2440 	return val;
2441 }
2442 
2443 //	RValue<UInt> RoundUInt(RValue<Float> cast)
2444 //	{
2445 //#if defined(__i386__) || defined(__x86_64__)
2446 //		return x86::cvtss2si(val);   // FIXME: Unsigned
2447 //#else
2448 //		return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2449 //#endif
2450 //	}
2451 
type()2452 Type *UInt::type()
2453 {
2454 	return T(llvm::Type::getInt32Ty(*jit->context));
2455 }
2456 
2457 //	Int2::Int2(RValue<Int> cast)
2458 //	{
2459 //		Value *extend = Nucleus::createZExt(cast.value(), Long::type());
2460 //		Value *vector = Nucleus::createBitCast(extend, Int2::type());
2461 //
2462 //		int shuffle[2] = {0, 0};
2463 //		Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2464 //
2465 //		storeValue(replicate);
2466 //	}
2467 
operator <<(RValue<Int2> lhs,unsigned char rhs)2468 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2469 {
2470 	RR_DEBUG_INFO_UPDATE_LOC();
2471 #if defined(__i386__) || defined(__x86_64__)
2472 	//	return RValue<Int2>(Nucleus::createShl(lhs.value(), rhs.value()));
2473 
2474 	return x86::pslld(lhs, rhs);
2475 #else
2476 	return As<Int2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2477 #endif
2478 }
2479 
operator >>(RValue<Int2> lhs,unsigned char rhs)2480 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2481 {
2482 	RR_DEBUG_INFO_UPDATE_LOC();
2483 #if defined(__i386__) || defined(__x86_64__)
2484 	//	return RValue<Int2>(Nucleus::createAShr(lhs.value(), rhs.value()));
2485 
2486 	return x86::psrad(lhs, rhs);
2487 #else
2488 	return As<Int2>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2489 #endif
2490 }
2491 
type()2492 Type *Int2::type()
2493 {
2494 	return T(Type_v2i32);
2495 }
2496 
operator <<(RValue<UInt2> lhs,unsigned char rhs)2497 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2498 {
2499 	RR_DEBUG_INFO_UPDATE_LOC();
2500 #if defined(__i386__) || defined(__x86_64__)
2501 	//	return RValue<UInt2>(Nucleus::createShl(lhs.value(), rhs.value()));
2502 
2503 	return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2504 #else
2505 	return As<UInt2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2506 #endif
2507 }
2508 
operator >>(RValue<UInt2> lhs,unsigned char rhs)2509 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2510 {
2511 	RR_DEBUG_INFO_UPDATE_LOC();
2512 #if defined(__i386__) || defined(__x86_64__)
2513 	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value(), rhs.value()));
2514 
2515 	return x86::psrld(lhs, rhs);
2516 #else
2517 	return As<UInt2>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2518 #endif
2519 }
2520 
type()2521 Type *UInt2::type()
2522 {
2523 	return T(Type_v2i32);
2524 }
2525 
Int4(RValue<Byte4> cast)2526 Int4::Int4(RValue<Byte4> cast)
2527     : XYZW(this)
2528 {
2529 	RR_DEBUG_INFO_UPDATE_LOC();
2530 	std::vector<int> swizzle = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
2531 	Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2532 	Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::type()), swizzle);
2533 
2534 	std::vector<int> swizzle2 = { 0, 8, 1, 9, 2, 10, 3, 11 };
2535 	Value *c = Nucleus::createBitCast(b, Short8::type());
2536 	Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::type()), swizzle2);
2537 
2538 	*this = As<Int4>(d);
2539 }
2540 
Int4(RValue<SByte4> cast)2541 Int4::Int4(RValue<SByte4> cast)
2542     : XYZW(this)
2543 {
2544 	RR_DEBUG_INFO_UPDATE_LOC();
2545 	std::vector<int> swizzle = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
2546 	Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2547 	Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2548 
2549 	std::vector<int> swizzle2 = { 0, 0, 1, 1, 2, 2, 3, 3 };
2550 	Value *c = Nucleus::createBitCast(b, Short8::type());
2551 	Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2552 
2553 	*this = As<Int4>(d) >> 24;
2554 }
2555 
Int4(RValue<Short4> cast)2556 Int4::Int4(RValue<Short4> cast)
2557     : XYZW(this)
2558 {
2559 	RR_DEBUG_INFO_UPDATE_LOC();
2560 	std::vector<int> swizzle = { 0, 0, 1, 1, 2, 2, 3, 3 };
2561 	Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
2562 	*this = As<Int4>(c) >> 16;
2563 }
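// The constructor above performs a shuffle-based sign extension: each 16-bit
// lane is duplicated into both halves of its 32-bit destination lane, and the
// arithmetic shift by 16 keeps the high copy while smearing its sign bit across
// the upper bits. The SByte4 constructor above uses the same idea with a shift
// by 24.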
2564 
Int4(RValue<UShort4> cast)2565 Int4::Int4(RValue<UShort4> cast)
2566     : XYZW(this)
2567 {
2568 	RR_DEBUG_INFO_UPDATE_LOC();
2569 	std::vector<int> swizzle = { 0, 8, 1, 9, 2, 10, 3, 11 };
2570 	Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2571 	*this = As<Int4>(c);
2572 }
2573 
Int4(RValue<Int> rhs)2574 Int4::Int4(RValue<Int> rhs)
2575     : XYZW(this)
2576 {
2577 	RR_DEBUG_INFO_UPDATE_LOC();
2578 	Value *vector = loadValue();
2579 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2580 
2581 	std::vector<int> swizzle = { 0, 0, 0, 0 };
2582 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2583 
2584 	storeValue(replicate);
2585 }
2586 
operator <<(RValue<Int4> lhs,unsigned char rhs)2587 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2588 {
2589 	RR_DEBUG_INFO_UPDATE_LOC();
2590 #if defined(__i386__) || defined(__x86_64__)
2591 	return x86::pslld(lhs, rhs);
2592 #else
2593 	return As<Int4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2594 #endif
2595 }
2596 
operator >>(RValue<Int4> lhs,unsigned char rhs)2597 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2598 {
2599 	RR_DEBUG_INFO_UPDATE_LOC();
2600 #if defined(__i386__) || defined(__x86_64__)
2601 	return x86::psrad(lhs, rhs);
2602 #else
2603 	return As<Int4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2604 #endif
2605 }
2606 
CmpEQ(RValue<Int4> x,RValue<Int4> y)2607 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2608 {
2609 	RR_DEBUG_INFO_UPDATE_LOC();
2610 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2611 }
2612 
CmpLT(RValue<Int4> x,RValue<Int4> y)2613 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2614 {
2615 	RR_DEBUG_INFO_UPDATE_LOC();
2616 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), Int4::type()));
2617 }
2618 
CmpLE(RValue<Int4> x,RValue<Int4> y)2619 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2620 {
2621 	RR_DEBUG_INFO_UPDATE_LOC();
2622 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), Int4::type()));
2623 }
2624 
CmpNEQ(RValue<Int4> x,RValue<Int4> y)2625 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2626 {
2627 	RR_DEBUG_INFO_UPDATE_LOC();
2628 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2629 }
2630 
CmpNLT(RValue<Int4> x,RValue<Int4> y)2631 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2632 {
2633 	RR_DEBUG_INFO_UPDATE_LOC();
2634 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), Int4::type()));
2635 }
2636 
CmpNLE(RValue<Int4> x,RValue<Int4> y)2637 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2638 {
2639 	RR_DEBUG_INFO_UPDATE_LOC();
2640 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), Int4::type()));
2641 }
2642 
Abs(RValue<Int4> x)2643 RValue<Int4> Abs(RValue<Int4> x)
2644 {
2645 #if LLVM_VERSION_MAJOR >= 12
2646 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::abs, { V(x.value())->getType() });
2647 	return RValue<Int4>(V(jit->builder->CreateCall(func, { V(x.value()), llvm::ConstantInt::getFalse(*jit->context) })));
2648 #else
2649 	auto negative = x >> 31;
2650 	return (x ^ negative) - negative;
2651 #endif
2652 }
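// The pre-LLVM-12 fallback is the standard branch-free abs: 'negative' is 0 for
// non-negative lanes and all-ones (-1) for negative lanes, so (x ^ negative) -
// negative leaves non-negative lanes untouched and computes ~x + 1 == -x for
// negative ones. Example: x = -5 gives negative = -1, (x ^ -1) = 4, 4 - (-1) = 5.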
2653 
Max(RValue<Int4> x,RValue<Int4> y)2654 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2655 {
2656 	RR_DEBUG_INFO_UPDATE_LOC();
2657 #if defined(__i386__) || defined(__x86_64__)
2658 	if(CPUID::supportsSSE4_1())
2659 	{
2660 		return x86::pmaxsd(x, y);
2661 	}
2662 	else
2663 #endif
2664 	{
2665 		RValue<Int4> greater = CmpNLE(x, y);
2666 		return (x & greater) | (y & ~greater);
2667 	}
2668 }
2669 
Min(RValue<Int4> x,RValue<Int4> y)2670 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2671 {
2672 	RR_DEBUG_INFO_UPDATE_LOC();
2673 #if defined(__i386__) || defined(__x86_64__)
2674 	if(CPUID::supportsSSE4_1())
2675 	{
2676 		return x86::pminsd(x, y);
2677 	}
2678 	else
2679 #endif
2680 	{
2681 		RValue<Int4> less = CmpLT(x, y);
2682 		return (x & less) | (y & ~less);
2683 	}
2684 }
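// The non-SSE4.1 paths above use a branch-free blend: the comparison yields an
// all-ones mask in lanes where the first operand wins and zero elsewhere, and
// (x & mask) | (y & ~mask) then picks x or y per lane accordingly.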
2685 
RoundInt(RValue<Float4> cast)2686 RValue<Int4> RoundInt(RValue<Float4> cast)
2687 {
2688 	RR_DEBUG_INFO_UPDATE_LOC();
2689 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
2690 	return x86::cvtps2dq(cast);
2691 #else
2692 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2693 #endif
2694 }
2695 
RoundIntClamped(RValue<Float4> cast)2696 RValue<Int4> RoundIntClamped(RValue<Float4> cast)
2697 {
2698 	RR_DEBUG_INFO_UPDATE_LOC();
2699 
2700 // TODO(b/165000222): Check if fptosi_sat produces optimal code for x86 and ARM.
2701 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
2702 	// cvtps2dq produces 0x80000000, a negative value, for input larger than
2703 	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
2704 	// saturate to 0x80000000.
2705 	return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
2706 #elif defined(__arm__) || defined(__aarch64__)
2707 	// ARM saturates to the largest positive or negative integer. Unit tests
2708 	// verify that lowerRoundInt() behaves as desired.
2709 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2710 #elif LLVM_VERSION_MAJOR >= 14
2711 	llvm::Value *rounded = lowerRound(V(cast.value()));
2712 	llvm::Function *fptosi_sat = llvm::Intrinsic::getDeclaration(
2713 	    jit->module.get(), llvm::Intrinsic::fptosi_sat, { T(Int4::type()), T(Float4::type()) });
2714 	return RValue<Int4>(V(jit->builder->CreateCall(fptosi_sat, { rounded })));
2715 #else
2716 	RValue<Float4> clamped = Max(Min(cast, Float4(0x7FFFFF80)), Float4(static_cast<int>(0x80000000)));
2717 	return As<Int4>(V(lowerRoundInt(V(clamped.value()), T(Int4::type()))));
2718 #endif
2719 }
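// 0x7FFFFF80 (2147483520.0f) is the clamp bound because it is the largest
// single-precision value below 2^31: floats in [2^30, 2^31) are spaced 128
// apart, so 2147483647 itself is not representable and would round up to 2^31,
// which cvtps2dq in turn converts to 0x80000000.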
2720 
MulHigh(RValue<Int4> x,RValue<Int4> y)2721 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2722 {
2723 	RR_DEBUG_INFO_UPDATE_LOC();
2724 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2725 	return As<Int4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2726 }
2727 
MulHigh(RValue<UInt4> x,RValue<UInt4> y)2728 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2729 {
2730 	RR_DEBUG_INFO_UPDATE_LOC();
2731 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2732 	return As<UInt4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2733 }
2734 
PackSigned(RValue<Int4> x,RValue<Int4> y)2735 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2736 {
2737 	RR_DEBUG_INFO_UPDATE_LOC();
2738 #if defined(__i386__) || defined(__x86_64__)
2739 	return x86::packssdw(x, y);
2740 #else
2741 	return As<Short8>(V(lowerPack(V(x.value()), V(y.value()), true)));
2742 #endif
2743 }
2744 
PackUnsigned(RValue<Int4> x,RValue<Int4> y)2745 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2746 {
2747 	RR_DEBUG_INFO_UPDATE_LOC();
2748 #if defined(__i386__) || defined(__x86_64__)
2749 	return x86::packusdw(x, y);
2750 #else
2751 	return As<UShort8>(V(lowerPack(V(x.value()), V(y.value()), false)));
2752 #endif
2753 }
2754 
SignMask(RValue<Int4> x)2755 RValue<Int> SignMask(RValue<Int4> x)
2756 {
2757 	RR_DEBUG_INFO_UPDATE_LOC();
2758 #if defined(__i386__) || defined(__x86_64__)
2759 	return x86::movmskps(As<Float4>(x));
2760 #else
2761 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
2762 #endif
2763 }
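// SignMask() packs the sign bit of each lane into the low bits of a scalar,
// e.g. lanes <-1, 0, -1, 0> produce 0b0101 (bit i is set when lane i is
// negative).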
2764 
type()2765 Type *Int4::type()
2766 {
2767 	return T(llvm::VectorType::get(T(Int::type()), 4, false));
2768 }
2769 
UInt4(RValue<Float4> cast)2770 UInt4::UInt4(RValue<Float4> cast)
2771     : XYZW(this)
2772 {
2773 	RR_DEBUG_INFO_UPDATE_LOC();
2774 	Value *xyzw = Nucleus::createFPToUI(cast.value(), UInt4::type());
2775 	storeValue(xyzw);
2776 }
2777 
UInt4(RValue<UInt> rhs)2778 UInt4::UInt4(RValue<UInt> rhs)
2779     : XYZW(this)
2780 {
2781 	RR_DEBUG_INFO_UPDATE_LOC();
2782 	Value *vector = loadValue();
2783 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2784 
2785 	std::vector<int> swizzle = { 0, 0, 0, 0 };
2786 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2787 
2788 	storeValue(replicate);
2789 }
2790 
operator <<(RValue<UInt4> lhs,unsigned char rhs)2791 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2792 {
2793 	RR_DEBUG_INFO_UPDATE_LOC();
2794 #if defined(__i386__) || defined(__x86_64__)
2795 	return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2796 #else
2797 	return As<UInt4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2798 #endif
2799 }
2800 
operator >>(RValue<UInt4> lhs,unsigned char rhs)2801 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2802 {
2803 	RR_DEBUG_INFO_UPDATE_LOC();
2804 #if defined(__i386__) || defined(__x86_64__)
2805 	return x86::psrld(lhs, rhs);
2806 #else
2807 	return As<UInt4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2808 #endif
2809 }
2810 
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2811 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2812 {
2813 	RR_DEBUG_INFO_UPDATE_LOC();
2814 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2815 }
2816 
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2817 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2818 {
2819 	RR_DEBUG_INFO_UPDATE_LOC();
2820 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), Int4::type()));
2821 }
2822 
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2823 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2824 {
2825 	RR_DEBUG_INFO_UPDATE_LOC();
2826 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), Int4::type()));
2827 }
2828 
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2829 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2830 {
2831 	RR_DEBUG_INFO_UPDATE_LOC();
2832 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2833 }
2834 
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2835 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2836 {
2837 	RR_DEBUG_INFO_UPDATE_LOC();
2838 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), Int4::type()));
2839 }
2840 
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2841 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2842 {
2843 	RR_DEBUG_INFO_UPDATE_LOC();
2844 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), Int4::type()));
2845 }
2846 
Max(RValue<UInt4> x,RValue<UInt4> y)2847 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2848 {
2849 	RR_DEBUG_INFO_UPDATE_LOC();
2850 #if defined(__i386__) || defined(__x86_64__)
2851 	if(CPUID::supportsSSE4_1())
2852 	{
2853 		return x86::pmaxud(x, y);
2854 	}
2855 	else
2856 #endif
2857 	{
2858 		RValue<UInt4> greater = CmpNLE(x, y);
2859 		return (x & greater) | (y & ~greater);
2860 	}
2861 }
2862 
Min(RValue<UInt4> x,RValue<UInt4> y)2863 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2864 {
2865 	RR_DEBUG_INFO_UPDATE_LOC();
2866 #if defined(__i386__) || defined(__x86_64__)
2867 	if(CPUID::supportsSSE4_1())
2868 	{
2869 		return x86::pminud(x, y);
2870 	}
2871 	else
2872 #endif
2873 	{
2874 		RValue<UInt4> less = CmpLT(x, y);
2875 		return (x & less) | (y & ~less);
2876 	}
2877 }
2878 
type()2879 Type *UInt4::type()
2880 {
2881 	return T(llvm::VectorType::get(T(UInt::type()), 4, false));
2882 }
2883 
type()2884 Type *Half::type()
2885 {
2886 	return T(llvm::Type::getInt16Ty(*jit->context));
2887 }
2888 
HasRcpApprox()2889 bool HasRcpApprox()
2890 {
2891 #if defined(__i386__) || defined(__x86_64__)
2892 	return true;
2893 #else
2894 	return false;
2895 #endif
2896 }
2897 
RcpApprox(RValue<Float4> x,bool exactAtPow2)2898 RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
2899 {
2900 #if defined(__i386__) || defined(__x86_64__)
2901 	if(exactAtPow2)
2902 	{
2903 		// rcpps uses a piecewise-linear approximation which minimizes the relative error
2904 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2905 		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2906 	}
2907 	return x86::rcpps(x);
2908 #else
2909 	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2910 	return { 0.0f };
2911 #endif
2912 }
2913 
RcpApprox(RValue<Float> x,bool exactAtPow2)2914 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
2915 {
2916 #if defined(__i386__) || defined(__x86_64__)
2917 	if(exactAtPow2)
2918 	{
2919 		// rcpss uses a piecewise-linear approximation which minimizes the relative error
2920 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2921 		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2922 	}
2923 	return x86::rcpss(x);
2924 #else
2925 	UNREACHABLE("RValue<Float> RcpApprox() not available on this platform");
2926 	return { 0.0f };
2927 #endif
2928 }
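// The correction above assumes (as the comments note) that rcpss/rcpps apply
// the same relative error to every power-of-two input, since the approximation
// depends only on the mantissa; dividing that error out via 1 / rcp(1.0)
// therefore makes power-of-two inputs exact, at the cost of slightly shifting
// the error for other inputs.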
2929 
HasRcpSqrtApprox()2930 bool HasRcpSqrtApprox()
2931 {
2932 #if defined(__i386__) || defined(__x86_64__)
2933 	return true;
2934 #else
2935 	return false;
2936 #endif
2937 }
2938 
RcpSqrtApprox(RValue<Float4> x)2939 RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
2940 {
2941 #if defined(__i386__) || defined(__x86_64__)
2942 	return x86::rsqrtps(x);
2943 #else
2944 	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
2945 	return { 0.0f };
2946 #endif
2947 }
2948 
RcpSqrtApprox(RValue<Float> x)2949 RValue<Float> RcpSqrtApprox(RValue<Float> x)
2950 {
2951 #if defined(__i386__) || defined(__x86_64__)
2952 	return x86::rsqrtss(x);
2953 #else
2954 	UNREACHABLE("RValue<Float> RcpSqrtApprox() not available on this platform");
2955 	return { 0.0f };
2956 #endif
2957 }
2958 
Sqrt(RValue<Float> x)2959 RValue<Float> Sqrt(RValue<Float> x)
2960 {
2961 	RR_DEBUG_INFO_UPDATE_LOC();
2962 #if defined(__i386__) || defined(__x86_64__)
2963 	return x86::sqrtss(x);
2964 #else
2965 	return As<Float>(V(lowerSQRT(V(x.value()))));
2966 #endif
2967 }
2968 
Round(RValue<Float> x)2969 RValue<Float> Round(RValue<Float> x)
2970 {
2971 	RR_DEBUG_INFO_UPDATE_LOC();
2972 #if defined(__i386__) || defined(__x86_64__)
2973 	if(CPUID::supportsSSE4_1())
2974 	{
2975 		return x86::roundss(x, 0);
2976 	}
2977 	else
2978 	{
2979 		return Float4(Round(Float4(x))).x;
2980 	}
2981 #else
2982 	return RValue<Float>(V(lowerRound(V(x.value()))));
2983 #endif
2984 }
2985 
Trunc(RValue<Float> x)2986 RValue<Float> Trunc(RValue<Float> x)
2987 {
2988 	RR_DEBUG_INFO_UPDATE_LOC();
2989 #if defined(__i386__) || defined(__x86_64__)
2990 	if(CPUID::supportsSSE4_1())
2991 	{
2992 		return x86::roundss(x, 3);
2993 	}
2994 	else
2995 	{
2996 		return Float(Int(x));  // Rounded toward zero
2997 	}
2998 #else
2999 	return RValue<Float>(V(lowerTrunc(V(x.value()))));
3000 #endif
3001 }
3002 
Frac(RValue<Float> x)3003 RValue<Float> Frac(RValue<Float> x)
3004 {
3005 	RR_DEBUG_INFO_UPDATE_LOC();
3006 #if defined(__i386__) || defined(__x86_64__)
3007 	if(CPUID::supportsSSE4_1())
3008 	{
3009 		return x - x86::floorss(x);
3010 	}
3011 	else
3012 	{
3013 		return Float4(Frac(Float4(x))).x;
3014 	}
3015 #else
3016 	// x - floor(x) can be 1.0 for very small negative x.
3017 	// Clamp against the value just below 1.0.
3018 	return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
3019 #endif
3020 }
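
// Note on the clamp constant above: 0x3F7FFFFF is the bit pattern of the
// largest float strictly less than 1.0 (0.99999994f, i.e. 1.0f - 2^-24).
// Clamping against it guarantees Frac() never returns exactly 1.0, e.g. for
// x = -1.0e-8f, where x - Floor(x) rounds up to 1.0f.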
3021 
Floor(RValue<Float> x)3022 RValue<Float> Floor(RValue<Float> x)
3023 {
3024 	RR_DEBUG_INFO_UPDATE_LOC();
3025 #if defined(__i386__) || defined(__x86_64__)
3026 	if(CPUID::supportsSSE4_1())
3027 	{
3028 		return x86::floorss(x);
3029 	}
3030 	else
3031 	{
3032 		return Float4(Floor(Float4(x))).x;
3033 	}
3034 #else
3035 	return RValue<Float>(V(lowerFloor(V(x.value()))));
3036 #endif
3037 }
3038 
Ceil(RValue<Float> x)3039 RValue<Float> Ceil(RValue<Float> x)
3040 {
3041 	RR_DEBUG_INFO_UPDATE_LOC();
3042 #if defined(__i386__) || defined(__x86_64__)
3043 	if(CPUID::supportsSSE4_1())
3044 	{
3045 		return x86::ceilss(x);
3046 	}
3047 	else
3048 #endif
3049 	{
3050 		return Float4(Ceil(Float4(x))).x;
3051 	}
3052 }
3053 
type()3054 Type *Float::type()
3055 {
3056 	return T(llvm::Type::getFloatTy(*jit->context));
3057 }
3058 
type()3059 Type *Float2::type()
3060 {
3061 	return T(Type_v2f32);
3062 }
3063 
Float4(RValue<Float> rhs)3064 Float4::Float4(RValue<Float> rhs)
3065     : XYZW(this)
3066 {
3067 	RR_DEBUG_INFO_UPDATE_LOC();
3068 	Value *vector = loadValue();
3069 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
3070 
3071 	std::vector<int> swizzle = { 0, 0, 0, 0 };
3072 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
3073 
3074 	storeValue(replicate);
3075 }
3076 
MulAdd(RValue<Float4> x,RValue<Float4> y,RValue<Float4> z)3077 RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3078 {
3079 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(Float4::type()) });
3080 	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3081 }
3082 
FMA(RValue<Float4> x,RValue<Float4> y,RValue<Float4> z)3083 RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3084 {
3085 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fma, { T(Float4::type()) });
3086 	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3087 }
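
// The two intrinsics above differ only in their rounding guarantees:
// llvm.fmuladd lets the backend emit either a fused multiply-add or a separate
// multiply and add, whichever is cheaper on the target, while llvm.fma always
// requires the fused, single-rounding result. MulAdd() is therefore the
// cheaper choice when the extra intermediate rounding is acceptable; FMA() is
// for code that depends on the exact fused result.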
3088 
Abs(RValue<Float4> x)3089 RValue<Float4> Abs(RValue<Float4> x)
3090 {
3091 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
3092 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(x.value()))));
3093 }
3094 
Max(RValue<Float4> x,RValue<Float4> y)3095 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3096 {
3097 	RR_DEBUG_INFO_UPDATE_LOC();
3098 #if defined(__i386__) || defined(__x86_64__)
3099 	return x86::maxps(x, y);
3100 #else
3101 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
3102 #endif
3103 }
3104 
Min(RValue<Float4> x,RValue<Float4> y)3105 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3106 {
3107 	RR_DEBUG_INFO_UPDATE_LOC();
3108 #if defined(__i386__) || defined(__x86_64__)
3109 	return x86::minps(x, y);
3110 #else
3111 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
3112 #endif
3113 }
3114 
Sqrt(RValue<Float4> x)3115 RValue<Float4> Sqrt(RValue<Float4> x)
3116 {
3117 	RR_DEBUG_INFO_UPDATE_LOC();
3118 #if defined(__i386__) || defined(__x86_64__)
3119 	return x86::sqrtps(x);
3120 #else
3121 	return As<Float4>(V(lowerSQRT(V(x.value()))));
3122 #endif
3123 }
3124 
SignMask(RValue<Float4> x)3125 RValue<Int> SignMask(RValue<Float4> x)
3126 {
3127 	RR_DEBUG_INFO_UPDATE_LOC();
3128 #if defined(__i386__) || defined(__x86_64__)
3129 	return x86::movmskps(x);
3130 #else
3131 	return As<Int>(V(lowerFPSignMask(V(x.value()), T(Int::type()))));
3132 #endif
3133 }
3134 
CmpEQ(RValue<Float4> x,RValue<Float4> y)3135 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3136 {
3137 	RR_DEBUG_INFO_UPDATE_LOC();
3138 	//	return As<Int4>(x86::cmpeqps(x, y));
3139 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), Int4::type()));
3140 }
3141 
CmpLT(RValue<Float4> x,RValue<Float4> y)3142 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3143 {
3144 	RR_DEBUG_INFO_UPDATE_LOC();
3145 	//	return As<Int4>(x86::cmpltps(x, y));
3146 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), Int4::type()));
3147 }
3148 
CmpLE(RValue<Float4> x,RValue<Float4> y)3149 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3150 {
3151 	RR_DEBUG_INFO_UPDATE_LOC();
3152 	//	return As<Int4>(x86::cmpleps(x, y));
3153 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), Int4::type()));
3154 }
3155 
CmpNEQ(RValue<Float4> x,RValue<Float4> y)3156 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3157 {
3158 	RR_DEBUG_INFO_UPDATE_LOC();
3159 	//	return As<Int4>(x86::cmpneqps(x, y));
3160 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), Int4::type()));
3161 }
3162 
CmpNLT(RValue<Float4> x,RValue<Float4> y)3163 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3164 {
3165 	RR_DEBUG_INFO_UPDATE_LOC();
3166 	//	return As<Int4>(x86::cmpnltps(x, y));
3167 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), Int4::type()));
3168 }
3169 
CmpNLE(RValue<Float4> x,RValue<Float4> y)3170 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3171 {
3172 	RR_DEBUG_INFO_UPDATE_LOC();
3173 	//	return As<Int4>(x86::cmpnleps(x, y));
3174 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), Int4::type()));
3175 }
3176 
CmpUEQ(RValue<Float4> x,RValue<Float4> y)3177 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3178 {
3179 	RR_DEBUG_INFO_UPDATE_LOC();
3180 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), Int4::type()));
3181 }
3182 
CmpULT(RValue<Float4> x,RValue<Float4> y)3183 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3184 {
3185 	RR_DEBUG_INFO_UPDATE_LOC();
3186 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), Int4::type()));
3187 }
3188 
CmpULE(RValue<Float4> x,RValue<Float4> y)3189 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3190 {
3191 	RR_DEBUG_INFO_UPDATE_LOC();
3192 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), Int4::type()));
3193 }
3194 
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)3195 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3196 {
3197 	RR_DEBUG_INFO_UPDATE_LOC();
3198 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), Int4::type()));
3199 }
3200 
CmpUNLT(RValue<Float4> x,RValue<Float4> y)3201 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3202 {
3203 	RR_DEBUG_INFO_UPDATE_LOC();
3204 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), Int4::type()));
3205 }
3206 
CmpUNLE(RValue<Float4> x,RValue<Float4> y)3207 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3208 {
3209 	RR_DEBUG_INFO_UPDATE_LOC();
3210 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), Int4::type()));
3211 }
3212 
Round(RValue<Float4> x)3213 RValue<Float4> Round(RValue<Float4> x)
3214 {
3215 	RR_DEBUG_INFO_UPDATE_LOC();
3216 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3217 	if(CPUID::supportsSSE4_1())
3218 	{
3219 		return x86::roundps(x, 0);
3220 	}
3221 	else
3222 	{
3223 		return Float4(RoundInt(x));
3224 	}
3225 #else
3226 	return RValue<Float4>(V(lowerRound(V(x.value()))));
3227 #endif
3228 }
3229 
Trunc(RValue<Float4> x)3230 RValue<Float4> Trunc(RValue<Float4> x)
3231 {
3232 	RR_DEBUG_INFO_UPDATE_LOC();
3233 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3234 	if(CPUID::supportsSSE4_1())
3235 	{
3236 		return x86::roundps(x, 3);
3237 	}
3238 	else
3239 	{
3240 		return Float4(Int4(x));
3241 	}
3242 #else
3243 	return RValue<Float4>(V(lowerTrunc(V(x.value()))));
3244 #endif
3245 }
3246 
Frac(RValue<Float4> x)3247 RValue<Float4> Frac(RValue<Float4> x)
3248 {
3249 	RR_DEBUG_INFO_UPDATE_LOC();
3250 	Float4 frc;
3251 
3252 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3253 	if(CPUID::supportsSSE4_1())
3254 	{
3255 		frc = x - x86::floorps(x);
3256 	}
3257 	else
3258 	{
3259 		frc = x - Float4(Int4(x));  // Signed fractional part.
3260 
3261 		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));  // Add 1.0 if negative.
3262 	}
3263 #else
3264 	frc = x - Floor(x);
3265 #endif
3266 
3267 	// x - floor(x) can be 1.0 for very small negative x.
3268 	// Clamp against the value just below 1.0.
3269 	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3270 }
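
// Illustrative trace of the SSE2 fallback above for one lane, x = -1.25f:
//
//    Int4(x)             = -1     (conversion truncates toward zero)
//    frc = x - (-1.0f)   = -0.25f
//    CmpNLE(0.0f, frc)   = ~0     (0.0f > frc, i.e. frc is negative)
//    frc += 1.0f         =  0.75f
//
// so negative inputs still yield a fractional part in [0, 1).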
3271 
Floor(RValue<Float4> x)3272 RValue<Float4> Floor(RValue<Float4> x)
3273 {
3274 	RR_DEBUG_INFO_UPDATE_LOC();
3275 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3276 	if(CPUID::supportsSSE4_1())
3277 	{
3278 		return x86::floorps(x);
3279 	}
3280 	else
3281 	{
3282 		return x - Frac(x);
3283 	}
3284 #else
3285 	return RValue<Float4>(V(lowerFloor(V(x.value()))));
3286 #endif
3287 }
3288 
Ceil(RValue<Float4> x)3289 RValue<Float4> Ceil(RValue<Float4> x)
3290 {
3291 	RR_DEBUG_INFO_UPDATE_LOC();
3292 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3293 	if(CPUID::supportsSSE4_1())
3294 	{
3295 		return x86::ceilps(x);
3296 	}
3297 	else
3298 #endif
3299 	{
3300 		return -Floor(-x);
3301 	}
3302 }
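
// The fallback above uses the identity ceil(x) == -floor(-x); for example
// Ceil(2.3f) = -Floor(-2.3f) = -(-3.0f) = 3.0f, and Ceil(-2.3f) = -Floor(2.3f)
// = -2.0f.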
3303 
Ctlz(RValue<UInt> v,bool isZeroUndef)3304 RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
3305 {
3306 	RR_DEBUG_INFO_UPDATE_LOC();
3307 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::type()) });
3308 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3309 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3310 }
3311 
Ctlz(RValue<UInt4> v,bool isZeroUndef)3312 RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
3313 {
3314 	RR_DEBUG_INFO_UPDATE_LOC();
3315 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::type()) });
3316 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3317 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3318 }
3319 
Cttz(RValue<UInt> v,bool isZeroUndef)3320 RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
3321 {
3322 	RR_DEBUG_INFO_UPDATE_LOC();
3323 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::type()) });
3324 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3325 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3326 }
3327 
Cttz(RValue<UInt4> v,bool isZeroUndef)3328 RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
3329 {
3330 	RR_DEBUG_INFO_UPDATE_LOC();
3331 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::type()) });
3332 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3333 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3334 }
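
// The boolean passed alongside the value maps to the 'is_zero_poison' operand
// of llvm.ctlz/llvm.cttz: when true, a zero input produces an undefined result
// (allowing a bare BSR/BSF/CLZ on targets where those leave zero unspecified);
// when false, a zero input is defined to return the bit width. For example,
// Ctlz(UInt(1), false) yields 31 and Cttz(UInt(8), false) yields 3.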
3335 
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3336 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3337 {
3338 	return RValue<Int>(Nucleus::createAtomicMin(x.value(), y.value(), memoryOrder));
3339 }
3340 
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3341 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3342 {
3343 	return RValue<UInt>(Nucleus::createAtomicUMin(x.value(), y.value(), memoryOrder));
3344 }
3345 
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3346 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3347 {
3348 	return RValue<Int>(Nucleus::createAtomicMax(x.value(), y.value(), memoryOrder));
3349 }
3350 
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3351 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3352 {
3353 	return RValue<UInt>(Nucleus::createAtomicUMax(x.value(), y.value(), memoryOrder));
3354 }
3355 
type()3356 Type *Float4::type()
3357 {
3358 	return T(llvm::VectorType::get(T(Float::type()), 4, false));
3359 }
3360 
Ticks()3361 RValue<Long> Ticks()
3362 {
3363 	RR_DEBUG_INFO_UPDATE_LOC();
3364 	llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
3365 
3366 	return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
3367 }
3368 
ConstantPointer(const void * ptr)3369 RValue<Pointer<Byte>> ConstantPointer(const void *ptr)
3370 {
3371 	RR_DEBUG_INFO_UPDATE_LOC();
3372 	// Note: this should work for 32-bit pointers as well because 'inttoptr'
3373 	// is defined to truncate (and zero extend) if necessary.
3374 	auto ptrAsInt = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), reinterpret_cast<uintptr_t>(ptr));
3375 	return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::type()))));
3376 }
3377 
ConstantData(const void * data,size_t size)3378 RValue<Pointer<Byte>> ConstantData(const void *data, size_t size)
3379 {
3380 	RR_DEBUG_INFO_UPDATE_LOC();
3381 	auto str = ::std::string(reinterpret_cast<const char *>(data), size);
3382 	auto ptr = jit->builder->CreateGlobalStringPtr(str);
3383 	return RValue<Pointer<Byte>>(V(ptr));
3384 }
3385 
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)3386 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
3387 {
3388 	// If this is a MemorySanitizer build, but Reactor routine instrumentation is not enabled,
3389 	// mark all call arguments as initialized by calling __msan_unpoison_param().
3390 	if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
3391 	{
3392 		// void __msan_unpoison_param(size_t n)
3393 		auto voidTy = llvm::Type::getVoidTy(*jit->context);
3394 		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
3395 		auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy }, false);
3396 		auto func = jit->module->getOrInsertFunction("__msan_unpoison_param", funcTy);
3397 
3398 		jit->builder->CreateCall(func, { llvm::ConstantInt::get(sizetTy, args.size()) });
3399 	}
3400 
3401 	RR_DEBUG_INFO_UPDATE_LOC();
3402 	llvm::SmallVector<llvm::Type *, 8> paramTys;
3403 	for(auto ty : argTys) { paramTys.push_back(T(ty)); }
3404 	auto funcTy = llvm::FunctionType::get(T(retTy), paramTys, false);
3405 
3406 	auto funcPtrTy = funcTy->getPointerTo();
3407 	auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value()), funcPtrTy);
3408 
3409 	llvm::SmallVector<llvm::Value *, 8> arguments;
3410 	for(auto arg : args) { arguments.push_back(V(arg)); }
3411 	return V(jit->builder->CreateCall(funcTy, funcPtr, arguments));
3412 }
3413 
Breakpoint()3414 void Breakpoint()
3415 {
3416 	RR_DEBUG_INFO_UPDATE_LOC();
3417 	llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
3418 
3419 	jit->builder->CreateCall(debugtrap);
3420 }
3421 
3422 }  // namespace rr
3423 
3424 namespace rr {
3425 
3426 #if defined(__i386__) || defined(__x86_64__)
3427 namespace x86 {
3428 
3429 // Differs from IRBuilder<>::CreateUnaryIntrinsic() in that it only accepts native instruction intrinsics which have
3430 // implicit types, such as 'x86_sse_rcp_ps' operating on v4f32, while 'sqrt' requires explicitly specifying the operand type.
createInstruction(llvm::Intrinsic::ID id,Value * x)3431 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x)
3432 {
3433 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3434 
3435 	return V(jit->builder->CreateCall(intrinsic, V(x)));
3436 }
3437 
3438 // Differs from IRBuilder<>::CreateBinaryIntrinsic() in that it only accepts native instruction intrinsics which have
3439 // implicit types, such as 'x86_sse_max_ps' operating on v4f32, while 'sadd_sat' requires explicitly specifying the operand types.
createInstruction(llvm::Intrinsic::ID id,Value * x,Value * y)3440 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x, Value *y)
3441 {
3442 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3443 
3444 	return V(jit->builder->CreateCall(intrinsic, { V(x), V(y) }));
3445 }
3446 
cvtss2si(RValue<Float> val)3447 RValue<Int> cvtss2si(RValue<Float> val)
3448 {
3449 	Float4 vector;
3450 	vector.x = val;
3451 
3452 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_cvtss2si, RValue<Float4>(vector).value()));
3453 }
3454 
cvtps2dq(RValue<Float4> val)3455 RValue<Int4> cvtps2dq(RValue<Float4> val)
3456 {
3457 	ASSERT(!__has_feature(memory_sanitizer));  // TODO(b/172238865): Not correctly instrumented by MemorySanitizer.
3458 
3459 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_cvtps2dq, val.value()));
3460 }
3461 
rcpss(RValue<Float> val)3462 RValue<Float> rcpss(RValue<Float> val)
3463 {
3464 	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);
3465 
3466 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rcp_ss, vector), Float::type(), 0));
3467 }
3468 
sqrtss(RValue<Float> val)3469 RValue<Float> sqrtss(RValue<Float> val)
3470 {
3471 	return RValue<Float>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3472 }
3473 
rsqrtss(RValue<Float> val)3474 RValue<Float> rsqrtss(RValue<Float> val)
3475 {
3476 	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);
3477 
3478 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ss, vector), Float::type(), 0));
3479 }
3480 
rcpps(RValue<Float4> val)3481 RValue<Float4> rcpps(RValue<Float4> val)
3482 {
3483 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rcp_ps, val.value()));
3484 }
3485 
sqrtps(RValue<Float4> val)3486 RValue<Float4> sqrtps(RValue<Float4> val)
3487 {
3488 	return RValue<Float4>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3489 }
3490 
rsqrtps(RValue<Float4> val)3491 RValue<Float4> rsqrtps(RValue<Float4> val)
3492 {
3493 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ps, val.value()));
3494 }
3495 
maxps(RValue<Float4> x,RValue<Float4> y)3496 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3497 {
3498 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_max_ps, x.value(), y.value()));
3499 }
3500 
minps(RValue<Float4> x,RValue<Float4> y)3501 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3502 {
3503 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_min_ps, x.value(), y.value()));
3504 }
3505 
roundss(RValue<Float> val,unsigned char imm)3506 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3507 {
3508 	llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3509 
3510 	Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3511 	Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3512 
3513 	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(roundss, { V(undef), V(vector), V(Nucleus::createConstantInt(imm)) })), Float::type(), 0));
3514 }
3515 
floorss(RValue<Float> val)3516 RValue<Float> floorss(RValue<Float> val)
3517 {
3518 	return roundss(val, 1);
3519 }
3520 
ceilss(RValue<Float> val)3521 RValue<Float> ceilss(RValue<Float> val)
3522 {
3523 	return roundss(val, 2);
3524 }
3525 
roundps(RValue<Float4> val,unsigned char imm)3526 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3527 {
3528 	ASSERT(!__has_feature(memory_sanitizer));  // TODO(b/172238865): Not correctly instrumented by MemorySanitizer.
3529 
3530 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse41_round_ps, val.value(), Nucleus::createConstantInt(imm)));
3531 }
3532 
floorps(RValue<Float4> val)3533 RValue<Float4> floorps(RValue<Float4> val)
3534 {
3535 	return roundps(val, 1);
3536 }
3537 
ceilps(RValue<Float4> val)3538 RValue<Float4> ceilps(RValue<Float4> val)
3539 {
3540 	return roundps(val, 2);
3541 }
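
// The 'imm' values used by roundss/roundps above follow the SSE4.1
// rounding-control encoding: 0 = round to nearest (ties to even), 1 = round
// down (floor), 2 = round up (ceil), 3 = truncate toward zero. Bit 2 of the
// immediate is left clear, so the rounding mode comes from the immediate
// rather than from MXCSR.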
3542 
paddsw(RValue<Short4> x,RValue<Short4> y)3543 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3544 {
3545 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3546 }
3547 
psubsw(RValue<Short4> x,RValue<Short4> y)3548 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3549 {
3550 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3551 }
3552 
paddusw(RValue<UShort4> x,RValue<UShort4> y)3553 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3554 {
3555 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3556 }
3557 
psubusw(RValue<UShort4> x,RValue<UShort4> y)3558 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3559 {
3560 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3561 }
3562 
paddsb(RValue<SByte8> x,RValue<SByte8> y)3563 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3564 {
3565 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3566 }
3567 
psubsb(RValue<SByte8> x,RValue<SByte8> y)3568 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3569 {
3570 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3571 }
3572 
paddusb(RValue<Byte8> x,RValue<Byte8> y)3573 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3574 {
3575 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3576 }
3577 
psubusb(RValue<Byte8> x,RValue<Byte8> y)3578 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3579 {
3580 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3581 }
3582 
pavgw(RValue<UShort4> x,RValue<UShort4> y)3583 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3584 {
3585 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
3586 }
3587 
pmaxsw(RValue<Short4> x,RValue<Short4> y)3588 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3589 {
3590 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3591 }
3592 
pminsw(RValue<Short4> x,RValue<Short4> y)3593 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3594 {
3595 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3596 }
3597 
pcmpgtw(RValue<Short4> x,RValue<Short4> y)3598 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3599 {
3600 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
3601 }
3602 
pcmpeqw(RValue<Short4> x,RValue<Short4> y)3603 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3604 {
3605 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
3606 }
3607 
pcmpgtb(RValue<SByte8> x,RValue<SByte8> y)3608 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3609 {
3610 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
3611 }
3612 
pcmpeqb(RValue<Byte8> x,RValue<Byte8> y)3613 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3614 {
3615 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
3616 }
3617 
packssdw(RValue<Int2> x,RValue<Int2> y)3618 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3619 {
3620 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3621 }
3622 
packssdw(RValue<Int4> x,RValue<Int4> y)3623 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3624 {
3625 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3626 }
3627 
packsswb(RValue<Short4> x,RValue<Short4> y)3628 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3629 {
3630 	return As<SByte8>(createInstruction(llvm::Intrinsic::x86_sse2_packsswb_128, x.value(), y.value()));
3631 }
3632 
packuswb(RValue<Short4> x,RValue<Short4> y)3633 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3634 {
3635 	return As<Byte8>(createInstruction(llvm::Intrinsic::x86_sse2_packuswb_128, x.value(), y.value()));
3636 }
3637 
packusdw(RValue<Int4> x,RValue<Int4> y)3638 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3639 {
3640 	if(CPUID::supportsSSE4_1())
3641 	{
3642 		return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse41_packusdw, x.value(), y.value()));
3643 	}
3644 	else
3645 	{
3646 		RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3647 		RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3648 
3649 		return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3650 	}
3651 }
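
// Sketch of the pre-SSE4.1 fallback above, one lane at a time: negative inputs
// are first clamped to zero by (x & ~(x >> 31)), the result is biased by
// -0x8000 so that the unsigned range [0, 65535] maps onto the signed range
// [-32768, 32767], packed with signed saturation, and then re-biased by
// +0x8000 (a wrapping 16-bit add). For example:
//
//    x =  70000 -> 70000 - 32768 = 37232 -> saturates to 32767 -> +32768 = 0xFFFF
//    x =     -5 -> clamped to 0  -> -32768 -> packs to -32768  -> +32768 = 0x0000
//    x =  40000 -> 40000 - 32768 =  7232  -> packs to   7232   -> +32768 = 40000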
3652 
psrlw(RValue<UShort4> x,unsigned char y)3653 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3654 {
3655 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3656 }
3657 
psrlw(RValue<UShort8> x,unsigned char y)3658 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3659 {
3660 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3661 }
3662 
psraw(RValue<Short4> x,unsigned char y)3663 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3664 {
3665 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3666 }
3667 
psraw(RValue<Short8> x,unsigned char y)3668 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3669 {
3670 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3671 }
3672 
psllw(RValue<Short4> x,unsigned char y)3673 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3674 {
3675 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3676 }
3677 
psllw(RValue<Short8> x,unsigned char y)3678 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3679 {
3680 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3681 }
3682 
pslld(RValue<Int2> x,unsigned char y)3683 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3684 {
3685 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3686 }
3687 
pslld(RValue<Int4> x,unsigned char y)3688 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3689 {
3690 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3691 }
3692 
psrad(RValue<Int2> x,unsigned char y)3693 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3694 {
3695 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3696 }
3697 
psrad(RValue<Int4> x,unsigned char y)3698 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3699 {
3700 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3701 }
3702 
psrld(RValue<UInt2> x,unsigned char y)3703 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3704 {
3705 	return As<UInt2>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3706 }
3707 
psrld(RValue<UInt4> x,unsigned char y)3708 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3709 {
3710 	return RValue<UInt4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3711 }
3712 
pmaxsd(RValue<Int4> x,RValue<Int4> y)3713 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3714 {
3715 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3716 }
3717 
pminsd(RValue<Int4> x,RValue<Int4> y)3718 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3719 {
3720 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3721 }
3722 
pmaxud(RValue<UInt4> x,RValue<UInt4> y)3723 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3724 {
3725 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_UGT)));
3726 }
3727 
pminud(RValue<UInt4> x,RValue<UInt4> y)3728 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3729 {
3730 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_ULT)));
3731 }
3732 
pmulhw(RValue<Short4> x,RValue<Short4> y)3733 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3734 {
3735 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3736 }
3737 
pmulhuw(RValue<UShort4> x,RValue<UShort4> y)3738 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3739 {
3740 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3741 }
3742 
pmaddwd(RValue<Short4> x,RValue<Short4> y)3743 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3744 {
3745 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3746 }
3747 
pmulhw(RValue<Short8> x,RValue<Short8> y)3748 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3749 {
3750 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3751 }
3752 
pmulhuw(RValue<UShort8> x,RValue<UShort8> y)3753 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3754 {
3755 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3756 }
3757 
pmaddwd(RValue<Short8> x,RValue<Short8> y)3758 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
3759 {
3760 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3761 }
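
// pmaddwd multiplies corresponding signed 16-bit lanes and sums each adjacent
// pair of 32-bit products, halving the lane count. For example, with
// x = {1, 2, 3, 4} and y = {5, 6, 7, 8} the result is
// {1*5 + 2*6, 3*7 + 4*8} = {17, 53}.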
3762 
movmskps(RValue<Float4> x)3763 RValue<Int> movmskps(RValue<Float4> x)
3764 {
3765 	Value *v = x.value();
3766 
3767 	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
3768 	// which makes it look at the entire 128-bit input for undefined bits. Mask off
3769 	// just the sign bits to avoid false positives.
3770 	if(__has_feature(memory_sanitizer))
3771 	{
3772 		v = As<Float4>(As<Int4>(v) & Int4(0x80000000u)).value();
3773 	}
3774 
3775 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_movmsk_ps, v));
3776 }
3777 
pmovmskb(RValue<Byte8> x)3778 RValue<Int> pmovmskb(RValue<Byte8> x)
3779 {
3780 	Value *v = x.value();
3781 
3782 	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
3783 	// which makes it look at the entire 128-bit input for undefined bits. Mask off
3784 	// just the sign bits in the lower 64-bit half to avoid false positives.
3785 	if(__has_feature(memory_sanitizer))
3786 	{
3787 		v = As<Byte16>(As<Int4>(v) & Int4(0x80808080u, 0x80808080u, 0, 0)).value();
3788 	}
3789 
3790 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse2_pmovmskb_128, v)) & 0xFF;
3791 }
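
// movmskps packs the sign bit of each of the four floats into bits 0-3 of the
// result, and pmovmskb packs the top bit of each of the sixteen bytes into
// bits 0-15; the '& 0xFF' above keeps only the eight bytes holding the Byte8
// value in the lower half of the register. For example,
// SignMask(Float4(-1.0f, 2.0f, -3.0f, 4.0f)) returns 0b0101 = 5.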
3792 
3793 }  // namespace x86
3794 #endif  // defined(__i386__) || defined(__x86_64__)
3795 
3796 #ifdef ENABLE_RR_PRINT
VPrintf(const std::vector<Value * > & vals)3797 void VPrintf(const std::vector<Value *> &vals)
3798 {
3799 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
3800 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
3801 	auto funcTy = llvm::FunctionType::get(i32Ty, { i8PtrTy }, true);
3802 	auto func = jit->module->getOrInsertFunction("rr::DebugPrintf", funcTy);
3803 	jit->builder->CreateCall(func, V(vals));
3804 }
3805 #endif  // ENABLE_RR_PRINT
3806 
Nop()3807 void Nop()
3808 {
3809 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
3810 	auto funcTy = llvm::FunctionType::get(voidTy, {}, false);
3811 	auto func = jit->module->getOrInsertFunction("nop", funcTy);
3812 	jit->builder->CreateCall(func);
3813 }
3814 
EmitDebugLocation()3815 void EmitDebugLocation()
3816 {
3817 #ifdef ENABLE_RR_DEBUG_INFO
3818 	if(jit->debugInfo != nullptr)
3819 	{
3820 		jit->debugInfo->EmitLocation();
3821 	}
3822 #endif  // ENABLE_RR_DEBUG_INFO
3823 }
3824 
EmitDebugVariable(Value * value)3825 void EmitDebugVariable(Value *value)
3826 {
3827 #ifdef ENABLE_RR_DEBUG_INFO
3828 	if(jit->debugInfo != nullptr)
3829 	{
3830 		jit->debugInfo->EmitVariable(value);
3831 	}
3832 #endif  // ENABLE_RR_DEBUG_INFO
3833 }
3834 
FlushDebug()3835 void FlushDebug()
3836 {
3837 #ifdef ENABLE_RR_DEBUG_INFO
3838 	if(jit->debugInfo != nullptr)
3839 	{
3840 		jit->debugInfo->Flush();
3841 	}
3842 #endif  // ENABLE_RR_DEBUG_INFO
3843 }
3844 
3845 }  // namespace rr
3846 
3847 // ------------------------------  Coroutines ------------------------------
3848 
3849 namespace {
3850 
3851 // Magic values returned by llvm.coro.suspend.
3852 // See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
3853 enum SuspendAction
3854 {
3855 	SuspendActionSuspend = -1,
3856 	SuspendActionResume = 0,
3857 	SuspendActionDestroy = 1
3858 };
3859 
promoteFunctionToCoroutine()3860 void promoteFunctionToCoroutine()
3861 {
3862 	ASSERT(jit->coroutine.id == nullptr);
3863 
3864 	// Types
3865 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
3866 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
3867 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
3868 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
3869 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
3870 	auto promiseTy = jit->coroutine.yieldType;
3871 	auto promisePtrTy = promiseTy->getPointerTo();
3872 
3873 	// LLVM intrinsics
3874 	auto coro_id = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
3875 	auto coro_size = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
3876 	auto coro_begin = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
3877 	auto coro_resume = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_resume);
3878 	auto coro_end = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_end);
3879 	auto coro_free = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_free);
3880 	auto coro_destroy = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_destroy);
3881 	auto coro_promise = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_promise);
3882 	auto coro_done = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_done);
3883 	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
3884 
3885 	auto allocFrameTy = llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
3886 	auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
3887 	auto freeFrameTy = llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
3888 	auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);
3889 
3890 	auto oldInsertionPoint = jit->builder->saveIP();
3891 
3892 	// Build the coroutine_await() function:
3893 	//
3894 	//    bool coroutine_await(CoroutineHandle* handle, YieldType* out)
3895 	//    {
3896 	//        if(llvm.coro.done(handle))
3897 	//        {
3898 	//            return false;
3899 	//        }
3900 	//        else
3901 	//        {
3902 	//            *out = *(YieldType*)llvm.coro.promise(handle);
3903 	//            llvm.coro.resume(handle);
3904 	//            return true;
3905 	//        }
3906 	//    }
3907 	//
3908 	{
3909 		auto args = jit->coroutine.await->arg_begin();
3910 		auto handle = args++;
3911 		auto outPtr = args++;
3912 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "co_await", jit->coroutine.await));
3913 		auto doneBlock = llvm::BasicBlock::Create(*jit->context, "done", jit->coroutine.await);
3914 		auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->coroutine.await);
3915 
3916 		auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
3917 		jit->builder->CreateCondBr(done, doneBlock, resumeBlock);
3918 
3919 		jit->builder->SetInsertPoint(doneBlock);
3920 		jit->builder->CreateRet(llvm::ConstantInt::getFalse(i1Ty));
3921 
3922 		jit->builder->SetInsertPoint(resumeBlock);
3923 		auto promiseAlignment = llvm::ConstantInt::get(i32Ty, 4);  // TODO: Get correct alignment.
3924 		auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, llvm::ConstantInt::get(i1Ty, 0) });
3925 		auto promise = jit->builder->CreateLoad(promiseTy, jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
3926 		jit->builder->CreateStore(promise, outPtr);
3927 		jit->builder->CreateCall(coro_resume, { handle });
3928 		jit->builder->CreateRet(llvm::ConstantInt::getTrue(i1Ty));
3929 	}
3930 
3931 	// Build the coroutine_destroy() function:
3932 	//
3933 	//    void coroutine_destroy(CoroutineHandle* handle)
3934 	//    {
3935 	//        llvm.coro.destroy(handle);
3936 	//    }
3937 	//
3938 	{
3939 		auto handle = jit->coroutine.destroy->arg_begin();
3940 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
3941 		jit->builder->CreateCall(coro_destroy, { handle });
3942 		jit->builder->CreateRetVoid();
3943 	}
3944 
3945 	// Begin building the main coroutine_begin() function.
3946 	//
3947 	//    CoroutineHandle* coroutine_begin(<Arguments>)
3948 	//    {
3949 	//        YieldType promise;
3950 	//        auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
3951 	//        void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
3952 	//        CoroutineHandle *handle = llvm.coro.begin(id, frame);
3953 	//
3954 	//        ... <REACTOR CODE> ...
3955 	//
3956 	//    end:
3957 	//        SuspendAction action = llvm.coro.suspend(none, true /* final */);  // <-- RESUME POINT
3958 	//        switch(action)
3959 	//        {
3960 	//        case SuspendActionResume:
3961 	//            UNREACHABLE(); // Illegal to resume after final suspend.
3962 	//        case SuspendActionDestroy:
3963 	//            goto destroy;
3964 	//        default: // (SuspendActionSuspend)
3965 	//            goto suspend;
3966 	//        }
3967 	//
3968 	//    destroy:
3969 	//        coroutine_free_frame(llvm.coro.free(id, handle));
3970 	//        goto suspend;
3971 	//
3972 	//    suspend:
3973 	//        llvm.coro.end(handle, false);
3974 	//        return handle;
3975 	//    }
3976 	//
3977 
3978 #ifdef ENABLE_RR_DEBUG_INFO
3979 	jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
3980 #endif  // ENABLE_RR_DEBUG_INFO
3981 
3982 	jit->coroutine.suspendBlock = llvm::BasicBlock::Create(*jit->context, "suspend", jit->function);
3983 	jit->coroutine.endBlock = llvm::BasicBlock::Create(*jit->context, "end", jit->function);
3984 	jit->coroutine.destroyBlock = llvm::BasicBlock::Create(*jit->context, "destroy", jit->function);
3985 
3986 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
3987 	jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
3988 	jit->coroutine.id = jit->builder->CreateCall(coro_id, {
3989 	                                                          llvm::ConstantInt::get(i32Ty, 0),
3990 	                                                          jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
3991 	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
3992 	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
3993 	                                                      });
3994 	auto size = jit->builder->CreateCall(coro_size, {});
3995 	auto frame = jit->builder->CreateCall(allocFrame, { size });
3996 	jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });
3997 
3998 	// Build the suspend block
3999 	jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
4000 	jit->builder->CreateCall(coro_end, { jit->coroutine.handle, llvm::ConstantInt::get(i1Ty, 0) });
4001 	jit->builder->CreateRet(jit->coroutine.handle);
4002 
4003 	// Build the end block
4004 	jit->builder->SetInsertPoint(jit->coroutine.endBlock);
4005 	auto action = jit->builder->CreateCall(coro_suspend, {
4006 	                                                         llvm::ConstantTokenNone::get(*jit->context),
4007 	                                                         llvm::ConstantInt::get(i1Ty, 1),  // final: true
4008 	                                                     });
4009 	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4010 	// switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
4011 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4012 
4013 	// Build the destroy block
4014 	jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
4015 	auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
4016 	jit->builder->CreateCall(freeFrame, { memory });
4017 	jit->builder->CreateBr(jit->coroutine.suspendBlock);
4018 
4019 	// Switch back to original insert point to continue building the coroutine.
4020 	jit->builder->restoreIP(oldInsertionPoint);
4021 }
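
// Illustrative host-side usage of the three entry points built above (this
// driver loop is not part of this file; the names match the generated
// functions):
//
//    CoroutineHandle *handle = coroutine_begin(args...);
//    YieldType value;
//    while(coroutine_await(handle, &value))
//    {
//        // consume 'value' yielded by the coroutine
//    }
//    coroutine_destroy(handle);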
4022 
4023 }  // anonymous namespace
4024 
4025 namespace rr {
4026 
createCoroutine(Type * YieldType,const std::vector<Type * > & Params)4027 void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
4028 {
4029 	// Coroutines are initially created as a regular function.
4030 	// Upon the first call to Yield(), the function is promoted to a true
4031 	// coroutine.
4032 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4033 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4034 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4035 	auto handleTy = i8PtrTy;
4036 	auto boolTy = i1Ty;
4037 	auto promiseTy = T(YieldType);
4038 	auto promisePtrTy = promiseTy->getPointerTo();
4039 
4040 	jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
4041 #if LLVM_VERSION_MAJOR >= 16
4042 	jit->function->setPresplitCoroutine();
4043 #else
4044 	jit->function->addFnAttr("coroutine.presplit", "0");
4045 #endif
4046 	jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
4047 	jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
4048 	jit->coroutine.yieldType = promiseTy;
4049 	jit->coroutine.entryBlock = llvm::BasicBlock::Create(*jit->context, "function", jit->function);
4050 
4051 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
4052 }
4053 
yield(Value * val)4054 void Nucleus::yield(Value *val)
4055 {
4056 	if(jit->coroutine.id == nullptr)
4057 	{
4058 		// First call to yield().
4059 		// Promote the function to a full coroutine.
4060 		promoteFunctionToCoroutine();
4061 		ASSERT(jit->coroutine.id != nullptr);
4062 	}
4063 
4064 	//      promise = val;
4065 	//
4066 	//      auto action = llvm.coro.suspend(none, false /* final */); // <-- RESUME POINT
4067 	//      switch(action)
4068 	//      {
4069 	//      case SuspendActionResume:
4070 	//          goto resume;
4071 	//      case SuspendActionDestroy:
4072 	//          goto destroy;
4073 	//      default: // (SuspendActionSuspend)
4074 	//          goto suspend;
4075 	//      }
4076 	//  resume:
4077 	//
4078 
4079 	RR_DEBUG_INFO_UPDATE_LOC();
4080 	Variable::materializeAll();
4081 
4082 	// Types
4083 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4084 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4085 
4086 	// Intrinsics
4087 	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4088 
4089 	// Create a block to resume execution.
4090 	auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->function);
4091 
4092 	// Store the promise (yield value)
4093 	jit->builder->CreateStore(V(val), jit->coroutine.promise);
4094 	auto action = jit->builder->CreateCall(coro_suspend, {
4095 	                                                         llvm::ConstantTokenNone::get(*jit->context),
4096 	                                                         llvm::ConstantInt::get(i1Ty, 0),  // final: false
4097 	                                                     });
4098 	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4099 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
4100 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4101 
4102 	// Continue building in the resume block.
4103 	jit->builder->SetInsertPoint(resumeBlock);
4104 }
4105 
acquireCoroutine(const char * name)4106 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name)
4107 {
4108 	if(jit->coroutine.id)
4109 	{
4110 		jit->builder->CreateBr(jit->coroutine.endBlock);
4111 	}
4112 	else
4113 	{
4114 		// A coroutine without a Yield acts as a regular function.
4115 		// The 'coroutine_begin' function returns nullptr for the coroutine
4116 		// handle.
4117 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4118 		// The 'coroutine_await' function always returns false (coroutine done).
4119 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.await));
4120 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
4121 		// The 'coroutine_destroy' function does nothing and returns void.
4122 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4123 		jit->builder->CreateRetVoid();
4124 	}
4125 
4126 #ifdef ENABLE_RR_DEBUG_INFO
4127 	if(jit->debugInfo != nullptr)
4128 	{
4129 		jit->debugInfo->Finalize();
4130 	}
4131 #endif  // ENABLE_RR_DEBUG_INFO
4132 
4133 	if(false)
4134 	{
4135 		std::error_code error;
4136 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4137 		jit->module->print(file, 0);
4138 	}
4139 
4140 	jit->runPasses();
4141 
4142 	if(false)
4143 	{
4144 		std::error_code error;
4145 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4146 		jit->module->print(file, 0);
4147 	}
4148 
4149 	llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4150 	funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4151 	funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4152 	funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4153 
4154 	auto routine = jit->acquireRoutine(name, funcs, Nucleus::CoroutineEntryCount);
4155 
4156 	delete jit;
4157 	jit = nullptr;
4158 
4159 	return routine;
4160 }
4161 
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4162 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4163 {
4164 	return func();
4165 }
4166 
Int(RValue<scalar::Int> rhs)4167 SIMD::Int::Int(RValue<scalar::Int> rhs)
4168     : XYZW(this)
4169 {
4170 	RR_DEBUG_INFO_UPDATE_LOC();
4171 	Value *vector = loadValue();
4172 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
4173 
4174 	std::vector<int> swizzle = { 0 };
4175 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
4176 
4177 	storeValue(replicate);
4178 }
4179 
operator <<(RValue<SIMD::Int> lhs,unsigned char rhs)4180 RValue<SIMD::Int> operator<<(RValue<SIMD::Int> lhs, unsigned char rhs)
4181 {
4182 	RR_DEBUG_INFO_UPDATE_LOC();
4183 	return As<SIMD::Int>(V(lowerVectorShl(V(lhs.value()), rhs)));
4184 }
4185 
operator >>(RValue<SIMD::Int> lhs,unsigned char rhs)4186 RValue<SIMD::Int> operator>>(RValue<SIMD::Int> lhs, unsigned char rhs)
4187 {
4188 	RR_DEBUG_INFO_UPDATE_LOC();
4189 	return As<SIMD::Int>(V(lowerVectorAShr(V(lhs.value()), rhs)));
4190 }
4191 
CmpEQ(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4192 RValue<SIMD::Int> CmpEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4193 {
4194 	RR_DEBUG_INFO_UPDATE_LOC();
4195 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), SIMD::Int::type()));
4196 }
4197 
CmpLT(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4198 RValue<SIMD::Int> CmpLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4199 {
4200 	RR_DEBUG_INFO_UPDATE_LOC();
4201 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), SIMD::Int::type()));
4202 }
4203 
CmpLE(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4204 RValue<SIMD::Int> CmpLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4205 {
4206 	RR_DEBUG_INFO_UPDATE_LOC();
4207 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), SIMD::Int::type()));
4208 }
4209 
CmpNEQ(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4210 RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4211 {
4212 	RR_DEBUG_INFO_UPDATE_LOC();
4213 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), SIMD::Int::type()));
4214 }
4215 
CmpNLT(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4216 RValue<SIMD::Int> CmpNLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4217 {
4218 	RR_DEBUG_INFO_UPDATE_LOC();
4219 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), SIMD::Int::type()));
4220 }
4221 
CmpNLE(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4222 RValue<SIMD::Int> CmpNLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4223 {
4224 	RR_DEBUG_INFO_UPDATE_LOC();
4225 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), SIMD::Int::type()));
4226 }
4227 
Abs(RValue<SIMD::Int> x)4228 RValue<SIMD::Int> Abs(RValue<SIMD::Int> x)
4229 {
4230 #if LLVM_VERSION_MAJOR >= 12
4231 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::abs, { V(x.value())->getType() });
4232 	return RValue<SIMD::Int>(V(jit->builder->CreateCall(func, { V(x.value()), llvm::ConstantInt::getFalse(*jit->context) })));
4233 #else
4234 	auto negative = x >> 31;
4235 	return (x ^ negative) - negative;
4236 #endif
4237 }
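
// The pre-LLVM-12 fallback above is the classic branchless absolute value:
// 'negative' is all ones (-1) for negative lanes and 0 otherwise, so
// (x ^ negative) - negative leaves non-negative lanes unchanged and computes
// (~x) + 1, i.e. -x, for negative lanes. For example, x = -7:
// negative = -1, x ^ -1 = 6, 6 - (-1) = 7.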
4238 
Max(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4239 RValue<SIMD::Int> Max(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4240 {
4241 	RR_DEBUG_INFO_UPDATE_LOC();
4242 	RValue<SIMD::Int> greater = CmpNLE(x, y);
4243 	return (x & greater) | (y & ~greater);
4244 }
4245 
Min(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4246 RValue<SIMD::Int> Min(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4247 {
4248 	RR_DEBUG_INFO_UPDATE_LOC();
4249 	RValue<SIMD::Int> less = CmpLT(x, y);
4250 	return (x & less) | (y & ~less);
4251 }
4252 
RoundInt(RValue<SIMD::Float> cast)4253 RValue<SIMD::Int> RoundInt(RValue<SIMD::Float> cast)
4254 {
4255 	RR_DEBUG_INFO_UPDATE_LOC();
4256 	return As<SIMD::Int>(V(lowerRoundInt(V(cast.value()), T(SIMD::Int::type()))));
4257 }
4258 
RoundIntClamped(RValue<SIMD::Float> cast)4259 RValue<SIMD::Int> RoundIntClamped(RValue<SIMD::Float> cast)
4260 {
4261 	RR_DEBUG_INFO_UPDATE_LOC();
4262 
4263 // TODO(b/165000222): Check if fptosi_sat produces optimal code for x86 and ARM.
4264 #if defined(__arm__) || defined(__aarch64__)
4265 	// ARM saturates to the largest positive or negative integer. Unit tests
4266 	// verify that lowerRoundInt() behaves as desired.
4267 	return As<SIMD::Int>(V(lowerRoundInt(V(cast.value()), T(SIMD::Int::type()))));
4268 #elif LLVM_VERSION_MAJOR >= 14
4269 	llvm::Value *rounded = lowerRound(V(cast.value()));
4270 	llvm::Function *fptosi_sat = llvm::Intrinsic::getDeclaration(
4271 	    jit->module.get(), llvm::Intrinsic::fptosi_sat, { T(SIMD::Int::type()), T(SIMD::Float::type()) });
4272 	return RValue<SIMD::Int>(V(jit->builder->CreateCall(fptosi_sat, { rounded })));
4273 #else
4274 	RValue<SIMD::Float> clamped = Max(Min(cast, SIMD::Float(0x7FFFFF80)), SIMD::Float(static_cast<int>(0x80000000)));
4275 	return As<SIMD::Int>(V(lowerRoundInt(V(clamped.value()), T(SIMD::Int::type()))));
4276 #endif
4277 }
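
// Note on the pre-LLVM-14 clamp above: 0x7FFFFF80 (2147483520 = 2^31 - 128) is
// the largest int32 value exactly representable as a float, and 0x80000000
// (-2^31) is exactly representable as well, so clamping the float input to
// [-2^31, 2^31 - 128] keeps the subsequent conversion from overflowing.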
4278 
Extract128(RValue<SIMD::Int> val,int i)4279 RValue<Int4> Extract128(RValue<SIMD::Int> val, int i)
4280 {
4281 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4282 
4283 	return As<Int4>(V(jit->builder->CreateExtractElement(v128, i)));
4284 }
4285 
Insert128(RValue<SIMD::Int> val,RValue<Int4> element,int i)4286 RValue<SIMD::Int> Insert128(RValue<SIMD::Int> val, RValue<Int4> element, int i)
4287 {
4288 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4289 	llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));
4290 
4291 	return As<SIMD::Int>(V(jit->builder->CreateInsertElement(v128, a, i)));
4292 }
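
// Extract128/Insert128 above work by reinterpreting the SIMD::Width x i32
// vector as a vector of (SIMD::Width / 4) 128-bit integers, so element i of
// that wider vector corresponds to the i-th group of four 32-bit lanes; a
// single extractelement/insertelement then moves the whole 128-bit chunk to or
// from an Int4.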
4293 
type()4294 Type *SIMD::Int::type()
4295 {
4296 	return T(llvm::VectorType::get(T(scalar::Int::type()), SIMD::Width, false));
4297 }
4298 
UInt(RValue<SIMD::Float> cast)4299 SIMD::UInt::UInt(RValue<SIMD::Float> cast)
4300     : XYZW(this)
4301 {
4302 	RR_DEBUG_INFO_UPDATE_LOC();
4303 	Value *xyzw = Nucleus::createFPToUI(cast.value(), SIMD::UInt::type());
4304 	storeValue(xyzw);
4305 }
4306 
UInt(RValue<scalar::UInt> rhs)4307 SIMD::UInt::UInt(RValue<scalar::UInt> rhs)
4308     : XYZW(this)
4309 {
4310 	RR_DEBUG_INFO_UPDATE_LOC();
	Value *vector = loadValue();
	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);

	std::vector<int> swizzle = { 0 };
	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);

	storeValue(replicate);
}

RValue<SIMD::UInt> operator<<(RValue<SIMD::UInt> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return As<SIMD::UInt>(V(lowerVectorShl(V(lhs.value()), rhs)));
}

RValue<SIMD::UInt> operator>>(RValue<SIMD::UInt> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return As<SIMD::UInt>(V(lowerVectorLShr(V(lhs.value()), rhs)));
}

RValue<SIMD::UInt> CmpEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::UInt> CmpLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::UInt> CmpLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::UInt> CmpNEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::UInt> CmpNLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::UInt> CmpNLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::UInt> Max(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
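	// Comparisons yield all-ones per lane where true, so the bitwise blend
	// below selects x where x > y and y elsewhere.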
	RValue<SIMD::UInt> greater = CmpNLE(x, y);
	return (x & greater) | (y & ~greater);
}

RValue<SIMD::UInt> Min(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	RValue<SIMD::UInt> less = CmpLT(x, y);
	return (x & less) | (y & ~less);
}

RValue<UInt4> Extract128(RValue<SIMD::UInt> val, int i)
{
	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));

	return As<UInt4>(V(jit->builder->CreateExtractElement(v128, i)));
}

RValue<SIMD::UInt> Insert128(RValue<SIMD::UInt> val, RValue<UInt4> element, int i)
{
	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
	llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));

	return As<SIMD::UInt>(V(jit->builder->CreateInsertElement(v128, a, i)));
}

Type *SIMD::UInt::type()
{
	return T(llvm::VectorType::get(T(scalar::UInt::type()), SIMD::Width, false));
}

SIMD::Float::Float(RValue<scalar::Float> rhs)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Value *vector = loadValue();
	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);

	std::vector<int> swizzle = { 0 };
	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);

	storeValue(replicate);
}

RValue<SIMD::Float> operator%(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs)
{
	return RValue<SIMD::Float>(Nucleus::createFRem(lhs.value(), rhs.value()));
}

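// MulAdd lowers to llvm.fmuladd, which the backend may or may not fuse into a
// single fma; FMA below uses llvm.fma, which always produces the fused,
// single-rounding result.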
RValue<SIMD::Float> MulAdd(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z)
{
	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(SIMD::Float::type()) });
	return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
}

RValue<SIMD::Float> FMA(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z)
{
	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fma, { T(SIMD::Float::type()) });
	return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
}

RValue<SIMD::Float> Abs(RValue<SIMD::Float> x)
{
	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
	return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, V(x.value()))));
}

RValue<SIMD::Float> Max(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return As<SIMD::Float>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
}

RValue<SIMD::Float> Min(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return As<SIMD::Float>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
}

RValue<SIMD::Float> Sqrt(RValue<SIMD::Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return As<SIMD::Float>(V(lowerSQRT(V(x.value()))));
}

RValue<SIMD::Int> CmpEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::Int> CmpLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::Int> CmpLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::Int> CmpNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::Int> CmpNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), SIMD::Int::type()));
}

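// The CmpU* variants below use unordered predicates, so they also return
// true (all-ones) when either operand is NaN.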
RValue<SIMD::Int> CmpUEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::Int> CmpULT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::Int> CmpULE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::Int> CmpUNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::Int> CmpUNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::Int> CmpUNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), SIMD::Int::type()));
}

RValue<SIMD::Float> Round(RValue<SIMD::Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Float>(V(lowerRound(V(x.value()))));
}

RValue<SIMD::Float> Trunc(RValue<SIMD::Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Float>(V(lowerTrunc(V(x.value()))));
}

RValue<SIMD::Float> Frac(RValue<SIMD::Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	SIMD::Float frc = x - Floor(x);

	// x - floor(x) can be 1.0 for very small negative x.
	// Clamp against the value just below 1.0.
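	// (0x3F7FFFFF is the largest float strictly less than 1.0.)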
	return Min(frc, As<SIMD::Float>(SIMD::Int(0x3F7FFFFF)));
}

RValue<SIMD::Float> Floor(RValue<SIMD::Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<SIMD::Float>(V(lowerFloor(V(x.value()))));
}

RValue<SIMD::Float> Ceil(RValue<SIMD::Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
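	// Round toward +infinity by negating, flooring, and negating back.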
	return -Floor(-x);
}

RValue<Float4> Extract128(RValue<SIMD::Float> val, int i)
{
	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));

	return As<Float4>(V(jit->builder->CreateExtractElement(v128, i)));
}

RValue<SIMD::Float> Insert128(RValue<SIMD::Float> val, RValue<Float4> element, int i)
{
	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
	llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));

	return As<SIMD::Float>(V(jit->builder->CreateInsertElement(v128, a, i)));
}

Type *SIMD::Float::type()
{
	return T(llvm::VectorType::get(T(scalar::Float::type()), SIMD::Width, false));
}

}  // namespace rr