1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "LLVMReactor.hpp"
16 
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "LLVMReactorDebugInfo.hpp"
20 #include "Print.hpp"
21 #include "Reactor.hpp"
22 #include "x86.hpp"
23 
24 #include "llvm/IR/Intrinsics.h"
25 #include "llvm/IR/IntrinsicsX86.h"
26 #include "llvm/Support/Alignment.h"
27 #include "llvm/Support/Error.h"
28 #include "llvm/Support/ManagedStatic.h"
29 
30 #include <fstream>
31 #include <iostream>
32 #include <mutex>
33 #include <numeric>
34 #include <thread>
35 #include <unordered_map>
36 
37 #if defined(__i386__) || defined(__x86_64__)
38 #	include <xmmintrin.h>
39 #endif
40 
41 #include <math.h>
42 
43 #if defined(__x86_64__) && defined(_WIN32)
44 extern "C" void X86CompilationCallback()
45 {
46 	UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
47 }
48 #endif
49 
50 #if !LLVM_ENABLE_THREADS
51 #	error "LLVM_ENABLE_THREADS needs to be enabled"
52 #endif
53 
54 #if LLVM_VERSION_MAJOR < 11
55 namespace llvm {
56 using FixedVectorType = VectorType;
57 }  // namespace llvm
58 #endif
59 
60 namespace {
61 
62 // Used to automatically invoke llvm_shutdown() when driver is unloaded
63 llvm::llvm_shutdown_obj llvmShutdownObj;
64 
65 // This has to be a raw pointer because glibc 2.17 doesn't support __cxa_thread_atexit_impl
66 // for destructing objects at exit. See crbug.com/1074222
67 thread_local rr::JITBuilder *jit = nullptr;
68 
69 // Default configuration settings. Must be accessed under mutex lock.
70 std::mutex defaultConfigLock;
71 rr::Config &defaultConfig()
72 {
73 	// This uses a static in a function to avoid the cost of a global static
74 	// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
75 	static rr::Config config = rr::Config::Edit()
76 	                               .add(rr::Optimization::Pass::ScalarReplAggregates)
77 	                               .add(rr::Optimization::Pass::InstructionCombining)
78 	                               .apply({});
79 	return config;
80 }
81 
82 llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
83 {
84 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
85 
86 	llvm::VectorType *extTy =
87 	    llvm::VectorType::getExtendedElementVectorType(ty);
88 	x = jit->builder->CreateZExt(x, extTy);
89 	y = jit->builder->CreateZExt(y, extTy);
90 
91 	// (x + y + 1) >> 1
92 	llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
93 	llvm::Value *res = jit->builder->CreateAdd(x, y);
94 	res = jit->builder->CreateAdd(res, one);
95 	res = jit->builder->CreateLShr(res, one);
96 	return jit->builder->CreateTrunc(res, ty);
97 }
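// Worked example (note added by the editor, not from the original source): the widened
// computation avoids overflow in the narrow element type. E.g. for 8-bit lanes,
// lowerPAVG on 250 and 251 evaluates (250 + 251 + 1) >> 1 = 251, the rounded average
// that the x86 pavgb/pavgw family produces, whereas 250 + 251 would wrap in 8 bits.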
98 
99 llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
100                           llvm::ICmpInst::Predicate pred)
101 {
102 	return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
103 }
104 
105 llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
106                        llvm::Value *y, llvm::Type *dstTy)
107 {
108 	return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
109 }
110 
111 [[maybe_unused]] llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
112                                             llvm::FCmpInst::Predicate pred)
113 {
114 	return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
115 }
116 
117 [[maybe_unused]] llvm::Value *lowerRound(llvm::Value *x)
118 {
119 	llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
120 	    jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
121 	return jit->builder->CreateCall(nearbyint, { x });
122 }
123 
124 [[maybe_unused]] llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
125 {
126 	return jit->builder->CreateFPToSI(lowerRound(x), ty);
127 }
128 
129 [[maybe_unused]] llvm::Value *lowerFloor(llvm::Value *x)
130 {
131 	llvm::Function *floor = llvm::Intrinsic::getDeclaration(
132 	    jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
133 	return jit->builder->CreateCall(floor, { x });
134 }
135 
136 [[maybe_unused]] llvm::Value *lowerTrunc(llvm::Value *x)
137 {
138 	llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
139 	    jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
140 	return jit->builder->CreateCall(trunc, { x });
141 }
142 
143 [[maybe_unused]] llvm::Value *lowerSQRT(llvm::Value *x)
144 {
145 	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
146 	    jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
147 	return jit->builder->CreateCall(sqrt, { x });
148 }
149 
150 [[maybe_unused]] llvm::Value *lowerRCP(llvm::Value *x)
151 {
152 	llvm::Type *ty = x->getType();
153 	llvm::Constant *one;
154 	if(llvm::FixedVectorType *vectorTy = llvm::dyn_cast<llvm::FixedVectorType>(ty))
155 	{
156 		one = llvm::ConstantVector::getSplat(
157 #if LLVM_VERSION_MAJOR >= 11
158 		    vectorTy->getElementCount(),
159 #else
160 		    vectorTy->getNumElements(),
161 #endif
162 		    llvm::ConstantFP::get(vectorTy->getElementType(), 1));
163 	}
164 	else
165 	{
166 		one = llvm::ConstantFP::get(ty, 1);
167 	}
168 	return jit->builder->CreateFDiv(one, x);
169 }
170 
171 [[maybe_unused]] llvm::Value *lowerRSQRT(llvm::Value *x)
172 {
173 	return lowerRCP(lowerSQRT(x));
174 }
175 
176 [[maybe_unused]] llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
177 {
178 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
179 	llvm::Value *y = llvm::ConstantVector::getSplat(
180 #if LLVM_VERSION_MAJOR >= 11
181 	    ty->getElementCount(),
182 #else
183 	    ty->getNumElements(),
184 #endif
185 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
186 	return jit->builder->CreateShl(x, y);
187 }
188 
189 [[maybe_unused]] llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
190 {
191 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
192 	llvm::Value *y = llvm::ConstantVector::getSplat(
193 #if LLVM_VERSION_MAJOR >= 11
194 	    ty->getElementCount(),
195 #else
196 	    ty->getNumElements(),
197 #endif
198 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
199 	return jit->builder->CreateAShr(x, y);
200 }
201 
202 [[maybe_unused]] llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
203 {
204 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
205 	llvm::Value *y = llvm::ConstantVector::getSplat(
206 #if LLVM_VERSION_MAJOR >= 11
207 	    ty->getElementCount(),
208 #else
209 	    ty->getNumElements(),
210 #endif
211 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
212 	return jit->builder->CreateLShr(x, y);
213 }
214 
215 llvm::Value *lowerShuffleVector(llvm::Value *v1, llvm::Value *v2, llvm::ArrayRef<int> select)
216 {
217 	int size = select.size();
218 	const int maxSize = 16;
219 	llvm::Constant *swizzle[maxSize];
220 	ASSERT(size <= maxSize);
221 
222 	for(int i = 0; i < size; i++)
223 	{
224 		swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), select[i]);
225 	}
226 
227 	llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
228 
229 	return jit->builder->CreateShuffleVector(v1, v2, shuffle);
230 }
231 
232 [[maybe_unused]] llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
233 {
234 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
235 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
236 
237 	llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
238 	llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
239 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
240 
241 	llvm::Value *undef = llvm::UndefValue::get(extTy);
242 
243 	llvm::SmallVector<int, 16> evenIdx;
244 	llvm::SmallVector<int, 16> oddIdx;
245 	for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
246 	{
247 		evenIdx.push_back(i);
248 		oddIdx.push_back(i + 1);
249 	}
250 
251 	llvm::Value *lhs = lowerShuffleVector(mult, undef, evenIdx);
252 	llvm::Value *rhs = lowerShuffleVector(mult, undef, oddIdx);
253 	return jit->builder->CreateAdd(lhs, rhs);
254 }
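// Illustrative note (added by the editor, not from the original source): for 16-bit lanes
// this reproduces a pmaddwd-style pairwise multiply-add. E.g. with x = {a0,a1,a2,a3} and
// y = {b0,b1,b2,b3}, the widened products are split into even/odd halves and summed to
// {a0*b0 + a1*b1, a2*b2 + a3*b3}.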
255 
256 [[maybe_unused]] llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
257 {
258 	llvm::FixedVectorType *srcTy = llvm::cast<llvm::FixedVectorType>(x->getType());
259 	llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
260 
261 	llvm::IntegerType *dstElemTy =
262 	    llvm::cast<llvm::IntegerType>(dstTy->getElementType());
263 
264 	uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
265 	ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
266 	llvm::Constant *max, *min;
267 	if(isSigned)
268 	{
269 		max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
270 		min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
271 	}
272 	else
273 	{
274 		max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
275 		min = llvm::ConstantInt::get(srcTy, 0, false);
276 	}
277 
278 	x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
279 	x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
280 	y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
281 	y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
282 
283 	x = jit->builder->CreateTrunc(x, dstTy);
284 	y = jit->builder->CreateTrunc(y, dstTy);
285 
286 	llvm::SmallVector<int, 16> index(srcTy->getNumElements() * 2);
287 	std::iota(index.begin(), index.end(), 0);
288 
289 	return lowerShuffleVector(x, y, index);
290 }
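// Illustrative note (added by the editor, not from the original source): the clamp range
// follows the destination element width, e.g. a signed pack to 16-bit lanes saturates to
// [-32768, 32767] and an unsigned pack to [0, 65535] before truncation; the final shuffle
// concatenates the packed x lanes followed by the packed y lanes.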
291 
292 [[maybe_unused]] llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
293 {
294 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
295 	llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
296 	llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
297 
298 	llvm::Value *ret = jit->builder->CreateZExt(
299 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
300 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
301 	{
302 		llvm::Value *elem = jit->builder->CreateZExt(
303 		    jit->builder->CreateExtractElement(cmp, i), retTy);
304 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
305 	}
306 	return ret;
307 }
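// Worked example (added by the editor, not from the original source): for a 4 x i32 input
// {-1, 5, -7, 0} the lane-wise "< 0" results are {1, 0, 1, 0}, so the assembled mask is
// 0b0101 = 5, i.e. bit i of the result holds the sign of lane i, similar to what
// movmskps/pmovmskb return on x86.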
308 
309 [[maybe_unused]] llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
310 {
311 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
312 	llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
313 	llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
314 
315 	llvm::Value *ret = jit->builder->CreateZExt(
316 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
317 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
318 	{
319 		llvm::Value *elem = jit->builder->CreateZExt(
320 		    jit->builder->CreateExtractElement(cmp, i), retTy);
321 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
322 	}
323 	return ret;
324 }
325 
326 llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
327 {
328 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
329 }
330 
331 llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
332 {
333 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
334 }
335 
336 llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
337 {
338 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
339 }
340 
341 llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
342 {
343 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
344 }
345 
346 llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
347 {
348 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
349 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
350 
351 	llvm::Value *extX, *extY;
352 	if(sext)
353 	{
354 		extX = jit->builder->CreateSExt(x, extTy);
355 		extY = jit->builder->CreateSExt(y, extTy);
356 	}
357 	else
358 	{
359 		extX = jit->builder->CreateZExt(x, extTy);
360 		extY = jit->builder->CreateZExt(y, extTy);
361 	}
362 
363 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
364 
365 	llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
366 	llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
367 	return jit->builder->CreateTrunc(mulh, ty);
368 }
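// Worked example (added by the editor, not from the original source): the result is the
// upper half of the widened product, e.g. for 16-bit lanes 0x4000 * 0x4000 = 0x10000000,
// shifted right by 16 gives 0x1000. The arithmetic shift is also correct for the
// zero-extended (unsigned) case because the final truncation discards the replicated
// upper bits.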
369 
370 }  // namespace
371 
372 namespace rr {
373 
374 std::string Caps::backendName()
375 {
376 	return std::string("LLVM ") + LLVM_VERSION_STRING;
377 }
378 
379 bool Caps::coroutinesSupported()
380 {
381 	return true;
382 }
383 
384 bool Caps::fmaIsFast()
385 {
386 	static bool AVX2 = CPUID::supportsAVX2();  // Also checks for FMA support
387 
388 	// If x86 FMA instructions are supported, assume LLVM will emit them instead of making calls to std::fma().
389 	return AVX2;
390 }
391 
392 // The abstract Type* types are implemented as LLVM types, except that
393 // 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
394 // and VFP in ARM, and eliminate the overhead of converting them to explicit
395 // 128-bit ones. LLVM types are pointers, so we can represent emulated types
396 // as abstract pointers with small enum values.
397 enum InternalType : uintptr_t
398 {
399 	// Emulated types:
400 	Type_v2i32,
401 	Type_v4i16,
402 	Type_v2i16,
403 	Type_v8i8,
404 	Type_v4i8,
405 	Type_v2f32,
406 	EmulatedTypeCount,
407 	// Returned by asInternalType() to indicate that the abstract Type*
408 	// should be interpreted as LLVM type pointer:
409 	Type_LLVM
410 };
411 
412 inline InternalType asInternalType(Type *type)
413 {
414 	InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
415 	return (t < EmulatedTypeCount) ? t : Type_LLVM;
416 }
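// Illustrative note (added by the editor, not from the original source): T(Type_v2i32)
// yields the pointer-sized value 0, which asInternalType() maps straight back to
// Type_v2i32, while a genuine llvm::Type* compares >= EmulatedTypeCount and is classified
// as Type_LLVM. This relies on real LLVM type objects never being allocated at such small
// addresses.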
417 
418 llvm::Type *T(Type *t)
419 {
420 	// Use 128-bit vectors to implement logically shorter ones.
421 	switch(asInternalType(t))
422 	{
423 	case Type_v2i32: return T(Int4::type());
424 	case Type_v4i16: return T(Short8::type());
425 	case Type_v2i16: return T(Short8::type());
426 	case Type_v8i8: return T(Byte16::type());
427 	case Type_v4i8: return T(Byte16::type());
428 	case Type_v2f32: return T(Float4::type());
429 	case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
430 	default:
431 		UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
432 		return nullptr;
433 	}
434 }
435 
436 Type *T(InternalType t)
437 {
438 	return reinterpret_cast<Type *>(t);
439 }
440 
441 inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
442 {
443 	return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
444 }
445 
446 inline llvm::BasicBlock *B(BasicBlock *t)
447 {
448 	return reinterpret_cast<llvm::BasicBlock *>(t);
449 }
450 
451 inline BasicBlock *B(llvm::BasicBlock *t)
452 {
453 	return reinterpret_cast<BasicBlock *>(t);
454 }
455 
456 static size_t typeSize(Type *type)
457 {
458 	switch(asInternalType(type))
459 	{
460 	case Type_v2i32: return 8;
461 	case Type_v4i16: return 8;
462 	case Type_v2i16: return 4;
463 	case Type_v8i8: return 8;
464 	case Type_v4i8: return 4;
465 	case Type_v2f32: return 8;
466 	case Type_LLVM:
467 		{
468 			llvm::Type *t = T(type);
469 
470 			if(t->isPointerTy())
471 			{
472 				return sizeof(void *);
473 			}
474 
475 			// At this point we should only have LLVM 'primitive' types.
476 			unsigned int bits = t->getPrimitiveSizeInBits();
477 			ASSERT_MSG(bits != 0, "bits: %d", int(bits));
478 
479 			// TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
480 			// but are typically stored as one byte. The DataLayout structure should
481 			// be used here and many other places if this assumption fails.
482 			return (bits + 7) / 8;
483 		}
484 		break;
485 	default:
486 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
487 		return 0;
488 	}
489 }
490 
491 static unsigned int elementCount(Type *type)
492 {
493 	switch(asInternalType(type))
494 	{
495 	case Type_v2i32: return 2;
496 	case Type_v4i16: return 4;
497 	case Type_v2i16: return 2;
498 	case Type_v8i8: return 8;
499 	case Type_v4i8: return 4;
500 	case Type_v2f32: return 2;
501 	case Type_LLVM: return llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();
502 	default:
503 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
504 		return 0;
505 	}
506 }
507 
508 static llvm::Function *createFunction(const char *name, llvm::Type *retTy, const std::vector<llvm::Type *> &params)
509 {
510 	llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
511 	auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
512 
513 	func->setLinkage(llvm::GlobalValue::ExternalLinkage);
514 	func->setDoesNotThrow();
515 	func->setCallingConv(llvm::CallingConv::C);
516 
517 	if(__has_feature(memory_sanitizer))
518 	{
519 		func->addFnAttr(llvm::Attribute::SanitizeMemory);
520 	}
521 
522 	func->addFnAttr("warn-stack-size", "524288");  // Warn when a function uses more than 512 KiB of stack memory
523 
524 	return func;
525 }
526 
527 Nucleus::Nucleus()
528 {
529 #if !__has_feature(memory_sanitizer)
530 	// thread_local variables in shared libraries are initialized at load-time,
531 	// but this is not observed by MemorySanitizer if the loader itself was not
532 	// instrumented, leading to false-positive uninitialized variable errors.
533 	ASSERT(jit == nullptr);
534 	ASSERT(Variable::unmaterializedVariables == nullptr);
535 #endif
536 
537 	jit = new JITBuilder(Nucleus::getDefaultConfig());
538 	Variable::unmaterializedVariables = new Variable::UnmaterializedVariables();
539 }
540 
541 Nucleus::~Nucleus()
542 {
543 	delete Variable::unmaterializedVariables;
544 	Variable::unmaterializedVariables = nullptr;
545 
546 	delete jit;
547 	jit = nullptr;
548 }
549 
550 void Nucleus::setDefaultConfig(const Config &cfg)
551 {
552 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
553 	::defaultConfig() = cfg;
554 }
555 
556 void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
557 {
558 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
559 	auto &config = ::defaultConfig();
560 	config = cfgEdit.apply(config);
561 }
562 
563 Config Nucleus::getDefaultConfig()
564 {
565 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
566 	return ::defaultConfig();
567 }
568 
569 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit *cfgEdit /* = nullptr */)
570 {
571 	if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
572 	{
573 		llvm::Type *type = jit->function->getReturnType();
574 
575 		if(type->isVoidTy())
576 		{
577 			createRetVoid();
578 		}
579 		else
580 		{
581 			createRet(V(llvm::UndefValue::get(type)));
582 		}
583 	}
584 
585 	std::shared_ptr<Routine> routine;
586 
587 	auto acquire = [&](rr::JITBuilder *jit) {
588 		// ::jit is thread-local, so when this is executed on a separate thread (see JIT_IN_SEPARATE_THREAD)
589 		// it needs to only use the jit variable passed in as an argument.
590 
591 		Config cfg = jit->config;
592 		if(cfgEdit)
593 		{
594 			cfg = cfgEdit->apply(jit->config);
595 		}
596 
597 #ifdef ENABLE_RR_DEBUG_INFO
598 		if(jit->debugInfo != nullptr)
599 		{
600 			jit->debugInfo->Finalize();
601 		}
602 #endif  // ENABLE_RR_DEBUG_INFO
603 
604 		if(false)
605 		{
606 			std::error_code error;
607 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
608 			jit->module->print(file, 0);
609 		}
610 
611 		jit->runPasses(cfg);
612 
613 		if(false)
614 		{
615 			std::error_code error;
616 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
617 			jit->module->print(file, 0);
618 		}
619 
620 		routine = jit->acquireRoutine(name, &jit->function, 1, cfg);
621 	};
622 
623 #ifdef JIT_IN_SEPARATE_THREAD
624 	// Perform optimizations and codegen in a separate thread to avoid stack overflow.
625 	// FIXME(b/149829034): This is not a long-term solution. Reactor has no control
626 	// over the threading and stack sizes of its users, so this should be addressed
627 	// at a higher level instead.
628 	std::thread thread(acquire, jit);
629 	thread.join();
630 #else
631 	acquire(jit);
632 #endif
633 
634 	return routine;
635 }
636 
637 Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
638 {
639 	// Need to allocate it in the entry block for mem2reg to work
640 	llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
641 
642 	llvm::Instruction *declaration;
643 
644 #if LLVM_VERSION_MAJOR >= 11
645 	auto align = jit->module->getDataLayout().getPrefTypeAlign(T(type));
646 #else
647 	auto align = llvm::MaybeAlign(jit->module->getDataLayout().getPrefTypeAlignment(T(type)));
648 #endif
649 
650 	if(arraySize)
651 	{
652 		Value *size = (sizeof(size_t) == 8) ? Nucleus::createConstantLong(arraySize) : Nucleus::createConstantInt(arraySize);
653 		declaration = new llvm::AllocaInst(T(type), 0, V(size), align);
654 	}
655 	else
656 	{
657 		declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr, align);
658 	}
659 
660 	entryBlock.getInstList().push_front(declaration);
661 
662 	return V(declaration);
663 }
664 
665 BasicBlock *Nucleus::createBasicBlock()
666 {
667 	return B(llvm::BasicBlock::Create(*jit->context, "", jit->function));
668 }
669 
670 BasicBlock *Nucleus::getInsertBlock()
671 {
672 	return B(jit->builder->GetInsertBlock());
673 }
674 
675 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
676 {
677 	// assert(jit->builder->GetInsertBlock()->back().isTerminator());
678 
679 	jit->builder->SetInsertPoint(B(basicBlock));
680 }
681 
682 void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
683 {
684 	jit->function = rr::createFunction("", T(ReturnType), T(Params));
685 
686 #ifdef ENABLE_RR_DEBUG_INFO
687 	jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
688 #endif  // ENABLE_RR_DEBUG_INFO
689 
690 	jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->function));
691 }
692 
693 Value *Nucleus::getArgument(unsigned int index)
694 {
695 	llvm::Function::arg_iterator args = jit->function->arg_begin();
696 
697 	while(index)
698 	{
699 		args++;
700 		index--;
701 	}
702 
703 	return V(&*args);
704 }
705 
706 void Nucleus::createRetVoid()
707 {
708 	RR_DEBUG_INFO_UPDATE_LOC();
709 
710 	ASSERT_MSG(jit->function->getReturnType() == T(Void::type()), "Return type mismatch");
711 
712 	// Code generated after this point is unreachable, so any variables
713 	// being read can safely return an undefined value. We have to avoid
714 	// materializing variables after the terminator ret instruction.
715 	Variable::killUnmaterialized();
716 
717 	jit->builder->CreateRetVoid();
718 }
719 
720 void Nucleus::createRet(Value *v)
721 {
722 	RR_DEBUG_INFO_UPDATE_LOC();
723 
724 	ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
725 
726 	// Code generated after this point is unreachable, so any variables
727 	// being read can safely return an undefined value. We have to avoid
728 	// materializing variables after the terminator ret instruction.
729 	Variable::killUnmaterialized();
730 
731 	jit->builder->CreateRet(V(v));
732 }
733 
734 void Nucleus::createBr(BasicBlock *dest)
735 {
736 	RR_DEBUG_INFO_UPDATE_LOC();
737 	Variable::materializeAll();
738 
739 	jit->builder->CreateBr(B(dest));
740 }
741 
742 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
743 {
744 	RR_DEBUG_INFO_UPDATE_LOC();
745 	Variable::materializeAll();
746 	jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
747 }
748 
749 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
750 {
751 	RR_DEBUG_INFO_UPDATE_LOC();
752 	return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
753 }
754 
755 Value *Nucleus::createSub(Value *lhs, Value *rhs)
756 {
757 	RR_DEBUG_INFO_UPDATE_LOC();
758 	return V(jit->builder->CreateSub(V(lhs), V(rhs)));
759 }
760 
761 Value *Nucleus::createMul(Value *lhs, Value *rhs)
762 {
763 	RR_DEBUG_INFO_UPDATE_LOC();
764 	return V(jit->builder->CreateMul(V(lhs), V(rhs)));
765 }
766 
767 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
768 {
769 	RR_DEBUG_INFO_UPDATE_LOC();
770 	return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
771 }
772 
773 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
774 {
775 	RR_DEBUG_INFO_UPDATE_LOC();
776 	return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
777 }
778 
779 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
780 {
781 	RR_DEBUG_INFO_UPDATE_LOC();
782 	return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
783 }
784 
785 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
786 {
787 	RR_DEBUG_INFO_UPDATE_LOC();
788 	return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
789 }
790 
791 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
792 {
793 	RR_DEBUG_INFO_UPDATE_LOC();
794 	return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
795 }
796 
797 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
798 {
799 	RR_DEBUG_INFO_UPDATE_LOC();
800 	return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
801 }
802 
803 Value *Nucleus::createURem(Value *lhs, Value *rhs)
804 {
805 	RR_DEBUG_INFO_UPDATE_LOC();
806 	return V(jit->builder->CreateURem(V(lhs), V(rhs)));
807 }
808 
809 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
810 {
811 	RR_DEBUG_INFO_UPDATE_LOC();
812 	return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
813 }
814 
815 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
816 {
817 	RR_DEBUG_INFO_UPDATE_LOC();
818 	return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
819 }
820 
821 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
822 {
823 	return RValue<Float4>(Nucleus::createFRem(lhs.value(), rhs.value()));
824 }
825 
826 Value *Nucleus::createShl(Value *lhs, Value *rhs)
827 {
828 	RR_DEBUG_INFO_UPDATE_LOC();
829 	return V(jit->builder->CreateShl(V(lhs), V(rhs)));
830 }
831 
832 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
833 {
834 	RR_DEBUG_INFO_UPDATE_LOC();
835 	return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
836 }
837 
838 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
839 {
840 	RR_DEBUG_INFO_UPDATE_LOC();
841 	return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
842 }
843 
844 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
845 {
846 	RR_DEBUG_INFO_UPDATE_LOC();
847 	return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
848 }
849 
850 Value *Nucleus::createOr(Value *lhs, Value *rhs)
851 {
852 	RR_DEBUG_INFO_UPDATE_LOC();
853 	return V(jit->builder->CreateOr(V(lhs), V(rhs)));
854 }
855 
856 Value *Nucleus::createXor(Value *lhs, Value *rhs)
857 {
858 	RR_DEBUG_INFO_UPDATE_LOC();
859 	return V(jit->builder->CreateXor(V(lhs), V(rhs)));
860 }
861 
862 Value *Nucleus::createNeg(Value *v)
863 {
864 	RR_DEBUG_INFO_UPDATE_LOC();
865 	return V(jit->builder->CreateNeg(V(v)));
866 }
867 
868 Value *Nucleus::createFNeg(Value *v)
869 {
870 	RR_DEBUG_INFO_UPDATE_LOC();
871 	return V(jit->builder->CreateFNeg(V(v)));
872 }
873 
874 Value *Nucleus::createNot(Value *v)
875 {
876 	RR_DEBUG_INFO_UPDATE_LOC();
877 	return V(jit->builder->CreateNot(V(v)));
878 }
879 
880 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
881 {
882 	RR_DEBUG_INFO_UPDATE_LOC();
883 	switch(asInternalType(type))
884 	{
885 	case Type_v2i32:
886 	case Type_v4i16:
887 	case Type_v8i8:
888 	case Type_v2f32:
889 		return createBitCast(
890 		    createInsertElement(
891 		        V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false))),
892 		        createLoad(createBitCast(ptr, Pointer<Long>::type()), Long::type(), isVolatile, alignment, atomic, memoryOrder),
893 		        0),
894 		    type);
895 	case Type_v2i16:
896 	case Type_v4i8:
897 		if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
898 		{
899 			Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false)));
900 			Value *i = createLoad(createBitCast(ptr, Pointer<Int>::type()), Int::type(), isVolatile, alignment, atomic, memoryOrder);
901 			i = createZExt(i, Long::type());
902 			Value *v = createInsertElement(u, i, 0);
903 			return createBitCast(v, type);
904 		}
905 		// Fallthrough to non-emulated case.
906 	case Type_LLVM:
907 		{
908 			auto elTy = T(type);
909 			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
910 
911 			if(!atomic)
912 			{
913 				return V(jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile));
914 			}
915 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
916 			{
917 				// Integers and pointers can be atomically loaded by setting
918 				// the ordering constraint on the load instruction.
919 				auto load = jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile);
920 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
921 				return V(load);
922 			}
923 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
924 			{
925 				// LLVM claims to support atomic loads of float types as
926 				// above, but certain backends cannot deal with this.
927 				// Load as an integer and bitcast. See b/136037244.
928 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
929 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
930 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
931 				auto load = jit->builder->CreateAlignedLoad(elAsIntTy, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
932 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
933 				auto loadCast = jit->builder->CreateBitCast(load, elTy);
934 				return V(loadCast);
935 			}
936 			else
937 			{
938 				// More exotic types require falling back to the extern:
939 				// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
940 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
941 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
942 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
943 				auto i8PtrTy = i8Ty->getPointerTo();
944 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
945 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
946 				auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
947 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
948 				auto out = allocateStackVariable(type);
949 				jit->builder->CreateCall(func, {
950 				                                   llvm::ConstantInt::get(sizetTy, size),
951 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
952 				                                   jit->builder->CreatePointerCast(V(out), i8PtrTy),
953 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
954 				                               });
955 				return V(jit->builder->CreateLoad(T(type), V(out)));
956 			}
957 		}
958 	default:
959 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
960 		return nullptr;
961 	}
962 }
963 
964 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
965 {
966 	RR_DEBUG_INFO_UPDATE_LOC();
967 	switch(asInternalType(type))
968 	{
969 	case Type_v2i32:
970 	case Type_v4i16:
971 	case Type_v8i8:
972 	case Type_v2f32:
973 		createStore(
974 		    createExtractElement(
975 		        createBitCast(value, T(llvm::VectorType::get(T(Long::type()), 2, false))), Long::type(), 0),
976 		    createBitCast(ptr, Pointer<Long>::type()),
977 		    Long::type(), isVolatile, alignment, atomic, memoryOrder);
978 		return value;
979 	case Type_v2i16:
980 	case Type_v4i8:
981 		if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
982 		{
983 			createStore(
984 			    createExtractElement(createBitCast(value, Int4::type()), Int::type(), 0),
985 			    createBitCast(ptr, Pointer<Int>::type()),
986 			    Int::type(), isVolatile, alignment, atomic, memoryOrder);
987 			return value;
988 		}
989 		// Fallthrough to non-emulated case.
990 	case Type_LLVM:
991 		{
992 			auto elTy = T(type);
993 			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
994 
995 			if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
996 			{
997 				// Mark all memory writes as initialized by calling __msan_unpoison
998 				// void __msan_unpoison(const volatile void *a, size_t size)
999 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
1000 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1001 				auto voidPtrTy = i8Ty->getPointerTo();
1002 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1003 				auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1004 				auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1005 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1006 
1007 				jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
1008 				                                 llvm::ConstantInt::get(sizetTy, size) });
1009 			}
1010 
1011 			if(!atomic)
1012 			{
1013 				jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1014 			}
1015 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
1016 			{
1017 				// Integers and pointers can be atomically stored by setting
1018 				// the ordering constraint on the store instruction.
1019 				auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1020 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1021 			}
1022 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
1023 			{
1024 				// LLVM claims to support atomic stores of float types as
1025 				// above, but certain backends cannot deal with this.
1026 				// Store as a bitcast integer. See b/136037244.
1027 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1028 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
1029 				auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
1030 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
1031 				auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
1032 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1033 			}
1034 			else
1035 			{
1036 				// More exotic types require falling back to the extern:
1037 				// void __atomic_store(size_t size, void *ptr, void *val, int ordering)
1038 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1039 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
1040 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1041 				auto i8PtrTy = i8Ty->getPointerTo();
1042 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
1043 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1044 				auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
1045 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1046 				auto copy = allocateStackVariable(type);
1047 				jit->builder->CreateStore(V(value), V(copy));
1048 				jit->builder->CreateCall(func, {
1049 				                                   llvm::ConstantInt::get(sizetTy, size),
1050 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1051 				                                   jit->builder->CreatePointerCast(V(copy), i8PtrTy),
1052 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1053 				                               });
1054 			}
1055 
1056 			return value;
1057 		}
1058 	default:
1059 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1060 		return nullptr;
1061 	}
1062 }
1063 
1064 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1065 {
1066 	RR_DEBUG_INFO_UPDATE_LOC();
1067 
1068 	ASSERT(V(ptr)->getType()->isPointerTy());
1069 	ASSERT(V(mask)->getType()->isVectorTy());
1070 
1071 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1072 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1073 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1074 	auto elVecTy = llvm::VectorType::get(T(elTy), numEls, false);
1075 	auto elVecPtrTy = elVecTy->getPointerTo();
1076 	auto i8Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1077 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1078 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1079 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1080 	return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1081 }
1082 
1083 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
1084 {
1085 	RR_DEBUG_INFO_UPDATE_LOC();
1086 
1087 	ASSERT(V(ptr)->getType()->isPointerTy());
1088 	ASSERT(V(val)->getType()->isVectorTy());
1089 	ASSERT(V(mask)->getType()->isVectorTy());
1090 
1091 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1092 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1093 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1094 	auto elVecTy = V(val)->getType();
1095 	auto elVecPtrTy = elVecTy->getPointerTo();
1096 	auto i1Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1097 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1098 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
1099 	jit->builder->CreateCall(func, { V(val), V(ptr), align, i1Mask });
1100 
1101 	if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
1102 	{
1103 		// Mark memory writes as initialized by calling __msan_unpoison
1104 		// void __msan_unpoison(const volatile void *a, size_t size)
1105 		auto voidTy = llvm::Type::getVoidTy(*jit->context);
1106 		auto voidPtrTy = voidTy->getPointerTo();
1107 		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1108 		auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1109 		auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1110 		auto size = jit->module->getDataLayout().getTypeStoreSize(llvm::cast<llvm::VectorType>(elVecTy)->getElementType());
1111 
1112 		for(unsigned i = 0; i < numEls; i++)
1113 		{
1114 			// Check mask for this element
1115 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1116 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1117 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1118 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1119 			jit->builder->SetInsertPoint(thenBlock);
1120 
1121 			// Insert __msan_unpoison call in conditional block
1122 			auto elPtr = jit->builder->CreateGEP(elVecTy, V(ptr), idx);
1123 			jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(elPtr, voidPtrTy),
1124 			                                 llvm::ConstantInt::get(sizetTy, size) });
1125 
1126 			jit->builder->CreateBr(mergeBlock);
1127 			jit->builder->SetInsertPoint(mergeBlock);
1128 		}
1129 	}
1130 }
1131 
1132 static llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1133 {
1134 	ASSERT(base->getType()->isPointerTy());
1135 	ASSERT(offsets->getType()->isVectorTy());
1136 	ASSERT(mask->getType()->isVectorTy());
1137 
1138 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1139 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1140 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1141 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1142 	auto i8PtrTy = i8Ty->getPointerTo();
1143 	auto elPtrTy = elTy->getPointerTo();
1144 	auto elVecTy = llvm::VectorType::get(elTy, numEls, false);
1145 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1146 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1147 	auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1148 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1149 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1150 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1151 
1152 	if(!__has_feature(memory_sanitizer))
1153 	{
1154 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1155 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
1156 		return jit->builder->CreateCall(func, { elPtrs, align, i1Mask, passthrough });
1157 	}
1158 	else  // __has_feature(memory_sanitizer)
1159 	{
1160 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_gather
1161 		// Work around it by emulating gather with element-wise loads.
1162 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1163 
1164 		Value *result = Nucleus::allocateStackVariable(T(elVecTy));
1165 		Nucleus::createStore(V(passthrough), result, T(elVecTy));
1166 
1167 		for(unsigned i = 0; i < numEls; i++)
1168 		{
1169 			// Check mask for this element
1170 			Value *elementMask = Nucleus::createExtractElement(V(i1Mask), T(i1Ty), i);
1171 
1172 			If(RValue<Bool>(elementMask))
1173 			{
1174 				Value *elPtr = Nucleus::createExtractElement(V(elPtrs), T(elPtrTy), i);
1175 				Value *el = Nucleus::createLoad(elPtr, T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1176 
1177 				Value *v = Nucleus::createLoad(result, T(elVecTy));
1178 				v = Nucleus::createInsertElement(v, el, i);
1179 				Nucleus::createStore(v, result, T(elVecTy));
1180 			}
1181 		}
1182 
1183 		return V(Nucleus::createLoad(result, T(elVecTy)));
1184 	}
1185 }
1186 
1187 RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1188 {
1189 	return As<Float4>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1190 }
1191 
1192 RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1193 {
1194 	return As<Int4>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1195 }
1196 
1197 static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
1198 {
1199 	ASSERT(base->getType()->isPointerTy());
1200 	ASSERT(val->getType()->isVectorTy());
1201 	ASSERT(offsets->getType()->isVectorTy());
1202 	ASSERT(mask->getType()->isVectorTy());
1203 
1204 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1205 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1206 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1207 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1208 	auto i8PtrTy = i8Ty->getPointerTo();
1209 	auto elVecTy = val->getType();
1210 	auto elTy = llvm::cast<llvm::VectorType>(elVecTy)->getElementType();
1211 	auto elPtrTy = elTy->getPointerTo();
1212 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1213 
1214 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1215 	auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1216 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1217 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1218 
1219 	if(!__has_feature(memory_sanitizer))
1220 	{
1221 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1222 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
1223 		jit->builder->CreateCall(func, { val, elPtrs, align, i1Mask });
1224 	}
1225 	else  // __has_feature(memory_sanitizer)
1226 	{
1227 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_scatter
1228 		// Work around it by emulating scatter with element-wise stores.
1229 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1230 
1231 		for(unsigned i = 0; i < numEls; i++)
1232 		{
1233 			// Check mask for this element
1234 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1235 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1236 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1237 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1238 			jit->builder->SetInsertPoint(thenBlock);
1239 
1240 			auto el = jit->builder->CreateExtractElement(val, idx);
1241 			auto elPtr = jit->builder->CreateExtractElement(elPtrs, idx);
1242 			Nucleus::createStore(V(el), V(elPtr), T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1243 
1244 			jit->builder->CreateBr(mergeBlock);
1245 			jit->builder->SetInsertPoint(mergeBlock);
1246 		}
1247 	}
1248 }
1249 
1250 void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1251 {
1252 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1253 }
1254 
1255 void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1256 {
1257 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1258 }
1259 
1260 void Nucleus::createFence(std::memory_order memoryOrder)
1261 {
1262 	RR_DEBUG_INFO_UPDATE_LOC();
1263 	jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1264 }
1265 
1266 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1267 {
1268 	RR_DEBUG_INFO_UPDATE_LOC();
1269 	ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
1270 	if(sizeof(void *) == 8)
1271 	{
1272 		// LLVM manual: "When indexing into an array, pointer or vector,
1273 		// integers of any width are allowed, and they are not required to
1274 		// be constant. These integers are treated as signed values where
1275 		// relevant."
1276 		//
1277 		// Thus if we want indexes to be treated as unsigned we have to
1278 		// zero-extend them ourselves.
1279 		//
1280 		// Note that this is not because we want to address anywhere near
1281 		// 4 GB of data. Instead this is important for performance because
1282 		// x86 supports automatic zero-extending of 32-bit registers to
1283 		// 64-bit. Thus indexing into an array using a uint32 is
1284 		// actually faster than using an int32.
1285 		index = unsignedIndex ? createZExt(index, Long::type()) : createSExt(index, Long::type());
1286 	}
1287 
1288 	// For non-emulated types we can rely on LLVM's GEP to calculate the
1289 	// effective address correctly.
1290 	if(asInternalType(type) == Type_LLVM)
1291 	{
1292 		return V(jit->builder->CreateGEP(T(type), V(ptr), V(index)));
1293 	}
1294 
1295 	// For emulated types we have to multiply the index by the intended
1296 	// type size ourselves to obtain the byte offset.
1297 	index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));
1298 
1299 	// Cast to a byte pointer, apply the byte offset, and cast back to the
1300 	// original pointer type.
1301 	return createBitCast(
1302 	    V(jit->builder->CreateGEP(T(Byte::type()), V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::type()), 0)))), V(index))),
1303 	    T(llvm::PointerType::get(T(type), 0)));
1304 }
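// Illustrative note (added by the editor, not from the original source): for an emulated
// type such as Type_v2i32, typeSize() reports 8 bytes, so indexing element 3 becomes a
// byte offset of 3 * 8 = 24 applied through a Byte* GEP before casting back to the
// original pointer type.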
1305 
1306 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1307 {
1308 	RR_DEBUG_INFO_UPDATE_LOC();
1309 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value),
1310 #if LLVM_VERSION_MAJOR >= 11
1311 	                                       llvm::MaybeAlign(),
1312 #endif
1313 	                                       atomicOrdering(true, memoryOrder)));
1314 }
1315 
1316 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1317 {
1318 	RR_DEBUG_INFO_UPDATE_LOC();
1319 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value),
1320 #if LLVM_VERSION_MAJOR >= 11
1321 	                                       llvm::MaybeAlign(),
1322 #endif
1323 	                                       atomicOrdering(true, memoryOrder)));
1324 }
1325 
1326 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1327 {
1328 	RR_DEBUG_INFO_UPDATE_LOC();
1329 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value),
1330 #if LLVM_VERSION_MAJOR >= 11
1331 	                                       llvm::MaybeAlign(),
1332 #endif
1333 	                                       atomicOrdering(true, memoryOrder)));
1334 }
1335 
1336 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1337 {
1338 	RR_DEBUG_INFO_UPDATE_LOC();
1339 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value),
1340 #if LLVM_VERSION_MAJOR >= 11
1341 	                                       llvm::MaybeAlign(),
1342 #endif
1343 	                                       atomicOrdering(true, memoryOrder)));
1344 }
1345 
1346 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1347 {
1348 	RR_DEBUG_INFO_UPDATE_LOC();
1349 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value),
1350 #if LLVM_VERSION_MAJOR >= 11
1351 	                                       llvm::MaybeAlign(),
1352 #endif
1353 	                                       atomicOrdering(true, memoryOrder)));
1354 }
1355 
1356 Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1357 {
1358 	RR_DEBUG_INFO_UPDATE_LOC();
1359 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value),
1360 #if LLVM_VERSION_MAJOR >= 11
1361 	                                       llvm::MaybeAlign(),
1362 #endif
1363 	                                       atomicOrdering(true, memoryOrder)));
1364 }
1365 
1366 Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1367 {
1368 	RR_DEBUG_INFO_UPDATE_LOC();
1369 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value),
1370 #if LLVM_VERSION_MAJOR >= 11
1371 	                                       llvm::MaybeAlign(),
1372 #endif
1373 	                                       atomicOrdering(true, memoryOrder)));
1374 }
1375 
createAtomicUMin(Value * ptr,Value * value,std::memory_order memoryOrder)1376 Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1377 {
1378 	RR_DEBUG_INFO_UPDATE_LOC();
1379 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value),
1380 #if LLVM_VERSION_MAJOR >= 11
1381 	                                       llvm::MaybeAlign(),
1382 #endif
1383 	                                       atomicOrdering(true, memoryOrder)));
1384 }
1385 
createAtomicUMax(Value * ptr,Value * value,std::memory_order memoryOrder)1386 Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1387 {
1388 	RR_DEBUG_INFO_UPDATE_LOC();
1389 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value),
1390 #if LLVM_VERSION_MAJOR >= 11
1391 	                                       llvm::MaybeAlign(),
1392 #endif
1393 	                                       atomicOrdering(true, memoryOrder)));
1394 }
1395 
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1396 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1397 {
1398 	RR_DEBUG_INFO_UPDATE_LOC();
1399 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value),
1400 #if LLVM_VERSION_MAJOR >= 11
1401 	                                       llvm::MaybeAlign(),
1402 #endif
1403 	                                       atomicOrdering(true, memoryOrder)));
1404 }
1405 
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1406 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1407 {
1408 	RR_DEBUG_INFO_UPDATE_LOC();
1409 	// Note: the cmpxchg instruction (llvm::AtomicCmpXchgInst) returns a two-member struct {original value, success flag}, not the value directly; element 0 is extracted below.
1410 	return V(jit->builder->CreateExtractValue(
1411 	    jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value),
1412 #if LLVM_VERSION_MAJOR >= 11
1413 	                                      llvm::MaybeAlign(),
1414 #endif
1415 	                                      atomicOrdering(true, memoryOrderEqual),
1416 	                                      atomicOrdering(true, memoryOrderUnequal)),
1417 	    llvm::ArrayRef<unsigned>(0u)));
1418 }
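// Illustrative note (not part of the original source): the value returned above has the same
// semantics as a strong compare-exchange that reports the previous contents; a rough C++-level
// sketch (names hypothetical, memory orders simplified):
//
//   template<typename T>
//   T compareExchange(std::atomic<T> &object, T expected, T desired)
//   {
//       object.compare_exchange_strong(expected, desired);  // on failure, 'expected' is overwritten with the current value
//       return expected;                                     // previous value; the success flag is discarded, as above
//   }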
1419 
createTrunc(Value * v,Type * destType)1420 Value *Nucleus::createTrunc(Value *v, Type *destType)
1421 {
1422 	RR_DEBUG_INFO_UPDATE_LOC();
1423 	return V(jit->builder->CreateTrunc(V(v), T(destType)));
1424 }
1425 
createZExt(Value * v,Type * destType)1426 Value *Nucleus::createZExt(Value *v, Type *destType)
1427 {
1428 	RR_DEBUG_INFO_UPDATE_LOC();
1429 	return V(jit->builder->CreateZExt(V(v), T(destType)));
1430 }
1431 
createSExt(Value * v,Type * destType)1432 Value *Nucleus::createSExt(Value *v, Type *destType)
1433 {
1434 	RR_DEBUG_INFO_UPDATE_LOC();
1435 	return V(jit->builder->CreateSExt(V(v), T(destType)));
1436 }
1437 
createFPToUI(Value * v,Type * destType)1438 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1439 {
1440 	RR_DEBUG_INFO_UPDATE_LOC();
1441 	return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1442 }
1443 
createFPToSI(Value * v,Type * destType)1444 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1445 {
1446 	RR_DEBUG_INFO_UPDATE_LOC();
1447 	return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1448 }
1449 
createSIToFP(Value * v,Type * destType)1450 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1451 {
1452 	RR_DEBUG_INFO_UPDATE_LOC();
1453 	return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1454 }
1455 
createFPTrunc(Value * v,Type * destType)1456 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1457 {
1458 	RR_DEBUG_INFO_UPDATE_LOC();
1459 	return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1460 }
1461 
createFPExt(Value * v,Type * destType)1462 Value *Nucleus::createFPExt(Value *v, Type *destType)
1463 {
1464 	RR_DEBUG_INFO_UPDATE_LOC();
1465 	return V(jit->builder->CreateFPExt(V(v), T(destType)));
1466 }
1467 
createBitCast(Value * v,Type * destType)1468 Value *Nucleus::createBitCast(Value *v, Type *destType)
1469 {
1470 	RR_DEBUG_INFO_UPDATE_LOC();
1471 	// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1472 	// support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1473 	// reading back as the destination type.
1474 	if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1475 	{
1476 		Value *readAddress = allocateStackVariable(destType);
1477 		Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1478 		createStore(v, writeAddress, T(V(v)->getType()));
1479 		return createLoad(readAddress, destType);
1480 	}
1481 	else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1482 	{
1483 		Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1484 		createStore(v, writeAddress, T(V(v)->getType()));
1485 		Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1486 		return createLoad(readAddress, destType);
1487 	}
1488 
1489 	return V(jit->builder->CreateBitCast(V(v), T(destType)));
1490 }
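// Illustrative note (not part of the original source): the stack round-trip above is the IR-level
// analogue of memcpy-based type punning, e.g. for a 32-bit scalar reinterpreted as a 4 x i32 vector
// (hypothetical host-side sketch):
//
//   uint32_t scalar = ...;
//   uint32_t lanes[4];                            // stack slot of the destination (vector) type
//   std::memcpy(lanes, &scalar, sizeof(scalar));  // lanes beyond the scalar remain unspecified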
1491 
createICmpEQ(Value * lhs,Value * rhs)1492 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1493 {
1494 	RR_DEBUG_INFO_UPDATE_LOC();
1495 	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1496 }
1497 
createICmpNE(Value * lhs,Value * rhs)1498 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1499 {
1500 	RR_DEBUG_INFO_UPDATE_LOC();
1501 	return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1502 }
1503 
createICmpUGT(Value * lhs,Value * rhs)1504 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1505 {
1506 	RR_DEBUG_INFO_UPDATE_LOC();
1507 	return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1508 }
1509 
createICmpUGE(Value * lhs,Value * rhs)1510 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1511 {
1512 	RR_DEBUG_INFO_UPDATE_LOC();
1513 	return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1514 }
1515 
createICmpULT(Value * lhs,Value * rhs)1516 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1517 {
1518 	RR_DEBUG_INFO_UPDATE_LOC();
1519 	return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1520 }
1521 
createICmpULE(Value * lhs,Value * rhs)1522 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1523 {
1524 	RR_DEBUG_INFO_UPDATE_LOC();
1525 	return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1526 }
1527 
createICmpSGT(Value * lhs,Value * rhs)1528 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1529 {
1530 	RR_DEBUG_INFO_UPDATE_LOC();
1531 	return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1532 }
1533 
createICmpSGE(Value * lhs,Value * rhs)1534 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1535 {
1536 	RR_DEBUG_INFO_UPDATE_LOC();
1537 	return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1538 }
1539 
createICmpSLT(Value * lhs,Value * rhs)1540 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1541 {
1542 	RR_DEBUG_INFO_UPDATE_LOC();
1543 	return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1544 }
1545 
createICmpSLE(Value * lhs,Value * rhs)1546 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1547 {
1548 	RR_DEBUG_INFO_UPDATE_LOC();
1549 	return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1550 }
1551 
createFCmpOEQ(Value * lhs,Value * rhs)1552 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1553 {
1554 	RR_DEBUG_INFO_UPDATE_LOC();
1555 	return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1556 }
1557 
createFCmpOGT(Value * lhs,Value * rhs)1558 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1559 {
1560 	RR_DEBUG_INFO_UPDATE_LOC();
1561 	return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1562 }
1563 
createFCmpOGE(Value * lhs,Value * rhs)1564 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1565 {
1566 	RR_DEBUG_INFO_UPDATE_LOC();
1567 	return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1568 }
1569 
createFCmpOLT(Value * lhs,Value * rhs)1570 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1571 {
1572 	RR_DEBUG_INFO_UPDATE_LOC();
1573 	return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1574 }
1575 
createFCmpOLE(Value * lhs,Value * rhs)1576 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1577 {
1578 	RR_DEBUG_INFO_UPDATE_LOC();
1579 	return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1580 }
1581 
createFCmpONE(Value * lhs,Value * rhs)1582 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1583 {
1584 	RR_DEBUG_INFO_UPDATE_LOC();
1585 	return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1586 }
1587 
createFCmpORD(Value * lhs,Value * rhs)1588 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1589 {
1590 	RR_DEBUG_INFO_UPDATE_LOC();
1591 	return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1592 }
1593 
createFCmpUNO(Value * lhs,Value * rhs)1594 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1595 {
1596 	RR_DEBUG_INFO_UPDATE_LOC();
1597 	return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1598 }
1599 
createFCmpUEQ(Value * lhs,Value * rhs)1600 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1601 {
1602 	RR_DEBUG_INFO_UPDATE_LOC();
1603 	return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1604 }
1605 
createFCmpUGT(Value * lhs,Value * rhs)1606 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1607 {
1608 	RR_DEBUG_INFO_UPDATE_LOC();
1609 	return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1610 }
1611 
createFCmpUGE(Value * lhs,Value * rhs)1612 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1613 {
1614 	RR_DEBUG_INFO_UPDATE_LOC();
1615 	return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1616 }
1617 
createFCmpULT(Value * lhs,Value * rhs)1618 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1619 {
1620 	RR_DEBUG_INFO_UPDATE_LOC();
1621 	return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1622 }
1623 
createFCmpULE(Value * lhs,Value * rhs)1624 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1625 {
1626 	RR_DEBUG_INFO_UPDATE_LOC();
1627 	return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1628 }
1629 
createFCmpUNE(Value * lhs,Value * rhs)1630 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1631 {
1632 	RR_DEBUG_INFO_UPDATE_LOC();
1633 	return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1634 }
1635 
createExtractElement(Value * vector,Type * type,int index)1636 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1637 {
1638 	RR_DEBUG_INFO_UPDATE_LOC();
1639 	ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
1640 	return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1641 }
1642 
createInsertElement(Value * vector,Value * element,int index)1643 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1644 {
1645 	RR_DEBUG_INFO_UPDATE_LOC();
1646 	return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1647 }
1648 
createShuffleVector(Value * v1,Value * v2,const int * select)1649 Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
1650 {
1651 	RR_DEBUG_INFO_UPDATE_LOC();
1652 
1653 	int size = llvm::cast<llvm::FixedVectorType>(V(v1)->getType())->getNumElements();
1654 	llvm::SmallVector<int, 16> mask;
1655 	for(int i = 0; i < size; i++)
1656 	{
1657 		mask.push_back(select[i]);
1658 	}
1659 
1660 	return V(lowerShuffleVector(V(v1), V(v2), mask));
1661 }
1662 
createSelect(Value * c,Value * ifTrue,Value * ifFalse)1663 Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1664 {
1665 	RR_DEBUG_INFO_UPDATE_LOC();
1666 	return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1667 }
1668 
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1669 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1670 {
1671 	RR_DEBUG_INFO_UPDATE_LOC();
1672 	return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1673 }
1674 
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1675 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1676 {
1677 	RR_DEBUG_INFO_UPDATE_LOC();
1678 	llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1679 	sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), label, true), B(branch));
1680 }
1681 
createUnreachable()1682 void Nucleus::createUnreachable()
1683 {
1684 	RR_DEBUG_INFO_UPDATE_LOC();
1685 	jit->builder->CreateUnreachable();
1686 }
1687 
getType(Value * value)1688 Type *Nucleus::getType(Value *value)
1689 {
1690 	return T(V(value)->getType());
1691 }
1692 
getContainedType(Type * vectorType)1693 Type *Nucleus::getContainedType(Type *vectorType)
1694 {
1695 	return T(T(vectorType)->getContainedType(0));
1696 }
1697 
getPointerType(Type * ElementType)1698 Type *Nucleus::getPointerType(Type *ElementType)
1699 {
1700 	return T(llvm::PointerType::get(T(ElementType), 0));
1701 }
1702 
getNaturalIntType()1703 static llvm::Type *getNaturalIntType()
1704 {
1705 	return llvm::Type::getIntNTy(*jit->context, sizeof(int) * 8);
1706 }
1707 
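// The storage types below follow printf-style conventions: integer values are stored at the
// natural int width and float is widened to double (as in C's default argument promotion of
// float to double); unhandled types fall through to UNIMPLEMENTED.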
getPrintfStorageType(Type * valueType)1708 Type *Nucleus::getPrintfStorageType(Type *valueType)
1709 {
1710 	llvm::Type *valueTy = T(valueType);
1711 	if(valueTy->isIntegerTy())
1712 	{
1713 		return T(getNaturalIntType());
1714 	}
1715 	if(valueTy->isFloatTy())
1716 	{
1717 		return T(llvm::Type::getDoubleTy(*jit->context));
1718 	}
1719 
1720 	UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
1721 	return {};
1722 }
1723 
createNullValue(Type * Ty)1724 Value *Nucleus::createNullValue(Type *Ty)
1725 {
1726 	RR_DEBUG_INFO_UPDATE_LOC();
1727 	return V(llvm::Constant::getNullValue(T(Ty)));
1728 }
1729 
createConstantLong(int64_t i)1730 Value *Nucleus::createConstantLong(int64_t i)
1731 {
1732 	RR_DEBUG_INFO_UPDATE_LOC();
1733 	return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), i, true));
1734 }
1735 
createConstantInt(int i)1736 Value *Nucleus::createConstantInt(int i)
1737 {
1738 	RR_DEBUG_INFO_UPDATE_LOC();
1739 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, true));
1740 }
1741 
createConstantInt(unsigned int i)1742 Value *Nucleus::createConstantInt(unsigned int i)
1743 {
1744 	RR_DEBUG_INFO_UPDATE_LOC();
1745 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, false));
1746 }
1747 
createConstantBool(bool b)1748 Value *Nucleus::createConstantBool(bool b)
1749 {
1750 	RR_DEBUG_INFO_UPDATE_LOC();
1751 	return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*jit->context), b));
1752 }
1753 
createConstantByte(signed char i)1754 Value *Nucleus::createConstantByte(signed char i)
1755 {
1756 	RR_DEBUG_INFO_UPDATE_LOC();
1757 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, true));
1758 }
1759 
createConstantByte(unsigned char i)1760 Value *Nucleus::createConstantByte(unsigned char i)
1761 {
1762 	RR_DEBUG_INFO_UPDATE_LOC();
1763 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, false));
1764 }
1765 
createConstantShort(short i)1766 Value *Nucleus::createConstantShort(short i)
1767 {
1768 	RR_DEBUG_INFO_UPDATE_LOC();
1769 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, true));
1770 }
1771 
createConstantShort(unsigned short i)1772 Value *Nucleus::createConstantShort(unsigned short i)
1773 {
1774 	RR_DEBUG_INFO_UPDATE_LOC();
1775 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, false));
1776 }
1777 
createConstantFloat(float x)1778 Value *Nucleus::createConstantFloat(float x)
1779 {
1780 	RR_DEBUG_INFO_UPDATE_LOC();
1781 	return V(llvm::ConstantFP::get(T(Float::type()), x));
1782 }
1783 
createNullPointer(Type * Ty)1784 Value *Nucleus::createNullPointer(Type *Ty)
1785 {
1786 	RR_DEBUG_INFO_UPDATE_LOC();
1787 	return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1788 }
1789 
createConstantVector(const int64_t * constants,Type * type)1790 Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
1791 {
1792 	RR_DEBUG_INFO_UPDATE_LOC();
1793 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1794 	const int numConstants = elementCount(type);                                           // Number of provided constants for the (emulated) type.
1795 	const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1796 	ASSERT(numElements <= 16 && numConstants <= numElements);
1797 	llvm::Constant *constantVector[16];
1798 
1799 	for(int i = 0; i < numElements; i++)
1800 	{
1801 		constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
1802 	}
1803 
1804 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1805 }
1806 
createConstantVector(const double * constants,Type * type)1807 Value *Nucleus::createConstantVector(const double *constants, Type *type)
1808 {
1809 	RR_DEBUG_INFO_UPDATE_LOC();
1810 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1811 	const int numConstants = elementCount(type);                                           // Number of provided constants for the (emulated) type.
1812 	const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1813 	ASSERT(numElements <= 8 && numConstants <= numElements);
1814 	llvm::Constant *constantVector[8];
1815 
1816 	for(int i = 0; i < numElements; i++)
1817 	{
1818 		constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
1819 	}
1820 
1821 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1822 }
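// In both overloads above, numConstants can be smaller than numElements when the Reactor type is
// emulated by a wider underlying vector; the i % numConstants indexing then simply repeats the
// provided constants to fill the remaining lanes.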
1823 
createConstantString(const char * v)1824 Value *Nucleus::createConstantString(const char *v)
1825 {
1826 	// NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
1827 	auto ptr = jit->builder->CreateGlobalStringPtr(v);
1828 	return V(ptr);
1829 }
1830 
setOptimizerCallback(OptimizerCallback * callback)1831 void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
1832 {
1833 	// The LLVM backend does not produce optimizer reports.
1834 	(void)callback;
1835 }
1836 
type()1837 Type *Void::type()
1838 {
1839 	return T(llvm::Type::getVoidTy(*jit->context));
1840 }
1841 
type()1842 Type *Bool::type()
1843 {
1844 	return T(llvm::Type::getInt1Ty(*jit->context));
1845 }
1846 
type()1847 Type *Byte::type()
1848 {
1849 	return T(llvm::Type::getInt8Ty(*jit->context));
1850 }
1851 
type()1852 Type *SByte::type()
1853 {
1854 	return T(llvm::Type::getInt8Ty(*jit->context));
1855 }
1856 
type()1857 Type *Short::type()
1858 {
1859 	return T(llvm::Type::getInt16Ty(*jit->context));
1860 }
1861 
type()1862 Type *UShort::type()
1863 {
1864 	return T(llvm::Type::getInt16Ty(*jit->context));
1865 }
1866 
type()1867 Type *Byte4::type()
1868 {
1869 	return T(Type_v4i8);
1870 }
1871 
type()1872 Type *SByte4::type()
1873 {
1874 	return T(Type_v4i8);
1875 }
1876 
AddSat(RValue<Byte8> x,RValue<Byte8> y)1877 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1878 {
1879 	RR_DEBUG_INFO_UPDATE_LOC();
1880 #if defined(__i386__) || defined(__x86_64__)
1881 	return x86::paddusb(x, y);
1882 #else
1883 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
1884 #endif
1885 }
1886 
SubSat(RValue<Byte8> x,RValue<Byte8> y)1887 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1888 {
1889 	RR_DEBUG_INFO_UPDATE_LOC();
1890 #if defined(__i386__) || defined(__x86_64__)
1891 	return x86::psubusb(x, y);
1892 #else
1893 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
1894 #endif
1895 }
1896 
SignMask(RValue<Byte8> x)1897 RValue<Int> SignMask(RValue<Byte8> x)
1898 {
1899 	RR_DEBUG_INFO_UPDATE_LOC();
1900 #if defined(__i386__) || defined(__x86_64__)
1901 	return x86::pmovmskb(x);
1902 #else
1903 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1904 #endif
1905 }
1906 
1907 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1908 //	{
1909 //#if defined(__i386__) || defined(__x86_64__)
1910 //		return x86::pcmpgtb(x, y);   // FIXME: Signedness
1911 //#else
1912 //		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1913 //#endif
1914 //	}
1915 
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)1916 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1917 {
1918 	RR_DEBUG_INFO_UPDATE_LOC();
1919 #if defined(__i386__) || defined(__x86_64__)
1920 	return x86::pcmpeqb(x, y);
1921 #else
1922 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1923 #endif
1924 }
1925 
type()1926 Type *Byte8::type()
1927 {
1928 	return T(Type_v8i8);
1929 }
1930 
AddSat(RValue<SByte8> x,RValue<SByte8> y)1931 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1932 {
1933 	RR_DEBUG_INFO_UPDATE_LOC();
1934 #if defined(__i386__) || defined(__x86_64__)
1935 	return x86::paddsb(x, y);
1936 #else
1937 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
1938 #endif
1939 }
1940 
SubSat(RValue<SByte8> x,RValue<SByte8> y)1941 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1942 {
1943 	RR_DEBUG_INFO_UPDATE_LOC();
1944 #if defined(__i386__) || defined(__x86_64__)
1945 	return x86::psubsb(x, y);
1946 #else
1947 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
1948 #endif
1949 }
1950 
SignMask(RValue<SByte8> x)1951 RValue<Int> SignMask(RValue<SByte8> x)
1952 {
1953 	RR_DEBUG_INFO_UPDATE_LOC();
1954 #if defined(__i386__) || defined(__x86_64__)
1955 	return x86::pmovmskb(As<Byte8>(x));
1956 #else
1957 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1958 #endif
1959 }
1960 
CmpGT(RValue<SByte8> x,RValue<SByte8> y)1961 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1962 {
1963 	RR_DEBUG_INFO_UPDATE_LOC();
1964 #if defined(__i386__) || defined(__x86_64__)
1965 	return x86::pcmpgtb(x, y);
1966 #else
1967 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1968 #endif
1969 }
1970 
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)1971 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1972 {
1973 	RR_DEBUG_INFO_UPDATE_LOC();
1974 #if defined(__i386__) || defined(__x86_64__)
1975 	return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1976 #else
1977 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1978 #endif
1979 }
1980 
type()1981 Type *SByte8::type()
1982 {
1983 	return T(Type_v8i8);
1984 }
1985 
type()1986 Type *Byte16::type()
1987 {
1988 	return T(llvm::VectorType::get(T(Byte::type()), 16, false));
1989 }
1990 
type()1991 Type *SByte16::type()
1992 {
1993 	return T(llvm::VectorType::get(T(SByte::type()), 16, false));
1994 }
1995 
type()1996 Type *Short2::type()
1997 {
1998 	return T(Type_v2i16);
1999 }
2000 
type()2001 Type *UShort2::type()
2002 {
2003 	return T(Type_v2i16);
2004 }
2005 
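// Short4(Int4) narrows by truncation: the shuffle keeps the low 16-bit half of every 32-bit lane
// (the even elements, assuming little-endian lane order), so out-of-range values wrap rather than
// saturate.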
Short4(RValue<Int4> cast)2006 Short4::Short4(RValue<Int4> cast)
2007 {
2008 	RR_DEBUG_INFO_UPDATE_LOC();
2009 	int select[8] = { 0, 2, 4, 6, 0, 2, 4, 6 };
2010 	Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
2011 
2012 	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
2013 	Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value();
2014 
2015 	storeValue(short4);
2016 }
2017 
2018 //	Short4::Short4(RValue<Float> cast)
2019 //	{
2020 //	}
2021 
Short4(RValue<Float4> cast)2022 Short4::Short4(RValue<Float4> cast)
2023 {
2024 	RR_DEBUG_INFO_UPDATE_LOC();
2025 	Int4 v4i32 = Int4(cast);
2026 #if defined(__i386__) || defined(__x86_64__)
2027 	v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2028 #else
2029 	Value *v = v4i32.loadValue();
2030 	v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
2031 #endif
2032 
2033 	storeValue(As<Short4>(Int2(v4i32)).value());
2034 }
2035 
operator <<(RValue<Short4> lhs,unsigned char rhs)2036 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2037 {
2038 	RR_DEBUG_INFO_UPDATE_LOC();
2039 #if defined(__i386__) || defined(__x86_64__)
2040 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2041 
2042 	return x86::psllw(lhs, rhs);
2043 #else
2044 	return As<Short4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2045 #endif
2046 }
2047 
operator >>(RValue<Short4> lhs,unsigned char rhs)2048 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2049 {
2050 	RR_DEBUG_INFO_UPDATE_LOC();
2051 #if defined(__i386__) || defined(__x86_64__)
2052 	return x86::psraw(lhs, rhs);
2053 #else
2054 	return As<Short4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2055 #endif
2056 }
2057 
Max(RValue<Short4> x,RValue<Short4> y)2058 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2059 {
2060 	RR_DEBUG_INFO_UPDATE_LOC();
2061 #if defined(__i386__) || defined(__x86_64__)
2062 	return x86::pmaxsw(x, y);
2063 #else
2064 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
2065 #endif
2066 }
2067 
Min(RValue<Short4> x,RValue<Short4> y)2068 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2069 {
2070 	RR_DEBUG_INFO_UPDATE_LOC();
2071 #if defined(__i386__) || defined(__x86_64__)
2072 	return x86::pminsw(x, y);
2073 #else
2074 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
2075 #endif
2076 }
2077 
AddSat(RValue<Short4> x,RValue<Short4> y)2078 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2079 {
2080 	RR_DEBUG_INFO_UPDATE_LOC();
2081 #if defined(__i386__) || defined(__x86_64__)
2082 	return x86::paddsw(x, y);
2083 #else
2084 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
2085 #endif
2086 }
2087 
SubSat(RValue<Short4> x,RValue<Short4> y)2088 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2089 {
2090 	RR_DEBUG_INFO_UPDATE_LOC();
2091 #if defined(__i386__) || defined(__x86_64__)
2092 	return x86::psubsw(x, y);
2093 #else
2094 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
2095 #endif
2096 }
2097 
MulHigh(RValue<Short4> x,RValue<Short4> y)2098 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2099 {
2100 	RR_DEBUG_INFO_UPDATE_LOC();
2101 #if defined(__i386__) || defined(__x86_64__)
2102 	return x86::pmulhw(x, y);
2103 #else
2104 	return As<Short4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2105 #endif
2106 }
2107 
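// MulAdd has pmaddwd semantics: adjacent pairs of signed 16-bit products are summed into one
// 32-bit lane, i.e. result[i] = x[2i] * y[2i] + x[2i+1] * y[2i+1].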
MulAdd(RValue<Short4> x,RValue<Short4> y)2108 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2109 {
2110 	RR_DEBUG_INFO_UPDATE_LOC();
2111 #if defined(__i386__) || defined(__x86_64__)
2112 	return x86::pmaddwd(x, y);
2113 #else
2114 	return As<Int2>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2115 #endif
2116 }
2117 
PackSigned(RValue<Short4> x,RValue<Short4> y)2118 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2119 {
2120 	RR_DEBUG_INFO_UPDATE_LOC();
2121 #if defined(__i386__) || defined(__x86_64__)
2122 	auto result = x86::packsswb(x, y);
2123 #else
2124 	auto result = V(lowerPack(V(x.value()), V(y.value()), true));
2125 #endif
2126 	return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
2127 }
2128 
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2129 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2130 {
2131 	RR_DEBUG_INFO_UPDATE_LOC();
2132 #if defined(__i386__) || defined(__x86_64__)
2133 	auto result = x86::packuswb(x, y);
2134 #else
2135 	auto result = V(lowerPack(V(x.value()), V(y.value()), false));
2136 #endif
2137 	return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
2138 }
2139 
CmpGT(RValue<Short4> x,RValue<Short4> y)2140 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2141 {
2142 	RR_DEBUG_INFO_UPDATE_LOC();
2143 #if defined(__i386__) || defined(__x86_64__)
2144 	return x86::pcmpgtw(x, y);
2145 #else
2146 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
2147 #endif
2148 }
2149 
CmpEQ(RValue<Short4> x,RValue<Short4> y)2150 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2151 {
2152 	RR_DEBUG_INFO_UPDATE_LOC();
2153 #if defined(__i386__) || defined(__x86_64__)
2154 	return x86::pcmpeqw(x, y);
2155 #else
2156 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
2157 #endif
2158 }
2159 
type()2160 Type *Short4::type()
2161 {
2162 	return T(Type_v4i16);
2163 }
2164 
UShort4(RValue<Float4> cast,bool saturate)2165 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2166 {
2167 	RR_DEBUG_INFO_UPDATE_LOC();
2168 	if(saturate)
2169 	{
2170 #if defined(__i386__) || defined(__x86_64__)
2171 		if(CPUID::supportsSSE4_1())
2172 		{
2173 			Int4 int4(Min(cast, Float4(0xFFFF)));  // packusdw takes care of 0x0000 saturation
2174 			*this = As<Short4>(PackUnsigned(int4, int4));
2175 		}
2176 		else
2177 #endif
2178 		{
2179 			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2180 		}
2181 	}
2182 	else
2183 	{
2184 		*this = Short4(Int4(cast));
2185 	}
2186 }
2187 
operator <<(RValue<UShort4> lhs,unsigned char rhs)2188 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2189 {
2190 	RR_DEBUG_INFO_UPDATE_LOC();
2191 #if defined(__i386__) || defined(__x86_64__)
2192 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2193 
2194 	return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
2195 #else
2196 	return As<UShort4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2197 #endif
2198 }
2199 
operator >>(RValue<UShort4> lhs,unsigned char rhs)2200 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2201 {
2202 	RR_DEBUG_INFO_UPDATE_LOC();
2203 #if defined(__i386__) || defined(__x86_64__)
2204 	//	return RValue<Short4>(Nucleus::createLShr(lhs.value(), rhs.value()));
2205 
2206 	return x86::psrlw(lhs, rhs);
2207 #else
2208 	return As<UShort4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2209 #endif
2210 }
2211 
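// The unsigned Max/Min below reuse the signed Short4 Max/Min (pmaxsw/pminsw on x86, which have no
// 16-bit unsigned counterpart before SSE4.1) by biasing both operands with 0x8000: subtracting the
// bias makes unsigned order match signed order, and the bias is added back afterwards.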
Max(RValue<UShort4> x,RValue<UShort4> y)2212 RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2213 {
2214 	RR_DEBUG_INFO_UPDATE_LOC();
2215 	return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2216 }
2217 
Min(RValue<UShort4> x,RValue<UShort4> y)2218 RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2219 {
2220 	RR_DEBUG_INFO_UPDATE_LOC();
2221 	return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2222 }
2223 
AddSat(RValue<UShort4> x,RValue<UShort4> y)2224 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2225 {
2226 	RR_DEBUG_INFO_UPDATE_LOC();
2227 #if defined(__i386__) || defined(__x86_64__)
2228 	return x86::paddusw(x, y);
2229 #else
2230 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
2231 #endif
2232 }
2233 
SubSat(RValue<UShort4> x,RValue<UShort4> y)2234 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2235 {
2236 	RR_DEBUG_INFO_UPDATE_LOC();
2237 #if defined(__i386__) || defined(__x86_64__)
2238 	return x86::psubusw(x, y);
2239 #else
2240 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
2241 #endif
2242 }
2243 
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2244 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2245 {
2246 	RR_DEBUG_INFO_UPDATE_LOC();
2247 #if defined(__i386__) || defined(__x86_64__)
2248 	return x86::pmulhuw(x, y);
2249 #else
2250 	return As<UShort4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2251 #endif
2252 }
2253 
Average(RValue<UShort4> x,RValue<UShort4> y)2254 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2255 {
2256 	RR_DEBUG_INFO_UPDATE_LOC();
2257 #if defined(__i386__) || defined(__x86_64__)
2258 	return x86::pavgw(x, y);
2259 #else
2260 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
2261 #endif
2262 }
2263 
type()2264 Type *UShort4::type()
2265 {
2266 	return T(Type_v4i16);
2267 }
2268 
operator <<(RValue<Short8> lhs,unsigned char rhs)2269 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2270 {
2271 	RR_DEBUG_INFO_UPDATE_LOC();
2272 #if defined(__i386__) || defined(__x86_64__)
2273 	return x86::psllw(lhs, rhs);
2274 #else
2275 	return As<Short8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2276 #endif
2277 }
2278 
operator >>(RValue<Short8> lhs,unsigned char rhs)2279 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2280 {
2281 	RR_DEBUG_INFO_UPDATE_LOC();
2282 #if defined(__i386__) || defined(__x86_64__)
2283 	return x86::psraw(lhs, rhs);
2284 #else
2285 	return As<Short8>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2286 #endif
2287 }
2288 
MulAdd(RValue<Short8> x,RValue<Short8> y)2289 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2290 {
2291 	RR_DEBUG_INFO_UPDATE_LOC();
2292 #if defined(__i386__) || defined(__x86_64__)
2293 	return x86::pmaddwd(x, y);
2294 #else
2295 	return As<Int4>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2296 #endif
2297 }
2298 
MulHigh(RValue<Short8> x,RValue<Short8> y)2299 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2300 {
2301 	RR_DEBUG_INFO_UPDATE_LOC();
2302 #if defined(__i386__) || defined(__x86_64__)
2303 	return x86::pmulhw(x, y);
2304 #else
2305 	return As<Short8>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2306 #endif
2307 }
2308 
type()2309 Type *Short8::type()
2310 {
2311 	return T(llvm::VectorType::get(T(Short::type()), 8, false));
2312 }
2313 
operator <<(RValue<UShort8> lhs,unsigned char rhs)2314 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2315 {
2316 	RR_DEBUG_INFO_UPDATE_LOC();
2317 #if defined(__i386__) || defined(__x86_64__)
2318 	return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2319 #else
2320 	return As<UShort8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2321 #endif
2322 }
2323 
operator >>(RValue<UShort8> lhs,unsigned char rhs)2324 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2325 {
2326 	RR_DEBUG_INFO_UPDATE_LOC();
2327 #if defined(__i386__) || defined(__x86_64__)
2328 	return x86::psrlw(lhs, rhs);  // FIXME: Fallback required
2329 #else
2330 	return As<UShort8>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2331 #endif
2332 }
2333 
MulHigh(RValue<UShort8> x,RValue<UShort8> y)2334 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2335 {
2336 	RR_DEBUG_INFO_UPDATE_LOC();
2337 #if defined(__i386__) || defined(__x86_64__)
2338 	return x86::pmulhuw(x, y);
2339 #else
2340 	return As<UShort8>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2341 #endif
2342 }
2343 
type()2344 Type *UShort8::type()
2345 {
2346 	return T(llvm::VectorType::get(T(UShort::type()), 8, false));
2347 }
2348 
operator ++(Int & val,int)2349 RValue<Int> operator++(Int &val, int)  // Post-increment
2350 {
2351 	RR_DEBUG_INFO_UPDATE_LOC();
2352 	RValue<Int> res = val;
2353 
2354 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2355 	val.storeValue(inc);
2356 
2357 	return res;
2358 }
2359 
operator ++(Int & val)2360 const Int &operator++(Int &val)  // Pre-increment
2361 {
2362 	RR_DEBUG_INFO_UPDATE_LOC();
2363 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2364 	val.storeValue(inc);
2365 
2366 	return val;
2367 }
2368 
operator --(Int & val,int)2369 RValue<Int> operator--(Int &val, int)  // Post-decrement
2370 {
2371 	RR_DEBUG_INFO_UPDATE_LOC();
2372 	RValue<Int> res = val;
2373 
2374 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2375 	val.storeValue(inc);
2376 
2377 	return res;
2378 }
2379 
operator --(Int & val)2380 const Int &operator--(Int &val)  // Pre-decrement
2381 {
2382 	RR_DEBUG_INFO_UPDATE_LOC();
2383 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2384 	val.storeValue(inc);
2385 
2386 	return val;
2387 }
2388 
RoundInt(RValue<Float> cast)2389 RValue<Int> RoundInt(RValue<Float> cast)
2390 {
2391 	RR_DEBUG_INFO_UPDATE_LOC();
2392 #if defined(__i386__) || defined(__x86_64__)
2393 	return x86::cvtss2si(cast);
2394 #else
2395 	return RValue<Int>(V(lowerRoundInt(V(cast.value()), T(Int::type()))));
2396 #endif
2397 }
2398 
type()2399 Type *Int::type()
2400 {
2401 	return T(llvm::Type::getInt32Ty(*jit->context));
2402 }
2403 
type()2404 Type *Long::type()
2405 {
2406 	return T(llvm::Type::getInt64Ty(*jit->context));
2407 }
2408 
UInt(RValue<Float> cast)2409 UInt::UInt(RValue<Float> cast)
2410 {
2411 	RR_DEBUG_INFO_UPDATE_LOC();
2412 	Value *integer = Nucleus::createFPToUI(cast.value(), UInt::type());
2413 	storeValue(integer);
2414 }
2415 
operator ++(UInt & val,int)2416 RValue<UInt> operator++(UInt &val, int)  // Post-increment
2417 {
2418 	RR_DEBUG_INFO_UPDATE_LOC();
2419 	RValue<UInt> res = val;
2420 
2421 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2422 	val.storeValue(inc);
2423 
2424 	return res;
2425 }
2426 
operator ++(UInt & val)2427 const UInt &operator++(UInt &val)  // Pre-increment
2428 {
2429 	RR_DEBUG_INFO_UPDATE_LOC();
2430 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2431 	val.storeValue(inc);
2432 
2433 	return val;
2434 }
2435 
operator --(UInt & val,int)2436 RValue<UInt> operator--(UInt &val, int)  // Post-decrement
2437 {
2438 	RR_DEBUG_INFO_UPDATE_LOC();
2439 	RValue<UInt> res = val;
2440 
2441 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2442 	val.storeValue(inc);
2443 
2444 	return res;
2445 }
2446 
operator --(UInt & val)2447 const UInt &operator--(UInt &val)  // Pre-decrement
2448 {
2449 	RR_DEBUG_INFO_UPDATE_LOC();
2450 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2451 	val.storeValue(inc);
2452 
2453 	return val;
2454 }
2455 
2456 //	RValue<UInt> RoundUInt(RValue<Float> cast)
2457 //	{
2458 //#if defined(__i386__) || defined(__x86_64__)
2459 //		return x86::cvtss2si(val);   // FIXME: Unsigned
2460 //#else
2461 //		return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2462 //#endif
2463 //	}
2464 
type()2465 Type *UInt::type()
2466 {
2467 	return T(llvm::Type::getInt32Ty(*jit->context));
2468 }
2469 
2470 //	Int2::Int2(RValue<Int> cast)
2471 //	{
2472 //		Value *extend = Nucleus::createZExt(cast.value(), Long::type());
2473 //		Value *vector = Nucleus::createBitCast(extend, Int2::type());
2474 //
2475 //		int shuffle[2] = {0, 0};
2476 //		Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2477 //
2478 //		storeValue(replicate);
2479 //	}
2480 
operator <<(RValue<Int2> lhs,unsigned char rhs)2481 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2482 {
2483 	RR_DEBUG_INFO_UPDATE_LOC();
2484 #if defined(__i386__) || defined(__x86_64__)
2485 	//	return RValue<Int2>(Nucleus::createShl(lhs.value(), rhs.value()));
2486 
2487 	return x86::pslld(lhs, rhs);
2488 #else
2489 	return As<Int2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2490 #endif
2491 }
2492 
operator >>(RValue<Int2> lhs,unsigned char rhs)2493 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2494 {
2495 	RR_DEBUG_INFO_UPDATE_LOC();
2496 #if defined(__i386__) || defined(__x86_64__)
2497 	//	return RValue<Int2>(Nucleus::createAShr(lhs.value(), rhs.value()));
2498 
2499 	return x86::psrad(lhs, rhs);
2500 #else
2501 	return As<Int2>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2502 #endif
2503 }
2504 
type()2505 Type *Int2::type()
2506 {
2507 	return T(Type_v2i32);
2508 }
2509 
operator <<(RValue<UInt2> lhs,unsigned char rhs)2510 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2511 {
2512 	RR_DEBUG_INFO_UPDATE_LOC();
2513 #if defined(__i386__) || defined(__x86_64__)
2514 	//	return RValue<UInt2>(Nucleus::createShl(lhs.value(), rhs.value()));
2515 
2516 	return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2517 #else
2518 	return As<UInt2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2519 #endif
2520 }
2521 
operator >>(RValue<UInt2> lhs,unsigned char rhs)2522 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2523 {
2524 	RR_DEBUG_INFO_UPDATE_LOC();
2525 #if defined(__i386__) || defined(__x86_64__)
2526 	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value(), rhs.value()));
2527 
2528 	return x86::psrld(lhs, rhs);
2529 #else
2530 	return As<UInt2>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2531 #endif
2532 }
2533 
type()2534 Type *UInt2::type()
2535 {
2536 	return T(Type_v2i32);
2537 }
2538 
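// The next two constructors widen packed 8-bit data to 32 bits: Byte4 zero-extends by interleaving
// with zero twice (bytes -> shorts -> ints), while SByte4 duplicates each byte to fill a 32-bit
// lane and then sign-extends with an arithmetic shift right by 24.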
Int4(RValue<Byte4> cast)2539 Int4::Int4(RValue<Byte4> cast)
2540     : XYZW(this)
2541 {
2542 	RR_DEBUG_INFO_UPDATE_LOC();
2543 	int swizzle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
2544 	Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2545 	Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::type()), swizzle);
2546 
2547 	int swizzle2[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2548 	Value *c = Nucleus::createBitCast(b, Short8::type());
2549 	Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::type()), swizzle2);
2550 
2551 	*this = As<Int4>(d);
2552 }
2553 
Int4(RValue<SByte4> cast)2554 Int4::Int4(RValue<SByte4> cast)
2555     : XYZW(this)
2556 {
2557 	RR_DEBUG_INFO_UPDATE_LOC();
2558 	int swizzle[16] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
2559 	Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2560 	Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2561 
2562 	int swizzle2[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2563 	Value *c = Nucleus::createBitCast(b, Short8::type());
2564 	Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2565 
2566 	*this = As<Int4>(d) >> 24;
2567 }
2568 
Int4(RValue<Short4> cast)2569 Int4::Int4(RValue<Short4> cast)
2570     : XYZW(this)
2571 {
2572 	RR_DEBUG_INFO_UPDATE_LOC();
2573 	int swizzle[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2574 	Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
2575 	*this = As<Int4>(c) >> 16;
2576 }
2577 
Int4(RValue<UShort4> cast)2578 Int4::Int4(RValue<UShort4> cast)
2579     : XYZW(this)
2580 {
2581 	RR_DEBUG_INFO_UPDATE_LOC();
2582 	int swizzle[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2583 	Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2584 	*this = As<Int4>(c);
2585 }
2586 
Int4(RValue<Int> rhs)2587 Int4::Int4(RValue<Int> rhs)
2588     : XYZW(this)
2589 {
2590 	RR_DEBUG_INFO_UPDATE_LOC();
2591 	Value *vector = loadValue();
2592 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2593 
2594 	int swizzle[4] = { 0, 0, 0, 0 };
2595 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2596 
2597 	storeValue(replicate);
2598 }
2599 
operator <<(RValue<Int4> lhs,unsigned char rhs)2600 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2601 {
2602 	RR_DEBUG_INFO_UPDATE_LOC();
2603 #if defined(__i386__) || defined(__x86_64__)
2604 	return x86::pslld(lhs, rhs);
2605 #else
2606 	return As<Int4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2607 #endif
2608 }
2609 
operator >>(RValue<Int4> lhs,unsigned char rhs)2610 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2611 {
2612 	RR_DEBUG_INFO_UPDATE_LOC();
2613 #if defined(__i386__) || defined(__x86_64__)
2614 	return x86::psrad(lhs, rhs);
2615 #else
2616 	return As<Int4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2617 #endif
2618 }
2619 
CmpEQ(RValue<Int4> x,RValue<Int4> y)2620 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2621 {
2622 	RR_DEBUG_INFO_UPDATE_LOC();
2623 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2624 }
2625 
CmpLT(RValue<Int4> x,RValue<Int4> y)2626 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2627 {
2628 	RR_DEBUG_INFO_UPDATE_LOC();
2629 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), Int4::type()));
2630 }
2631 
CmpLE(RValue<Int4> x,RValue<Int4> y)2632 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2633 {
2634 	RR_DEBUG_INFO_UPDATE_LOC();
2635 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), Int4::type()));
2636 }
2637 
CmpNEQ(RValue<Int4> x,RValue<Int4> y)2638 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2639 {
2640 	RR_DEBUG_INFO_UPDATE_LOC();
2641 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2642 }
2643 
CmpNLT(RValue<Int4> x,RValue<Int4> y)2644 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2645 {
2646 	RR_DEBUG_INFO_UPDATE_LOC();
2647 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), Int4::type()));
2648 }
2649 
CmpNLE(RValue<Int4> x,RValue<Int4> y)2650 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2651 {
2652 	RR_DEBUG_INFO_UPDATE_LOC();
2653 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), Int4::type()));
2654 }
2655 
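// For LLVM older than 12, Abs falls back to the classic branchless form: negative = x >> 31
// (arithmetic shift) is 0 or -1 per lane, and (x ^ negative) - negative negates exactly the
// negative lanes (INT_MIN stays INT_MIN, matching llvm.abs with the poison flag unset).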
Abs(RValue<Int4> x)2656 RValue<Int4> Abs(RValue<Int4> x)
2657 {
2658 #if LLVM_VERSION_MAJOR >= 12
2659 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::abs, { V(x.value())->getType() });
2660 	return RValue<Int4>(V(jit->builder->CreateCall(func, { V(x.value()), llvm::ConstantInt::getFalse(*jit->context) })));
2661 #else
2662 	auto negative = x >> 31;
2663 	return (x ^ negative) - negative;
2664 #endif
2665 }
2666 
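// Without SSE4.1, the Max/Min fallbacks here (and for UInt4 below) select per lane with a
// branchless mask: the comparison yields all-ones or all-zeros, so (x & mask) | (y & ~mask)
// picks x where the comparison holds and y elsewhere.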
Max(RValue<Int4> x,RValue<Int4> y)2667 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2668 {
2669 	RR_DEBUG_INFO_UPDATE_LOC();
2670 #if defined(__i386__) || defined(__x86_64__)
2671 	if(CPUID::supportsSSE4_1())
2672 	{
2673 		return x86::pmaxsd(x, y);
2674 	}
2675 	else
2676 #endif
2677 	{
2678 		RValue<Int4> greater = CmpNLE(x, y);
2679 		return (x & greater) | (y & ~greater);
2680 	}
2681 }
2682 
Min(RValue<Int4> x,RValue<Int4> y)2683 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2684 {
2685 	RR_DEBUG_INFO_UPDATE_LOC();
2686 #if defined(__i386__) || defined(__x86_64__)
2687 	if(CPUID::supportsSSE4_1())
2688 	{
2689 		return x86::pminsd(x, y);
2690 	}
2691 	else
2692 #endif
2693 	{
2694 		RValue<Int4> less = CmpLT(x, y);
2695 		return (x & less) | (y & ~less);
2696 	}
2697 }
2698 
RoundInt(RValue<Float4> cast)2699 RValue<Int4> RoundInt(RValue<Float4> cast)
2700 {
2701 	RR_DEBUG_INFO_UPDATE_LOC();
2702 #if defined(__i386__) || defined(__x86_64__)
2703 	return x86::cvtps2dq(cast);
2704 #else
2705 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2706 #endif
2707 }
2708 
RoundIntClamped(RValue<Float4> cast)2709 RValue<Int4> RoundIntClamped(RValue<Float4> cast)
2710 {
2711 	RR_DEBUG_INFO_UPDATE_LOC();
2712 
2713 // TODO(b/165000222): Check if fptosi_sat produces optimal code for x86 and ARM.
2714 #if defined(__i386__) || defined(__x86_64__)
2715 	// cvtps2dq produces 0x80000000, a negative value, for input larger than
2716 	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
2717 	// saturate to 0x80000000.
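	// (0x7FFFFF80 = 2147483520 is the largest float-representable value that does not exceed
	// INT32_MAX; float spacing just below 2^31 is 128.)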
2718 	return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
2719 #elif defined(__arm__) || defined(__aarch64__)
2720 	// ARM saturates to the largest positive or negative integer. Unit tests
2721 	// verify that lowerRoundInt() behaves as desired.
2722 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2723 #elif LLVM_VERSION_MAJOR >= 14
2724 	llvm::Value *rounded = lowerRound(V(cast.value()));
2725 	llvm::Function *fptosi_sat = llvm::Intrinsic::getDeclaration(
2726 	    jit->module.get(), llvm::Intrinsic::fptosi_sat, { T(Int4::type()), T(Float4::type()) });
2727 	return RValue<Int4>(V(jit->builder->CreateCall(fptosi_sat, { rounded })));
2728 #else
2729 	RValue<Float4> clamped = Max(Min(cast, Float4(0x7FFFFF80)), Float4(0x80000000));
2730 	return As<Int4>(V(lowerRoundInt(V(clamped.value()), T(Int4::type()))));
2731 #endif
2732 }
2733 
MulHigh(RValue<Int4> x,RValue<Int4> y)2734 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2735 {
2736 	RR_DEBUG_INFO_UPDATE_LOC();
2737 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2738 	return As<Int4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2739 }
2740 
MulHigh(RValue<UInt4> x,RValue<UInt4> y)2741 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2742 {
2743 	RR_DEBUG_INFO_UPDATE_LOC();
2744 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2745 	return As<UInt4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2746 }
2747 
PackSigned(RValue<Int4> x,RValue<Int4> y)2748 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2749 {
2750 	RR_DEBUG_INFO_UPDATE_LOC();
2751 #if defined(__i386__) || defined(__x86_64__)
2752 	return x86::packssdw(x, y);
2753 #else
2754 	return As<Short8>(V(lowerPack(V(x.value()), V(y.value()), true)));
2755 #endif
2756 }
2757 
PackUnsigned(RValue<Int4> x,RValue<Int4> y)2758 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2759 {
2760 	RR_DEBUG_INFO_UPDATE_LOC();
2761 #if defined(__i386__) || defined(__x86_64__)
2762 	return x86::packusdw(x, y);
2763 #else
2764 	return As<UShort8>(V(lowerPack(V(x.value()), V(y.value()), false)));
2765 #endif
2766 }
2767 
SignMask(RValue<Int4> x)2768 RValue<Int> SignMask(RValue<Int4> x)
2769 {
2770 	RR_DEBUG_INFO_UPDATE_LOC();
2771 #if defined(__i386__) || defined(__x86_64__)
2772 	return x86::movmskps(As<Float4>(x));
2773 #else
2774 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
2775 #endif
2776 }
2777 
type()2778 Type *Int4::type()
2779 {
2780 	return T(llvm::VectorType::get(T(Int::type()), 4, false));
2781 }
2782 
UInt4(RValue<Float4> cast)2783 UInt4::UInt4(RValue<Float4> cast)
2784     : XYZW(this)
2785 {
2786 	RR_DEBUG_INFO_UPDATE_LOC();
2787 	Value *xyzw = Nucleus::createFPToUI(cast.value(), UInt4::type());
2788 	storeValue(xyzw);
2789 }
2790 
UInt4(RValue<UInt> rhs)2791 UInt4::UInt4(RValue<UInt> rhs)
2792     : XYZW(this)
2793 {
2794 	RR_DEBUG_INFO_UPDATE_LOC();
2795 	Value *vector = loadValue();
2796 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2797 
2798 	int swizzle[4] = { 0, 0, 0, 0 };
2799 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2800 
2801 	storeValue(replicate);
2802 }
2803 
operator <<(RValue<UInt4> lhs,unsigned char rhs)2804 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2805 {
2806 	RR_DEBUG_INFO_UPDATE_LOC();
2807 #if defined(__i386__) || defined(__x86_64__)
2808 	return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2809 #else
2810 	return As<UInt4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2811 #endif
2812 }
2813 
operator >>(RValue<UInt4> lhs,unsigned char rhs)2814 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2815 {
2816 	RR_DEBUG_INFO_UPDATE_LOC();
2817 #if defined(__i386__) || defined(__x86_64__)
2818 	return x86::psrld(lhs, rhs);
2819 #else
2820 	return As<UInt4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2821 #endif
2822 }
2823 
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2824 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2825 {
2826 	RR_DEBUG_INFO_UPDATE_LOC();
2827 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2828 }
2829 
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2830 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2831 {
2832 	RR_DEBUG_INFO_UPDATE_LOC();
2833 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), Int4::type()));
2834 }
2835 
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2836 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2837 {
2838 	RR_DEBUG_INFO_UPDATE_LOC();
2839 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), Int4::type()));
2840 }
2841 
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2842 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2843 {
2844 	RR_DEBUG_INFO_UPDATE_LOC();
2845 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2846 }
2847 
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2848 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2849 {
2850 	RR_DEBUG_INFO_UPDATE_LOC();
2851 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), Int4::type()));
2852 }
2853 
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2854 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2855 {
2856 	RR_DEBUG_INFO_UPDATE_LOC();
2857 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), Int4::type()));
2858 }
2859 
Max(RValue<UInt4> x,RValue<UInt4> y)2860 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2861 {
2862 	RR_DEBUG_INFO_UPDATE_LOC();
2863 #if defined(__i386__) || defined(__x86_64__)
2864 	if(CPUID::supportsSSE4_1())
2865 	{
2866 		return x86::pmaxud(x, y);
2867 	}
2868 	else
2869 #endif
2870 	{
2871 		RValue<UInt4> greater = CmpNLE(x, y);
2872 		return (x & greater) | (y & ~greater);
2873 	}
2874 }
2875 
Min(RValue<UInt4> x,RValue<UInt4> y)2876 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2877 {
2878 	RR_DEBUG_INFO_UPDATE_LOC();
2879 #if defined(__i386__) || defined(__x86_64__)
2880 	if(CPUID::supportsSSE4_1())
2881 	{
2882 		return x86::pminud(x, y);
2883 	}
2884 	else
2885 #endif
2886 	{
2887 		RValue<UInt4> less = CmpLT(x, y);
2888 		return (x & less) | (y & ~less);
2889 	}
2890 }
2891 
type()2892 Type *UInt4::type()
2893 {
2894 	return T(llvm::VectorType::get(T(UInt::type()), 4, false));
2895 }
2896 
type()2897 Type *Half::type()
2898 {
2899 	return T(llvm::Type::getInt16Ty(*jit->context));
2900 }
2901 
Rcp_pp(RValue<Float> x,bool exactAtPow2)2902 RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
2903 {
2904 	RR_DEBUG_INFO_UPDATE_LOC();
2905 #if defined(__i386__) || defined(__x86_64__)
2906 	if(exactAtPow2)
2907 	{
2908 		// rcpss uses a piecewise-linear approximation which minimizes the relative error
2909 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
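		// The correction factor 1/rcpss(1.0) is evaluated once with _mm_rcp_ss on the CPU building
		// the routine and embedded as a constant; since the generated code runs on the same CPU,
		// the approximation being corrected should match.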
2910 		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2911 	}
2912 	return x86::rcpss(x);
2913 #else
2914 	return As<Float>(V(lowerRCP(V(x.value()))));
2915 #endif
2916 }
2917 
RcpSqrt_pp(RValue<Float> x)2918 RValue<Float> RcpSqrt_pp(RValue<Float> x)
2919 {
2920 	RR_DEBUG_INFO_UPDATE_LOC();
2921 #if defined(__i386__) || defined(__x86_64__)
2922 	return x86::rsqrtss(x);
2923 #else
2924 	return As<Float>(V(lowerRSQRT(V(x.value()))));
2925 #endif
2926 }
2927 
HasRcpApprox()2928 bool HasRcpApprox()
2929 {
2930 #if defined(__i386__) || defined(__x86_64__)
2931 	return true;
2932 #else
2933 	return false;
2934 #endif
2935 }
2936 
RcpApprox(RValue<Float4> x,bool exactAtPow2)2937 RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
2938 {
2939 #if defined(__i386__) || defined(__x86_64__)
2940 	if(exactAtPow2)
2941 	{
2942 		// rcpps uses a piecewise-linear approximation which minimizes the relative error
2943 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2944 		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2945 	}
2946 	return x86::rcpps(x);
2947 #else
2948 	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2949 	return { 0.0f };
2950 #endif
2951 }
2952 
RcpApprox(RValue<Float> x,bool exactAtPow2)2953 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
2954 {
2955 #if defined(__i386__) || defined(__x86_64__)
2956 	if(exactAtPow2)
2957 	{
2958 		// rcpss uses a piecewise-linear approximation which minimizes the relative error
2959 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2960 		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2961 	}
2962 	return x86::rcpss(x);
2963 #else
2964 	UNREACHABLE("RValue<Float> RcpApprox() not available on this platform");
2965 	return { 0.0f };
2966 #endif
2967 }
2968 
HasRcpSqrtApprox()2969 bool HasRcpSqrtApprox()
2970 {
2971 #if defined(__i386__) || defined(__x86_64__)
2972 	return true;
2973 #else
2974 	return false;
2975 #endif
2976 }
2977 
RcpSqrtApprox(RValue<Float4> x)2978 RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
2979 {
2980 #if defined(__i386__) || defined(__x86_64__)
2981 	return x86::rsqrtps(x);
2982 #else
2983 	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
2984 	return { 0.0f };
2985 #endif
2986 }
2987 
RcpSqrtApprox(RValue<Float> x)2988 RValue<Float> RcpSqrtApprox(RValue<Float> x)
2989 {
2990 #if defined(__i386__) || defined(__x86_64__)
2991 	return x86::rsqrtss(x);
2992 #else
2993 	UNREACHABLE("RValue<Float> RcpSqrtApprox() not available on this platform");
2994 	return { 0.0f };
2995 #endif
2996 }
2997 
Sqrt(RValue<Float> x)2998 RValue<Float> Sqrt(RValue<Float> x)
2999 {
3000 	RR_DEBUG_INFO_UPDATE_LOC();
3001 #if defined(__i386__) || defined(__x86_64__)
3002 	return x86::sqrtss(x);
3003 #else
3004 	return As<Float>(V(lowerSQRT(V(x.value()))));
3005 #endif
3006 }
3007 
Round(RValue<Float> x)3008 RValue<Float> Round(RValue<Float> x)
3009 {
3010 	RR_DEBUG_INFO_UPDATE_LOC();
3011 #if defined(__i386__) || defined(__x86_64__)
3012 	if(CPUID::supportsSSE4_1())
3013 	{
3014 		return x86::roundss(x, 0);
3015 	}
3016 	else
3017 	{
3018 		return Float4(Round(Float4(x))).x;
3019 	}
3020 #else
3021 	return RValue<Float>(V(lowerRound(V(x.value()))));
3022 #endif
3023 }
3024 
Trunc(RValue<Float> x)3025 RValue<Float> Trunc(RValue<Float> x)
3026 {
3027 	RR_DEBUG_INFO_UPDATE_LOC();
3028 #if defined(__i386__) || defined(__x86_64__)
3029 	if(CPUID::supportsSSE4_1())
3030 	{
3031 		return x86::roundss(x, 3);
3032 	}
3033 	else
3034 	{
3035 		return Float(Int(x));  // Rounded toward zero
3036 	}
3037 #else
3038 	return RValue<Float>(V(lowerTrunc(V(x.value()))));
3039 #endif
3040 }
3041 
3042 RValue<Float> Frac(RValue<Float> x)
3043 {
3044 	RR_DEBUG_INFO_UPDATE_LOC();
3045 #if defined(__i386__) || defined(__x86_64__)
3046 	if(CPUID::supportsSSE4_1())
3047 	{
3048 		return x - x86::floorss(x);
3049 	}
3050 	else
3051 	{
3052 		return Float4(Frac(Float4(x))).x;
3053 	}
3054 #else
3055 	// x - floor(x) can be 1.0 for very small negative x.
3056 	// Clamp against the value just below 1.0.
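	// 0x3F7FFFFF is the bit pattern of the largest float below 1.0 (~0.99999994f).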
3057 	return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
3058 #endif
3059 }
3060 
Floor(RValue<Float> x)3061 RValue<Float> Floor(RValue<Float> x)
3062 {
3063 	RR_DEBUG_INFO_UPDATE_LOC();
3064 #if defined(__i386__) || defined(__x86_64__)
3065 	if(CPUID::supportsSSE4_1())
3066 	{
3067 		return x86::floorss(x);
3068 	}
3069 	else
3070 	{
3071 		return Float4(Floor(Float4(x))).x;
3072 	}
3073 #else
3074 	return RValue<Float>(V(lowerFloor(V(x.value()))));
3075 #endif
3076 }
3077 
Ceil(RValue<Float> x)3078 RValue<Float> Ceil(RValue<Float> x)
3079 {
3080 	RR_DEBUG_INFO_UPDATE_LOC();
3081 #if defined(__i386__) || defined(__x86_64__)
3082 	if(CPUID::supportsSSE4_1())
3083 	{
3084 		return x86::ceilss(x);
3085 	}
3086 	else
3087 #endif
3088 	{
3089 		return Float4(Ceil(Float4(x))).x;
3090 	}
3091 }
3092 
type()3093 Type *Float::type()
3094 {
3095 	return T(llvm::Type::getFloatTy(*jit->context));
3096 }
3097 
type()3098 Type *Float2::type()
3099 {
3100 	return T(Type_v2f32);
3101 }
3102 
Exp2(RValue<Float> v)3103 RValue<Float> Exp2(RValue<Float> v)
3104 {
3105 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::type()) });
3106 	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3107 }
3108 
Log2(RValue<Float> v)3109 RValue<Float> Log2(RValue<Float> v)
3110 {
3111 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::type()) });
3112 	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3113 }
3114 
Float4(RValue<Float> rhs)3115 Float4::Float4(RValue<Float> rhs)
3116     : XYZW(this)
3117 {
3118 	RR_DEBUG_INFO_UPDATE_LOC();
3119 	Value *vector = loadValue();
3120 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
3121 
3122 	int swizzle[4] = { 0, 0, 0, 0 };
3123 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
3124 
3125 	storeValue(replicate);
3126 }
3127 
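// Note on the two fused multiply-add flavors below: llvm.fmuladd lets the backend
// either fuse the operation or fall back to a separate multiply and add, whereas
// llvm.fma requires a correctly rounded fused result and may become a libm call
// on targets without a hardware FMA instruction.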
3128 RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3129 {
3130 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(Float4::type()) });
3131 	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3132 }
3133 
3134 RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3135 {
3136 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fma, { T(Float4::type()) });
3137 	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3138 }
3139 
Abs(RValue<Float4> x)3140 RValue<Float4> Abs(RValue<Float4> x)
3141 {
3142 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
3143 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(x.value()))));
3144 }
3145 
Max(RValue<Float4> x,RValue<Float4> y)3146 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3147 {
3148 	RR_DEBUG_INFO_UPDATE_LOC();
3149 #if defined(__i386__) || defined(__x86_64__)
3150 	return x86::maxps(x, y);
3151 #else
3152 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
3153 #endif
3154 }
3155 
Min(RValue<Float4> x,RValue<Float4> y)3156 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3157 {
3158 	RR_DEBUG_INFO_UPDATE_LOC();
3159 #if defined(__i386__) || defined(__x86_64__)
3160 	return x86::minps(x, y);
3161 #else
3162 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
3163 #endif
3164 }
3165 
Rcp_pp(RValue<Float4> x,bool exactAtPow2)3166 RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
3167 {
3168 	RR_DEBUG_INFO_UPDATE_LOC();
3169 #if defined(__i386__) || defined(__x86_64__)
3170 	if(exactAtPow2)
3171 	{
3172 		// rcpps uses a piecewise-linear approximation which minimizes the relative error
3173 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
3174 		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
3175 	}
3176 	return x86::rcpps(x);
3177 #else
3178 	return As<Float4>(V(lowerRCP(V(x.value()))));
3179 #endif
3180 }
3181 
RcpSqrt_pp(RValue<Float4> x)3182 RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
3183 {
3184 	RR_DEBUG_INFO_UPDATE_LOC();
3185 #if defined(__i386__) || defined(__x86_64__)
3186 	return x86::rsqrtps(x);
3187 #else
3188 	return As<Float4>(V(lowerRSQRT(V(x.value()))));
3189 #endif
3190 }
3191 
Sqrt(RValue<Float4> x)3192 RValue<Float4> Sqrt(RValue<Float4> x)
3193 {
3194 	RR_DEBUG_INFO_UPDATE_LOC();
3195 #if defined(__i386__) || defined(__x86_64__)
3196 	return x86::sqrtps(x);
3197 #else
3198 	return As<Float4>(V(lowerSQRT(V(x.value()))));
3199 #endif
3200 }
3201 
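// SignMask packs the sign bit of each lane into bits 0..3 of the result
// (movmskps semantics); the non-x86 lowering is expected to match that layout.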
3202 RValue<Int> SignMask(RValue<Float4> x)
3203 {
3204 	RR_DEBUG_INFO_UPDATE_LOC();
3205 #if defined(__i386__) || defined(__x86_64__)
3206 	return x86::movmskps(x);
3207 #else
3208 	return As<Int>(V(lowerFPSignMask(V(x.value()), T(Int::type()))));
3209 #endif
3210 }
3211 
CmpEQ(RValue<Float4> x,RValue<Float4> y)3212 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3213 {
3214 	RR_DEBUG_INFO_UPDATE_LOC();
3215 	//	return As<Int4>(x86::cmpeqps(x, y));
3216 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), Int4::type()));
3217 }
3218 
CmpLT(RValue<Float4> x,RValue<Float4> y)3219 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3220 {
3221 	RR_DEBUG_INFO_UPDATE_LOC();
3222 	//	return As<Int4>(x86::cmpltps(x, y));
3223 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), Int4::type()));
3224 }
3225 
CmpLE(RValue<Float4> x,RValue<Float4> y)3226 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3227 {
3228 	RR_DEBUG_INFO_UPDATE_LOC();
3229 	//	return As<Int4>(x86::cmpleps(x, y));
3230 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), Int4::type()));
3231 }
3232 
CmpNEQ(RValue<Float4> x,RValue<Float4> y)3233 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3234 {
3235 	RR_DEBUG_INFO_UPDATE_LOC();
3236 	//	return As<Int4>(x86::cmpneqps(x, y));
3237 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), Int4::type()));
3238 }
3239 
CmpNLT(RValue<Float4> x,RValue<Float4> y)3240 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3241 {
3242 	RR_DEBUG_INFO_UPDATE_LOC();
3243 	//	return As<Int4>(x86::cmpnltps(x, y));
3244 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), Int4::type()));
3245 }
3246 
CmpNLE(RValue<Float4> x,RValue<Float4> y)3247 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3248 {
3249 	RR_DEBUG_INFO_UPDATE_LOC();
3250 	//	return As<Int4>(x86::cmpnleps(x, y));
3251 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), Int4::type()));
3252 }
3253 
CmpUEQ(RValue<Float4> x,RValue<Float4> y)3254 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3255 {
3256 	RR_DEBUG_INFO_UPDATE_LOC();
3257 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), Int4::type()));
3258 }
3259 
CmpULT(RValue<Float4> x,RValue<Float4> y)3260 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3261 {
3262 	RR_DEBUG_INFO_UPDATE_LOC();
3263 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), Int4::type()));
3264 }
3265 
CmpULE(RValue<Float4> x,RValue<Float4> y)3266 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3267 {
3268 	RR_DEBUG_INFO_UPDATE_LOC();
3269 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), Int4::type()));
3270 }
3271 
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)3272 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3273 {
3274 	RR_DEBUG_INFO_UPDATE_LOC();
3275 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), Int4::type()));
3276 }
3277 
CmpUNLT(RValue<Float4> x,RValue<Float4> y)3278 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3279 {
3280 	RR_DEBUG_INFO_UPDATE_LOC();
3281 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), Int4::type()));
3282 }
3283 
CmpUNLE(RValue<Float4> x,RValue<Float4> y)3284 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3285 {
3286 	RR_DEBUG_INFO_UPDATE_LOC();
3287 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), Int4::type()));
3288 }
3289 
Round(RValue<Float4> x)3290 RValue<Float4> Round(RValue<Float4> x)
3291 {
3292 	RR_DEBUG_INFO_UPDATE_LOC();
3293 #if defined(__i386__) || defined(__x86_64__)
3294 	if(CPUID::supportsSSE4_1())
3295 	{
3296 		return x86::roundps(x, 0);
3297 	}
3298 	else
3299 	{
3300 		return Float4(RoundInt(x));
3301 	}
3302 #else
3303 	return RValue<Float4>(V(lowerRound(V(x.value()))));
3304 #endif
3305 }
3306 
Trunc(RValue<Float4> x)3307 RValue<Float4> Trunc(RValue<Float4> x)
3308 {
3309 	RR_DEBUG_INFO_UPDATE_LOC();
3310 #if defined(__i386__) || defined(__x86_64__)
3311 	if(CPUID::supportsSSE4_1())
3312 	{
3313 		return x86::roundps(x, 3);
3314 	}
3315 	else
3316 	{
3317 		return Float4(Int4(x));
3318 	}
3319 #else
3320 	return RValue<Float4>(V(lowerTrunc(V(x.value()))));
3321 #endif
3322 }
3323 
3324 RValue<Float4> Frac(RValue<Float4> x)
3325 {
3326 	RR_DEBUG_INFO_UPDATE_LOC();
3327 	Float4 frc;
3328 
3329 #if defined(__i386__) || defined(__x86_64__)
3330 	if(CPUID::supportsSSE4_1())
3331 	{
3332 		frc = x - x86::floorps(x);
3333 	}
3334 	else
3335 	{
3336 		frc = x - Float4(Int4(x));  // Signed fractional part.
3337 
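		// CmpNLE(0.0f, frc) yields an all-ones lane mask where frc is negative, so
		// ANDing with the bit pattern of 1.0f and adding the reinterpreted result
		// bumps only those lanes up by exactly 1.0.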
3338 		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));  // Add 1.0 if negative.
3339 	}
3340 #else
3341 	frc = x - Floor(x);
3342 #endif
3343 
3344 	// x - floor(x) can be 1.0 for very small negative x.
3345 	// Clamp against the value just below 1.0.
3346 	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3347 }
3348 
Floor(RValue<Float4> x)3349 RValue<Float4> Floor(RValue<Float4> x)
3350 {
3351 	RR_DEBUG_INFO_UPDATE_LOC();
3352 #if defined(__i386__) || defined(__x86_64__)
3353 	if(CPUID::supportsSSE4_1())
3354 	{
3355 		return x86::floorps(x);
3356 	}
3357 	else
3358 	{
3359 		return x - Frac(x);
3360 	}
3361 #else
3362 	return RValue<Float4>(V(lowerFloor(V(x.value()))));
3363 #endif
3364 }
3365 
Ceil(RValue<Float4> x)3366 RValue<Float4> Ceil(RValue<Float4> x)
3367 {
3368 	RR_DEBUG_INFO_UPDATE_LOC();
3369 #if defined(__i386__) || defined(__x86_64__)
3370 	if(CPUID::supportsSSE4_1())
3371 	{
3372 		return x86::ceilps(x);
3373 	}
3374 	else
3375 #endif
3376 	{
3377 		return -Floor(-x);
3378 	}
3379 }
3380 
Sin(RValue<Float4> v)3381 RValue<Float4> Sin(RValue<Float4> v)
3382 {
3383 	RR_DEBUG_INFO_UPDATE_LOC();
3384 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value())->getType() });
3385 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3386 }
3387 
Cos(RValue<Float4> v)3388 RValue<Float4> Cos(RValue<Float4> v)
3389 {
3390 	RR_DEBUG_INFO_UPDATE_LOC();
3391 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value())->getType() });
3392 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3393 }
3394 
Tan(RValue<Float4> v)3395 RValue<Float4> Tan(RValue<Float4> v)
3396 {
3397 	RR_DEBUG_INFO_UPDATE_LOC();
3398 	return Sin(v) / Cos(v);
3399 }
3400 
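// Applies the named scalar C runtime function (e.g. "asinf") to each of the four
// lanes in turn; the symbol is resolved later by the JIT, presumably against the
// host math library.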
3401 static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char *name)
3402 {
3403 	auto funcTy = llvm::FunctionType::get(T(Float::type()), llvm::ArrayRef<llvm::Type *>(T(Float::type())), false);
3404 	auto func = jit->module->getOrInsertFunction(name, funcTy);
3405 	llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3406 	for(uint64_t i = 0; i < 4; i++)
3407 	{
3408 		auto el = jit->builder->CreateCall(func, V(Nucleus::createExtractElement(v.value(), Float::type(), i)));
3409 		out = V(Nucleus::createInsertElement(V(out), V(el), i));
3410 	}
3411 	return RValue<Float4>(V(out));
3412 }
3413 
Asin(RValue<Float4> v)3414 RValue<Float4> Asin(RValue<Float4> v)
3415 {
3416 	RR_DEBUG_INFO_UPDATE_LOC();
3417 	return TransformFloat4PerElement(v, "asinf");
3418 }
3419 
Acos(RValue<Float4> v)3420 RValue<Float4> Acos(RValue<Float4> v)
3421 {
3422 	RR_DEBUG_INFO_UPDATE_LOC();
3423 	return TransformFloat4PerElement(v, "acosf");
3424 }
3425 
Atan(RValue<Float4> v)3426 RValue<Float4> Atan(RValue<Float4> v)
3427 {
3428 	RR_DEBUG_INFO_UPDATE_LOC();
3429 	return TransformFloat4PerElement(v, "atanf");
3430 }
3431 
Sinh(RValue<Float4> v)3432 RValue<Float4> Sinh(RValue<Float4> v)
3433 {
3434 	RR_DEBUG_INFO_UPDATE_LOC();
3435 	return TransformFloat4PerElement(v, "sinhf");
3436 }
3437 
Cosh(RValue<Float4> v)3438 RValue<Float4> Cosh(RValue<Float4> v)
3439 {
3440 	RR_DEBUG_INFO_UPDATE_LOC();
3441 	return TransformFloat4PerElement(v, "coshf");
3442 }
3443 
Tanh(RValue<Float4> v)3444 RValue<Float4> Tanh(RValue<Float4> v)
3445 {
3446 	RR_DEBUG_INFO_UPDATE_LOC();
3447 	return TransformFloat4PerElement(v, "tanhf");
3448 }
3449 
Asinh(RValue<Float4> v)3450 RValue<Float4> Asinh(RValue<Float4> v)
3451 {
3452 	RR_DEBUG_INFO_UPDATE_LOC();
3453 	return TransformFloat4PerElement(v, "asinhf");
3454 }
3455 
Acosh(RValue<Float4> v)3456 RValue<Float4> Acosh(RValue<Float4> v)
3457 {
3458 	RR_DEBUG_INFO_UPDATE_LOC();
3459 	return TransformFloat4PerElement(v, "acoshf");
3460 }
3461 
Atanh(RValue<Float4> v)3462 RValue<Float4> Atanh(RValue<Float4> v)
3463 {
3464 	RR_DEBUG_INFO_UPDATE_LOC();
3465 	return TransformFloat4PerElement(v, "atanhf");
3466 }
3467 
Atan2(RValue<Float4> x,RValue<Float4> y)3468 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
3469 {
3470 	RR_DEBUG_INFO_UPDATE_LOC();
3471 	llvm::SmallVector<llvm::Type *, 2> paramTys;
3472 	paramTys.push_back(T(Float::type()));
3473 	paramTys.push_back(T(Float::type()));
3474 	auto funcTy = llvm::FunctionType::get(T(Float::type()), paramTys, false);
3475 	auto func = jit->module->getOrInsertFunction("atan2f", funcTy);
3476 	llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3477 	for(uint64_t i = 0; i < 4; i++)
3478 	{
3479 		auto el = jit->builder->CreateCall(func, { V(Nucleus::createExtractElement(x.value(), Float::type(), i)),
3480 		                                           V(Nucleus::createExtractElement(y.value(), Float::type(), i)) });
3481 		out = V(Nucleus::createInsertElement(V(out), V(el), i));
3482 	}
3483 	return RValue<Float4>(V(out));
3484 }
3485 
Pow(RValue<Float4> x,RValue<Float4> y)3486 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
3487 {
3488 	RR_DEBUG_INFO_UPDATE_LOC();
3489 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::type()) });
3490 	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()) })));
3491 }
3492 
Exp(RValue<Float4> v)3493 RValue<Float4> Exp(RValue<Float4> v)
3494 {
3495 	RR_DEBUG_INFO_UPDATE_LOC();
3496 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::type()) });
3497 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3498 }
3499 
Log(RValue<Float4> v)3500 RValue<Float4> Log(RValue<Float4> v)
3501 {
3502 	RR_DEBUG_INFO_UPDATE_LOC();
3503 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::type()) });
3504 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3505 }
3506 
Exp2(RValue<Float4> v)3507 RValue<Float4> Exp2(RValue<Float4> v)
3508 {
3509 	RR_DEBUG_INFO_UPDATE_LOC();
3510 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::type()) });
3511 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3512 }
3513 
Log2(RValue<Float4> v)3514 RValue<Float4> Log2(RValue<Float4> v)
3515 {
3516 	RR_DEBUG_INFO_UPDATE_LOC();
3517 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::type()) });
3518 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3519 }
3520 
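// For the leading/trailing-zero-count helpers below, 'isZeroUndef' maps to the
// second operand of llvm.ctlz/llvm.cttz: when true, the result for a zero input
// is undefined, which lets the backend emit a bare bit-scan instruction without
// a zero check.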
3521 RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
3522 {
3523 	RR_DEBUG_INFO_UPDATE_LOC();
3524 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::type()) });
3525 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3526 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3527 }
3528 
Ctlz(RValue<UInt4> v,bool isZeroUndef)3529 RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
3530 {
3531 	RR_DEBUG_INFO_UPDATE_LOC();
3532 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::type()) });
3533 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3534 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3535 }
3536 
Cttz(RValue<UInt> v,bool isZeroUndef)3537 RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
3538 {
3539 	RR_DEBUG_INFO_UPDATE_LOC();
3540 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::type()) });
3541 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3542 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3543 }
3544 
Cttz(RValue<UInt4> v,bool isZeroUndef)3545 RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
3546 {
3547 	RR_DEBUG_INFO_UPDATE_LOC();
3548 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::type()) });
3549 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3550 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3551 }
3552 
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3553 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3554 {
3555 	return RValue<Int>(Nucleus::createAtomicMin(x.value(), y.value(), memoryOrder));
3556 }
3557 
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3558 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3559 {
3560 	return RValue<UInt>(Nucleus::createAtomicUMin(x.value(), y.value(), memoryOrder));
3561 }
3562 
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3563 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3564 {
3565 	return RValue<Int>(Nucleus::createAtomicMax(x.value(), y.value(), memoryOrder));
3566 }
3567 
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3568 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3569 {
3570 	return RValue<UInt>(Nucleus::createAtomicUMax(x.value(), y.value(), memoryOrder));
3571 }
3572 
type()3573 Type *Float4::type()
3574 {
3575 	return T(llvm::VectorType::get(T(Float::type()), 4, false));
3576 }
3577 
3578 RValue<Long> Ticks()
3579 {
3580 	RR_DEBUG_INFO_UPDATE_LOC();
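	// llvm.readcyclecounter lowers to RDTSC on x86; on targets without a cycle
	// counter LLVM documents it as lowering to a constant 0.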
3581 	llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
3582 
3583 	return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
3584 }
3585 
ConstantPointer(void const * ptr)3586 RValue<Pointer<Byte>> ConstantPointer(void const *ptr)
3587 {
3588 	RR_DEBUG_INFO_UPDATE_LOC();
3589 	// Note: this should work for 32-bit pointers as well because 'inttoptr'
3590 	// is defined to truncate (and zero extend) if necessary.
3591 	auto ptrAsInt = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), reinterpret_cast<uintptr_t>(ptr));
3592 	return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::type()))));
3593 }
3594 
3595 RValue<Pointer<Byte>> ConstantData(void const *data, size_t size)
3596 {
3597 	RR_DEBUG_INFO_UPDATE_LOC();
3598 	auto str = ::std::string(reinterpret_cast<const char *>(data), size);
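	// CreateGlobalStringPtr copies the bytes into a private constant in the module
	// (plus a trailing NUL), so the caller's 'data' buffer does not need to outlive
	// this call.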
3599 	auto ptr = jit->builder->CreateGlobalStringPtr(str);
3600 	return RValue<Pointer<Byte>>(V(ptr));
3601 }
3602 
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)3603 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
3604 {
3605 	// If this is a MemorySanitizer build, but Reactor routine instrumentation is not enabled,
3606 	// mark all call arguments as initialized by calling __msan_unpoison_param().
3607 	if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
3608 	{
3609 		// void __msan_unpoison_param(size_t n)
3610 		auto voidTy = llvm::Type::getVoidTy(*jit->context);
3611 		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
3612 		auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy }, false);
3613 		auto func = jit->module->getOrInsertFunction("__msan_unpoison_param", funcTy);
3614 
3615 		jit->builder->CreateCall(func, { llvm::ConstantInt::get(sizetTy, args.size()) });
3616 	}
3617 
3618 	RR_DEBUG_INFO_UPDATE_LOC();
3619 	llvm::SmallVector<llvm::Type *, 8> paramTys;
3620 	for(auto ty : argTys) { paramTys.push_back(T(ty)); }
3621 	auto funcTy = llvm::FunctionType::get(T(retTy), paramTys, false);
3622 
3623 	auto funcPtrTy = funcTy->getPointerTo();
3624 	auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value()), funcPtrTy);
3625 
3626 	llvm::SmallVector<llvm::Value *, 8> arguments;
3627 	for(auto arg : args) { arguments.push_back(V(arg)); }
3628 	return V(jit->builder->CreateCall(funcTy, funcPtr, arguments));
3629 }
3630 
Breakpoint()3631 void Breakpoint()
3632 {
3633 	RR_DEBUG_INFO_UPDATE_LOC();
3634 	llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
3635 
3636 	jit->builder->CreateCall(debugtrap);
3637 }
3638 
3639 }  // namespace rr
3640 
3641 namespace rr {
3642 
3643 #if defined(__i386__) || defined(__x86_64__)
3644 namespace x86 {
3645 
3646 // Differs from IRBuilder<>::CreateUnaryIntrinsic() in that it only accepts native instruction intrinsics which have
3647 // implicit types, such as 'x86_sse_rcp_ps' operating on v4f32, while 'sqrt' requires explicitly specifying the operand type.
createInstruction(llvm::Intrinsic::ID id,Value * x)3648 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x)
3649 {
3650 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3651 
3652 	return V(jit->builder->CreateCall(intrinsic, V(x)));
3653 }
3654 
3655 // Differs from IRBuilder<>::CreateBinaryIntrinsic() in that it only accepts native instruction intrinsics which have
3656 // implicit types, such as 'x86_sse_max_ps' operating on v4f32, while 'sadd_sat' requires explicitly specifying the operand types.
createInstruction(llvm::Intrinsic::ID id,Value * x,Value * y)3657 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x, Value *y)
3658 {
3659 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3660 
3661 	return V(jit->builder->CreateCall(intrinsic, { V(x), V(y) }));
3662 }
3663 
cvtss2si(RValue<Float> val)3664 RValue<Int> cvtss2si(RValue<Float> val)
3665 {
3666 	Float4 vector;
3667 	vector.x = val;
3668 
3669 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_cvtss2si, RValue<Float4>(vector).value()));
3670 }
3671 
cvtps2dq(RValue<Float4> val)3672 RValue<Int4> cvtps2dq(RValue<Float4> val)
3673 {
3674 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_cvtps2dq, val.value()));
3675 }
3676 
rcpss(RValue<Float> val)3677 RValue<Float> rcpss(RValue<Float> val)
3678 {
3679 	Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3680 
3681 	// TODO(b/172238865): MemorySanitizer does not support the rcpss instruction,
3682 	// which makes it look at the entire 128-bit input operand for undefined bits.
3683 	// Use zero-initialized values instead.
3684 	if(__has_feature(memory_sanitizer))
3685 	{
3686 		undef = Float4(0).loadValue();
3687 	}
3688 
3689 	Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3690 
3691 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rcp_ss, vector), Float::type(), 0));
3692 }
3693 
sqrtss(RValue<Float> val)3694 RValue<Float> sqrtss(RValue<Float> val)
3695 {
3696 	return RValue<Float>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3697 }
3698 
rsqrtss(RValue<Float> val)3699 RValue<Float> rsqrtss(RValue<Float> val)
3700 {
3701 	Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3702 
3703 	// TODO(b/172238865): MemorySanitizer does not support the rsqrtss instruction,
3704 	// which makes it look at the entire 128-bit input operand for undefined bits.
3705 	// Use zero-initialized values instead.
3706 	if(__has_feature(memory_sanitizer))
3707 	{
3708 		undef = Float4(0).loadValue();
3709 	}
3710 
3711 	Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3712 
3713 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ss, vector), Float::type(), 0));
3714 }
3715 
rcpps(RValue<Float4> val)3716 RValue<Float4> rcpps(RValue<Float4> val)
3717 {
3718 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rcp_ps, val.value()));
3719 }
3720 
sqrtps(RValue<Float4> val)3721 RValue<Float4> sqrtps(RValue<Float4> val)
3722 {
3723 	return RValue<Float4>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3724 }
3725 
rsqrtps(RValue<Float4> val)3726 RValue<Float4> rsqrtps(RValue<Float4> val)
3727 {
3728 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ps, val.value()));
3729 }
3730 
maxps(RValue<Float4> x,RValue<Float4> y)3731 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3732 {
3733 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_max_ps, x.value(), y.value()));
3734 }
3735 
minps(RValue<Float4> x,RValue<Float4> y)3736 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3737 {
3738 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_min_ps, x.value(), y.value()));
3739 }
3740 
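// The rounding immediates used below follow the SSE4.1 ROUNDSS/ROUNDPS encoding:
// 0 = round to nearest even, 1 = round down (floor), 2 = round up (ceil),
// 3 = round toward zero (truncate).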
3741 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3742 {
3743 	llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3744 
3745 	Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3746 
3747 	// TODO(b/172238865): MemorySanitizer does not support the roundss instruction,
3748 	// which makes it look at the entire 128-bit input operands for undefined bits.
3749 	// Use zero-initialized values instead.
3750 	if(__has_feature(memory_sanitizer))
3751 	{
3752 		undef = Float4(0).loadValue();
3753 	}
3754 
3755 	Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3756 
3757 	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(roundss, { V(undef), V(vector), V(Nucleus::createConstantInt(imm)) })), Float::type(), 0));
3758 }
3759 
floorss(RValue<Float> val)3760 RValue<Float> floorss(RValue<Float> val)
3761 {
3762 	return roundss(val, 1);
3763 }
3764 
ceilss(RValue<Float> val)3765 RValue<Float> ceilss(RValue<Float> val)
3766 {
3767 	return roundss(val, 2);
3768 }
3769 
roundps(RValue<Float4> val,unsigned char imm)3770 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3771 {
3772 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse41_round_ps, val.value(), Nucleus::createConstantInt(imm)));
3773 }
3774 
floorps(RValue<Float4> val)3775 RValue<Float4> floorps(RValue<Float4> val)
3776 {
3777 	return roundps(val, 1);
3778 }
3779 
ceilps(RValue<Float4> val)3780 RValue<Float4> ceilps(RValue<Float4> val)
3781 {
3782 	return roundps(val, 2);
3783 }
3784 
paddsw(RValue<Short4> x,RValue<Short4> y)3785 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3786 {
3787 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3788 }
3789 
psubsw(RValue<Short4> x,RValue<Short4> y)3790 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3791 {
3792 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3793 }
3794 
paddusw(RValue<UShort4> x,RValue<UShort4> y)3795 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3796 {
3797 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3798 }
3799 
psubusw(RValue<UShort4> x,RValue<UShort4> y)3800 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3801 {
3802 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3803 }
3804 
paddsb(RValue<SByte8> x,RValue<SByte8> y)3805 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3806 {
3807 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3808 }
3809 
psubsb(RValue<SByte8> x,RValue<SByte8> y)3810 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3811 {
3812 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3813 }
3814 
paddusb(RValue<Byte8> x,RValue<Byte8> y)3815 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3816 {
3817 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3818 }
3819 
psubusb(RValue<Byte8> x,RValue<Byte8> y)3820 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3821 {
3822 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3823 }
3824 
pavgw(RValue<UShort4> x,RValue<UShort4> y)3825 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3826 {
3827 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
3828 }
3829 
pmaxsw(RValue<Short4> x,RValue<Short4> y)3830 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3831 {
3832 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3833 }
3834 
pminsw(RValue<Short4> x,RValue<Short4> y)3835 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3836 {
3837 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3838 }
3839 
pcmpgtw(RValue<Short4> x,RValue<Short4> y)3840 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3841 {
3842 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
3843 }
3844 
pcmpeqw(RValue<Short4> x,RValue<Short4> y)3845 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3846 {
3847 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
3848 }
3849 
pcmpgtb(RValue<SByte8> x,RValue<SByte8> y)3850 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3851 {
3852 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
3853 }
3854 
pcmpeqb(RValue<Byte8> x,RValue<Byte8> y)3855 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3856 {
3857 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
3858 }
3859 
packssdw(RValue<Int2> x,RValue<Int2> y)3860 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3861 {
3862 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3863 }
3864 
packssdw(RValue<Int4> x,RValue<Int4> y)3865 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3866 {
3867 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3868 }
3869 
packsswb(RValue<Short4> x,RValue<Short4> y)3870 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3871 {
3872 	return As<SByte8>(createInstruction(llvm::Intrinsic::x86_sse2_packsswb_128, x.value(), y.value()));
3873 }
3874 
packuswb(RValue<Short4> x,RValue<Short4> y)3875 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3876 {
3877 	return As<Byte8>(createInstruction(llvm::Intrinsic::x86_sse2_packuswb_128, x.value(), y.value()));
3878 }
3879 
3880 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3881 {
3882 	if(CPUID::supportsSSE4_1())
3883 	{
3884 		return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse41_packusdw, x.value(), y.value()));
3885 	}
3886 	else
3887 	{
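		// Emulate unsigned saturation with the signed packssdw: clamp negative
		// inputs to zero via (x & ~(x >> 31)), bias by -0x8000 so the signed
		// saturation range lines up with [0, 0xFFFF], then undo the bias after
		// packing by adding 0x8000 to each 16-bit lane.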
3888 		RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3889 		RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3890 
3891 		return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3892 	}
3893 }
3894 
psrlw(RValue<UShort4> x,unsigned char y)3895 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3896 {
3897 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3898 }
3899 
psrlw(RValue<UShort8> x,unsigned char y)3900 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3901 {
3902 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3903 }
3904 
psraw(RValue<Short4> x,unsigned char y)3905 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3906 {
3907 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3908 }
3909 
psraw(RValue<Short8> x,unsigned char y)3910 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3911 {
3912 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3913 }
3914 
psllw(RValue<Short4> x,unsigned char y)3915 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3916 {
3917 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3918 }
3919 
psllw(RValue<Short8> x,unsigned char y)3920 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3921 {
3922 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3923 }
3924 
pslld(RValue<Int2> x,unsigned char y)3925 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3926 {
3927 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3928 }
3929 
pslld(RValue<Int4> x,unsigned char y)3930 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3931 {
3932 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3933 }
3934 
psrad(RValue<Int2> x,unsigned char y)3935 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3936 {
3937 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3938 }
3939 
psrad(RValue<Int4> x,unsigned char y)3940 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3941 {
3942 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3943 }
3944 
psrld(RValue<UInt2> x,unsigned char y)3945 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3946 {
3947 	return As<UInt2>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3948 }
3949 
psrld(RValue<UInt4> x,unsigned char y)3950 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3951 {
3952 	return RValue<UInt4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3953 }
3954 
pmaxsd(RValue<Int4> x,RValue<Int4> y)3955 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3956 {
3957 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3958 }
3959 
pminsd(RValue<Int4> x,RValue<Int4> y)3960 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3961 {
3962 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3963 }
3964 
pmaxud(RValue<UInt4> x,RValue<UInt4> y)3965 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3966 {
3967 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_UGT)));
3968 }
3969 
pminud(RValue<UInt4> x,RValue<UInt4> y)3970 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3971 {
3972 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_ULT)));
3973 }
3974 
pmulhw(RValue<Short4> x,RValue<Short4> y)3975 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3976 {
3977 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3978 }
3979 
pmulhuw(RValue<UShort4> x,RValue<UShort4> y)3980 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3981 {
3982 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3983 }
3984 
pmaddwd(RValue<Short4> x,RValue<Short4> y)3985 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3986 {
3987 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3988 }
3989 
pmulhw(RValue<Short8> x,RValue<Short8> y)3990 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3991 {
3992 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3993 }
3994 
pmulhuw(RValue<UShort8> x,RValue<UShort8> y)3995 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3996 {
3997 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3998 }
3999 
pmaddwd(RValue<Short8> x,RValue<Short8> y)4000 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
4001 {
4002 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
4003 }
4004 
movmskps(RValue<Float4> x)4005 RValue<Int> movmskps(RValue<Float4> x)
4006 {
4007 	Value *v = x.value();
4008 
4009 	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
4010 	// which makes it look at the entire 128-bit input for undefined bits. Mask off
4011 	// just the sign bits to avoid false positives.
4012 	if(__has_feature(memory_sanitizer))
4013 	{
4014 		v = As<Float4>(As<Int4>(v) & Int4(0x80000000u)).value();
4015 	}
4016 
4017 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_movmsk_ps, v));
4018 }
4019 
pmovmskb(RValue<Byte8> x)4020 RValue<Int> pmovmskb(RValue<Byte8> x)
4021 {
4022 	Value *v = x.value();
4023 
4024 	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
4025 	// which makes it look at the entire 128-bit input for undefined bits. Mask off
4026 	// just the sign bits in the lower 64-bit vector to avoid false positives.
4027 	if(__has_feature(memory_sanitizer))
4028 	{
4029 		v = As<Byte16>(As<Int4>(v) & Int4(0x80808080u, 0x80808080u, 0, 0)).value();
4030 	}
4031 
4032 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse2_pmovmskb_128, v)) & 0xFF;
4033 }
4034 
4035 }  // namespace x86
4036 #endif  // defined(__i386__) || defined(__x86_64__)
4037 
4038 #ifdef ENABLE_RR_PRINT
VPrintf(const std::vector<Value * > & vals)4039 void VPrintf(const std::vector<Value *> &vals)
4040 {
4041 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
4042 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4043 	auto funcTy = llvm::FunctionType::get(i32Ty, { i8PtrTy }, true);
4044 	auto func = jit->module->getOrInsertFunction("rr::DebugPrintf", funcTy);
4045 	jit->builder->CreateCall(func, V(vals));
4046 }
4047 #endif  // ENABLE_RR_PRINT
4048 
Nop()4049 void Nop()
4050 {
4051 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4052 	auto funcTy = llvm::FunctionType::get(voidTy, {}, false);
4053 	auto func = jit->module->getOrInsertFunction("nop", funcTy);
4054 	jit->builder->CreateCall(func);
4055 }
4056 
EmitDebugLocation()4057 void EmitDebugLocation()
4058 {
4059 #ifdef ENABLE_RR_DEBUG_INFO
4060 	if(jit->debugInfo != nullptr)
4061 	{
4062 		jit->debugInfo->EmitLocation();
4063 	}
4064 #endif  // ENABLE_RR_DEBUG_INFO
4065 }
4066 
EmitDebugVariable(Value * value)4067 void EmitDebugVariable(Value *value)
4068 {
4069 #ifdef ENABLE_RR_DEBUG_INFO
4070 	if(jit->debugInfo != nullptr)
4071 	{
4072 		jit->debugInfo->EmitVariable(value);
4073 	}
4074 #endif  // ENABLE_RR_DEBUG_INFO
4075 }
4076 
FlushDebug()4077 void FlushDebug()
4078 {
4079 #ifdef ENABLE_RR_DEBUG_INFO
4080 	if(jit->debugInfo != nullptr)
4081 	{
4082 		jit->debugInfo->Flush();
4083 	}
4084 #endif  // ENABLE_RR_DEBUG_INFO
4085 }
4086 
4087 }  // namespace rr
4088 
4089 // ------------------------------  Coroutines ------------------------------
4090 
4091 namespace {
4092 
4093 // Magic values returned by llvm.coro.suspend.
4094 // See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
4095 enum SuspendAction
4096 {
4097 	SuspendActionSuspend = -1,
4098 	SuspendActionResume = 0,
4099 	SuspendActionDestroy = 1
4100 };
4101 
4102 void promoteFunctionToCoroutine()
4103 {
4104 	ASSERT(jit->coroutine.id == nullptr);
4105 
4106 	// Types
4107 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4108 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4109 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4110 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
4111 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4112 	auto promiseTy = jit->coroutine.yieldType;
4113 	auto promisePtrTy = promiseTy->getPointerTo();
4114 
4115 	// LLVM intrinsics
4116 	auto coro_id = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
4117 	auto coro_size = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
4118 	auto coro_begin = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
4119 	auto coro_resume = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_resume);
4120 	auto coro_end = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_end);
4121 	auto coro_free = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_free);
4122 	auto coro_destroy = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_destroy);
4123 	auto coro_promise = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_promise);
4124 	auto coro_done = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_done);
4125 	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4126 
4127 	auto allocFrameTy = llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
4128 	auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
4129 	auto freeFrameTy = llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
4130 	auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);
4131 
4132 	auto oldInsertionPoint = jit->builder->saveIP();
4133 
4134 	// Build the coroutine_await() function:
4135 	//
4136 	//    bool coroutine_await(CoroutineHandle* handle, YieldType* out)
4137 	//    {
4138 	//        if(llvm.coro.done(handle))
4139 	//        {
4140 	//            return false;
4141 	//        }
4142 	//        else
4143 	//        {
4144 	//            *out = *(YieldType*)llvm.coro.promise(handle);
4145 	//            llvm.coro.resume(handle);
4146 	//            return true;
4147 	//        }
4148 	//    }
4149 	//
4150 	{
4151 		auto args = jit->coroutine.await->arg_begin();
4152 		auto handle = args++;
4153 		auto outPtr = args++;
4154 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "co_await", jit->coroutine.await));
4155 		auto doneBlock = llvm::BasicBlock::Create(*jit->context, "done", jit->coroutine.await);
4156 		auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->coroutine.await);
4157 
4158 		auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
4159 		jit->builder->CreateCondBr(done, doneBlock, resumeBlock);
4160 
4161 		jit->builder->SetInsertPoint(doneBlock);
4162 		jit->builder->CreateRet(llvm::ConstantInt::getFalse(i1Ty));
4163 
4164 		jit->builder->SetInsertPoint(resumeBlock);
4165 		auto promiseAlignment = llvm::ConstantInt::get(i32Ty, 4);  // TODO: Get correct alignment.
4166 		auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, llvm::ConstantInt::get(i1Ty, 0) });
4167 		auto promise = jit->builder->CreateLoad(promiseTy, jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
4168 		jit->builder->CreateStore(promise, outPtr);
4169 		jit->builder->CreateCall(coro_resume, { handle });
4170 		jit->builder->CreateRet(llvm::ConstantInt::getTrue(i1Ty));
4171 	}
4172 
4173 	// Build the coroutine_destroy() function:
4174 	//
4175 	//    void coroutine_destroy(CoroutineHandle* handle)
4176 	//    {
4177 	//        llvm.coro.destroy(handle);
4178 	//    }
4179 	//
4180 	{
4181 		auto handle = jit->coroutine.destroy->arg_begin();
4182 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4183 		jit->builder->CreateCall(coro_destroy, { handle });
4184 		jit->builder->CreateRetVoid();
4185 	}
4186 
4187 	// Begin building the main coroutine_begin() function.
4188 	//
4189 	//    CoroutineHandle* coroutine_begin(<Arguments>)
4190 	//    {
4191 	//        YieldType promise;
4192 	//        auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
4193 	//        void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
4194 	//        CoroutineHandle *handle = llvm.coro.begin(id, frame);
4195 	//
4196 	//        ... <REACTOR CODE> ...
4197 	//
4198 	//    end:
4199 	//        SuspendAction action = llvm.coro.suspend(none, true /* final */);  // <-- RESUME POINT
4200 	//        switch(action)
4201 	//        {
4202 	//        case SuspendActionResume:
4203 	//            UNREACHABLE(); // Illegal to resume after final suspend.
4204 	//        case SuspendActionDestroy:
4205 	//            goto destroy;
4206 	//        default: // (SuspendActionSuspend)
4207 	//            goto suspend;
4208 	//        }
4209 	//
4210 	//    destroy:
4211 	//        coroutine_free_frame(llvm.coro.free(id, handle));
4212 	//        goto suspend;
4213 	//
4214 	//    suspend:
4215 	//        llvm.coro.end(handle, false);
4216 	//        return handle;
4217 	//    }
4218 	//
4219 
4220 #ifdef ENABLE_RR_DEBUG_INFO
4221 	jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
4222 #endif  // ENABLE_RR_DEBUG_INFO
4223 
4224 	jit->coroutine.suspendBlock = llvm::BasicBlock::Create(*jit->context, "suspend", jit->function);
4225 	jit->coroutine.endBlock = llvm::BasicBlock::Create(*jit->context, "end", jit->function);
4226 	jit->coroutine.destroyBlock = llvm::BasicBlock::Create(*jit->context, "destroy", jit->function);
4227 
4228 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
4229 	jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
4230 	jit->coroutine.id = jit->builder->CreateCall(coro_id, {
4231 	                                                          llvm::ConstantInt::get(i32Ty, 0),
4232 	                                                          jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
4233 	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
4234 	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
4235 	                                                      });
4236 	auto size = jit->builder->CreateCall(coro_size, {});
4237 	auto frame = jit->builder->CreateCall(allocFrame, { size });
4238 	jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });
4239 
4240 	// Build the suspend block
4241 	jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
4242 	jit->builder->CreateCall(coro_end, { jit->coroutine.handle, llvm::ConstantInt::get(i1Ty, 0) });
4243 	jit->builder->CreateRet(jit->coroutine.handle);
4244 
4245 	// Build the end block
4246 	jit->builder->SetInsertPoint(jit->coroutine.endBlock);
4247 	auto action = jit->builder->CreateCall(coro_suspend, {
4248 	                                                         llvm::ConstantTokenNone::get(*jit->context),
4249 	                                                         llvm::ConstantInt::get(i1Ty, 1),  // final: true
4250 	                                                     });
4251 	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4252 	// switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
4253 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4254 
4255 	// Build the destroy block
4256 	jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
4257 	auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
4258 	jit->builder->CreateCall(freeFrame, { memory });
4259 	jit->builder->CreateBr(jit->coroutine.suspendBlock);
4260 
4261 	// Switch back to original insert point to continue building the coroutine.
4262 	jit->builder->restoreIP(oldInsertionPoint);
4263 }
4264 
4265 }  // anonymous namespace
4266 
4267 namespace rr {
4268 
createCoroutine(Type * YieldType,const std::vector<Type * > & Params)4269 void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
4270 {
4271 	// Coroutines are initially created as a regular function.
4272 	// Upon the first call to Yield(), the function is promoted to a true
4273 	// coroutine.
4274 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4275 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4276 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4277 	auto handleTy = i8PtrTy;
4278 	auto boolTy = i1Ty;
4279 	auto promiseTy = T(YieldType);
4280 	auto promisePtrTy = promiseTy->getPointerTo();
4281 
4282 	jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
4283 	jit->function->addFnAttr("coroutine.presplit", "0");
4284 	jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
4285 	jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
4286 	jit->coroutine.yieldType = promiseTy;
4287 	jit->coroutine.entryBlock = llvm::BasicBlock::Create(*jit->context, "function", jit->function);
4288 
4289 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
4290 }
4291 
4292 void Nucleus::yield(Value *val)
4293 {
4294 	if(jit->coroutine.id == nullptr)
4295 	{
4296 		// First call to yield().
4297 		// Promote the function to a full coroutine.
4298 		promoteFunctionToCoroutine();
4299 		ASSERT(jit->coroutine.id != nullptr);
4300 	}
4301 
4302 	//      promise = val;
4303 	//
4304 	//      auto action = llvm.coro.suspend(none, false /* final */); // <-- RESUME POINT
4305 	//      switch(action)
4306 	//      {
4307 	//      case SuspendActionResume:
4308 	//          goto resume;
4309 	//      case SuspendActionDestroy:
4310 	//          goto destroy;
4311 	//      default: // (SuspendActionSuspend)
4312 	//          goto suspend;
4313 	//      }
4314 	//  resume:
4315 	//
4316 
4317 	RR_DEBUG_INFO_UPDATE_LOC();
4318 	Variable::materializeAll();
4319 
4320 	// Types
4321 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4322 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4323 
4324 	// Intrinsics
4325 	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4326 
4327 	// Create a block to resume execution.
4328 	auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->function);
4329 
4330 	// Store the promise (yield value)
4331 	jit->builder->CreateStore(V(val), jit->coroutine.promise);
4332 	auto action = jit->builder->CreateCall(coro_suspend, {
4333 	                                                         llvm::ConstantTokenNone::get(*jit->context),
4334 	                                                         llvm::ConstantInt::get(i1Ty, 0),  // final: false
4335 	                                                     });
4336 	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4337 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
4338 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4339 
4340 	// Continue building in the resume block.
4341 	jit->builder->SetInsertPoint(resumeBlock);
4342 }
4343 
acquireCoroutine(const char * name,const Config::Edit * cfgEdit)4344 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit *cfgEdit /* = nullptr */)
4345 {
4346 	if(jit->coroutine.id)
4347 	{
4348 		jit->builder->CreateBr(jit->coroutine.endBlock);
4349 	}
4350 	else
4351 	{
4352 		// Coroutine without a Yield acts as a regular function.
4353 		// The 'coroutine_begin' function returns a nullptr for the coroutine
4354 		// handle.
4355 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4356 		// The 'coroutine_await' function always returns false (coroutine done).
4357 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.await));
4358 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
4359 		// The 'coroutine_destroy' function does nothing and returns void.
4360 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4361 		jit->builder->CreateRetVoid();
4362 	}
4363 
4364 #ifdef ENABLE_RR_DEBUG_INFO
4365 	if(jit->debugInfo != nullptr)
4366 	{
4367 		jit->debugInfo->Finalize();
4368 	}
4369 #endif  // ENABLE_RR_DEBUG_INFO
4370 
4371 	if(false)
4372 	{
4373 		std::error_code error;
4374 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4375 		jit->module->print(file, 0);
4376 	}
4377 
4378 	Config cfg = jit->config;
4379 	if(cfgEdit)
4380 	{
4381 		cfg = cfgEdit->apply(jit->config);
4382 	}
4383 	jit->runPasses(cfg);
4384 
4385 	if(false)
4386 	{
4387 		std::error_code error;
4388 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4389 		jit->module->print(file, 0);
4390 	}
4391 
4392 	llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4393 	funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4394 	funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4395 	funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4396 
4397 	auto routine = jit->acquireRoutine(name, funcs, Nucleus::CoroutineEntryCount, cfg);
4398 
4399 	delete jit;
4400 	jit = nullptr;
4401 
4402 	return routine;
4403 }
4404 
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4405 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4406 {
4407 	return func();
4408 }
4409 
4410 }  // namespace rr
4411