1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "LLVMReactor.hpp"
16 
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "EmulatedIntrinsics.hpp"
20 #include "LLVMReactorDebugInfo.hpp"
21 #include "Print.hpp"
22 #include "Reactor.hpp"
23 #include "x86.hpp"
24 
25 #include "llvm/IR/Intrinsics.h"
26 #include "llvm/IR/IntrinsicsX86.h"
27 #include "llvm/IR/LegacyPassManager.h"
28 #include "llvm/IR/Verifier.h"
29 #include "llvm/Support/Alignment.h"
30 #include "llvm/Support/ManagedStatic.h"
31 #include "llvm/Transforms/Coroutines.h"
32 #include "llvm/Transforms/IPO.h"
33 #include "llvm/Transforms/Scalar.h"
34 
35 #include <fstream>
36 #include <iostream>
37 #include <mutex>
38 #include <numeric>
39 #include <thread>
40 #include <unordered_map>
41 
42 #if defined(__i386__) || defined(__x86_64__)
43 #	include <xmmintrin.h>
44 #endif
45 
46 #include <math.h>
47 
48 #if defined(__x86_64__) && defined(_WIN32)
49 extern "C" void X86CompilationCallback()
50 {
51 	UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
52 }
53 #endif
54 
55 #if !LLVM_ENABLE_THREADS
56 #	error "LLVM_ENABLE_THREADS needs to be enabled"
57 #endif
58 
59 #if LLVM_VERSION_MAJOR < 11
60 namespace llvm {
61 using FixedVectorType = VectorType;
62 }  // namespace llvm
63 #endif
64 
65 namespace {
66 
67 // Used to automatically invoke llvm_shutdown() when driver is unloaded
68 llvm::llvm_shutdown_obj llvmShutdownObj;
69 
70 // This has to be a raw pointer because glibc 2.17 doesn't support __cxa_thread_atexit_impl
71 // for destructing objects at exit. See crbug.com/1074222
72 thread_local rr::JITBuilder *jit = nullptr;
73 
74 // Default configuration settings. Must be accessed under mutex lock.
75 std::mutex defaultConfigLock;
76 rr::Config &defaultConfig()
77 {
78 	// This uses a static in a function to avoid the cost of a global static
79 	// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
80 	static rr::Config config = rr::Config::Edit()
81 	                               .add(rr::Optimization::Pass::ScalarReplAggregates)
82 	                               .add(rr::Optimization::Pass::InstructionCombining)
83 	                               .apply({});
84 	return config;
85 }
86 
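// Emulates a rounded average: zero-extends both operands, computes
// (x + y + 1) >> 1 without overflow, and truncates back to the original type.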
87 llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
88 {
89 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
90 
91 	llvm::VectorType *extTy =
92 	    llvm::VectorType::getExtendedElementVectorType(ty);
93 	x = jit->builder->CreateZExt(x, extTy);
94 	y = jit->builder->CreateZExt(y, extTy);
95 
96 	// (x + y + 1) >> 1
97 	llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
98 	llvm::Value *res = jit->builder->CreateAdd(x, y);
99 	res = jit->builder->CreateAdd(res, one);
100 	res = jit->builder->CreateLShr(res, one);
101 	return jit->builder->CreateTrunc(res, ty);
102 }
103 
104 llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
105                           llvm::ICmpInst::Predicate pred)
106 {
107 	return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
108 }
109 
110 llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
111                        llvm::Value *y, llvm::Type *dstTy)
112 {
113 	return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
114 }
115 
116 #if defined(__i386__) || defined(__x86_64__)
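// Widens the low elements of 'op' to the destination element type, using sign
// or zero extension depending on 'sext'.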
117 llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
118 {
119 	llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
120 	llvm::FixedVectorType *dstTy = llvm::cast<llvm::FixedVectorType>(dstType);
121 
122 	llvm::Value *undef = llvm::UndefValue::get(srcTy);
123 	llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
124 	std::iota(mask.begin(), mask.end(), 0);
125 	llvm::Value *v = jit->builder->CreateShuffleVector(op, undef, mask);
126 
127 	return sext ? jit->builder->CreateSExt(v, dstTy)
128 	            : jit->builder->CreateZExt(v, dstTy);
129 }
130 
131 llvm::Value *lowerPABS(llvm::Value *v)
132 {
133 	llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
134 	llvm::Value *cmp = jit->builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero);
135 	llvm::Value *neg = jit->builder->CreateNeg(v);
136 	return jit->builder->CreateSelect(cmp, v, neg);
137 }
138 #endif  // defined(__i386__) || defined(__x86_64__)
139 
140 #if !defined(__i386__) && !defined(__x86_64__)
141 llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
142                            llvm::FCmpInst::Predicate pred)
143 {
144 	return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
145 }
146 
147 llvm::Value *lowerRound(llvm::Value *x)
148 {
149 	llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
150 	    jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
151 	return jit->builder->CreateCall(nearbyint, { x });
152 }
153 
154 llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
155 {
156 	return jit->builder->CreateFPToSI(lowerRound(x), ty);
157 }
158 
159 llvm::Value *lowerFloor(llvm::Value *x)
160 {
161 	llvm::Function *floor = llvm::Intrinsic::getDeclaration(
162 	    jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
163 	return jit->builder->CreateCall(floor, { x });
164 }
165 
166 llvm::Value *lowerTrunc(llvm::Value *x)
167 {
168 	llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
169 	    jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
170 	return jit->builder->CreateCall(trunc, { x });
171 }
172 
173 llvm::Value *lowerSQRT(llvm::Value *x)
174 {
175 	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
176 	    jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
177 	return jit->builder->CreateCall(sqrt, { x });
178 }
179 
180 llvm::Value *lowerRCP(llvm::Value *x)
181 {
182 	llvm::Type *ty = x->getType();
183 	llvm::Constant *one;
184 	if(llvm::FixedVectorType *vectorTy = llvm::dyn_cast<llvm::FixedVectorType>(ty))
185 	{
186 		one = llvm::ConstantVector::getSplat(
187 #	if LLVM_VERSION_MAJOR >= 11
188 		    vectorTy->getElementCount(),
189 #	else
190 		    vectorTy->getNumElements(),
191 #	endif
192 		    llvm::ConstantFP::get(vectorTy->getElementType(), 1));
193 	}
194 	else
195 	{
196 		one = llvm::ConstantFP::get(ty, 1);
197 	}
198 	return jit->builder->CreateFDiv(one, x);
199 }
200 
201 llvm::Value *lowerRSQRT(llvm::Value *x)
202 {
203 	return lowerRCP(lowerSQRT(x));
204 }
205 
206 llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
207 {
208 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
209 	llvm::Value *y = llvm::ConstantVector::getSplat(
210 #	if LLVM_VERSION_MAJOR >= 11
211 	    ty->getElementCount(),
212 #	else
213 	    ty->getNumElements(),
214 #	endif
215 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
216 	return jit->builder->CreateShl(x, y);
217 }
218 
219 llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
220 {
221 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
222 	llvm::Value *y = llvm::ConstantVector::getSplat(
223 #	if LLVM_VERSION_MAJOR >= 11
224 	    ty->getElementCount(),
225 #	else
226 	    ty->getNumElements(),
227 #	endif
228 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
229 	return jit->builder->CreateAShr(x, y);
230 }
231 
232 llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
233 {
234 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
235 	llvm::Value *y = llvm::ConstantVector::getSplat(
236 #	if LLVM_VERSION_MAJOR >= 11
237 	    ty->getElementCount(),
238 #	else
239 	    ty->getNumElements(),
240 #	endif
241 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
242 	return jit->builder->CreateLShr(x, y);
243 }
244 
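// Sign-extends both operands, multiplies element-wise, then adds each adjacent
// (even, odd) pair of products, yielding half as many double-width elements.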
245 llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
246 {
247 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
248 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
249 
250 	llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
251 	llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
252 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
253 
254 	llvm::Value *undef = llvm::UndefValue::get(extTy);
255 
256 	llvm::SmallVector<uint32_t, 16> evenIdx;
257 	llvm::SmallVector<uint32_t, 16> oddIdx;
258 	for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
259 	{
260 		evenIdx.push_back(i);
261 		oddIdx.push_back(i + 1);
262 	}
263 
264 	llvm::Value *lhs = jit->builder->CreateShuffleVector(mult, undef, evenIdx);
265 	llvm::Value *rhs = jit->builder->CreateShuffleVector(mult, undef, oddIdx);
266 	return jit->builder->CreateAdd(lhs, rhs);
267 }
268 
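// Saturating pack: clamps both inputs to the range of the narrower destination
// element type, truncates them, and concatenates the results into one vector.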
269 llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
270 {
271 	llvm::FixedVectorType *srcTy = llvm::cast<llvm::FixedVectorType>(x->getType());
272 	llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
273 
274 	llvm::IntegerType *dstElemTy =
275 	    llvm::cast<llvm::IntegerType>(dstTy->getElementType());
276 
277 	uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
278 	ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
279 	llvm::Constant *max, *min;
280 	if(isSigned)
281 	{
282 		max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
283 		min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
284 	}
285 	else
286 	{
287 		max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
288 		min = llvm::ConstantInt::get(srcTy, 0, false);
289 	}
290 
291 	x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
292 	x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
293 	y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
294 	y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
295 
296 	x = jit->builder->CreateTrunc(x, dstTy);
297 	y = jit->builder->CreateTrunc(y, dstTy);
298 
299 	llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
300 	std::iota(index.begin(), index.end(), 0);
301 
302 	return jit->builder->CreateShuffleVector(x, y, index);
303 }
304 
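// Collects the sign bit of each element into the corresponding bit of a scalar
// integer (movmsk-style).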
305 llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
306 {
307 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
308 	llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
309 	llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
310 
311 	llvm::Value *ret = jit->builder->CreateZExt(
312 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
313 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
314 	{
315 		llvm::Value *elem = jit->builder->CreateZExt(
316 		    jit->builder->CreateExtractElement(cmp, i), retTy);
317 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
318 	}
319 	return ret;
320 }
321 
322 llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
323 {
324 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
325 	llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
326 	llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
327 
328 	llvm::Value *ret = jit->builder->CreateZExt(
329 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
330 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
331 	{
332 		llvm::Value *elem = jit->builder->CreateZExt(
333 		    jit->builder->CreateExtractElement(cmp, i), retTy);
334 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
335 	}
336 	return ret;
337 }
338 #endif  // !defined(__i386__) && !defined(__x86_64__)
339 
340 llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
341 {
342 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
343 }
344 
345 llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
346 {
347 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
348 }
349 
350 llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
351 {
352 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
353 }
354 
355 llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
356 {
357 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
358 }
359 
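// Returns the high half of the widened element-wise product of x and y.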
360 llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
361 {
362 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
363 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
364 
365 	llvm::Value *extX, *extY;
366 	if(sext)
367 	{
368 		extX = jit->builder->CreateSExt(x, extTy);
369 		extY = jit->builder->CreateSExt(y, extTy);
370 	}
371 	else
372 	{
373 		extX = jit->builder->CreateZExt(x, extTy);
374 		extY = jit->builder->CreateZExt(y, extTy);
375 	}
376 
377 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
378 
379 	llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
380 	llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
381 	return jit->builder->CreateTrunc(mulh, ty);
382 }
383 
384 }  // namespace
385 
386 namespace rr {
387 
388 std::string BackendName()
389 {
390 	return std::string("LLVM ") + LLVM_VERSION_STRING;
391 }
392 
393 const Capabilities Caps = {
394 	true,  // CoroutinesSupported
395 };
396 
397 // The abstract Type* types are implemented as LLVM types, except that
398 // 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
399 // and VFP in ARM, and eliminate the overhead of converting them to explicit
400 // 128-bit ones. LLVM types are pointers, so we can represent emulated types
401 // as abstract pointers with small enum values.
402 enum InternalType : uintptr_t
403 {
404 	// Emulated types:
405 	Type_v2i32,
406 	Type_v4i16,
407 	Type_v2i16,
408 	Type_v8i8,
409 	Type_v4i8,
410 	Type_v2f32,
411 	EmulatedTypeCount,
412 	// Returned by asInternalType() to indicate that the abstract Type*
413 	// should be interpreted as LLVM type pointer:
414 	Type_LLVM
415 };
416 
417 inline InternalType asInternalType(Type *type)
418 {
419 	InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
420 	return (t < EmulatedTypeCount) ? t : Type_LLVM;
421 }
422 
423 llvm::Type *T(Type *t)
424 {
425 	// Use 128-bit vectors to implement logically shorter ones.
426 	switch(asInternalType(t))
427 	{
428 	case Type_v2i32: return T(Int4::type());
429 	case Type_v4i16: return T(Short8::type());
430 	case Type_v2i16: return T(Short8::type());
431 	case Type_v8i8: return T(Byte16::type());
432 	case Type_v4i8: return T(Byte16::type());
433 	case Type_v2f32: return T(Float4::type());
434 	case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
435 	default:
436 		UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
437 		return nullptr;
438 	}
439 }
440 
441 Type *T(InternalType t)
442 {
443 	return reinterpret_cast<Type *>(t);
444 }
445 
446 inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
447 {
448 	return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
449 }
450 
451 inline llvm::BasicBlock *B(BasicBlock *t)
452 {
453 	return reinterpret_cast<llvm::BasicBlock *>(t);
454 }
455 
456 inline BasicBlock *B(llvm::BasicBlock *t)
457 {
458 	return reinterpret_cast<BasicBlock *>(t);
459 }
460 
461 static size_t typeSize(Type *type)
462 {
463 	switch(asInternalType(type))
464 	{
465 	case Type_v2i32: return 8;
466 	case Type_v4i16: return 8;
467 	case Type_v2i16: return 4;
468 	case Type_v8i8: return 8;
469 	case Type_v4i8: return 4;
470 	case Type_v2f32: return 8;
471 	case Type_LLVM:
472 		{
473 			llvm::Type *t = T(type);
474 
475 			if(t->isPointerTy())
476 			{
477 				return sizeof(void *);
478 			}
479 
480 			// At this point we should only have LLVM 'primitive' types.
481 			unsigned int bits = t->getPrimitiveSizeInBits();
482 			ASSERT_MSG(bits != 0, "bits: %d", int(bits));
483 
484 			// TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
485 			// but are typically stored as one byte. The DataLayout structure should
486 			// be used here and many other places if this assumption fails.
487 			return (bits + 7) / 8;
488 		}
489 		break;
490 	default:
491 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
492 		return 0;
493 	}
494 }
495 
496 static unsigned int elementCount(Type *type)
497 {
498 	switch(asInternalType(type))
499 	{
500 	case Type_v2i32: return 2;
501 	case Type_v4i16: return 4;
502 	case Type_v2i16: return 2;
503 	case Type_v8i8: return 8;
504 	case Type_v4i8: return 4;
505 	case Type_v2f32: return 2;
506 	case Type_LLVM: return llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();
507 	default:
508 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
509 		return 0;
510 	}
511 }
512 
513 static llvm::Function *createFunction(const char *name, llvm::Type *retTy, const std::vector<llvm::Type *> &params)
514 {
515 	llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
516 	auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
517 
518 	func->setLinkage(llvm::GlobalValue::ExternalLinkage);
519 	func->setDoesNotThrow();
520 	func->setCallingConv(llvm::CallingConv::C);
521 
522 	if(__has_feature(memory_sanitizer))
523 	{
524 		func->addFnAttr(llvm::Attribute::SanitizeMemory);
525 	}
526 
527 	return func;
528 }
529 
530 Nucleus::Nucleus()
531 {
532 #if !__has_feature(memory_sanitizer)
533 	// thread_local variables in shared libraries are initialized at load-time,
534 	// but this is not observed by MemorySanitizer if the loader itself was not
535 	// instrumented, leading to false-positive uninitialized variable errors.
536 	ASSERT(jit == nullptr);
537 	ASSERT(Variable::unmaterializedVariables == nullptr);
538 #endif
539 
540 	jit = new JITBuilder(Nucleus::getDefaultConfig());
541 	Variable::unmaterializedVariables = new Variable::UnmaterializedVariables();
542 }
543 
544 Nucleus::~Nucleus()
545 {
546 	delete Variable::unmaterializedVariables;
547 	Variable::unmaterializedVariables = nullptr;
548 
549 	delete jit;
550 	jit = nullptr;
551 }
552 
553 void Nucleus::setDefaultConfig(const Config &cfg)
554 {
555 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
556 	::defaultConfig() = cfg;
557 }
558 
559 void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
560 {
561 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
562 	auto &config = ::defaultConfig();
563 	config = cfgEdit.apply(config);
564 }
565 
566 Config Nucleus::getDefaultConfig()
567 {
568 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
569 	return ::defaultConfig();
570 }
571 
572 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit *cfgEdit /* = nullptr */)
573 {
574 	if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
575 	{
576 		llvm::Type *type = jit->function->getReturnType();
577 
578 		if(type->isVoidTy())
579 		{
580 			createRetVoid();
581 		}
582 		else
583 		{
584 			createRet(V(llvm::UndefValue::get(type)));
585 		}
586 	}
587 
588 	std::shared_ptr<Routine> routine;
589 
590 	auto acquire = [&](rr::JITBuilder *jit) {
591 		// ::jit is thread-local, so when this is executed on a separate thread (see JIT_IN_SEPARATE_THREAD)
592 		// it needs to only use the jit variable passed in as an argument.
593 
594 		Config cfg = jit->config;
595 		if(cfgEdit)
596 		{
597 			cfg = cfgEdit->apply(jit->config);
598 		}
599 
600 #ifdef ENABLE_RR_DEBUG_INFO
601 		if(jit->debugInfo != nullptr)
602 		{
603 			jit->debugInfo->Finalize();
604 		}
605 #endif  // ENABLE_RR_DEBUG_INFO
606 
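		// Flip the following condition to true to dump the unoptimized LLVM IR to a file.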
607 		if(false)
608 		{
609 			std::error_code error;
610 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
611 			jit->module->print(file, 0);
612 		}
613 
614 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
615 		{
616 			llvm::legacy::PassManager pm;
617 			pm.add(llvm::createVerifierPass());
618 			pm.run(*jit->module);
619 		}
620 #endif  // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
621 
622 		jit->optimize(cfg);
623 
624 		if(false)
625 		{
626 			std::error_code error;
627 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
628 			jit->module->print(file, 0);
629 		}
630 
631 		routine = jit->acquireRoutine(name, &jit->function, 1, cfg);
632 	};
633 
634 #ifdef JIT_IN_SEPARATE_THREAD
635 	// Perform optimizations and codegen in a separate thread to avoid stack overflow.
636 	// FIXME(b/149829034): This is not a long-term solution. Reactor has no control
637 	// over the threading and stack sizes of its users, so this should be addressed
638 	// at a higher level instead.
639 	std::thread thread(acquire, jit);
640 	thread.join();
641 #else
642 	acquire(jit);
643 #endif
644 
645 	return routine;
646 }
647 
648 Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
649 {
650 	// Need to allocate it in the entry block for mem2reg to work
651 	llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
652 
653 	llvm::Instruction *declaration;
654 
655 #if LLVM_VERSION_MAJOR >= 11
656 	auto align = jit->module->getDataLayout().getPrefTypeAlign(T(type));
657 #else
658 	auto align = llvm::MaybeAlign(jit->module->getDataLayout().getPrefTypeAlignment(T(type)));
659 #endif
660 
661 	if(arraySize)
662 	{
663 		Value *size = (sizeof(size_t) == 8) ? Nucleus::createConstantLong(arraySize) : Nucleus::createConstantInt(arraySize);
664 		declaration = new llvm::AllocaInst(T(type), 0, V(size), align);
665 	}
666 	else
667 	{
668 		declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr, align);
669 	}
670 
671 	entryBlock.getInstList().push_front(declaration);
672 
673 	return V(declaration);
674 }
675 
676 BasicBlock *Nucleus::createBasicBlock()
677 {
678 	return B(llvm::BasicBlock::Create(*jit->context, "", jit->function));
679 }
680 
681 BasicBlock *Nucleus::getInsertBlock()
682 {
683 	return B(jit->builder->GetInsertBlock());
684 }
685 
686 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
687 {
688 	// assert(jit->builder->GetInsertBlock()->back().isTerminator());
689 
690 	jit->builder->SetInsertPoint(B(basicBlock));
691 }
692 
693 void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
694 {
695 	jit->function = rr::createFunction("", T(ReturnType), T(Params));
696 
697 #ifdef ENABLE_RR_DEBUG_INFO
698 	jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
699 #endif  // ENABLE_RR_DEBUG_INFO
700 
701 	jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->function));
702 }
703 
704 Value *Nucleus::getArgument(unsigned int index)
705 {
706 	llvm::Function::arg_iterator args = jit->function->arg_begin();
707 
708 	while(index)
709 	{
710 		args++;
711 		index--;
712 	}
713 
714 	return V(&*args);
715 }
716 
717 void Nucleus::createRetVoid()
718 {
719 	RR_DEBUG_INFO_UPDATE_LOC();
720 
721 	ASSERT_MSG(jit->function->getReturnType() == T(Void::type()), "Return type mismatch");
722 
723 	// Code generated after this point is unreachable, so any variables
724 	// being read can safely return an undefined value. We have to avoid
725 	// materializing variables after the terminator ret instruction.
726 	Variable::killUnmaterialized();
727 
728 	jit->builder->CreateRetVoid();
729 }
730 
731 void Nucleus::createRet(Value *v)
732 {
733 	RR_DEBUG_INFO_UPDATE_LOC();
734 
735 	ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
736 
737 	// Code generated after this point is unreachable, so any variables
738 	// being read can safely return an undefined value. We have to avoid
739 	// materializing variables after the terminator ret instruction.
740 	Variable::killUnmaterialized();
741 
742 	jit->builder->CreateRet(V(v));
743 }
744 
745 void Nucleus::createBr(BasicBlock *dest)
746 {
747 	RR_DEBUG_INFO_UPDATE_LOC();
748 	Variable::materializeAll();
749 
750 	jit->builder->CreateBr(B(dest));
751 }
752 
753 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
754 {
755 	RR_DEBUG_INFO_UPDATE_LOC();
756 	Variable::materializeAll();
757 	jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
758 }
759 
760 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
761 {
762 	RR_DEBUG_INFO_UPDATE_LOC();
763 	return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
764 }
765 
766 Value *Nucleus::createSub(Value *lhs, Value *rhs)
767 {
768 	RR_DEBUG_INFO_UPDATE_LOC();
769 	return V(jit->builder->CreateSub(V(lhs), V(rhs)));
770 }
771 
772 Value *Nucleus::createMul(Value *lhs, Value *rhs)
773 {
774 	RR_DEBUG_INFO_UPDATE_LOC();
775 	return V(jit->builder->CreateMul(V(lhs), V(rhs)));
776 }
777 
778 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
779 {
780 	RR_DEBUG_INFO_UPDATE_LOC();
781 	return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
782 }
783 
784 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
785 {
786 	RR_DEBUG_INFO_UPDATE_LOC();
787 	return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
788 }
789 
790 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
791 {
792 	RR_DEBUG_INFO_UPDATE_LOC();
793 	return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
794 }
795 
796 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
797 {
798 	RR_DEBUG_INFO_UPDATE_LOC();
799 	return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
800 }
801 
802 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
803 {
804 	RR_DEBUG_INFO_UPDATE_LOC();
805 	return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
806 }
807 
808 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
809 {
810 	RR_DEBUG_INFO_UPDATE_LOC();
811 	return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
812 }
813 
814 Value *Nucleus::createURem(Value *lhs, Value *rhs)
815 {
816 	RR_DEBUG_INFO_UPDATE_LOC();
817 	return V(jit->builder->CreateURem(V(lhs), V(rhs)));
818 }
819 
820 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
821 {
822 	RR_DEBUG_INFO_UPDATE_LOC();
823 	return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
824 }
825 
826 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
827 {
828 	RR_DEBUG_INFO_UPDATE_LOC();
829 	return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
830 }
831 
832 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
833 {
834 	return RValue<Float4>(Nucleus::createFRem(lhs.value(), rhs.value()));
835 }
836 
837 Value *Nucleus::createShl(Value *lhs, Value *rhs)
838 {
839 	RR_DEBUG_INFO_UPDATE_LOC();
840 	return V(jit->builder->CreateShl(V(lhs), V(rhs)));
841 }
842 
843 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
844 {
845 	RR_DEBUG_INFO_UPDATE_LOC();
846 	return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
847 }
848 
849 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
850 {
851 	RR_DEBUG_INFO_UPDATE_LOC();
852 	return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
853 }
854 
855 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
856 {
857 	RR_DEBUG_INFO_UPDATE_LOC();
858 	return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
859 }
860 
861 Value *Nucleus::createOr(Value *lhs, Value *rhs)
862 {
863 	RR_DEBUG_INFO_UPDATE_LOC();
864 	return V(jit->builder->CreateOr(V(lhs), V(rhs)));
865 }
866 
867 Value *Nucleus::createXor(Value *lhs, Value *rhs)
868 {
869 	RR_DEBUG_INFO_UPDATE_LOC();
870 	return V(jit->builder->CreateXor(V(lhs), V(rhs)));
871 }
872 
873 Value *Nucleus::createNeg(Value *v)
874 {
875 	RR_DEBUG_INFO_UPDATE_LOC();
876 	return V(jit->builder->CreateNeg(V(v)));
877 }
878 
879 Value *Nucleus::createFNeg(Value *v)
880 {
881 	RR_DEBUG_INFO_UPDATE_LOC();
882 	return V(jit->builder->CreateFNeg(V(v)));
883 }
884 
885 Value *Nucleus::createNot(Value *v)
886 {
887 	RR_DEBUG_INFO_UPDATE_LOC();
888 	return V(jit->builder->CreateNot(V(v)));
889 }
890 
891 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
892 {
893 	RR_DEBUG_INFO_UPDATE_LOC();
894 	switch(asInternalType(type))
895 	{
896 	case Type_v2i32:
897 	case Type_v4i16:
898 	case Type_v8i8:
899 	case Type_v2f32:
900 		return createBitCast(
901 		    createInsertElement(
902 		        V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false))),
903 		        createLoad(createBitCast(ptr, Pointer<Long>::type()), Long::type(), isVolatile, alignment, atomic, memoryOrder),
904 		        0),
905 		    type);
906 	case Type_v2i16:
907 	case Type_v4i8:
908 		if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
909 		{
910 			Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false)));
911 			Value *i = createLoad(createBitCast(ptr, Pointer<Int>::type()), Int::type(), isVolatile, alignment, atomic, memoryOrder);
912 			i = createZExt(i, Long::type());
913 			Value *v = createInsertElement(u, i, 0);
914 			return createBitCast(v, type);
915 		}
916 		// Fallthrough to non-emulated case.
917 	case Type_LLVM:
918 		{
919 			auto elTy = T(type);
920 			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
921 
922 			if(!atomic)
923 			{
924 				return V(jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile));
925 			}
926 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
927 			{
928 				// Integers and pointers can be atomically loaded by setting
929 				// the ordering constraint on the load instruction.
930 				auto load = jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile);
931 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
932 				return V(load);
933 			}
934 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
935 			{
936 				// LLVM claims to support atomic loads of float types as
937 				// above, but certain backends cannot deal with this.
938 				// Load as an integer and bitcast. See b/136037244.
939 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
940 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
941 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
942 				auto load = jit->builder->CreateAlignedLoad(elAsIntTy, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
943 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
944 				auto loadCast = jit->builder->CreateBitCast(load, elTy);
945 				return V(loadCast);
946 			}
947 			else
948 			{
949 				// More exotic types require falling back to the extern:
950 				// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
951 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
952 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
953 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
954 				auto i8PtrTy = i8Ty->getPointerTo();
955 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
956 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
957 				auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
958 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
959 				auto out = allocateStackVariable(type);
960 				jit->builder->CreateCall(func, {
961 				                                   llvm::ConstantInt::get(sizetTy, size),
962 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
963 				                                   jit->builder->CreatePointerCast(V(out), i8PtrTy),
964 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
965 				                               });
966 				return V(jit->builder->CreateLoad(T(type), V(out)));
967 			}
968 		}
969 	default:
970 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
971 		return nullptr;
972 	}
973 }
974 
975 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
976 {
977 	RR_DEBUG_INFO_UPDATE_LOC();
978 	switch(asInternalType(type))
979 	{
980 	case Type_v2i32:
981 	case Type_v4i16:
982 	case Type_v8i8:
983 	case Type_v2f32:
984 		createStore(
985 		    createExtractElement(
986 		        createBitCast(value, T(llvm::VectorType::get(T(Long::type()), 2, false))), Long::type(), 0),
987 		    createBitCast(ptr, Pointer<Long>::type()),
988 		    Long::type(), isVolatile, alignment, atomic, memoryOrder);
989 		return value;
990 	case Type_v2i16:
991 	case Type_v4i8:
992 		if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
993 		{
994 			createStore(
995 			    createExtractElement(createBitCast(value, Int4::type()), Int::type(), 0),
996 			    createBitCast(ptr, Pointer<Int>::type()),
997 			    Int::type(), isVolatile, alignment, atomic, memoryOrder);
998 			return value;
999 		}
1000 		// Fallthrough to non-emulated case.
1001 	case Type_LLVM:
1002 		{
1003 			auto elTy = T(type);
1004 			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
1005 
1006 			if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
1007 			{
1008 				// Mark all memory writes as initialized by calling __msan_unpoison
1009 				// void __msan_unpoison(const volatile void *a, size_t size)
1010 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
1011 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1012 				auto voidPtrTy = i8Ty->getPointerTo();
1013 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1014 				auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1015 				auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1016 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1017 
1018 				jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
1019 				                                 llvm::ConstantInt::get(sizetTy, size) });
1020 			}
1021 
1022 			if(!atomic)
1023 			{
1024 				jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1025 			}
1026 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
1027 			{
1028 				// Integers and pointers can be atomically stored by setting
1029 				// the ordering constraint on the store instruction.
1030 				auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1031 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1032 			}
1033 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
1034 			{
1035 				// LLVM claims to support atomic stores of float types as
1036 				// above, but certain backends cannot deal with this.
1037 				// Store as a bitcast integer. See b/136037244.
1038 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1039 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
1040 				auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
1041 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
1042 				auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
1043 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1044 			}
1045 			else
1046 			{
1047 				// More exotic types require falling back to the extern:
1048 				// void __atomic_store(size_t size, void *ptr, void *val, int ordering)
1049 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1050 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
1051 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1052 				auto i8PtrTy = i8Ty->getPointerTo();
1053 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
1054 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1055 				auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
1056 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1057 				auto copy = allocateStackVariable(type);
1058 				jit->builder->CreateStore(V(value), V(copy));
1059 				jit->builder->CreateCall(func, {
1060 				                                   llvm::ConstantInt::get(sizetTy, size),
1061 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1062 				                                   jit->builder->CreatePointerCast(V(copy), i8PtrTy),
1063 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1064 				                               });
1065 			}
1066 
1067 			return value;
1068 		}
1069 	default:
1070 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1071 		return nullptr;
1072 	}
1073 }
1074 
1075 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1076 {
1077 	RR_DEBUG_INFO_UPDATE_LOC();
1078 
1079 	ASSERT(V(ptr)->getType()->isPointerTy());
1080 	ASSERT(V(mask)->getType()->isVectorTy());
1081 
1082 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1083 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1084 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1085 	auto elVecTy = llvm::VectorType::get(T(elTy), numEls, false);
1086 	auto elVecPtrTy = elVecTy->getPointerTo();
1087 	auto i8Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1088 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1089 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1090 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1091 	return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1092 }
1093 
1094 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
1095 {
1096 	RR_DEBUG_INFO_UPDATE_LOC();
1097 
1098 	ASSERT(V(ptr)->getType()->isPointerTy());
1099 	ASSERT(V(val)->getType()->isVectorTy());
1100 	ASSERT(V(mask)->getType()->isVectorTy());
1101 
1102 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1103 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1104 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1105 	auto elVecTy = V(val)->getType();
1106 	auto elVecPtrTy = elVecTy->getPointerTo();
1107 	auto i1Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1108 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1109 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
1110 	jit->builder->CreateCall(func, { V(val), V(ptr), align, i1Mask });
1111 
1112 	if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
1113 	{
1114 		// Mark memory writes as initialized by calling __msan_unpoison
1115 		// void __msan_unpoison(const volatile void *a, size_t size)
1116 		auto voidTy = llvm::Type::getVoidTy(*jit->context);
1117 		auto voidPtrTy = voidTy->getPointerTo();
1118 		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1119 		auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1120 		auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1121 		auto size = jit->module->getDataLayout().getTypeStoreSize(llvm::cast<llvm::VectorType>(elVecTy)->getElementType());
1122 
1123 		for(unsigned i = 0; i < numEls; i++)
1124 		{
1125 			// Check mask for this element
1126 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1127 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1128 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1129 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1130 			jit->builder->SetInsertPoint(thenBlock);
1131 
1132 			// Insert __msan_unpoison call in conditional block
1133 			auto elPtr = jit->builder->CreateGEP(elVecTy, V(ptr), idx);
1134 			jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(elPtr, voidPtrTy),
1135 			                                 llvm::ConstantInt::get(sizetTy, size) });
1136 
1137 			jit->builder->CreateBr(mergeBlock);
1138 			jit->builder->SetInsertPoint(mergeBlock);
1139 		}
1140 	}
1141 }
1142 
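// Vector gather: loads one element per enabled mask lane from the byte address
// base + offsets[i]; disabled lanes receive zero or an undefined value.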
1143 static llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1144 {
1145 	ASSERT(base->getType()->isPointerTy());
1146 	ASSERT(offsets->getType()->isVectorTy());
1147 	ASSERT(mask->getType()->isVectorTy());
1148 
1149 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1150 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1151 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1152 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1153 	auto i8PtrTy = i8Ty->getPointerTo();
1154 	auto elPtrTy = elTy->getPointerTo();
1155 	auto elVecTy = llvm::VectorType::get(elTy, numEls, false);
1156 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1157 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1158 	auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1159 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1160 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1161 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1162 
1163 	if(!__has_feature(memory_sanitizer))
1164 	{
1165 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1166 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
1167 		return jit->builder->CreateCall(func, { elPtrs, align, i1Mask, passthrough });
1168 	}
1169 	else  // __has_feature(memory_sanitizer)
1170 	{
1171 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_gather
1172 		// Work around it by emulating gather with element-wise loads.
1173 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1174 
1175 		Value *result = Nucleus::allocateStackVariable(T(elVecTy));
1176 		Nucleus::createStore(V(passthrough), result, T(elVecTy));
1177 
1178 		for(unsigned i = 0; i < numEls; i++)
1179 		{
1180 			// Check mask for this element
1181 			Value *elementMask = Nucleus::createExtractElement(V(i1Mask), T(i1Ty), i);
1182 
1183 			If(RValue<Bool>(elementMask))
1184 			{
1185 				Value *elPtr = Nucleus::createExtractElement(V(elPtrs), T(elPtrTy), i);
1186 				Value *el = Nucleus::createLoad(elPtr, T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1187 
1188 				Value *v = Nucleus::createLoad(result, T(elVecTy));
1189 				v = Nucleus::createInsertElement(v, el, i);
1190 				Nucleus::createStore(v, result, T(elVecTy));
1191 			}
1192 		}
1193 
1194 		return V(Nucleus::createLoad(result, T(elVecTy)));
1195 	}
1196 }
1197 
1198 RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1199 {
1200 	return As<Float4>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1201 }
1202 
1203 RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1204 {
1205 	return As<Int4>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1206 }
1207 
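// Vector scatter: stores one element per enabled mask lane to the byte address
// base + offsets[i].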
1208 static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
1209 {
1210 	ASSERT(base->getType()->isPointerTy());
1211 	ASSERT(val->getType()->isVectorTy());
1212 	ASSERT(offsets->getType()->isVectorTy());
1213 	ASSERT(mask->getType()->isVectorTy());
1214 
1215 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1216 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1217 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1218 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1219 	auto i8PtrTy = i8Ty->getPointerTo();
1220 	auto elVecTy = val->getType();
1221 	auto elTy = llvm::cast<llvm::VectorType>(elVecTy)->getElementType();
1222 	auto elPtrTy = elTy->getPointerTo();
1223 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1224 
1225 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1226 	auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1227 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1228 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1229 
1230 	if(!__has_feature(memory_sanitizer))
1231 	{
1232 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1233 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
1234 		jit->builder->CreateCall(func, { val, elPtrs, align, i1Mask });
1235 	}
1236 	else  // __has_feature(memory_sanitizer)
1237 	{
1238 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_scatter
1239 		// Work around it by emulating scatter with element-wise stores.
1240 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1241 
1242 		for(unsigned i = 0; i < numEls; i++)
1243 		{
1244 			// Check mask for this element
1245 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1246 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1247 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1248 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1249 			jit->builder->SetInsertPoint(thenBlock);
1250 
1251 			auto el = jit->builder->CreateExtractElement(val, idx);
1252 			auto elPtr = jit->builder->CreateExtractElement(elPtrs, idx);
1253 			Nucleus::createStore(V(el), V(elPtr), T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1254 
1255 			jit->builder->CreateBr(mergeBlock);
1256 			jit->builder->SetInsertPoint(mergeBlock);
1257 		}
1258 	}
1259 }
1260 
1261 void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1262 {
1263 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1264 }
1265 
1266 void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1267 {
1268 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1269 }
1270 
1271 void Nucleus::createFence(std::memory_order memoryOrder)
1272 {
1273 	RR_DEBUG_INFO_UPDATE_LOC();
1274 	jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1275 }
1276 
1277 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1278 {
1279 	RR_DEBUG_INFO_UPDATE_LOC();
1280 	ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
1281 	if(sizeof(void *) == 8)
1282 	{
1283 		// LLVM manual: "When indexing into an array, pointer or vector,
1284 		// integers of any width are allowed, and they are not required to
1285 		// be constant. These integers are treated as signed values where
1286 		// relevant."
1287 		//
1288 		// Thus if we want indexes to be treated as unsigned we have to
1289 		// zero-extend them ourselves.
1290 		//
1291 		// Note that this is not because we want to address anywhere near
1292 		// 4 GB of data. Instead this is important for performance because
1293 		// x86 supports automatic zero-extending of 32-bit registers to
1294 		// 64-bit. Thus indexing into an array using a uint32 is
1295 		// actually faster than using an int32.
1296 		index = unsignedIndex ? createZExt(index, Long::type()) : createSExt(index, Long::type());
1297 	}
1298 
1299 	// For non-emulated types we can rely on LLVM's GEP to calculate the
1300 	// effective address correctly.
1301 	if(asInternalType(type) == Type_LLVM)
1302 	{
1303 		return V(jit->builder->CreateGEP(T(type), V(ptr), V(index)));
1304 	}
1305 
1306 	// For emulated types we have to multiply the index by the intended
1307 	// type size ourselves to obtain the byte offset.
1308 	index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));
1309 
1310 	// Cast to a byte pointer, apply the byte offset, and cast back to the
1311 	// original pointer type.
1312 	return createBitCast(
1313 	    V(jit->builder->CreateGEP(T(Byte::type()), V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::type()), 0)))), V(index))),
1314 	    T(llvm::PointerType::get(T(type), 0)));
1315 }
1316 
1317 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1318 {
1319 	RR_DEBUG_INFO_UPDATE_LOC();
1320 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value),
1321 #if LLVM_VERSION_MAJOR >= 11
1322 	                                       llvm::MaybeAlign(),
1323 #endif
1324 	                                       atomicOrdering(true, memoryOrder)));
1325 }
1326 
1327 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1328 {
1329 	RR_DEBUG_INFO_UPDATE_LOC();
1330 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value),
1331 #if LLVM_VERSION_MAJOR >= 11
1332 	                                       llvm::MaybeAlign(),
1333 #endif
1334 	                                       atomicOrdering(true, memoryOrder)));
1335 }
1336 
1337 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1338 {
1339 	RR_DEBUG_INFO_UPDATE_LOC();
1340 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value),
1341 #if LLVM_VERSION_MAJOR >= 11
1342 	                                       llvm::MaybeAlign(),
1343 #endif
1344 	                                       atomicOrdering(true, memoryOrder)));
1345 }
1346 
1347 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1348 {
1349 	RR_DEBUG_INFO_UPDATE_LOC();
1350 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value),
1351 #if LLVM_VERSION_MAJOR >= 11
1352 	                                       llvm::MaybeAlign(),
1353 #endif
1354 	                                       atomicOrdering(true, memoryOrder)));
1355 }
1356 
1357 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1358 {
1359 	RR_DEBUG_INFO_UPDATE_LOC();
1360 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value),
1361 #if LLVM_VERSION_MAJOR >= 11
1362 	                                       llvm::MaybeAlign(),
1363 #endif
1364 	                                       atomicOrdering(true, memoryOrder)));
1365 }
1366 
1367 Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1368 {
1369 	RR_DEBUG_INFO_UPDATE_LOC();
1370 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value),
1371 #if LLVM_VERSION_MAJOR >= 11
1372 	                                       llvm::MaybeAlign(),
1373 #endif
1374 	                                       atomicOrdering(true, memoryOrder)));
1375 }
1376 
createAtomicMax(Value * ptr,Value * value,std::memory_order memoryOrder)1377 Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1378 {
1379 	RR_DEBUG_INFO_UPDATE_LOC();
1380 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value),
1381 #if LLVM_VERSION_MAJOR >= 11
1382 	                                       llvm::MaybeAlign(),
1383 #endif
1384 	                                       atomicOrdering(true, memoryOrder)));
1385 }
1386 
createAtomicUMin(Value * ptr,Value * value,std::memory_order memoryOrder)1387 Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1388 {
1389 	RR_DEBUG_INFO_UPDATE_LOC();
1390 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value),
1391 #if LLVM_VERSION_MAJOR >= 11
1392 	                                       llvm::MaybeAlign(),
1393 #endif
1394 	                                       atomicOrdering(true, memoryOrder)));
1395 }
1396 
createAtomicUMax(Value * ptr,Value * value,std::memory_order memoryOrder)1397 Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1398 {
1399 	RR_DEBUG_INFO_UPDATE_LOC();
1400 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value),
1401 #if LLVM_VERSION_MAJOR >= 11
1402 	                                       llvm::MaybeAlign(),
1403 #endif
1404 	                                       atomicOrdering(true, memoryOrder)));
1405 }
1406 
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1407 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1408 {
1409 	RR_DEBUG_INFO_UPDATE_LOC();
1410 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value),
1411 #if LLVM_VERSION_MAJOR >= 11
1412 	                                       llvm::MaybeAlign(),
1413 #endif
1414 	                                       atomicOrdering(true, memoryOrder)));
1415 }
1416 
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1417 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1418 {
1419 	RR_DEBUG_INFO_UPDATE_LOC();
1420 	// Note: the atomic cmpxchg instruction returns a two-member struct {original value, success flag}; element 0 holds the loaded value, which is what we return.
1421 	return V(jit->builder->CreateExtractValue(
1422 	    jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value),
1423 #if LLVM_VERSION_MAJOR >= 11
1424 	                                      llvm::MaybeAlign(),
1425 #endif
1426 	                                      atomicOrdering(true, memoryOrderEqual),
1427 	                                      atomicOrdering(true, memoryOrderUnequal)),
1428 	    llvm::ArrayRef<unsigned>(0u)));
1429 }
1430 
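// Conceptual shape of the compare-exchange lowering above (illustrative
// pseudo-code only): the instruction yields a pair and the caller receives
// just the loaded value.
//
//     { old, success } = cmpxchg(ptr, compare, value)
//     return old   // the success flag in element 1 is discarded
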
createTrunc(Value * v,Type * destType)1431 Value *Nucleus::createTrunc(Value *v, Type *destType)
1432 {
1433 	RR_DEBUG_INFO_UPDATE_LOC();
1434 	return V(jit->builder->CreateTrunc(V(v), T(destType)));
1435 }
1436 
createZExt(Value * v,Type * destType)1437 Value *Nucleus::createZExt(Value *v, Type *destType)
1438 {
1439 	RR_DEBUG_INFO_UPDATE_LOC();
1440 	return V(jit->builder->CreateZExt(V(v), T(destType)));
1441 }
1442 
createSExt(Value * v,Type * destType)1443 Value *Nucleus::createSExt(Value *v, Type *destType)
1444 {
1445 	RR_DEBUG_INFO_UPDATE_LOC();
1446 	return V(jit->builder->CreateSExt(V(v), T(destType)));
1447 }
1448 
createFPToUI(Value * v,Type * destType)1449 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1450 {
1451 	RR_DEBUG_INFO_UPDATE_LOC();
1452 	return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1453 }
1454 
createFPToSI(Value * v,Type * destType)1455 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1456 {
1457 	RR_DEBUG_INFO_UPDATE_LOC();
1458 	return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1459 }
1460 
createSIToFP(Value * v,Type * destType)1461 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1462 {
1463 	RR_DEBUG_INFO_UPDATE_LOC();
1464 	return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1465 }
1466 
createFPTrunc(Value * v,Type * destType)1467 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1468 {
1469 	RR_DEBUG_INFO_UPDATE_LOC();
1470 	return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1471 }
1472 
createFPExt(Value * v,Type * destType)1473 Value *Nucleus::createFPExt(Value *v, Type *destType)
1474 {
1475 	RR_DEBUG_INFO_UPDATE_LOC();
1476 	return V(jit->builder->CreateFPExt(V(v), T(destType)));
1477 }
1478 
createBitCast(Value * v,Type * destType)1479 Value *Nucleus::createBitCast(Value *v, Type *destType)
1480 {
1481 	RR_DEBUG_INFO_UPDATE_LOC();
1482 	// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1483 	// support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1484 	// reading back as the destination type.
1485 	if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1486 	{
1487 		Value *readAddress = allocateStackVariable(destType);
1488 		Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1489 		createStore(v, writeAddress, T(V(v)->getType()));
1490 		return createLoad(readAddress, destType);
1491 	}
1492 	else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1493 	{
1494 		Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1495 		createStore(v, writeAddress, T(V(v)->getType()));
1496 		Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1497 		return createLoad(readAddress, destType);
1498 	}
1499 
1500 	return V(jit->builder->CreateBitCast(V(v), T(destType)));
1501 }
1502 
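// Sketch of the stack round-trip above (assumed example widths): casting a
// 64-bit scalar to an emulated vector whose storage is wider than 64 bits is
// not a legal LLVM bitcast, so the value goes through a stack slot of the
// destination type, written via a source-typed pointer:
//
//     %slot   = alloca <dest type>
//     store %v, (source-typed view of %slot)
//     %result = load <dest type> from %slot
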
createICmpEQ(Value * lhs,Value * rhs)1503 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1504 {
1505 	RR_DEBUG_INFO_UPDATE_LOC();
1506 	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1507 }
1508 
createICmpNE(Value * lhs,Value * rhs)1509 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1510 {
1511 	RR_DEBUG_INFO_UPDATE_LOC();
1512 	return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1513 }
1514 
createICmpUGT(Value * lhs,Value * rhs)1515 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1516 {
1517 	RR_DEBUG_INFO_UPDATE_LOC();
1518 	return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1519 }
1520 
createICmpUGE(Value * lhs,Value * rhs)1521 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1522 {
1523 	RR_DEBUG_INFO_UPDATE_LOC();
1524 	return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1525 }
1526 
createICmpULT(Value * lhs,Value * rhs)1527 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1528 {
1529 	RR_DEBUG_INFO_UPDATE_LOC();
1530 	return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1531 }
1532 
createICmpULE(Value * lhs,Value * rhs)1533 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1534 {
1535 	RR_DEBUG_INFO_UPDATE_LOC();
1536 	return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1537 }
1538 
createICmpSGT(Value * lhs,Value * rhs)1539 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1540 {
1541 	RR_DEBUG_INFO_UPDATE_LOC();
1542 	return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1543 }
1544 
createICmpSGE(Value * lhs,Value * rhs)1545 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1546 {
1547 	RR_DEBUG_INFO_UPDATE_LOC();
1548 	return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1549 }
1550 
createICmpSLT(Value * lhs,Value * rhs)1551 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1552 {
1553 	RR_DEBUG_INFO_UPDATE_LOC();
1554 	return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1555 }
1556 
createICmpSLE(Value * lhs,Value * rhs)1557 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1558 {
1559 	RR_DEBUG_INFO_UPDATE_LOC();
1560 	return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1561 }
1562 
createFCmpOEQ(Value * lhs,Value * rhs)1563 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1564 {
1565 	RR_DEBUG_INFO_UPDATE_LOC();
1566 	return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1567 }
1568 
createFCmpOGT(Value * lhs,Value * rhs)1569 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1570 {
1571 	RR_DEBUG_INFO_UPDATE_LOC();
1572 	return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1573 }
1574 
createFCmpOGE(Value * lhs,Value * rhs)1575 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1576 {
1577 	RR_DEBUG_INFO_UPDATE_LOC();
1578 	return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1579 }
1580 
createFCmpOLT(Value * lhs,Value * rhs)1581 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1582 {
1583 	RR_DEBUG_INFO_UPDATE_LOC();
1584 	return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1585 }
1586 
createFCmpOLE(Value * lhs,Value * rhs)1587 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1588 {
1589 	RR_DEBUG_INFO_UPDATE_LOC();
1590 	return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1591 }
1592 
createFCmpONE(Value * lhs,Value * rhs)1593 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1594 {
1595 	RR_DEBUG_INFO_UPDATE_LOC();
1596 	return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1597 }
1598 
createFCmpORD(Value * lhs,Value * rhs)1599 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1600 {
1601 	RR_DEBUG_INFO_UPDATE_LOC();
1602 	return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1603 }
1604 
createFCmpUNO(Value * lhs,Value * rhs)1605 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1606 {
1607 	RR_DEBUG_INFO_UPDATE_LOC();
1608 	return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1609 }
1610 
createFCmpUEQ(Value * lhs,Value * rhs)1611 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1612 {
1613 	RR_DEBUG_INFO_UPDATE_LOC();
1614 	return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1615 }
1616 
createFCmpUGT(Value * lhs,Value * rhs)1617 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1618 {
1619 	RR_DEBUG_INFO_UPDATE_LOC();
1620 	return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1621 }
1622 
createFCmpUGE(Value * lhs,Value * rhs)1623 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1624 {
1625 	RR_DEBUG_INFO_UPDATE_LOC();
1626 	return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1627 }
1628 
createFCmpULT(Value * lhs,Value * rhs)1629 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1630 {
1631 	RR_DEBUG_INFO_UPDATE_LOC();
1632 	return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1633 }
1634 
createFCmpULE(Value * lhs,Value * rhs)1635 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1636 {
1637 	RR_DEBUG_INFO_UPDATE_LOC();
1638 	return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1639 }
1640 
createFCmpUNE(Value * lhs,Value * rhs)1641 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1642 {
1643 	RR_DEBUG_INFO_UPDATE_LOC();
1644 	return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1645 }
1646 
createExtractElement(Value * vector,Type * type,int index)1647 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1648 {
1649 	RR_DEBUG_INFO_UPDATE_LOC();
1650 	ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
1651 	return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1652 }
1653 
createInsertElement(Value * vector,Value * element,int index)1654 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1655 {
1656 	RR_DEBUG_INFO_UPDATE_LOC();
1657 	return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1658 }
1659 
createShuffleVector(Value * v1,Value * v2,const int * select)1660 Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
1661 {
1662 	RR_DEBUG_INFO_UPDATE_LOC();
1663 
1664 	int size = llvm::cast<llvm::FixedVectorType>(V(v1)->getType())->getNumElements();
1665 	const int maxSize = 16;
1666 	llvm::Constant *swizzle[maxSize];
1667 	ASSERT(size <= maxSize);
1668 
1669 	for(int i = 0; i < size; i++)
1670 	{
1671 		swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), select[i]);
1672 	}
1673 
1674 	llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
1675 
1676 	return V(jit->builder->CreateShuffleVector(V(v1), V(v2), shuffle));
1677 }
1678 
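// The select indices follow LLVM shufflevector semantics: an index i < size
// selects lane i of v1 and an index i >= size selects lane (i - size) of v2.
// For example (illustrative), select = { 0, 4, 1, 5 } on two 4-lane vectors
// interleaves their low halves.
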
createSelect(Value * c,Value * ifTrue,Value * ifFalse)1679 Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1680 {
1681 	RR_DEBUG_INFO_UPDATE_LOC();
1682 	return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1683 }
1684 
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1685 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1686 {
1687 	RR_DEBUG_INFO_UPDATE_LOC();
1688 	return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1689 }
1690 
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1691 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1692 {
1693 	RR_DEBUG_INFO_UPDATE_LOC();
1694 	llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1695 	sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), label, true), B(branch));
1696 }
1697 
createUnreachable()1698 void Nucleus::createUnreachable()
1699 {
1700 	RR_DEBUG_INFO_UPDATE_LOC();
1701 	jit->builder->CreateUnreachable();
1702 }
1703 
getType(Value * value)1704 Type *Nucleus::getType(Value *value)
1705 {
1706 	return T(V(value)->getType());
1707 }
1708 
getContainedType(Type * vectorType)1709 Type *Nucleus::getContainedType(Type *vectorType)
1710 {
1711 	return T(T(vectorType)->getContainedType(0));
1712 }
1713 
getPointerType(Type * ElementType)1714 Type *Nucleus::getPointerType(Type *ElementType)
1715 {
1716 	return T(llvm::PointerType::get(T(ElementType), 0));
1717 }
1718 
getNaturalIntType()1719 static llvm::Type *getNaturalIntType()
1720 {
1721 	return llvm::Type::getIntNTy(*jit->context, sizeof(int) * 8);
1722 }
1723 
getPrintfStorageType(Type * valueType)1724 Type *Nucleus::getPrintfStorageType(Type *valueType)
1725 {
1726 	llvm::Type *valueTy = T(valueType);
1727 	if(valueTy->isIntegerTy())
1728 	{
1729 		return T(getNaturalIntType());
1730 	}
1731 	if(valueTy->isFloatTy())
1732 	{
1733 		return T(llvm::Type::getDoubleTy(*jit->context));
1734 	}
1735 
1736 	UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
1737 	return {};
1738 }
1739 
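// These storage widths appear to mirror C's default argument promotions for
// variadic calls: integers widen to the natural int width and floats promote
// to double, which is what printf-style formatting expects.
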
createNullValue(Type * Ty)1740 Value *Nucleus::createNullValue(Type *Ty)
1741 {
1742 	RR_DEBUG_INFO_UPDATE_LOC();
1743 	return V(llvm::Constant::getNullValue(T(Ty)));
1744 }
1745 
createConstantLong(int64_t i)1746 Value *Nucleus::createConstantLong(int64_t i)
1747 {
1748 	RR_DEBUG_INFO_UPDATE_LOC();
1749 	return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), i, true));
1750 }
1751 
createConstantInt(int i)1752 Value *Nucleus::createConstantInt(int i)
1753 {
1754 	RR_DEBUG_INFO_UPDATE_LOC();
1755 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, true));
1756 }
1757 
createConstantInt(unsigned int i)1758 Value *Nucleus::createConstantInt(unsigned int i)
1759 {
1760 	RR_DEBUG_INFO_UPDATE_LOC();
1761 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, false));
1762 }
1763 
createConstantBool(bool b)1764 Value *Nucleus::createConstantBool(bool b)
1765 {
1766 	RR_DEBUG_INFO_UPDATE_LOC();
1767 	return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*jit->context), b));
1768 }
1769 
createConstantByte(signed char i)1770 Value *Nucleus::createConstantByte(signed char i)
1771 {
1772 	RR_DEBUG_INFO_UPDATE_LOC();
1773 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, true));
1774 }
1775 
createConstantByte(unsigned char i)1776 Value *Nucleus::createConstantByte(unsigned char i)
1777 {
1778 	RR_DEBUG_INFO_UPDATE_LOC();
1779 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, false));
1780 }
1781 
createConstantShort(short i)1782 Value *Nucleus::createConstantShort(short i)
1783 {
1784 	RR_DEBUG_INFO_UPDATE_LOC();
1785 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, true));
1786 }
1787 
createConstantShort(unsigned short i)1788 Value *Nucleus::createConstantShort(unsigned short i)
1789 {
1790 	RR_DEBUG_INFO_UPDATE_LOC();
1791 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, false));
1792 }
1793 
createConstantFloat(float x)1794 Value *Nucleus::createConstantFloat(float x)
1795 {
1796 	RR_DEBUG_INFO_UPDATE_LOC();
1797 	return V(llvm::ConstantFP::get(T(Float::type()), x));
1798 }
1799 
createNullPointer(Type * Ty)1800 Value *Nucleus::createNullPointer(Type *Ty)
1801 {
1802 	RR_DEBUG_INFO_UPDATE_LOC();
1803 	return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1804 }
1805 
createConstantVector(const int64_t * constants,Type * type)1806 Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
1807 {
1808 	RR_DEBUG_INFO_UPDATE_LOC();
1809 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1810 	const int numConstants = elementCount(type);                                           // Number of provided constants for the (emulated) type.
1811 	const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1812 	ASSERT(numElements <= 16 && numConstants <= numElements);
1813 	llvm::Constant *constantVector[16];
1814 
1815 	for(int i = 0; i < numElements; i++)
1816 	{
1817 		constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
1818 	}
1819 
1820 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1821 }
1822 
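// Example of the replication above (illustrative): if an emulated 4-element
// type is backed by an 8-element vector, four provided constants
// { a, b, c, d } expand to { a, b, c, d, a, b, c, d } via the
// i % numConstants indexing.
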
createConstantVector(const double * constants,Type * type)1823 Value *Nucleus::createConstantVector(const double *constants, Type *type)
1824 {
1825 	RR_DEBUG_INFO_UPDATE_LOC();
1826 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1827 	const int numConstants = elementCount(type);                                           // Number of provided constants for the (emulated) type.
1828 	const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1829 	ASSERT(numElements <= 8 && numConstants <= numElements);
1830 	llvm::Constant *constantVector[8];
1831 
1832 	for(int i = 0; i < numElements; i++)
1833 	{
1834 		constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
1835 	}
1836 
1837 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1838 }
1839 
createConstantString(const char * v)1840 Value *Nucleus::createConstantString(const char *v)
1841 {
1842 	// NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
1843 	auto ptr = jit->builder->CreateGlobalStringPtr(v);
1844 	return V(ptr);
1845 }
1846 
setOptimizerCallback(OptimizerCallback * callback)1847 void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
1848 {
1849 	// The LLVM backend does not produce optimizer reports.
1850 	(void)callback;
1851 }
1852 
type()1853 Type *Void::type()
1854 {
1855 	return T(llvm::Type::getVoidTy(*jit->context));
1856 }
1857 
type()1858 Type *Bool::type()
1859 {
1860 	return T(llvm::Type::getInt1Ty(*jit->context));
1861 }
1862 
type()1863 Type *Byte::type()
1864 {
1865 	return T(llvm::Type::getInt8Ty(*jit->context));
1866 }
1867 
type()1868 Type *SByte::type()
1869 {
1870 	return T(llvm::Type::getInt8Ty(*jit->context));
1871 }
1872 
type()1873 Type *Short::type()
1874 {
1875 	return T(llvm::Type::getInt16Ty(*jit->context));
1876 }
1877 
type()1878 Type *UShort::type()
1879 {
1880 	return T(llvm::Type::getInt16Ty(*jit->context));
1881 }
1882 
type()1883 Type *Byte4::type()
1884 {
1885 	return T(Type_v4i8);
1886 }
1887 
type()1888 Type *SByte4::type()
1889 {
1890 	return T(Type_v4i8);
1891 }
1892 
AddSat(RValue<Byte8> x,RValue<Byte8> y)1893 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1894 {
1895 	RR_DEBUG_INFO_UPDATE_LOC();
1896 #if defined(__i386__) || defined(__x86_64__)
1897 	return x86::paddusb(x, y);
1898 #else
1899 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
1900 #endif
1901 }
1902 
SubSat(RValue<Byte8> x,RValue<Byte8> y)1903 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1904 {
1905 	RR_DEBUG_INFO_UPDATE_LOC();
1906 #if defined(__i386__) || defined(__x86_64__)
1907 	return x86::psubusb(x, y);
1908 #else
1909 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
1910 #endif
1911 }
1912 
SignMask(RValue<Byte8> x)1913 RValue<Int> SignMask(RValue<Byte8> x)
1914 {
1915 	RR_DEBUG_INFO_UPDATE_LOC();
1916 #if defined(__i386__) || defined(__x86_64__)
1917 	return x86::pmovmskb(x);
1918 #else
1919 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1920 #endif
1921 }
1922 
1923 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1924 //	{
1925 //#if defined(__i386__) || defined(__x86_64__)
1926 //		return x86::pcmpgtb(x, y);   // FIXME: Signedness
1927 //#else
1928 //		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1929 //#endif
1930 //	}
1931 
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)1932 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1933 {
1934 	RR_DEBUG_INFO_UPDATE_LOC();
1935 #if defined(__i386__) || defined(__x86_64__)
1936 	return x86::pcmpeqb(x, y);
1937 #else
1938 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1939 #endif
1940 }
1941 
type()1942 Type *Byte8::type()
1943 {
1944 	return T(Type_v8i8);
1945 }
1946 
AddSat(RValue<SByte8> x,RValue<SByte8> y)1947 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1948 {
1949 	RR_DEBUG_INFO_UPDATE_LOC();
1950 #if defined(__i386__) || defined(__x86_64__)
1951 	return x86::paddsb(x, y);
1952 #else
1953 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
1954 #endif
1955 }
1956 
SubSat(RValue<SByte8> x,RValue<SByte8> y)1957 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1958 {
1959 	RR_DEBUG_INFO_UPDATE_LOC();
1960 #if defined(__i386__) || defined(__x86_64__)
1961 	return x86::psubsb(x, y);
1962 #else
1963 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
1964 #endif
1965 }
1966 
SignMask(RValue<SByte8> x)1967 RValue<Int> SignMask(RValue<SByte8> x)
1968 {
1969 	RR_DEBUG_INFO_UPDATE_LOC();
1970 #if defined(__i386__) || defined(__x86_64__)
1971 	return x86::pmovmskb(As<Byte8>(x));
1972 #else
1973 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1974 #endif
1975 }
1976 
CmpGT(RValue<SByte8> x,RValue<SByte8> y)1977 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1978 {
1979 	RR_DEBUG_INFO_UPDATE_LOC();
1980 #if defined(__i386__) || defined(__x86_64__)
1981 	return x86::pcmpgtb(x, y);
1982 #else
1983 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1984 #endif
1985 }
1986 
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)1987 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1988 {
1989 	RR_DEBUG_INFO_UPDATE_LOC();
1990 #if defined(__i386__) || defined(__x86_64__)
1991 	return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1992 #else
1993 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1994 #endif
1995 }
1996 
type()1997 Type *SByte8::type()
1998 {
1999 	return T(Type_v8i8);
2000 }
2001 
type()2002 Type *Byte16::type()
2003 {
2004 	return T(llvm::VectorType::get(T(Byte::type()), 16, false));
2005 }
2006 
type()2007 Type *SByte16::type()
2008 {
2009 	return T(llvm::VectorType::get(T(SByte::type()), 16, false));
2010 }
2011 
type()2012 Type *Short2::type()
2013 {
2014 	return T(Type_v2i16);
2015 }
2016 
type()2017 Type *UShort2::type()
2018 {
2019 	return T(Type_v2i16);
2020 }
2021 
Short4(RValue<Int4> cast)2022 Short4::Short4(RValue<Int4> cast)
2023 {
2024 	RR_DEBUG_INFO_UPDATE_LOC();
2025 	int select[8] = { 0, 2, 4, 6, 0, 2, 4, 6 };
2026 	Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
2027 
2028 	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
2029 	Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value();
2030 
2031 	storeValue(short4);
2032 }
2033 
2034 //	Short4::Short4(RValue<Float> cast)
2035 //	{
2036 //	}
2037 
Short4(RValue<Float4> cast)2038 Short4::Short4(RValue<Float4> cast)
2039 {
2040 	RR_DEBUG_INFO_UPDATE_LOC();
2041 	Int4 v4i32 = Int4(cast);
2042 #if defined(__i386__) || defined(__x86_64__)
2043 	v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2044 #else
2045 	Value *v = v4i32.loadValue();
2046 	v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
2047 #endif
2048 
2049 	storeValue(As<Short4>(Int2(v4i32)).value());
2050 }
2051 
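// packssdw (and, presumably, the lowerPack fallback) saturates each signed
// 32-bit lane to the signed 16-bit range, so the Float4 -> Short4 conversion
// above clamps out-of-range values instead of wrapping: e.g. (illustrative)
// 40000.0f becomes 32767 and -40000.0f becomes -32768.
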
operator <<(RValue<Short4> lhs,unsigned char rhs)2052 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2053 {
2054 	RR_DEBUG_INFO_UPDATE_LOC();
2055 #if defined(__i386__) || defined(__x86_64__)
2056 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2057 
2058 	return x86::psllw(lhs, rhs);
2059 #else
2060 	return As<Short4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2061 #endif
2062 }
2063 
operator >>(RValue<Short4> lhs,unsigned char rhs)2064 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2065 {
2066 	RR_DEBUG_INFO_UPDATE_LOC();
2067 #if defined(__i386__) || defined(__x86_64__)
2068 	return x86::psraw(lhs, rhs);
2069 #else
2070 	return As<Short4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2071 #endif
2072 }
2073 
Max(RValue<Short4> x,RValue<Short4> y)2074 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2075 {
2076 	RR_DEBUG_INFO_UPDATE_LOC();
2077 #if defined(__i386__) || defined(__x86_64__)
2078 	return x86::pmaxsw(x, y);
2079 #else
2080 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
2081 #endif
2082 }
2083 
Min(RValue<Short4> x,RValue<Short4> y)2084 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2085 {
2086 	RR_DEBUG_INFO_UPDATE_LOC();
2087 #if defined(__i386__) || defined(__x86_64__)
2088 	return x86::pminsw(x, y);
2089 #else
2090 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
2091 #endif
2092 }
2093 
AddSat(RValue<Short4> x,RValue<Short4> y)2094 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2095 {
2096 	RR_DEBUG_INFO_UPDATE_LOC();
2097 #if defined(__i386__) || defined(__x86_64__)
2098 	return x86::paddsw(x, y);
2099 #else
2100 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
2101 #endif
2102 }
2103 
SubSat(RValue<Short4> x,RValue<Short4> y)2104 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2105 {
2106 	RR_DEBUG_INFO_UPDATE_LOC();
2107 #if defined(__i386__) || defined(__x86_64__)
2108 	return x86::psubsw(x, y);
2109 #else
2110 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
2111 #endif
2112 }
2113 
MulHigh(RValue<Short4> x,RValue<Short4> y)2114 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2115 {
2116 	RR_DEBUG_INFO_UPDATE_LOC();
2117 #if defined(__i386__) || defined(__x86_64__)
2118 	return x86::pmulhw(x, y);
2119 #else
2120 	return As<Short4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2121 #endif
2122 }
2123 
MulAdd(RValue<Short4> x,RValue<Short4> y)2124 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2125 {
2126 	RR_DEBUG_INFO_UPDATE_LOC();
2127 #if defined(__i386__) || defined(__x86_64__)
2128 	return x86::pmaddwd(x, y);
2129 #else
2130 	return As<Int2>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2131 #endif
2132 }
2133 
PackSigned(RValue<Short4> x,RValue<Short4> y)2134 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2135 {
2136 	RR_DEBUG_INFO_UPDATE_LOC();
2137 #if defined(__i386__) || defined(__x86_64__)
2138 	auto result = x86::packsswb(x, y);
2139 #else
2140 	auto result = V(lowerPack(V(x.value()), V(y.value()), true));
2141 #endif
2142 	return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
2143 }
2144 
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2145 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2146 {
2147 	RR_DEBUG_INFO_UPDATE_LOC();
2148 #if defined(__i386__) || defined(__x86_64__)
2149 	auto result = x86::packuswb(x, y);
2150 #else
2151 	auto result = V(lowerPack(V(x.value()), V(y.value()), false));
2152 #endif
2153 	return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
2154 }
2155 
CmpGT(RValue<Short4> x,RValue<Short4> y)2156 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2157 {
2158 	RR_DEBUG_INFO_UPDATE_LOC();
2159 #if defined(__i386__) || defined(__x86_64__)
2160 	return x86::pcmpgtw(x, y);
2161 #else
2162 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
2163 #endif
2164 }
2165 
CmpEQ(RValue<Short4> x,RValue<Short4> y)2166 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2167 {
2168 	RR_DEBUG_INFO_UPDATE_LOC();
2169 #if defined(__i386__) || defined(__x86_64__)
2170 	return x86::pcmpeqw(x, y);
2171 #else
2172 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
2173 #endif
2174 }
2175 
type()2176 Type *Short4::type()
2177 {
2178 	return T(Type_v4i16);
2179 }
2180 
UShort4(RValue<Float4> cast,bool saturate)2181 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2182 {
2183 	RR_DEBUG_INFO_UPDATE_LOC();
2184 	if(saturate)
2185 	{
2186 #if defined(__i386__) || defined(__x86_64__)
2187 		if(CPUID::supportsSSE4_1())
2188 		{
2189 			Int4 int4(Min(cast, Float4(0xFFFF)));  // packusdw takes care of 0x0000 saturation
2190 			*this = As<Short4>(PackUnsigned(int4, int4));
2191 		}
2192 		else
2193 #endif
2194 		{
2195 			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2196 		}
2197 	}
2198 	else
2199 	{
2200 		*this = Short4(Int4(cast));
2201 	}
2202 }
2203 
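// Illustrative behaviour of the saturating path above: 70000.0f clamps to
// 65535 and any negative input clamps to 0, so every lane fits the unsigned
// 16-bit range; without 'saturate' the value goes through the plain
// Int4 -> Short4 truncation and can wrap.
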
operator <<(RValue<UShort4> lhs,unsigned char rhs)2204 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2205 {
2206 	RR_DEBUG_INFO_UPDATE_LOC();
2207 #if defined(__i386__) || defined(__x86_64__)
2208 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2209 
2210 	return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
2211 #else
2212 	return As<UShort4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2213 #endif
2214 }
2215 
operator >>(RValue<UShort4> lhs,unsigned char rhs)2216 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2217 {
2218 	RR_DEBUG_INFO_UPDATE_LOC();
2219 #if defined(__i386__) || defined(__x86_64__)
2220 	//	return RValue<Short4>(Nucleus::createLShr(lhs.value(), rhs.value()));
2221 
2222 	return x86::psrlw(lhs, rhs);
2223 #else
2224 	return As<UShort4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2225 #endif
2226 }
2227 
Max(RValue<UShort4> x,RValue<UShort4> y)2228 RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2229 {
2230 	RR_DEBUG_INFO_UPDATE_LOC();
2231 	return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2232 }
2233 
Min(RValue<UShort4> x,RValue<UShort4> y)2234 RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2235 {
2236 	RR_DEBUG_INFO_UPDATE_LOC();
2237 	return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2238 }
2239 
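// The unsigned Max/Min above reuse the signed Short4 Max/Min by biasing both
// operands with 0x8000 and un-biasing the result: e.g. (illustrative) 0xFFFF
// and 0x0001 become 0x7FFF and 0x8001 as signed values, which preserves their
// unsigned ordering under signed comparison.
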
AddSat(RValue<UShort4> x,RValue<UShort4> y)2240 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2241 {
2242 	RR_DEBUG_INFO_UPDATE_LOC();
2243 #if defined(__i386__) || defined(__x86_64__)
2244 	return x86::paddusw(x, y);
2245 #else
2246 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
2247 #endif
2248 }
2249 
SubSat(RValue<UShort4> x,RValue<UShort4> y)2250 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2251 {
2252 	RR_DEBUG_INFO_UPDATE_LOC();
2253 #if defined(__i386__) || defined(__x86_64__)
2254 	return x86::psubusw(x, y);
2255 #else
2256 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
2257 #endif
2258 }
2259 
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2260 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2261 {
2262 	RR_DEBUG_INFO_UPDATE_LOC();
2263 #if defined(__i386__) || defined(__x86_64__)
2264 	return x86::pmulhuw(x, y);
2265 #else
2266 	return As<UShort4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2267 #endif
2268 }
2269 
Average(RValue<UShort4> x,RValue<UShort4> y)2270 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2271 {
2272 	RR_DEBUG_INFO_UPDATE_LOC();
2273 #if defined(__i386__) || defined(__x86_64__)
2274 	return x86::pavgw(x, y);
2275 #else
2276 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
2277 #endif
2278 }
2279 
type()2280 Type *UShort4::type()
2281 {
2282 	return T(Type_v4i16);
2283 }
2284 
operator <<(RValue<Short8> lhs,unsigned char rhs)2285 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2286 {
2287 	RR_DEBUG_INFO_UPDATE_LOC();
2288 #if defined(__i386__) || defined(__x86_64__)
2289 	return x86::psllw(lhs, rhs);
2290 #else
2291 	return As<Short8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2292 #endif
2293 }
2294 
operator >>(RValue<Short8> lhs,unsigned char rhs)2295 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2296 {
2297 	RR_DEBUG_INFO_UPDATE_LOC();
2298 #if defined(__i386__) || defined(__x86_64__)
2299 	return x86::psraw(lhs, rhs);
2300 #else
2301 	return As<Short8>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2302 #endif
2303 }
2304 
MulAdd(RValue<Short8> x,RValue<Short8> y)2305 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2306 {
2307 	RR_DEBUG_INFO_UPDATE_LOC();
2308 #if defined(__i386__) || defined(__x86_64__)
2309 	return x86::pmaddwd(x, y);
2310 #else
2311 	return As<Int4>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2312 #endif
2313 }
2314 
MulHigh(RValue<Short8> x,RValue<Short8> y)2315 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2316 {
2317 	RR_DEBUG_INFO_UPDATE_LOC();
2318 #if defined(__i386__) || defined(__x86_64__)
2319 	return x86::pmulhw(x, y);
2320 #else
2321 	return As<Short8>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2322 #endif
2323 }
2324 
type()2325 Type *Short8::type()
2326 {
2327 	return T(llvm::VectorType::get(T(Short::type()), 8, false));
2328 }
2329 
operator <<(RValue<UShort8> lhs,unsigned char rhs)2330 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2331 {
2332 	RR_DEBUG_INFO_UPDATE_LOC();
2333 #if defined(__i386__) || defined(__x86_64__)
2334 	return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2335 #else
2336 	return As<UShort8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2337 #endif
2338 }
2339 
operator >>(RValue<UShort8> lhs,unsigned char rhs)2340 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2341 {
2342 	RR_DEBUG_INFO_UPDATE_LOC();
2343 #if defined(__i386__) || defined(__x86_64__)
2344 	return x86::psrlw(lhs, rhs);  // FIXME: Fallback required
2345 #else
2346 	return As<UShort8>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2347 #endif
2348 }
2349 
MulHigh(RValue<UShort8> x,RValue<UShort8> y)2350 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2351 {
2352 	RR_DEBUG_INFO_UPDATE_LOC();
2353 #if defined(__i386__) || defined(__x86_64__)
2354 	return x86::pmulhuw(x, y);
2355 #else
2356 	return As<UShort8>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2357 #endif
2358 }
2359 
type()2360 Type *UShort8::type()
2361 {
2362 	return T(llvm::VectorType::get(T(UShort::type()), 8, false));
2363 }
2364 
operator ++(Int & val,int)2365 RValue<Int> operator++(Int &val, int)  // Post-increment
2366 {
2367 	RR_DEBUG_INFO_UPDATE_LOC();
2368 	RValue<Int> res = val;
2369 
2370 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2371 	val.storeValue(inc);
2372 
2373 	return res;
2374 }
2375 
operator ++(Int & val)2376 const Int &operator++(Int &val)  // Pre-increment
2377 {
2378 	RR_DEBUG_INFO_UPDATE_LOC();
2379 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2380 	val.storeValue(inc);
2381 
2382 	return val;
2383 }
2384 
operator --(Int & val,int)2385 RValue<Int> operator--(Int &val, int)  // Post-decrement
2386 {
2387 	RR_DEBUG_INFO_UPDATE_LOC();
2388 	RValue<Int> res = val;
2389 
2390 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2391 	val.storeValue(inc);
2392 
2393 	return res;
2394 }
2395 
operator --(Int & val)2396 const Int &operator--(Int &val)  // Pre-decrement
2397 {
2398 	RR_DEBUG_INFO_UPDATE_LOC();
2399 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2400 	val.storeValue(inc);
2401 
2402 	return val;
2403 }
2404 
RoundInt(RValue<Float> cast)2405 RValue<Int> RoundInt(RValue<Float> cast)
2406 {
2407 	RR_DEBUG_INFO_UPDATE_LOC();
2408 #if defined(__i386__) || defined(__x86_64__)
2409 	return x86::cvtss2si(cast);
2410 #else
2411 	return RValue<Int>(V(lowerRoundInt(V(cast.value()), T(Int::type()))));
2412 #endif
2413 }
2414 
type()2415 Type *Int::type()
2416 {
2417 	return T(llvm::Type::getInt32Ty(*jit->context));
2418 }
2419 
type()2420 Type *Long::type()
2421 {
2422 	return T(llvm::Type::getInt64Ty(*jit->context));
2423 }
2424 
UInt(RValue<Float> cast)2425 UInt::UInt(RValue<Float> cast)
2426 {
2427 	RR_DEBUG_INFO_UPDATE_LOC();
2428 	Value *integer = Nucleus::createFPToUI(cast.value(), UInt::type());
2429 	storeValue(integer);
2430 }
2431 
operator ++(UInt & val,int)2432 RValue<UInt> operator++(UInt &val, int)  // Post-increment
2433 {
2434 	RR_DEBUG_INFO_UPDATE_LOC();
2435 	RValue<UInt> res = val;
2436 
2437 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2438 	val.storeValue(inc);
2439 
2440 	return res;
2441 }
2442 
operator ++(UInt & val)2443 const UInt &operator++(UInt &val)  // Pre-increment
2444 {
2445 	RR_DEBUG_INFO_UPDATE_LOC();
2446 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2447 	val.storeValue(inc);
2448 
2449 	return val;
2450 }
2451 
operator --(UInt & val,int)2452 RValue<UInt> operator--(UInt &val, int)  // Post-decrement
2453 {
2454 	RR_DEBUG_INFO_UPDATE_LOC();
2455 	RValue<UInt> res = val;
2456 
2457 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2458 	val.storeValue(inc);
2459 
2460 	return res;
2461 }
2462 
operator --(UInt & val)2463 const UInt &operator--(UInt &val)  // Pre-decrement
2464 {
2465 	RR_DEBUG_INFO_UPDATE_LOC();
2466 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2467 	val.storeValue(inc);
2468 
2469 	return val;
2470 }
2471 
2472 //	RValue<UInt> RoundUInt(RValue<Float> cast)
2473 //	{
2474 //#if defined(__i386__) || defined(__x86_64__)
2475 //		return x86::cvtss2si(val);   // FIXME: Unsigned
2476 //#else
2477 //		return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2478 //#endif
2479 //	}
2480 
type()2481 Type *UInt::type()
2482 {
2483 	return T(llvm::Type::getInt32Ty(*jit->context));
2484 }
2485 
2486 //	Int2::Int2(RValue<Int> cast)
2487 //	{
2488 //		Value *extend = Nucleus::createZExt(cast.value(), Long::type());
2489 //		Value *vector = Nucleus::createBitCast(extend, Int2::type());
2490 //
2491 //		int shuffle[2] = {0, 0};
2492 //		Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2493 //
2494 //		storeValue(replicate);
2495 //	}
2496 
operator <<(RValue<Int2> lhs,unsigned char rhs)2497 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2498 {
2499 	RR_DEBUG_INFO_UPDATE_LOC();
2500 #if defined(__i386__) || defined(__x86_64__)
2501 	//	return RValue<Int2>(Nucleus::createShl(lhs.value(), rhs.value()));
2502 
2503 	return x86::pslld(lhs, rhs);
2504 #else
2505 	return As<Int2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2506 #endif
2507 }
2508 
operator >>(RValue<Int2> lhs,unsigned char rhs)2509 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2510 {
2511 	RR_DEBUG_INFO_UPDATE_LOC();
2512 #if defined(__i386__) || defined(__x86_64__)
2513 	//	return RValue<Int2>(Nucleus::createAShr(lhs.value(), rhs.value()));
2514 
2515 	return x86::psrad(lhs, rhs);
2516 #else
2517 	return As<Int2>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2518 #endif
2519 }
2520 
type()2521 Type *Int2::type()
2522 {
2523 	return T(Type_v2i32);
2524 }
2525 
operator <<(RValue<UInt2> lhs,unsigned char rhs)2526 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2527 {
2528 	RR_DEBUG_INFO_UPDATE_LOC();
2529 #if defined(__i386__) || defined(__x86_64__)
2530 	//	return RValue<UInt2>(Nucleus::createShl(lhs.value(), rhs.value()));
2531 
2532 	return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2533 #else
2534 	return As<UInt2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2535 #endif
2536 }
2537 
operator >>(RValue<UInt2> lhs,unsigned char rhs)2538 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2539 {
2540 	RR_DEBUG_INFO_UPDATE_LOC();
2541 #if defined(__i386__) || defined(__x86_64__)
2542 	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value(), rhs.value()));
2543 
2544 	return x86::psrld(lhs, rhs);
2545 #else
2546 	return As<UInt2>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2547 #endif
2548 }
2549 
type()2550 Type *UInt2::type()
2551 {
2552 	return T(Type_v2i32);
2553 }
2554 
Int4(RValue<Byte4> cast)2555 Int4::Int4(RValue<Byte4> cast)
2556     : XYZW(this)
2557 {
2558 	RR_DEBUG_INFO_UPDATE_LOC();
2559 #if defined(__i386__) || defined(__x86_64__)
2560 	if(CPUID::supportsSSE4_1())
2561 	{
2562 		*this = x86::pmovzxbd(As<Byte16>(cast));
2563 	}
2564 	else
2565 #endif
2566 	{
2567 		int swizzle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
2568 		Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2569 		Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::type()), swizzle);
2570 
2571 		int swizzle2[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2572 		Value *c = Nucleus::createBitCast(b, Short8::type());
2573 		Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::type()), swizzle2);
2574 
2575 		*this = As<Int4>(d);
2576 	}
2577 }
2578 
Int4(RValue<SByte4> cast)2579 Int4::Int4(RValue<SByte4> cast)
2580     : XYZW(this)
2581 {
2582 	RR_DEBUG_INFO_UPDATE_LOC();
2583 #if defined(__i386__) || defined(__x86_64__)
2584 	if(CPUID::supportsSSE4_1())
2585 	{
2586 		*this = x86::pmovsxbd(As<SByte16>(cast));
2587 	}
2588 	else
2589 #endif
2590 	{
2591 		int swizzle[16] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
2592 		Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2593 		Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2594 
2595 		int swizzle2[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2596 		Value *c = Nucleus::createBitCast(b, Short8::type());
2597 		Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2598 
2599 		*this = As<Int4>(d) >> 24;
2600 	}
2601 }
2602 
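// The non-SSE4.1 fallback above widens with sign: each byte is duplicated into
// a 16-bit lane, each 16-bit pair into a 32-bit lane, and the arithmetic shift
// right by 24 then fills the upper bits with the original sign bit, e.g.
// (illustrative) 0x90 -> 0x90909090 -> 0xFFFFFF90.
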
Int4(RValue<Short4> cast)2603 Int4::Int4(RValue<Short4> cast)
2604     : XYZW(this)
2605 {
2606 	RR_DEBUG_INFO_UPDATE_LOC();
2607 #if defined(__i386__) || defined(__x86_64__)
2608 	if(CPUID::supportsSSE4_1())
2609 	{
2610 		*this = x86::pmovsxwd(As<Short8>(cast));
2611 	}
2612 	else
2613 #endif
2614 	{
2615 		int swizzle[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2616 		Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
2617 		*this = As<Int4>(c) >> 16;
2618 	}
2619 }
2620 
Int4(RValue<UShort4> cast)2621 Int4::Int4(RValue<UShort4> cast)
2622     : XYZW(this)
2623 {
2624 	RR_DEBUG_INFO_UPDATE_LOC();
2625 #if defined(__i386__) || defined(__x86_64__)
2626 	if(CPUID::supportsSSE4_1())
2627 	{
2628 		*this = x86::pmovzxwd(As<UShort8>(cast));
2629 	}
2630 	else
2631 #endif
2632 	{
2633 		int swizzle[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2634 		Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2635 		*this = As<Int4>(c);
2636 	}
2637 }
2638 
Int4(RValue<Int> rhs)2639 Int4::Int4(RValue<Int> rhs)
2640     : XYZW(this)
2641 {
2642 	RR_DEBUG_INFO_UPDATE_LOC();
2643 	Value *vector = loadValue();
2644 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2645 
2646 	int swizzle[4] = { 0, 0, 0, 0 };
2647 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2648 
2649 	storeValue(replicate);
2650 }
2651 
operator <<(RValue<Int4> lhs,unsigned char rhs)2652 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2653 {
2654 	RR_DEBUG_INFO_UPDATE_LOC();
2655 #if defined(__i386__) || defined(__x86_64__)
2656 	return x86::pslld(lhs, rhs);
2657 #else
2658 	return As<Int4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2659 #endif
2660 }
2661 
operator >>(RValue<Int4> lhs,unsigned char rhs)2662 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2663 {
2664 	RR_DEBUG_INFO_UPDATE_LOC();
2665 #if defined(__i386__) || defined(__x86_64__)
2666 	return x86::psrad(lhs, rhs);
2667 #else
2668 	return As<Int4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2669 #endif
2670 }
2671 
CmpEQ(RValue<Int4> x,RValue<Int4> y)2672 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2673 {
2674 	RR_DEBUG_INFO_UPDATE_LOC();
2675 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2676 }
2677 
CmpLT(RValue<Int4> x,RValue<Int4> y)2678 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2679 {
2680 	RR_DEBUG_INFO_UPDATE_LOC();
2681 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), Int4::type()));
2682 }
2683 
CmpLE(RValue<Int4> x,RValue<Int4> y)2684 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2685 {
2686 	RR_DEBUG_INFO_UPDATE_LOC();
2687 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), Int4::type()));
2688 }
2689 
CmpNEQ(RValue<Int4> x,RValue<Int4> y)2690 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2691 {
2692 	RR_DEBUG_INFO_UPDATE_LOC();
2693 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2694 }
2695 
CmpNLT(RValue<Int4> x,RValue<Int4> y)2696 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2697 {
2698 	RR_DEBUG_INFO_UPDATE_LOC();
2699 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), Int4::type()));
2700 }
2701 
CmpNLE(RValue<Int4> x,RValue<Int4> y)2702 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2703 {
2704 	RR_DEBUG_INFO_UPDATE_LOC();
2705 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), Int4::type()));
2706 }
2707 
Max(RValue<Int4> x,RValue<Int4> y)2708 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2709 {
2710 	RR_DEBUG_INFO_UPDATE_LOC();
2711 #if defined(__i386__) || defined(__x86_64__)
2712 	if(CPUID::supportsSSE4_1())
2713 	{
2714 		return x86::pmaxsd(x, y);
2715 	}
2716 	else
2717 #endif
2718 	{
2719 		RValue<Int4> greater = CmpNLE(x, y);
2720 		return (x & greater) | (y & ~greater);
2721 	}
2722 }
2723 
Min(RValue<Int4> x,RValue<Int4> y)2724 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2725 {
2726 	RR_DEBUG_INFO_UPDATE_LOC();
2727 #if defined(__i386__) || defined(__x86_64__)
2728 	if(CPUID::supportsSSE4_1())
2729 	{
2730 		return x86::pminsd(x, y);
2731 	}
2732 	else
2733 #endif
2734 	{
2735 		RValue<Int4> less = CmpLT(x, y);
2736 		return (x & less) | (y & ~less);
2737 	}
2738 }
2739 
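// The non-SSE4.1 fallbacks above blend per lane with an all-ones/all-zeros
// mask: CmpNLE/CmpLT produce 0xFFFFFFFF in lanes where the condition holds and
// 0 elsewhere, so (x & mask) | (y & ~mask) picks x in those lanes and y in the
// rest, acting as a lane-wise select.
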
RoundInt(RValue<Float4> cast)2740 RValue<Int4> RoundInt(RValue<Float4> cast)
2741 {
2742 	RR_DEBUG_INFO_UPDATE_LOC();
2743 #if defined(__i386__) || defined(__x86_64__)
2744 	return x86::cvtps2dq(cast);
2745 #else
2746 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2747 #endif
2748 }
2749 
RoundIntClamped(RValue<Float4> cast)2750 RValue<Int4> RoundIntClamped(RValue<Float4> cast)
2751 {
2752 	RR_DEBUG_INFO_UPDATE_LOC();
2753 #if defined(__i386__) || defined(__x86_64__)
2754 	// cvtps2dq produces 0x80000000, a negative value, for input larger than
2755 	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
2756 	// saturate to 0x80000000.
2757 	return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
2758 #else
2759 	// ARM saturates to the largest positive or negative integer. Unit tests
2760 	// verify that lowerRoundInt() behaves as desired.
2761 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2762 #endif
2763 }
2764 
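// About the clamp constant above: 0x7FFFFF80 = 2147483520 is the largest
// float-representable value below 2^31; INT_MAX (2147483647) has no exact
// float representation and would round up to 2^31, which cvtps2dq converts
// to 0x80000000.
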
MulHigh(RValue<Int4> x,RValue<Int4> y)2765 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2766 {
2767 	RR_DEBUG_INFO_UPDATE_LOC();
2768 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2769 	return As<Int4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2770 }
2771 
MulHigh(RValue<UInt4> x,RValue<UInt4> y)2772 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2773 {
2774 	RR_DEBUG_INFO_UPDATE_LOC();
2775 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2776 	return As<UInt4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2777 }
2778 
PackSigned(RValue<Int4> x,RValue<Int4> y)2779 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2780 {
2781 	RR_DEBUG_INFO_UPDATE_LOC();
2782 #if defined(__i386__) || defined(__x86_64__)
2783 	return x86::packssdw(x, y);
2784 #else
2785 	return As<Short8>(V(lowerPack(V(x.value()), V(y.value()), true)));
2786 #endif
2787 }
2788 
PackUnsigned(RValue<Int4> x,RValue<Int4> y)2789 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2790 {
2791 	RR_DEBUG_INFO_UPDATE_LOC();
2792 #if defined(__i386__) || defined(__x86_64__)
2793 	return x86::packusdw(x, y);
2794 #else
2795 	return As<UShort8>(V(lowerPack(V(x.value()), V(y.value()), false)));
2796 #endif
2797 }
2798 
SignMask(RValue<Int4> x)2799 RValue<Int> SignMask(RValue<Int4> x)
2800 {
2801 	RR_DEBUG_INFO_UPDATE_LOC();
2802 #if defined(__i386__) || defined(__x86_64__)
2803 	return x86::movmskps(As<Float4>(x));
2804 #else
2805 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
2806 #endif
2807 }
2808 
type()2809 Type *Int4::type()
2810 {
2811 	return T(llvm::VectorType::get(T(Int::type()), 4, false));
2812 }
2813 
UInt4(RValue<Float4> cast)2814 UInt4::UInt4(RValue<Float4> cast)
2815     : XYZW(this)
2816 {
2817 	RR_DEBUG_INFO_UPDATE_LOC();
2818 	Value *xyzw = Nucleus::createFPToUI(cast.value(), UInt4::type());
2819 	storeValue(xyzw);
2820 }
2821 
UInt4(RValue<UInt> rhs)2822 UInt4::UInt4(RValue<UInt> rhs)
2823     : XYZW(this)
2824 {
2825 	RR_DEBUG_INFO_UPDATE_LOC();
2826 	Value *vector = loadValue();
2827 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2828 
2829 	int swizzle[4] = { 0, 0, 0, 0 };
2830 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2831 
2832 	storeValue(replicate);
2833 }
2834 
operator <<(RValue<UInt4> lhs,unsigned char rhs)2835 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2836 {
2837 	RR_DEBUG_INFO_UPDATE_LOC();
2838 #if defined(__i386__) || defined(__x86_64__)
2839 	return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2840 #else
2841 	return As<UInt4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2842 #endif
2843 }
2844 
operator >>(RValue<UInt4> lhs,unsigned char rhs)2845 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2846 {
2847 	RR_DEBUG_INFO_UPDATE_LOC();
2848 #if defined(__i386__) || defined(__x86_64__)
2849 	return x86::psrld(lhs, rhs);
2850 #else
2851 	return As<UInt4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2852 #endif
2853 }
2854 
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2855 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2856 {
2857 	RR_DEBUG_INFO_UPDATE_LOC();
2858 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2859 }
2860 
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2861 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2862 {
2863 	RR_DEBUG_INFO_UPDATE_LOC();
2864 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), Int4::type()));
2865 }
2866 
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2867 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2868 {
2869 	RR_DEBUG_INFO_UPDATE_LOC();
2870 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), Int4::type()));
2871 }
2872 
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2873 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2874 {
2875 	RR_DEBUG_INFO_UPDATE_LOC();
2876 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2877 }
2878 
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2879 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2880 {
2881 	RR_DEBUG_INFO_UPDATE_LOC();
2882 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), Int4::type()));
2883 }
2884 
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2885 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2886 {
2887 	RR_DEBUG_INFO_UPDATE_LOC();
2888 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), Int4::type()));
2889 }
2890 
Max(RValue<UInt4> x,RValue<UInt4> y)2891 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2892 {
2893 	RR_DEBUG_INFO_UPDATE_LOC();
2894 #if defined(__i386__) || defined(__x86_64__)
2895 	if(CPUID::supportsSSE4_1())
2896 	{
2897 		return x86::pmaxud(x, y);
2898 	}
2899 	else
2900 #endif
2901 	{
2902 		RValue<UInt4> greater = CmpNLE(x, y);
2903 		return (x & greater) | (y & ~greater);
2904 	}
2905 }
2906 
Min(RValue<UInt4> x,RValue<UInt4> y)2907 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2908 {
2909 	RR_DEBUG_INFO_UPDATE_LOC();
2910 #if defined(__i386__) || defined(__x86_64__)
2911 	if(CPUID::supportsSSE4_1())
2912 	{
2913 		return x86::pminud(x, y);
2914 	}
2915 	else
2916 #endif
2917 	{
2918 		RValue<UInt4> less = CmpLT(x, y);
2919 		return (x & less) | (y & ~less);
2920 	}
2921 }
2922 
type()2923 Type *UInt4::type()
2924 {
2925 	return T(llvm::VectorType::get(T(UInt::type()), 4, false));
2926 }
2927 
type()2928 Type *Half::type()
2929 {
2930 	return T(llvm::Type::getInt16Ty(*jit->context));
2931 }
2932 
Rcp_pp(RValue<Float> x,bool exactAtPow2)2933 RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
2934 {
2935 	RR_DEBUG_INFO_UPDATE_LOC();
2936 #if defined(__i386__) || defined(__x86_64__)
2937 	if(exactAtPow2)
2938 	{
2939 		// rcpss uses a piecewise-linear approximation which minimizes the relative error
2940 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2941 		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2942 	}
2943 	return x86::rcpss(x);
2944 #else
2945 	return As<Float>(V(lowerRCP(V(x.value()))));
2946 #endif
2947 }
2948 
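// The exactAtPow2 correction above is evaluated on the host while the routine
// is built: _mm_rcp_ss(1.0f) yields the approximate reciprocal of 1.0, and
// scaling by 1.0f / that value cancels the error at 1.0. Since the relative
// error of rcpss depends only on the input mantissa, the same factor also
// makes power-of-two inputs exact.
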
RcpSqrt_pp(RValue<Float> x)2949 RValue<Float> RcpSqrt_pp(RValue<Float> x)
2950 {
2951 	RR_DEBUG_INFO_UPDATE_LOC();
2952 #if defined(__i386__) || defined(__x86_64__)
2953 	return x86::rsqrtss(x);
2954 #else
2955 	return As<Float>(V(lowerRSQRT(V(x.value()))));
2956 #endif
2957 }
2958 
HasRcpApprox()2959 bool HasRcpApprox()
2960 {
2961 #if defined(__i386__) || defined(__x86_64__)
2962 	return true;
2963 #else
2964 	return false;
2965 #endif
2966 }
2967 
RcpApprox(RValue<Float4> x,bool exactAtPow2)2968 RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
2969 {
2970 #if defined(__i386__) || defined(__x86_64__)
2971 	if(exactAtPow2)
2972 	{
2973 		// rcpps uses a piecewise-linear approximation which minimizes the relative error
2974 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2975 		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2976 	}
2977 	return x86::rcpps(x);
2978 #else
2979 	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2980 	return { 0.0f };
2981 #endif
2982 }
2983 
RcpApprox(RValue<Float> x,bool exactAtPow2)2984 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
2985 {
2986 #if defined(__i386__) || defined(__x86_64__)
2987 	if(exactAtPow2)
2988 	{
2989 		// rcpss uses a piecewise-linear approximation which minimizes the relative error
2990 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2991 		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2992 	}
2993 	return x86::rcpss(x);
2994 #else
2995 	UNREACHABLE("RValue<Float> RcpApprox() not available on this platform");
2996 	return { 0.0f };
2997 #endif
2998 }
2999 
HasRcpSqrtApprox()3000 bool HasRcpSqrtApprox()
3001 {
3002 #if defined(__i386__) || defined(__x86_64__)
3003 	return true;
3004 #else
3005 	return false;
3006 #endif
3007 }
3008 
RcpSqrtApprox(RValue<Float4> x)3009 RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
3010 {
3011 #if defined(__i386__) || defined(__x86_64__)
3012 	return x86::rsqrtps(x);
3013 #else
3014 	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
3015 	return { 0.0f };
3016 #endif
3017 }
3018 
RcpSqrtApprox(RValue<Float> x)3019 RValue<Float> RcpSqrtApprox(RValue<Float> x)
3020 {
3021 #if defined(__i386__) || defined(__x86_64__)
3022 	return x86::rsqrtss(x);
3023 #else
3024 	UNREACHABLE("RValue<Float> RcpSqrtApprox() not available on this platform");
3025 	return { 0.0f };
3026 #endif
3027 }
3028 
Sqrt(RValue<Float> x)3029 RValue<Float> Sqrt(RValue<Float> x)
3030 {
3031 	RR_DEBUG_INFO_UPDATE_LOC();
3032 #if defined(__i386__) || defined(__x86_64__)
3033 	return x86::sqrtss(x);
3034 #else
3035 	return As<Float>(V(lowerSQRT(V(x.value()))));
3036 #endif
3037 }
3038 
Round(RValue<Float> x)3039 RValue<Float> Round(RValue<Float> x)
3040 {
3041 	RR_DEBUG_INFO_UPDATE_LOC();
3042 #if defined(__i386__) || defined(__x86_64__)
3043 	if(CPUID::supportsSSE4_1())
3044 	{
3045 		return x86::roundss(x, 0);
3046 	}
3047 	else
3048 	{
3049 		return Float4(Round(Float4(x))).x;
3050 	}
3051 #else
3052 	return RValue<Float>(V(lowerRound(V(x.value()))));
3053 #endif
3054 }
3055 
Trunc(RValue<Float> x)3056 RValue<Float> Trunc(RValue<Float> x)
3057 {
3058 	RR_DEBUG_INFO_UPDATE_LOC();
3059 #if defined(__i386__) || defined(__x86_64__)
3060 	if(CPUID::supportsSSE4_1())
3061 	{
3062 		return x86::roundss(x, 3);
3063 	}
3064 	else
3065 	{
3066 		return Float(Int(x));  // Rounded toward zero
3067 	}
3068 #else
3069 	return RValue<Float>(V(lowerTrunc(V(x.value()))));
3070 #endif
3071 }
3072 
Frac(RValue<Float> x)3073 RValue<Float> Frac(RValue<Float> x)
3074 {
3075 	RR_DEBUG_INFO_UPDATE_LOC();
3076 #if defined(__i386__) || defined(__x86_64__)
3077 	if(CPUID::supportsSSE4_1())
3078 	{
3079 		return x - x86::floorss(x);
3080 	}
3081 	else
3082 	{
3083 		return Float4(Frac(Float4(x))).x;
3084 	}
3085 #else
3086 	// x - floor(x) can be 1.0 for very small negative x.
3087 	// Clamp against the value just below 1.0.
3088 	return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
3089 #endif
3090 }
3091 
Floor(RValue<Float> x)3092 RValue<Float> Floor(RValue<Float> x)
3093 {
3094 	RR_DEBUG_INFO_UPDATE_LOC();
3095 #if defined(__i386__) || defined(__x86_64__)
3096 	if(CPUID::supportsSSE4_1())
3097 	{
3098 		return x86::floorss(x);
3099 	}
3100 	else
3101 	{
3102 		return Float4(Floor(Float4(x))).x;
3103 	}
3104 #else
3105 	return RValue<Float>(V(lowerFloor(V(x.value()))));
3106 #endif
3107 }
3108 
Ceil(RValue<Float> x)3109 RValue<Float> Ceil(RValue<Float> x)
3110 {
3111 	RR_DEBUG_INFO_UPDATE_LOC();
3112 #if defined(__i386__) || defined(__x86_64__)
3113 	if(CPUID::supportsSSE4_1())
3114 	{
3115 		return x86::ceilss(x);
3116 	}
3117 	else
3118 #endif
3119 	{
3120 		return Float4(Ceil(Float4(x))).x;
3121 	}
3122 }
3123 
type()3124 Type *Float::type()
3125 {
3126 	return T(llvm::Type::getFloatTy(*jit->context));
3127 }
3128 
type()3129 Type *Float2::type()
3130 {
3131 	return T(Type_v2f32);
3132 }
3133 
Exp2(RValue<Float> v)3134 RValue<Float> Exp2(RValue<Float> v)
3135 {
3136 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::type()) });
3137 	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3138 }
3139 
Log2(RValue<Float> v)3140 RValue<Float> Log2(RValue<Float> v)
3141 {
3142 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::type()) });
3143 	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3144 }
3145 
Float4(RValue<Float> rhs)3146 Float4::Float4(RValue<Float> rhs)
3147     : XYZW(this)
3148 {
3149 	RR_DEBUG_INFO_UPDATE_LOC();
3150 	Value *vector = loadValue();
3151 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
3152 
3153 	int swizzle[4] = { 0, 0, 0, 0 };
3154 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
3155 
3156 	storeValue(replicate);
3157 }
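
// Sketch of the effect (illustrative only): Float4(Float(2.0f)) broadcasts the
// scalar into all four lanes, producing {2.0f, 2.0f, 2.0f, 2.0f}, via an insert
// into lane 0 followed by the {0, 0, 0, 0} shuffle above.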
3158 
Max(RValue<Float4> x,RValue<Float4> y)3159 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3160 {
3161 	RR_DEBUG_INFO_UPDATE_LOC();
3162 #if defined(__i386__) || defined(__x86_64__)
3163 	return x86::maxps(x, y);
3164 #else
3165 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
3166 #endif
3167 }
3168 
Min(RValue<Float4> x,RValue<Float4> y)3169 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3170 {
3171 	RR_DEBUG_INFO_UPDATE_LOC();
3172 #if defined(__i386__) || defined(__x86_64__)
3173 	return x86::minps(x, y);
3174 #else
3175 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
3176 #endif
3177 }
3178 
Rcp_pp(RValue<Float4> x,bool exactAtPow2)3179 RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
3180 {
3181 	RR_DEBUG_INFO_UPDATE_LOC();
3182 #if defined(__i386__) || defined(__x86_64__)
3183 	if(exactAtPow2)
3184 	{
3185 		// rcpps uses a piecewise-linear approximation which minimizes the relative error
3186 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
3187 		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
3188 	}
3189 	return x86::rcpps(x);
3190 #else
3191 	return As<Float4>(V(lowerRCP(V(x.value()))));
3192 #endif
3193 }
3194 
RcpSqrt_pp(RValue<Float4> x)3195 RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
3196 {
3197 	RR_DEBUG_INFO_UPDATE_LOC();
3198 #if defined(__i386__) || defined(__x86_64__)
3199 	return x86::rsqrtps(x);
3200 #else
3201 	return As<Float4>(V(lowerRSQRT(V(x.value()))));
3202 #endif
3203 }
3204 
Sqrt(RValue<Float4> x)3205 RValue<Float4> Sqrt(RValue<Float4> x)
3206 {
3207 	RR_DEBUG_INFO_UPDATE_LOC();
3208 #if defined(__i386__) || defined(__x86_64__)
3209 	return x86::sqrtps(x);
3210 #else
3211 	return As<Float4>(V(lowerSQRT(V(x.value()))));
3212 #endif
3213 }
3214 
SignMask(RValue<Float4> x)3215 RValue<Int> SignMask(RValue<Float4> x)
3216 {
3217 	RR_DEBUG_INFO_UPDATE_LOC();
3218 #if defined(__i386__) || defined(__x86_64__)
3219 	return x86::movmskps(x);
3220 #else
3221 	return As<Int>(V(lowerFPSignMask(V(x.value()), T(Int::type()))));
3222 #endif
3223 }
3224 
CmpEQ(RValue<Float4> x,RValue<Float4> y)3225 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3226 {
3227 	RR_DEBUG_INFO_UPDATE_LOC();
3228 	//	return As<Int4>(x86::cmpeqps(x, y));
3229 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), Int4::type()));
3230 }
3231 
CmpLT(RValue<Float4> x,RValue<Float4> y)3232 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3233 {
3234 	RR_DEBUG_INFO_UPDATE_LOC();
3235 	//	return As<Int4>(x86::cmpltps(x, y));
3236 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), Int4::type()));
3237 }
3238 
CmpLE(RValue<Float4> x,RValue<Float4> y)3239 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3240 {
3241 	RR_DEBUG_INFO_UPDATE_LOC();
3242 	//	return As<Int4>(x86::cmpleps(x, y));
3243 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), Int4::type()));
3244 }
3245 
CmpNEQ(RValue<Float4> x,RValue<Float4> y)3246 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3247 {
3248 	RR_DEBUG_INFO_UPDATE_LOC();
3249 	//	return As<Int4>(x86::cmpneqps(x, y));
3250 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), Int4::type()));
3251 }
3252 
CmpNLT(RValue<Float4> x,RValue<Float4> y)3253 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3254 {
3255 	RR_DEBUG_INFO_UPDATE_LOC();
3256 	//	return As<Int4>(x86::cmpnltps(x, y));
3257 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), Int4::type()));
3258 }
3259 
CmpNLE(RValue<Float4> x,RValue<Float4> y)3260 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3261 {
3262 	RR_DEBUG_INFO_UPDATE_LOC();
3263 	//	return As<Int4>(x86::cmpnleps(x, y));
3264 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), Int4::type()));
3265 }
3266 
CmpUEQ(RValue<Float4> x,RValue<Float4> y)3267 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3268 {
3269 	RR_DEBUG_INFO_UPDATE_LOC();
3270 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), Int4::type()));
3271 }
3272 
CmpULT(RValue<Float4> x,RValue<Float4> y)3273 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3274 {
3275 	RR_DEBUG_INFO_UPDATE_LOC();
3276 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), Int4::type()));
3277 }
3278 
CmpULE(RValue<Float4> x,RValue<Float4> y)3279 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3280 {
3281 	RR_DEBUG_INFO_UPDATE_LOC();
3282 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), Int4::type()));
3283 }
3284 
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)3285 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3286 {
3287 	RR_DEBUG_INFO_UPDATE_LOC();
3288 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), Int4::type()));
3289 }
3290 
CmpUNLT(RValue<Float4> x,RValue<Float4> y)3291 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3292 {
3293 	RR_DEBUG_INFO_UPDATE_LOC();
3294 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), Int4::type()));
3295 }
3296 
CmpUNLE(RValue<Float4> x,RValue<Float4> y)3297 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3298 {
3299 	RR_DEBUG_INFO_UPDATE_LOC();
3300 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), Int4::type()));
3301 }
3302 
Round(RValue<Float4> x)3303 RValue<Float4> Round(RValue<Float4> x)
3304 {
3305 	RR_DEBUG_INFO_UPDATE_LOC();
3306 #if defined(__i386__) || defined(__x86_64__)
3307 	if(CPUID::supportsSSE4_1())
3308 	{
3309 		return x86::roundps(x, 0);
3310 	}
3311 	else
3312 	{
3313 		return Float4(RoundInt(x));
3314 	}
3315 #else
3316 	return RValue<Float4>(V(lowerRound(V(x.value()))));
3317 #endif
3318 }
3319 
Trunc(RValue<Float4> x)3320 RValue<Float4> Trunc(RValue<Float4> x)
3321 {
3322 	RR_DEBUG_INFO_UPDATE_LOC();
3323 #if defined(__i386__) || defined(__x86_64__)
3324 	if(CPUID::supportsSSE4_1())
3325 	{
3326 		return x86::roundps(x, 3);
3327 	}
3328 	else
3329 	{
3330 		return Float4(Int4(x));
3331 	}
3332 #else
3333 	return RValue<Float4>(V(lowerTrunc(V(x.value()))));
3334 #endif
3335 }
3336 
Frac(RValue<Float4> x)3337 RValue<Float4> Frac(RValue<Float4> x)
3338 {
3339 	RR_DEBUG_INFO_UPDATE_LOC();
3340 	Float4 frc;
3341 
3342 #if defined(__i386__) || defined(__x86_64__)
3343 	if(CPUID::supportsSSE4_1())
3344 	{
3345 		frc = x - x86::floorps(x);
3346 	}
3347 	else
3348 	{
3349 		frc = x - Float4(Int4(x));  // Signed fractional part.
3350 
3351 		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));  // Add 1.0 if negative.
3352 	}
3353 #else
3354 	frc = x - Floor(x);
3355 #endif
3356 
3357 	// x - floor(x) can be 1.0 for very small negative x.
3358 	// Clamp against the value just below 1.0.
3359 	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3360 }
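
// Worked example for the clamp above (illustrative only): 0x3F7FFFFF is the bit
// pattern of the largest float below 1.0 (~0.99999994f). For a tiny negative
// input such as x = -1.0e-8f, x - Floor(x) = -1.0e-8f + 1.0f rounds to exactly
// 1.0f in single precision; the Min() pulls it back just under 1.0.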
3361 
Floor(RValue<Float4> x)3362 RValue<Float4> Floor(RValue<Float4> x)
3363 {
3364 	RR_DEBUG_INFO_UPDATE_LOC();
3365 #if defined(__i386__) || defined(__x86_64__)
3366 	if(CPUID::supportsSSE4_1())
3367 	{
3368 		return x86::floorps(x);
3369 	}
3370 	else
3371 	{
3372 		return x - Frac(x);
3373 	}
3374 #else
3375 	return RValue<Float4>(V(lowerFloor(V(x.value()))));
3376 #endif
3377 }
3378 
Ceil(RValue<Float4> x)3379 RValue<Float4> Ceil(RValue<Float4> x)
3380 {
3381 	RR_DEBUG_INFO_UPDATE_LOC();
3382 #if defined(__i386__) || defined(__x86_64__)
3383 	if(CPUID::supportsSSE4_1())
3384 	{
3385 		return x86::ceilps(x);
3386 	}
3387 	else
3388 #endif
3389 	{
3390 		return -Floor(-x);
3391 	}
3392 }
3393 
Sin(RValue<Float4> v)3394 RValue<Float4> Sin(RValue<Float4> v)
3395 {
3396 	RR_DEBUG_INFO_UPDATE_LOC();
3397 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value())->getType() });
3398 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3399 }
3400 
Cos(RValue<Float4> v)3401 RValue<Float4> Cos(RValue<Float4> v)
3402 {
3403 	RR_DEBUG_INFO_UPDATE_LOC();
3404 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value())->getType() });
3405 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3406 }
3407 
Tan(RValue<Float4> v)3408 RValue<Float4> Tan(RValue<Float4> v)
3409 {
3410 	RR_DEBUG_INFO_UPDATE_LOC();
3411 	return Sin(v) / Cos(v);
3412 }
3413 
TransformFloat4PerElement(RValue<Float4> v,const char * name)3414 static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char *name)
3415 {
3416 	auto funcTy = llvm::FunctionType::get(T(Float::type()), llvm::ArrayRef<llvm::Type *>(T(Float::type())), false);
3417 	auto func = jit->module->getOrInsertFunction(name, funcTy);
3418 	llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3419 	for(uint64_t i = 0; i < 4; i++)
3420 	{
3421 		auto el = jit->builder->CreateCall(func, V(Nucleus::createExtractElement(v.value(), Float::type(), i)));
3422 		out = V(Nucleus::createInsertElement(V(out), V(el), i));
3423 	}
3424 	return RValue<Float4>(V(out));
3425 }
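
// The helper above scalarizes the operation: each of the four lanes is
// extracted, passed to the named scalar libm function (e.g. "asinf" below), and
// the results are reassembled into a Float4.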
3426 
Asin(RValue<Float4> v,Precision p)3427 RValue<Float4> Asin(RValue<Float4> v, Precision p)
3428 {
3429 	RR_DEBUG_INFO_UPDATE_LOC();
3430 	return TransformFloat4PerElement(v, "asinf");
3431 }
3432 
Acos(RValue<Float4> v,Precision p)3433 RValue<Float4> Acos(RValue<Float4> v, Precision p)
3434 {
3435 	RR_DEBUG_INFO_UPDATE_LOC();
3436 	return TransformFloat4PerElement(v, "acosf");
3437 }
3438 
Atan(RValue<Float4> v)3439 RValue<Float4> Atan(RValue<Float4> v)
3440 {
3441 	RR_DEBUG_INFO_UPDATE_LOC();
3442 	return TransformFloat4PerElement(v, "atanf");
3443 }
3444 
Sinh(RValue<Float4> v)3445 RValue<Float4> Sinh(RValue<Float4> v)
3446 {
3447 	RR_DEBUG_INFO_UPDATE_LOC();
3448 	return emulated::Sinh(v);
3449 }
3450 
Cosh(RValue<Float4> v)3451 RValue<Float4> Cosh(RValue<Float4> v)
3452 {
3453 	RR_DEBUG_INFO_UPDATE_LOC();
3454 	return emulated::Cosh(v);
3455 }
3456 
Tanh(RValue<Float4> v)3457 RValue<Float4> Tanh(RValue<Float4> v)
3458 {
3459 	RR_DEBUG_INFO_UPDATE_LOC();
3460 	return TransformFloat4PerElement(v, "tanhf");
3461 }
3462 
Asinh(RValue<Float4> v)3463 RValue<Float4> Asinh(RValue<Float4> v)
3464 {
3465 	RR_DEBUG_INFO_UPDATE_LOC();
3466 	return TransformFloat4PerElement(v, "asinhf");
3467 }
3468 
Acosh(RValue<Float4> v)3469 RValue<Float4> Acosh(RValue<Float4> v)
3470 {
3471 	RR_DEBUG_INFO_UPDATE_LOC();
3472 	return TransformFloat4PerElement(v, "acoshf");
3473 }
3474 
Atanh(RValue<Float4> v)3475 RValue<Float4> Atanh(RValue<Float4> v)
3476 {
3477 	RR_DEBUG_INFO_UPDATE_LOC();
3478 	return TransformFloat4PerElement(v, "atanhf");
3479 }
3480 
Atan2(RValue<Float4> x,RValue<Float4> y)3481 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
3482 {
3483 	RR_DEBUG_INFO_UPDATE_LOC();
3484 	llvm::SmallVector<llvm::Type *, 2> paramTys;
3485 	paramTys.push_back(T(Float::type()));
3486 	paramTys.push_back(T(Float::type()));
3487 	auto funcTy = llvm::FunctionType::get(T(Float::type()), paramTys, false);
3488 	auto func = jit->module->getOrInsertFunction("atan2f", funcTy);
3489 	llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3490 	for(uint64_t i = 0; i < 4; i++)
3491 	{
3492 		auto el = jit->builder->CreateCall(func, { V(Nucleus::createExtractElement(x.value(), Float::type(), i)),
3493 		                                           V(Nucleus::createExtractElement(y.value(), Float::type(), i)) });
3494 		out = V(Nucleus::createInsertElement(V(out), V(el), i));
3495 	}
3496 	return RValue<Float4>(V(out));
3497 }
3498 
Pow(RValue<Float4> x,RValue<Float4> y)3499 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
3500 {
3501 	RR_DEBUG_INFO_UPDATE_LOC();
3502 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::type()) });
3503 	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()) })));
3504 }
3505 
Exp(RValue<Float4> v)3506 RValue<Float4> Exp(RValue<Float4> v)
3507 {
3508 	RR_DEBUG_INFO_UPDATE_LOC();
3509 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::type()) });
3510 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3511 }
3512 
Log(RValue<Float4> v)3513 RValue<Float4> Log(RValue<Float4> v)
3514 {
3515 	RR_DEBUG_INFO_UPDATE_LOC();
3516 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::type()) });
3517 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3518 }
3519 
Exp2(RValue<Float4> v)3520 RValue<Float4> Exp2(RValue<Float4> v)
3521 {
3522 	RR_DEBUG_INFO_UPDATE_LOC();
3523 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::type()) });
3524 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3525 }
3526 
Log2(RValue<Float4> v)3527 RValue<Float4> Log2(RValue<Float4> v)
3528 {
3529 	RR_DEBUG_INFO_UPDATE_LOC();
3530 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::type()) });
3531 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3532 }
3533 
Ctlz(RValue<UInt> v,bool isZeroUndef)3534 RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
3535 {
3536 	RR_DEBUG_INFO_UPDATE_LOC();
3537 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::type()) });
3538 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3539 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3540 }
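
// Example values, per the llvm.ctlz semantics (illustrative only):
//   Ctlz(UInt(1), false)          == 31
//   Ctlz(UInt(0x80000000), false) == 0
//   Ctlz(UInt(0), false)          == 32, whereas isZeroUndef == true leaves the
//   zero case undefined, which may permit a cheaper lowering (e.g. a bare BSR).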
3541 
Ctlz(RValue<UInt4> v,bool isZeroUndef)3542 RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
3543 {
3544 	RR_DEBUG_INFO_UPDATE_LOC();
3545 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::type()) });
3546 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3547 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3548 }
3549 
Cttz(RValue<UInt> v,bool isZeroUndef)3550 RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
3551 {
3552 	RR_DEBUG_INFO_UPDATE_LOC();
3553 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::type()) });
3554 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3555 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3556 }
3557 
Cttz(RValue<UInt4> v,bool isZeroUndef)3558 RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
3559 {
3560 	RR_DEBUG_INFO_UPDATE_LOC();
3561 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::type()) });
3562 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3563 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3564 }
3565 
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3566 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3567 {
3568 	return RValue<Int>(Nucleus::createAtomicMin(x.value(), y.value(), memoryOrder));
3569 }
3570 
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3571 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3572 {
3573 	return RValue<UInt>(Nucleus::createAtomicUMin(x.value(), y.value(), memoryOrder));
3574 }
3575 
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3576 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3577 {
3578 	return RValue<Int>(Nucleus::createAtomicMax(x.value(), y.value(), memoryOrder));
3579 }
3580 
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3581 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3582 {
3583 	return RValue<UInt>(Nucleus::createAtomicUMax(x.value(), y.value(), memoryOrder));
3584 }
3585 
type()3586 Type *Float4::type()
3587 {
3588 	return T(llvm::VectorType::get(T(Float::type()), 4, false));
3589 }
3590 
Ticks()3591 RValue<Long> Ticks()
3592 {
3593 	RR_DEBUG_INFO_UPDATE_LOC();
3594 	llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
3595 
3596 	return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
3597 }
3598 
ConstantPointer(void const * ptr)3599 RValue<Pointer<Byte>> ConstantPointer(void const *ptr)
3600 {
3601 	RR_DEBUG_INFO_UPDATE_LOC();
3602 	// Note: this also works for 32-bit pointers because 'inttoptr' is defined
3603 	// to truncate or zero-extend the integer as necessary.
3604 	auto ptrAsInt = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), reinterpret_cast<uintptr_t>(ptr));
3605 	return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::type()))));
3606 }
3607 
ConstantData(void const * data,size_t size)3608 RValue<Pointer<Byte>> ConstantData(void const *data, size_t size)
3609 {
3610 	RR_DEBUG_INFO_UPDATE_LOC();
3611 	auto str = ::std::string(reinterpret_cast<const char *>(data), size);
3612 	auto ptr = jit->builder->CreateGlobalStringPtr(str);
3613 	return RValue<Pointer<Byte>>(V(ptr));
3614 }
3615 
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)3616 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
3617 {
3618 	// If this is a MemorySanitizer build, but Reactor routine instrumentation is not enabled,
3619 	// mark all call arguments as initialized by calling __msan_unpoison_param().
3620 	if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
3621 	{
3622 		// void __msan_unpoison_param(size_t n)
3623 		auto voidTy = llvm::Type::getVoidTy(*jit->context);
3624 		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
3625 		auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy }, false);
3626 		auto func = jit->module->getOrInsertFunction("__msan_unpoison_param", funcTy);
3627 
3628 		jit->builder->CreateCall(func, { llvm::ConstantInt::get(sizetTy, args.size()) });
3629 	}
3630 
3631 	RR_DEBUG_INFO_UPDATE_LOC();
3632 	llvm::SmallVector<llvm::Type *, 8> paramTys;
3633 	for(auto ty : argTys) { paramTys.push_back(T(ty)); }
3634 	auto funcTy = llvm::FunctionType::get(T(retTy), paramTys, false);
3635 
3636 	auto funcPtrTy = funcTy->getPointerTo();
3637 	auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value()), funcPtrTy);
3638 
3639 	llvm::SmallVector<llvm::Value *, 8> arguments;
3640 	for(auto arg : args) { arguments.push_back(V(arg)); }
3641 	return V(jit->builder->CreateCall(funcTy, funcPtr, arguments));
3642 }
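
// Hypothetical use of the helper above (names are placeholders, not part of the
// build): for a function pointer 'fn' of Reactor type Pointer<Byte> addressing
// an 'int(int)' function, and an Int argument 'arg':
//   Value *ret = Call(fn, Int::type(), { arg.value() }, { Int::type() });
// builds an indirect call through 'fn' and returns the result as a Value*.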
3643 
Breakpoint()3644 void Breakpoint()
3645 {
3646 	RR_DEBUG_INFO_UPDATE_LOC();
3647 	llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
3648 
3649 	jit->builder->CreateCall(debugtrap);
3650 }
3651 
3652 }  // namespace rr
3653 
3654 namespace rr {
3655 
3656 #if defined(__i386__) || defined(__x86_64__)
3657 namespace x86 {
3658 
3659 // Differs from IRBuilder<>::CreateUnaryIntrinsic() in that it only accepts native instruction intrinsics which have
3660 // implicit types, such as 'x86_sse_rcp_ps' operating on v4f32, while 'sqrt' requires explicitly specifying the operand type.
createInstruction(llvm::Intrinsic::ID id,Value * x)3661 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x)
3662 {
3663 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3664 
3665 	return V(jit->builder->CreateCall(intrinsic, V(x)));
3666 }
3667 
3668 // Differs from IRBuilder<>::CreateBinaryIntrinsic() in that it only accepts native instruction intrinsics which have
3669 // implicit types, such as 'x86_sse_max_ps' operating on v4f32, while 'sadd_sat' requires explicitly specifying the operand types.
createInstruction(llvm::Intrinsic::ID id,Value * x,Value * y)3670 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x, Value *y)
3671 {
3672 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3673 
3674 	return V(jit->builder->CreateCall(intrinsic, { V(x), V(y) }));
3675 }
3676 
cvtss2si(RValue<Float> val)3677 RValue<Int> cvtss2si(RValue<Float> val)
3678 {
3679 	Float4 vector;
3680 	vector.x = val;
3681 
3682 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_cvtss2si, RValue<Float4>(vector).value()));
3683 }
3684 
cvtps2dq(RValue<Float4> val)3685 RValue<Int4> cvtps2dq(RValue<Float4> val)
3686 {
3687 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_cvtps2dq, val.value()));
3688 }
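
// Note: both conversions above (CVTSS2SI / CVTPS2DQ) round according to the
// current MXCSR rounding mode, round-to-nearest-even by default, unlike a
// C-style cast, which truncates toward zero.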
3689 
rcpss(RValue<Float> val)3690 RValue<Float> rcpss(RValue<Float> val)
3691 {
3692 	Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3693 
3694 	// TODO(b/172238865): MemorySanitizer does not support the rcpss instruction,
3695 	// which makes it look at the entire 128-bit input operand for undefined bits.
3696 	// Use zero-initialized values instead.
3697 	if(__has_feature(memory_sanitizer))
3698 	{
3699 		undef = Float4(0).loadValue();
3700 	}
3701 
3702 	Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3703 
3704 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rcp_ss, vector), Float::type(), 0));
3705 }
3706 
sqrtss(RValue<Float> val)3707 RValue<Float> sqrtss(RValue<Float> val)
3708 {
3709 	return RValue<Float>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3710 }
3711 
rsqrtss(RValue<Float> val)3712 RValue<Float> rsqrtss(RValue<Float> val)
3713 {
3714 	Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3715 
3716 	// TODO(b/172238865): MemorySanitizer does not support the rsqrtss instruction,
3717 	// which makes it look at the entire 128-bit input operand for undefined bits.
3718 	// Use zero-initialized values instead.
3719 	if(__has_feature(memory_sanitizer))
3720 	{
3721 		undef = Float4(0).loadValue();
3722 	}
3723 
3724 	Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3725 
3726 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ss, vector), Float::type(), 0));
3727 }
3728 
rcpps(RValue<Float4> val)3729 RValue<Float4> rcpps(RValue<Float4> val)
3730 {
3731 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rcp_ps, val.value()));
3732 }
3733 
sqrtps(RValue<Float4> val)3734 RValue<Float4> sqrtps(RValue<Float4> val)
3735 {
3736 	return RValue<Float4>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3737 }
3738 
rsqrtps(RValue<Float4> val)3739 RValue<Float4> rsqrtps(RValue<Float4> val)
3740 {
3741 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ps, val.value()));
3742 }
3743 
maxps(RValue<Float4> x,RValue<Float4> y)3744 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3745 {
3746 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_max_ps, x.value(), y.value()));
3747 }
3748 
minps(RValue<Float4> x,RValue<Float4> y)3749 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3750 {
3751 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_min_ps, x.value(), y.value()));
3752 }
3753 
roundss(RValue<Float> val,unsigned char imm)3754 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3755 {
3756 	llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3757 
3758 	Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3759 
3760 	// TODO(b/172238865): MemorySanitizer does not support the roundss instruction,
3761 	// which makes it look at the entire 128-bit input operands for undefined bits.
3762 	// Use zero-initialized values instead.
3763 	if(__has_feature(memory_sanitizer))
3764 	{
3765 		undef = Float4(0).loadValue();
3766 	}
3767 
3768 	Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3769 
3770 	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(roundss, { V(undef), V(vector), V(Nucleus::createConstantInt(imm)) })), Float::type(), 0));
3771 }
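
// Rounding-control immediate used by roundss/roundps in this file (bit 2 clear,
// so the immediate overrides MXCSR): 0 = round to nearest even, 1 = round down
// (floor), 2 = round up (ceil), 3 = round toward zero (truncate).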
3772 
floorss(RValue<Float> val)3773 RValue<Float> floorss(RValue<Float> val)
3774 {
3775 	return roundss(val, 1);
3776 }
3777 
ceilss(RValue<Float> val)3778 RValue<Float> ceilss(RValue<Float> val)
3779 {
3780 	return roundss(val, 2);
3781 }
3782 
roundps(RValue<Float4> val,unsigned char imm)3783 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3784 {
3785 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse41_round_ps, val.value(), Nucleus::createConstantInt(imm)));
3786 }
3787 
floorps(RValue<Float4> val)3788 RValue<Float4> floorps(RValue<Float4> val)
3789 {
3790 	return roundps(val, 1);
3791 }
3792 
ceilps(RValue<Float4> val)3793 RValue<Float4> ceilps(RValue<Float4> val)
3794 {
3795 	return roundps(val, 2);
3796 }
3797 
pabsd(RValue<Int4> x)3798 RValue<Int4> pabsd(RValue<Int4> x)
3799 {
3800 	return RValue<Int4>(V(lowerPABS(V(x.value()))));
3801 }
3802 
paddsw(RValue<Short4> x,RValue<Short4> y)3803 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3804 {
3805 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3806 }
3807 
psubsw(RValue<Short4> x,RValue<Short4> y)3808 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3809 {
3810 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3811 }
3812 
paddusw(RValue<UShort4> x,RValue<UShort4> y)3813 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3814 {
3815 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3816 }
3817 
psubusw(RValue<UShort4> x,RValue<UShort4> y)3818 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3819 {
3820 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3821 }
3822 
paddsb(RValue<SByte8> x,RValue<SByte8> y)3823 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3824 {
3825 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3826 }
3827 
psubsb(RValue<SByte8> x,RValue<SByte8> y)3828 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3829 {
3830 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3831 }
3832 
paddusb(RValue<Byte8> x,RValue<Byte8> y)3833 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3834 {
3835 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3836 }
3837 
psubusb(RValue<Byte8> x,RValue<Byte8> y)3838 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3839 {
3840 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3841 }
3842 
pavgw(RValue<UShort4> x,RValue<UShort4> y)3843 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3844 {
3845 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
3846 }
3847 
pmaxsw(RValue<Short4> x,RValue<Short4> y)3848 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3849 {
3850 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3851 }
3852 
pminsw(RValue<Short4> x,RValue<Short4> y)3853 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3854 {
3855 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3856 }
3857 
pcmpgtw(RValue<Short4> x,RValue<Short4> y)3858 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3859 {
3860 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
3861 }
3862 
pcmpeqw(RValue<Short4> x,RValue<Short4> y)3863 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3864 {
3865 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
3866 }
3867 
pcmpgtb(RValue<SByte8> x,RValue<SByte8> y)3868 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3869 {
3870 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
3871 }
3872 
pcmpeqb(RValue<Byte8> x,RValue<Byte8> y)3873 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3874 {
3875 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
3876 }
3877 
packssdw(RValue<Int2> x,RValue<Int2> y)3878 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3879 {
3880 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3881 }
3882 
packssdw(RValue<Int4> x,RValue<Int4> y)3883 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3884 {
3885 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3886 }
3887 
packsswb(RValue<Short4> x,RValue<Short4> y)3888 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3889 {
3890 	return As<SByte8>(createInstruction(llvm::Intrinsic::x86_sse2_packsswb_128, x.value(), y.value()));
3891 }
3892 
packuswb(RValue<Short4> x,RValue<Short4> y)3893 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3894 {
3895 	return As<Byte8>(createInstruction(llvm::Intrinsic::x86_sse2_packuswb_128, x.value(), y.value()));
3896 }
3897 
packusdw(RValue<Int4> x,RValue<Int4> y)3898 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3899 {
3900 	if(CPUID::supportsSSE4_1())
3901 	{
3902 		return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse41_packusdw, x.value(), y.value()));
3903 	}
3904 	else
3905 	{
3906 		RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3907 		RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3908 
3909 		return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3910 	}
3911 }
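
// Worked example of the pre-SSE4.1 bias trick above (illustrative only), for a
// single lane being packed with unsigned saturation to 16 bits:
//   x = 70000: (x & ~(x >> 31)) = 70000, -0x8000 gives 37232, packssdw
//              saturates to 32767, +0x8000 wraps to 65535.
//   x = -5:    (x & ~(x >> 31)) = 0, -0x8000 gives -32768, packssdw keeps
//              -32768, +0x8000 wraps to 0.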
3912 
psrlw(RValue<UShort4> x,unsigned char y)3913 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3914 {
3915 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3916 }
3917 
psrlw(RValue<UShort8> x,unsigned char y)3918 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3919 {
3920 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3921 }
3922 
psraw(RValue<Short4> x,unsigned char y)3923 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3924 {
3925 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3926 }
3927 
psraw(RValue<Short8> x,unsigned char y)3928 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3929 {
3930 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3931 }
3932 
psllw(RValue<Short4> x,unsigned char y)3933 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3934 {
3935 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3936 }
3937 
psllw(RValue<Short8> x,unsigned char y)3938 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3939 {
3940 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3941 }
3942 
pslld(RValue<Int2> x,unsigned char y)3943 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3944 {
3945 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3946 }
3947 
pslld(RValue<Int4> x,unsigned char y)3948 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3949 {
3950 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3951 }
3952 
psrad(RValue<Int2> x,unsigned char y)3953 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3954 {
3955 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3956 }
3957 
psrad(RValue<Int4> x,unsigned char y)3958 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3959 {
3960 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3961 }
3962 
psrld(RValue<UInt2> x,unsigned char y)3963 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3964 {
3965 	return As<UInt2>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3966 }
3967 
psrld(RValue<UInt4> x,unsigned char y)3968 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3969 {
3970 	return RValue<UInt4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3971 }
3972 
pmaxsd(RValue<Int4> x,RValue<Int4> y)3973 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3974 {
3975 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3976 }
3977 
pminsd(RValue<Int4> x,RValue<Int4> y)3978 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3979 {
3980 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3981 }
3982 
pmaxud(RValue<UInt4> x,RValue<UInt4> y)3983 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3984 {
3985 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_UGT)));
3986 }
3987 
pminud(RValue<UInt4> x,RValue<UInt4> y)3988 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3989 {
3990 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_ULT)));
3991 }
3992 
pmulhw(RValue<Short4> x,RValue<Short4> y)3993 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3994 {
3995 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3996 }
3997 
pmulhuw(RValue<UShort4> x,RValue<UShort4> y)3998 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3999 {
4000 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
4001 }
4002 
pmaddwd(RValue<Short4> x,RValue<Short4> y)4003 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
4004 {
4005 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
4006 }
4007 
pmulhw(RValue<Short8> x,RValue<Short8> y)4008 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
4009 {
4010 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
4011 }
4012 
pmulhuw(RValue<UShort8> x,RValue<UShort8> y)4013 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
4014 {
4015 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
4016 }
4017 
pmaddwd(RValue<Short8> x,RValue<Short8> y)4018 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
4019 {
4020 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
4021 }
4022 
movmskps(RValue<Float4> x)4023 RValue<Int> movmskps(RValue<Float4> x)
4024 {
4025 	Value *v = x.value();
4026 
4027 	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
4028 	// which makes it look at the entire 128-bit input for undefined bits. Mask off
4029 	// just the sign bits to avoid false positives.
4030 	if(__has_feature(memory_sanitizer))
4031 	{
4032 		v = As<Float4>(As<Int4>(v) & Int4(0x80000000u)).value();
4033 	}
4034 
4035 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_movmsk_ps, v));
4036 }
4037 
pmovmskb(RValue<Byte8> x)4038 RValue<Int> pmovmskb(RValue<Byte8> x)
4039 {
4040 	Value *v = x.value();
4041 
4042 	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
4043 	// which makes it look at the entire 128-bit input for undefined bits. Mask off
4044 	// just the sign bits in the lower 64-bit vector to avoid false positives.
4045 	if(__has_feature(memory_sanitizer))
4046 	{
4047 		v = As<Byte16>(As<Int4>(v) & Int4(0x80808080u, 0x80808080u, 0, 0)).value();
4048 	}
4049 
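	// Byte8 occupies the low 64 bits of the 128-bit vector, so only the low
	// eight bits of the pmovmskb result are meaningful; mask off the rest.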
4050 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse2_pmovmskb_128, v)) & 0xFF;
4051 }
4052 
pmovzxbd(RValue<Byte16> x)4053 RValue<Int4> pmovzxbd(RValue<Byte16> x)
4054 {
4055 	return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), false)));
4056 }
4057 
pmovsxbd(RValue<SByte16> x)4058 RValue<Int4> pmovsxbd(RValue<SByte16> x)
4059 {
4060 	return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), true)));
4061 }
4062 
pmovzxwd(RValue<UShort8> x)4063 RValue<Int4> pmovzxwd(RValue<UShort8> x)
4064 {
4065 	return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), false)));
4066 }
4067 
pmovsxwd(RValue<Short8> x)4068 RValue<Int4> pmovsxwd(RValue<Short8> x)
4069 {
4070 	return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), true)));
4071 }
4072 
4073 }  // namespace x86
4074 #endif  // defined(__i386__) || defined(__x86_64__)
4075 
4076 #ifdef ENABLE_RR_PRINT
VPrintf(const std::vector<Value * > & vals)4077 void VPrintf(const std::vector<Value *> &vals)
4078 {
4079 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
4080 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4081 	auto funcTy = llvm::FunctionType::get(i32Ty, { i8PtrTy }, true);
4082 	auto func = jit->module->getOrInsertFunction("rr::DebugPrintf", funcTy);
4083 	jit->builder->CreateCall(func, V(vals));
4084 }
4085 #endif  // ENABLE_RR_PRINT
4086 
Nop()4087 void Nop()
4088 {
4089 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4090 	auto funcTy = llvm::FunctionType::get(voidTy, {}, false);
4091 	auto func = jit->module->getOrInsertFunction("nop", funcTy);
4092 	jit->builder->CreateCall(func);
4093 }
4094 
EmitDebugLocation()4095 void EmitDebugLocation()
4096 {
4097 #ifdef ENABLE_RR_DEBUG_INFO
4098 	if(jit->debugInfo != nullptr)
4099 	{
4100 		jit->debugInfo->EmitLocation();
4101 	}
4102 #endif  // ENABLE_RR_DEBUG_INFO
4103 }
4104 
EmitDebugVariable(Value * value)4105 void EmitDebugVariable(Value *value)
4106 {
4107 #ifdef ENABLE_RR_DEBUG_INFO
4108 	if(jit->debugInfo != nullptr)
4109 	{
4110 		jit->debugInfo->EmitVariable(value);
4111 	}
4112 #endif  // ENABLE_RR_DEBUG_INFO
4113 }
4114 
FlushDebug()4115 void FlushDebug()
4116 {
4117 #ifdef ENABLE_RR_DEBUG_INFO
4118 	if(jit->debugInfo != nullptr)
4119 	{
4120 		jit->debugInfo->Flush();
4121 	}
4122 #endif  // ENABLE_RR_DEBUG_INFO
4123 }
4124 
4125 }  // namespace rr
4126 
4127 // ------------------------------  Coroutines ------------------------------
4128 
4129 namespace {
4130 
4131 // Magic values returned by llvm.coro.suspend.
4132 // See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
4133 enum SuspendAction
4134 {
4135 	SuspendActionSuspend = -1,
4136 	SuspendActionResume = 0,
4137 	SuspendActionDestroy = 1
4138 };
4139 
promoteFunctionToCoroutine()4140 void promoteFunctionToCoroutine()
4141 {
4142 	ASSERT(jit->coroutine.id == nullptr);
4143 
4144 	// Types
4145 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4146 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4147 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4148 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
4149 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4150 	auto promiseTy = jit->coroutine.yieldType;
4151 	auto promisePtrTy = promiseTy->getPointerTo();
4152 
4153 	// LLVM intrinsics
4154 	auto coro_id = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
4155 	auto coro_size = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
4156 	auto coro_begin = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
4157 	auto coro_resume = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_resume);
4158 	auto coro_end = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_end);
4159 	auto coro_free = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_free);
4160 	auto coro_destroy = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_destroy);
4161 	auto coro_promise = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_promise);
4162 	auto coro_done = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_done);
4163 	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4164 
4165 	auto allocFrameTy = llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
4166 	auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
4167 	auto freeFrameTy = llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
4168 	auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);
4169 
4170 	auto oldInsertionPoint = jit->builder->saveIP();
4171 
4172 	// Build the coroutine_await() function:
4173 	//
4174 	//    bool coroutine_await(CoroutineHandle* handle, YieldType* out)
4175 	//    {
4176 	//        if(llvm.coro.done(handle))
4177 	//        {
4178 	//            return false;
4179 	//        }
4180 	//        else
4181 	//        {
4182 	//            *out = *(YieldType*)llvm.coro.promise(handle);
4183 	//            llvm.coro.resume(handle);
4184 	//            return true;
4185 	//        }
4186 	//    }
4187 	//
4188 	{
4189 		auto args = jit->coroutine.await->arg_begin();
4190 		auto handle = args++;
4191 		auto outPtr = args++;
4192 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "co_await", jit->coroutine.await));
4193 		auto doneBlock = llvm::BasicBlock::Create(*jit->context, "done", jit->coroutine.await);
4194 		auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->coroutine.await);
4195 
4196 		auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
4197 		jit->builder->CreateCondBr(done, doneBlock, resumeBlock);
4198 
4199 		jit->builder->SetInsertPoint(doneBlock);
4200 		jit->builder->CreateRet(llvm::ConstantInt::getFalse(i1Ty));
4201 
4202 		jit->builder->SetInsertPoint(resumeBlock);
4203 		auto promiseAlignment = llvm::ConstantInt::get(i32Ty, 4);  // TODO: Get correct alignment.
4204 		auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, llvm::ConstantInt::get(i1Ty, 0) });
4205 		auto promise = jit->builder->CreateLoad(promiseTy, jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
4206 		jit->builder->CreateStore(promise, outPtr);
4207 		jit->builder->CreateCall(coro_resume, { handle });
4208 		jit->builder->CreateRet(llvm::ConstantInt::getTrue(i1Ty));
4209 	}
4210 
4211 	// Build the coroutine_destroy() function:
4212 	//
4213 	//    void coroutine_destroy(CoroutineHandle* handle)
4214 	//    {
4215 	//        llvm.coro.destroy(handle);
4216 	//    }
4217 	//
4218 	{
4219 		auto handle = jit->coroutine.destroy->arg_begin();
4220 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4221 		jit->builder->CreateCall(coro_destroy, { handle });
4222 		jit->builder->CreateRetVoid();
4223 	}
4224 
4225 	// Begin building the main coroutine_begin() function.
4226 	//
4227 	//    CoroutineHandle* coroutine_begin(<Arguments>)
4228 	//    {
4229 	//        YieldType promise;
4230 	//        auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
4231 	//        void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
4232 	//        CoroutineHandle *handle = llvm.coro.begin(id, frame);
4233 	//
4234 	//        ... <REACTOR CODE> ...
4235 	//
4236 	//    end:
4237 	//        SuspendAction action = llvm.coro.suspend(none, true /* final */);  // <-- RESUME POINT
4238 	//        switch(action)
4239 	//        {
4240 	//        case SuspendActionResume:
4241 	//            UNREACHABLE(); // Illegal to resume after final suspend.
4242 	//        case SuspendActionDestroy:
4243 	//            goto destroy;
4244 	//        default: // (SuspendActionSuspend)
4245 	//            goto suspend;
4246 	//        }
4247 	//
4248 	//    destroy:
4249 	//        coroutine_free_frame(llvm.coro.free(id, handle));
4250 	//        goto suspend;
4251 	//
4252 	//    suspend:
4253 	//        llvm.coro.end(handle, false);
4254 	//        return handle;
4255 	//    }
4256 	//
4257 
4258 #ifdef ENABLE_RR_DEBUG_INFO
4259 	jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
4260 #endif  // ENABLE_RR_DEBUG_INFO
4261 
4262 	jit->coroutine.suspendBlock = llvm::BasicBlock::Create(*jit->context, "suspend", jit->function);
4263 	jit->coroutine.endBlock = llvm::BasicBlock::Create(*jit->context, "end", jit->function);
4264 	jit->coroutine.destroyBlock = llvm::BasicBlock::Create(*jit->context, "destroy", jit->function);
4265 
4266 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
4267 	jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
4268 	jit->coroutine.id = jit->builder->CreateCall(coro_id, {
4269 	                                                          llvm::ConstantInt::get(i32Ty, 0),
4270 	                                                          jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
4271 	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
4272 	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
4273 	                                                      });
4274 	auto size = jit->builder->CreateCall(coro_size, {});
4275 	auto frame = jit->builder->CreateCall(allocFrame, { size });
4276 	jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });
4277 
4278 	// Build the suspend block
4279 	jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
4280 	jit->builder->CreateCall(coro_end, { jit->coroutine.handle, llvm::ConstantInt::get(i1Ty, 0) });
4281 	jit->builder->CreateRet(jit->coroutine.handle);
4282 
4283 	// Build the end block
4284 	jit->builder->SetInsertPoint(jit->coroutine.endBlock);
4285 	auto action = jit->builder->CreateCall(coro_suspend, {
4286 	                                                         llvm::ConstantTokenNone::get(*jit->context),
4287 	                                                         llvm::ConstantInt::get(i1Ty, 1),  // final: true
4288 	                                                     });
4289 	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4290 	// switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
4291 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4292 
4293 	// Build the destroy block
4294 	jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
4295 	auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
4296 	jit->builder->CreateCall(freeFrame, { memory });
4297 	jit->builder->CreateBr(jit->coroutine.suspendBlock);
4298 
4299 	// Switch back to original insert point to continue building the coroutine.
4300 	jit->builder->restoreIP(oldInsertionPoint);
4301 }
4302 
4303 }  // anonymous namespace
4304 
4305 namespace rr {
4306 
createCoroutine(Type * YieldType,const std::vector<Type * > & Params)4307 void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
4308 {
4309 	// Coroutines are initially created as regular functions.
4310 	// Upon the first call to Yield(), the function is promoted to a true
4311 	// coroutine.
4312 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4313 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4314 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4315 	auto handleTy = i8PtrTy;
4316 	auto boolTy = i1Ty;
4317 	auto promiseTy = T(YieldType);
4318 	auto promisePtrTy = promiseTy->getPointerTo();
4319 
4320 	jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
4321 	jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
4322 	jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
4323 	jit->coroutine.yieldType = promiseTy;
4324 	jit->coroutine.entryBlock = llvm::BasicBlock::Create(*jit->context, "function", jit->function);
4325 
4326 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
4327 }
4328 
yield(Value * val)4329 void Nucleus::yield(Value *val)
4330 {
4331 	if(jit->coroutine.id == nullptr)
4332 	{
4333 		// First call to yield().
4334 		// Promote the function to a full coroutine.
4335 		promoteFunctionToCoroutine();
4336 		ASSERT(jit->coroutine.id != nullptr);
4337 	}
4338 
4339 	//      promise = val;
4340 	//
4341 	//      auto action = llvm.coro.suspend(none, false /* final */); // <-- RESUME POINT
4342 	//      switch(action)
4343 	//      {
4344 	//      case SuspendActionResume:
4345 	//          goto resume;
4346 	//      case SuspendActionDestroy:
4347 	//          goto destroy;
4348 	//      default: // (SuspendActionSuspend)
4349 	//          goto suspend;
4350 	//      }
4351 	//  resume:
4352 	//
4353 
4354 	RR_DEBUG_INFO_UPDATE_LOC();
4355 	Variable::materializeAll();
4356 
4357 	// Types
4358 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4359 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4360 
4361 	// Intrinsics
4362 	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4363 
4364 	// Create a block to resume execution.
4365 	auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->function);
4366 
4367 	// Store the promise (yield value)
4368 	jit->builder->CreateStore(V(val), jit->coroutine.promise);
4369 	auto action = jit->builder->CreateCall(coro_suspend, {
4370 	                                                         llvm::ConstantTokenNone::get(*jit->context),
4371 	                                                         llvm::ConstantInt::get(i1Ty, 0),  // final: false
4372 	                                                     });
4373 	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4374 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
4375 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4376 
4377 	// Continue building in the resume block.
4378 	jit->builder->SetInsertPoint(resumeBlock);
4379 }
4380 
acquireCoroutine(const char * name,const Config::Edit * cfgEdit)4381 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit *cfgEdit /* = nullptr */)
4382 {
4383 	bool isCoroutine = jit->coroutine.id != nullptr;
4384 	if(isCoroutine)
4385 	{
4386 		jit->builder->CreateBr(jit->coroutine.endBlock);
4387 	}
4388 	else
4389 	{
4390 		// A coroutine without a Yield acts as a regular function.
4391 		// The 'coroutine_begin' function returns nullptr for the coroutine
4392 		// handle.
4393 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4394 		// The 'coroutine_await' function always returns false (coroutine done).
4395 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.await));
4396 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
4397 		// The 'coroutine_destroy' function does nothing and returns void.
4398 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4399 		jit->builder->CreateRetVoid();
4400 	}
4401 
4402 #ifdef ENABLE_RR_DEBUG_INFO
4403 	if(jit->debugInfo != nullptr)
4404 	{
4405 		jit->debugInfo->Finalize();
4406 	}
4407 #endif  // ENABLE_RR_DEBUG_INFO
4408 
4409 	if(false)
4410 	{
4411 		std::error_code error;
4412 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4413 		jit->module->print(file, nullptr);
4414 	}
4415 
4416 	if(isCoroutine)
4417 	{
4418 		// Run mandatory coroutine transforms.
4419 		llvm::legacy::PassManager pm;
4420 
4421 		pm.add(llvm::createCoroEarlyLegacyPass());
4422 		pm.add(llvm::createCoroSplitLegacyPass());
4423 		pm.add(llvm::createCoroElideLegacyPass());
4424 		pm.add(llvm::createBarrierNoopPass());
4425 		pm.add(llvm::createCoroCleanupLegacyPass());
4426 
4427 		pm.run(*jit->module);
4428 	}
4429 
4430 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4431 	{
4432 		llvm::legacy::PassManager pm;
4433 		pm.add(llvm::createVerifierPass());
4434 		pm.run(*jit->module);
4435 	}
4436 #endif  // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4437 
4438 	Config cfg = jit->config;
4439 	if(cfgEdit)
4440 	{
4441 		cfg = cfgEdit->apply(jit->config);
4442 	}
4443 	jit->optimize(cfg);
4444 
4445 	if(false)
4446 	{
4447 		std::error_code error;
4448 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4449 		jit->module->print(file, nullptr);
4450 	}
4451 
4452 	llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4453 	funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4454 	funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4455 	funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4456 
4457 	auto routine = jit->acquireRoutine(name, funcs, Nucleus::CoroutineEntryCount, cfg);
4458 
4459 	delete jit;
4460 	jit = nullptr;
4461 
4462 	return routine;
4463 }
4464 
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4465 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4466 {
4467 	return func();
4468 }
4469 
4470 }  // namespace rr
4471