• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "Debug.hpp"
16 #include "EmulatedIntrinsics.hpp"
17 #include "OptimalIntrinsics.hpp"
18 #include "Print.hpp"
19 #include "Reactor.hpp"
20 #include "ReactorDebugInfo.hpp"
21 
22 #include "ExecutableMemory.hpp"
23 #include "Optimizer.hpp"
24 
25 #include "src/IceCfg.h"
26 #include "src/IceCfgNode.h"
27 #include "src/IceELFObjectWriter.h"
28 #include "src/IceELFStreamer.h"
29 #include "src/IceGlobalContext.h"
30 #include "src/IceGlobalInits.h"
31 #include "src/IceTypes.h"
32 
33 #include "llvm/Support/Compiler.h"
34 #include "llvm/Support/FileSystem.h"
35 #include "llvm/Support/ManagedStatic.h"
36 #include "llvm/Support/raw_os_ostream.h"
37 
38 #include "marl/event.h"
39 
40 #if __has_feature(memory_sanitizer)
41 #	include <sanitizer/msan_interface.h>
42 #endif
43 
44 #if defined(_WIN32)
45 #	ifndef WIN32_LEAN_AND_MEAN
46 #		define WIN32_LEAN_AND_MEAN
47 #	endif  // !WIN32_LEAN_AND_MEAN
48 #	ifndef NOMINMAX
49 #		define NOMINMAX
50 #	endif  // !NOMINMAX
51 #	include <Windows.h>
52 #endif
53 
54 #include <array>
55 #include <iostream>
56 #include <limits>
57 #include <mutex>
58 
59 // Subzero utility functions
60 // These functions only accept and return Subzero (Ice) types, and do not access any globals.
61 namespace {
62 namespace sz {
63 
createFunction(Ice::GlobalContext * context,Ice::Type returnType,const std::vector<Ice::Type> & paramTypes)64 Ice::Cfg *createFunction(Ice::GlobalContext *context, Ice::Type returnType, const std::vector<Ice::Type> &paramTypes)
65 {
66 	uint32_t sequenceNumber = 0;
67 	auto *function = Ice::Cfg::create(context, sequenceNumber).release();
68 
69 	function->setStackSizeLimit(512 * 1024);  // 512 KiB
70 
71 	Ice::CfgLocalAllocatorScope allocScope{ function };
72 
73 	for(auto type : paramTypes)
74 	{
75 		Ice::Variable *arg = function->makeVariable(type);
76 		function->addArg(arg);
77 	}
78 
79 	Ice::CfgNode *node = function->makeNode();
80 	function->setEntryNode(node);
81 
82 	return function;
83 }
84 
getPointerType(Ice::Type elementType)85 Ice::Type getPointerType(Ice::Type elementType)
86 {
87 	if(sizeof(void *) == 8)
88 	{
89 		return Ice::IceType_i64;
90 	}
91 	else
92 	{
93 		return Ice::IceType_i32;
94 	}
95 }
96 
allocateStackVariable(Ice::Cfg * function,Ice::Type type,int arraySize=0)97 Ice::Variable *allocateStackVariable(Ice::Cfg *function, Ice::Type type, int arraySize = 0)
98 {
99 	int typeSize = Ice::typeWidthInBytes(type);
100 	int totalSize = typeSize * (arraySize ? arraySize : 1);
101 
102 	auto bytes = Ice::ConstantInteger32::create(function->getContext(), Ice::IceType_i32, totalSize);
103 	auto address = function->makeVariable(getPointerType(type));
104 	auto alloca = Ice::InstAlloca::create(function, address, bytes, typeSize);  // SRoA depends on the alignment to match the type size.
105 	function->getEntryNode()->getInsts().push_front(alloca);
106 
107 	return address;
108 }
109 
getConstantPointer(Ice::GlobalContext * context,void const * ptr)110 Ice::Constant *getConstantPointer(Ice::GlobalContext *context, void const *ptr)
111 {
112 	if(sizeof(void *) == 8)
113 	{
114 		return context->getConstantInt64(reinterpret_cast<intptr_t>(ptr));
115 	}
116 	else
117 	{
118 		return context->getConstantInt32(reinterpret_cast<intptr_t>(ptr));
119 	}
120 }
121 
122 // TODO(amaiorano): remove this prototype once these are moved to separate header/cpp
123 Ice::Variable *createTruncate(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Operand *from, Ice::Type toType);
124 
125 // Wrapper for calls on C functions with Ice types
Call(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Type retTy,Ice::Operand * callTarget,const std::vector<Ice::Operand * > & iceArgs,bool isVariadic)126 Ice::Variable *Call(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Type retTy, Ice::Operand *callTarget, const std::vector<Ice::Operand *> &iceArgs, bool isVariadic)
127 {
128 	Ice::Variable *ret = nullptr;
129 
130 	// Subzero doesn't support boolean return values. Replace with an i32 temporarily,
131 	// then truncate result to bool.
132 	// TODO(b/151158858): Add support to Subzero's InstCall for bool-returning functions
133 	const bool returningBool = (retTy == Ice::IceType_i1);
134 	if(returningBool)
135 	{
136 		ret = function->makeVariable(Ice::IceType_i32);
137 	}
138 	else if(retTy != Ice::IceType_void)
139 	{
140 		ret = function->makeVariable(retTy);
141 	}
142 
143 	auto call = Ice::InstCall::create(function, iceArgs.size(), ret, callTarget, false, false, isVariadic);
144 	for(auto arg : iceArgs)
145 	{
146 		call->addArg(arg);
147 	}
148 
149 	basicBlock->appendInst(call);
150 
151 	if(returningBool)
152 	{
153 		// Truncate result to bool so that if any (lsb) bits were set, result will be true
154 		ret = createTruncate(function, basicBlock, ret, Ice::IceType_i1);
155 	}
156 
157 	return ret;
158 }
159 
Call(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Type retTy,void const * fptr,const std::vector<Ice::Operand * > & iceArgs,bool isVariadic)160 Ice::Variable *Call(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Type retTy, void const *fptr, const std::vector<Ice::Operand *> &iceArgs, bool isVariadic)
161 {
162 	Ice::Operand *callTarget = getConstantPointer(function->getContext(), fptr);
163 	return Call(function, basicBlock, retTy, callTarget, iceArgs, isVariadic);
164 }
165 
166 // Wrapper for calls on C functions with Ice types
167 template<typename Return, typename... CArgs, typename... RArgs>
Call(Ice::Cfg * function,Ice::CfgNode * basicBlock,Return (fptr)(CArgs...),RArgs &&...args)168 Ice::Variable *Call(Ice::Cfg *function, Ice::CfgNode *basicBlock, Return(fptr)(CArgs...), RArgs &&... args)
169 {
170 	static_assert(sizeof...(CArgs) == sizeof...(RArgs), "Expected number of args don't match");
171 
172 	Ice::Type retTy = T(rr::CToReactorT<Return>::type());
173 	std::vector<Ice::Operand *> iceArgs{ std::forward<RArgs>(args)... };
174 	return Call(function, basicBlock, retTy, reinterpret_cast<void const *>(fptr), iceArgs, false);
175 }
176 
createTruncate(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Operand * from,Ice::Type toType)177 Ice::Variable *createTruncate(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Operand *from, Ice::Type toType)
178 {
179 	Ice::Variable *to = function->makeVariable(toType);
180 	Ice::InstCast *cast = Ice::InstCast::create(function, Ice::InstCast::Trunc, to, from);
181 	basicBlock->appendInst(cast);
182 	return to;
183 }
184 
createLoad(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Operand * ptr,Ice::Type type,unsigned int align)185 Ice::Variable *createLoad(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Operand *ptr, Ice::Type type, unsigned int align)
186 {
187 	Ice::Variable *result = function->makeVariable(type);
188 	auto load = Ice::InstLoad::create(function, result, ptr, align);
189 	basicBlock->appendInst(load);
190 
191 	return result;
192 }
193 
194 }  // namespace sz
195 }  // namespace
196 
197 namespace rr {
198 class ELFMemoryStreamer;
199 class CoroutineGenerator;
200 }  // namespace rr
201 
202 namespace {
203 
204 // Used to automatically invoke llvm_shutdown() when driver is unloaded
205 llvm::llvm_shutdown_obj llvmShutdownObj;
206 
207 // Default configuration settings. Must be accessed under mutex lock.
208 std::mutex defaultConfigLock;
defaultConfig()209 rr::Config &defaultConfig()
210 {
211 	// This uses a static in a function to avoid the cost of a global static
212 	// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
213 	static rr::Config config = rr::Config::Edit()
214 	                               .apply({});
215 	return config;
216 }
217 
218 Ice::GlobalContext *context = nullptr;
219 Ice::Cfg *function = nullptr;
220 Ice::CfgNode *entryBlock = nullptr;
221 Ice::CfgNode *basicBlockTop = nullptr;
222 Ice::CfgNode *basicBlock = nullptr;
223 Ice::CfgLocalAllocatorScope *allocator = nullptr;
224 rr::ELFMemoryStreamer *routine = nullptr;
225 
226 std::mutex codegenMutex;
227 
228 Ice::ELFFileStreamer *elfFile = nullptr;
229 Ice::Fdstream *out = nullptr;
230 
231 // Coroutine globals
232 rr::Type *coroYieldType = nullptr;
233 std::shared_ptr<rr::CoroutineGenerator> coroGen;
getOrCreateScheduler()234 marl::Scheduler &getOrCreateScheduler()
235 {
236 	static auto scheduler = [] {
237 		marl::Scheduler::Config cfg;
238 		cfg.setWorkerThreadCount(8);
239 		return std::make_unique<marl::Scheduler>(cfg);
240 	}();
241 
242 	return *scheduler;
243 }
244 
245 rr::Nucleus::OptimizerCallback *optimizerCallback = nullptr;
246 
247 }  // Anonymous namespace
248 
249 namespace {
250 
251 #if !defined(__i386__) && defined(_M_IX86)
252 #	define __i386__ 1
253 #endif
254 
255 #if !defined(__x86_64__) && (defined(_M_AMD64) || defined(_M_X64))
256 #	define __x86_64__ 1
257 #endif
258 
toIce(rr::Optimization::Level level)259 Ice::OptLevel toIce(rr::Optimization::Level level)
260 {
261 	switch(level)
262 	{
263 	// Note that Opt_0 and Opt_1 are not implemented by Subzero
264 	case rr::Optimization::Level::None: return Ice::Opt_m1;
265 	case rr::Optimization::Level::Less: return Ice::Opt_m1;
266 	case rr::Optimization::Level::Default: return Ice::Opt_2;
267 	case rr::Optimization::Level::Aggressive: return Ice::Opt_2;
268 	default: UNREACHABLE("Unknown Optimization Level %d", int(level));
269 	}
270 	return Ice::Opt_2;
271 }
272 
stdToIceMemoryOrder(std::memory_order memoryOrder)273 Ice::Intrinsics::MemoryOrder stdToIceMemoryOrder(std::memory_order memoryOrder)
274 {
275 	switch(memoryOrder)
276 	{
277 	case std::memory_order_relaxed: return Ice::Intrinsics::MemoryOrderRelaxed;
278 	case std::memory_order_consume: return Ice::Intrinsics::MemoryOrderConsume;
279 	case std::memory_order_acquire: return Ice::Intrinsics::MemoryOrderAcquire;
280 	case std::memory_order_release: return Ice::Intrinsics::MemoryOrderRelease;
281 	case std::memory_order_acq_rel: return Ice::Intrinsics::MemoryOrderAcquireRelease;
282 	case std::memory_order_seq_cst: return Ice::Intrinsics::MemoryOrderSequentiallyConsistent;
283 	}
284 	return Ice::Intrinsics::MemoryOrderInvalid;
285 }
286 
// Host CPU feature queries.
//  - ARM is decided at compile time from the target architecture macros.
//  - SSE4_1 is read at start-up from the CPUID instruction on x86 targets and is false
//    on every other target.
class CPUID
{
public:
	const static bool ARM;
	const static bool SSE4_1;

private:
	// Runs the CPUID instruction for leaf |info| and stores EAX, EBX, ECX and EDX in
	// |registers|, in that order. On non-x86 targets all four entries are set to zero.
	static void cpuid(int registers[4], int info)
	{
#if defined(__i386__) || defined(__x86_64__)
#	if defined(_WIN32)
		__cpuid(registers, info);
#	else
		__asm volatile("cpuid"
		               : "=a"(registers[0]), "=b"(registers[1]), "=c"(registers[2]), "=d"(registers[3])
		               : "a"(info));
#	endif
#else
		for(int i = 0; i < 4; ++i)
		{
			registers[i] = 0;
		}
#endif
	}

	// True when compiling for 32-bit or 64-bit ARM; false for x86 and MIPS.
	// Any other architecture is rejected at compile time.
	constexpr static bool detectARM()
	{
#if defined(__arm__) || defined(__aarch64__)
		return true;
#elif defined(__i386__) || defined(__x86_64__)
		return false;
#elif defined(__mips__)
		return false;
#else
#	error "Unknown architecture"
#endif
	}

	// Tests bit 19 of ECX returned by CPUID leaf 1.
	static bool detectSSE4_1()
	{
#if defined(__i386__) || defined(__x86_64__)
		constexpr int sse4_1Bit = 0x00080000;

		int registers[4];
		cpuid(registers, 1);

		const int ecx = registers[2];
		return (ecx & sse4_1Bit) != 0;
#else
		return false;
#endif
	}
};

constexpr bool CPUID::ARM = CPUID::detectARM();
const bool CPUID::SSE4_1 = CPUID::detectSSE4_1();
339 constexpr bool emulateIntrinsics = false;
340 constexpr bool emulateMismatchedBitCast = CPUID::ARM;
341 
342 constexpr bool subzeroDumpEnabled = false;
343 constexpr bool subzeroEmitTextAsm = false;
344 
345 #if !ALLOW_DUMP
346 static_assert(!subzeroDumpEnabled, "Compile Subzero with ALLOW_DUMP=1 for subzeroDumpEnabled");
347 static_assert(!subzeroEmitTextAsm, "Compile Subzero with ALLOW_DUMP=1 for subzeroEmitTextAsm");
348 #endif
349 
350 }  // anonymous namespace
351 
352 namespace rr {
353 
// Returns the name identifying this Reactor backend.
std::string BackendName()
{
	return std::string("Subzero");
}
358 
359 const Capabilities Caps = {
360 	true,  // CoroutinesSupported
361 };
362 
363 enum EmulatedType
364 {
365 	EmulatedShift = 16,
366 	EmulatedV2 = 2 << EmulatedShift,
367 	EmulatedV4 = 4 << EmulatedShift,
368 	EmulatedV8 = 8 << EmulatedShift,
369 	EmulatedBits = EmulatedV2 | EmulatedV4 | EmulatedV8,
370 
371 	Type_v2i32 = Ice::IceType_v4i32 | EmulatedV2,
372 	Type_v4i16 = Ice::IceType_v8i16 | EmulatedV4,
373 	Type_v2i16 = Ice::IceType_v8i16 | EmulatedV2,
374 	Type_v8i8 = Ice::IceType_v16i8 | EmulatedV8,
375 	Type_v4i8 = Ice::IceType_v16i8 | EmulatedV4,
376 	Type_v2f32 = Ice::IceType_v4f32 | EmulatedV2,
377 };
378 
379 class Value : public Ice::Operand
380 {};
381 class SwitchCases : public Ice::InstSwitch
382 {};
383 class BasicBlock : public Ice::CfgNode
384 {};
385 
T(Type * t)386 Ice::Type T(Type *t)
387 {
388 	static_assert(static_cast<unsigned int>(Ice::IceType_NUM) < static_cast<unsigned int>(EmulatedBits), "Ice::Type overlaps with our emulated types!");
389 	return (Ice::Type)(reinterpret_cast<std::intptr_t>(t) & ~EmulatedBits);
390 }
391 
T(Ice::Type t)392 Type *T(Ice::Type t)
393 {
394 	return reinterpret_cast<Type *>(t);
395 }
396 
T(EmulatedType t)397 Type *T(EmulatedType t)
398 {
399 	return reinterpret_cast<Type *>(t);
400 }
401 
T(const std::vector<Type * > & types)402 std::vector<Ice::Type> T(const std::vector<Type *> &types)
403 {
404 	std::vector<Ice::Type> result;
405 	result.reserve(types.size());
406 	for(auto &t : types)
407 	{
408 		result.push_back(T(t));
409 	}
410 	return result;
411 }
412 
V(Ice::Operand * v)413 Value *V(Ice::Operand *v)
414 {
415 	return reinterpret_cast<Value *>(v);
416 }
417 
V(Value * v)418 Ice::Operand *V(Value *v)
419 {
420 	return reinterpret_cast<Ice::Operand *>(v);
421 }
422 
V(const std::vector<Value * > & values)423 std::vector<Ice::Operand *> V(const std::vector<Value *> &values)
424 {
425 	std::vector<Ice::Operand *> result;
426 	result.reserve(values.size());
427 	for(auto &v : values)
428 	{
429 		result.push_back(V(v));
430 	}
431 	return result;
432 }
433 
B(Ice::CfgNode * b)434 BasicBlock *B(Ice::CfgNode *b)
435 {
436 	return reinterpret_cast<BasicBlock *>(b);
437 }
438 
typeSize(Type * type)439 static size_t typeSize(Type *type)
440 {
441 	if(reinterpret_cast<std::intptr_t>(type) & EmulatedBits)
442 	{
443 		switch(reinterpret_cast<std::intptr_t>(type))
444 		{
445 		case Type_v2i32: return 8;
446 		case Type_v4i16: return 8;
447 		case Type_v2i16: return 4;
448 		case Type_v8i8: return 8;
449 		case Type_v4i8: return 4;
450 		case Type_v2f32: return 8;
451 		default: ASSERT(false);
452 		}
453 	}
454 
455 	return Ice::typeWidthInBytes(T(type));
456 }
457 
finalizeFunction()458 static void finalizeFunction()
459 {
460 	// Create a return if none was added
461 	if(::basicBlock->getInsts().empty() || ::basicBlock->getInsts().back().getKind() != Ice::Inst::Ret)
462 	{
463 		Nucleus::createRetVoid();
464 	}
465 
466 	// Connect the entry block to the top of the initial basic block
467 	auto br = Ice::InstBr::create(::function, ::basicBlockTop);
468 	::entryBlock->appendInst(br);
469 }
470 
471 using ElfHeader = std::conditional<sizeof(void *) == 8, Elf64_Ehdr, Elf32_Ehdr>::type;
472 using SectionHeader = std::conditional<sizeof(void *) == 8, Elf64_Shdr, Elf32_Shdr>::type;
473 
sectionHeader(const ElfHeader * elfHeader)474 inline const SectionHeader *sectionHeader(const ElfHeader *elfHeader)
475 {
476 	return reinterpret_cast<const SectionHeader *>((intptr_t)elfHeader + elfHeader->e_shoff);
477 }
478 
elfSection(const ElfHeader * elfHeader,int index)479 inline const SectionHeader *elfSection(const ElfHeader *elfHeader, int index)
480 {
481 	return &sectionHeader(elfHeader)[index];
482 }
483 
relocateSymbol(const ElfHeader * elfHeader,const Elf32_Rel & relocation,const SectionHeader & relocationTable)484 static void *relocateSymbol(const ElfHeader *elfHeader, const Elf32_Rel &relocation, const SectionHeader &relocationTable)
485 {
486 	const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
487 
488 	uint32_t index = relocation.getSymbol();
489 	int table = relocationTable.sh_link;
490 	void *symbolValue = nullptr;
491 
492 	if(index != SHN_UNDEF)
493 	{
494 		if(table == SHN_UNDEF) return nullptr;
495 		const SectionHeader *symbolTable = elfSection(elfHeader, table);
496 
497 		uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
498 		if(index >= symtab_entries)
499 		{
500 			ASSERT(index < symtab_entries && "Symbol Index out of range");
501 			return nullptr;
502 		}
503 
504 		intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
505 		Elf32_Sym &symbol = ((Elf32_Sym *)symbolAddress)[index];
506 		uint16_t section = symbol.st_shndx;
507 
508 		if(section != SHN_UNDEF && section < SHN_LORESERVE)
509 		{
510 			const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
511 			symbolValue = reinterpret_cast<void *>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
512 		}
513 		else
514 		{
515 			return nullptr;
516 		}
517 	}
518 
519 	intptr_t address = (intptr_t)elfHeader + target->sh_offset;
520 	unaligned_ptr<int32_t> patchSite = (int32_t *)(address + relocation.r_offset);
521 
522 	if(CPUID::ARM)
523 	{
524 		switch(relocation.getType())
525 		{
526 		case R_ARM_NONE:
527 			// No relocation
528 			break;
529 		case R_ARM_MOVW_ABS_NC:
530 			{
531 				uint32_t thumb = 0;  // Calls to Thumb code not supported.
532 				uint32_t lo = (uint32_t)(intptr_t)symbolValue | thumb;
533 				*patchSite = (*patchSite & 0xFFF0F000) | ((lo & 0xF000) << 4) | (lo & 0x0FFF);
534 			}
535 			break;
536 		case R_ARM_MOVT_ABS:
537 			{
538 				uint32_t hi = (uint32_t)(intptr_t)(symbolValue) >> 16;
539 				*patchSite = (*patchSite & 0xFFF0F000) | ((hi & 0xF000) << 4) | (hi & 0x0FFF);
540 			}
541 			break;
542 		default:
543 			ASSERT(false && "Unsupported relocation type");
544 			return nullptr;
545 		}
546 	}
547 	else
548 	{
549 		switch(relocation.getType())
550 		{
551 		case R_386_NONE:
552 			// No relocation
553 			break;
554 		case R_386_32:
555 			*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite);
556 			break;
557 		case R_386_PC32:
558 			*patchSite = (int32_t)((intptr_t)symbolValue + *patchSite - (intptr_t)patchSite);
559 			break;
560 		default:
561 			ASSERT(false && "Unsupported relocation type");
562 			return nullptr;
563 		}
564 	}
565 
566 	return symbolValue;
567 }
568 
relocateSymbol(const ElfHeader * elfHeader,const Elf64_Rela & relocation,const SectionHeader & relocationTable)569 static void *relocateSymbol(const ElfHeader *elfHeader, const Elf64_Rela &relocation, const SectionHeader &relocationTable)
570 {
571 	const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
572 
573 	uint32_t index = relocation.getSymbol();
574 	int table = relocationTable.sh_link;
575 	void *symbolValue = nullptr;
576 
577 	if(index != SHN_UNDEF)
578 	{
579 		if(table == SHN_UNDEF) return nullptr;
580 		const SectionHeader *symbolTable = elfSection(elfHeader, table);
581 
582 		uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
583 		if(index >= symtab_entries)
584 		{
585 			ASSERT(index < symtab_entries && "Symbol Index out of range");
586 			return nullptr;
587 		}
588 
589 		intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
590 		Elf64_Sym &symbol = ((Elf64_Sym *)symbolAddress)[index];
591 		uint16_t section = symbol.st_shndx;
592 
593 		if(section != SHN_UNDEF && section < SHN_LORESERVE)
594 		{
595 			const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
596 			symbolValue = reinterpret_cast<void *>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
597 		}
598 		else
599 		{
600 			return nullptr;
601 		}
602 	}
603 
604 	intptr_t address = (intptr_t)elfHeader + target->sh_offset;
605 	unaligned_ptr<int32_t> patchSite32 = (int32_t *)(address + relocation.r_offset);
606 	unaligned_ptr<int64_t> patchSite64 = (int64_t *)(address + relocation.r_offset);
607 
608 	switch(relocation.getType())
609 	{
610 	case R_X86_64_NONE:
611 		// No relocation
612 		break;
613 	case R_X86_64_64:
614 		*patchSite64 = (int64_t)((intptr_t)symbolValue + *patchSite64 + relocation.r_addend);
615 		break;
616 	case R_X86_64_PC32:
617 		*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 - (intptr_t)patchSite32 + relocation.r_addend);
618 		break;
619 	case R_X86_64_32S:
620 		*patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 + relocation.r_addend);
621 		break;
622 	default:
623 		ASSERT(false && "Unsupported relocation type");
624 		return nullptr;
625 	}
626 
627 	return symbolValue;
628 }
629 
// Location of one generated function inside a loaded ELF image.
struct EntryPoint
{
	// Address of the first byte of the function's code; nullptr until it has been located.
	const void *entry = nullptr;
	// Size in bytes of the section containing the function's code.
	size_t codeSize = 0;
};
635 
loadImage(uint8_t * const elfImage,const std::vector<const char * > & functionNames)636 std::vector<EntryPoint> loadImage(uint8_t *const elfImage, const std::vector<const char *> &functionNames)
637 {
638 	ASSERT(functionNames.size() > 0);
639 	std::vector<EntryPoint> entryPoints(functionNames.size());
640 
641 	ElfHeader *elfHeader = (ElfHeader *)elfImage;
642 
643 	// TODO: assert?
644 	if(!elfHeader->checkMagic())
645 	{
646 		return {};
647 	}
648 
649 	// Expect ELF bitness to match platform
650 	ASSERT(sizeof(void *) == 8 ? elfHeader->getFileClass() == ELFCLASS64 : elfHeader->getFileClass() == ELFCLASS32);
651 #if defined(__i386__)
652 	ASSERT(sizeof(void *) == 4 && elfHeader->e_machine == EM_386);
653 #elif defined(__x86_64__)
654 	ASSERT(sizeof(void *) == 8 && elfHeader->e_machine == EM_X86_64);
655 #elif defined(__arm__)
656 	ASSERT(sizeof(void *) == 4 && elfHeader->e_machine == EM_ARM);
657 #elif defined(__aarch64__)
658 	ASSERT(sizeof(void *) == 8 && elfHeader->e_machine == EM_AARCH64);
659 #elif defined(__mips__)
660 	ASSERT(sizeof(void *) == 4 && elfHeader->e_machine == EM_MIPS);
661 #else
662 #	error "Unsupported platform"
663 #endif
664 
665 	SectionHeader *sectionHeader = (SectionHeader *)(elfImage + elfHeader->e_shoff);
666 
667 	for(int i = 0; i < elfHeader->e_shnum; i++)
668 	{
669 		if(sectionHeader[i].sh_type == SHT_PROGBITS)
670 		{
671 			if(sectionHeader[i].sh_flags & SHF_EXECINSTR)
672 			{
673 				auto findSectionNameEntryIndex = [&]() -> size_t {
674 					auto sectionNameOffset = sectionHeader[elfHeader->e_shstrndx].sh_offset + sectionHeader[i].sh_name;
675 					const char *sectionName = reinterpret_cast<const char *>(elfImage + sectionNameOffset);
676 
677 					for(size_t j = 0; j < functionNames.size(); ++j)
678 					{
679 						if(strstr(sectionName, functionNames[j]) != nullptr)
680 						{
681 							return j;
682 						}
683 					}
684 
685 					UNREACHABLE("Failed to find executable section that matches input function names");
686 					return static_cast<size_t>(-1);
687 				};
688 
689 				size_t index = findSectionNameEntryIndex();
690 				entryPoints[index].entry = elfImage + sectionHeader[i].sh_offset;
691 				entryPoints[index].codeSize = sectionHeader[i].sh_size;
692 			}
693 		}
694 		else if(sectionHeader[i].sh_type == SHT_REL)
695 		{
696 			ASSERT(sizeof(void *) == 4 && "UNIMPLEMENTED");  // Only expected/implemented for 32-bit code
697 
698 			for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
699 			{
700 				const Elf32_Rel &relocation = ((const Elf32_Rel *)(elfImage + sectionHeader[i].sh_offset))[index];
701 				relocateSymbol(elfHeader, relocation, sectionHeader[i]);
702 			}
703 		}
704 		else if(sectionHeader[i].sh_type == SHT_RELA)
705 		{
706 			ASSERT(sizeof(void *) == 8 && "UNIMPLEMENTED");  // Only expected/implemented for 64-bit code
707 
708 			for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
709 			{
710 				const Elf64_Rela &relocation = ((const Elf64_Rela *)(elfImage + sectionHeader[i].sh_offset))[index];
711 				relocateSymbol(elfHeader, relocation, sectionHeader[i]);
712 			}
713 		}
714 	}
715 
716 	return entryPoints;
717 }
718 
719 template<typename T>
720 struct ExecutableAllocator
721 {
ExecutableAllocatorrr::ExecutableAllocator722 	ExecutableAllocator() {}
723 	template<class U>
ExecutableAllocatorrr::ExecutableAllocator724 	ExecutableAllocator(const ExecutableAllocator<U> &other)
725 	{}
726 
727 	using value_type = T;
728 	using size_type = std::size_t;
729 
allocaterr::ExecutableAllocator730 	T *allocate(size_type n)
731 	{
732 		return (T *)allocateMemoryPages(
733 		    sizeof(T) * n, PERMISSION_READ | PERMISSION_WRITE, true);
734 	}
735 
deallocaterr::ExecutableAllocator736 	void deallocate(T *p, size_type n)
737 	{
738 		deallocateMemoryPages(p, sizeof(T) * n);
739 	}
740 };
741 
742 class ELFMemoryStreamer : public Ice::ELFStreamer, public Routine
743 {
744 	ELFMemoryStreamer(const ELFMemoryStreamer &) = delete;
745 	ELFMemoryStreamer &operator=(const ELFMemoryStreamer &) = delete;
746 
747 public:
ELFMemoryStreamer()748 	ELFMemoryStreamer()
749 	    : Routine()
750 	{
751 		position = 0;
752 		buffer.reserve(0x1000);
753 	}
754 
~ELFMemoryStreamer()755 	~ELFMemoryStreamer() override
756 	{
757 	}
758 
write8(uint8_t Value)759 	void write8(uint8_t Value) override
760 	{
761 		if(position == (uint64_t)buffer.size())
762 		{
763 			buffer.push_back(Value);
764 			position++;
765 		}
766 		else if(position < (uint64_t)buffer.size())
767 		{
768 			buffer[position] = Value;
769 			position++;
770 		}
771 		else
772 			ASSERT(false && "UNIMPLEMENTED");
773 	}
774 
writeBytes(llvm::StringRef Bytes)775 	void writeBytes(llvm::StringRef Bytes) override
776 	{
777 		std::size_t oldSize = buffer.size();
778 		buffer.resize(oldSize + Bytes.size());
779 		memcpy(&buffer[oldSize], Bytes.begin(), Bytes.size());
780 		position += Bytes.size();
781 	}
782 
tell() const783 	uint64_t tell() const override { return position; }
784 
seek(uint64_t Off)785 	void seek(uint64_t Off) override { position = Off; }
786 
loadImageAndGetEntryPoints(const std::vector<const char * > & functionNames)787 	std::vector<EntryPoint> loadImageAndGetEntryPoints(const std::vector<const char *> &functionNames)
788 	{
789 		auto entryPoints = loadImage(&buffer[0], functionNames);
790 
791 #if defined(_WIN32)
792 		FlushInstructionCache(GetCurrentProcess(), NULL, 0);
793 #else
794 		for(auto &entryPoint : entryPoints)
795 		{
796 			__builtin___clear_cache((char *)entryPoint.entry, (char *)entryPoint.entry + entryPoint.codeSize);
797 		}
798 #endif
799 
800 		return entryPoints;
801 	}
802 
finalize()803 	void finalize()
804 	{
805 		position = std::numeric_limits<std::size_t>::max();  // Can't stream more data after this
806 
807 		protectMemoryPages(&buffer[0], buffer.size(), PERMISSION_READ | PERMISSION_EXECUTE);
808 	}
809 
setEntry(int index,const void * func)810 	void setEntry(int index, const void *func)
811 	{
812 		ASSERT(func);
813 		funcs[index] = func;
814 	}
815 
getEntry(int index) const816 	const void *getEntry(int index) const override
817 	{
818 		ASSERT(funcs[index]);
819 		return funcs[index];
820 	}
821 
addConstantData(const void * data,size_t size,size_t alignment=1)822 	const void *addConstantData(const void *data, size_t size, size_t alignment = 1)
823 	{
824 		// Check if we already have a suitable constant.
825 		for(const auto &c : constantsPool)
826 		{
827 			void *ptr = c.data.get();
828 			size_t space = c.space;
829 
830 			void *alignedPtr = std::align(alignment, size, ptr, space);
831 
832 			if(space < size)
833 			{
834 				continue;
835 			}
836 
837 			if(memcmp(data, alignedPtr, size) == 0)
838 			{
839 				return alignedPtr;
840 			}
841 		}
842 
843 		// TODO(b/148086935): Replace with a buffer allocator.
844 		size_t space = size + alignment;
845 		auto buf = std::unique_ptr<uint8_t[]>(new uint8_t[space]);
846 		void *ptr = buf.get();
847 		void *alignedPtr = std::align(alignment, size, ptr, space);
848 		ASSERT(alignedPtr);
849 		memcpy(alignedPtr, data, size);
850 		constantsPool.emplace_back(std::move(buf), space);
851 
852 		return alignedPtr;
853 	}
854 
855 private:
856 	struct Constant
857 	{
Constantrr::ELFMemoryStreamer::Constant858 		Constant(std::unique_ptr<uint8_t[]> data, size_t space)
859 		    : data(std::move(data))
860 		    , space(space)
861 		{}
862 
863 		std::unique_ptr<uint8_t[]> data;
864 		size_t space;
865 	};
866 
867 	std::array<const void *, Nucleus::CoroutineEntryCount> funcs = {};
868 	std::vector<uint8_t, ExecutableAllocator<uint8_t>> buffer;
869 	std::size_t position;
870 	std::vector<Constant> constantsPool;
871 };
872 
#ifdef ENABLE_RR_PRINT
// Emits, into the current basic block, a variadic call to rr::DebugPrintf with |vals| as
// its arguments. The i32 result of the call is discarded.
void VPrintf(const std::vector<Value *> &vals)
{
	const void *const printfTarget = reinterpret_cast<const void *>(rr::DebugPrintf);
	const std::vector<Ice::Operand *> operands = V(vals);

	sz::Call(::function, ::basicBlock, Ice::IceType_i32, printfTarget, operands, true);
}
#endif  // ENABLE_RR_PRINT
879 
Nucleus()880 Nucleus::Nucleus()
881 {
882 	::codegenMutex.lock();  // SubzeroReactor is currently not thread safe
883 
884 	Ice::ClFlags &Flags = Ice::ClFlags::Flags;
885 	Ice::ClFlags::getParsedClFlags(Flags);
886 
887 #if defined(__arm__)
888 	Flags.setTargetArch(Ice::Target_ARM32);
889 	Flags.setTargetInstructionSet(Ice::ARM32InstructionSet_HWDivArm);
890 #elif defined(__mips__)
891 	Flags.setTargetArch(Ice::Target_MIPS32);
892 	Flags.setTargetInstructionSet(Ice::BaseInstructionSet);
893 #else  // x86
894 	Flags.setTargetArch(sizeof(void *) == 8 ? Ice::Target_X8664 : Ice::Target_X8632);
895 	Flags.setTargetInstructionSet(CPUID::SSE4_1 ? Ice::X86InstructionSet_SSE4_1 : Ice::X86InstructionSet_SSE2);
896 #endif
897 	Flags.setOutFileType(Ice::FT_Elf);
898 	Flags.setOptLevel(toIce(getDefaultConfig().getOptimization().getLevel()));
899 	Flags.setVerbose(subzeroDumpEnabled ? Ice::IceV_Most : Ice::IceV_None);
900 	Flags.setDisableHybridAssembly(true);
901 
902 	// Emit functions into separate sections in the ELF so we can find them by name
903 	Flags.setFunctionSections(true);
904 
905 	static llvm::raw_os_ostream cout(std::cout);
906 	static llvm::raw_os_ostream cerr(std::cerr);
907 
908 	if(subzeroEmitTextAsm)
909 	{
910 		// Decorate text asm with liveness info
911 		Flags.setDecorateAsm(true);
912 	}
913 
914 	if(false)  // Write out to a file
915 	{
916 		std::error_code errorCode;
917 		::out = new Ice::Fdstream("out.o", errorCode, llvm::sys::fs::F_None);
918 		::elfFile = new Ice::ELFFileStreamer(*out);
919 		::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfFile);
920 	}
921 	else
922 	{
923 		ELFMemoryStreamer *elfMemory = new ELFMemoryStreamer();
924 		::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfMemory);
925 		::routine = elfMemory;
926 	}
927 
928 #if !__has_feature(memory_sanitizer)
929 	// thread_local variables in shared libraries are initialized at load-time,
930 	// but this is not observed by MemorySanitizer if the loader itself was not
931 	// instrumented, leading to false-positive uninitialized variable errors.
932 	ASSERT(Variable::unmaterializedVariables == nullptr);
933 #endif
934 	Variable::unmaterializedVariables = new Variable::UnmaterializedVariables{};
935 }
936 
~Nucleus()937 Nucleus::~Nucleus()
938 {
939 	delete Variable::unmaterializedVariables;
940 	Variable::unmaterializedVariables = nullptr;
941 
942 	delete ::routine;
943 	::routine = nullptr;
944 
945 	delete ::allocator;
946 	::allocator = nullptr;
947 
948 	delete ::function;
949 	::function = nullptr;
950 
951 	delete ::context;
952 	::context = nullptr;
953 
954 	delete ::elfFile;
955 	::elfFile = nullptr;
956 
957 	delete ::out;
958 	::out = nullptr;
959 
960 	::entryBlock = nullptr;
961 	::basicBlock = nullptr;
962 	::basicBlockTop = nullptr;
963 
964 	::codegenMutex.unlock();
965 }
966 
setDefaultConfig(const Config & cfg)967 void Nucleus::setDefaultConfig(const Config &cfg)
968 {
969 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
970 	::defaultConfig() = cfg;
971 }
972 
adjustDefaultConfig(const Config::Edit & cfgEdit)973 void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
974 {
975 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
976 	auto &config = ::defaultConfig();
977 	config = cfgEdit.apply(config);
978 }
979 
getDefaultConfig()980 Config Nucleus::getDefaultConfig()
981 {
982 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
983 	return ::defaultConfig();
984 }
985 
986 // This function lowers and produces executable binary code in memory for the input functions,
987 // and returns a Routine with the entry points to these functions.
988 template<size_t Count>
acquireRoutine(Ice::Cfg * const (& functions)[Count],const char * const (& names)[Count],const Config::Edit * cfgEdit)989 static std::shared_ptr<Routine> acquireRoutine(Ice::Cfg *const (&functions)[Count], const char *const (&names)[Count], const Config::Edit *cfgEdit)
990 {
991 	// This logic is modeled after the IceCompiler, as well as GlobalContext::translateFunctions
992 	// and GlobalContext::emitItems.
993 
994 	if(subzeroDumpEnabled)
995 	{
996 		// Output dump strings immediately, rather than once buffer is full. Useful for debugging.
997 		::context->getStrDump().SetUnbuffered();
998 	}
999 
1000 	::context->emitFileHeader();
1001 
1002 	// Translate
1003 
1004 	for(size_t i = 0; i < Count; ++i)
1005 	{
1006 		Ice::Cfg *currFunc = functions[i];
1007 
1008 		// Install function allocator in TLS for Cfg-specific container allocators
1009 		Ice::CfgLocalAllocatorScope allocScope(currFunc);
1010 
1011 		currFunc->setFunctionName(Ice::GlobalString::createWithString(::context, names[i]));
1012 
1013 		if(::optimizerCallback)
1014 		{
1015 			Nucleus::OptimizerReport report;
1016 			rr::optimize(currFunc, &report);
1017 			::optimizerCallback(&report);
1018 			::optimizerCallback = nullptr;
1019 		}
1020 		else
1021 		{
1022 			rr::optimize(currFunc);
1023 		}
1024 
1025 		currFunc->computeInOutEdges();
1026 		ASSERT_MSG(!currFunc->hasError(), "%s", currFunc->getError().c_str());
1027 
1028 		currFunc->translate();
1029 		ASSERT_MSG(!currFunc->hasError(), "%s", currFunc->getError().c_str());
1030 
1031 		currFunc->getAssembler<>()->setInternal(currFunc->getInternal());
1032 
1033 		if(subzeroEmitTextAsm)
1034 		{
1035 			currFunc->emit();
1036 		}
1037 
1038 		currFunc->emitIAS();
1039 
1040 		if(currFunc->hasError())
1041 		{
1042 			return nullptr;
1043 		}
1044 	}
1045 
1046 	// Emit items
1047 
1048 	::context->lowerGlobals("");
1049 
1050 	auto objectWriter = ::context->getObjectWriter();
1051 
1052 	for(size_t i = 0; i < Count; ++i)
1053 	{
1054 		Ice::Cfg *currFunc = functions[i];
1055 
1056 		// Accumulate globals from functions to emit into the "last" section at the end
1057 		auto globals = currFunc->getGlobalInits();
1058 		if(globals && !globals->empty())
1059 		{
1060 			::context->getGlobals()->merge(globals.get());
1061 		}
1062 
1063 		auto assembler = currFunc->releaseAssembler();
1064 		assembler->alignFunction();
1065 		objectWriter->writeFunctionCode(currFunc->getFunctionName(), currFunc->getInternal(), assembler.get());
1066 	}
1067 
1068 	::context->lowerGlobals("last");
1069 	::context->lowerConstants();
1070 	::context->lowerJumpTables();
1071 
1072 	objectWriter->setUndefinedSyms(::context->getConstantExternSyms());
1073 	::context->emitTargetRODataSections();
1074 	objectWriter->writeNonUserSections();
1075 
1076 	// Done compiling functions, get entry pointers to each of them
1077 	auto entryPoints = ::routine->loadImageAndGetEntryPoints({ names, names + Count });
1078 	ASSERT(entryPoints.size() == Count);
1079 	for(size_t i = 0; i < entryPoints.size(); ++i)
1080 	{
1081 		::routine->setEntry(i, entryPoints[i].entry);
1082 	}
1083 
1084 	::routine->finalize();
1085 
1086 	Routine *handoffRoutine = ::routine;
1087 	::routine = nullptr;
1088 
1089 	return std::shared_ptr<Routine>(handoffRoutine);
1090 }
1091 
acquireRoutine(const char * name,const Config::Edit * cfgEdit)1092 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit *cfgEdit /* = nullptr */)
1093 {
1094 	finalizeFunction();
1095 	return rr::acquireRoutine({ ::function }, { name }, cfgEdit);
1096 }
1097 
allocateStackVariable(Type * t,int arraySize)1098 Value *Nucleus::allocateStackVariable(Type *t, int arraySize)
1099 {
1100 	Ice::Type type = T(t);
1101 	int typeSize = Ice::typeWidthInBytes(type);
1102 	int totalSize = typeSize * (arraySize ? arraySize : 1);
1103 
1104 	auto bytes = Ice::ConstantInteger32::create(::context, Ice::IceType_i32, totalSize);
1105 	auto address = ::function->makeVariable(T(getPointerType(t)));
1106 	auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize);  // SRoA depends on the alignment to match the type size.
1107 	::function->getEntryNode()->getInsts().push_front(alloca);
1108 
1109 	return V(address);
1110 }
1111 
createBasicBlock()1112 BasicBlock *Nucleus::createBasicBlock()
1113 {
1114 	return B(::function->makeNode());
1115 }
1116 
getInsertBlock()1117 BasicBlock *Nucleus::getInsertBlock()
1118 {
1119 	return B(::basicBlock);
1120 }
1121 
setInsertBlock(BasicBlock * basicBlock)1122 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
1123 {
1124 	// ASSERT(::basicBlock->getInsts().back().getTerminatorEdges().size() >= 0 && "Previous basic block must have a terminator");
1125 
1126 	::basicBlock = basicBlock;
1127 }
1128 
createFunction(Type * returnType,const std::vector<Type * > & paramTypes)1129 void Nucleus::createFunction(Type *returnType, const std::vector<Type *> &paramTypes)
1130 {
1131 	ASSERT(::function == nullptr);
1132 	ASSERT(::allocator == nullptr);
1133 	ASSERT(::entryBlock == nullptr);
1134 	ASSERT(::basicBlock == nullptr);
1135 	ASSERT(::basicBlockTop == nullptr);
1136 
1137 	::function = sz::createFunction(::context, T(returnType), T(paramTypes));
1138 
1139 	// NOTE: The scoped allocator sets the TLS allocator to the one in the function. This global one
1140 	// becomes invalid if another one is created; for example, when creating await and destroy functions
1141 	// for coroutines, in which case, we must make sure to create a new scoped allocator for ::function again.
1142 	// TODO: Get rid of this as a global, and create scoped allocs in every Nucleus function instead.
1143 	::allocator = new Ice::CfgLocalAllocatorScope(::function);
1144 
1145 	::entryBlock = ::function->getEntryNode();
1146 	::basicBlock = ::function->makeNode();
1147 	::basicBlockTop = ::basicBlock;
1148 }
1149 
getArgument(unsigned int index)1150 Value *Nucleus::getArgument(unsigned int index)
1151 {
1152 	return V(::function->getArgs()[index]);
1153 }
1154 
createRetVoid()1155 void Nucleus::createRetVoid()
1156 {
1157 	RR_DEBUG_INFO_UPDATE_LOC();
1158 
1159 	// Code generated after this point is unreachable, so any variables
1160 	// being read can safely return an undefined value. We have to avoid
1161 	// materializing variables after the terminator ret instruction.
1162 	Variable::killUnmaterialized();
1163 
1164 	Ice::InstRet *ret = Ice::InstRet::create(::function);
1165 	::basicBlock->appendInst(ret);
1166 }
1167 
createRet(Value * v)1168 void Nucleus::createRet(Value *v)
1169 {
1170 	RR_DEBUG_INFO_UPDATE_LOC();
1171 
1172 	// Code generated after this point is unreachable, so any variables
1173 	// being read can safely return an undefined value. We have to avoid
1174 	// materializing variables after the terminator ret instruction.
1175 	Variable::killUnmaterialized();
1176 
1177 	Ice::InstRet *ret = Ice::InstRet::create(::function, v);
1178 	::basicBlock->appendInst(ret);
1179 }
1180 
createBr(BasicBlock * dest)1181 void Nucleus::createBr(BasicBlock *dest)
1182 {
1183 	RR_DEBUG_INFO_UPDATE_LOC();
1184 	Variable::materializeAll();
1185 
1186 	auto br = Ice::InstBr::create(::function, dest);
1187 	::basicBlock->appendInst(br);
1188 }
1189 
createCondBr(Value * cond,BasicBlock * ifTrue,BasicBlock * ifFalse)1190 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
1191 {
1192 	RR_DEBUG_INFO_UPDATE_LOC();
1193 	Variable::materializeAll();
1194 
1195 	auto br = Ice::InstBr::create(::function, cond, ifTrue, ifFalse);
1196 	::basicBlock->appendInst(br);
1197 }
1198 
isCommutative(Ice::InstArithmetic::OpKind op)1199 static bool isCommutative(Ice::InstArithmetic::OpKind op)
1200 {
1201 	switch(op)
1202 	{
1203 	case Ice::InstArithmetic::Add:
1204 	case Ice::InstArithmetic::Fadd:
1205 	case Ice::InstArithmetic::Mul:
1206 	case Ice::InstArithmetic::Fmul:
1207 	case Ice::InstArithmetic::And:
1208 	case Ice::InstArithmetic::Or:
1209 	case Ice::InstArithmetic::Xor:
1210 		return true;
1211 	default:
1212 		return false;
1213 	}
1214 }
1215 
createArithmetic(Ice::InstArithmetic::OpKind op,Value * lhs,Value * rhs)1216 static Value *createArithmetic(Ice::InstArithmetic::OpKind op, Value *lhs, Value *rhs)
1217 {
1218 	ASSERT(lhs->getType() == rhs->getType() || llvm::isa<Ice::Constant>(rhs));
1219 
1220 	bool swapOperands = llvm::isa<Ice::Constant>(lhs) && isCommutative(op);
1221 
1222 	Ice::Variable *result = ::function->makeVariable(lhs->getType());
1223 	Ice::InstArithmetic *arithmetic = Ice::InstArithmetic::create(::function, op, result, swapOperands ? rhs : lhs, swapOperands ? lhs : rhs);
1224 	::basicBlock->appendInst(arithmetic);
1225 
1226 	return V(result);
1227 }
1228 
createAdd(Value * lhs,Value * rhs)1229 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
1230 {
1231 	RR_DEBUG_INFO_UPDATE_LOC();
1232 	return createArithmetic(Ice::InstArithmetic::Add, lhs, rhs);
1233 }
1234 
createSub(Value * lhs,Value * rhs)1235 Value *Nucleus::createSub(Value *lhs, Value *rhs)
1236 {
1237 	RR_DEBUG_INFO_UPDATE_LOC();
1238 	return createArithmetic(Ice::InstArithmetic::Sub, lhs, rhs);
1239 }
1240 
createMul(Value * lhs,Value * rhs)1241 Value *Nucleus::createMul(Value *lhs, Value *rhs)
1242 {
1243 	RR_DEBUG_INFO_UPDATE_LOC();
1244 	return createArithmetic(Ice::InstArithmetic::Mul, lhs, rhs);
1245 }
1246 
createUDiv(Value * lhs,Value * rhs)1247 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
1248 {
1249 	RR_DEBUG_INFO_UPDATE_LOC();
1250 	return createArithmetic(Ice::InstArithmetic::Udiv, lhs, rhs);
1251 }
1252 
createSDiv(Value * lhs,Value * rhs)1253 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
1254 {
1255 	RR_DEBUG_INFO_UPDATE_LOC();
1256 	return createArithmetic(Ice::InstArithmetic::Sdiv, lhs, rhs);
1257 }
1258 
createFAdd(Value * lhs,Value * rhs)1259 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
1260 {
1261 	RR_DEBUG_INFO_UPDATE_LOC();
1262 	return createArithmetic(Ice::InstArithmetic::Fadd, lhs, rhs);
1263 }
1264 
createFSub(Value * lhs,Value * rhs)1265 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
1266 {
1267 	RR_DEBUG_INFO_UPDATE_LOC();
1268 	return createArithmetic(Ice::InstArithmetic::Fsub, lhs, rhs);
1269 }
1270 
createFMul(Value * lhs,Value * rhs)1271 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
1272 {
1273 	RR_DEBUG_INFO_UPDATE_LOC();
1274 	return createArithmetic(Ice::InstArithmetic::Fmul, lhs, rhs);
1275 }
1276 
createFDiv(Value * lhs,Value * rhs)1277 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
1278 {
1279 	RR_DEBUG_INFO_UPDATE_LOC();
1280 	return createArithmetic(Ice::InstArithmetic::Fdiv, lhs, rhs);
1281 }
1282 
createURem(Value * lhs,Value * rhs)1283 Value *Nucleus::createURem(Value *lhs, Value *rhs)
1284 {
1285 	RR_DEBUG_INFO_UPDATE_LOC();
1286 	return createArithmetic(Ice::InstArithmetic::Urem, lhs, rhs);
1287 }
1288 
createSRem(Value * lhs,Value * rhs)1289 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
1290 {
1291 	RR_DEBUG_INFO_UPDATE_LOC();
1292 	return createArithmetic(Ice::InstArithmetic::Srem, lhs, rhs);
1293 }
1294 
createFRem(Value * lhs,Value * rhs)1295 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
1296 {
1297 	RR_DEBUG_INFO_UPDATE_LOC();
1298 	// TODO(b/148139679) Fix Subzero generating invalid code for FRem on vector types
1299 	// createArithmetic(Ice::InstArithmetic::Frem, lhs, rhs);
1300 	UNIMPLEMENTED("b/148139679 Nucleus::createFRem");
1301 	return nullptr;
1302 }
1303 
operator %(RValue<Float4> lhs,RValue<Float4> rhs)1304 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
1305 {
1306 	return emulated::FRem(lhs, rhs);
1307 }
1308 
createShl(Value * lhs,Value * rhs)1309 Value *Nucleus::createShl(Value *lhs, Value *rhs)
1310 {
1311 	RR_DEBUG_INFO_UPDATE_LOC();
1312 	return createArithmetic(Ice::InstArithmetic::Shl, lhs, rhs);
1313 }
1314 
createLShr(Value * lhs,Value * rhs)1315 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
1316 {
1317 	RR_DEBUG_INFO_UPDATE_LOC();
1318 	return createArithmetic(Ice::InstArithmetic::Lshr, lhs, rhs);
1319 }
1320 
createAShr(Value * lhs,Value * rhs)1321 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
1322 {
1323 	RR_DEBUG_INFO_UPDATE_LOC();
1324 	return createArithmetic(Ice::InstArithmetic::Ashr, lhs, rhs);
1325 }
1326 
createAnd(Value * lhs,Value * rhs)1327 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
1328 {
1329 	RR_DEBUG_INFO_UPDATE_LOC();
1330 	return createArithmetic(Ice::InstArithmetic::And, lhs, rhs);
1331 }
1332 
createOr(Value * lhs,Value * rhs)1333 Value *Nucleus::createOr(Value *lhs, Value *rhs)
1334 {
1335 	RR_DEBUG_INFO_UPDATE_LOC();
1336 	return createArithmetic(Ice::InstArithmetic::Or, lhs, rhs);
1337 }
1338 
createXor(Value * lhs,Value * rhs)1339 Value *Nucleus::createXor(Value *lhs, Value *rhs)
1340 {
1341 	RR_DEBUG_INFO_UPDATE_LOC();
1342 	return createArithmetic(Ice::InstArithmetic::Xor, lhs, rhs);
1343 }
1344 
createNeg(Value * v)1345 Value *Nucleus::createNeg(Value *v)
1346 {
1347 	RR_DEBUG_INFO_UPDATE_LOC();
1348 	return createSub(createNullValue(T(v->getType())), v);
1349 }
1350 
createFNeg(Value * v)1351 Value *Nucleus::createFNeg(Value *v)
1352 {
1353 	RR_DEBUG_INFO_UPDATE_LOC();
1354 	double c[4] = { -0.0, -0.0, -0.0, -0.0 };
1355 	Value *negativeZero = Ice::isVectorType(v->getType()) ? createConstantVector(c, T(v->getType())) : V(::context->getConstantFloat(-0.0f));
1356 
1357 	return createFSub(negativeZero, v);
1358 }
1359 
createNot(Value * v)1360 Value *Nucleus::createNot(Value *v)
1361 {
1362 	RR_DEBUG_INFO_UPDATE_LOC();
1363 	if(Ice::isScalarIntegerType(v->getType()))
1364 	{
1365 		return createXor(v, V(::context->getConstantInt(v->getType(), -1)));
1366 	}
1367 	else  // Vector
1368 	{
1369 		int64_t c[16] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
1370 		return createXor(v, createConstantVector(c, T(v->getType())));
1371 	}
1372 }
1373 
validateAtomicAndMemoryOrderArgs(bool atomic,std::memory_order memoryOrder)1374 static void validateAtomicAndMemoryOrderArgs(bool atomic, std::memory_order memoryOrder)
1375 {
1376 #if defined(__i386__) || defined(__x86_64__)
1377 	// We're good, atomics and strictest memory order (except seq_cst) are guaranteed.
1378 	// Note that sequential memory ordering could be guaranteed by using x86's LOCK prefix.
1379 	// Note also that relaxed memory order could be implemented using MOVNTPS and friends.
1380 #else
1381 	if(atomic)
1382 	{
1383 		UNIMPLEMENTED("b/150475088 Atomic load/store not implemented for current platform");
1384 	}
1385 	if(memoryOrder != std::memory_order_relaxed)
1386 	{
1387 		UNIMPLEMENTED("b/150475088 Memory order other than memory_order_relaxed not implemented for current platform");
1388 	}
1389 #endif
1390 
1391 	// Vulkan doesn't allow sequential memory order
1392 	ASSERT(memoryOrder != std::memory_order_seq_cst);
1393 }
1394 
createLoad(Value * ptr,Type * type,bool isVolatile,unsigned int align,bool atomic,std::memory_order memoryOrder)1395 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
1396 {
1397 	RR_DEBUG_INFO_UPDATE_LOC();
1398 	validateAtomicAndMemoryOrderArgs(atomic, memoryOrder);
1399 
1400 	int valueType = (int)reinterpret_cast<intptr_t>(type);
1401 	Ice::Variable *result = nullptr;
1402 
1403 	if((valueType & EmulatedBits) && (align != 0))  // Narrow vector not stored on stack.
1404 	{
1405 		if(emulateIntrinsics)
1406 		{
1407 			if(typeSize(type) == 4)
1408 			{
1409 				auto pointer = RValue<Pointer<Byte>>(ptr);
1410 				Int x = *Pointer<Int>(pointer);
1411 
1412 				Int4 vector;
1413 				vector = Insert(vector, x, 0);
1414 
1415 				result = ::function->makeVariable(T(type));
1416 				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
1417 				::basicBlock->appendInst(bitcast);
1418 			}
1419 			else if(typeSize(type) == 8)
1420 			{
1421 				ASSERT_MSG(!atomic, "Emulated 64-bit loads are not atomic");
1422 				auto pointer = RValue<Pointer<Byte>>(ptr);
1423 				Int x = *Pointer<Int>(pointer);
1424 				Int y = *Pointer<Int>(pointer + 4);
1425 
1426 				Int4 vector;
1427 				vector = Insert(vector, x, 0);
1428 				vector = Insert(vector, y, 1);
1429 
1430 				result = ::function->makeVariable(T(type));
1431 				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
1432 				::basicBlock->appendInst(bitcast);
1433 			}
1434 			else
1435 				UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
1436 		}
1437 		else
1438 		{
1439 			const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
1440 			result = ::function->makeVariable(T(type));
1441 			auto load = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
1442 			load->addArg(ptr);
1443 			load->addArg(::context->getConstantInt32(typeSize(type)));
1444 			::basicBlock->appendInst(load);
1445 		}
1446 	}
1447 	else
1448 	{
1449 		result = sz::createLoad(::function, ::basicBlock, V(ptr), T(type), align);
1450 	}
1451 
1452 	ASSERT(result);
1453 	return V(result);
1454 }
1455 
createStore(Value * value,Value * ptr,Type * type,bool isVolatile,unsigned int align,bool atomic,std::memory_order memoryOrder)1456 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
1457 {
1458 	RR_DEBUG_INFO_UPDATE_LOC();
1459 	validateAtomicAndMemoryOrderArgs(atomic, memoryOrder);
1460 
1461 #if __has_feature(memory_sanitizer)
1462 	// Mark all (non-stack) memory writes as initialized by calling __msan_unpoison
1463 	if(align != 0)
1464 	{
1465 		auto call = Ice::InstCall::create(::function, 2, nullptr, ::context->getConstantInt64(reinterpret_cast<intptr_t>(__msan_unpoison)), false);
1466 		call->addArg(ptr);
1467 		call->addArg(::context->getConstantInt64(typeSize(type)));
1468 		::basicBlock->appendInst(call);
1469 	}
1470 #endif
1471 
1472 	int valueType = (int)reinterpret_cast<intptr_t>(type);
1473 
1474 	if((valueType & EmulatedBits) && (align != 0))  // Narrow vector not stored on stack.
1475 	{
1476 		if(emulateIntrinsics)
1477 		{
1478 			if(typeSize(type) == 4)
1479 			{
1480 				Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
1481 				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
1482 				::basicBlock->appendInst(bitcast);
1483 
1484 				RValue<Int4> v(V(vector));
1485 
1486 				auto pointer = RValue<Pointer<Byte>>(ptr);
1487 				Int x = Extract(v, 0);
1488 				*Pointer<Int>(pointer) = x;
1489 			}
1490 			else if(typeSize(type) == 8)
1491 			{
1492 				ASSERT_MSG(!atomic, "Emulated 64-bit stores are not atomic");
1493 				Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
1494 				auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
1495 				::basicBlock->appendInst(bitcast);
1496 
1497 				RValue<Int4> v(V(vector));
1498 
1499 				auto pointer = RValue<Pointer<Byte>>(ptr);
1500 				Int x = Extract(v, 0);
1501 				*Pointer<Int>(pointer) = x;
1502 				Int y = Extract(v, 1);
1503 				*Pointer<Int>(pointer + 4) = y;
1504 			}
1505 			else
1506 				UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
1507 		}
1508 		else
1509 		{
1510 			const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T };
1511 			auto store = Ice::InstIntrinsic::create(::function, 3, nullptr, intrinsic);
1512 			store->addArg(value);
1513 			store->addArg(ptr);
1514 			store->addArg(::context->getConstantInt32(typeSize(type)));
1515 			::basicBlock->appendInst(store);
1516 		}
1517 	}
1518 	else
1519 	{
1520 		ASSERT(value->getType() == T(type));
1521 
1522 		auto store = Ice::InstStore::create(::function, V(value), V(ptr), align);
1523 		::basicBlock->appendInst(store);
1524 	}
1525 
1526 	return value;
1527 }
1528 
createGEP(Value * ptr,Type * type,Value * index,bool unsignedIndex)1529 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1530 {
1531 	RR_DEBUG_INFO_UPDATE_LOC();
1532 	ASSERT(index->getType() == Ice::IceType_i32);
1533 
1534 	if(auto *constant = llvm::dyn_cast<Ice::ConstantInteger32>(index))
1535 	{
1536 		int32_t offset = constant->getValue() * (int)typeSize(type);
1537 
1538 		if(offset == 0)
1539 		{
1540 			return ptr;
1541 		}
1542 
1543 		return createAdd(ptr, createConstantInt(offset));
1544 	}
1545 
1546 	if(!Ice::isByteSizedType(T(type)))
1547 	{
1548 		index = createMul(index, createConstantInt((int)typeSize(type)));
1549 	}
1550 
1551 	if(sizeof(void *) == 8)
1552 	{
1553 		if(unsignedIndex)
1554 		{
1555 			index = createZExt(index, T(Ice::IceType_i64));
1556 		}
1557 		else
1558 		{
1559 			index = createSExt(index, T(Ice::IceType_i64));
1560 		}
1561 	}
1562 
1563 	return createAdd(ptr, index);
1564 }
1565 
createAtomicRMW(Ice::Intrinsics::AtomicRMWOperation rmwOp,Value * ptr,Value * value,std::memory_order memoryOrder)1566 static Value *createAtomicRMW(Ice::Intrinsics::AtomicRMWOperation rmwOp, Value *ptr, Value *value, std::memory_order memoryOrder)
1567 {
1568 	Ice::Variable *result = ::function->makeVariable(value->getType());
1569 
1570 	const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AtomicRMW, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T };
1571 	auto inst = Ice::InstIntrinsic::create(::function, 0, result, intrinsic);
1572 	auto op = ::context->getConstantInt32(rmwOp);
1573 	auto order = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrder));
1574 	inst->addArg(op);
1575 	inst->addArg(ptr);
1576 	inst->addArg(value);
1577 	inst->addArg(order);
1578 	::basicBlock->appendInst(inst);
1579 
1580 	return V(result);
1581 }
1582 
createAtomicAdd(Value * ptr,Value * value,std::memory_order memoryOrder)1583 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1584 {
1585 	RR_DEBUG_INFO_UPDATE_LOC();
1586 	return createAtomicRMW(Ice::Intrinsics::AtomicAdd, ptr, value, memoryOrder);
1587 }
1588 
createAtomicSub(Value * ptr,Value * value,std::memory_order memoryOrder)1589 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1590 {
1591 	RR_DEBUG_INFO_UPDATE_LOC();
1592 	return createAtomicRMW(Ice::Intrinsics::AtomicSub, ptr, value, memoryOrder);
1593 }
1594 
createAtomicAnd(Value * ptr,Value * value,std::memory_order memoryOrder)1595 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1596 {
1597 	RR_DEBUG_INFO_UPDATE_LOC();
1598 	return createAtomicRMW(Ice::Intrinsics::AtomicAnd, ptr, value, memoryOrder);
1599 }
1600 
createAtomicOr(Value * ptr,Value * value,std::memory_order memoryOrder)1601 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1602 {
1603 	RR_DEBUG_INFO_UPDATE_LOC();
1604 	return createAtomicRMW(Ice::Intrinsics::AtomicOr, ptr, value, memoryOrder);
1605 }
1606 
createAtomicXor(Value * ptr,Value * value,std::memory_order memoryOrder)1607 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1608 {
1609 	RR_DEBUG_INFO_UPDATE_LOC();
1610 	return createAtomicRMW(Ice::Intrinsics::AtomicXor, ptr, value, memoryOrder);
1611 }
1612 
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1613 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1614 {
1615 	RR_DEBUG_INFO_UPDATE_LOC();
1616 	return createAtomicRMW(Ice::Intrinsics::AtomicExchange, ptr, value, memoryOrder);
1617 }
1618 
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1619 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1620 {
1621 	RR_DEBUG_INFO_UPDATE_LOC();
1622 	Ice::Variable *result = ::function->makeVariable(value->getType());
1623 
1624 	const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AtomicCmpxchg, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T };
1625 	auto inst = Ice::InstIntrinsic::create(::function, 0, result, intrinsic);
1626 	auto orderEq = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrderEqual));
1627 	auto orderNeq = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrderUnequal));
1628 	inst->addArg(ptr);
1629 	inst->addArg(compare);
1630 	inst->addArg(value);
1631 	inst->addArg(orderEq);
1632 	inst->addArg(orderNeq);
1633 	::basicBlock->appendInst(inst);
1634 
1635 	return V(result);
1636 }
1637 
createCast(Ice::InstCast::OpKind op,Value * v,Type * destType)1638 static Value *createCast(Ice::InstCast::OpKind op, Value *v, Type *destType)
1639 {
1640 	if(v->getType() == T(destType))
1641 	{
1642 		return v;
1643 	}
1644 
1645 	Ice::Variable *result = ::function->makeVariable(T(destType));
1646 	Ice::InstCast *cast = Ice::InstCast::create(::function, op, result, v);
1647 	::basicBlock->appendInst(cast);
1648 
1649 	return V(result);
1650 }
1651 
createTrunc(Value * v,Type * destType)1652 Value *Nucleus::createTrunc(Value *v, Type *destType)
1653 {
1654 	RR_DEBUG_INFO_UPDATE_LOC();
1655 	return createCast(Ice::InstCast::Trunc, v, destType);
1656 }
1657 
createZExt(Value * v,Type * destType)1658 Value *Nucleus::createZExt(Value *v, Type *destType)
1659 {
1660 	RR_DEBUG_INFO_UPDATE_LOC();
1661 	return createCast(Ice::InstCast::Zext, v, destType);
1662 }
1663 
createSExt(Value * v,Type * destType)1664 Value *Nucleus::createSExt(Value *v, Type *destType)
1665 {
1666 	RR_DEBUG_INFO_UPDATE_LOC();
1667 	return createCast(Ice::InstCast::Sext, v, destType);
1668 }
1669 
createFPToUI(Value * v,Type * destType)1670 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1671 {
1672 	RR_DEBUG_INFO_UPDATE_LOC();
1673 	return createCast(Ice::InstCast::Fptoui, v, destType);
1674 }
1675 
createFPToSI(Value * v,Type * destType)1676 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1677 {
1678 	RR_DEBUG_INFO_UPDATE_LOC();
1679 	return createCast(Ice::InstCast::Fptosi, v, destType);
1680 }
1681 
createSIToFP(Value * v,Type * destType)1682 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1683 {
1684 	RR_DEBUG_INFO_UPDATE_LOC();
1685 	return createCast(Ice::InstCast::Sitofp, v, destType);
1686 }
1687 
createFPTrunc(Value * v,Type * destType)1688 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1689 {
1690 	RR_DEBUG_INFO_UPDATE_LOC();
1691 	return createCast(Ice::InstCast::Fptrunc, v, destType);
1692 }
1693 
createFPExt(Value * v,Type * destType)1694 Value *Nucleus::createFPExt(Value *v, Type *destType)
1695 {
1696 	RR_DEBUG_INFO_UPDATE_LOC();
1697 	return createCast(Ice::InstCast::Fpext, v, destType);
1698 }
1699 
createBitCast(Value * v,Type * destType)1700 Value *Nucleus::createBitCast(Value *v, Type *destType)
1701 {
1702 	RR_DEBUG_INFO_UPDATE_LOC();
1703 	// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1704 	// support for casting between scalars and wide vectors. For platforms where this is not supported,
1705 	// emulate them by writing to the stack and reading back as the destination type.
1706 	if(emulateMismatchedBitCast)
1707 	{
1708 		if(!Ice::isVectorType(v->getType()) && Ice::isVectorType(T(destType)))
1709 		{
1710 			Value *address = allocateStackVariable(destType);
1711 			createStore(v, address, T(v->getType()));
1712 			return createLoad(address, destType);
1713 		}
1714 		else if(Ice::isVectorType(v->getType()) && !Ice::isVectorType(T(destType)))
1715 		{
1716 			Value *address = allocateStackVariable(T(v->getType()));
1717 			createStore(v, address, T(v->getType()));
1718 			return createLoad(address, destType);
1719 		}
1720 	}
1721 
1722 	return createCast(Ice::InstCast::Bitcast, v, destType);
1723 }
1724 
createIntCompare(Ice::InstIcmp::ICond condition,Value * lhs,Value * rhs)1725 static Value *createIntCompare(Ice::InstIcmp::ICond condition, Value *lhs, Value *rhs)
1726 {
1727 	ASSERT(lhs->getType() == rhs->getType());
1728 
1729 	auto result = ::function->makeVariable(Ice::isScalarIntegerType(lhs->getType()) ? Ice::IceType_i1 : lhs->getType());
1730 	auto cmp = Ice::InstIcmp::create(::function, condition, result, lhs, rhs);
1731 	::basicBlock->appendInst(cmp);
1732 
1733 	return V(result);
1734 }
1735 
createICmpEQ(Value * lhs,Value * rhs)1736 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1737 {
1738 	RR_DEBUG_INFO_UPDATE_LOC();
1739 	return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
1740 }
1741 
createICmpNE(Value * lhs,Value * rhs)1742 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1743 {
1744 	RR_DEBUG_INFO_UPDATE_LOC();
1745 	return createIntCompare(Ice::InstIcmp::Ne, lhs, rhs);
1746 }
1747 
createICmpUGT(Value * lhs,Value * rhs)1748 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1749 {
1750 	RR_DEBUG_INFO_UPDATE_LOC();
1751 	return createIntCompare(Ice::InstIcmp::Ugt, lhs, rhs);
1752 }
1753 
createICmpUGE(Value * lhs,Value * rhs)1754 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1755 {
1756 	RR_DEBUG_INFO_UPDATE_LOC();
1757 	return createIntCompare(Ice::InstIcmp::Uge, lhs, rhs);
1758 }
1759 
createICmpULT(Value * lhs,Value * rhs)1760 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1761 {
1762 	RR_DEBUG_INFO_UPDATE_LOC();
1763 	return createIntCompare(Ice::InstIcmp::Ult, lhs, rhs);
1764 }
1765 
createICmpULE(Value * lhs,Value * rhs)1766 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1767 {
1768 	RR_DEBUG_INFO_UPDATE_LOC();
1769 	return createIntCompare(Ice::InstIcmp::Ule, lhs, rhs);
1770 }
1771 
createICmpSGT(Value * lhs,Value * rhs)1772 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1773 {
1774 	RR_DEBUG_INFO_UPDATE_LOC();
1775 	return createIntCompare(Ice::InstIcmp::Sgt, lhs, rhs);
1776 }
1777 
createICmpSGE(Value * lhs,Value * rhs)1778 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1779 {
1780 	RR_DEBUG_INFO_UPDATE_LOC();
1781 	return createIntCompare(Ice::InstIcmp::Sge, lhs, rhs);
1782 }
1783 
createICmpSLT(Value * lhs,Value * rhs)1784 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1785 {
1786 	RR_DEBUG_INFO_UPDATE_LOC();
1787 	return createIntCompare(Ice::InstIcmp::Slt, lhs, rhs);
1788 }
1789 
createICmpSLE(Value * lhs,Value * rhs)1790 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1791 {
1792 	RR_DEBUG_INFO_UPDATE_LOC();
1793 	return createIntCompare(Ice::InstIcmp::Sle, lhs, rhs);
1794 }
1795 
createFloatCompare(Ice::InstFcmp::FCond condition,Value * lhs,Value * rhs)1796 static Value *createFloatCompare(Ice::InstFcmp::FCond condition, Value *lhs, Value *rhs)
1797 {
1798 	ASSERT(lhs->getType() == rhs->getType());
1799 	ASSERT(Ice::isScalarFloatingType(lhs->getType()) || lhs->getType() == Ice::IceType_v4f32);
1800 
1801 	auto result = ::function->makeVariable(Ice::isScalarFloatingType(lhs->getType()) ? Ice::IceType_i1 : Ice::IceType_v4i32);
1802 	auto cmp = Ice::InstFcmp::create(::function, condition, result, lhs, rhs);
1803 	::basicBlock->appendInst(cmp);
1804 
1805 	return V(result);
1806 }
1807 
createFCmpOEQ(Value * lhs,Value * rhs)1808 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1809 {
1810 	RR_DEBUG_INFO_UPDATE_LOC();
1811 	return createFloatCompare(Ice::InstFcmp::Oeq, lhs, rhs);
1812 }
1813 
createFCmpOGT(Value * lhs,Value * rhs)1814 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1815 {
1816 	RR_DEBUG_INFO_UPDATE_LOC();
1817 	return createFloatCompare(Ice::InstFcmp::Ogt, lhs, rhs);
1818 }
1819 
createFCmpOGE(Value * lhs,Value * rhs)1820 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1821 {
1822 	RR_DEBUG_INFO_UPDATE_LOC();
1823 	return createFloatCompare(Ice::InstFcmp::Oge, lhs, rhs);
1824 }
1825 
createFCmpOLT(Value * lhs,Value * rhs)1826 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1827 {
1828 	RR_DEBUG_INFO_UPDATE_LOC();
1829 	return createFloatCompare(Ice::InstFcmp::Olt, lhs, rhs);
1830 }
1831 
createFCmpOLE(Value * lhs,Value * rhs)1832 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1833 {
1834 	RR_DEBUG_INFO_UPDATE_LOC();
1835 	return createFloatCompare(Ice::InstFcmp::Ole, lhs, rhs);
1836 }
1837 
createFCmpONE(Value * lhs,Value * rhs)1838 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1839 {
1840 	RR_DEBUG_INFO_UPDATE_LOC();
1841 	return createFloatCompare(Ice::InstFcmp::One, lhs, rhs);
1842 }
1843 
createFCmpORD(Value * lhs,Value * rhs)1844 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1845 {
1846 	RR_DEBUG_INFO_UPDATE_LOC();
1847 	return createFloatCompare(Ice::InstFcmp::Ord, lhs, rhs);
1848 }
1849 
createFCmpUNO(Value * lhs,Value * rhs)1850 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1851 {
1852 	RR_DEBUG_INFO_UPDATE_LOC();
1853 	return createFloatCompare(Ice::InstFcmp::Uno, lhs, rhs);
1854 }
1855 
createFCmpUEQ(Value * lhs,Value * rhs)1856 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1857 {
1858 	RR_DEBUG_INFO_UPDATE_LOC();
1859 	return createFloatCompare(Ice::InstFcmp::Ueq, lhs, rhs);
1860 }
1861 
createFCmpUGT(Value * lhs,Value * rhs)1862 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1863 {
1864 	RR_DEBUG_INFO_UPDATE_LOC();
1865 	return createFloatCompare(Ice::InstFcmp::Ugt, lhs, rhs);
1866 }
1867 
createFCmpUGE(Value * lhs,Value * rhs)1868 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1869 {
1870 	RR_DEBUG_INFO_UPDATE_LOC();
1871 	return createFloatCompare(Ice::InstFcmp::Uge, lhs, rhs);
1872 }
1873 
createFCmpULT(Value * lhs,Value * rhs)1874 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1875 {
1876 	RR_DEBUG_INFO_UPDATE_LOC();
1877 	return createFloatCompare(Ice::InstFcmp::Ult, lhs, rhs);
1878 }
1879 
createFCmpULE(Value * lhs,Value * rhs)1880 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1881 {
1882 	RR_DEBUG_INFO_UPDATE_LOC();
1883 	return createFloatCompare(Ice::InstFcmp::Ule, lhs, rhs);
1884 }
1885 
createFCmpUNE(Value * lhs,Value * rhs)1886 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1887 {
1888 	RR_DEBUG_INFO_UPDATE_LOC();
1889 	return createFloatCompare(Ice::InstFcmp::Une, lhs, rhs);
1890 }
1891 
createExtractElement(Value * vector,Type * type,int index)1892 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1893 {
1894 	RR_DEBUG_INFO_UPDATE_LOC();
1895 	auto result = ::function->makeVariable(T(type));
1896 	auto extract = Ice::InstExtractElement::create(::function, result, V(vector), ::context->getConstantInt32(index));
1897 	::basicBlock->appendInst(extract);
1898 
1899 	return V(result);
1900 }
1901 
createInsertElement(Value * vector,Value * element,int index)1902 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1903 {
1904 	RR_DEBUG_INFO_UPDATE_LOC();
1905 	auto result = ::function->makeVariable(vector->getType());
1906 	auto insert = Ice::InstInsertElement::create(::function, result, vector, element, ::context->getConstantInt32(index));
1907 	::basicBlock->appendInst(insert);
1908 
1909 	return V(result);
1910 }
1911 
createShuffleVector(Value * V1,Value * V2,const int * select)1912 Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
1913 {
1914 	RR_DEBUG_INFO_UPDATE_LOC();
1915 	ASSERT(V1->getType() == V2->getType());
1916 
1917 	int size = Ice::typeNumElements(V1->getType());
1918 	auto result = ::function->makeVariable(V1->getType());
1919 	auto shuffle = Ice::InstShuffleVector::create(::function, result, V1, V2);
1920 
1921 	for(int i = 0; i < size; i++)
1922 	{
1923 		shuffle->addIndex(llvm::cast<Ice::ConstantInteger32>(::context->getConstantInt32(select[i])));
1924 	}
1925 
1926 	::basicBlock->appendInst(shuffle);
1927 
1928 	return V(result);
1929 }
1930 
createSelect(Value * C,Value * ifTrue,Value * ifFalse)1931 Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
1932 {
1933 	RR_DEBUG_INFO_UPDATE_LOC();
1934 	ASSERT(ifTrue->getType() == ifFalse->getType());
1935 
1936 	auto result = ::function->makeVariable(ifTrue->getType());
1937 	auto *select = Ice::InstSelect::create(::function, result, C, ifTrue, ifFalse);
1938 	::basicBlock->appendInst(select);
1939 
1940 	return V(result);
1941 }
1942 
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1943 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1944 {
1945 	RR_DEBUG_INFO_UPDATE_LOC();
1946 	auto switchInst = Ice::InstSwitch::create(::function, numCases, control, defaultBranch);
1947 	::basicBlock->appendInst(switchInst);
1948 
1949 	return reinterpret_cast<SwitchCases *>(switchInst);
1950 }
1951 
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1952 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1953 {
1954 	RR_DEBUG_INFO_UPDATE_LOC();
1955 	switchCases->addBranch(label, label, branch);
1956 }
1957 
createUnreachable()1958 void Nucleus::createUnreachable()
1959 {
1960 	RR_DEBUG_INFO_UPDATE_LOC();
1961 	Ice::InstUnreachable *unreachable = Ice::InstUnreachable::create(::function);
1962 	::basicBlock->appendInst(unreachable);
1963 }
1964 
getType(Value * value)1965 Type *Nucleus::getType(Value *value)
1966 {
1967 	return T(V(value)->getType());
1968 }
1969 
getContainedType(Type * vectorType)1970 Type *Nucleus::getContainedType(Type *vectorType)
1971 {
1972 	Ice::Type vecTy = T(vectorType);
1973 	switch(vecTy)
1974 	{
1975 	case Ice::IceType_v4i1: return T(Ice::IceType_i1);
1976 	case Ice::IceType_v8i1: return T(Ice::IceType_i1);
1977 	case Ice::IceType_v16i1: return T(Ice::IceType_i1);
1978 	case Ice::IceType_v16i8: return T(Ice::IceType_i8);
1979 	case Ice::IceType_v8i16: return T(Ice::IceType_i16);
1980 	case Ice::IceType_v4i32: return T(Ice::IceType_i32);
1981 	case Ice::IceType_v4f32: return T(Ice::IceType_f32);
1982 	default:
1983 		ASSERT_MSG(false, "getContainedType: input type is not a vector type");
1984 		return {};
1985 	}
1986 }
1987 
getPointerType(Type * ElementType)1988 Type *Nucleus::getPointerType(Type *ElementType)
1989 {
1990 	return T(sz::getPointerType(T(ElementType)));
1991 }
1992 
getNaturalIntType()1993 static constexpr Ice::Type getNaturalIntType()
1994 {
1995 	constexpr size_t intSize = sizeof(int);
1996 	static_assert(intSize == 4 || intSize == 8, "");
1997 	return intSize == 4 ? Ice::IceType_i32 : Ice::IceType_i64;
1998 }
1999 
getPrintfStorageType(Type * valueType)2000 Type *Nucleus::getPrintfStorageType(Type *valueType)
2001 {
2002 	Ice::Type valueTy = T(valueType);
2003 	switch(valueTy)
2004 	{
2005 	case Ice::IceType_i32:
2006 		return T(getNaturalIntType());
2007 
2008 	case Ice::IceType_f32:
2009 		return T(Ice::IceType_f64);
2010 
2011 	default:
2012 		UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
2013 		return {};
2014 	}
2015 }
2016 
createNullValue(Type * Ty)2017 Value *Nucleus::createNullValue(Type *Ty)
2018 {
2019 	RR_DEBUG_INFO_UPDATE_LOC();
2020 	if(Ice::isVectorType(T(Ty)))
2021 	{
2022 		ASSERT(Ice::typeNumElements(T(Ty)) <= 16);
2023 		int64_t c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2024 		return createConstantVector(c, Ty);
2025 	}
2026 	else
2027 	{
2028 		return V(::context->getConstantZero(T(Ty)));
2029 	}
2030 }
2031 
createConstantLong(int64_t i)2032 Value *Nucleus::createConstantLong(int64_t i)
2033 {
2034 	RR_DEBUG_INFO_UPDATE_LOC();
2035 	return V(::context->getConstantInt64(i));
2036 }
2037 
createConstantInt(int i)2038 Value *Nucleus::createConstantInt(int i)
2039 {
2040 	RR_DEBUG_INFO_UPDATE_LOC();
2041 	return V(::context->getConstantInt32(i));
2042 }
2043 
createConstantInt(unsigned int i)2044 Value *Nucleus::createConstantInt(unsigned int i)
2045 {
2046 	RR_DEBUG_INFO_UPDATE_LOC();
2047 	return V(::context->getConstantInt32(i));
2048 }
2049 
createConstantBool(bool b)2050 Value *Nucleus::createConstantBool(bool b)
2051 {
2052 	RR_DEBUG_INFO_UPDATE_LOC();
2053 	return V(::context->getConstantInt1(b));
2054 }
2055 
createConstantByte(signed char i)2056 Value *Nucleus::createConstantByte(signed char i)
2057 {
2058 	RR_DEBUG_INFO_UPDATE_LOC();
2059 	return V(::context->getConstantInt8(i));
2060 }
2061 
createConstantByte(unsigned char i)2062 Value *Nucleus::createConstantByte(unsigned char i)
2063 {
2064 	RR_DEBUG_INFO_UPDATE_LOC();
2065 	return V(::context->getConstantInt8(i));
2066 }
2067 
createConstantShort(short i)2068 Value *Nucleus::createConstantShort(short i)
2069 {
2070 	RR_DEBUG_INFO_UPDATE_LOC();
2071 	return V(::context->getConstantInt16(i));
2072 }
2073 
createConstantShort(unsigned short i)2074 Value *Nucleus::createConstantShort(unsigned short i)
2075 {
2076 	RR_DEBUG_INFO_UPDATE_LOC();
2077 	return V(::context->getConstantInt16(i));
2078 }
2079 
createConstantFloat(float x)2080 Value *Nucleus::createConstantFloat(float x)
2081 {
2082 	RR_DEBUG_INFO_UPDATE_LOC();
2083 	return V(::context->getConstantFloat(x));
2084 }
2085 
createNullPointer(Type * Ty)2086 Value *Nucleus::createNullPointer(Type *Ty)
2087 {
2088 	RR_DEBUG_INFO_UPDATE_LOC();
2089 	return createNullValue(T(sizeof(void *) == 8 ? Ice::IceType_i64 : Ice::IceType_i32));
2090 }
2091 
IceConstantData(void const * data,size_t size,size_t alignment=1)2092 static Ice::Constant *IceConstantData(void const *data, size_t size, size_t alignment = 1)
2093 {
2094 	return sz::getConstantPointer(::context, ::routine->addConstantData(data, size, alignment));
2095 }
2096 
createConstantVector(const int64_t * constants,Type * type)2097 Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
2098 {
2099 	RR_DEBUG_INFO_UPDATE_LOC();
2100 	const int vectorSize = 16;
2101 	ASSERT(Ice::typeWidthInBytes(T(type)) == vectorSize);
2102 	const int alignment = vectorSize;
2103 
2104 	const int64_t *i = constants;
2105 	const double *f = reinterpret_cast<const double *>(constants);
2106 
2107 	// TODO(b/148082873): Fix global variable constants when generating multiple functions
2108 	Ice::Constant *ptr = nullptr;
2109 
2110 	switch((int)reinterpret_cast<intptr_t>(type))
2111 	{
2112 	case Ice::IceType_v4i32:
2113 	case Ice::IceType_v4i1:
2114 		{
2115 			const int initializer[4] = { (int)i[0], (int)i[1], (int)i[2], (int)i[3] };
2116 			static_assert(sizeof(initializer) == vectorSize, "!");
2117 			ptr = IceConstantData(initializer, vectorSize, alignment);
2118 		}
2119 		break;
2120 	case Ice::IceType_v4f32:
2121 		{
2122 			const float initializer[4] = { (float)f[0], (float)f[1], (float)f[2], (float)f[3] };
2123 			static_assert(sizeof(initializer) == vectorSize, "!");
2124 			ptr = IceConstantData(initializer, vectorSize, alignment);
2125 		}
2126 		break;
2127 	case Ice::IceType_v8i16:
2128 	case Ice::IceType_v8i1:
2129 		{
2130 			const short initializer[8] = { (short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[4], (short)i[5], (short)i[6], (short)i[7] };
2131 			static_assert(sizeof(initializer) == vectorSize, "!");
2132 			ptr = IceConstantData(initializer, vectorSize, alignment);
2133 		}
2134 		break;
2135 	case Ice::IceType_v16i8:
2136 	case Ice::IceType_v16i1:
2137 		{
2138 			const char initializer[16] = { (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[8], (char)i[9], (char)i[10], (char)i[11], (char)i[12], (char)i[13], (char)i[14], (char)i[15] };
2139 			static_assert(sizeof(initializer) == vectorSize, "!");
2140 			ptr = IceConstantData(initializer, vectorSize, alignment);
2141 		}
2142 		break;
2143 	case Type_v2i32:
2144 		{
2145 			const int initializer[4] = { (int)i[0], (int)i[1], (int)i[0], (int)i[1] };
2146 			static_assert(sizeof(initializer) == vectorSize, "!");
2147 			ptr = IceConstantData(initializer, vectorSize, alignment);
2148 		}
2149 		break;
2150 	case Type_v2f32:
2151 		{
2152 			const float initializer[4] = { (float)f[0], (float)f[1], (float)f[0], (float)f[1] };
2153 			static_assert(sizeof(initializer) == vectorSize, "!");
2154 			ptr = IceConstantData(initializer, vectorSize, alignment);
2155 		}
2156 		break;
2157 	case Type_v4i16:
2158 		{
2159 			const short initializer[8] = { (short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[0], (short)i[1], (short)i[2], (short)i[3] };
2160 			static_assert(sizeof(initializer) == vectorSize, "!");
2161 			ptr = IceConstantData(initializer, vectorSize, alignment);
2162 		}
2163 		break;
2164 	case Type_v8i8:
2165 		{
2166 			const char initializer[16] = { (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7] };
2167 			static_assert(sizeof(initializer) == vectorSize, "!");
2168 			ptr = IceConstantData(initializer, vectorSize, alignment);
2169 		}
2170 		break;
2171 	case Type_v4i8:
2172 		{
2173 			const char initializer[16] = { (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3] };
2174 			static_assert(sizeof(initializer) == vectorSize, "!");
2175 			ptr = IceConstantData(initializer, vectorSize, alignment);
2176 		}
2177 		break;
2178 	default:
2179 		UNREACHABLE("Unknown constant vector type: %d", (int)reinterpret_cast<intptr_t>(type));
2180 	}
2181 
2182 	ASSERT(ptr);
2183 
2184 	Ice::Variable *result = sz::createLoad(::function, ::basicBlock, ptr, T(type), alignment);
2185 	return V(result);
2186 }
2187 
createConstantVector(const double * constants,Type * type)2188 Value *Nucleus::createConstantVector(const double *constants, Type *type)
2189 {
2190 	return createConstantVector((const int64_t *)constants, type);
2191 }
2192 
createConstantString(const char * v)2193 Value *Nucleus::createConstantString(const char *v)
2194 {
2195 	// NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
2196 	return V(IceConstantData(v, strlen(v) + 1));
2197 }
2198 
setOptimizerCallback(OptimizerCallback * callback)2199 void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
2200 {
2201 	::optimizerCallback = callback;
2202 }
2203 
type()2204 Type *Void::type()
2205 {
2206 	return T(Ice::IceType_void);
2207 }
2208 
type()2209 Type *Bool::type()
2210 {
2211 	return T(Ice::IceType_i1);
2212 }
2213 
type()2214 Type *Byte::type()
2215 {
2216 	return T(Ice::IceType_i8);
2217 }
2218 
type()2219 Type *SByte::type()
2220 {
2221 	return T(Ice::IceType_i8);
2222 }
2223 
type()2224 Type *Short::type()
2225 {
2226 	return T(Ice::IceType_i16);
2227 }
2228 
type()2229 Type *UShort::type()
2230 {
2231 	return T(Ice::IceType_i16);
2232 }
2233 
type()2234 Type *Byte4::type()
2235 {
2236 	return T(Type_v4i8);
2237 }
2238 
type()2239 Type *SByte4::type()
2240 {
2241 	return T(Type_v4i8);
2242 }
2243 
2244 namespace {
SaturateUnsigned(RValue<Short> x)2245 RValue<Byte> SaturateUnsigned(RValue<Short> x)
2246 {
2247 	return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), IfThenElse(Int(x) < 0, Int(0), Int(x))));
2248 }
2249 
Extract(RValue<Byte8> val,int i)2250 RValue<Byte> Extract(RValue<Byte8> val, int i)
2251 {
2252 	return RValue<Byte>(Nucleus::createExtractElement(val.value(), Byte::type(), i));
2253 }
2254 
Insert(RValue<Byte8> val,RValue<Byte> element,int i)2255 RValue<Byte8> Insert(RValue<Byte8> val, RValue<Byte> element, int i)
2256 {
2257 	return RValue<Byte8>(Nucleus::createInsertElement(val.value(), element.value(), i));
2258 }
2259 }  // namespace
2260 
AddSat(RValue<Byte8> x,RValue<Byte8> y)2261 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
2262 {
2263 	RR_DEBUG_INFO_UPDATE_LOC();
2264 	if(emulateIntrinsics)
2265 	{
2266 		Byte8 result;
2267 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
2268 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
2269 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
2270 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
2271 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
2272 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
2273 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
2274 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
2275 
2276 		return result;
2277 	}
2278 	else
2279 	{
2280 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2281 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2282 		auto paddusb = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2283 		paddusb->addArg(x.value());
2284 		paddusb->addArg(y.value());
2285 		::basicBlock->appendInst(paddusb);
2286 
2287 		return RValue<Byte8>(V(result));
2288 	}
2289 }
2290 
SubSat(RValue<Byte8> x,RValue<Byte8> y)2291 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
2292 {
2293 	RR_DEBUG_INFO_UPDATE_LOC();
2294 	if(emulateIntrinsics)
2295 	{
2296 		Byte8 result;
2297 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
2298 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
2299 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
2300 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
2301 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
2302 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
2303 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
2304 		result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
2305 
2306 		return result;
2307 	}
2308 	else
2309 	{
2310 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2311 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2312 		auto psubusw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2313 		psubusw->addArg(x.value());
2314 		psubusw->addArg(y.value());
2315 		::basicBlock->appendInst(psubusw);
2316 
2317 		return RValue<Byte8>(V(result));
2318 	}
2319 }
2320 
Extract(RValue<SByte8> val,int i)2321 RValue<SByte> Extract(RValue<SByte8> val, int i)
2322 {
2323 	RR_DEBUG_INFO_UPDATE_LOC();
2324 	return RValue<SByte>(Nucleus::createExtractElement(val.value(), SByte::type(), i));
2325 }
2326 
Insert(RValue<SByte8> val,RValue<SByte> element,int i)2327 RValue<SByte8> Insert(RValue<SByte8> val, RValue<SByte> element, int i)
2328 {
2329 	RR_DEBUG_INFO_UPDATE_LOC();
2330 	return RValue<SByte8>(Nucleus::createInsertElement(val.value(), element.value(), i));
2331 }
2332 
operator >>(RValue<SByte8> lhs,unsigned char rhs)2333 RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
2334 {
2335 	RR_DEBUG_INFO_UPDATE_LOC();
2336 	if(emulateIntrinsics)
2337 	{
2338 		SByte8 result;
2339 		result = Insert(result, Extract(lhs, 0) >> SByte(rhs), 0);
2340 		result = Insert(result, Extract(lhs, 1) >> SByte(rhs), 1);
2341 		result = Insert(result, Extract(lhs, 2) >> SByte(rhs), 2);
2342 		result = Insert(result, Extract(lhs, 3) >> SByte(rhs), 3);
2343 		result = Insert(result, Extract(lhs, 4) >> SByte(rhs), 4);
2344 		result = Insert(result, Extract(lhs, 5) >> SByte(rhs), 5);
2345 		result = Insert(result, Extract(lhs, 6) >> SByte(rhs), 6);
2346 		result = Insert(result, Extract(lhs, 7) >> SByte(rhs), 7);
2347 
2348 		return result;
2349 	}
2350 	else
2351 	{
2352 #if defined(__i386__) || defined(__x86_64__)
2353 		// SSE2 doesn't support byte vector shifts, so shift as shorts and recombine.
2354 		RValue<Short4> hi = (As<Short4>(lhs) >> rhs) & Short4(0xFF00u);
2355 		RValue<Short4> lo = As<Short4>(As<UShort4>((As<Short4>(lhs) << 8) >> rhs) >> 8);
2356 
2357 		return As<SByte8>(hi | lo);
2358 #else
2359 		return RValue<SByte8>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
2360 #endif
2361 	}
2362 }
2363 
SignMask(RValue<Byte8> x)2364 RValue<Int> SignMask(RValue<Byte8> x)
2365 {
2366 	RR_DEBUG_INFO_UPDATE_LOC();
2367 	if(emulateIntrinsics || CPUID::ARM)
2368 	{
2369 		Byte8 xx = As<Byte8>(As<SByte8>(x) >> 7) & Byte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
2370 		return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
2371 	}
2372 	else
2373 	{
2374 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
2375 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2376 		auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
2377 		movmsk->addArg(x.value());
2378 		::basicBlock->appendInst(movmsk);
2379 
2380 		return RValue<Int>(V(result)) & 0xFF;
2381 	}
2382 }
2383 
2384 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
2385 //	{
2386 //		return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Ugt, x.value(), y.value()));
2387 //	}
2388 
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)2389 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
2390 {
2391 	RR_DEBUG_INFO_UPDATE_LOC();
2392 	return RValue<Byte8>(Nucleus::createICmpEQ(x.value(), y.value()));
2393 }
2394 
type()2395 Type *Byte8::type()
2396 {
2397 	return T(Type_v8i8);
2398 }
2399 
2400 //	RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
2401 //	{
2402 //		return RValue<SByte8>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
2403 //	}
2404 
2405 //	RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
2406 //	{
2407 //		return RValue<SByte8>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
2408 //	}
2409 
SaturateSigned(RValue<Short> x)2410 RValue<SByte> SaturateSigned(RValue<Short> x)
2411 {
2412 	RR_DEBUG_INFO_UPDATE_LOC();
2413 	return SByte(IfThenElse(Int(x) > 0x7F, Int(0x7F), IfThenElse(Int(x) < -0x80, Int(0x80), Int(x))));
2414 }
2415 
AddSat(RValue<SByte8> x,RValue<SByte8> y)2416 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
2417 {
2418 	RR_DEBUG_INFO_UPDATE_LOC();
2419 	if(emulateIntrinsics)
2420 	{
2421 		SByte8 result;
2422 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
2423 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
2424 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
2425 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
2426 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
2427 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
2428 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
2429 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
2430 
2431 		return result;
2432 	}
2433 	else
2434 	{
2435 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2436 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2437 		auto paddsb = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2438 		paddsb->addArg(x.value());
2439 		paddsb->addArg(y.value());
2440 		::basicBlock->appendInst(paddsb);
2441 
2442 		return RValue<SByte8>(V(result));
2443 	}
2444 }
2445 
SubSat(RValue<SByte8> x,RValue<SByte8> y)2446 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
2447 {
2448 	RR_DEBUG_INFO_UPDATE_LOC();
2449 	if(emulateIntrinsics)
2450 	{
2451 		SByte8 result;
2452 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
2453 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
2454 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
2455 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
2456 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
2457 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
2458 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
2459 		result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
2460 
2461 		return result;
2462 	}
2463 	else
2464 	{
2465 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2466 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2467 		auto psubsb = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2468 		psubsb->addArg(x.value());
2469 		psubsb->addArg(y.value());
2470 		::basicBlock->appendInst(psubsb);
2471 
2472 		return RValue<SByte8>(V(result));
2473 	}
2474 }
2475 
SignMask(RValue<SByte8> x)2476 RValue<Int> SignMask(RValue<SByte8> x)
2477 {
2478 	RR_DEBUG_INFO_UPDATE_LOC();
2479 	if(emulateIntrinsics || CPUID::ARM)
2480 	{
2481 		SByte8 xx = (x >> 7) & SByte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
2482 		return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
2483 	}
2484 	else
2485 	{
2486 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
2487 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2488 		auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
2489 		movmsk->addArg(x.value());
2490 		::basicBlock->appendInst(movmsk);
2491 
2492 		return RValue<Int>(V(result)) & 0xFF;
2493 	}
2494 }
2495 
CmpGT(RValue<SByte8> x,RValue<SByte8> y)2496 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
2497 {
2498 	RR_DEBUG_INFO_UPDATE_LOC();
2499 	return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Sgt, x.value(), y.value()));
2500 }
2501 
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)2502 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
2503 {
2504 	RR_DEBUG_INFO_UPDATE_LOC();
2505 	return RValue<Byte8>(Nucleus::createICmpEQ(x.value(), y.value()));
2506 }
2507 
type()2508 Type *SByte8::type()
2509 {
2510 	return T(Type_v8i8);
2511 }
2512 
type()2513 Type *Byte16::type()
2514 {
2515 	return T(Ice::IceType_v16i8);
2516 }
2517 
type()2518 Type *SByte16::type()
2519 {
2520 	return T(Ice::IceType_v16i8);
2521 }
2522 
type()2523 Type *Short2::type()
2524 {
2525 	return T(Type_v2i16);
2526 }
2527 
type()2528 Type *UShort2::type()
2529 {
2530 	return T(Type_v2i16);
2531 }
2532 
Short4(RValue<Int4> cast)2533 Short4::Short4(RValue<Int4> cast)
2534 {
2535 	int select[8] = { 0, 2, 4, 6, 0, 2, 4, 6 };
2536 	Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
2537 	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
2538 
2539 	Value *int2 = RValue<Int2>(Int2(As<Int4>(packed))).value();
2540 	Value *short4 = Nucleus::createBitCast(int2, Short4::type());
2541 
2542 	storeValue(short4);
2543 }
2544 
2545 //	Short4::Short4(RValue<Float> cast)
2546 //	{
2547 //	}
2548 
Short4(RValue<Float4> cast)2549 Short4::Short4(RValue<Float4> cast)
2550 {
2551 	// TODO(b/150791192): Generalize and optimize
2552 	auto smin = std::numeric_limits<short>::min();
2553 	auto smax = std::numeric_limits<short>::max();
2554 	*this = Short4(Int4(Max(Min(cast, Float4(smax)), Float4(smin))));
2555 }
2556 
operator <<(RValue<Short4> lhs,unsigned char rhs)2557 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2558 {
2559 	RR_DEBUG_INFO_UPDATE_LOC();
2560 	if(emulateIntrinsics)
2561 	{
2562 		Short4 result;
2563 		result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
2564 		result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
2565 		result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
2566 		result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
2567 
2568 		return result;
2569 	}
2570 	else
2571 	{
2572 		return RValue<Short4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
2573 	}
2574 }
2575 
operator >>(RValue<Short4> lhs,unsigned char rhs)2576 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2577 {
2578 	RR_DEBUG_INFO_UPDATE_LOC();
2579 	if(emulateIntrinsics)
2580 	{
2581 		Short4 result;
2582 		result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
2583 		result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
2584 		result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
2585 		result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
2586 
2587 		return result;
2588 	}
2589 	else
2590 	{
2591 		return RValue<Short4>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
2592 	}
2593 }
2594 
Max(RValue<Short4> x,RValue<Short4> y)2595 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2596 {
2597 	RR_DEBUG_INFO_UPDATE_LOC();
2598 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
2599 	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value(), y.value());
2600 	::basicBlock->appendInst(cmp);
2601 
2602 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2603 	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
2604 	::basicBlock->appendInst(select);
2605 
2606 	return RValue<Short4>(V(result));
2607 }
2608 
Min(RValue<Short4> x,RValue<Short4> y)2609 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2610 {
2611 	RR_DEBUG_INFO_UPDATE_LOC();
2612 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
2613 	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value(), y.value());
2614 	::basicBlock->appendInst(cmp);
2615 
2616 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2617 	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
2618 	::basicBlock->appendInst(select);
2619 
2620 	return RValue<Short4>(V(result));
2621 }
2622 
SaturateSigned(RValue<Int> x)2623 RValue<Short> SaturateSigned(RValue<Int> x)
2624 {
2625 	RR_DEBUG_INFO_UPDATE_LOC();
2626 	return Short(IfThenElse(x > 0x7FFF, Int(0x7FFF), IfThenElse(x < -0x8000, Int(0x8000), x)));
2627 }
2628 
AddSat(RValue<Short4> x,RValue<Short4> y)2629 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2630 {
2631 	RR_DEBUG_INFO_UPDATE_LOC();
2632 	if(emulateIntrinsics)
2633 	{
2634 		Short4 result;
2635 		result = Insert(result, SaturateSigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
2636 		result = Insert(result, SaturateSigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
2637 		result = Insert(result, SaturateSigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
2638 		result = Insert(result, SaturateSigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
2639 
2640 		return result;
2641 	}
2642 	else
2643 	{
2644 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2645 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2646 		auto paddsw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2647 		paddsw->addArg(x.value());
2648 		paddsw->addArg(y.value());
2649 		::basicBlock->appendInst(paddsw);
2650 
2651 		return RValue<Short4>(V(result));
2652 	}
2653 }
2654 
SubSat(RValue<Short4> x,RValue<Short4> y)2655 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2656 {
2657 	RR_DEBUG_INFO_UPDATE_LOC();
2658 	if(emulateIntrinsics)
2659 	{
2660 		Short4 result;
2661 		result = Insert(result, SaturateSigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
2662 		result = Insert(result, SaturateSigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
2663 		result = Insert(result, SaturateSigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
2664 		result = Insert(result, SaturateSigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
2665 
2666 		return result;
2667 	}
2668 	else
2669 	{
2670 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2671 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2672 		auto psubsw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2673 		psubsw->addArg(x.value());
2674 		psubsw->addArg(y.value());
2675 		::basicBlock->appendInst(psubsw);
2676 
2677 		return RValue<Short4>(V(result));
2678 	}
2679 }
2680 
MulHigh(RValue<Short4> x,RValue<Short4> y)2681 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2682 {
2683 	RR_DEBUG_INFO_UPDATE_LOC();
2684 	if(emulateIntrinsics)
2685 	{
2686 		Short4 result;
2687 		result = Insert(result, Short((Int(Extract(x, 0)) * Int(Extract(y, 0))) >> 16), 0);
2688 		result = Insert(result, Short((Int(Extract(x, 1)) * Int(Extract(y, 1))) >> 16), 1);
2689 		result = Insert(result, Short((Int(Extract(x, 2)) * Int(Extract(y, 2))) >> 16), 2);
2690 		result = Insert(result, Short((Int(Extract(x, 3)) * Int(Extract(y, 3))) >> 16), 3);
2691 
2692 		return result;
2693 	}
2694 	else
2695 	{
2696 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2697 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::MultiplyHighSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2698 		auto pmulhw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2699 		pmulhw->addArg(x.value());
2700 		pmulhw->addArg(y.value());
2701 		::basicBlock->appendInst(pmulhw);
2702 
2703 		return RValue<Short4>(V(result));
2704 	}
2705 }
2706 
MulAdd(RValue<Short4> x,RValue<Short4> y)2707 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2708 {
2709 	RR_DEBUG_INFO_UPDATE_LOC();
2710 	if(emulateIntrinsics)
2711 	{
2712 		Int2 result;
2713 		result = Insert(result, Int(Extract(x, 0)) * Int(Extract(y, 0)) + Int(Extract(x, 1)) * Int(Extract(y, 1)), 0);
2714 		result = Insert(result, Int(Extract(x, 2)) * Int(Extract(y, 2)) + Int(Extract(x, 3)) * Int(Extract(y, 3)), 1);
2715 
2716 		return result;
2717 	}
2718 	else
2719 	{
2720 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2721 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::MultiplyAddPairs, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2722 		auto pmaddwd = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2723 		pmaddwd->addArg(x.value());
2724 		pmaddwd->addArg(y.value());
2725 		::basicBlock->appendInst(pmaddwd);
2726 
2727 		return As<Int2>(V(result));
2728 	}
2729 }
2730 
PackSigned(RValue<Short4> x,RValue<Short4> y)2731 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2732 {
2733 	RR_DEBUG_INFO_UPDATE_LOC();
2734 	if(emulateIntrinsics)
2735 	{
2736 		SByte8 result;
2737 		result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
2738 		result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
2739 		result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
2740 		result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
2741 		result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
2742 		result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
2743 		result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
2744 		result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
2745 
2746 		return result;
2747 	}
2748 	else
2749 	{
2750 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2751 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2752 		auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2753 		pack->addArg(x.value());
2754 		pack->addArg(y.value());
2755 		::basicBlock->appendInst(pack);
2756 
2757 		return As<SByte8>(Swizzle(As<Int4>(V(result)), 0x0202));
2758 	}
2759 }
2760 
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2761 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2762 {
2763 	RR_DEBUG_INFO_UPDATE_LOC();
2764 	if(emulateIntrinsics)
2765 	{
2766 		Byte8 result;
2767 		result = Insert(result, SaturateUnsigned(Extract(x, 0)), 0);
2768 		result = Insert(result, SaturateUnsigned(Extract(x, 1)), 1);
2769 		result = Insert(result, SaturateUnsigned(Extract(x, 2)), 2);
2770 		result = Insert(result, SaturateUnsigned(Extract(x, 3)), 3);
2771 		result = Insert(result, SaturateUnsigned(Extract(y, 0)), 4);
2772 		result = Insert(result, SaturateUnsigned(Extract(y, 1)), 5);
2773 		result = Insert(result, SaturateUnsigned(Extract(y, 2)), 6);
2774 		result = Insert(result, SaturateUnsigned(Extract(y, 3)), 7);
2775 
2776 		return result;
2777 	}
2778 	else
2779 	{
2780 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2781 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2782 		auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2783 		pack->addArg(x.value());
2784 		pack->addArg(y.value());
2785 		::basicBlock->appendInst(pack);
2786 
2787 		return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x0202));
2788 	}
2789 }
2790 
CmpGT(RValue<Short4> x,RValue<Short4> y)2791 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2792 {
2793 	RR_DEBUG_INFO_UPDATE_LOC();
2794 	return RValue<Short4>(createIntCompare(Ice::InstIcmp::Sgt, x.value(), y.value()));
2795 }
2796 
CmpEQ(RValue<Short4> x,RValue<Short4> y)2797 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2798 {
2799 	RR_DEBUG_INFO_UPDATE_LOC();
2800 	return RValue<Short4>(Nucleus::createICmpEQ(x.value(), y.value()));
2801 }
2802 
type()2803 Type *Short4::type()
2804 {
2805 	return T(Type_v4i16);
2806 }
2807 
UShort4(RValue<Float4> cast,bool saturate)2808 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2809 {
2810 	if(saturate)
2811 	{
2812 		if(CPUID::SSE4_1)
2813 		{
2814 			// x86 produces 0x80000000 on 32-bit integer overflow/underflow.
2815 			// PackUnsigned takes care of 0x0000 saturation.
2816 			Int4 int4(Min(cast, Float4(0xFFFF)));
2817 			*this = As<UShort4>(PackUnsigned(int4, int4));
2818 		}
2819 		else if(CPUID::ARM)
2820 		{
2821 			// ARM saturates the 32-bit integer result on overflow/undeflow.
2822 			Int4 int4(cast);
2823 			*this = As<UShort4>(PackUnsigned(int4, int4));
2824 		}
2825 		else
2826 		{
2827 			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2828 		}
2829 	}
2830 	else
2831 	{
2832 		*this = Short4(Int4(cast));
2833 	}
2834 }
2835 
Extract(RValue<UShort4> val,int i)2836 RValue<UShort> Extract(RValue<UShort4> val, int i)
2837 {
2838 	return RValue<UShort>(Nucleus::createExtractElement(val.value(), UShort::type(), i));
2839 }
2840 
Insert(RValue<UShort4> val,RValue<UShort> element,int i)2841 RValue<UShort4> Insert(RValue<UShort4> val, RValue<UShort> element, int i)
2842 {
2843 	return RValue<UShort4>(Nucleus::createInsertElement(val.value(), element.value(), i));
2844 }
2845 
operator <<(RValue<UShort4> lhs,unsigned char rhs)2846 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2847 {
2848 	RR_DEBUG_INFO_UPDATE_LOC();
2849 	if(emulateIntrinsics)
2850 
2851 	{
2852 		UShort4 result;
2853 		result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
2854 		result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
2855 		result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
2856 		result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
2857 
2858 		return result;
2859 	}
2860 	else
2861 	{
2862 		return RValue<UShort4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
2863 	}
2864 }
2865 
operator >>(RValue<UShort4> lhs,unsigned char rhs)2866 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2867 {
2868 	RR_DEBUG_INFO_UPDATE_LOC();
2869 	if(emulateIntrinsics)
2870 	{
2871 		UShort4 result;
2872 		result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
2873 		result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
2874 		result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
2875 		result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
2876 
2877 		return result;
2878 	}
2879 	else
2880 	{
2881 		return RValue<UShort4>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
2882 	}
2883 }
2884 
Max(RValue<UShort4> x,RValue<UShort4> y)2885 RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2886 {
2887 	RR_DEBUG_INFO_UPDATE_LOC();
2888 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
2889 	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value(), y.value());
2890 	::basicBlock->appendInst(cmp);
2891 
2892 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2893 	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
2894 	::basicBlock->appendInst(select);
2895 
2896 	return RValue<UShort4>(V(result));
2897 }
2898 
Min(RValue<UShort4> x,RValue<UShort4> y)2899 RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2900 {
2901 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
2902 	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value(), y.value());
2903 	::basicBlock->appendInst(cmp);
2904 
2905 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2906 	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
2907 	::basicBlock->appendInst(select);
2908 
2909 	return RValue<UShort4>(V(result));
2910 }
2911 
SaturateUnsigned(RValue<Int> x)2912 RValue<UShort> SaturateUnsigned(RValue<Int> x)
2913 {
2914 	RR_DEBUG_INFO_UPDATE_LOC();
2915 	return UShort(IfThenElse(x > 0xFFFF, Int(0xFFFF), IfThenElse(x < 0, Int(0), x)));
2916 }
2917 
AddSat(RValue<UShort4> x,RValue<UShort4> y)2918 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2919 {
2920 	RR_DEBUG_INFO_UPDATE_LOC();
2921 	if(emulateIntrinsics)
2922 	{
2923 		UShort4 result;
2924 		result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
2925 		result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
2926 		result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
2927 		result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
2928 
2929 		return result;
2930 	}
2931 	else
2932 	{
2933 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2934 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2935 		auto paddusw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2936 		paddusw->addArg(x.value());
2937 		paddusw->addArg(y.value());
2938 		::basicBlock->appendInst(paddusw);
2939 
2940 		return RValue<UShort4>(V(result));
2941 	}
2942 }
2943 
SubSat(RValue<UShort4> x,RValue<UShort4> y)2944 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2945 {
2946 	RR_DEBUG_INFO_UPDATE_LOC();
2947 	if(emulateIntrinsics)
2948 	{
2949 		UShort4 result;
2950 		result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
2951 		result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
2952 		result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
2953 		result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
2954 
2955 		return result;
2956 	}
2957 	else
2958 	{
2959 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2960 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2961 		auto psubusw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2962 		psubusw->addArg(x.value());
2963 		psubusw->addArg(y.value());
2964 		::basicBlock->appendInst(psubusw);
2965 
2966 		return RValue<UShort4>(V(result));
2967 	}
2968 }
2969 
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2970 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2971 {
2972 	RR_DEBUG_INFO_UPDATE_LOC();
2973 	if(emulateIntrinsics)
2974 	{
2975 		UShort4 result;
2976 		result = Insert(result, UShort((UInt(Extract(x, 0)) * UInt(Extract(y, 0))) >> 16), 0);
2977 		result = Insert(result, UShort((UInt(Extract(x, 1)) * UInt(Extract(y, 1))) >> 16), 1);
2978 		result = Insert(result, UShort((UInt(Extract(x, 2)) * UInt(Extract(y, 2))) >> 16), 2);
2979 		result = Insert(result, UShort((UInt(Extract(x, 3)) * UInt(Extract(y, 3))) >> 16), 3);
2980 
2981 		return result;
2982 	}
2983 	else
2984 	{
2985 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2986 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::MultiplyHighUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2987 		auto pmulhuw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2988 		pmulhuw->addArg(x.value());
2989 		pmulhuw->addArg(y.value());
2990 		::basicBlock->appendInst(pmulhuw);
2991 
2992 		return RValue<UShort4>(V(result));
2993 	}
2994 }
2995 
MulHigh(RValue<Int4> x,RValue<Int4> y)2996 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2997 {
2998 	RR_DEBUG_INFO_UPDATE_LOC();
2999 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
3000 
3001 	// Scalarized implementation.
3002 	Int4 result;
3003 	result = Insert(result, Int((Long(Extract(x, 0)) * Long(Extract(y, 0))) >> Long(Int(32))), 0);
3004 	result = Insert(result, Int((Long(Extract(x, 1)) * Long(Extract(y, 1))) >> Long(Int(32))), 1);
3005 	result = Insert(result, Int((Long(Extract(x, 2)) * Long(Extract(y, 2))) >> Long(Int(32))), 2);
3006 	result = Insert(result, Int((Long(Extract(x, 3)) * Long(Extract(y, 3))) >> Long(Int(32))), 3);
3007 
3008 	return result;
3009 }
3010 
MulHigh(RValue<UInt4> x,RValue<UInt4> y)3011 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
3012 {
3013 	RR_DEBUG_INFO_UPDATE_LOC();
3014 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
3015 
3016 	if(false)  // Partial product based implementation.
3017 	{
3018 		auto xh = x >> 16;
3019 		auto yh = y >> 16;
3020 		auto xl = x & UInt4(0x0000FFFF);
3021 		auto yl = y & UInt4(0x0000FFFF);
3022 		auto xlyh = xl * yh;
3023 		auto xhyl = xh * yl;
3024 		auto xlyhh = xlyh >> 16;
3025 		auto xhylh = xhyl >> 16;
3026 		auto xlyhl = xlyh & UInt4(0x0000FFFF);
3027 		auto xhyll = xhyl & UInt4(0x0000FFFF);
3028 		auto xlylh = (xl * yl) >> 16;
3029 		auto oflow = (xlyhl + xhyll + xlylh) >> 16;
3030 
3031 		return (xh * yh) + (xlyhh + xhylh) + oflow;
3032 	}
3033 
3034 	// Scalarized implementation.
3035 	Int4 result;
3036 	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 0))) * Long(UInt(Extract(As<Int4>(y), 0)))) >> Long(Int(32))), 0);
3037 	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 1))) * Long(UInt(Extract(As<Int4>(y), 1)))) >> Long(Int(32))), 1);
3038 	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 2))) * Long(UInt(Extract(As<Int4>(y), 2)))) >> Long(Int(32))), 2);
3039 	result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 3))) * Long(UInt(Extract(As<Int4>(y), 3)))) >> Long(Int(32))), 3);
3040 
3041 	return As<UInt4>(result);
3042 }
3043 
// Not implemented in this backend: reports the call through
// UNIMPLEMENTED_NO_BUG and returns a UShort4 constructed from 0 as a placeholder.
RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	UNIMPLEMENTED_NO_BUG("RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)");
	return UShort4(0);
}
3050 
type()3051 Type *UShort4::type()
3052 {
3053 	return T(Type_v4i16);
3054 }
3055 
Extract(RValue<Short8> val,int i)3056 RValue<Short> Extract(RValue<Short8> val, int i)
3057 {
3058 	RR_DEBUG_INFO_UPDATE_LOC();
3059 	return RValue<Short>(Nucleus::createExtractElement(val.value(), Short::type(), i));
3060 }
3061 
Insert(RValue<Short8> val,RValue<Short> element,int i)3062 RValue<Short8> Insert(RValue<Short8> val, RValue<Short> element, int i)
3063 {
3064 	RR_DEBUG_INFO_UPDATE_LOC();
3065 	return RValue<Short8>(Nucleus::createInsertElement(val.value(), element.value(), i));
3066 }
3067 
operator <<(RValue<Short8> lhs,unsigned char rhs)3068 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
3069 {
3070 	RR_DEBUG_INFO_UPDATE_LOC();
3071 	if(emulateIntrinsics)
3072 	{
3073 		Short8 result;
3074 		result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
3075 		result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
3076 		result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
3077 		result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
3078 		result = Insert(result, Extract(lhs, 4) << Short(rhs), 4);
3079 		result = Insert(result, Extract(lhs, 5) << Short(rhs), 5);
3080 		result = Insert(result, Extract(lhs, 6) << Short(rhs), 6);
3081 		result = Insert(result, Extract(lhs, 7) << Short(rhs), 7);
3082 
3083 		return result;
3084 	}
3085 	else
3086 	{
3087 		return RValue<Short8>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3088 	}
3089 }
3090 
operator >>(RValue<Short8> lhs,unsigned char rhs)3091 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
3092 {
3093 	RR_DEBUG_INFO_UPDATE_LOC();
3094 	if(emulateIntrinsics)
3095 	{
3096 		Short8 result;
3097 		result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
3098 		result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
3099 		result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
3100 		result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
3101 		result = Insert(result, Extract(lhs, 4) >> Short(rhs), 4);
3102 		result = Insert(result, Extract(lhs, 5) >> Short(rhs), 5);
3103 		result = Insert(result, Extract(lhs, 6) >> Short(rhs), 6);
3104 		result = Insert(result, Extract(lhs, 7) >> Short(rhs), 7);
3105 
3106 		return result;
3107 	}
3108 	else
3109 	{
3110 		return RValue<Short8>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3111 	}
3112 }
3113 
// Not implemented in this backend: reports the call through
// UNIMPLEMENTED_NO_BUG and returns an Int4 constructed from 0 as a placeholder.
RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	UNIMPLEMENTED_NO_BUG("RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)");
	return Int4(0);
}
3120 
// Not implemented in this backend: reports the call through
// UNIMPLEMENTED_NO_BUG and returns a Short8 constructed from 0 as a placeholder.
RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	UNIMPLEMENTED_NO_BUG("RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)");
	return Short8(0);
}
3127 
type()3128 Type *Short8::type()
3129 {
3130 	return T(Ice::IceType_v8i16);
3131 }
3132 
Extract(RValue<UShort8> val,int i)3133 RValue<UShort> Extract(RValue<UShort8> val, int i)
3134 {
3135 	RR_DEBUG_INFO_UPDATE_LOC();
3136 	return RValue<UShort>(Nucleus::createExtractElement(val.value(), UShort::type(), i));
3137 }
3138 
Insert(RValue<UShort8> val,RValue<UShort> element,int i)3139 RValue<UShort8> Insert(RValue<UShort8> val, RValue<UShort> element, int i)
3140 {
3141 	RR_DEBUG_INFO_UPDATE_LOC();
3142 	return RValue<UShort8>(Nucleus::createInsertElement(val.value(), element.value(), i));
3143 }
3144 
operator <<(RValue<UShort8> lhs,unsigned char rhs)3145 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
3146 {
3147 	RR_DEBUG_INFO_UPDATE_LOC();
3148 	if(emulateIntrinsics)
3149 	{
3150 		UShort8 result;
3151 		result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
3152 		result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
3153 		result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
3154 		result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
3155 		result = Insert(result, Extract(lhs, 4) << UShort(rhs), 4);
3156 		result = Insert(result, Extract(lhs, 5) << UShort(rhs), 5);
3157 		result = Insert(result, Extract(lhs, 6) << UShort(rhs), 6);
3158 		result = Insert(result, Extract(lhs, 7) << UShort(rhs), 7);
3159 
3160 		return result;
3161 	}
3162 	else
3163 	{
3164 		return RValue<UShort8>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3165 	}
3166 }
3167 
operator >>(RValue<UShort8> lhs,unsigned char rhs)3168 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
3169 {
3170 	RR_DEBUG_INFO_UPDATE_LOC();
3171 	if(emulateIntrinsics)
3172 	{
3173 		UShort8 result;
3174 		result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
3175 		result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
3176 		result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
3177 		result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
3178 		result = Insert(result, Extract(lhs, 4) >> UShort(rhs), 4);
3179 		result = Insert(result, Extract(lhs, 5) >> UShort(rhs), 5);
3180 		result = Insert(result, Extract(lhs, 6) >> UShort(rhs), 6);
3181 		result = Insert(result, Extract(lhs, 7) >> UShort(rhs), 7);
3182 
3183 		return result;
3184 	}
3185 	else
3186 	{
3187 		return RValue<UShort8>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3188 	}
3189 }
3190 
// Not implemented in this backend: reports the call through
// UNIMPLEMENTED_NO_BUG and returns a UShort8 constructed from 0 as a placeholder.
RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	UNIMPLEMENTED_NO_BUG("RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)");
	return UShort8(0);
}
3197 
type()3198 Type *UShort8::type()
3199 {
3200 	return T(Ice::IceType_v8i16);
3201 }
3202 
operator ++(Int & val,int)3203 RValue<Int> operator++(Int &val, int)  // Post-increment
3204 {
3205 	RR_DEBUG_INFO_UPDATE_LOC();
3206 	RValue<Int> res = val;
3207 	val += 1;
3208 	return res;
3209 }
3210 
// Adds 1 to val in place and returns a reference to the updated variable.
const Int &operator++(Int &val)  // Pre-increment
{
	RR_DEBUG_INFO_UPDATE_LOC();
	val += 1;
	return val;
}
3217 
operator --(Int & val,int)3218 RValue<Int> operator--(Int &val, int)  // Post-decrement
3219 {
3220 	RR_DEBUG_INFO_UPDATE_LOC();
3221 	RValue<Int> res = val;
3222 	val -= 1;
3223 	return res;
3224 }
3225 
// Subtracts 1 from val in place and returns a reference to the updated variable.
const Int &operator--(Int &val)  // Pre-decrement
{
	RR_DEBUG_INFO_UPDATE_LOC();
	val -= 1;
	return val;
}
3232 
RoundInt(RValue<Float> cast)3233 RValue<Int> RoundInt(RValue<Float> cast)
3234 {
3235 	RR_DEBUG_INFO_UPDATE_LOC();
3236 	if(emulateIntrinsics || CPUID::ARM)
3237 	{
3238 		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
3239 		return Int((cast + Float(0x00C00000)) - Float(0x00C00000));
3240 	}
3241 	else
3242 	{
3243 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
3244 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3245 		auto nearbyint = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3246 		nearbyint->addArg(cast.value());
3247 		::basicBlock->appendInst(nearbyint);
3248 
3249 		return RValue<Int>(V(result));
3250 	}
3251 }
3252 
type()3253 Type *Int::type()
3254 {
3255 	return T(Ice::IceType_i32);
3256 }
3257 
type()3258 Type *Long::type()
3259 {
3260 	return T(Ice::IceType_i64);
3261 }
3262 
// Constructs a UInt from a Float. Values with the sign bit set store 0;
// values at or above 2^31 are converted by offsetting into the Int range
// and adding the offset back afterwards.
UInt::UInt(RValue<Float> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// Smallest positive value representable in UInt, but not in Int
	const unsigned int ustart = 0x80000000u;
	const float ustartf = float(ustart);

	// If the value is negative, store 0, otherwise store the result of the conversion
	// (the float's bits are shifted right by 31 and inverted to form the mask).
	storeValue((~(As<Int>(cast) >> 31) &
	            // Check if the value can be represented as an Int
	            IfThenElse(cast >= ustartf,
	                       // If the value is too large, subtract ustart and re-add it after conversion.
	                       As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
	                       // Otherwise, just convert normally
	                       Int(cast)))
	               .value());
}
3280 
operator ++(UInt & val,int)3281 RValue<UInt> operator++(UInt &val, int)  // Post-increment
3282 {
3283 	RR_DEBUG_INFO_UPDATE_LOC();
3284 	RValue<UInt> res = val;
3285 	val += 1;
3286 	return res;
3287 }
3288 
// Adds 1 to val in place and returns a reference to the updated variable.
const UInt &operator++(UInt &val)  // Pre-increment
{
	RR_DEBUG_INFO_UPDATE_LOC();
	val += 1;
	return val;
}
3295 
operator --(UInt & val,int)3296 RValue<UInt> operator--(UInt &val, int)  // Post-decrement
3297 {
3298 	RR_DEBUG_INFO_UPDATE_LOC();
3299 	RValue<UInt> res = val;
3300 	val -= 1;
3301 	return res;
3302 }
3303 
// Subtracts 1 from val in place and returns a reference to the updated variable.
const UInt &operator--(UInt &val)  // Pre-decrement
{
	RR_DEBUG_INFO_UPDATE_LOC();
	val -= 1;
	return val;
}
3310 
3311 //	RValue<UInt> RoundUInt(RValue<Float> cast)
3312 //	{
3313 //		ASSERT(false && "UNIMPLEMENTED"); return RValue<UInt>(V(nullptr));
3314 //	}
3315 
type()3316 Type *UInt::type()
3317 {
3318 	return T(Ice::IceType_i32);
3319 }
3320 
3321 //	Int2::Int2(RValue<Int> cast)
3322 //	{
3323 //		Value *extend = Nucleus::createZExt(cast.value(), Long::type());
3324 //		Value *vector = Nucleus::createBitCast(extend, Int2::type());
3325 //
3326 //		Constant *shuffle[2];
3327 //		shuffle[0] = Nucleus::createConstantInt(0);
3328 //		shuffle[1] = Nucleus::createConstantInt(0);
3329 //
3330 //		Value *replicate = Nucleus::createShuffleVector(vector, UndefValue::get(Int2::type()), Nucleus::createConstantVector(shuffle, 2));
3331 //
3332 //		storeValue(replicate);
3333 //	}
3334 
operator <<(RValue<Int2> lhs,unsigned char rhs)3335 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
3336 {
3337 	RR_DEBUG_INFO_UPDATE_LOC();
3338 	if(emulateIntrinsics)
3339 	{
3340 		Int2 result;
3341 		result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
3342 		result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
3343 
3344 		return result;
3345 	}
3346 	else
3347 	{
3348 		return RValue<Int2>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3349 	}
3350 }
3351 
operator >>(RValue<Int2> lhs,unsigned char rhs)3352 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
3353 {
3354 	RR_DEBUG_INFO_UPDATE_LOC();
3355 	if(emulateIntrinsics)
3356 	{
3357 		Int2 result;
3358 		result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
3359 		result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
3360 
3361 		return result;
3362 	}
3363 	else
3364 	{
3365 		return RValue<Int2>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3366 	}
3367 }
3368 
type()3369 Type *Int2::type()
3370 {
3371 	return T(Type_v2i32);
3372 }
3373 
operator <<(RValue<UInt2> lhs,unsigned char rhs)3374 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
3375 {
3376 	RR_DEBUG_INFO_UPDATE_LOC();
3377 	if(emulateIntrinsics)
3378 	{
3379 		UInt2 result;
3380 		result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
3381 		result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
3382 
3383 		return result;
3384 	}
3385 	else
3386 	{
3387 		return RValue<UInt2>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3388 	}
3389 }
3390 
operator >>(RValue<UInt2> lhs,unsigned char rhs)3391 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
3392 {
3393 	RR_DEBUG_INFO_UPDATE_LOC();
3394 	if(emulateIntrinsics)
3395 	{
3396 		UInt2 result;
3397 		result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
3398 		result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
3399 
3400 		return result;
3401 	}
3402 	else
3403 	{
3404 		return RValue<UInt2>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3405 	}
3406 }
3407 
type()3408 Type *UInt2::type()
3409 {
3410 	return T(Type_v2i32);
3411 }
3412 
Int4(RValue<Byte4> cast)3413 Int4::Int4(RValue<Byte4> cast)
3414     : XYZW(this)
3415 {
3416 	RR_DEBUG_INFO_UPDATE_LOC();
3417 	Value *x = Nucleus::createBitCast(cast.value(), Int::type());
3418 	Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
3419 
3420 	Value *e;
3421 	int swizzle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
3422 	Value *b = Nucleus::createBitCast(a, Byte16::type());
3423 	Value *c = Nucleus::createShuffleVector(b, Nucleus::createNullValue(Byte16::type()), swizzle);
3424 
3425 	int swizzle2[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
3426 	Value *d = Nucleus::createBitCast(c, Short8::type());
3427 	e = Nucleus::createShuffleVector(d, Nucleus::createNullValue(Short8::type()), swizzle2);
3428 
3429 	Value *f = Nucleus::createBitCast(e, Int4::type());
3430 	storeValue(f);
3431 }
3432 
Int4(RValue<SByte4> cast)3433 Int4::Int4(RValue<SByte4> cast)
3434     : XYZW(this)
3435 {
3436 	RR_DEBUG_INFO_UPDATE_LOC();
3437 	Value *x = Nucleus::createBitCast(cast.value(), Int::type());
3438 	Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
3439 
3440 	int swizzle[16] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
3441 	Value *b = Nucleus::createBitCast(a, Byte16::type());
3442 	Value *c = Nucleus::createShuffleVector(b, b, swizzle);
3443 
3444 	int swizzle2[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
3445 	Value *d = Nucleus::createBitCast(c, Short8::type());
3446 	Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
3447 
3448 	*this = As<Int4>(e) >> 24;
3449 }
3450 
Int4(RValue<Short4> cast)3451 Int4::Int4(RValue<Short4> cast)
3452     : XYZW(this)
3453 {
3454 	RR_DEBUG_INFO_UPDATE_LOC();
3455 	int swizzle[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
3456 	Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
3457 
3458 	*this = As<Int4>(c) >> 16;
3459 }
3460 
Int4(RValue<UShort4> cast)3461 Int4::Int4(RValue<UShort4> cast)
3462     : XYZW(this)
3463 {
3464 	RR_DEBUG_INFO_UPDATE_LOC();
3465 	int swizzle[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
3466 	Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
3467 	Value *d = Nucleus::createBitCast(c, Int4::type());
3468 	storeValue(d);
3469 }
3470 
Int4(RValue<Int> rhs)3471 Int4::Int4(RValue<Int> rhs)
3472     : XYZW(this)
3473 {
3474 	RR_DEBUG_INFO_UPDATE_LOC();
3475 	Value *vector = Nucleus::createBitCast(rhs.value(), Int4::type());
3476 
3477 	int swizzle[4] = { 0, 0, 0, 0 };
3478 	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
3479 
3480 	storeValue(replicate);
3481 }
3482 
operator <<(RValue<Int4> lhs,unsigned char rhs)3483 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
3484 {
3485 	RR_DEBUG_INFO_UPDATE_LOC();
3486 	if(emulateIntrinsics)
3487 	{
3488 		Int4 result;
3489 		result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
3490 		result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
3491 		result = Insert(result, Extract(lhs, 2) << Int(rhs), 2);
3492 		result = Insert(result, Extract(lhs, 3) << Int(rhs), 3);
3493 
3494 		return result;
3495 	}
3496 	else
3497 	{
3498 		return RValue<Int4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3499 	}
3500 }
3501 
operator >>(RValue<Int4> lhs,unsigned char rhs)3502 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
3503 {
3504 	RR_DEBUG_INFO_UPDATE_LOC();
3505 	if(emulateIntrinsics)
3506 	{
3507 		Int4 result;
3508 		result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
3509 		result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
3510 		result = Insert(result, Extract(lhs, 2) >> Int(rhs), 2);
3511 		result = Insert(result, Extract(lhs, 3) >> Int(rhs), 3);
3512 
3513 		return result;
3514 	}
3515 	else
3516 	{
3517 		return RValue<Int4>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3518 	}
3519 }
3520 
CmpEQ(RValue<Int4> x,RValue<Int4> y)3521 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
3522 {
3523 	RR_DEBUG_INFO_UPDATE_LOC();
3524 	return RValue<Int4>(Nucleus::createICmpEQ(x.value(), y.value()));
3525 }
3526 
CmpLT(RValue<Int4> x,RValue<Int4> y)3527 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
3528 {
3529 	RR_DEBUG_INFO_UPDATE_LOC();
3530 	return RValue<Int4>(Nucleus::createICmpSLT(x.value(), y.value()));
3531 }
3532 
CmpLE(RValue<Int4> x,RValue<Int4> y)3533 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
3534 {
3535 	RR_DEBUG_INFO_UPDATE_LOC();
3536 	return RValue<Int4>(Nucleus::createICmpSLE(x.value(), y.value()));
3537 }
3538 
CmpNEQ(RValue<Int4> x,RValue<Int4> y)3539 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
3540 {
3541 	RR_DEBUG_INFO_UPDATE_LOC();
3542 	return RValue<Int4>(Nucleus::createICmpNE(x.value(), y.value()));
3543 }
3544 
CmpNLT(RValue<Int4> x,RValue<Int4> y)3545 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
3546 {
3547 	RR_DEBUG_INFO_UPDATE_LOC();
3548 	return RValue<Int4>(Nucleus::createICmpSGE(x.value(), y.value()));
3549 }
3550 
CmpNLE(RValue<Int4> x,RValue<Int4> y)3551 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
3552 {
3553 	RR_DEBUG_INFO_UPDATE_LOC();
3554 	return RValue<Int4>(Nucleus::createICmpSGT(x.value(), y.value()));
3555 }
3556 
Max(RValue<Int4> x,RValue<Int4> y)3557 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
3558 {
3559 	RR_DEBUG_INFO_UPDATE_LOC();
3560 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3561 	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value(), y.value());
3562 	::basicBlock->appendInst(cmp);
3563 
3564 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3565 	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
3566 	::basicBlock->appendInst(select);
3567 
3568 	return RValue<Int4>(V(result));
3569 }
3570 
Min(RValue<Int4> x,RValue<Int4> y)3571 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
3572 {
3573 	RR_DEBUG_INFO_UPDATE_LOC();
3574 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3575 	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value(), y.value());
3576 	::basicBlock->appendInst(cmp);
3577 
3578 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3579 	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
3580 	::basicBlock->appendInst(select);
3581 
3582 	return RValue<Int4>(V(result));
3583 }
3584 
RoundInt(RValue<Float4> cast)3585 RValue<Int4> RoundInt(RValue<Float4> cast)
3586 {
3587 	RR_DEBUG_INFO_UPDATE_LOC();
3588 	if(emulateIntrinsics || CPUID::ARM)
3589 	{
3590 		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
3591 		return Int4((cast + Float4(0x00C00000)) - Float4(0x00C00000));
3592 	}
3593 	else
3594 	{
3595 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3596 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3597 		auto nearbyint = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3598 		nearbyint->addArg(cast.value());
3599 		::basicBlock->appendInst(nearbyint);
3600 
3601 		return RValue<Int4>(V(result));
3602 	}
3603 }
3604 
RoundIntClamped(RValue<Float4> cast)3605 RValue<Int4> RoundIntClamped(RValue<Float4> cast)
3606 {
3607 	RR_DEBUG_INFO_UPDATE_LOC();
3608 
3609 	// cvtps2dq produces 0x80000000, a negative value, for input larger than
3610 	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
3611 	// saturate to 0x80000000.
3612 	RValue<Float4> clamped = Min(cast, Float4(0x7FFFFF80));
3613 
3614 	if(emulateIntrinsics || CPUID::ARM)
3615 	{
3616 		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
3617 		return Int4((clamped + Float4(0x00C00000)) - Float4(0x00C00000));
3618 	}
3619 	else
3620 	{
3621 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3622 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3623 		auto nearbyint = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3624 		nearbyint->addArg(clamped.value());
3625 		::basicBlock->appendInst(nearbyint);
3626 
3627 		return RValue<Int4>(V(result));
3628 	}
3629 }
3630 
PackSigned(RValue<Int4> x,RValue<Int4> y)3631 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
3632 {
3633 	RR_DEBUG_INFO_UPDATE_LOC();
3634 	if(emulateIntrinsics)
3635 	{
3636 		Short8 result;
3637 		result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
3638 		result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
3639 		result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
3640 		result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
3641 		result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
3642 		result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
3643 		result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
3644 		result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
3645 
3646 		return result;
3647 	}
3648 	else
3649 	{
3650 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
3651 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3652 		auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
3653 		pack->addArg(x.value());
3654 		pack->addArg(y.value());
3655 		::basicBlock->appendInst(pack);
3656 
3657 		return RValue<Short8>(V(result));
3658 	}
3659 }
3660 
PackUnsigned(RValue<Int4> x,RValue<Int4> y)3661 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
3662 {
3663 	RR_DEBUG_INFO_UPDATE_LOC();
3664 	if(emulateIntrinsics || !(CPUID::SSE4_1 || CPUID::ARM))
3665 	{
3666 		RValue<Int4> sx = As<Int4>(x);
3667 		RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000);
3668 
3669 		RValue<Int4> sy = As<Int4>(y);
3670 		RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000);
3671 
3672 		return As<UShort8>(PackSigned(bx, by) + Short8(0x8000u));
3673 	}
3674 	else
3675 	{
3676 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
3677 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3678 		auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
3679 		pack->addArg(x.value());
3680 		pack->addArg(y.value());
3681 		::basicBlock->appendInst(pack);
3682 
3683 		return RValue<UShort8>(V(result));
3684 	}
3685 }
3686 
SignMask(RValue<Int4> x)3687 RValue<Int> SignMask(RValue<Int4> x)
3688 {
3689 	RR_DEBUG_INFO_UPDATE_LOC();
3690 	if(emulateIntrinsics || CPUID::ARM)
3691 	{
3692 		Int4 xx = (x >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
3693 		return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
3694 	}
3695 	else
3696 	{
3697 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
3698 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3699 		auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3700 		movmsk->addArg(x.value());
3701 		::basicBlock->appendInst(movmsk);
3702 
3703 		return RValue<Int>(V(result));
3704 	}
3705 }
3706 
type()3707 Type *Int4::type()
3708 {
3709 	return T(Ice::IceType_v4i32);
3710 }
3711 
UInt4(RValue<Float4> cast)3712 UInt4::UInt4(RValue<Float4> cast)
3713     : XYZW(this)
3714 {
3715 	RR_DEBUG_INFO_UPDATE_LOC();
3716 	// Smallest positive value representable in UInt, but not in Int
3717 	const unsigned int ustart = 0x80000000u;
3718 	const float ustartf = float(ustart);
3719 
3720 	// Check if the value can be represented as an Int
3721 	Int4 uiValue = CmpNLT(cast, Float4(ustartf));
3722 	// If the value is too large, subtract ustart and re-add it after conversion.
3723 	uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
3724 	          // Otherwise, just convert normally
3725 	          (~uiValue & Int4(cast));
3726 	// If the value is negative, store 0, otherwise store the result of the conversion
3727 	storeValue((~(As<Int4>(cast) >> 31) & uiValue).value());
3728 }
3729 
UInt4(RValue<UInt> rhs)3730 UInt4::UInt4(RValue<UInt> rhs)
3731     : XYZW(this)
3732 {
3733 	RR_DEBUG_INFO_UPDATE_LOC();
3734 	Value *vector = Nucleus::createBitCast(rhs.value(), UInt4::type());
3735 
3736 	int swizzle[4] = { 0, 0, 0, 0 };
3737 	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
3738 
3739 	storeValue(replicate);
3740 }
3741 
operator <<(RValue<UInt4> lhs,unsigned char rhs)3742 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
3743 {
3744 	RR_DEBUG_INFO_UPDATE_LOC();
3745 	if(emulateIntrinsics)
3746 	{
3747 		UInt4 result;
3748 		result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
3749 		result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
3750 		result = Insert(result, Extract(lhs, 2) << UInt(rhs), 2);
3751 		result = Insert(result, Extract(lhs, 3) << UInt(rhs), 3);
3752 
3753 		return result;
3754 	}
3755 	else
3756 	{
3757 		return RValue<UInt4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3758 	}
3759 }
3760 
operator >>(RValue<UInt4> lhs,unsigned char rhs)3761 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
3762 {
3763 	RR_DEBUG_INFO_UPDATE_LOC();
3764 	if(emulateIntrinsics)
3765 	{
3766 		UInt4 result;
3767 		result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
3768 		result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
3769 		result = Insert(result, Extract(lhs, 2) >> UInt(rhs), 2);
3770 		result = Insert(result, Extract(lhs, 3) >> UInt(rhs), 3);
3771 
3772 		return result;
3773 	}
3774 	else
3775 	{
3776 		return RValue<UInt4>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3777 	}
3778 }
3779 
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)3780 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
3781 {
3782 	RR_DEBUG_INFO_UPDATE_LOC();
3783 	return RValue<UInt4>(Nucleus::createICmpEQ(x.value(), y.value()));
3784 }
3785 
CmpLT(RValue<UInt4> x,RValue<UInt4> y)3786 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
3787 {
3788 	RR_DEBUG_INFO_UPDATE_LOC();
3789 	return RValue<UInt4>(Nucleus::createICmpULT(x.value(), y.value()));
3790 }
3791 
CmpLE(RValue<UInt4> x,RValue<UInt4> y)3792 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
3793 {
3794 	RR_DEBUG_INFO_UPDATE_LOC();
3795 	return RValue<UInt4>(Nucleus::createICmpULE(x.value(), y.value()));
3796 }
3797 
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)3798 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
3799 {
3800 	RR_DEBUG_INFO_UPDATE_LOC();
3801 	return RValue<UInt4>(Nucleus::createICmpNE(x.value(), y.value()));
3802 }
3803 
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)3804 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
3805 {
3806 	RR_DEBUG_INFO_UPDATE_LOC();
3807 	return RValue<UInt4>(Nucleus::createICmpUGE(x.value(), y.value()));
3808 }
3809 
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)3810 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
3811 {
3812 	RR_DEBUG_INFO_UPDATE_LOC();
3813 	return RValue<UInt4>(Nucleus::createICmpUGT(x.value(), y.value()));
3814 }
3815 
Max(RValue<UInt4> x,RValue<UInt4> y)3816 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
3817 {
3818 	RR_DEBUG_INFO_UPDATE_LOC();
3819 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3820 	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value(), y.value());
3821 	::basicBlock->appendInst(cmp);
3822 
3823 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3824 	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
3825 	::basicBlock->appendInst(select);
3826 
3827 	return RValue<UInt4>(V(result));
3828 }
3829 
Min(RValue<UInt4> x,RValue<UInt4> y)3830 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
3831 {
3832 	RR_DEBUG_INFO_UPDATE_LOC();
3833 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3834 	auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value(), y.value());
3835 	::basicBlock->appendInst(cmp);
3836 
3837 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3838 	auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
3839 	::basicBlock->appendInst(select);
3840 
3841 	return RValue<UInt4>(V(result));
3842 }
3843 
type()3844 Type *UInt4::type()
3845 {
3846 	return T(Ice::IceType_v4i32);
3847 }
3848 
type()3849 Type *Half::type()
3850 {
3851 	return T(Ice::IceType_i16);
3852 }
3853 
Rcp_pp(RValue<Float> x,bool exactAtPow2)3854 RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
3855 {
3856 	RR_DEBUG_INFO_UPDATE_LOC();
3857 	return 1.0f / x;
3858 }
3859 
RcpSqrt_pp(RValue<Float> x)3860 RValue<Float> RcpSqrt_pp(RValue<Float> x)
3861 {
3862 	RR_DEBUG_INFO_UPDATE_LOC();
3863 	return Rcp_pp(Sqrt(x));
3864 }
3865 
Sqrt(RValue<Float> x)3866 RValue<Float> Sqrt(RValue<Float> x)
3867 {
3868 	RR_DEBUG_INFO_UPDATE_LOC();
3869 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_f32);
3870 	const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3871 	auto sqrt = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3872 	sqrt->addArg(x.value());
3873 	::basicBlock->appendInst(sqrt);
3874 
3875 	return RValue<Float>(V(result));
3876 }
3877 
Round(RValue<Float> x)3878 RValue<Float> Round(RValue<Float> x)
3879 {
3880 	RR_DEBUG_INFO_UPDATE_LOC();
3881 	return Float4(Round(Float4(x))).x;
3882 }
3883 
Trunc(RValue<Float> x)3884 RValue<Float> Trunc(RValue<Float> x)
3885 {
3886 	RR_DEBUG_INFO_UPDATE_LOC();
3887 	return Float4(Trunc(Float4(x))).x;
3888 }
3889 
Frac(RValue<Float> x)3890 RValue<Float> Frac(RValue<Float> x)
3891 {
3892 	RR_DEBUG_INFO_UPDATE_LOC();
3893 	return Float4(Frac(Float4(x))).x;
3894 }
3895 
Floor(RValue<Float> x)3896 RValue<Float> Floor(RValue<Float> x)
3897 {
3898 	RR_DEBUG_INFO_UPDATE_LOC();
3899 	return Float4(Floor(Float4(x))).x;
3900 }
3901 
Ceil(RValue<Float> x)3902 RValue<Float> Ceil(RValue<Float> x)
3903 {
3904 	RR_DEBUG_INFO_UPDATE_LOC();
3905 	return Float4(Ceil(Float4(x))).x;
3906 }
3907 
type()3908 Type *Float::type()
3909 {
3910 	return T(Ice::IceType_f32);
3911 }
3912 
type()3913 Type *Float2::type()
3914 {
3915 	return T(Type_v2f32);
3916 }
3917 
Float4(RValue<Float> rhs)3918 Float4::Float4(RValue<Float> rhs)
3919     : XYZW(this)
3920 {
3921 	RR_DEBUG_INFO_UPDATE_LOC();
3922 	Value *vector = Nucleus::createBitCast(rhs.value(), Float4::type());
3923 
3924 	int swizzle[4] = { 0, 0, 0, 0 };
3925 	Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
3926 
3927 	storeValue(replicate);
3928 }
3929 
Max(RValue<Float4> x,RValue<Float4> y)3930 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3931 {
3932 	RR_DEBUG_INFO_UPDATE_LOC();
3933 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3934 	auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Ogt, condition, x.value(), y.value());
3935 	::basicBlock->appendInst(cmp);
3936 
3937 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3938 	auto select = Ice::InstSelect::create(::function, result, condition, x.value(), y.value());
3939 	::basicBlock->appendInst(select);
3940 
3941 	return RValue<Float4>(V(result));
3942 }
3943 
Min(RValue<Float4> x,RValue<Float4> y)3944 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3945 {
3946 	RR_DEBUG_INFO_UPDATE_LOC();
3947 	Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3948 	auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Olt, condition, x.value(), y.value());
3949 	::basicBlock->appendInst(cmp);
3950 
3951 	Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3952 	auto select = Ice::InstSelect::create(::function, result, condition, x.value(), y.value());
3953 	::basicBlock->appendInst(select);
3954 
3955 	return RValue<Float4>(V(result));
3956 }
3957 
Rcp_pp(RValue<Float4> x,bool exactAtPow2)3958 RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
3959 {
3960 	RR_DEBUG_INFO_UPDATE_LOC();
3961 	return Float4(1.0f) / x;
3962 }
3963 
RcpSqrt_pp(RValue<Float4> x)3964 RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
3965 {
3966 	RR_DEBUG_INFO_UPDATE_LOC();
3967 	return Rcp_pp(Sqrt(x));
3968 }
3969 
// Reports whether an approximate-reciprocal instruction is available.
// Always false in this backend for now.
bool HasRcpApprox()
{
	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
	const bool supported = false;
	return supported;
}
3975 
RcpApprox(RValue<Float4> x,bool exactAtPow2)3976 RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
3977 {
3978 	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
3979 	UNREACHABLE("RValue<Float4> RcpApprox()");
3980 	return { 0.0f };
3981 }
3982 
RcpApprox(RValue<Float> x,bool exactAtPow2)3983 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
3984 {
3985 	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
3986 	UNREACHABLE("RValue<Float> RcpApprox()");
3987 	return { 0.0f };
3988 }
3989 
// Reports whether an approximate reciprocal-square-root instruction is
// available. Always false in this backend for now.
bool HasRcpSqrtApprox()
{
	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
	const bool supported = false;
	return supported;
}
3994 
RcpSqrtApprox(RValue<Float4> x)3995 RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
3996 {
3997 	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
3998 	UNREACHABLE("RValue<Float4> RcpSqrtApprox()");
3999 	return { 0.0f };
4000 }
4001 
RcpSqrtApprox(RValue<Float> x)4002 RValue<Float> RcpSqrtApprox(RValue<Float> x)
4003 {
4004 	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
4005 	UNREACHABLE("RValue<Float> RcpSqrtApprox()");
4006 	return { 0.0f };
4007 }
4008 
Sqrt(RValue<Float4> x)4009 RValue<Float4> Sqrt(RValue<Float4> x)
4010 {
4011 	RR_DEBUG_INFO_UPDATE_LOC();
4012 	if(emulateIntrinsics || CPUID::ARM)
4013 	{
4014 		Float4 result;
4015 		result.x = Sqrt(Float(Float4(x).x));
4016 		result.y = Sqrt(Float(Float4(x).y));
4017 		result.z = Sqrt(Float(Float4(x).z));
4018 		result.w = Sqrt(Float(Float4(x).w));
4019 
4020 		return result;
4021 	}
4022 	else
4023 	{
4024 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4025 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4026 		auto sqrt = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
4027 		sqrt->addArg(x.value());
4028 		::basicBlock->appendInst(sqrt);
4029 
4030 		return RValue<Float4>(V(result));
4031 	}
4032 }
4033 
SignMask(RValue<Float4> x)4034 RValue<Int> SignMask(RValue<Float4> x)
4035 {
4036 	RR_DEBUG_INFO_UPDATE_LOC();
4037 	if(emulateIntrinsics || CPUID::ARM)
4038 	{
4039 		Int4 xx = (As<Int4>(x) >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
4040 		return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
4041 	}
4042 	else
4043 	{
4044 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
4045 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4046 		auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
4047 		movmsk->addArg(x.value());
4048 		::basicBlock->appendInst(movmsk);
4049 
4050 		return RValue<Int>(V(result));
4051 	}
4052 }
4053 
CmpEQ(RValue<Float4> x,RValue<Float4> y)4054 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
4055 {
4056 	RR_DEBUG_INFO_UPDATE_LOC();
4057 	return RValue<Int4>(Nucleus::createFCmpOEQ(x.value(), y.value()));
4058 }
4059 
CmpLT(RValue<Float4> x,RValue<Float4> y)4060 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
4061 {
4062 	RR_DEBUG_INFO_UPDATE_LOC();
4063 	return RValue<Int4>(Nucleus::createFCmpOLT(x.value(), y.value()));
4064 }
4065 
CmpLE(RValue<Float4> x,RValue<Float4> y)4066 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
4067 {
4068 	RR_DEBUG_INFO_UPDATE_LOC();
4069 	return RValue<Int4>(Nucleus::createFCmpOLE(x.value(), y.value()));
4070 }
4071 
CmpNEQ(RValue<Float4> x,RValue<Float4> y)4072 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
4073 {
4074 	RR_DEBUG_INFO_UPDATE_LOC();
4075 	return RValue<Int4>(Nucleus::createFCmpONE(x.value(), y.value()));
4076 }
4077 
CmpNLT(RValue<Float4> x,RValue<Float4> y)4078 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
4079 {
4080 	RR_DEBUG_INFO_UPDATE_LOC();
4081 	return RValue<Int4>(Nucleus::createFCmpOGE(x.value(), y.value()));
4082 }
4083 
CmpNLE(RValue<Float4> x,RValue<Float4> y)4084 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
4085 {
4086 	RR_DEBUG_INFO_UPDATE_LOC();
4087 	return RValue<Int4>(Nucleus::createFCmpOGT(x.value(), y.value()));
4088 }
4089 
CmpUEQ(RValue<Float4> x,RValue<Float4> y)4090 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
4091 {
4092 	RR_DEBUG_INFO_UPDATE_LOC();
4093 	return RValue<Int4>(Nucleus::createFCmpUEQ(x.value(), y.value()));
4094 }
4095 
CmpULT(RValue<Float4> x,RValue<Float4> y)4096 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
4097 {
4098 	RR_DEBUG_INFO_UPDATE_LOC();
4099 	return RValue<Int4>(Nucleus::createFCmpULT(x.value(), y.value()));
4100 }
4101 
CmpULE(RValue<Float4> x,RValue<Float4> y)4102 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
4103 {
4104 	RR_DEBUG_INFO_UPDATE_LOC();
4105 	return RValue<Int4>(Nucleus::createFCmpULE(x.value(), y.value()));
4106 }
4107 
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)4108 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
4109 {
4110 	RR_DEBUG_INFO_UPDATE_LOC();
4111 	return RValue<Int4>(Nucleus::createFCmpUNE(x.value(), y.value()));
4112 }
4113 
CmpUNLT(RValue<Float4> x,RValue<Float4> y)4114 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
4115 {
4116 	RR_DEBUG_INFO_UPDATE_LOC();
4117 	return RValue<Int4>(Nucleus::createFCmpUGE(x.value(), y.value()));
4118 }
4119 
CmpUNLE(RValue<Float4> x,RValue<Float4> y)4120 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
4121 {
4122 	RR_DEBUG_INFO_UPDATE_LOC();
4123 	return RValue<Int4>(Nucleus::createFCmpUGT(x.value(), y.value()));
4124 }
4125 
Round(RValue<Float4> x)4126 RValue<Float4> Round(RValue<Float4> x)
4127 {
4128 	RR_DEBUG_INFO_UPDATE_LOC();
4129 	if(emulateIntrinsics || CPUID::ARM)
4130 	{
4131 		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
4132 		return (x + Float4(0x00C00000)) - Float4(0x00C00000);
4133 	}
4134 	else if(CPUID::SSE4_1)
4135 	{
4136 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4137 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4138 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
4139 		round->addArg(x.value());
4140 		round->addArg(::context->getConstantInt32(0));
4141 		::basicBlock->appendInst(round);
4142 
4143 		return RValue<Float4>(V(result));
4144 	}
4145 	else
4146 	{
4147 		return Float4(RoundInt(x));
4148 	}
4149 }
4150 
Trunc(RValue<Float4> x)4151 RValue<Float4> Trunc(RValue<Float4> x)
4152 {
4153 	RR_DEBUG_INFO_UPDATE_LOC();
4154 	if(CPUID::SSE4_1)
4155 	{
4156 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4157 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4158 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
4159 		round->addArg(x.value());
4160 		round->addArg(::context->getConstantInt32(3));
4161 		::basicBlock->appendInst(round);
4162 
4163 		return RValue<Float4>(V(result));
4164 	}
4165 	else
4166 	{
4167 		return Float4(Int4(x));
4168 	}
4169 }
4170 
Frac(RValue<Float4> x)4171 RValue<Float4> Frac(RValue<Float4> x)
4172 {
4173 	RR_DEBUG_INFO_UPDATE_LOC();
4174 	Float4 frc;
4175 
4176 	if(CPUID::SSE4_1)
4177 	{
4178 		frc = x - Floor(x);
4179 	}
4180 	else
4181 	{
4182 		frc = x - Float4(Int4(x));  // Signed fractional part.
4183 
4184 		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));  // Add 1.0 if negative.
4185 	}
4186 
4187 	// x - floor(x) can be 1.0 for very small negative x.
4188 	// Clamp against the value just below 1.0.
4189 	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
4190 }
4191 
Floor(RValue<Float4> x)4192 RValue<Float4> Floor(RValue<Float4> x)
4193 {
4194 	RR_DEBUG_INFO_UPDATE_LOC();
4195 	if(CPUID::SSE4_1)
4196 	{
4197 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4198 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4199 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
4200 		round->addArg(x.value());
4201 		round->addArg(::context->getConstantInt32(1));
4202 		::basicBlock->appendInst(round);
4203 
4204 		return RValue<Float4>(V(result));
4205 	}
4206 	else
4207 	{
4208 		return x - Frac(x);
4209 	}
4210 }
4211 
Ceil(RValue<Float4> x)4212 RValue<Float4> Ceil(RValue<Float4> x)
4213 {
4214 	RR_DEBUG_INFO_UPDATE_LOC();
4215 	if(CPUID::SSE4_1)
4216 	{
4217 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4218 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4219 		auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
4220 		round->addArg(x.value());
4221 		round->addArg(::context->getConstantInt32(2));
4222 		::basicBlock->appendInst(round);
4223 
4224 		return RValue<Float4>(V(result));
4225 	}
4226 	else
4227 	{
4228 		return -Floor(-x);
4229 	}
4230 }
4231 
type()4232 Type *Float4::type()
4233 {
4234 	return T(Ice::IceType_v4f32);
4235 }
4236 
Ticks()4237 RValue<Long> Ticks()
4238 {
4239 	RR_DEBUG_INFO_UPDATE_LOC();
4240 	UNIMPLEMENTED_NO_BUG("RValue<Long> Ticks()");
4241 	return Long(Int(0));
4242 }
4243 
ConstantPointer(void const * ptr)4244 RValue<Pointer<Byte>> ConstantPointer(void const *ptr)
4245 {
4246 	RR_DEBUG_INFO_UPDATE_LOC();
4247 	return RValue<Pointer<Byte>>{ V(sz::getConstantPointer(::context, ptr)) };
4248 }
4249 
ConstantData(void const * data,size_t size)4250 RValue<Pointer<Byte>> ConstantData(void const *data, size_t size)
4251 {
4252 	RR_DEBUG_INFO_UPDATE_LOC();
4253 	return RValue<Pointer<Byte>>{ V(IceConstantData(data, size)) };
4254 }
4255 
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)4256 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
4257 {
4258 	RR_DEBUG_INFO_UPDATE_LOC();
4259 	return V(sz::Call(::function, ::basicBlock, T(retTy), V(fptr.value()), V(args), false));
4260 }
4261 
Breakpoint()4262 void Breakpoint()
4263 {
4264 	RR_DEBUG_INFO_UPDATE_LOC();
4265 	const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Trap, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4266 	auto trap = Ice::InstIntrinsic::create(::function, 0, nullptr, intrinsic);
4267 	::basicBlock->appendInst(trap);
4268 }
4269 
createFence(std::memory_order memoryOrder)4270 void Nucleus::createFence(std::memory_order memoryOrder)
4271 {
4272 	RR_DEBUG_INFO_UPDATE_LOC();
4273 	const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AtomicFence, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4274 	auto inst = Ice::InstIntrinsic::create(::function, 0, nullptr, intrinsic);
4275 	auto order = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrder));
4276 	inst->addArg(order);
4277 	::basicBlock->appendInst(inst);
4278 }
4279 
createMaskedLoad(Value * ptr,Type * elTy,Value * mask,unsigned int alignment,bool zeroMaskedLanes)4280 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
4281 {
4282 	RR_DEBUG_INFO_UPDATE_LOC();
4283 	UNIMPLEMENTED("b/155867273 Subzero createMaskedLoad()");
4284 	return nullptr;
4285 }
4286 
createMaskedStore(Value * ptr,Value * val,Value * mask,unsigned int alignment)4287 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
4288 {
4289 	RR_DEBUG_INFO_UPDATE_LOC();
4290 	UNIMPLEMENTED("b/155867273 Subzero createMaskedStore()");
4291 }
4292 
Gather(RValue<Pointer<Float>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)4293 RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
4294 {
4295 	RR_DEBUG_INFO_UPDATE_LOC();
4296 	return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
4297 }
4298 
Gather(RValue<Pointer<Int>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)4299 RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
4300 {
4301 	RR_DEBUG_INFO_UPDATE_LOC();
4302 	return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
4303 }
4304 
Scatter(RValue<Pointer<Float>> base,RValue<Float4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)4305 void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
4306 {
4307 	RR_DEBUG_INFO_UPDATE_LOC();
4308 	return emulated::Scatter(base, val, offsets, mask, alignment);
4309 }
4310 
Scatter(RValue<Pointer<Int>> base,RValue<Int4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)4311 void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
4312 {
4313 	RR_DEBUG_INFO_UPDATE_LOC();
4314 	return emulated::Scatter(base, val, offsets, mask, alignment);
4315 }
4316 
Exp2(RValue<Float> x)4317 RValue<Float> Exp2(RValue<Float> x)
4318 {
4319 	RR_DEBUG_INFO_UPDATE_LOC();
4320 	return emulated::Exp2(x);
4321 }
4322 
Log2(RValue<Float> x)4323 RValue<Float> Log2(RValue<Float> x)
4324 {
4325 	RR_DEBUG_INFO_UPDATE_LOC();
4326 	return emulated::Log2(x);
4327 }
4328 
Sin(RValue<Float4> x)4329 RValue<Float4> Sin(RValue<Float4> x)
4330 {
4331 	RR_DEBUG_INFO_UPDATE_LOC();
4332 	return optimal::Sin(x);
4333 }
4334 
Cos(RValue<Float4> x)4335 RValue<Float4> Cos(RValue<Float4> x)
4336 {
4337 	RR_DEBUG_INFO_UPDATE_LOC();
4338 	return optimal::Cos(x);
4339 }
4340 
Tan(RValue<Float4> x)4341 RValue<Float4> Tan(RValue<Float4> x)
4342 {
4343 	RR_DEBUG_INFO_UPDATE_LOC();
4344 	return optimal::Tan(x);
4345 }
4346 
Asin(RValue<Float4> x,Precision p)4347 RValue<Float4> Asin(RValue<Float4> x, Precision p)
4348 {
4349 	RR_DEBUG_INFO_UPDATE_LOC();
4350 	if(p == Precision::Full)
4351 	{
4352 		return emulated::Asin(x);
4353 	}
4354 	return optimal::Asin_8_terms(x);
4355 }
4356 
Acos(RValue<Float4> x,Precision p)4357 RValue<Float4> Acos(RValue<Float4> x, Precision p)
4358 {
4359 	RR_DEBUG_INFO_UPDATE_LOC();
4360 	// Surprisingly, deqp-vk's precision.acos.highp/mediump tests pass when using the 4-term polynomial approximation
4361 	// version of acos, unlike for Asin, which requires higher precision algorithms.
4362 	return optimal::Acos_4_terms(x);
4363 }
4364 
Atan(RValue<Float4> x)4365 RValue<Float4> Atan(RValue<Float4> x)
4366 {
4367 	RR_DEBUG_INFO_UPDATE_LOC();
4368 	return optimal::Atan(x);
4369 }
4370 
Sinh(RValue<Float4> x)4371 RValue<Float4> Sinh(RValue<Float4> x)
4372 {
4373 	RR_DEBUG_INFO_UPDATE_LOC();
4374 	return optimal::Sinh(x);
4375 }
4376 
Cosh(RValue<Float4> x)4377 RValue<Float4> Cosh(RValue<Float4> x)
4378 {
4379 	RR_DEBUG_INFO_UPDATE_LOC();
4380 	return optimal::Cosh(x);
4381 }
4382 
Tanh(RValue<Float4> x)4383 RValue<Float4> Tanh(RValue<Float4> x)
4384 {
4385 	RR_DEBUG_INFO_UPDATE_LOC();
4386 	return optimal::Tanh(x);
4387 }
4388 
Asinh(RValue<Float4> x)4389 RValue<Float4> Asinh(RValue<Float4> x)
4390 {
4391 	RR_DEBUG_INFO_UPDATE_LOC();
4392 	return optimal::Asinh(x);
4393 }
4394 
Acosh(RValue<Float4> x)4395 RValue<Float4> Acosh(RValue<Float4> x)
4396 {
4397 	RR_DEBUG_INFO_UPDATE_LOC();
4398 	return optimal::Acosh(x);
4399 }
4400 
Atanh(RValue<Float4> x)4401 RValue<Float4> Atanh(RValue<Float4> x)
4402 {
4403 	RR_DEBUG_INFO_UPDATE_LOC();
4404 	return optimal::Atanh(x);
4405 }
4406 
Atan2(RValue<Float4> x,RValue<Float4> y)4407 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
4408 {
4409 	RR_DEBUG_INFO_UPDATE_LOC();
4410 	return optimal::Atan2(x, y);
4411 }
4412 
Pow(RValue<Float4> x,RValue<Float4> y)4413 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
4414 {
4415 	RR_DEBUG_INFO_UPDATE_LOC();
4416 	return optimal::Pow(x, y);
4417 }
4418 
Exp(RValue<Float4> x)4419 RValue<Float4> Exp(RValue<Float4> x)
4420 {
4421 	RR_DEBUG_INFO_UPDATE_LOC();
4422 	return optimal::Exp(x);
4423 }
4424 
Log(RValue<Float4> x)4425 RValue<Float4> Log(RValue<Float4> x)
4426 {
4427 	RR_DEBUG_INFO_UPDATE_LOC();
4428 	return optimal::Log(x);
4429 }
4430 
Exp2(RValue<Float4> x)4431 RValue<Float4> Exp2(RValue<Float4> x)
4432 {
4433 	RR_DEBUG_INFO_UPDATE_LOC();
4434 	return optimal::Exp2(x);
4435 }
4436 
Log2(RValue<Float4> x)4437 RValue<Float4> Log2(RValue<Float4> x)
4438 {
4439 	RR_DEBUG_INFO_UPDATE_LOC();
4440 	return optimal::Log2(x);
4441 }
4442 
Ctlz(RValue<UInt> x,bool isZeroUndef)4443 RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
4444 {
4445 	RR_DEBUG_INFO_UPDATE_LOC();
4446 	if(emulateIntrinsics)
4447 	{
4448 		UNIMPLEMENTED_NO_BUG("Subzero Ctlz()");
4449 		return UInt(0);
4450 	}
4451 	else
4452 	{
4453 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
4454 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Ctlz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4455 		auto ctlz = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
4456 		ctlz->addArg(x.value());
4457 		::basicBlock->appendInst(ctlz);
4458 
4459 		return RValue<UInt>(V(result));
4460 	}
4461 }
4462 
Ctlz(RValue<UInt4> x,bool isZeroUndef)4463 RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef)
4464 {
4465 	RR_DEBUG_INFO_UPDATE_LOC();
4466 	if(emulateIntrinsics)
4467 	{
4468 		UNIMPLEMENTED_NO_BUG("Subzero Ctlz()");
4469 		return UInt4(0);
4470 	}
4471 	else
4472 	{
4473 		// TODO: implement vectorized version in Subzero
4474 		UInt4 result;
4475 		result = Insert(result, Ctlz(Extract(x, 0), isZeroUndef), 0);
4476 		result = Insert(result, Ctlz(Extract(x, 1), isZeroUndef), 1);
4477 		result = Insert(result, Ctlz(Extract(x, 2), isZeroUndef), 2);
4478 		result = Insert(result, Ctlz(Extract(x, 3), isZeroUndef), 3);
4479 		return result;
4480 	}
4481 }
4482 
Cttz(RValue<UInt> x,bool isZeroUndef)4483 RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef)
4484 {
4485 	RR_DEBUG_INFO_UPDATE_LOC();
4486 	if(emulateIntrinsics)
4487 	{
4488 		UNIMPLEMENTED_NO_BUG("Subzero Cttz()");
4489 		return UInt(0);
4490 	}
4491 	else
4492 	{
4493 		Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
4494 		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Cttz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4495 		auto ctlz = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
4496 		ctlz->addArg(x.value());
4497 		::basicBlock->appendInst(ctlz);
4498 
4499 		return RValue<UInt>(V(result));
4500 	}
4501 }
4502 
Cttz(RValue<UInt4> x,bool isZeroUndef)4503 RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef)
4504 {
4505 	RR_DEBUG_INFO_UPDATE_LOC();
4506 	if(emulateIntrinsics)
4507 	{
4508 		UNIMPLEMENTED_NO_BUG("Subzero Cttz()");
4509 		return UInt4(0);
4510 	}
4511 	else
4512 	{
4513 		// TODO: implement vectorized version in Subzero
4514 		UInt4 result;
4515 		result = Insert(result, Cttz(Extract(x, 0), isZeroUndef), 0);
4516 		result = Insert(result, Cttz(Extract(x, 1), isZeroUndef), 1);
4517 		result = Insert(result, Cttz(Extract(x, 2), isZeroUndef), 2);
4518 		result = Insert(result, Cttz(Extract(x, 3), isZeroUndef), 3);
4519 		return result;
4520 	}
4521 }
4522 
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)4523 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
4524 {
4525 	RR_DEBUG_INFO_UPDATE_LOC();
4526 	return emulated::MinAtomic(x, y, memoryOrder);
4527 }
4528 
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)4529 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
4530 {
4531 	RR_DEBUG_INFO_UPDATE_LOC();
4532 	return emulated::MinAtomic(x, y, memoryOrder);
4533 }
4534 
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)4535 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
4536 {
4537 	RR_DEBUG_INFO_UPDATE_LOC();
4538 	return emulated::MaxAtomic(x, y, memoryOrder);
4539 }
4540 
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)4541 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
4542 {
4543 	RR_DEBUG_INFO_UPDATE_LOC();
4544 	return emulated::MaxAtomic(x, y, memoryOrder);
4545 }
4546 
// When ENABLE_RR_DEBUG_INFO is defined, passes the caller's backtrace to
// emitPrintLocation(); otherwise this is a no-op.
void EmitDebugLocation()
{
#if defined(ENABLE_RR_DEBUG_INFO)
	emitPrintLocation(getCallerBacktrace());
#endif  // defined(ENABLE_RR_DEBUG_INFO)
}
EmitDebugVariable(Value * value)4553 void EmitDebugVariable(Value *value) {}
// Intentionally empty in this backend.
void FlushDebug()
{
}
4555 
4556 namespace {
4557 namespace coro {
4558 
4559 // Instance data per generated coroutine
4560 // This is the "handle" type used for Coroutine functions
4561 // Lifetime: from yield to when CoroutineEntryDestroy generated function is called.
4562 struct CoroutineData
4563 {
4564 	bool useInternalScheduler = false;
4565 	bool done = false;        // the coroutine should stop at the next yield()
4566 	bool terminated = false;  // the coroutine has finished.
4567 	bool inRoutine = false;   // is the coroutine currently executing?
4568 	marl::Scheduler::Fiber *mainFiber = nullptr;
4569 	marl::Scheduler::Fiber *routineFiber = nullptr;
4570 	void *promisePtr = nullptr;
4571 };
4572 
createCoroutineData()4573 CoroutineData *createCoroutineData()
4574 {
4575 	return new CoroutineData{};
4576 }
4577 
destroyCoroutineData(CoroutineData * coroData)4578 void destroyCoroutineData(CoroutineData *coroData)
4579 {
4580 	delete coroData;
4581 }
4582 
4583 // suspend() pauses execution of the coroutine, and resumes execution from the
4584 // caller's call to await().
4585 // Returns true if await() is called again, or false if coroutine_destroy()
4586 // is called.
suspend(Nucleus::CoroutineHandle handle)4587 bool suspend(Nucleus::CoroutineHandle handle)
4588 {
4589 	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4590 	ASSERT(marl::Scheduler::Fiber::current() == coroData->routineFiber);
4591 	ASSERT(coroData->inRoutine);
4592 	coroData->inRoutine = false;
4593 	coroData->mainFiber->notify();
4594 	while(!coroData->inRoutine)
4595 	{
4596 		coroData->routineFiber->wait();
4597 	}
4598 	return !coroData->done;
4599 }
4600 
4601 // resume() is called by await(), blocking until the coroutine calls yield()
4602 // or the coroutine terminates.
resume(Nucleus::CoroutineHandle handle)4603 void resume(Nucleus::CoroutineHandle handle)
4604 {
4605 	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4606 	ASSERT(marl::Scheduler::Fiber::current() == coroData->mainFiber);
4607 	ASSERT(!coroData->inRoutine);
4608 	coroData->inRoutine = true;
4609 	coroData->routineFiber->notify();
4610 	while(coroData->inRoutine)
4611 	{
4612 		coroData->mainFiber->wait();
4613 	}
4614 }
4615 
4616 // stop() is called by coroutine_destroy(), signalling that it's done, then blocks
4617 // until the coroutine ends, and deletes the coroutine data.
stop(Nucleus::CoroutineHandle handle)4618 void stop(Nucleus::CoroutineHandle handle)
4619 {
4620 	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4621 	ASSERT(marl::Scheduler::Fiber::current() == coroData->mainFiber);
4622 	ASSERT(!coroData->inRoutine);
4623 	if(!coroData->terminated)
4624 	{
4625 		coroData->done = true;
4626 		coroData->inRoutine = true;
4627 		coroData->routineFiber->notify();
4628 		while(!coroData->terminated)
4629 		{
4630 			coroData->mainFiber->wait();
4631 		}
4632 	}
4633 	if(coroData->useInternalScheduler)
4634 	{
4635 		::getOrCreateScheduler().unbind();
4636 	}
4637 	coro::destroyCoroutineData(coroData);  // free the coroutine data.
4638 }
4639 
4640 namespace detail {
4641 thread_local rr::Nucleus::CoroutineHandle coroHandle{};
4642 }  // namespace detail
4643 
setHandleParam(Nucleus::CoroutineHandle handle)4644 void setHandleParam(Nucleus::CoroutineHandle handle)
4645 {
4646 	ASSERT(!detail::coroHandle);
4647 	detail::coroHandle = handle;
4648 }
4649 
getHandleParam()4650 Nucleus::CoroutineHandle getHandleParam()
4651 {
4652 	ASSERT(detail::coroHandle);
4653 	auto handle = detail::coroHandle;
4654 	detail::coroHandle = {};
4655 	return handle;
4656 }
4657 
isDone(Nucleus::CoroutineHandle handle)4658 bool isDone(Nucleus::CoroutineHandle handle)
4659 {
4660 	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4661 	return coroData->done;
4662 }
4663 
setPromisePtr(Nucleus::CoroutineHandle handle,void * promisePtr)4664 void setPromisePtr(Nucleus::CoroutineHandle handle, void *promisePtr)
4665 {
4666 	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4667 	coroData->promisePtr = promisePtr;
4668 }
4669 
getPromisePtr(Nucleus::CoroutineHandle handle)4670 void *getPromisePtr(Nucleus::CoroutineHandle handle)
4671 {
4672 	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4673 	return coroData->promisePtr;
4674 }
4675 
4676 }  // namespace coro
4677 }  // namespace
4678 
4679 // Used to generate coroutines.
4680 // Lifetime: from yield to acquireCoroutine
4681 class CoroutineGenerator
4682 {
4683 public:
CoroutineGenerator()4684 	CoroutineGenerator()
4685 	{
4686 	}
4687 
4688 	// Inserts instructions at the top of the current function to make it a coroutine.
generateCoroutineBegin()4689 	void generateCoroutineBegin()
4690 	{
4691 		// Begin building the main coroutine_begin() function.
4692 		// We insert these instructions at the top of the entry node,
4693 		// before existing reactor-generated instructions.
4694 
4695 		//    CoroutineHandle coroutine_begin(<Arguments>)
4696 		//    {
4697 		//        this->handle = coro::getHandleParam();
4698 		//
4699 		//        YieldType promise;
4700 		//        coro::setPromisePtr(handle, &promise); // For await
4701 		//
4702 		//        ... <REACTOR CODE> ...
4703 		//
4704 
4705 		//        this->handle = coro::getHandleParam();
4706 		this->handle = sz::Call(::function, ::entryBlock, coro::getHandleParam);
4707 
4708 		//        YieldType promise;
4709 		//        coro::setPromisePtr(handle, &promise); // For await
4710 		this->promise = sz::allocateStackVariable(::function, T(::coroYieldType));
4711 		sz::Call(::function, ::entryBlock, coro::setPromisePtr, this->handle, this->promise);
4712 	}
4713 
4714 	// Adds instructions for Yield() calls at the current location of the main coroutine function.
generateYield(Value * val)4715 	void generateYield(Value *val)
4716 	{
4717 		//        ... <REACTOR CODE> ...
4718 		//
4719 		//        promise = val;
4720 		//        if (!coro::suspend(handle)) {
4721 		//            return false; // coroutine has been stopped by the caller.
4722 		//        }
4723 		//
4724 		//        ... <REACTOR CODE> ...
4725 
4726 		//        promise = val;
4727 		Nucleus::createStore(val, V(this->promise), ::coroYieldType);
4728 
4729 		//        if (!coro::suspend(handle)) {
4730 		auto result = sz::Call(::function, ::basicBlock, coro::suspend, this->handle);
4731 		auto doneBlock = Nucleus::createBasicBlock();
4732 		auto resumeBlock = Nucleus::createBasicBlock();
4733 		Nucleus::createCondBr(V(result), resumeBlock, doneBlock);
4734 
4735 		//            return false; // coroutine has been stopped by the caller.
4736 		::basicBlock = doneBlock;
4737 		Nucleus::createRetVoid();  // coroutine return value is ignored.
4738 
4739 		//        ... <REACTOR CODE> ...
4740 		::basicBlock = resumeBlock;
4741 	}
4742 
4743 	using FunctionUniquePtr = std::unique_ptr<Ice::Cfg>;
4744 
4745 	// Generates the await function for the current coroutine.
4746 	// Cannot use Nucleus functions that modify ::function and ::basicBlock.
generateAwaitFunction()4747 	static FunctionUniquePtr generateAwaitFunction()
4748 	{
4749 		// bool coroutine_await(CoroutineHandle handle, YieldType* out)
4750 		// {
4751 		//     if (coro::isDone())
4752 		//     {
4753 		//         return false;
4754 		//     }
4755 		//     else // resume
4756 		//     {
4757 		//         YieldType* promise = coro::getPromisePtr(handle);
4758 		//         *out = *promise;
4759 		//         coro::resume(handle);
4760 		//         return true;
4761 		//     }
4762 		// }
4763 
4764 		// Subzero doesn't support bool types (IceType_i1) as return type
4765 		const Ice::Type ReturnType = Ice::IceType_i32;
4766 		const Ice::Type YieldPtrType = sz::getPointerType(T(::coroYieldType));
4767 		const Ice::Type HandleType = sz::getPointerType(Ice::IceType_void);
4768 
4769 		Ice::Cfg *awaitFunc = sz::createFunction(::context, ReturnType, std::vector<Ice::Type>{ HandleType, YieldPtrType });
4770 		Ice::CfgLocalAllocatorScope scopedAlloc{ awaitFunc };
4771 
4772 		Ice::Variable *handle = awaitFunc->getArgs()[0];
4773 		Ice::Variable *outPtr = awaitFunc->getArgs()[1];
4774 
4775 		auto doneBlock = awaitFunc->makeNode();
4776 		{
4777 			//         return false;
4778 			Ice::InstRet *ret = Ice::InstRet::create(awaitFunc, ::context->getConstantInt32(0));
4779 			doneBlock->appendInst(ret);
4780 		}
4781 
4782 		auto resumeBlock = awaitFunc->makeNode();
4783 		{
4784 			//         YieldType* promise = coro::getPromisePtr(handle);
4785 			Ice::Variable *promise = sz::Call(awaitFunc, resumeBlock, coro::getPromisePtr, handle);
4786 
4787 			//         *out = *promise;
4788 			// Load promise value
4789 			Ice::Variable *promiseVal = awaitFunc->makeVariable(T(::coroYieldType));
4790 			auto load = Ice::InstLoad::create(awaitFunc, promiseVal, promise);
4791 			resumeBlock->appendInst(load);
4792 			// Then store it in output param
4793 			auto store = Ice::InstStore::create(awaitFunc, promiseVal, outPtr);
4794 			resumeBlock->appendInst(store);
4795 
4796 			//         coro::resume(handle);
4797 			sz::Call(awaitFunc, resumeBlock, coro::resume, handle);
4798 
4799 			//         return true;
4800 			Ice::InstRet *ret = Ice::InstRet::create(awaitFunc, ::context->getConstantInt32(1));
4801 			resumeBlock->appendInst(ret);
4802 		}
4803 
4804 		//     if (coro::isDone())
4805 		//     {
4806 		//         <doneBlock>
4807 		//     }
4808 		//     else // resume
4809 		//     {
4810 		//         <resumeBlock>
4811 		//     }
4812 		Ice::CfgNode *bb = awaitFunc->getEntryNode();
4813 		Ice::Variable *done = sz::Call(awaitFunc, bb, coro::isDone, handle);
4814 		auto br = Ice::InstBr::create(awaitFunc, done, doneBlock, resumeBlock);
4815 		bb->appendInst(br);
4816 
4817 		return FunctionUniquePtr{ awaitFunc };
4818 	}
4819 
4820 	// Generates the destroy function for the current coroutine.
4821 	// Cannot use Nucleus functions that modify ::function and ::basicBlock.
generateDestroyFunction()4822 	static FunctionUniquePtr generateDestroyFunction()
4823 	{
4824 		// void coroutine_destroy(Nucleus::CoroutineHandle handle)
4825 		// {
4826 		//     coro::stop(handle); // signal and wait for coroutine to stop, and delete coroutine data
4827 		//     return;
4828 		// }
4829 
4830 		const Ice::Type ReturnType = Ice::IceType_void;
4831 		const Ice::Type HandleType = sz::getPointerType(Ice::IceType_void);
4832 
4833 		Ice::Cfg *destroyFunc = sz::createFunction(::context, ReturnType, std::vector<Ice::Type>{ HandleType });
4834 		Ice::CfgLocalAllocatorScope scopedAlloc{ destroyFunc };
4835 
4836 		Ice::Variable *handle = destroyFunc->getArgs()[0];
4837 
4838 		auto *bb = destroyFunc->getEntryNode();
4839 
4840 		//     coro::stop(handle); // signal and wait for coroutine to stop, and delete coroutine data
4841 		sz::Call(destroyFunc, bb, coro::stop, handle);
4842 
4843 		//     return;
4844 		Ice::InstRet *ret = Ice::InstRet::create(destroyFunc);
4845 		bb->appendInst(ret);
4846 
4847 		return FunctionUniquePtr{ destroyFunc };
4848 	}
4849 
4850 private:
4851 	Ice::Variable *handle{};
4852 	Ice::Variable *promise{};
4853 };
4854 
invokeCoroutineBegin(std::function<Nucleus::CoroutineHandle ()> beginFunc)4855 static Nucleus::CoroutineHandle invokeCoroutineBegin(std::function<Nucleus::CoroutineHandle()> beginFunc)
4856 {
4857 	// This doubles up as our coroutine handle
4858 	auto coroData = coro::createCoroutineData();
4859 
4860 	coroData->useInternalScheduler = (marl::Scheduler::get() == nullptr);
4861 	if(coroData->useInternalScheduler)
4862 	{
4863 		::getOrCreateScheduler().bind();
4864 	}
4865 
4866 	auto run = [=] {
4867 		// Store handle in TLS so that the coroutine can grab it right away, before
4868 		// any fiber switch occurs.
4869 		coro::setHandleParam(coroData);
4870 
4871 		ASSERT(!coroData->routineFiber);
4872 		coroData->routineFiber = marl::Scheduler::Fiber::current();
4873 
4874 		beginFunc();
4875 
4876 		ASSERT(coroData->inRoutine);
4877 		coroData->done = true;        // coroutine is done.
4878 		coroData->terminated = true;  // signal that the coroutine data is ready for freeing.
4879 		coroData->inRoutine = false;
4880 		coroData->mainFiber->notify();
4881 	};
4882 
4883 	ASSERT(!coroData->mainFiber);
4884 	coroData->mainFiber = marl::Scheduler::Fiber::current();
4885 
4886 	// block until the first yield or coroutine end
4887 	ASSERT(!coroData->inRoutine);
4888 	coroData->inRoutine = true;
4889 	marl::schedule(marl::Task(run, marl::Task::Flags::SameThread));
4890 	while(coroData->inRoutine)
4891 	{
4892 		coroData->mainFiber->wait();
4893 	}
4894 
4895 	return coroData;
4896 }
4897 
createCoroutine(Type * yieldType,const std::vector<Type * > & params)4898 void Nucleus::createCoroutine(Type *yieldType, const std::vector<Type *> &params)
4899 {
4900 	// Start by creating a regular function
4901 	createFunction(yieldType, params);
4902 
4903 	// Save in case yield() is called
4904 	ASSERT(::coroYieldType == nullptr);  // Only one coroutine can be generated at once
4905 	::coroYieldType = yieldType;
4906 }
4907 
yield(Value * val)4908 void Nucleus::yield(Value *val)
4909 {
4910 	RR_DEBUG_INFO_UPDATE_LOC();
4911 	Variable::materializeAll();
4912 
4913 	// On first yield, we start generating coroutine functions
4914 	if(!::coroGen)
4915 	{
4916 		::coroGen = std::make_shared<CoroutineGenerator>();
4917 		::coroGen->generateCoroutineBegin();
4918 	}
4919 
4920 	ASSERT(::coroGen);
4921 	::coroGen->generateYield(val);
4922 }
4923 
coroutineEntryAwaitStub(Nucleus::CoroutineHandle,void * yieldValue)4924 static bool coroutineEntryAwaitStub(Nucleus::CoroutineHandle, void *yieldValue)
4925 {
4926 	return false;
4927 }
4928 
coroutineEntryDestroyStub(Nucleus::CoroutineHandle handle)4929 static void coroutineEntryDestroyStub(Nucleus::CoroutineHandle handle)
4930 {
4931 }
4932 
acquireCoroutine(const char * name,const Config::Edit * cfgEdit)4933 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit *cfgEdit /* = nullptr */)
4934 {
4935 	if(::coroGen)
4936 	{
4937 		// Finish generating coroutine functions
4938 		{
4939 			Ice::CfgLocalAllocatorScope scopedAlloc{ ::function };
4940 			finalizeFunction();
4941 		}
4942 
4943 		auto awaitFunc = ::coroGen->generateAwaitFunction();
4944 		auto destroyFunc = ::coroGen->generateDestroyFunction();
4945 
4946 		// At this point, we no longer need the CoroutineGenerator.
4947 		::coroGen.reset();
4948 		::coroYieldType = nullptr;
4949 
4950 		auto routine = rr::acquireRoutine({ ::function, awaitFunc.get(), destroyFunc.get() },
4951 		                                  { name, "await", "destroy" },
4952 		                                  cfgEdit);
4953 
4954 		return routine;
4955 	}
4956 	else
4957 	{
4958 		{
4959 			Ice::CfgLocalAllocatorScope scopedAlloc{ ::function };
4960 			finalizeFunction();
4961 		}
4962 
4963 		::coroYieldType = nullptr;
4964 
4965 		// Not an actual coroutine (no yields), so return stubs for await and destroy
4966 		auto routine = rr::acquireRoutine({ ::function }, { name }, cfgEdit);
4967 
4968 		auto routineImpl = std::static_pointer_cast<ELFMemoryStreamer>(routine);
4969 		routineImpl->setEntry(Nucleus::CoroutineEntryAwait, reinterpret_cast<const void *>(&coroutineEntryAwaitStub));
4970 		routineImpl->setEntry(Nucleus::CoroutineEntryDestroy, reinterpret_cast<const void *>(&coroutineEntryDestroyStub));
4971 		return routine;
4972 	}
4973 }
4974 
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4975 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4976 {
4977 	const bool isCoroutine = routine.getEntry(Nucleus::CoroutineEntryAwait) != reinterpret_cast<const void *>(&coroutineEntryAwaitStub);
4978 
4979 	if(isCoroutine)
4980 	{
4981 		return rr::invokeCoroutineBegin(func);
4982 	}
4983 	else
4984 	{
4985 		// For regular routines, just invoke the begin func directly
4986 		return func();
4987 	}
4988 }
4989 
4990 }  // namespace rr
4991