/*
 * Copyright 2019 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "include/core/SkStream.h"
#include "include/core/SkString.h"
#include "include/private/SkHalf.h"
#include "include/private/SkTFitsIn.h"
#include "include/private/SkThreadID.h"
#include "src/core/SkColorSpacePriv.h"
#include "src/core/SkColorSpaceXformSteps.h"
#include "src/core/SkCpu.h"
#include "src/core/SkEnumerate.h"
#include "src/core/SkOpts.h"
#include "src/core/SkVM.h"
#include <algorithm>
#include <atomic>
#include <queue>

#if defined(SKVM_LLVM)
    #include <future>
    #include <llvm/Bitcode/BitcodeWriter.h>
    #include <llvm/ExecutionEngine/ExecutionEngine.h>
    #include <llvm/IR/IRBuilder.h>
    #include <llvm/IR/Verifier.h>
    #include <llvm/Support/TargetSelect.h>
    #include <llvm/Support/Host.h>

    // Platform-specific intrinsics got their own files in LLVM 10.
    #if __has_include(<llvm/IR/IntrinsicsX86.h>)
        #include <llvm/IR/IntrinsicsX86.h>
    #endif
#endif

// #define SKVM_LLVM_WAIT_FOR_COMPILATION

bool gSkVMAllowJIT{false};
bool gSkVMJITViaDylib{false};

#if defined(SKVM_JIT)
    #if defined(SK_BUILD_FOR_WIN)
        #include "src/core/SkLeanWindows.h"
        #include <memoryapi.h>

        static void* alloc_jit_buffer(size_t* len) {
            return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
        }
        static void remap_as_executable(void* ptr, size_t len) {
            DWORD old;
            VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old);
            SkASSERT(old == PAGE_READWRITE);
        }
        #if !defined(SKVM_LLVM)
        static void unmap_jit_buffer(void* ptr, size_t len) {
            VirtualFree(ptr, 0, MEM_RELEASE);
        }
        static void close_dylib(void* dylib) {
            SkASSERT(false);  // TODO?  For now just assert we never make one.
        }
        #endif
    #else
        #include <dlfcn.h>
        #include <sys/mman.h>

        static void* alloc_jit_buffer(size_t* len) {
            // While mprotect and VirtualAlloc both work at page granularity,
            // mprotect doesn't round up for you, and instead requires *len is at page granularity.
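            // (A concrete illustration, ours, not from the original: with a 4096-byte page,
            // a request of *len = 5000 is rounded up to 8192 by the arithmetic below.)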
            const size_t page = sysconf(_SC_PAGESIZE);
            *len = ((*len + page - 1) / page) * page;
            return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
        }
        static void remap_as_executable(void* ptr, size_t len) {
            mprotect(ptr, len, PROT_READ|PROT_EXEC);
            __builtin___clear_cache((char*)ptr,
                                    (char*)ptr + len);
        }
        #if !defined(SKVM_LLVM)
        static void unmap_jit_buffer(void* ptr, size_t len) {
            munmap(ptr, len);
        }
        static void close_dylib(void* dylib) {
            dlclose(dylib);
        }
        #endif
    #endif

    #if defined(SKVM_JIT_VTUNE)
        #include <jitprofiling.h>
        static void notify_vtune(const char* name, void* addr, size_t len) {
            if (iJIT_IsProfilingActive() == iJIT_SAMPLING_ON) {
                iJIT_Method_Load event;
                memset(&event, 0, sizeof(event));
                event.method_id           = iJIT_GetNewMethodID();
                event.method_name         = const_cast<char*>(name);
                event.method_load_address = addr;
                event.method_size         = len;
                iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &event);
            }
        }
    #else
        static void notify_vtune(const char* name, void* addr, size_t len) {}
    #endif
#endif

// JIT code isn't MSAN-instrumented, so we won't see when it uses
// uninitialized memory, and we'll not see the writes it makes as properly
// initializing memory.  Instead force the interpreter, which should let
// MSAN see everything our programs do properly.
//
// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
#if defined(__has_feature)
    #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
        #define SKVM_JIT_BUT_IGNORE_IT
    #endif
#endif

#if defined(SKSL_STANDALONE)
    // skslc needs to link against this module (for the VM code generator). This module pulls in
    // color-space code, but attempting to add those transitive dependencies to skslc gets out of
    // hand. So we terminate the chain here with stub functions. Note that skslc's usage of SkVM
    // never cares about color management.
    skvm::F32 sk_program_transfer_fn(
        skvm::F32 v, TFKind tf_kind,
        skvm::F32 G, skvm::F32 A, skvm::F32 B, skvm::F32 C, skvm::F32 D, skvm::F32 E, skvm::F32 F) {
        return v;
    }

    const skcms_TransferFunction* skcms_sRGB_TransferFunction() { return nullptr; }
    const skcms_TransferFunction* skcms_sRGB_Inverse_TransferFunction() { return nullptr; }
#endif

namespace skvm {

    static Features detect_features() {
        static const bool fma =
        #if defined(SK_CPU_X86)
            SkCpu::Supports(SkCpu::HSW);
        #elif defined(SK_CPU_ARM64)
            true;
        #else
            false;
        #endif

        static const bool fp16 = false;  // TODO

        return { fma, fp16 };
    }

    Builder::Builder()                  : fFeatures(detect_features()) {}
    Builder::Builder(Features features) : fFeatures(features         ) {}

    struct Program::Impl {
        std::vector<InterpreterInstruction> instructions;
        int regs = 0;
        int loop = 0;
        std::vector<int> strides;

        std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
        size_t jit_size = 0;
        void*  dylib    = nullptr;

    #if defined(SKVM_LLVM)
        std::unique_ptr<llvm::LLVMContext>     llvm_ctx;
        std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
        std::future<void>                      llvm_compiling;
    #endif
    };

    // Debugging tools, mostly for printing various data structures out to a stream.

    namespace {
        class SkDebugfStream final : public SkWStream {
            size_t fBytesWritten = 0;

            bool write(const void* buffer, size_t size) override {
                SkDebugf("%.*s", (int)size, (const char*)buffer);
                fBytesWritten += size;
                return true;
            }

            size_t bytesWritten() const override {
                return fBytesWritten;
            }
        };

        struct V { Val id; };
        struct R { Reg id; };
        struct Shift   { int bits; };
        struct Splat   { int bits; };
        struct Hex     { int bits; };
        // For op `trace_line` or `trace_call`
        struct Line    { int bits; };
        // For op `trace_var`
        struct VarSlot { int bits; };
        struct VarType { int bits; };
        static constexpr VarType kVarTypeInt{0};
        static constexpr VarType kVarTypeFloat{1};
        static constexpr VarType kVarTypeBool{2};
        // For op `trace_call`
        struct CallType { int bits; };
        static constexpr CallType kCallTypeEnter{1};
        static constexpr CallType kCallTypeExit{0};

        static void write(SkWStream* o, const char* s) {
            o->writeText(s);
        }

        static const char* name(Op op) {
            switch (op) {
            #define M(x) case Op::x: return #x;
                SKVM_OPS(M)
            #undef M
            }
            return "unknown op";
        }

        static void write(SkWStream* o, Op op) {
            o->writeText(name(op));
        }
        static void write(SkWStream* o, Ptr p) {
            write(o, "ptr");
            o->writeDecAsText(p.ix);
        }
        static void write(SkWStream* o, V v) {
            write(o, "v");
            o->writeDecAsText(v.id);
        }
        static void write(SkWStream* o, R r) {
            write(o, "r");
            o->writeDecAsText(r.id);
        }
        static void write(SkWStream* o, Shift s) {
            o->writeDecAsText(s.bits);
        }
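        // Together with the variadic write() further below, these overloads let the dump
        // routines print one instruction per line, e.g. a hypothetical "v3 = add_f32 v1 v2".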
        static void write(SkWStream* o, Splat s) {
            float f;
            memcpy(&f, &s.bits, 4);
            o->writeHexAsText(s.bits);
            write(o, " (");
            o->writeScalarAsText(f);
            write(o, ")");
        }
        static void write(SkWStream* o, Hex h) {
            o->writeHexAsText(h.bits);
        }
        static void write(SkWStream* o, Line d) {
            write(o, "L");
            o->writeDecAsText(d.bits);
        }
        static void write(SkWStream* o, VarSlot s) {
            write(o, "$");
            o->writeDecAsText(s.bits);
        }
        static void write(SkWStream* o, VarType n) {
            if (n.bits == kVarTypeFloat.bits) {
                write(o, "(F32)");
            } else if (n.bits == kVarTypeInt.bits) {
                write(o, "(I32)");
            } else if (n.bits == kVarTypeBool.bits) {
                write(o, "(bool)");
            } else {
                write(o, "???");
            }
        }
        static void write(SkWStream* o, CallType n) {
            if (n.bits == kCallTypeEnter.bits) {
                write(o, "(enter)");
            } else if (n.bits == kCallTypeExit.bits) {
                write(o, "(exit)");
            } else {
                write(o, "???");
            }
        }

        template <typename T, typename... Ts>
        static void write(SkWStream* o, T first, Ts... rest) {
            write(o, first);
            write(o, " ");
            write(o, rest...);
        }
    }  // namespace

    static void write_one_instruction(Val id, const OptimizedInstruction& inst, SkWStream* o) {
        Op  op = inst.op;
        Val x  = inst.x,
            y  = inst.y,
            z  = inst.z,
            w  = inst.w;
        int immA = inst.immA,
            immB = inst.immB,
            immC = inst.immC;
        switch (op) {
            case Op::assert_true: write(o, op, V{x}, V{y}); break;

            case Op::trace_line: write(o, op, V{x}, Line{immA}); break;
            case Op::trace_var:  write(o, op, V{x}, VarSlot{immA}, "=", V{y}, VarType{immB}); break;
            case Op::trace_call: write(o, op, V{x}, Line{immA}, CallType{immB}); break;

            case Op::store8:   write(o, op, Ptr{immA}, V{x}                  ); break;
            case Op::store16:  write(o, op, Ptr{immA}, V{x}                  ); break;
            case Op::store32:  write(o, op, Ptr{immA}, V{x}                  ); break;
            case Op::store64:  write(o, op, Ptr{immA}, V{x},V{y}             ); break;
            case Op::store128: write(o, op, Ptr{immA}, V{x},V{y},V{z},V{w}   ); break;

            case Op::index: write(o, V{id}, "=", op); break;

            case Op::load8:   write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load16:  write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load32:  write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load64:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
            case Op::load128: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;

            case Op::gather8:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
            case Op::gather16: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
            case Op::gather32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;

            case Op::uniform32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
            case Op::array32:   write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;

            case Op::splat: write(o, V{id}, "=", op, Splat{immA}); break;

            case Op:: add_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: sub_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: mul_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: div_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: min_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: max_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
            case Op:: fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
            case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;

            case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break;

            case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
            case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
            case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;

            case Op::eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;

            case Op::ceil:      write(o, V{id}, "=", op, V{x}); break;
            case Op::floor:     write(o, V{id}, "=", op, V{x}); break;
            case Op::to_f32:    write(o, V{id}, "=", op, V{x}); break;
            case Op::to_fp16:   write(o, V{id}, "=", op, V{x}); break;
            case Op::from_fp16: write(o, V{id}, "=", op, V{x}); break;
            case Op::trunc:     write(o, V{id}, "=", op, V{x}); break;
            case Op::round:     write(o, V{id}, "=", op, V{x}); break;
        }

        write(o, "\n");
    }

    void Builder::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        std::vector<OptimizedInstruction> optimized = this->optimize();
        o->writeDecAsText(optimized.size());
        o->writeText(" values (originally ");
        o->writeDecAsText(fProgram.size());
        o->writeText("):\n");
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            const OptimizedInstruction& inst = optimized[id];
            write(o, inst.can_hoist ? "↑ " : "  ");
            write_one_instruction(id, inst, o);
        }
    }

    void Program::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        o->writeDecAsText(fImpl->regs);
        o->writeText(" registers, ");
        o->writeDecAsText(fImpl->instructions.size());
        o->writeText(" instructions:\n");
        for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) {
            if (i == fImpl->loop) { write(o, "loop:\n"); }
            o->writeDecAsText(i);
            o->writeText("\t");
            if (i >= fImpl->loop) { write(o, "    "); }
            const InterpreterInstruction& inst = fImpl->instructions[i];
            Op  op = inst.op;
            Reg d  = inst.d,
                x  = inst.x,
                y  = inst.y,
                z  = inst.z,
                w  = inst.w;
            int immA = inst.immA,
                immB = inst.immB,
                immC = inst.immC;
            switch (op) {
                case Op::assert_true: write(o, op, R{x}, R{y}); break;

                case Op::trace_line: write(o, op, R{x}, Line{immA}); break;
                case Op::trace_var:  write(o, op, R{x}, VarSlot{immA}, "=", R{y}, VarType{immB});
                                     break;
                case Op::trace_call: write(o, op, R{x}, Line{immA}, CallType{immB}); break;

                case Op::store8:   write(o, op, Ptr{immA}, R{x}                   ); break;
                case Op::store16:  write(o, op, Ptr{immA}, R{x}                   ); break;
                case Op::store32:  write(o, op, Ptr{immA}, R{x}                   ); break;
                case Op::store64:  write(o, op, Ptr{immA}, R{x}, R{y}             ); break;
                case Op::store128: write(o, op, Ptr{immA}, R{x}, R{y}, R{z}, R{w} ); break;

                case Op::index: write(o, R{d}, "=", op); break;

                case Op::load8:   write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load16:  write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load32:  write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load64:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
                case Op::load128: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;

                case Op::gather8:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
                case Op::gather16: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
                case Op::gather32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;

                case Op::uniform32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
                case Op::array32:   write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;

                case Op::splat: write(o, R{d}, "=", op, Splat{immA}); break;

                case Op::add_f32:  write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::sub_f32:  write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::mul_f32:  write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::div_f32:  write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::min_f32:  write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::max_f32:  write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::fma_f32:  write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fms_f32:  write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;

                case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
                case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
                case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;

                case Op::eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::ceil:      write(o, R{d}, "=", op, R{x}); break;
                case Op::floor:     write(o, R{d}, "=", op, R{x}); break;
                case Op::to_f32:    write(o, R{d}, "=", op, R{x}); break;
                case Op::to_fp16:   write(o, R{d}, "=", op, R{x}); break;
                case Op::from_fp16: write(o, R{d}, "=", op, R{x}); break;
                case Op::trunc:     write(o, R{d}, "=", op, R{x}); break;
                case Op::round:     write(o, R{d}, "=", op, R{x}); break;
            }
            write(o, "\n");
        }
    }

    std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program) {
        // Determine which Instructions are live by working back from side effects.
        std::vector<bool> live(program.size(), false);
        for (Val id = program.size(); id--;) {
            if (live[id] || has_side_effect(program[id].op)) {
                live[id] = true;
                const Instruction& inst = program[id];
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA) { live[arg] = true; }
                }
            }
        }

        // After removing non-live instructions, we can be left with redundant back-to-back
        // trace_line instructions. (e.g. one line could have multiple statements on it.)
        // Eliminate any duplicate ops.
        int lastId = -1;
        for (Val id = 0; id < (Val)program.size(); id++) {
            if (!live[id]) {
                continue;
            }
            const Instruction& inst = program[id];
            if (inst.op != Op::trace_line) {
                lastId = -1;
                continue;
            }
            if (lastId >= 0) {
                const Instruction& last = program[lastId];
                if (inst.immA == last.immA && inst.x == last.x) {
                    // Found two matching trace_lines in a row. Mark the first one as dead.
                    live[lastId] = false;
                }
            }
            lastId = id;
        }

        // Rewrite the program with only live Instructions:
        //   - remap IDs in live Instructions to what they'll be once dead Instructions are removed;
        //   - then actually remove the dead Instructions.
        std::vector<Val> new_id(program.size(), NA);
        for (Val id = 0, next = 0; id < (Val)program.size(); id++) {
            if (live[id]) {
                Instruction& inst = program[id];
                for (Val* arg : {&inst.x, &inst.y, &inst.z, &inst.w}) {
                    if (*arg != NA) {
                        *arg = new_id[*arg];
                        SkASSERT(*arg != NA);
                    }
                }
                new_id[id] = next++;
            }
        }

        // Eliminate any non-live ops.
        auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) {
            Val id = (Val)(&inst - program.data());
            return !live[id];
        });
        program.erase(it, program.end());

        return program;
    }

    std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program) {
        std::vector<OptimizedInstruction> optimized(program.size());
        for (Val id = 0; id < (Val)program.size(); id++) {
            Instruction inst = program[id];
            optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w,
                             inst.immA,inst.immB,inst.immC,
                             /*death=*/id, /*can_hoist=*/true};
        }

        // Each Instruction's inputs need to live at least until that Instruction issues.
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            OptimizedInstruction& inst = optimized[id];
            for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                // (We're walking in order, so this is the same as max()ing with the existing Val.)
                if (arg != NA) { optimized[arg].death = id; }
            }
        }

        // Mark which values don't depend on the loop and can be hoisted.
        for (OptimizedInstruction& inst : optimized) {
            // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
            if (is_always_varying(inst.op) || is_trace(inst.op)) {
                inst.can_hoist = false;
            }

            // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
            if (inst.can_hoist) {
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; }
                }
            }
        }

        // Extend the lifetime of any hoisted value that's used in the loop to infinity.
        for (OptimizedInstruction& inst : optimized) {
            if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) {
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA && optimized[arg].can_hoist) {
                        optimized[arg].death = (Val)program.size();
                    }
                }
            }
        }

        return optimized;
    }

    std::vector<OptimizedInstruction> Builder::optimize() const {
        std::vector<Instruction> program = this->program();
        program = eliminate_dead_code(std::move(program));
        return    finalize           (std::move(program));
    }

    Program Builder::done(const char* debug_name, bool allow_jit) const {
        char buf[64] = "skvm-jit-";
        if (!debug_name) {
            *SkStrAppendU32(buf+9, this->hash()) = '\0';
            debug_name = buf;
        }

        return {this->optimize(), fStrides, debug_name, allow_jit};
    }

    uint64_t Builder::hash() const {
        uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
                 hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
        return (uint64_t)lo | (uint64_t)hi << 32;
    }

    bool operator!=(Ptr a, Ptr b) { return a.ix != b.ix; }

    bool operator==(const Instruction& a, const Instruction& b) {
        return a.op   == b.op
            && a.x    == b.x
            && a.y    == b.y
            && a.z    == b.z
            && a.w    == b.w
            && a.immA == b.immA
            && a.immB == b.immB
            && a.immC == b.immC;
    }

    uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
        return SkOpts::hash(&inst, sizeof(inst), seed);
    }

    // Most instructions produce a value and return it by ID,
    // the value-producing instruction's own index in the program vector.
    Val Builder::push(Instruction inst) {
        // Basic common subexpression elimination:
        //   if we've already seen this exact Instruction, use it instead of creating a new one.
        //
        // But we never dedup loads or stores: an intervening store could change that memory.
        // Uniforms and gathers touch only uniform memory, so they're fine to dedup,
        // and index is varying but doesn't touch memory, so it's fine to dedup too.
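        // (An illustrative sketch with made-up ids: pushing {Op::add_f32, v1,v2} twice
        // returns the same Val both times, while pushing {Op::load32, ptr0} twice creates
        // two distinct Instructions, since a store32 to ptr0 could land between them.)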
        if (!touches_varying_memory(inst.op) && !is_trace(inst.op)) {
            if (Val* id = fIndex.find(inst)) {
                return *id;
            }
        }
        Val id = static_cast<Val>(fProgram.size());
        fProgram.push_back(inst);
        fIndex.set(inst, id);
        return id;
    }

    Ptr Builder::arg(int stride) {
        int ix = (int)fStrides.size();
        fStrides.push_back(stride);
        return {ix};
    }

    void Builder::assert_true(I32 cond, I32 debug) {
    #ifdef SK_DEBUG
        int imm;
        if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
        (void)push(Op::assert_true, cond.id, debug.id);
    #endif
    }

    void Builder::trace_line(I32 mask, int line) {
        if (this->isImm(mask.id, 0)) { return; }
        (void)push(Op::trace_line, mask.id,NA,NA,NA, line);
    }
    void Builder::trace_var(I32 mask, int slot, I32 val) {
        if (this->isImm(mask.id, 0)) { return; }
        (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeInt.bits);
    }
    void Builder::trace_var(I32 mask, int slot, F32 val) {
        if (this->isImm(mask.id, 0)) { return; }
        (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeFloat.bits);
    }
    void Builder::trace_var(I32 mask, int slot, bool b) {
        if (this->isImm(mask.id, 0)) { return; }
        I32 val = b ? this->splat(1) : this->splat(0);
        (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeBool.bits);
    }
    void Builder::trace_call_enter(I32 mask, int line) {
        if (this->isImm(mask.id, 0)) { return; }
        (void)push(Op::trace_call, mask.id,NA,NA,NA, line, kCallTypeEnter.bits);
    }
    void Builder::trace_call_exit(I32 mask, int line) {
        if (this->isImm(mask.id, 0)) { return; }
        (void)push(Op::trace_call, mask.id,NA,NA,NA, line, kCallTypeExit.bits);
    }

    void Builder::store8 (Ptr ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA,NA, ptr.ix); }
    void Builder::store16(Ptr ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA,NA, ptr.ix); }
    void Builder::store32(Ptr ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA,NA, ptr.ix); }
    void Builder::store64(Ptr ptr, I32 lo, I32 hi) {
        (void)push(Op::store64, lo.id,hi.id,NA,NA, ptr.ix);
    }
    void Builder::store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w) {
        (void)push(Op::store128, x.id,y.id,z.id,w.id, ptr.ix);
    }

    I32 Builder::index() { return {this, push(Op::index)}; }

    I32 Builder::load8 (Ptr ptr) { return {this, push(Op::load8 , NA,NA,NA,NA, ptr.ix) }; }
    I32 Builder::load16(Ptr ptr) { return {this, push(Op::load16, NA,NA,NA,NA, ptr.ix) }; }
    I32 Builder::load32(Ptr ptr) { return {this, push(Op::load32, NA,NA,NA,NA, ptr.ix) }; }
    I32 Builder::load64(Ptr ptr, int lane) {
        return {this, push(Op::load64 , NA,NA,NA,NA, ptr.ix,lane) };
    }
    I32 Builder::load128(Ptr ptr, int lane) {
        return {this, push(Op::load128, NA,NA,NA,NA, ptr.ix,lane) };
    }

    I32 Builder::gather8 (UPtr ptr, int offset, I32 index) {
        return {this, push(Op::gather8 , index.id,NA,NA,NA, ptr.ix,offset)};
    }
    I32 Builder::gather16(UPtr ptr, int offset, I32 index) {
        return {this, push(Op::gather16, index.id,NA,NA,NA, ptr.ix,offset)};
    }
    I32 Builder::gather32(UPtr ptr, int offset, I32 index) {
        return {this, push(Op::gather32, index.id,NA,NA,NA, ptr.ix,offset)};
    }

    I32 Builder::uniform32(UPtr ptr, int offset) {
        return {this, push(Op::uniform32, NA,NA,NA,NA, ptr.ix, offset)};
    }

    // Note: this converts the array index into a byte offset for the op.
    I32 Builder::array32 (UPtr ptr, int offset, int index) {
        return {this, push(Op::array32, NA,NA,NA,NA, ptr.ix, offset, index * sizeof(int))};
    }

    I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA,NA, n) }; }

    // Be careful peepholing float math!  Transformations you might expect to
    // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
    // Float peepholes must pass this equivalence test for all ~4B floats:
    //
    //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
    //
    //     unsigned bits = 0;
    //     do {
    //         float f;
    //         memcpy(&f, &bits, 4);
    //         if (!equiv(f, ...)) {
    //             abort();
    //         }
    //     } while (++bits != 0);

    F32 Builder::add(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
        if (this->isImm(x.id, 0.0f)) { return y; }   // 0+y == y

        if (fFeatures.fma) {
            if (fProgram[x.id].op == Op::mul_f32) {
                return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            if (fProgram[y.id].op == Op::mul_f32) {
                return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        return {this, this->push(Op::add_f32, x.id, y.id)};
    }

    F32 Builder::sub(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
        if (fFeatures.fma) {
            if (fProgram[x.id].op == Op::mul_f32) {
                return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            if (fProgram[y.id].op == Op::mul_f32) {
                return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        return {this, this->push(Op::sub_f32, x.id, y.id)};
    }

    F32 Builder::mul(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        if (this->isImm(y.id, 1.0f)) { return x; }   // x*1 == x
        if (this->isImm(x.id, 1.0f)) { return y; }   // 1*y == y
        return {this, this->push(Op::mul_f32, x.id, y.id)};
    }

    F32 Builder::fast_mul(F32 x, F32 y) {
        if (this->isImm(x.id, 0.0f) || this->isImm(y.id, 0.0f)) { return splat(0.0f); }
        return mul(x,y);
    }

    F32 Builder::div(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(sk_ieee_float_divide(X,Y)); }
        if (this->isImm(y.id, 1.0f)) { return x; }   // x/1 == x
        return {this, this->push(Op::div_f32, x.id, y.id)};
    }

    F32 Builder::sqrt(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); }
        return {this, this->push(Op::sqrt_f32, x.id)};
    }
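
    // The approx_*() helpers below trade accuracy for speed. Note (our summary, not from
    // the original comments) that approx_powf() composes the next two functions via the
    // identity pow(x,y) = 2^(y * log2(x)).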
    // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
    F32 Builder::approx_log2(F32 x) {
        // e - 127 is a fair approximation of log2(x) in its own right...
        F32 e = mul(to_F32(pun_to_I32(x)), splat(1.0f / (1<<23)));

        // ... but using the mantissa to refine its error is _much_ better.
        F32 m = pun_to_F32(bit_or(bit_and(pun_to_I32(x), 0x007fffff),
                                  0x3f000000));
        F32 approx = sub(e, 124.225514990f);
            approx = sub(approx, mul(1.498030302f, m));
            approx = sub(approx, div(1.725879990f, add(0.3520887068f, m)));

        return approx;
    }

    F32 Builder::approx_pow2(F32 x) {
        F32 f = fract(x);
        F32 approx = add(x, 121.274057500f);
            approx = sub(approx, mul( 1.490129070f, f));
            approx = add(approx, div(27.728023300f, sub(4.84252568f, f)));

        return pun_to_F32(round(mul(1.0f * (1<<23), approx)));
    }

    F32 Builder::approx_powf(F32 x, F32 y) {
        // TODO: assert this instead?  Sometimes x is very slightly negative.  See skia:10210.
        x = max(0.0f, x);

        auto is_x = bit_or(eq(x, 0.0f),
                           eq(x, 1.0f));
        return select(is_x, x, approx_pow2(mul(approx_log2(x), y)));
    }

    // Bhaskara I's sine approximation
    //     16x(pi - x) / (5*pi^2 - 4x(pi - x))
    // ... divide numerator and denominator by 4:
    //     4x(pi - x) / (5*pi^2/4 - x(pi - x))
    //
    // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get
    // radians into that range first.
    F32 Builder::approx_sin(F32 radians) {
        constexpr float Pi = SK_ScalarPI;
        // x = radians mod 2pi
        F32 x = fract(radians * (0.5f/Pi)) * (2*Pi);
        I32 neg = x > Pi;   // are we pi < x < 2pi --> need to negate result
        x = select(neg, x - Pi, x);

        F32 pair = x * (Pi - x);
        x = 4.0f * pair / ((5*Pi*Pi/4) - pair);
        x = select(neg, -x, x);
        return x;
    }

    /*  "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
        https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf

        approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9

        Some simplifications:
        1. tan(x) is periodic, -PI/2 < x < PI/2
        2. tan(x) is odd, so tan(-x) = -tan(x)
        3. Our polynomial approximation is best near zero, so we use the following identity
                          tan(x) + tan(y)
           tan(x + y) = -------------------
                        1 - tan(x)*tan(y)

           tan(PI/4) = 1

           So for x > PI/8, we do the following refactor:
              x' = x - PI/4

                        1 + tan(x')
              tan(x) = -------------
                        1 - tan(x')
    */
    F32 Builder::approx_tan(F32 x) {
        constexpr float Pi = SK_ScalarPI;
        // periodic between -pi/2 ... pi/2
        // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
        x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2);

        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);

        // minimize total error by shifting if x > pi/8
        I32 use_quotient = (x > (Pi/8));
        x = select(use_quotient, x - (Pi/4), x);

        // 9th order poly = 4th order(x^2) * x
        x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x;
        x = select(use_quotient, (1+x)/(1-x), x);
        x = select(neg, -x, x);
        return x;
    }

    // http://mathforum.org/library/drmath/view/54137.html
    // referencing Handbook of Mathematical Functions,
    //             by Milton Abramowitz and Irene Stegun
    F32 Builder::approx_asin(F32 x) {
        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);
        x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f);
        x = select(neg, -x, x);
        return x;
    }

    /*  Use 4th order polynomial approximation from https://arachnoid.com/polysolve/
     *  with 129 values of x,atan(x) for x:[0...1]
     *  This only works for 0 <= x <= 1
     */
    static F32 approx_atan_unit(F32 x) {
        // for now we might be given NaN... let that through
        x->assert_true((x != x) | ((x >= 0) & (x <= 1)));
        return poly(x,  0.14130025741326729f,
                       -0.34312835980675116f,
                       -0.016172900528248768f,
                        1.0037696976200385f,
                       -0.00014758242182738969f);
    }

    /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
     */
    F32 Builder::approx_atan(F32 x) {
        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);
        I32 flip = (x > 1.0f);
        x = select(flip, 1/x, x);
        x = approx_atan_unit(x);
        x = select(flip, SK_ScalarPI/2 - x, x);
        x = select(neg, -x, x);
        return x;
    }

    /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
     *  By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit()
     *  which avoids a 2nd divide instruction if we had instead called atan().
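     *
     *  A worked example (ours, not from the original comment): atan2(3,1) has |y0| > |x0|,
     *  so we flip to arg = 1/3 and take atan_unit(1/3); undoing the flip gives
     *  pi/2 - atan(1/3) == atan(3), and since x0 > 0 no quadrant adjustment is needed.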
     */
    F32 Builder::approx_atan2(F32 y0, F32 x0) {

        I32 flip = (abs(y0) > abs(x0));
        F32 y = select(flip, x0, y0);
        F32 x = select(flip, y0, x0);
        F32 arg = y/x;

        I32 neg = (arg < 0.0f);
        arg = select(neg, -arg, arg);

        F32 r = approx_atan_unit(arg);
        r = select(flip, SK_ScalarPI/2 - r, r);
        r = select(neg, -r, r);

        // handle quadrant distinctions
        r = select((y0 >= 0) & (x0  < 0), r + SK_ScalarPI, r);
        r = select((y0  < 0) & (x0 <= 0), r - SK_ScalarPI, r);
        // Note: we don't try to handle 0,0 or infinities (yet)
        return r;
    }

    F32 Builder::min(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); }
        return {this, this->push(Op::min_f32, x.id, y.id)};
    }
    F32 Builder::max(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); }
        return {this, this->push(Op::max_f32, x.id, y.id)};
    }

    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::add(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        if (this->isImm(x.id, 0)) { return y; }
        if (this->isImm(y.id, 0)) { return x; }
        return {this, this->push(Op::add_i32, x.id, y.id)};
    }
    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::sub(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0)) { return x; }
        return {this, this->push(Op::sub_i32, x.id, y.id)};
    }
    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::mul(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        if (this->isImm(x.id, 0)) { return splat(0); }
        if (this->isImm(y.id, 0)) { return splat(0); }
        if (this->isImm(x.id, 1)) { return y; }
        if (this->isImm(y.id, 1)) { return x; }
        return {this, this->push(Op::mul_i32, x.id, y.id)};
    }

    SK_ATTRIBUTE(no_sanitize("shift"))
    I32 Builder::shl(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X << bits); }
        return {this, this->push(Op::shl_i32, x.id,NA,NA,NA, bits)};
    }
    I32 Builder::shr(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); }
        return {this, this->push(Op::shr_i32, x.id,NA,NA,NA, bits)};
    }
    I32 Builder::sra(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); }
        return {this, this->push(Op::sra_i32, x.id,NA,NA,NA, bits)};
    }

    I32 Builder:: eq(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
        return {this, this->push(Op::eq_f32, x.id, y.id)};
    }
    I32 Builder::neq(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
        return {this, this->push(Op::neq_f32, x.id, y.id)};
    }
    I32 Builder::lt(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); }
        return {this, this->push(Op::gt_f32, y.id, x.id)};
    }
    I32 Builder::lte(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); }
        return {this, this->push(Op::gte_f32, y.id, x.id)};
    }
    I32 Builder::gt(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
        return {this, this->push(Op::gt_f32, x.id, y.id)};
    }
    I32 Builder::gte(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
        return {this, this->push(Op::gte_f32, x.id, y.id)};
    }

    I32 Builder:: eq(I32 x, I32 y) {
        if (x.id == y.id) { return splat(~0); }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
        return {this, this->push(Op:: eq_i32, x.id, y.id)};
    }
    I32 Builder::neq(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
        return ~(x == y);
    }
    I32 Builder:: gt(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
        return {this, this->push(Op:: gt_i32, x.id, y.id)};
    }
    I32 Builder::gte(I32 x, I32 y) {
        if (x.id == y.id) { return splat(~0); }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
        return ~(x < y);
    }
    I32 Builder:: lt(I32 x, I32 y) { return y>x; }
    I32 Builder::lte(I32 x, I32 y) { return y>=x; }

    I32 Builder::bit_and(I32 x, I32 y) {
        if (x.id == y.id) { return x; }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); }
        if (this->isImm(y.id, 0)) { return splat(0); }   // (x & false) == false
        if (this->isImm(x.id, 0)) { return splat(0); }   // (false & y) == false
        if (this->isImm(y.id,~0)) { return x; }          // (x & true) == x
        if (this->isImm(x.id,~0)) { return y; }          // (true & y) == y
        return {this, this->push(Op::bit_and, x.id, y.id)};
    }
    I32 Builder::bit_or(I32 x, I32 y) {
        if (x.id == y.id) { return x; }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); }
        if (this->isImm(y.id, 0)) { return x; }           // (x | false) == x
        if (this->isImm(x.id, 0)) { return y; }           // (false | y) == y
        if (this->isImm(y.id,~0)) { return splat(~0); }   // (x | true) == true
        if (this->isImm(x.id,~0)) { return splat(~0); }   // (true | y) == true
        return {this, this->push(Op::bit_or, x.id, y.id)};
    }
    I32 Builder::bit_xor(I32 x, I32 y) {
        if (x.id == y.id) { return splat(0); }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); }
        if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
        if (this->isImm(x.id, 0)) { return y; }   // (false ^ y) == y
        return {this, this->push(Op::bit_xor, x.id, y.id)};
    }

    I32 Builder::bit_clear(I32 x, I32 y) {
        if (x.id == y.id) { return splat(0); }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); }
        if (this->isImm(y.id, 0)) { return x; }          // (x & ~false) == x
        if (this->isImm(y.id,~0)) { return splat(0); }   // (x & ~true) == false
        if (this->isImm(x.id, 0)) { return splat(0); }   // (false & ~y) == false
        return {this, this->push(Op::bit_clear, x.id, y.id)};
    }
    I32 Builder::select(I32 x, I32 y, I32 z) {
        if (y.id == z.id) { return y; }
        if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); }
        if (this->isImm(x.id,~0)) { return y; }               // true  ? y : z == y
        if (this->isImm(x.id, 0)) { return z; }               // false ? y : z == z
        if (this->isImm(y.id, 0)) { return bit_clear(z,x); }  //     x ? 0 : z == ~x&z
        if (this->isImm(z.id, 0)) { return bit_and  (y,x); }  //     x ? y : 0 ==  x&y
        return {this, this->push(Op::select, x.id, y.id, z.id)};
    }

    I32 Builder::extract(I32 x, int bits, I32 z) {
        if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); }
        return bit_and(z, shr(x, bits));
    }

    I32 Builder::pack(I32 x, I32 y, int bits) {
        return bit_or(x, shl(y, bits));
    }

    F32 Builder::ceil(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); }
        return {this, this->push(Op::ceil, x.id)};
    }
    F32 Builder::floor(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); }
        return {this, this->push(Op::floor, x.id)};
    }
    F32 Builder::to_F32(I32 x) {
        if (int X; this->allImm(x.id,&X)) { return splat((float)X); }
        return {this, this->push(Op::to_f32, x.id)};
    }
    I32 Builder::trunc(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)X); }
        return {this, this->push(Op::trunc, x.id)};
    }
    I32 Builder::round(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); }
        return {this, this->push(Op::round, x.id)};
    }

    I32 Builder::to_fp16(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); }
        return {this, this->push(Op::to_fp16, x.id)};
    }
    F32 Builder::from_fp16(I32 x) {
        if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); }
        return {this, this->push(Op::from_fp16, x.id)};
    }

    F32 Builder::from_unorm(int bits, I32 x) {
        F32 limit = splat(1 / ((1<<bits)-1.0f));
        return mul(to_F32(x), limit);
    }
    I32 Builder::to_unorm(int bits, F32 x) {
        F32 limit = splat((1<<bits)-1.0f);
        return round(mul(x, limit));
    }

    PixelFormat SkColorType_to_PixelFormat(SkColorType ct) {
        auto UNORM = PixelFormat::UNORM,
             SRGB  = PixelFormat::SRGB,
             FLOAT = PixelFormat::FLOAT;
        switch (ct) {
            case kUnknown_SkColorType: break;

            case kRGBA_F32_SkColorType: return {FLOAT,32,32,32,32, 0,32,64,96};

            case kRGBA_F16Norm_SkColorType:       return {FLOAT,16,16,16,16, 0,16,32,48};
            case kRGBA_F16_SkColorType:           return {FLOAT,16,16,16,16, 0,16,32,48};
            case kR16G16B16A16_unorm_SkColorType: return {UNORM,16,16,16,16, 0,16,32,48};

            case kA16_float_SkColorType:    return {FLOAT,  0, 0,0,16, 0, 0,0,0};
            case kR16G16_float_SkColorType: return {FLOAT, 16,16,0, 0, 0,16,0,0};

            case kAlpha_8_SkColorType: return {UNORM, 0,0,0,8, 0,0,0,0};
            case kGray_8_SkColorType:  return {UNORM, 8,8,8,0, 0,0,0,0};  // Subtle.
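            // (Our reading of the "Subtle." above: Gray_8's r, g, and b all claim the same
            // 8 bits at shift 0, so the single gray byte fans out to all three channels.)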

            case kRGB_565_SkColorType:   return {UNORM, 5,6,5,0, 11,5,0,0};  // (BGR)
            case kARGB_4444_SkColorType: return {UNORM, 4,4,4,4, 12,8,4,0};  // (ABGR)

            case kRGBA_8888_SkColorType:  return {UNORM, 8,8,8,8,  0,8,16,24};
            case kRGB_888x_SkColorType:   return {UNORM, 8,8,8,0,  0,8,16,32};  // 32-bit
            case kBGRA_8888_SkColorType:  return {UNORM, 8,8,8,8, 16,8, 0,24};
            case kSRGBA_8888_SkColorType: return { SRGB, 8,8,8,8,  0,8,16,24};

            case kRGBA_1010102_SkColorType: return {UNORM, 10,10,10,2,  0,10,20,30};
            case kBGRA_1010102_SkColorType: return {UNORM, 10,10,10,2, 20,10, 0,30};
            case kRGB_101010x_SkColorType:  return {UNORM, 10,10,10,0,  0,10,20, 0};
            case kBGR_101010x_SkColorType:  return {UNORM, 10,10,10,0, 20,10, 0, 0};

            case kR8G8_unorm_SkColorType:   return {UNORM,  8, 8,0, 0, 0, 8,0,0};
            case kR16G16_unorm_SkColorType: return {UNORM, 16,16,0, 0, 0,16,0,0};
            case kA16_unorm_SkColorType:    return {UNORM,  0, 0,0,16, 0, 0,0,0};
        }
        SkASSERT(false);
        return {UNORM, 0,0,0,0, 0,0,0,0};
    }

    static int byte_size(PixelFormat f) {
        // What's the highest bit we read?
        int bits = std::max(f.r_bits + f.r_shift,
                   std::max(f.g_bits + f.g_shift,
                   std::max(f.b_bits + f.b_shift,
                            f.a_bits + f.a_shift)));
        // Round up to bytes.
        return (bits + 7) / 8;
    }

    static Color unpack(PixelFormat f, I32 x) {
        SkASSERT(byte_size(f) <= 4);

        auto from_srgb = [](int bits, I32 channel) -> F32 {
            const skcms_TransferFunction* tf = skcms_sRGB_TransferFunction();
            F32 v = from_unorm(bits, channel);
            return sk_program_transfer_fn(v, sRGBish_TF,
                                          v->splat(tf->g),
                                          v->splat(tf->a),
                                          v->splat(tf->b),
                                          v->splat(tf->c),
                                          v->splat(tf->d),
                                          v->splat(tf->e),
                                          v->splat(tf->f));
        };

        auto unpack_rgb = [=](int bits, int shift) -> F32 {
            I32 channel = extract(x, shift, (1<<bits)-1);
            switch (f.encoding) {
                case PixelFormat::UNORM: return from_unorm(bits, channel);
                case PixelFormat:: SRGB: return from_srgb (bits, channel);
                case PixelFormat::FLOAT: return from_fp16 (      channel);
            }
            SkUNREACHABLE;
        };
        auto unpack_alpha = [=](int bits, int shift) -> F32 {
            I32 channel = extract(x, shift, (1<<bits)-1);
            switch (f.encoding) {
                case PixelFormat::UNORM:
                case PixelFormat:: SRGB: return from_unorm(bits, channel);
                case PixelFormat::FLOAT: return from_fp16 (      channel);
            }
            SkUNREACHABLE;
        };
        return {
            f.r_bits ? unpack_rgb  (f.r_bits, f.r_shift) : x->splat(0.0f),
            f.g_bits ? unpack_rgb  (f.g_bits, f.g_shift) : x->splat(0.0f),
            f.b_bits ? unpack_rgb  (f.b_bits, f.b_shift) : x->splat(0.0f),
            f.a_bits ? unpack_alpha(f.a_bits, f.a_shift) : x->splat(1.0f),
        };
    }

    static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) {
        SkASSERT(byte_size(f) == 8);
        // We assume some of the channels are in the low 32 bits, some in the high 32 bits.
        // The assert on byte_size(lo) will trigger if this assumption is violated.
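        //
        // A worked example (ours): kRGBA_F16's {FLOAT,16,16,16,16, 0,16,32,48} splits into
        // lo = {FLOAT,16,16,0,0, 0,16,32,32} (R,G) and hi = {FLOAT,0,0,16,16, 32,32,0,16}
        // (B,A), each of which byte_size() reports as 4 bytes.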
        *lo = f;
        if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; }
        if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; }
        if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; }
        if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; }
        SkASSERT(byte_size(*lo) == 4);

        *hi = f;
        if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; }
        if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; }
        if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; }
        if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; }
        SkASSERT(byte_size(*hi) == 4);
    }

    // The only 16-byte format we support today is RGBA F32,
    // though, TODO, we could generalize that to any swizzle, and to allow UNORM too.
    static void assert_16byte_is_rgba_f32(PixelFormat f) {
    #if defined(SK_DEBUG)
        SkASSERT(byte_size(f) == 16);
        PixelFormat rgba_f32 = SkColorType_to_PixelFormat(kRGBA_F32_SkColorType);

        SkASSERT(f.encoding == rgba_f32.encoding);

        SkASSERT(f.r_bits == rgba_f32.r_bits);
        SkASSERT(f.g_bits == rgba_f32.g_bits);
        SkASSERT(f.b_bits == rgba_f32.b_bits);
        SkASSERT(f.a_bits == rgba_f32.a_bits);

        SkASSERT(f.r_shift == rgba_f32.r_shift);
        SkASSERT(f.g_shift == rgba_f32.g_shift);
        SkASSERT(f.b_shift == rgba_f32.b_shift);
        SkASSERT(f.a_shift == rgba_f32.a_shift);
    #endif
    }

    Color Builder::load(PixelFormat f, Ptr ptr) {
        switch (byte_size(f)) {
            case 1: return unpack(f, load8 (ptr));
            case 2: return unpack(f, load16(ptr));
            case 4: return unpack(f, load32(ptr));
            case 8: {
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                Color l = unpack(lo, load64(ptr, 0)),
                      h = unpack(hi, load64(ptr, 1));
                return {
                    lo.r_bits ? l.r : h.r,
                    lo.g_bits ? l.g : h.g,
                    lo.b_bits ? l.b : h.b,
                    lo.a_bits ? l.a : h.a,
                };
            }
            case 16: {
                assert_16byte_is_rgba_f32(f);
                return {
                    pun_to_F32(load128(ptr, 0)),
                    pun_to_F32(load128(ptr, 1)),
                    pun_to_F32(load128(ptr, 2)),
                    pun_to_F32(load128(ptr, 3)),
                };
            }
            default: SkUNREACHABLE;
        }
        return {};
    }

    Color Builder::gather(PixelFormat f, UPtr ptr, int offset, I32 index) {
        switch (byte_size(f)) {
            case 1: return unpack(f, gather8 (ptr, offset, index));
            case 2: return unpack(f, gather16(ptr, offset, index));
            case 4: return unpack(f, gather32(ptr, offset, index));
            case 8: {
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)),
                      h = unpack(hi, gather32(ptr, offset, (index<<1)+1));
                return {
                    lo.r_bits ? l.r : h.r,
                    lo.g_bits ? l.g : h.g,
                    lo.b_bits ? l.b : h.b,
                    lo.a_bits ? l.a : h.a,
                };
            }
            case 16: {
                assert_16byte_is_rgba_f32(f);
                return {
                    gatherF(ptr, offset, (index<<2)+0),
                    gatherF(ptr, offset, (index<<2)+1),
                    gatherF(ptr, offset, (index<<2)+2),
                    gatherF(ptr, offset, (index<<2)+3),
                };
            }
            default: SkUNREACHABLE;
        }
        return {};
    }

    static I32 pack32(PixelFormat f, Color c) {
        SkASSERT(byte_size(f) <= 4);

        auto to_srgb = [](int bits, F32 v) {
            const skcms_TransferFunction* tf = skcms_sRGB_Inverse_TransferFunction();
            return to_unorm(bits, sk_program_transfer_fn(v, sRGBish_TF,
                                                         v->splat(tf->g),
                                                         v->splat(tf->a),
                                                         v->splat(tf->b),
                                                         v->splat(tf->c),
                                                         v->splat(tf->d),
                                                         v->splat(tf->e),
                                                         v->splat(tf->f)));
        };

        I32 packed = c->splat(0);
        auto pack_rgb = [&](F32 channel, int bits, int shift) {
            I32 encoded;
            switch (f.encoding) {
                case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break;
                case PixelFormat:: SRGB: encoded = to_srgb (bits, channel); break;
                case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
            }
            packed = pack(packed, encoded, shift);
        };
        auto pack_alpha = [&](F32 channel, int bits, int shift) {
            I32 encoded;
            switch (f.encoding) {
                case PixelFormat::UNORM:
                case PixelFormat:: SRGB: encoded = to_unorm(bits, channel); break;
                case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
            }
            packed = pack(packed, encoded, shift);
        };
        if (f.r_bits) { pack_rgb  (c.r, f.r_bits, f.r_shift); }
        if (f.g_bits) { pack_rgb  (c.g, f.g_bits, f.g_shift); }
        if (f.b_bits) { pack_rgb  (c.b, f.b_bits, f.b_shift); }
        if (f.a_bits) { pack_alpha(c.a, f.a_bits, f.a_shift); }
        return packed;
    }

    void Builder::store(PixelFormat f, Ptr ptr, Color c) {
        // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal.
        if (f.r_bits  == f.g_bits  && f.g_bits  == f.b_bits &&
            f.r_shift == f.g_shift && f.g_shift == f.b_shift) {

            // TODO: pull these coefficients from an SkColorSpace?  This is sRGB luma/luminance.
            c.r = c.r * 0.2126f
                + c.g * 0.7152f
                + c.b * 0.0722f;
            f.g_bits = f.b_bits = 0;
        }

        switch (byte_size(f)) {
            case 1: store8 (ptr, pack32(f,c)); break;
            case 2: store16(ptr, pack32(f,c)); break;
            case 4: store32(ptr, pack32(f,c)); break;
            case 8: {
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                store64(ptr, pack32(lo,c)
                           , pack32(hi,c));
                break;
            }
            case 16: {
                assert_16byte_is_rgba_f32(f);
                store128(ptr, pun_to_I32(c.r), pun_to_I32(c.g), pun_to_I32(c.b), pun_to_I32(c.a));
                break;
            }
            default: SkUNREACHABLE;
        }
    }

    void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
        skvm::F32 invA = 1.0f / a,
                  inf  = pun_to_F32(splat(0x7f800000));
        // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
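        // (Tracing the select below, our note: a == 0 makes invA = +inf, the comparison
        // invA < inf is then false, and invA is replaced with 0; finite invA passes through.)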
        invA = select(invA < inf, invA
                                , 0.0f);
        *r *= invA;
        *g *= invA;
        *b *= invA;
    }

    void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
        *r *= a;
        *g *= a;
        *b *= a;
    }

    Color Builder::uniformColor(SkColor4f color, Uniforms* uniforms) {
        auto [r,g,b,a] = color;
        return {
            uniformF(uniforms->pushF(r)),
            uniformF(uniforms->pushF(g)),
            uniformF(uniforms->pushF(b)),
            uniformF(uniforms->pushF(a)),
        };
    }

    F32 Builder::lerp(F32 lo, F32 hi, F32 t) {
        if (this->isImm(t.id, 0.0f)) { return lo; }
        if (this->isImm(t.id, 1.0f)) { return hi; }
        return mad(sub(hi, lo), t, lo);
    }

    Color Builder::lerp(Color lo, Color hi, F32 t) {
        return {
            lerp(lo.r, hi.r, t),
            lerp(lo.g, hi.g, t),
            lerp(lo.b, hi.b, t),
            lerp(lo.a, hi.a, t),
        };
    }

    HSLA Builder::to_hsla(Color c) {
        F32 mx = max(max(c.r,c.g),c.b),
            mn = min(min(c.r,c.g),c.b),
             d = mx - mn,
          invd = 1.0f / d,
        g_lt_b = select(c.g < c.b, splat(6.0f)
                                 , splat(0.0f));

        F32 h = (1/6.0f) * select(mx == mn,  0.0f,
                           select(mx == c.r, invd * (c.g - c.b) + g_lt_b,
                           select(mx == c.g, invd * (c.b - c.r) + 2.0f
                                           , invd * (c.r - c.g) + 4.0f)));

        F32 sum = mx + mn,
              l = sum * 0.5f,
              s = select(mx == mn, 0.0f
                                 , d / select(l > 0.5f, 2.0f - sum
                                                      , sum));
        return {h, s, l, c.a};
    }

    Color Builder::to_rgba(HSLA c) {
        // See GrRGBToHSLFilterEffect.fp

        auto [h,s,l,a] = c;
        F32 x = s * (1.0f - abs(l + l - 1.0f));

        auto hue_to_rgb = [&,l=l](auto hue) {
            auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f;
            return x * (clamp01(q) - 0.5f) + l;
        };

        return {
            hue_to_rgb(h + 0/3.0f),
            hue_to_rgb(h + 2/3.0f),
            hue_to_rgb(h + 1/3.0f),
            c.a,
        };
    }

    // We're basing our implementation of non-separable blend modes on
    //     https://www.w3.org/TR/compositing-1/#blendingnonseparable.
    // and
    //     https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
    // They're equivalent, but ES' math has been better simplified.
    //
    // Anything extra we add beyond that is to make the math work with premul inputs.

    static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
        return max(r, max(g, b))
             - min(r, min(g, b));
    }

    static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
        return r*0.30f + g*0.59f + b*0.11f;
    }

    static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
        F32 mn  = min(*r, min(*g, *b)),
            mx  = max(*r, max(*g, *b)),
            sat = mx - mn;

        // Map min channel to 0, max channel to s, and scale the middle proportionally.
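        // (A worked example, ours: (r,g,b) = (0.2, 0.5, 0.8) with s = 0.3 gives mn = 0.2
        // and sat = 0.6, so scale() maps the channels to (0, 0.15, 0.3): min to 0, max to s.)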
1533 auto scale = [&](skvm::F32 c) { 1534 auto scaled = ((c - mn) * s) / sat; 1535 return select(is_finite(scaled), scaled, 0.0f); 1536 }; 1537 *r = scale(*r); 1538 *g = scale(*g); 1539 *b = scale(*b); 1540 } 1541 set_lum(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 lu)1542 static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) { 1543 auto diff = lu - luminance(*r, *g, *b); 1544 *r += diff; 1545 *g += diff; 1546 *b += diff; 1547 } 1548 clip_color(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 a)1549 static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) { 1550 F32 mn = min(*r, min(*g, *b)), 1551 mx = max(*r, max(*g, *b)), 1552 lu = luminance(*r, *g, *b); 1553 1554 auto clip = [&](auto c) { 1555 c = select(mn >= 0, c 1556 , lu + ((c-lu)*( lu)) / (lu-mn)); 1557 c = select(mx > a, lu + ((c-lu)*(a-lu)) / (mx-lu) 1558 , c); 1559 return clamp01(c); // May be a little negative, or worse, NaN. 1560 }; 1561 *r = clip(*r); 1562 *g = clip(*g); 1563 *b = clip(*b); 1564 } 1565 blend(SkBlendMode mode,Color src,Color dst)1566 Color Builder::blend(SkBlendMode mode, Color src, Color dst) { 1567 auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) { 1568 return x*y + z*w; 1569 }; 1570 1571 auto two = [](skvm::F32 x) { return x+x; }; 1572 1573 auto apply_rgba = [&](auto fn) { 1574 return Color { 1575 fn(src.r, dst.r), 1576 fn(src.g, dst.g), 1577 fn(src.b, dst.b), 1578 fn(src.a, dst.a), 1579 }; 1580 }; 1581 1582 auto apply_rgb_srcover_a = [&](auto fn) { 1583 return Color { 1584 fn(src.r, dst.r), 1585 fn(src.g, dst.g), 1586 fn(src.b, dst.b), 1587 mad(dst.a, 1-src.a, src.a), // srcover for alpha 1588 }; 1589 }; 1590 1591 auto non_sep = [&](auto R, auto G, auto B) { 1592 return Color{ 1593 R + mma(src.r, 1-dst.a, dst.r, 1-src.a), 1594 G + mma(src.g, 1-dst.a, dst.g, 1-src.a), 1595 B + mma(src.b, 1-dst.a, dst.b, 1-src.a), 1596 mad(dst.a, 1-src.a, src.a), // srcover for alpha 1597 }; 1598 }; 1599 1600 switch (mode) { 1601 default: 1602 SkASSERT(false); 1603 [[fallthrough]]; /*but also, for safety, fallthrough*/ 1604 1605 case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) }; 1606 1607 case SkBlendMode::kSrc: return src; 1608 case SkBlendMode::kDst: return dst; 1609 1610 case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]]; 1611 case SkBlendMode::kSrcOver: 1612 return apply_rgba([&](auto s, auto d) { 1613 return mad(d,1-src.a, s); 1614 }); 1615 1616 case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]]; 1617 case SkBlendMode::kSrcIn: 1618 return apply_rgba([&](auto s, auto d) { 1619 return s * dst.a; 1620 }); 1621 1622 case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]]; 1623 1624 case SkBlendMode::kSrcOut: 1625 return apply_rgba([&](auto s, auto d) { 1626 return s * (1-dst.a); 1627 }); 1628 1629 case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]]; 1630 case SkBlendMode::kSrcATop: 1631 return apply_rgba([&](auto s, auto d) { 1632 return mma(s, dst.a, d, 1-src.a); 1633 }); 1634 1635 case SkBlendMode::kXor: 1636 return apply_rgba([&](auto s, auto d) { 1637 return mma(s, 1-dst.a, d, 1-src.a); 1638 }); 1639 1640 case SkBlendMode::kPlus: 1641 return apply_rgba([&](auto s, auto d) { 1642 return min(s+d, 1.0f); 1643 }); 1644 1645 case SkBlendMode::kModulate: 1646 return apply_rgba([&](auto s, auto d) { 1647 return s * d; 1648 }); 1649 1650 case SkBlendMode::kScreen: 1651 // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts. 
1652 // It's kind of plausible that s + (d - sd) keeps more precision? 1653 return apply_rgba([&](auto s, auto d) { 1654 return s + (d - s*d); 1655 }); 1656 1657 case SkBlendMode::kDarken: 1658 return apply_rgb_srcover_a([&](auto s, auto d) { 1659 return s + (d - max(s * dst.a, 1660 d * src.a)); 1661 }); 1662 1663 case SkBlendMode::kLighten: 1664 return apply_rgb_srcover_a([&](auto s, auto d) { 1665 return s + (d - min(s * dst.a, 1666 d * src.a)); 1667 }); 1668 1669 case SkBlendMode::kDifference: 1670 return apply_rgb_srcover_a([&](auto s, auto d) { 1671 return s + (d - two(min(s * dst.a, 1672 d * src.a))); 1673 }); 1674 1675 case SkBlendMode::kExclusion: 1676 return apply_rgb_srcover_a([&](auto s, auto d) { 1677 return s + (d - two(s * d)); 1678 }); 1679 1680 case SkBlendMode::kColorBurn: 1681 return apply_rgb_srcover_a([&](auto s, auto d) { 1682 auto mn = min(dst.a, 1683 src.a * (dst.a - d) / s), 1684 burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a); 1685 return select(d == dst.a , s * (1-dst.a) + d, 1686 select(is_finite(burn), burn 1687 , d * (1-src.a) + s)); 1688 }); 1689 1690 case SkBlendMode::kColorDodge: 1691 return apply_rgb_srcover_a([&](auto s, auto d) { 1692 auto dodge = src.a * min(dst.a, 1693 d * src.a / (src.a - s)) 1694 + mma(s, 1-dst.a, d, 1-src.a); 1695 return select(d == 0.0f , s * (1-dst.a) + d, 1696 select(is_finite(dodge), dodge 1697 , d * (1-src.a) + s)); 1698 }); 1699 1700 case SkBlendMode::kHardLight: 1701 return apply_rgb_srcover_a([&](auto s, auto d) { 1702 return mma(s, 1-dst.a, d, 1-src.a) + 1703 select(two(s) <= src.a, 1704 two(s * d), 1705 src.a * dst.a - two((dst.a - d) * (src.a - s))); 1706 }); 1707 1708 case SkBlendMode::kOverlay: 1709 return apply_rgb_srcover_a([&](auto s, auto d) { 1710 return mma(s, 1-dst.a, d, 1-src.a) + 1711 select(two(d) <= dst.a, 1712 two(s * d), 1713 src.a * dst.a - two((dst.a - d) * (src.a - s))); 1714 }); 1715 1716 case SkBlendMode::kMultiply: 1717 return apply_rgba([&](auto s, auto d) { 1718 return mma(s, 1-dst.a, d, 1-src.a) + s * d; 1719 }); 1720 1721 case SkBlendMode::kSoftLight: 1722 return apply_rgb_srcover_a([&](auto s, auto d) { 1723 auto m = select(dst.a > 0.0f, d / dst.a 1724 , 0.0f), 1725 s2 = two(s), 1726 m4 = 4*m; 1727 1728 // The logic forks three ways: 1729 // 1. dark src? 1730 // 2. light src, dark dst? 1731 // 3. light src, light dst? 1732 1733 // Used in case 1 1734 auto darkSrc = d * ((s2-src.a) * (1-m) + src.a), 1735 // Used in case 2 1736 darkDst = (m4 * m4 + m4) * (m-1) + 7*m, 1737 // Used in case 3. 1738 liteDst = sqrt(m) - m, 1739 // Used in 2 or 3? 
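    // (Answer: liteSrc serves both cases 2 and 3; the select on 4*d <= dst.a
    //  just below picks darkDst for case 2 and liteDst for case 3.)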
1740 liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst 1741 , liteDst) 1742 + d * src.a; 1743 return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc 1744 , liteSrc); 1745 }); 1746 1747 case SkBlendMode::kHue: { 1748 skvm::F32 R = src.r * src.a, 1749 G = src.g * src.a, 1750 B = src.b * src.a; 1751 1752 set_sat (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b)); 1753 set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b)); 1754 clip_color(&R, &G, &B, src.a * dst.a); 1755 1756 return non_sep(R, G, B); 1757 } 1758 1759 case SkBlendMode::kSaturation: { 1760 skvm::F32 R = dst.r * src.a, 1761 G = dst.g * src.a, 1762 B = dst.b * src.a; 1763 1764 set_sat (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b)); 1765 set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b)); 1766 clip_color(&R, &G, &B, src.a * dst.a); 1767 1768 return non_sep(R, G, B); 1769 } 1770 1771 case SkBlendMode::kColor: { 1772 skvm::F32 R = src.r * dst.a, 1773 G = src.g * dst.a, 1774 B = src.b * dst.a; 1775 1776 set_lum (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b)); 1777 clip_color(&R, &G, &B, src.a * dst.a); 1778 1779 return non_sep(R, G, B); 1780 } 1781 1782 case SkBlendMode::kLuminosity: { 1783 skvm::F32 R = dst.r * src.a, 1784 G = dst.g * src.a, 1785 B = dst.b * src.a; 1786 1787 set_lum (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b)); 1788 clip_color(&R, &G, &B, dst.a * src.a); 1789 1790 return non_sep(R, G, B); 1791 } 1792 } 1793 } 1794 1795 // ~~~~ Program::eval() and co. ~~~~ // 1796 1797 // Handy references for x86-64 instruction encoding: 1798 // https://wiki.osdev.org/X86-64_Instruction_Encoding 1799 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm 1800 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm 1801 // http://ref.x86asm.net/coder64.html 1802 1803 // Used for ModRM / immediate instruction encoding. _233(int a,int b,int c)1804 static uint8_t _233(int a, int b, int c) { 1805 return (a & 3) << 6 1806 | (b & 7) << 3 1807 | (c & 7) << 0; 1808 } 1809 1810 // ModRM byte encodes the arguments of an opcode. 1811 enum class Mod { Indirect, OneByteImm, FourByteImm, Direct }; mod_rm(Mod mod,int reg,int rm)1812 static uint8_t mod_rm(Mod mod, int reg, int rm) { 1813 return _233((int)mod, reg, rm); 1814 } 1815 mod(int imm)1816 static Mod mod(int imm) { 1817 if (imm == 0) { return Mod::Indirect; } 1818 if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; } 1819 return Mod::FourByteImm; 1820 } 1821 imm_bytes(Mod mod)1822 static int imm_bytes(Mod mod) { 1823 switch (mod) { 1824 case Mod::Indirect: return 0; 1825 case Mod::OneByteImm: return 1; 1826 case Mod::FourByteImm: return 4; 1827 case Mod::Direct: SkUNREACHABLE; 1828 } 1829 SkUNREACHABLE; 1830 } 1831 1832 // SIB byte encodes a memory address, base + (index * scale). sib(Assembler::Scale scale,int index,int base)1833 static uint8_t sib(Assembler::Scale scale, int index, int base) { 1834 return _233((int)scale, index, base); 1835 } 1836 1837 // The REX prefix is used to extend most old 32-bit instructions to 64-bit. rex(bool W,bool R,bool X,bool B)1838 static uint8_t rex(bool W, // If set, operation is 64-bit, otherwise default, usually 32-bit. 1839 bool R, // Extra top bit to select ModRM reg, registers 8-15. 1840 bool X, // Extra top bit for SIB index register. 1841 bool B) { // Extra top bit for SIB base or ModRM rm register. 1842 return 0b01000000 // Fixed 0100 for top four bits. 
1843 | (W << 3) 1844 | (R << 2) 1845 | (X << 1) 1846 | (B << 0); 1847 } 1848 1849 1850 // The VEX prefix extends SSE operations to AVX. Used generally, even with XMM. 1851 struct VEX { 1852 int len; 1853 uint8_t bytes[3]; 1854 }; 1855 vex(bool WE,bool R,bool X,bool B,int map,int vvvv,bool L,int pp)1856 static VEX vex(bool WE, // Like REX W for int operations, or opcode extension for float? 1857 bool R, // Same as REX R. Pass high bit of dst register, dst>>3. 1858 bool X, // Same as REX X. 1859 bool B, // Same as REX B. Pass y>>3 for 3-arg ops, x>>3 for 2-arg. 1860 int map, // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f. 1861 int vvvv, // 4-bit second operand register. Pass our x for 3-arg ops. 1862 bool L, // Set for 256-bit ymm operations, off for 128-bit xmm. 1863 int pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none. 1864 1865 // Pack x86 opcode map selector to 5-bit VEX encoding. 1866 map = [map]{ 1867 switch (map) { 1868 case 0x0f: return 0b00001; 1869 case 0x380f: return 0b00010; 1870 case 0x3a0f: return 0b00011; 1871 // Several more cases only used by XOP / TBM. 1872 } 1873 SkUNREACHABLE; 1874 }(); 1875 1876 // Pack mandatory SSE opcode prefix byte to 2-bit VEX encoding. 1877 pp = [pp]{ 1878 switch (pp) { 1879 case 0x66: return 0b01; 1880 case 0xf3: return 0b10; 1881 case 0xf2: return 0b11; 1882 } 1883 return 0b00; 1884 }(); 1885 1886 VEX vex = {0, {0,0,0}}; 1887 if (X == 0 && B == 0 && WE == 0 && map == 0b00001) { 1888 // With these conditions met, we can optionally compress VEX to 2-byte. 1889 vex.len = 2; 1890 vex.bytes[0] = 0xc5; 1891 vex.bytes[1] = (pp & 3) << 0 1892 | (L & 1) << 2 1893 | (~vvvv & 15) << 3 1894 | (~(int)R & 1) << 7; 1895 } else { 1896 // We could use this 3-byte VEX prefix all the time if we like. 1897 vex.len = 3; 1898 vex.bytes[0] = 0xc4; 1899 vex.bytes[1] = (map & 31) << 0 1900 | (~(int)B & 1) << 5 1901 | (~(int)X & 1) << 6 1902 | (~(int)R & 1) << 7; 1903 vex.bytes[2] = (pp & 3) << 0 1904 | (L & 1) << 2 1905 | (~vvvv & 15) << 3 1906 | (WE & 1) << 7; 1907 } 1908 return vex; 1909 } 1910 Assembler(void * buf)1911 Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fSize(0) {} 1912 size() const1913 size_t Assembler::size() const { return fSize; } 1914 bytes(const void * p,int n)1915 void Assembler::bytes(const void* p, int n) { 1916 if (fCode) { 1917 memcpy(fCode+fSize, p, n); 1918 } 1919 fSize += n; 1920 } 1921 byte(uint8_t b)1922 void Assembler::byte(uint8_t b) { this->bytes(&b, 1); } word(uint32_t w)1923 void Assembler::word(uint32_t w) { this->bytes(&w, 4); } 1924 align(int mod)1925 void Assembler::align(int mod) { 1926 while (this->size() % mod) { 1927 this->byte(0x00); 1928 } 1929 } 1930 int3()1931 void Assembler::int3() { 1932 this->byte(0xcc); 1933 } 1934 vzeroupper()1935 void Assembler::vzeroupper() { 1936 this->byte(0xc5); 1937 this->byte(0xf8); 1938 this->byte(0x77); 1939 } ret()1940 void Assembler::ret() { this->byte(0xc3); } 1941 op(int opcode,Operand dst,GP64 x)1942 void Assembler::op(int opcode, Operand dst, GP64 x) { 1943 if (dst.kind == Operand::REG) { 1944 this->byte(rex(W1,x>>3,0,dst.reg>>3)); 1945 this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2); 1946 this->byte(mod_rm(Mod::Direct, x, dst.reg&7)); 1947 } else { 1948 SkASSERT(dst.kind == Operand::MEM); 1949 const Mem& m = dst.mem; 1950 const bool need_SIB = (m.base&7) == rsp 1951 || m.index != rsp; 1952 1953 this->byte(rex(W1,x>>3,m.index>>3,m.base>>3)); 1954 this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2); 1955 this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? 
rsp : m.base)&7)); 1956 if (need_SIB) { 1957 this->byte(sib(m.scale, m.index&7, m.base&7)); 1958 } 1959 this->bytes(&m.disp, imm_bytes(mod(m.disp))); 1960 } 1961 } 1962 op(int opcode,int opcode_ext,Operand dst,int imm)1963 void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) { 1964 opcode |= 0b1000'0000; // top bit set for instructions with any immediate 1965 1966 int imm_bytes = 4; 1967 if (SkTFitsIn<int8_t>(imm)) { 1968 imm_bytes = 1; 1969 opcode |= 0b0000'0010; // second bit set for 8-bit immediate, else 32-bit. 1970 } 1971 1972 this->op(opcode, dst, (GP64)opcode_ext); 1973 this->bytes(&imm, imm_bytes); 1974 } 1975 add(Operand dst,int imm)1976 void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); } sub(Operand dst,int imm)1977 void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); } cmp(Operand dst,int imm)1978 void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); } 1979 1980 // These don't work quite like the other instructions with immediates: 1981 // these immediates are always fixed size at 4 bytes or 1 byte. mov(Operand dst,int imm)1982 void Assembler::mov(Operand dst, int imm) { 1983 this->op(0xC7,dst,(GP64)0b000); 1984 this->word(imm); 1985 } movb(Operand dst,int imm)1986 void Assembler::movb(Operand dst, int imm) { 1987 this->op(0xC6,dst,(GP64)0b000); 1988 this->byte(imm); 1989 } 1990 add(Operand dst,GP64 x)1991 void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); } sub(Operand dst,GP64 x)1992 void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); } cmp(Operand dst,GP64 x)1993 void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); } mov(Operand dst,GP64 x)1994 void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); } movb(Operand dst,GP64 x)1995 void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); } 1996 add(GP64 dst,Operand x)1997 void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); } sub(GP64 dst,Operand x)1998 void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); } cmp(GP64 dst,Operand x)1999 void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); } mov(GP64 dst,Operand x)2000 void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); } movb(GP64 dst,Operand x)2001 void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); } 2002 movzbq(GP64 dst,Operand x)2003 void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); } movzwq(GP64 dst,Operand x)2004 void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); } 2005 vpaddd(Ymm dst,Ymm x,Operand y)2006 void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfe, dst,x,y); } vpsubd(Ymm dst,Ymm x,Operand y)2007 void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfa, dst,x,y); } vpmulld(Ymm dst,Ymm x,Operand y)2008 void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); } 2009 vpaddw(Ymm dst,Ymm x,Operand y)2010 void Assembler::vpaddw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfd, dst,x,y); } vpsubw(Ymm dst,Ymm x,Operand y)2011 void Assembler::vpsubw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xf9, dst,x,y); } vpmullw(Ymm dst,Ymm x,Operand y)2012 void Assembler::vpmullw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xd5, dst,x,y); } vpavgw(Ymm dst,Ymm x,Operand y)2013 void Assembler::vpavgw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xe3, dst,x,y); } vpmulhrsw(Ymm dst,Ymm x,Operand y)2014 void 
Assembler::vpmulhrsw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x0b, dst,x,y); } vpminsw(Ymm dst,Ymm x,Operand y)2015 void Assembler::vpminsw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xea, dst,x,y); } vpmaxsw(Ymm dst,Ymm x,Operand y)2016 void Assembler::vpmaxsw (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xee, dst,x,y); } vpminuw(Ymm dst,Ymm x,Operand y)2017 void Assembler::vpminuw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3a, dst,x,y); } vpmaxuw(Ymm dst,Ymm x,Operand y)2018 void Assembler::vpmaxuw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3e, dst,x,y); } 2019 vpabsw(Ymm dst,Operand x)2020 void Assembler::vpabsw(Ymm dst, Operand x) { this->op(0x66,0x380f,0x1d, dst,x); } 2021 2022 vpand(Ymm dst,Ymm x,Operand y)2023 void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); } vpor(Ymm dst,Ymm x,Operand y)2024 void Assembler::vpor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); } vpxor(Ymm dst,Ymm x,Operand y)2025 void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); } vpandn(Ymm dst,Ymm x,Operand y)2026 void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); } 2027 vaddps(Ymm dst,Ymm x,Operand y)2028 void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); } vsubps(Ymm dst,Ymm x,Operand y)2029 void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); } vmulps(Ymm dst,Ymm x,Operand y)2030 void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); } vdivps(Ymm dst,Ymm x,Operand y)2031 void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); } vminps(Ymm dst,Ymm x,Operand y)2032 void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); } vmaxps(Ymm dst,Ymm x,Operand y)2033 void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); } 2034 vfmadd132ps(Ymm dst,Ymm x,Operand y)2035 void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); } vfmadd213ps(Ymm dst,Ymm x,Operand y)2036 void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); } vfmadd231ps(Ymm dst,Ymm x,Operand y)2037 void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); } 2038 vfmsub132ps(Ymm dst,Ymm x,Operand y)2039 void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); } vfmsub213ps(Ymm dst,Ymm x,Operand y)2040 void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); } vfmsub231ps(Ymm dst,Ymm x,Operand y)2041 void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); } 2042 vfnmadd132ps(Ymm dst,Ymm x,Operand y)2043 void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); } vfnmadd213ps(Ymm dst,Ymm x,Operand y)2044 void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); } vfnmadd231ps(Ymm dst,Ymm x,Operand y)2045 void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); } 2046 vpackusdw(Ymm dst,Ymm x,Operand y)2047 void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); } vpackuswb(Ymm dst,Ymm x,Operand y)2048 void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0x67, dst,x,y); } 2049 vpunpckldq(Ymm dst,Ymm x,Operand y)2050 void 
Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); } vpunpckhdq(Ymm dst,Ymm x,Operand y)2051 void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); } 2052 vpcmpeqd(Ymm dst,Ymm x,Operand y)2053 void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); } vpcmpeqw(Ymm dst,Ymm x,Operand y)2054 void Assembler::vpcmpeqw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x75, dst,x,y); } vpcmpgtd(Ymm dst,Ymm x,Operand y)2055 void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); } vpcmpgtw(Ymm dst,Ymm x,Operand y)2056 void Assembler::vpcmpgtw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x65, dst,x,y); } 2057 2058 imm_byte_after_operand(const Operand & operand,int imm)2059 void Assembler::imm_byte_after_operand(const Operand& operand, int imm) { 2060 // When we've embedded a label displacement in the middle of an instruction, 2061 // we need to tweak it a little so that the resolved displacement starts 2062 // from the end of the instruction and not the end of the displacement. 2063 if (operand.kind == Operand::LABEL && fCode) { 2064 int disp; 2065 memcpy(&disp, fCode+fSize-4, 4); 2066 disp--; 2067 memcpy(fCode+fSize-4, &disp, 4); 2068 } 2069 this->byte(imm); 2070 } 2071 vcmpps(Ymm dst,Ymm x,Operand y,int imm)2072 void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) { 2073 this->op(0,0x0f,0xc2, dst,x,y); 2074 this->imm_byte_after_operand(y, imm); 2075 } 2076 vpblendvb(Ymm dst,Ymm x,Operand y,Ymm z)2077 void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) { 2078 this->op(0x66,0x3a0f,0x4c, dst,x,y); 2079 this->imm_byte_after_operand(y, z << 4); 2080 } 2081 2082 // Shift instructions encode their opcode extension as "dst", dst as x, and x as y. vpslld(Ymm dst,Ymm x,int imm)2083 void Assembler::vpslld(Ymm dst, Ymm x, int imm) { 2084 this->op(0x66,0x0f,0x72,(Ymm)6, dst,x); 2085 this->byte(imm); 2086 } vpsrld(Ymm dst,Ymm x,int imm)2087 void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { 2088 this->op(0x66,0x0f,0x72,(Ymm)2, dst,x); 2089 this->byte(imm); 2090 } vpsrad(Ymm dst,Ymm x,int imm)2091 void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { 2092 this->op(0x66,0x0f,0x72,(Ymm)4, dst,x); 2093 this->byte(imm); 2094 } vpsllw(Ymm dst,Ymm x,int imm)2095 void Assembler::vpsllw(Ymm dst, Ymm x, int imm) { 2096 this->op(0x66,0x0f,0x71,(Ymm)6, dst,x); 2097 this->byte(imm); 2098 } vpsrlw(Ymm dst,Ymm x,int imm)2099 void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { 2100 this->op(0x66,0x0f,0x71,(Ymm)2, dst,x); 2101 this->byte(imm); 2102 } vpsraw(Ymm dst,Ymm x,int imm)2103 void Assembler::vpsraw(Ymm dst, Ymm x, int imm) { 2104 this->op(0x66,0x0f,0x71,(Ymm)4, dst,x); 2105 this->byte(imm); 2106 } 2107 vpermq(Ymm dst,Operand x,int imm)2108 void Assembler::vpermq(Ymm dst, Operand x, int imm) { 2109 // A bit unusual among the instructions we use, this is 64-bit operation, so we set W. 
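    // With W set, the imm8 below indexes 64-bit lanes: e.g. imm = 0b00'01'10'11
    // (0x1b) writes src qwords 3,2,1,0 to dst qwords 0,1,2,3, reversing them.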
2110 this->op(0x66,0x3a0f,0x00, dst,x,W1); 2111 this->imm_byte_after_operand(x, imm); 2112 } 2113 vperm2f128(Ymm dst,Ymm x,Operand y,int imm)2114 void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) { 2115 this->op(0x66,0x3a0f,0x06, dst,x,y); 2116 this->imm_byte_after_operand(y, imm); 2117 } 2118 vpermps(Ymm dst,Ymm ix,Operand src)2119 void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) { 2120 this->op(0x66,0x380f,0x16, dst,ix,src); 2121 } 2122 vroundps(Ymm dst,Operand x,Rounding imm)2123 void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) { 2124 this->op(0x66,0x3a0f,0x08, dst,x); 2125 this->imm_byte_after_operand(x, imm); 2126 } 2127 vmovdqa(Ymm dst,Operand src)2128 void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); } vmovups(Ymm dst,Operand src)2129 void Assembler::vmovups(Ymm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); } vmovups(Xmm dst,Operand src)2130 void Assembler::vmovups(Xmm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); } vmovups(Operand dst,Ymm src)2131 void Assembler::vmovups(Operand dst, Ymm src) { this->op( 0,0x0f,0x11, src,dst); } vmovups(Operand dst,Xmm src)2132 void Assembler::vmovups(Operand dst, Xmm src) { this->op( 0,0x0f,0x11, src,dst); } 2133 vcvtdq2ps(Ymm dst,Operand x)2134 void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op( 0,0x0f,0x5b, dst,x); } vcvttps2dq(Ymm dst,Operand x)2135 void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); } vcvtps2dq(Ymm dst,Operand x)2136 void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); } vsqrtps(Ymm dst,Operand x)2137 void Assembler::vsqrtps (Ymm dst, Operand x) { this->op( 0,0x0f,0x51, dst,x); } 2138 vcvtps2ph(Operand dst,Ymm x,Rounding imm)2139 void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) { 2140 this->op(0x66,0x3a0f,0x1d, x,dst); 2141 this->imm_byte_after_operand(dst, imm); 2142 } vcvtph2ps(Ymm dst,Operand x)2143 void Assembler::vcvtph2ps(Ymm dst, Operand x) { 2144 this->op(0x66,0x380f,0x13, dst,x); 2145 } 2146 disp19(Label * l)2147 int Assembler::disp19(Label* l) { 2148 SkASSERT(l->kind == Label::NotYetSet || 2149 l->kind == Label::ARMDisp19); 2150 int here = (int)this->size(); 2151 l->kind = Label::ARMDisp19; 2152 l->references.push_back(here); 2153 // ARM 19-bit instruction count, from the beginning of this instruction. 2154 return (l->offset - here) / 4; 2155 } 2156 disp32(Label * l)2157 int Assembler::disp32(Label* l) { 2158 SkASSERT(l->kind == Label::NotYetSet || 2159 l->kind == Label::X86Disp32); 2160 int here = (int)this->size(); 2161 l->kind = Label::X86Disp32; 2162 l->references.push_back(here); 2163 // x86 32-bit byte count, from the end of this instruction. 2164 return l->offset - (here + 4); 2165 } 2166 op(int prefix,int map,int opcode,int dst,int x,Operand y,W w,L l)2167 void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) { 2168 switch (y.kind) { 2169 case Operand::REG: { 2170 VEX v = vex(w, dst>>3, 0, y.reg>>3, 2171 map, x, l, prefix); 2172 this->bytes(v.bytes, v.len); 2173 this->byte(opcode); 2174 this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7)); 2175 } return; 2176 2177 case Operand::MEM: { 2178 // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows; 2179 // without an SIB byte, that's where the base register would usually go. 2180 // This means we have to use an SIB byte if we want to use rsp as a base register. 
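    // Worked example: [rsp + 8] encodes as mod = OneByteImm with rm = rsp to
    // request the SIB byte, an SIB of base = rsp, index = rsp (meaning "none"),
    // and then the single displacement byte 0x08.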
2181 const Mem& m = y.mem; 2182 const bool need_SIB = m.base == rsp 2183 || m.index != rsp; 2184 2185 VEX v = vex(w, dst>>3, m.index>>3, m.base>>3, 2186 map, x, l, prefix); 2187 this->bytes(v.bytes, v.len); 2188 this->byte(opcode); 2189 this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7)); 2190 if (need_SIB) { 2191 this->byte(sib(m.scale, m.index&7, m.base&7)); 2192 } 2193 this->bytes(&m.disp, imm_bytes(mod(m.disp))); 2194 } return; 2195 2196 case Operand::LABEL: { 2197 // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13. 2198 const int rip = rbp; 2199 2200 VEX v = vex(w, dst>>3, 0, rip>>3, 2201 map, x, l, prefix); 2202 this->bytes(v.bytes, v.len); 2203 this->byte(opcode); 2204 this->byte(mod_rm(Mod::Indirect, dst&7, rip&7)); 2205 this->word(this->disp32(y.label)); 2206 } return; 2207 } 2208 } 2209 vpshufb(Ymm dst,Ymm x,Operand y)2210 void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); } 2211 vptest(Ymm x,Operand y)2212 void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); } 2213 vbroadcastss(Ymm dst,Operand y)2214 void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); } 2215 jump(uint8_t condition,Label * l)2216 void Assembler::jump(uint8_t condition, Label* l) { 2217 // These conditional jumps can be either 2 bytes (short) or 6 bytes (near): 2218 // 7? one-byte-disp 2219 // 0F 8? four-byte-disp 2220 // We always use the near displacement to make updating labels simpler (no resizing). 2221 this->byte(0x0f); 2222 this->byte(condition); 2223 this->word(this->disp32(l)); 2224 } je(Label * l)2225 void Assembler::je (Label* l) { this->jump(0x84, l); } jne(Label * l)2226 void Assembler::jne(Label* l) { this->jump(0x85, l); } jl(Label * l)2227 void Assembler::jl (Label* l) { this->jump(0x8c, l); } jc(Label * l)2228 void Assembler::jc (Label* l) { this->jump(0x82, l); } 2229 jmp(Label * l)2230 void Assembler::jmp(Label* l) { 2231 // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit. 
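    // E.g. jumping 16 bytes past the end of this 5-byte instruction assembles
    // to E9 10 00 00 00, the disp32 stored little-endian.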
2232 this->byte(0xe9); 2233 this->word(this->disp32(l)); 2234 } 2235 vpmovzxwd(Ymm dst,Operand src)2236 void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); } vpmovzxbd(Ymm dst,Operand src)2237 void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); } 2238 vmovq(Operand dst,Xmm src)2239 void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); } 2240 vmovd(Operand dst,Xmm src)2241 void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); } vmovd(Xmm dst,Operand src)2242 void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); } 2243 vpinsrd(Xmm dst,Xmm src,Operand y,int imm)2244 void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) { 2245 this->op(0x66,0x3a0f,0x22, dst,src,y); 2246 this->imm_byte_after_operand(y, imm); 2247 } vpinsrw(Xmm dst,Xmm src,Operand y,int imm)2248 void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) { 2249 this->op(0x66,0x0f,0xc4, dst,src,y); 2250 this->imm_byte_after_operand(y, imm); 2251 } vpinsrb(Xmm dst,Xmm src,Operand y,int imm)2252 void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) { 2253 this->op(0x66,0x3a0f,0x20, dst,src,y); 2254 this->imm_byte_after_operand(y, imm); 2255 } 2256 vextracti128(Operand dst,Ymm src,int imm)2257 void Assembler::vextracti128(Operand dst, Ymm src, int imm) { 2258 this->op(0x66,0x3a0f,0x39, src,dst); 2259 SkASSERT(dst.kind != Operand::LABEL); 2260 this->byte(imm); 2261 } vpextrd(Operand dst,Xmm src,int imm)2262 void Assembler::vpextrd(Operand dst, Xmm src, int imm) { 2263 this->op(0x66,0x3a0f,0x16, src,dst); 2264 SkASSERT(dst.kind != Operand::LABEL); 2265 this->byte(imm); 2266 } vpextrw(Operand dst,Xmm src,int imm)2267 void Assembler::vpextrw(Operand dst, Xmm src, int imm) { 2268 this->op(0x66,0x3a0f,0x15, src,dst); 2269 SkASSERT(dst.kind != Operand::LABEL); 2270 this->byte(imm); 2271 } vpextrb(Operand dst,Xmm src,int imm)2272 void Assembler::vpextrb(Operand dst, Xmm src, int imm) { 2273 this->op(0x66,0x3a0f,0x14, src,dst); 2274 SkASSERT(dst.kind != Operand::LABEL); 2275 this->byte(imm); 2276 } 2277 vgatherdps(Ymm dst,Scale scale,Ymm ix,GP64 base,Ymm mask)2278 void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) { 2279 // Unlike most instructions, no aliasing is permitted here. 
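    // (The CPU zeroes lanes of the mask register as each element lands, and
    //  the instruction is undefined (#UD) unless dst, ix, and mask are all
    //  distinct registers, hence the asserts below.)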
2280 SkASSERT(dst != ix); 2281 SkASSERT(dst != mask); 2282 SkASSERT(mask != ix); 2283 2284 int prefix = 0x66, 2285 map = 0x380f, 2286 opcode = 0x92; 2287 VEX v = vex(0, dst>>3, ix>>3, base>>3, 2288 map, mask, /*ymm?*/1, prefix); 2289 this->bytes(v.bytes, v.len); 2290 this->byte(opcode); 2291 this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/)); 2292 this->byte(sib(scale, ix&7, base&7)); 2293 } 2294 2295 // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf 2296 operator ""_mask(unsigned long long bits)2297 static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; } 2298 op(uint32_t hi,V m,uint32_t lo,V n,V d)2299 void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) { 2300 this->word( (hi & 11_mask) << 21 2301 | (m & 5_mask) << 16 2302 | (lo & 6_mask) << 10 2303 | (n & 5_mask) << 5 2304 | (d & 5_mask) << 0); 2305 } op(uint32_t op22,V n,V d,int imm)2306 void Assembler::op(uint32_t op22, V n, V d, int imm) { 2307 this->word( (op22 & 22_mask) << 10 2308 | imm // size and location depends on the instruction 2309 | (n & 5_mask) << 5 2310 | (d & 5_mask) << 0); 2311 } 2312 and16b(V d,V n,V m)2313 void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); } orr16b(V d,V n,V m)2314 void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); } eor16b(V d,V n,V m)2315 void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); } bic16b(V d,V n,V m)2316 void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); } bsl16b(V d,V n,V m)2317 void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); } not16b(V d,V n)2318 void Assembler::not16b(V d, V n) { this->op(0b0'1'1'01110'00'10000'00101'10, n, d); } 2319 add4s(V d,V n,V m)2320 void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); } sub4s(V d,V n,V m)2321 void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); } mul4s(V d,V n,V m)2322 void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); } 2323 cmeq4s(V d,V n,V m)2324 void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); } cmgt4s(V d,V n,V m)2325 void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); } 2326 sub8h(V d,V n,V m)2327 void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); } mul8h(V d,V n,V m)2328 void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); } 2329 fadd4s(V d,V n,V m)2330 void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); } fsub4s(V d,V n,V m)2331 void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); } fmul4s(V d,V n,V m)2332 void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); } fdiv4s(V d,V n,V m)2333 void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); } fmin4s(V d,V n,V m)2334 void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); } fmax4s(V d,V n,V m)2335 void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); } 2336 fneg4s(V d,V n)2337 void Assembler::fneg4s (V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n,d); } fsqrt4s(V d,V n)2338 void Assembler::fsqrt4s(V d, V n) { 
this->op(0b0'1'1'01110'1'0'10000'11111'10, n,d); } 2339 fcmeq4s(V d,V n,V m)2340 void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); } fcmgt4s(V d,V n,V m)2341 void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); } fcmge4s(V d,V n,V m)2342 void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); } 2343 fmla4s(V d,V n,V m)2344 void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); } fmls4s(V d,V n,V m)2345 void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); } 2346 tbl(V d,V n,V m)2347 void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); } 2348 uzp14s(V d,V n,V m)2349 void Assembler::uzp14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'01'10, n, d); } uzp24s(V d,V n,V m)2350 void Assembler::uzp24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'01'10, n, d); } zip14s(V d,V n,V m)2351 void Assembler::zip14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'11'10, n, d); } zip24s(V d,V n,V m)2352 void Assembler::zip24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'11'10, n, d); } 2353 sli4s(V d,V n,int imm5)2354 void Assembler::sli4s(V d, V n, int imm5) { 2355 this->op(0b0'1'1'011110'0100'000'01010'1, n, d, ( imm5 & 5_mask)<<16); 2356 } shl4s(V d,V n,int imm5)2357 void Assembler::shl4s(V d, V n, int imm5) { 2358 this->op(0b0'1'0'011110'0100'000'01010'1, n, d, ( imm5 & 5_mask)<<16); 2359 } sshr4s(V d,V n,int imm5)2360 void Assembler::sshr4s(V d, V n, int imm5) { 2361 this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16); 2362 } ushr4s(V d,V n,int imm5)2363 void Assembler::ushr4s(V d, V n, int imm5) { 2364 this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16); 2365 } ushr8h(V d,V n,int imm4)2366 void Assembler::ushr8h(V d, V n, int imm4) { 2367 this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & 4_mask)<<16); 2368 } 2369 scvtf4s(V d,V n)2370 void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); } fcvtzs4s(V d,V n)2371 void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); } fcvtns4s(V d,V n)2372 void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); } frintp4s(V d,V n)2373 void Assembler::frintp4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1100'0'10, n,d); } frintm4s(V d,V n)2374 void Assembler::frintm4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1100'1'10, n,d); } 2375 fcvtn(V d,V n)2376 void Assembler::fcvtn(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10110'10, n,d); } fcvtl(V d,V n)2377 void Assembler::fcvtl(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10111'10, n,d); } 2378 xtns2h(V d,V n)2379 void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); } xtnh2b(V d,V n)2380 void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); } 2381 uxtlb2h(V d,V n)2382 void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); } uxtlh2s(V d,V n)2383 void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); } 2384 uminv4s(V d,V n)2385 void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); } 2386 brk(int imm16)2387 void Assembler::brk(int imm16) { 2388 this->op(0b11010100'001'00000000000, (imm16 & 16_mask) << 5); 2389 } 2390 ret(X n)2391 void Assembler::ret(X n) { 
this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); } 2392 add(X d,X n,int imm12)2393 void Assembler::add(X d, X n, int imm12) { 2394 this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10); 2395 } sub(X d,X n,int imm12)2396 void Assembler::sub(X d, X n, int imm12) { 2397 this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10); 2398 } subs(X d,X n,int imm12)2399 void Assembler::subs(X d, X n, int imm12) { 2400 this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10); 2401 } 2402 add(X d,X n,X m,Shift shift,int imm6)2403 void Assembler::add(X d, X n, X m, Shift shift, int imm6) { 2404 SkASSERT(shift != ROR); 2405 2406 int imm = (imm6 & 6_mask) << 0 2407 | (m & 5_mask) << 6 2408 | (0 & 1_mask) << 11 2409 | (shift & 2_mask) << 12; 2410 this->op(0b1'0'0'01011'00'0'00000'000000, n,d, imm << 10); 2411 } 2412 b(Condition cond,Label * l)2413 void Assembler::b(Condition cond, Label* l) { 2414 const int imm19 = this->disp19(l); 2415 this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & 19_mask) << 5); 2416 } cbz(X t,Label * l)2417 void Assembler::cbz(X t, Label* l) { 2418 const int imm19 = this->disp19(l); 2419 this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & 19_mask) << 5); 2420 } cbnz(X t,Label * l)2421 void Assembler::cbnz(X t, Label* l) { 2422 const int imm19 = this->disp19(l); 2423 this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & 19_mask) << 5); 2424 } 2425 ldrd(X dst,X src,int imm12)2426 void Assembler::ldrd(X dst, X src, int imm12) { 2427 this->op(0b11'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2428 } ldrs(X dst,X src,int imm12)2429 void Assembler::ldrs(X dst, X src, int imm12) { 2430 this->op(0b10'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2431 } ldrh(X dst,X src,int imm12)2432 void Assembler::ldrh(X dst, X src, int imm12) { 2433 this->op(0b01'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2434 } ldrb(X dst,X src,int imm12)2435 void Assembler::ldrb(X dst, X src, int imm12) { 2436 this->op(0b00'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2437 } 2438 ldrq(V dst,X src,int imm12)2439 void Assembler::ldrq(V dst, X src, int imm12) { 2440 this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & 12_mask) << 10); 2441 } ldrd(V dst,X src,int imm12)2442 void Assembler::ldrd(V dst, X src, int imm12) { 2443 this->op(0b11'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2444 } ldrs(V dst,X src,int imm12)2445 void Assembler::ldrs(V dst, X src, int imm12) { 2446 this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2447 } ldrh(V dst,X src,int imm12)2448 void Assembler::ldrh(V dst, X src, int imm12) { 2449 this->op(0b01'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2450 } ldrb(V dst,X src,int imm12)2451 void Assembler::ldrb(V dst, X src, int imm12) { 2452 this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); 2453 } 2454 strs(X src,X dst,int imm12)2455 void Assembler::strs(X src, X dst, int imm12) { 2456 this->op(0b10'111'0'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2457 } 2458 strq(V src,X dst,int imm12)2459 void Assembler::strq(V src, X dst, int imm12) { 2460 this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & 12_mask) << 10); 2461 } strd(V src,X dst,int imm12)2462 void Assembler::strd(V src, X dst, int imm12) { 2463 this->op(0b11'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2464 } strs(V src,X dst,int imm12)2465 void Assembler::strs(V src, X dst, int imm12) { 2466 
this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2467 } strh(V src,X dst,int imm12)2468 void Assembler::strh(V src, X dst, int imm12) { 2469 this->op(0b01'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2470 } strb(V src,X dst,int imm12)2471 void Assembler::strb(V src, X dst, int imm12) { 2472 this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); 2473 } 2474 movs(X dst,V src,int lane)2475 void Assembler::movs(X dst, V src, int lane) { 2476 int imm5 = (lane << 3) | 0b100; 2477 this->op(0b0'0'0'01110000'00000'0'01'1'1'1, src, dst, (imm5 & 5_mask) << 16); 2478 } inss(V dst,X src,int lane)2479 void Assembler::inss(V dst, X src, int lane) { 2480 int imm5 = (lane << 3) | 0b100; 2481 this->op(0b0'1'0'01110000'00000'0'0011'1, src, dst, (imm5 & 5_mask) << 16); 2482 } 2483 2484 ldrq(V dst,Label * l)2485 void Assembler::ldrq(V dst, Label* l) { 2486 const int imm19 = this->disp19(l); 2487 this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5); 2488 } 2489 dup4s(V dst,X src)2490 void Assembler::dup4s(V dst, X src) { 2491 this->op(0b0'1'0'01110000'00100'0'0001'1, src, dst); 2492 } 2493 ld1r4s(V dst,X src)2494 void Assembler::ld1r4s(V dst, X src) { 2495 this->op(0b0'1'0011010'1'0'00000'110'0'10, src, dst); 2496 } ld1r8h(V dst,X src)2497 void Assembler::ld1r8h(V dst, X src) { 2498 this->op(0b0'1'0011010'1'0'00000'110'0'01, src, dst); 2499 } ld1r16b(V dst,X src)2500 void Assembler::ld1r16b(V dst, X src) { 2501 this->op(0b0'1'0011010'1'0'00000'110'0'00, src, dst); 2502 } 2503 ld24s(V dst,X src)2504 void Assembler::ld24s(V dst, X src) { this->op(0b0'1'0011000'1'000000'1000'10, src, dst); } ld44s(V dst,X src)2505 void Assembler::ld44s(V dst, X src) { this->op(0b0'1'0011000'1'000000'0000'10, src, dst); } st24s(V src,X dst)2506 void Assembler::st24s(V src, X dst) { this->op(0b0'1'0011000'0'000000'1000'10, dst, src); } st44s(V src,X dst)2507 void Assembler::st44s(V src, X dst) { this->op(0b0'1'0011000'0'000000'0000'10, dst, src); } 2508 ld24s(V dst,X src,int lane)2509 void Assembler::ld24s(V dst, X src, int lane) { 2510 int Q = (lane & 2)>>1, 2511 S = (lane & 1); 2512 /* Q S */ 2513 this->op(0b0'0'0011010'1'1'00000'100'0'00, src, dst, (Q<<30)|(S<<12)); 2514 } ld44s(V dst,X src,int lane)2515 void Assembler::ld44s(V dst, X src, int lane) { 2516 int Q = (lane & 2)>>1, 2517 S = (lane & 1); 2518 this->op(0b0'0'0011010'1'1'00000'101'0'00, src, dst, (Q<<30)|(S<<12)); 2519 } 2520 label(Label * l)2521 void Assembler::label(Label* l) { 2522 if (fCode) { 2523 // The instructions all currently point to l->offset. 2524 // We'll want to add a delta to point them to here. 2525 int here = (int)this->size(); 2526 int delta = here - l->offset; 2527 l->offset = here; 2528 2529 if (l->kind == Label::ARMDisp19) { 2530 for (int ref : l->references) { 2531 // ref points to a 32-bit instruction with 19-bit displacement in instructions. 2532 uint32_t inst; 2533 memcpy(&inst, fCode + ref, 4); 2534 2535 // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ] 2536 int disp = (int)(inst << 8) >> 13; 2537 2538 disp += delta/4; // delta is in bytes, we want instructions. 2539 2540 // Put it all back together, preserving the high 8 bits and low 5. 2541 inst = ((disp << 5) & (19_mask << 5)) 2542 | ((inst ) & ~(19_mask << 5)); 2543 memcpy(fCode + ref, &inst, 4); 2544 } 2545 } 2546 2547 if (l->kind == Label::X86Disp32) { 2548 for (int ref : l->references) { 2549 // ref points to a 32-bit displacement in bytes. 
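    // E.g. if this label moves from offset 0x40 to 0x60, delta = 0x20 is
    // simply added to each stored displacement; unlike the ARM case above,
    // no other instruction bits need re-packing.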
2550 int disp; 2551 memcpy(&disp, fCode + ref, 4); 2552 2553 disp += delta; 2554 2555 memcpy(fCode + ref, &disp, 4); 2556 } 2557 } 2558 } 2559 } 2560 eval(int n,void * args[]) const2561 void Program::eval(int n, void* args[]) const { 2562 #define SKVM_JIT_STATS 0 2563 #if SKVM_JIT_STATS 2564 static std::atomic<int64_t> calls{0}, jits{0}, 2565 pixels{0}, fast{0}; 2566 pixels += n; 2567 if (0 == calls++) { 2568 atexit([]{ 2569 int64_t num = jits .load(), 2570 den = calls.load(); 2571 SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den); 2572 num = fast .load(); 2573 den = pixels.load(); 2574 SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den); 2575 }); 2576 } 2577 #endif 2578 2579 #if !defined(SKVM_JIT_BUT_IGNORE_IT) 2580 const void* jit_entry = fImpl->jit_entry.load(); 2581 // jit_entry may be null either simply because we can't JIT, or when using LLVM 2582 // if the work represented by fImpl->llvm_compiling hasn't finished yet. 2583 // 2584 // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it 2585 // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off, 2586 // due to timing or program caching. 2587 if (jit_entry != nullptr && gSkVMAllowJIT) { 2588 #if SKVM_JIT_STATS 2589 jits++; 2590 fast += n; 2591 #endif 2592 void** a = args; 2593 switch (fImpl->strides.size()) { 2594 case 0: return ((void(*)(int ))jit_entry)(n ); 2595 case 1: return ((void(*)(int,void* ))jit_entry)(n,a[0] ); 2596 case 2: return ((void(*)(int,void*,void* ))jit_entry)(n,a[0],a[1] ); 2597 case 3: return ((void(*)(int,void*,void*,void* ))jit_entry)(n,a[0],a[1],a[2]); 2598 case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry) 2599 (n,a[0],a[1],a[2],a[3]); 2600 case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry) 2601 (n,a[0],a[1],a[2],a[3],a[4]); 2602 case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry) 2603 (n,a[0],a[1],a[2],a[3],a[4],a[5]); 2604 case 7: return ((void(*)(int,void*,void*,void*,void*,void*,void*,void*))jit_entry) 2605 (n,a[0],a[1],a[2],a[3],a[4],a[5],a[6]); 2606 default: break; //SkASSERT(fImpl->strides.size() <= 7); 2607 } 2608 } 2609 #endif 2610 2611 // So we'll sometimes use the interpreter here even if later calls will use the JIT. 2612 SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(), 2613 this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(), 2614 n, args); 2615 } 2616 2617 #if defined(SKVM_LLVM) 2618 // -- SKVM_LLVM -------------------------------------------------------------------------------- setupLLVM(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)2619 void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions, 2620 const char* debug_name) { 2621 auto ctx = std::make_unique<llvm::LLVMContext>(); 2622 2623 auto mod = std::make_unique<llvm::Module>("", *ctx); 2624 // All the scary bare pointers from here on are owned by ctx or mod, I think. 2625 2626 // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines. 2627 const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 
8 : 4; 2628 2629 llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(), 2630 *i32 = llvm::Type::getInt32Ty(*ctx); 2631 2632 std::vector<llvm::Type*> arg_types = { i32 }; 2633 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2634 arg_types.push_back(ptr); 2635 } 2636 2637 llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx), 2638 arg_types, /*vararg?=*/false); 2639 llvm::Function* fn 2640 = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod); 2641 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2642 fn->addParamAttr(i+1, llvm::Attribute::NoAlias); 2643 } 2644 2645 llvm::BasicBlock *enter = llvm::BasicBlock::Create(*ctx, "enter" , fn), 2646 *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn), 2647 *testK = llvm::BasicBlock::Create(*ctx, "testK" , fn), 2648 *loopK = llvm::BasicBlock::Create(*ctx, "loopK" , fn), 2649 *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn), 2650 *test1 = llvm::BasicBlock::Create(*ctx, "test1" , fn), 2651 *loop1 = llvm::BasicBlock::Create(*ctx, "loop1" , fn), 2652 *leave = llvm::BasicBlock::Create(*ctx, "leave" , fn); 2653 2654 using IRBuilder = llvm::IRBuilder<>; 2655 2656 llvm::PHINode* n; 2657 std::vector<llvm::PHINode*> args; 2658 std::vector<llvm::Value*> vals(instructions.size()); 2659 2660 auto emit = [&](size_t i, bool scalar, IRBuilder* b) { 2661 auto [op, x,y,z,w, immA,immB,immC, death,can_hoist] = instructions[i]; 2662 2663 llvm::Type *i1 = llvm::Type::getInt1Ty (*ctx), 2664 *i8 = llvm::Type::getInt8Ty (*ctx), 2665 *i16 = llvm::Type::getInt16Ty(*ctx), 2666 *f32 = llvm::Type::getFloatTy(*ctx), 2667 *I1 = scalar ? i1 : llvm::VectorType::get(i1 , K, false ), 2668 *I8 = scalar ? i8 : llvm::VectorType::get(i8 , K, false ), 2669 *I16 = scalar ? i16 : llvm::VectorType::get(i16, K, false ), 2670 *I32 = scalar ? i32 : llvm::VectorType::get(i32, K, false ), 2671 *F32 = scalar ? f32 : llvm::VectorType::get(f32, K, false ); 2672 2673 auto I = [&](llvm::Value* v) { return b->CreateBitCast(v, I32 ); }; 2674 auto F = [&](llvm::Value* v) { return b->CreateBitCast(v, F32 ); }; 2675 2676 auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); }; 2677 2678 llvm::Type* vt = nullptr; 2679 switch (llvm::Type* t = nullptr; op) { 2680 default: 2681 SkDebugf("can't llvm %s (%d)\n", name(op), op); 2682 return false; 2683 2684 case Op::assert_true: /*TODO*/ break; 2685 2686 case Op::trace_line: 2687 case Op::trace_var: 2688 case Op::trace_call: 2689 /* Only supported in the interpreter. 
*/ 2690 break; 2691 2692 case Op::index: 2693 if (I32->isVectorTy()) { 2694 std::vector<llvm::Constant*> iota(K); 2695 for (int j = 0; j < K; j++) { 2696 iota[j] = b->getInt32(j); 2697 } 2698 vals[i] = b->CreateSub(b->CreateVectorSplat(K, n), 2699 llvm::ConstantVector::get(iota)); 2700 } else { 2701 vals[i] = n; 2702 } break; 2703 2704 case Op::load8: t = I8 ; goto load; 2705 case Op::load16: t = I16; goto load; 2706 case Op::load32: t = I32; goto load; 2707 load: { 2708 llvm::Value* ptr = b->CreateBitCast(args[immA], t->getPointerTo()); 2709 vals[i] = b->CreateZExt( 2710 b->CreateAlignedLoad(t, ptr, llvm::MaybeAlign{1}), I32); 2711 } break; 2712 2713 2714 case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immA); break; 2715 2716 case Op::uniform32: { 2717 llvm::Value* ptr = b->CreateBitCast( 2718 b->CreateConstInBoundsGEP1_32(i8, args[immA], immB), 2719 i32->getPointerTo()); 2720 llvm::Value* val = b->CreateZExt( 2721 b->CreateAlignedLoad(i32, ptr, llvm::MaybeAlign{1}), i32); 2722 vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val) 2723 : val; 2724 } break; 2725 2726 case Op::gather8: t = i8 ; vt = I8; goto gather; 2727 case Op::gather16: t = i16; vt = I16; goto gather; 2728 case Op::gather32: t = i32; vt = I32; goto gather; 2729 gather: { 2730 // Our gather base pointer is immB bytes off of uniform immA. 2731 llvm::Value* base = 2732 b->CreateLoad(b->CreateBitCast( 2733 b->CreateConstInBoundsGEP1_32(i8, args[immA],immB), 2734 t->getPointerTo()->getPointerTo())); 2735 2736 llvm::Value* ptr = b->CreateInBoundsGEP(t, base, vals[x]); 2737 llvm::Value* gathered; 2738 if (ptr->getType()->isVectorTy()) { 2739 gathered = b->CreateMaskedGather( 2740 vt, 2741 ptr, 2742 llvm::Align{1}); 2743 } else { 2744 gathered = b->CreateAlignedLoad(vt, ptr, llvm::MaybeAlign{1}); 2745 } 2746 vals[i] = b->CreateZExt(gathered, I32); 2747 } break; 2748 2749 case Op::store8: t = I8 ; goto store; 2750 case Op::store16: t = I16; goto store; 2751 case Op::store32: t = I32; goto store; 2752 store: { 2753 llvm::Value* val = b->CreateTrunc(vals[x], t); 2754 llvm::Value* ptr = b->CreateBitCast(args[immA], 2755 val->getType()->getPointerTo()); 2756 vals[i] = b->CreateAlignedStore(val, ptr, llvm::MaybeAlign{1}); 2757 } break; 2758 2759 case Op::bit_and: vals[i] = b->CreateAnd(vals[x], vals[y]); break; 2760 case Op::bit_or : vals[i] = b->CreateOr (vals[x], vals[y]); break; 2761 case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break; 2762 case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break; 2763 2764 case Op::select: 2765 vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]); 2766 break; 2767 2768 case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break; 2769 case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break; 2770 case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break; 2771 2772 case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immA); break; 2773 case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immA); break; 2774 case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immA); break; 2775 2776 case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break; 2777 case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break; 2778 2779 case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break; 2780 case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break; 2781 case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break; 2782 case Op::div_f32: 
vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break; 2783 2784 case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break; 2785 case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break; 2786 case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break; 2787 case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break; 2788 2789 case Op::fma_f32: 2790 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2791 {F(vals[x]), F(vals[y]), F(vals[z])})); 2792 break; 2793 2794 case Op::fms_f32: 2795 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2796 {F(vals[x]), F(vals[y]), 2797 b->CreateFNeg(F(vals[z]))})); 2798 break; 2799 2800 case Op::fnma_f32: 2801 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2802 {b->CreateFNeg(F(vals[x])), F(vals[y]), 2803 F(vals[z])})); 2804 break; 2805 2806 case Op::ceil: 2807 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::ceil, F(vals[x]))); 2808 break; 2809 case Op::floor: 2810 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x]))); 2811 break; 2812 2813 case Op::max_f32: 2814 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])), 2815 F(vals[y]), F(vals[x]))); 2816 break; 2817 case Op::min_f32: 2818 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])), 2819 F(vals[y]), F(vals[x]))); 2820 break; 2821 2822 case Op::sqrt_f32: 2823 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x]))); 2824 break; 2825 2826 case Op::to_f32: vals[i] = I(b->CreateSIToFP( vals[x] , F32)); break; 2827 case Op::trunc : vals[i] = b->CreateFPToSI(F(vals[x]), I32) ; break; 2828 case Op::round : { 2829 // Basic impl when we can't use cvtps2dq and co. 2830 auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x])); 2831 vals[i] = b->CreateFPToSI(round, I32); 2832 2833 #if 1 && defined(SK_CPU_X86) 2834 // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling. 2835 if (scalar) { 2836 // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3. ¯\_(ツ)_/¯ 2837 llvm::Value* v = llvm::UndefValue::get( 2838 llvm::VectorType::get(f32, 4, false)); 2839 v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0); 2840 vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v}); 2841 } else { 2842 SkASSERT(K == 4 || K == 8); 2843 auto intr = K == 4 ? llvm::Intrinsic::x86_sse2_cvtps2dq : 2844 /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256; 2845 vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])}); 2846 } 2847 #endif 2848 } break; 2849 2850 } 2851 return true; 2852 }; 2853 2854 { 2855 IRBuilder b(enter); 2856 b.CreateBr(hoistK); 2857 } 2858 2859 // hoistK: emit each hoistable vector instruction; goto testK; 2860 // LLVM can do this sort of thing itself, but we've got the information cheap, 2861 // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe. 2862 { 2863 IRBuilder b(hoistK); 2864 2865 // Hoisted instructions will need args (think, uniforms), so set that up now. 2866 // These phi nodes are degenerate... they'll always be the passed-in args from enter. 2867 // Later on when we start looping the phi nodes will start looking useful. 2868 llvm::Argument* arg = fn->arg_begin(); 2869 (void)arg++; // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction. 
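    // For orientation, a sketch of the function we're building:
    //     void fn(int n, void* arg0, ...) {
    //         hoisted K-wide instructions;                                  // hoistK
    //         while (n >= K) { K-wide body; n -= K; each arg += K*stride; } // testK/loopK
    //         hoisted scalar instructions;                                  // hoist1
    //         while (n >= 1) { scalar body; n -= 1; each arg += stride; }   // test1/loop1
    //     }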
        for (size_t i = 0; i < fImpl->strides.size(); i++) {
            args.push_back(b.CreatePHI(arg->getType(), 1));
            args.back()->addIncoming(arg++, enter);
        }

        for (size_t i = 0; i < instructions.size(); i++) {
            if (instructions[i].can_hoist && !emit(i, false, &b)) {
                return;
            }
        }

        b.CreateBr(testK);
    }

    // testK:  if (N >= K) goto loopK; else goto hoist1;
    {
        IRBuilder b(testK);

        // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK.
        // These also start as the initial function arguments; hoistK can't have changed them.
        llvm::Argument* arg = fn->arg_begin();

        n = b.CreatePHI(arg->getType(), 2);
        n->addIncoming(arg++, hoistK);

        for (size_t i = 0; i < fImpl->strides.size(); i++) {
            args[i] = b.CreatePHI(arg->getType(), 2);
            args[i]->addIncoming(arg++, hoistK);
        }

        b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1);
    }

    // loopK:  ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
    {
        IRBuilder b(loopK);
        for (size_t i = 0; i < instructions.size(); i++) {
            if (!instructions[i].can_hoist && !emit(i, false, &b)) {
                return;
            }
        }

        // n -= K
        llvm::Value* n_next = b.CreateSub(n, b.getInt32(K));
        n->addIncoming(n_next, loopK);

        // Each arg ptr += K
        for (size_t i = 0; i < fImpl->strides.size(); i++) {
            llvm::Value* arg_next
                = b.CreateConstInBoundsGEP1_32(llvm::Type::getInt8Ty(*ctx),
                                               args[i],
                                               K*fImpl->strides[i]);
            args[i]->addIncoming(arg_next, loopK);
        }
        b.CreateBr(testK);
    }

    // hoist1: emit each hoistable scalar instruction; goto test1;
    {
        IRBuilder b(hoist1);
        for (size_t i = 0; i < instructions.size(); i++) {
            if (instructions[i].can_hoist && !emit(i, true, &b)) {
                return;
            }
        }
        b.CreateBr(test1);
    }

    // test1:  if (N >= 1) goto loop1; else goto leave;
    {
        IRBuilder b(test1);

        // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1.
        llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2);
        n_new->addIncoming(n, hoist1);
        n = n_new;

        for (size_t i = 0; i < fImpl->strides.size(); i++) {
            llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2);
            arg_new->addIncoming(args[i], hoist1);
            args[i] = arg_new;
        }

        b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave);
    }

    // loop1:  ... insts on scalars; N -= 1, args += stride; goto test1;
    {
        IRBuilder b(loop1);
        for (size_t i = 0; i < instructions.size(); i++) {
            if (!instructions[i].can_hoist && !emit(i, true, &b)) {
                return;
            }
        }

        // n -= 1
        llvm::Value* n_next = b.CreateSub(n, b.getInt32(1));
        n->addIncoming(n_next, loop1);

        // Each arg ptr += 1
        for (size_t i = 0; i < fImpl->strides.size(); i++) {
            llvm::Value* arg_next
                = b.CreateConstInBoundsGEP1_32(llvm::Type::getInt8Ty(*ctx),
                                               args[i], fImpl->strides[i]);
            args[i]->addIncoming(arg_next, loop1);
        }
        b.CreateBr(test1);
    }

    // leave:  ret
    {
        IRBuilder b(leave);
        b.CreateRetVoid();
    }

    SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs()));

    if (true) {
        SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);
        std::error_code err;
        llvm::raw_fd_ostream os(path.c_str(), err);
        if (err) {
            return;
        }
        llvm::WriteBitcodeToFile(*mod, os);
    }

    static SkOnce once;
    once([]{
        SkAssertResult(false == llvm::InitializeNativeTarget());
        SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter());
    });

    if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod))
                                        .setEngineKind(llvm::EngineKind::JIT)
                                        .setMCPU(llvm::sys::getHostCPUName())
                                        .create()) {
        fImpl->llvm_ctx = std::move(ctx);
        fImpl->llvm_ee.reset(ee);

    #if defined(SKVM_LLVM_WAIT_FOR_COMPILATION)
        // Wait for llvm to compile.
        void* function = (void*)ee->getFunctionAddress(debug_name);
        fImpl->jit_entry.store(function);
    #else
        // We have to be careful here about what we close over and how, in case fImpl moves.
        // fImpl itself may change, but its pointee fields won't, so close over them by value.
        // Also, debug_name will almost certainly leave scope, so copy it.
        fImpl->llvm_compiling = std::async(std::launch::async, [dst  = &fImpl->jit_entry,
                                                                ee   =  fImpl->llvm_ee.get(),
                                                                name =  std::string(debug_name)]{
            // std::atomic<void*>* dst;
            // llvm::ExecutionEngine* ee;
            // std::string name;
            dst->store( (void*)ee->getFunctionAddress(name.c_str()) );
        });
    #endif
    }
}
#endif  // SKVM_LLVM

void Program::waitForLLVM() const {
#if defined(SKVM_LLVM) && !defined(SKVM_LLVM_WAIT_FOR_COMPILATION)
    if (fImpl->llvm_compiling.valid()) {
        fImpl->llvm_compiling.wait();
    }
#endif
}

bool Program::hasJIT() const {
    // Program::hasJIT() is really just a debugging / test aid,
    // so we don't mind adding a sync point here to wait for compilation.
    this->waitForLLVM();

    return fImpl->jit_entry.load() != nullptr;
}

void Program::dropJIT() {
#if defined(SKVM_LLVM)
    this->waitForLLVM();
    fImpl->llvm_ee .reset(nullptr);
    fImpl->llvm_ctx.reset(nullptr);
#elif defined(SKVM_JIT)
    if (fImpl->dylib) {
        close_dylib(fImpl->dylib);
    } else if (auto jit_entry = fImpl->jit_entry.load()) {
        unmap_jit_buffer(jit_entry, fImpl->jit_size);
    }
#else
    SkASSERT(!this->hasJIT());
#endif

    fImpl->jit_entry.store(nullptr);
    fImpl->jit_size = 0;
    fImpl->dylib    = nullptr;
}

Program::Program() : fImpl(std::make_unique<Impl>()) {}

Program::~Program() {
    // Moved-from Programs may have fImpl == nullptr.
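    // (The move operations below just steal fImpl via std::move, leaving the
    // moved-from Program's fImpl null.)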
    if (fImpl) {
        this->dropJIT();
    }
}

Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {}

Program& Program::operator=(Program&& other) {
    fImpl = std::move(other.fImpl);
    return *this;
}

Program::Program(const std::vector<OptimizedInstruction>& instructions,
                 const std::vector<int>& strides,
                 const char* debug_name, bool allow_jit) : Program() {
    fImpl->strides = strides;
    if (gSkVMAllowJIT && allow_jit) {
    #if 1 && defined(SKVM_LLVM)
        this->setupLLVM(instructions, debug_name);
    #elif 1 && defined(SKVM_JIT)
        this->setupJIT(instructions, debug_name);
    #endif
    }

    // Might as well do this after setupLLVM() to get a little more time to compile.
    this->setupInterpreter(instructions);
}

std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
int  Program::nargs() const { return (int)fImpl->strides.size(); }
int  Program::nregs() const { return fImpl->regs; }
int  Program::loop () const { return fImpl->loop; }
bool Program::empty() const { return fImpl->instructions.empty(); }

// Translate OptimizedInstructions to InterpreterInstructions.
void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
    // Register each instruction is assigned to.
    std::vector<Reg> reg(instructions.size());

    // This next bit is a bit more complicated than strictly necessary;
    // we could just assign every instruction to its own register.
    //
    // But recycling registers is fairly cheap, and good practice for the
    // JITs where minimizing register pressure really is important.
    //
    // We have effectively infinite registers, so we hoist any value we can.
    // (The JIT may choose a more complex policy to reduce register pressure.)

    fImpl->regs = 0;
    std::vector<Reg> avail;

    // Assign this value to a register, recycling them where we can.
    auto assign_register = [&](Val id) {
        const OptimizedInstruction& inst = instructions[id];

        // If this is a real input and its lifetime ends at this instruction,
        // we can recycle the register it's occupying.
        auto maybe_recycle_register = [&](Val input) {
            if (input != NA && instructions[input].death == id) {
                avail.push_back(reg[input]);
            }
        };

        // Take care to not recycle the same register twice.
        const Val x = inst.x, y = inst.y, z = inst.z, w = inst.w;
        if (true                      ) { maybe_recycle_register(x); }
        if (y != x                    ) { maybe_recycle_register(y); }
        if (z != x && z != y          ) { maybe_recycle_register(z); }
        if (w != x && w != y && w != z) { maybe_recycle_register(w); }

        // Instructions that die at themselves (stores) don't need a register.
        if (inst.death != id) {
            // Allocate a register if we have to, preferring to reuse anything available.
            if (avail.empty()) {
                reg[id] = fImpl->regs++;
            } else {
                reg[id] = avail.back();
                avail.pop_back();
            }
        }
    };

    // Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
    for (Val id = 0; id < (Val)instructions.size(); id++) {
        if ( instructions[id].can_hoist) { assign_register(id); }
    }
    for (Val id = 0; id < (Val)instructions.size(); id++) {
        if (!instructions[id].can_hoist) { assign_register(id); }
    }

    // Translate OptimizedInstructions to InterpreterInstructions by mapping values to
    // registers.  This will be two passes, first hoisted instructions, then inside the loop.

    // The loop begins at the fImpl->loop'th Instruction.
    fImpl->loop = 0;
    fImpl->instructions.reserve(instructions.size());

    // Add a mapping for the N/A sentinel Val to any arbitrary register
    // so lookups don't have to know which arguments are used by which Ops.
    auto lookup_register = [&](Val id) {
        return id == NA ? (Reg)0
                        : reg[id];
    };

    auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
        InterpreterInstruction pinst{
            inst.op,
            lookup_register(id),
            lookup_register(inst.x),
            lookup_register(inst.y),
            lookup_register(inst.z),
            lookup_register(inst.w),
            inst.immA,
            inst.immB,
            inst.immC,
        };
        fImpl->instructions.push_back(pinst);
    };

    for (Val id = 0; id < (Val)instructions.size(); id++) {
        const OptimizedInstruction& inst = instructions[id];
        if (inst.can_hoist) {
            push_instruction(id, inst);
            fImpl->loop++;
        }
    }
    for (Val id = 0; id < (Val)instructions.size(); id++) {
        const OptimizedInstruction& inst = instructions[id];
        if (!inst.can_hoist) {
            push_instruction(id, inst);
        }
    }
}

#if defined(SKVM_JIT)

namespace SkVMJitTypes {
#if defined(__x86_64__) || defined(_M_X64)
    using Reg = Assembler::Ymm;
#elif defined(__aarch64__)
    using Reg = Assembler::V;
#endif
}  // namespace SkVMJitTypes

bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
                  int* stack_hint,
                  uint32_t* registers_used,
                  Assembler* a) const {
    using A = Assembler;
    using SkVMJitTypes::Reg;

    SkTHashMap<int, A::Label> constants;    // Constants (mostly splats) share the same pool.
    A::Label                  iota;         // Varies per lane, for Op::index.
    A::Label                  load64_index; // Used to load low or high half of 64-bit lanes.

    // The `regs` array tracks everything we know about each register's state:
    //   - NA:  empty
    //   - RES: reserved by ABI
    //   - TMP: holding a temporary
    //   - id:  holding Val id
    constexpr Val RES = NA-1,
                  TMP = RES-1;

    // Map val -> stack slot.
    std::vector<int> stack_slot(instructions.size(), NA);
    int next_stack_slot = 0;

    const int nstack_slots = *stack_hint >= 0 ? *stack_hint
                                              : stack_slot.size();
#if defined(__x86_64__) || defined(_M_X64)
    if (!SkCpu::Supports(SkCpu::HSW)) {
        return false;
    }
    const int K = 8;
#if defined(_M_X64)  // Important to check this first; clang-cl defines both.
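    // A quick sketch of the Windows x64 calling convention as it applies here:
    // the first four integer arguments arrive in rcx,rdx,r8,r9, and the rest on
    // the stack above a 32-byte caller-allocated shadow space.  With the return
    // address at [rsp], that puts the fifth argument at [rsp+40], the sixth at
    // [rsp+48], and so on.  enter() below loads those stack arguments and
    // borrows the shadow space to stash callee-saved rdi/rsi.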
    const A::GP64 N   = A::rcx,
                  GP0 = A::rax,
                  GP1 = A::r11,
                  arg[] = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi };

    // xmm6-15 are callee-saved.
    std::array<Val,16> regs = {
         NA, NA, NA, NA,  NA, NA,RES,RES,
        RES,RES,RES,RES, RES,RES,RES,RES,
    };
    const uint32_t incoming_registers_used = *registers_used;

    auto enter = [&]{
        // rcx,rdx,r8,r9 are all already holding their correct values.
        // Load caller-saved r10 from rsp+40 if there's a fourth arg.
        if (fImpl->strides.size() >= 4) {
            a->mov(A::r10, A::Mem{A::rsp, 40});
        }
        // Load callee-saved rdi from rsp+48 if there's a fifth arg,
        // first saving it to ABI reserved shadow area rsp+8.
        if (fImpl->strides.size() >= 5) {
            a->mov(A::Mem{A::rsp, 8}, A::rdi);
            a->mov(A::rdi, A::Mem{A::rsp, 48});
        }
        // Load callee-saved rsi from rsp+56 if there's a sixth arg,
        // first saving it to ABI reserved shadow area rsp+16.
        if (fImpl->strides.size() >= 6) {
            a->mov(A::Mem{A::rsp, 16}, A::rsi);
            a->mov(A::rsi, A::Mem{A::rsp, 56});
        }

        // Allocate stack for our values and callee-saved xmm6-15.
        int stack_needed = nstack_slots*K*4;
        for (int r = 6; r < 16; r++) {
            if (incoming_registers_used & (1<<r)) {
                stack_needed += 16;
            }
        }
        if (stack_needed) { a->sub(A::rsp, stack_needed); }

        int next_saved_xmm = nstack_slots*K*4;
        for (int r = 6; r < 16; r++) {
            if (incoming_registers_used & (1<<r)) {
                a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r);
                next_saved_xmm += 16;
                regs[r] = NA;
            }
        }
    };
    auto exit  = [&]{
        // The second pass of jit() shouldn't use any register it didn't in the first pass.
        SkASSERT((*registers_used & incoming_registers_used) == *registers_used);

        // Restore callee-saved xmm6-15 and the stack pointer.
        int stack_used = nstack_slots*K*4;
        for (int r = 6; r < 16; r++) {
            if (incoming_registers_used & (1<<r)) {
                a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used});
                stack_used += 16;
            }
        }
        if (stack_used) { a->add(A::rsp, stack_used); }

        // Restore callee-saved rdi/rsi if we used them.
        if (fImpl->strides.size() >= 5) {
            a->mov(A::rdi, A::Mem{A::rsp, 8});
        }
        if (fImpl->strides.size() >= 6) {
            a->mov(A::rsi, A::Mem{A::rsp, 16});
        }

        a->vzeroupper();
        a->ret();
    };
#elif defined(__x86_64__)
    const A::GP64 N   = A::rdi,
                  GP0 = A::rax,
                  GP1 = A::r11,
                  arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 };

    // All 16 ymm registers are available to use.
    std::array<Val,16> regs = {
        NA,NA,NA,NA, NA,NA,NA,NA,
        NA,NA,NA,NA, NA,NA,NA,NA,
    };

    auto enter = [&]{
        // Load caller-saved r10 from rsp+8 if there's a sixth arg.
        if (fImpl->strides.size() >= 6) {
            a->mov(A::r10, A::Mem{A::rsp, 8});
        }
        if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); }
    };
    auto exit  = [&]{
        if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }
        a->vzeroupper();
        a->ret();
    };
#endif

    auto load_from_memory = [&](Reg r, Val v) {
        if (instructions[v].op == Op::splat) {
            if (instructions[v].immA == 0) {
                a->vpxor(r,r,r);
            } else {
                a->vmovups(r, constants.find(instructions[v].immA));
            }
        } else {
            SkASSERT(stack_slot[v] != NA);
            a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4});
        }
    };
    auto store_to_stack = [&](Reg r, Val v) {
        SkASSERT(next_stack_slot < nstack_slots);
        stack_slot[v] = next_stack_slot++;
        a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r);
    };
#elif defined(__aarch64__)
    const int K = 4;
    const A::X N     = A::x0,
               GP0   = A::x8,
               GP1   = A::x9,
               arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };

    // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit.
    std::array<Val,32> regs = {
         NA, NA, NA, NA,  NA, NA, NA, NA,
        RES,RES,RES,RES, RES,RES,RES,RES,
         NA, NA, NA, NA,  NA, NA, NA, NA,
         NA, NA, NA, NA,  NA, NA, NA, NA,
    };

    auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } };
    auto exit  = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); }
                      a->ret(A::x30); };

    auto load_from_memory = [&](Reg r, Val v) {
        if (instructions[v].op == Op::splat) {
            if (instructions[v].immA == 0) {
                a->eor16b(r,r,r);
            } else {
                a->ldrq(r, constants.find(instructions[v].immA));
            }
        } else {
            SkASSERT(stack_slot[v] != NA);
            a->ldrq(r, A::sp, stack_slot[v]);
        }
    };
    auto store_to_stack = [&](Reg r, Val v) {
        SkASSERT(next_stack_slot < nstack_slots);
        stack_slot[v] = next_stack_slot++;
        a->strq(r, A::sp, stack_slot[v]);
    };
#endif

    *registers_used = 0;  // We'll update this as we go.

    if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) {
        return false;
    }

    auto emit = [&](Val id, bool scalar) {
        const int active_lanes = scalar ? 1 : K;
        const OptimizedInstruction& inst = instructions[id];
        const Op op = inst.op;
        const Val x = inst.x,
                  y = inst.y,
                  z = inst.z,
                  w = inst.w;
        const int immA = inst.immA,
                  immB = inst.immB,
                  immC = inst.immC;

        // alloc_tmp() returns the first of N adjacent temporary registers,
        // each freed manually with free_tmp() or noted as our result with mark_tmp_as_dst().
        auto alloc_tmp = [&](int N=1) -> Reg {
            auto needs_spill = [&](Val v) -> bool {
                SkASSERT(v >= 0);  // {NA,TMP,RES} need to be handled before calling this.
                return stack_slot[v] == NA               // We haven't spilled it already?
                    && instructions[v].op != Op::splat;  // No need to spill constants.
            };

            // We want to find a block of N adjacent registers requiring the fewest spills.
            int best_block = -1,
                min_spills = 0x7fff'ffff;
            for (int block = 0; block+N <= (int)regs.size(); block++) {
                int spills = 0;
                for (int r = block; r < block+N; r++) {
                    Val v = regs[r];
                    // Registers holding NA (nothing) are ideal, nothing to spill.
                    if (v == NA) {
                        continue;
                    }
                    // We can't spill anything REServed or that we'll need this instruction.
                    if (v == RES ||
                        v == TMP || v == id || v == x || v == y || v == z || v == w) {
                        spills = 0x7fff'ffff;
                        block  = r;  // (optimization) continue outer loop at next register.
                        break;
                    }
                    // Usually here we've got a value v that we'd have to spill to the stack
                    // before reusing its register, but sometimes even now we get a freebie.
                    spills += needs_spill(v) ? 1 : 0;
                }

                // TODO: non-arbitrary tie-breaking?
                if (min_spills > spills) {
                    min_spills = spills;
                    best_block = block;
                }
                if (min_spills == 0) {
                    break;  // (optimization) stop early if we find an unbeatable block.
                }
            }

            // TODO: our search's success isn't obviously guaranteed... it depends on N
            // and the number and relative position in regs of any unspillable values.
            // I think we should be able to get away with N≤2 on x86-64 and N≤4 on arm64;
            // we'll need to revisit this logic should this assert fire.
            SkASSERT(min_spills <= N);

            // Spill what needs spilling, and mark the block all as TMP.
            for (int r = best_block; r < best_block+N; r++) {
                Val& v = regs[r];
                *registers_used |= (1<<r);

                SkASSERT(v == NA || v >= 0);
                if (v >= 0 && needs_spill(v)) {
                    store_to_stack((Reg)r, v);
                    SkASSERT(!needs_spill(v));
                    min_spills--;
                }

                v = TMP;
            }
            SkASSERT(min_spills == 0);
            return (Reg)best_block;
        };

        auto free_tmp = [&](Reg r) {
            SkASSERT(regs[r] == TMP);
            regs[r] = NA;
        };

        // Which register holds dst,x,y,z,w for this instruction?  NA if none does yet.
        int rd = NA,
            rx = NA,
            ry = NA,
            rz = NA,
            rw = NA;

        auto update_regs = [&](Reg r, Val v) {
            if (v == id) { rd = r; }
            if (v ==  x) { rx = r; }
            if (v ==  y) { ry = r; }
            if (v ==  z) { rz = r; }
            if (v ==  w) { rw = r; }
            return r;
        };

        auto find_existing_reg = [&](Val v) -> int {
            // Quick-check our working registers.
            if (v == id && rd != NA) { return rd; }
            if (v ==  x && rx != NA) { return rx; }
            if (v ==  y && ry != NA) { return ry; }
            if (v ==  z && rz != NA) { return rz; }
            if (v ==  w && rw != NA) { return rw; }

            // Search inter-instruction register map.
            for (auto [r,val] : SkMakeEnumerate(regs)) {
                if (val == v) {
                    return update_regs((Reg)r, v);
                }
            }
            return NA;
        };

        // Return a register for Val, holding that value if it already exists.
        // During this instruction all calls to r(v) will return the same register.
        auto r = [&](Val v) -> Reg {
            SkASSERT(v >= 0);

            if (int found = find_existing_reg(v); found != NA) {
                return (Reg)found;
            }

            Reg r = alloc_tmp();
            SkASSERT(regs[r] == TMP);

            SkASSERT(v <= id);
            if (v < id) {
                // If v < id, we're loading one of this instruction's inputs.
                // If v == id we're just allocating its destination register.
                load_from_memory(r, v);
            }
            regs[r] = v;
            return update_regs(r, v);
        };

        auto dies_here = [&](Val v) -> bool {
            SkASSERT(v >= 0);
            return instructions[v].death == id;
        };

        // Alias dst() to r(v) if dies_here(v).
        auto try_alias = [&](Val v) -> bool {
            SkASSERT(v == x || v == y || v == z || v == w);
            if (dies_here(v)) {
                rd = r(v);      // Vals v and id share a register for this instruction.
                regs[rd] = id;  // Next instruction, Val id will be in the register, not Val v.
                return true;
            }
            return false;
        };

        // Generally r(id),
        // but with a hint, try to alias dst() to r(v) if dies_here(v).
        auto dst = [&](Val hint1 = NA, Val hint2 = NA) -> Reg {
            if (hint1 != NA && try_alias(hint1)) { return r(id); }
            if (hint2 != NA && try_alias(hint2)) { return r(id); }
            return r(id);
        };

    #if defined(__aarch64__)  // Nothing sneaky, just unused on x86-64.
        auto mark_tmp_as_dst = [&](Reg tmp) {
            SkASSERT(regs[tmp] == TMP);
            rd = tmp;
            regs[rd] = id;
            SkASSERT(dst() == tmp);
        };
    #endif

    #if defined(__x86_64__) || defined(_M_X64)
        // On x86 we can work with many values directly from the stack or program constant pool.
        auto any = [&](Val v) -> A::Operand {
            SkASSERT(v >= 0);
            SkASSERT(v < id);

            if (int found = find_existing_reg(v); found != NA) {
                return (Reg)found;
            }
            if (instructions[v].op == Op::splat) {
                return constants.find(instructions[v].immA);
            }
            return A::Mem{A::rsp, stack_slot[v]*K*4};
        };

        // This is never really worth asking except when any() might be used;
        // if we need this value on ARM, might as well just call r(v) to get it into a register.
        auto in_reg = [&](Val v) -> bool {
            return find_existing_reg(v) != NA;
        };
    #endif

        switch (op) {
            // Make sure splat constants can be found by load_from_memory() or any().
            case Op::splat:
                (void)constants[immA];
                break;

    #if defined(__x86_64__) || defined(_M_X64)
            case Op::assert_true: {
                a->vptest(r(x), &constants[0xffffffff]);
                A::Label all_true;
                a->jc(&all_true);
                a->int3();
                a->label(&all_true);
            } break;

            case Op::trace_line:
            case Op::trace_var:
            case Op::trace_call:
                /* Only supported in the interpreter. */
                break;

            case Op::store8:
                if (scalar) {
                    a->vpextrb(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
                } else {
                    a->vpackusdw(dst(x), r(x), r(x));
                    a->vpermq   (dst(),  dst(), 0xd8);
                    a->vpackuswb(dst(),  dst(), dst());
                    a->vmovq    (A::Mem{arg[immA]}, (A::Xmm)dst());
                } break;

            case Op::store16:
                if (scalar) {
                    a->vpextrw(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
                } else {
                    a->vpackusdw(dst(x), r(x), r(x));
                    a->vpermq   (dst(),  dst(), 0xd8);
                    a->vmovups  (A::Mem{arg[immA]}, (A::Xmm)dst());
                } break;

            case Op::store32: if (scalar) { a->vmovd  (A::Mem{arg[immA]}, (A::Xmm)r(x)); }
                              else        { a->vmovups(A::Mem{arg[immA]},         r(x)); }
                              break;

            case Op::store64: if (scalar) {
                                  a->vmovd(A::Mem{arg[immA],0}, (A::Xmm)r(x));
                                  a->vmovd(A::Mem{arg[immA],4}, (A::Xmm)r(y));
                              } else {
                                  // r(x) = {a,b,c,d|e,f,g,h}
                                  // r(y) = {i,j,k,l|m,n,o,p}
                                  // We want to write a,i,b,j,c,k,d,l,e,m...
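                                  // (vpunpckldq/hdq interleave within each 128-bit half,
                                  // so one vperm2f128 per 32-byte store below stitches
                                  // the halves back into memory order.)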
                                  A::Ymm L = alloc_tmp(),
                                         H = alloc_tmp();
                                  a->vpunpckldq(L, r(x), any(y));  // L = {a,i,b,j|e,m,f,n}
                                  a->vpunpckhdq(H, r(x), any(y));  // H = {c,k,d,l|g,o,h,p}
                                  a->vperm2f128(dst(), L,H, 0x20); //   = {a,i,b,j|c,k,d,l}
                                  a->vmovups(A::Mem{arg[immA], 0}, dst());
                                  a->vperm2f128(dst(), L,H, 0x31); //   = {e,m,f,n|g,o,h,p}
                                  a->vmovups(A::Mem{arg[immA],32}, dst());
                                  free_tmp(L);
                                  free_tmp(H);
                              } break;

            case Op::store128: {
                // TODO: >32-bit stores
                a->vmovd  (A::Mem{arg[immA], 0*16 +  0}, (A::Xmm)r(x)   );
                a->vmovd  (A::Mem{arg[immA], 0*16 +  4}, (A::Xmm)r(y)   );
                a->vmovd  (A::Mem{arg[immA], 0*16 +  8}, (A::Xmm)r(z)   );
                a->vmovd  (A::Mem{arg[immA], 0*16 + 12}, (A::Xmm)r(w)   );
                if (scalar) { break; }

                a->vpextrd(A::Mem{arg[immA], 1*16 +  0}, (A::Xmm)r(x), 1);
                a->vpextrd(A::Mem{arg[immA], 1*16 +  4}, (A::Xmm)r(y), 1);
                a->vpextrd(A::Mem{arg[immA], 1*16 +  8}, (A::Xmm)r(z), 1);
                a->vpextrd(A::Mem{arg[immA], 1*16 + 12}, (A::Xmm)r(w), 1);

                a->vpextrd(A::Mem{arg[immA], 2*16 +  0}, (A::Xmm)r(x), 2);
                a->vpextrd(A::Mem{arg[immA], 2*16 +  4}, (A::Xmm)r(y), 2);
                a->vpextrd(A::Mem{arg[immA], 2*16 +  8}, (A::Xmm)r(z), 2);
                a->vpextrd(A::Mem{arg[immA], 2*16 + 12}, (A::Xmm)r(w), 2);

                a->vpextrd(A::Mem{arg[immA], 3*16 +  0}, (A::Xmm)r(x), 3);
                a->vpextrd(A::Mem{arg[immA], 3*16 +  4}, (A::Xmm)r(y), 3);
                a->vpextrd(A::Mem{arg[immA], 3*16 +  8}, (A::Xmm)r(z), 3);
                a->vpextrd(A::Mem{arg[immA], 3*16 + 12}, (A::Xmm)r(w), 3);

                // Now we need to store the upper 128 bits of x,y,z,w.
                // Storing in this order rather than interlacing minimizes temporaries.
                a->vextracti128(dst(), r(x), 1);
                a->vmovd  (A::Mem{arg[immA], 4*16 +  0}, (A::Xmm)dst()   );
                a->vpextrd(A::Mem{arg[immA], 5*16 +  0}, (A::Xmm)dst(), 1);
                a->vpextrd(A::Mem{arg[immA], 6*16 +  0}, (A::Xmm)dst(), 2);
                a->vpextrd(A::Mem{arg[immA], 7*16 +  0}, (A::Xmm)dst(), 3);

                a->vextracti128(dst(), r(y), 1);
                a->vmovd  (A::Mem{arg[immA], 4*16 +  4}, (A::Xmm)dst()   );
                a->vpextrd(A::Mem{arg[immA], 5*16 +  4}, (A::Xmm)dst(), 1);
                a->vpextrd(A::Mem{arg[immA], 6*16 +  4}, (A::Xmm)dst(), 2);
                a->vpextrd(A::Mem{arg[immA], 7*16 +  4}, (A::Xmm)dst(), 3);

                a->vextracti128(dst(), r(z), 1);
                a->vmovd  (A::Mem{arg[immA], 4*16 +  8}, (A::Xmm)dst()   );
                a->vpextrd(A::Mem{arg[immA], 5*16 +  8}, (A::Xmm)dst(), 1);
                a->vpextrd(A::Mem{arg[immA], 6*16 +  8}, (A::Xmm)dst(), 2);
                a->vpextrd(A::Mem{arg[immA], 7*16 +  8}, (A::Xmm)dst(), 3);

                a->vextracti128(dst(), r(w), 1);
                a->vmovd  (A::Mem{arg[immA], 4*16 + 12}, (A::Xmm)dst()   );
                a->vpextrd(A::Mem{arg[immA], 5*16 + 12}, (A::Xmm)dst(), 1);
                a->vpextrd(A::Mem{arg[immA], 6*16 + 12}, (A::Xmm)dst(), 2);
                a->vpextrd(A::Mem{arg[immA], 7*16 + 12}, (A::Xmm)dst(), 3);
            } break;

            case Op::load8: if (scalar) {
                                a->vpxor  (dst(), dst(), dst());
                                a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
                            } else {
                                a->vpmovzxbd(dst(), A::Mem{arg[immA]});
                            } break;

            case Op::load16: if (scalar) {
                                 a->vpxor  (dst(), dst(), dst());
                                 a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
                             } else {
                                 a->vpmovzxwd(dst(), A::Mem{arg[immA]});
                             } break;

            case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), A::Mem{arg[immA]}); }
                             else        { a->vmovups(        dst(), A::Mem{arg[immA]}); }
                             break;

            case Op::load64: if (scalar) {
                                 a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
                             } else {
                                 A::Ymm tmp = alloc_tmp();
                                 a->vmovups(tmp, &load64_index);
                                 a->vpermps(dst(), tmp, A::Mem{arg[immA],  0});
                                 a->vpermps( tmp,  tmp, A::Mem{arg[immA], 32});
                                 // Low 128 bits holds immB=0 lanes, high 128 bits holds immB=1.
                                 a->vperm2f128(dst(), dst(),tmp, immB ? 0x31 : 0x20);
                                 free_tmp(tmp);
                             } break;

            case Op::load128: if (scalar) {
                                  a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
                              } else {
                                  // Load 4 low values into xmm tmp,
                                  A::Ymm tmp = alloc_tmp();
                                  A::Xmm t = (A::Xmm)tmp;
                                  a->vmovd  (t,   A::Mem{arg[immA], 0*16 + 4*immB}   );
                                  a->vpinsrd(t,t, A::Mem{arg[immA], 1*16 + 4*immB}, 1);
                                  a->vpinsrd(t,t, A::Mem{arg[immA], 2*16 + 4*immB}, 2);
                                  a->vpinsrd(t,t, A::Mem{arg[immA], 3*16 + 4*immB}, 3);

                                  // Load 4 high values into xmm dst(),
                                  A::Xmm d = (A::Xmm)dst();
                                  a->vmovd  (d,   A::Mem{arg[immA], 4*16 + 4*immB}   );
                                  a->vpinsrd(d,d, A::Mem{arg[immA], 5*16 + 4*immB}, 1);
                                  a->vpinsrd(d,d, A::Mem{arg[immA], 6*16 + 4*immB}, 2);
                                  a->vpinsrd(d,d, A::Mem{arg[immA], 7*16 + 4*immB}, 3);

                                  // Merge the two, ymm dst() = {xmm tmp|xmm dst()}
                                  a->vperm2f128(dst(), tmp,dst(), 0x20);
                                  free_tmp(tmp);
                              } break;

            case Op::gather8: {
                // As usual, the gather base pointer is immB bytes off of uniform immA.
                a->mov(GP0, A::Mem{arg[immA], immB});

                A::Ymm tmp = alloc_tmp();
                a->vmovups(tmp, any(x));

                for (int i = 0; i < active_lanes; i++) {
                    if (i == 4) {
                        // vpextrd can only pluck indices out from an Xmm register,
                        // so we manually swap over to the top when we're halfway through.
                        a->vextracti128((A::Xmm)tmp, tmp, 1);
                    }
                    a->vpextrd(GP1, (A::Xmm)tmp, i%4);
                    a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i);
                }
                a->vpmovzxbd(dst(), dst());
                free_tmp(tmp);
            } break;

            case Op::gather16: {
                // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
                a->mov(GP0, A::Mem{arg[immA], immB});

                A::Ymm tmp = alloc_tmp();
                a->vmovups(tmp, any(x));

                for (int i = 0; i < active_lanes; i++) {
                    if (i == 4) {
                        a->vextracti128((A::Xmm)tmp, tmp, 1);
                    }
                    a->vpextrd(GP1, (A::Xmm)tmp, i%4);
                    a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i);
                }
                a->vpmovzxwd(dst(), dst());
                free_tmp(tmp);
            } break;

            case Op::gather32:
                if (scalar) {
                    // Our gather base pointer is immB bytes off of uniform immA.
                    a->mov(GP0, A::Mem{arg[immA], immB});

                    // Grab our index from lane 0 of the index argument.
                    a->vmovd(GP1, (A::Xmm)r(x));

                    // dst = *(base + 4*index)
                    a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR});
                } else {
                    a->mov(GP0, A::Mem{arg[immA], immB});

                    A::Ymm mask = alloc_tmp();
                    a->vpcmpeqd(mask, mask, mask);  // (All lanes enabled.)

                    a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask);
                    free_tmp(mask);
                }
                break;

            case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immA], immB});
                                break;

            case Op::array32: a->mov(GP0, A::Mem{arg[immA], immB});
                              a->vbroadcastss(dst(), A::Mem{GP0, immC});
                              break;

            case Op::index: a->vmovd((A::Xmm)dst(), N);
                            a->vbroadcastss(dst(), dst());
                            a->vpsubd(dst(), dst(), &iota);
                            break;

            // We can swap the arguments of symmetric instructions to make better use of any().
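            // E.g. for x+y: if x is already in a register we emit vaddps(dst, r(x), any(y))
            // and let y come from a register, the stack, or the constant pool; otherwise we
            // flip to vaddps(dst, r(y), any(x)), saving a load of x into a register first.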
            case Op::add_f32:
                if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); }
                else           { a->vaddps(dst(y), r(y), any(x)); }
                break;

            case Op::mul_f32:
                if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); }
                else           { a->vmulps(dst(y), r(y), any(x)); }
                break;

            case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break;
            case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break;
            case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break;  // Order matters,
            case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break;  // see test SkVM_min_max.

            case Op::fma_f32:
                if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else
                if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else
                if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else
                                  { a->vmovups    (dst(), any(x));
                                    a->vfmadd132ps(dst(), r(z), any(y)); }
                break;

            case Op::fms_f32:
                if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else
                if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else
                if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else
                                  { a->vmovups    (dst(), any(x));
                                    a->vfmsub132ps(dst(), r(z), any(y)); }
                break;

            case Op::fnma_f32:
                if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else
                if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else
                if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else
                                  { a->vmovups     (dst(), any(x));
                                    a->vfnmadd132ps(dst(), r(z), any(y)); }
                break;

            // In situations like this we want to try aliasing dst(x) when x is
            // already in a register, but not if we'd have to load it from the stack
            // just to alias it.  That's done better directly into the new register.
            case Op::sqrt_f32:
                if (in_reg(x)) { a->vsqrtps(dst(x),   r(x)); }
                else           { a->vsqrtps(dst(), any(x)); }
                break;

            case Op::add_i32:
                if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); }
                else           { a->vpaddd(dst(y), r(y), any(x)); }
                break;

            case Op::mul_i32:
                if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); }
                else           { a->vpmulld(dst(y), r(y), any(x)); }
                break;

            case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break;

            case Op::bit_and:
                if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); }
                else           { a->vpand(dst(y), r(y), any(x)); }
                break;
            case Op::bit_or:
                if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); }
                else           { a->vpor(dst(y), r(y), any(x)); }
                break;
            case Op::bit_xor:
                if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); }
                else           { a->vpxor(dst(y), r(y), any(x)); }
                break;

            case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break;  // Notice, y then x.
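            // (vpandn computes ~first & second, so passing y first yields x & ~y.)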

            case Op::select:
                if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); }
                else              { a->vpblendvb(dst(x), r(z), any(y), r(x)); }
                break;

            case Op::shl_i32: a->vpslld(dst(x), r(x), immA); break;
            case Op::shr_i32: a->vpsrld(dst(x), r(x), immA); break;
            case Op::sra_i32: a->vpsrad(dst(x), r(x), immA); break;

            case Op::eq_i32:
                if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); }
                else           { a->vpcmpeqd(dst(y), r(y), any(x)); }
                break;

            case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break;

            case Op::eq_f32:
                if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); }
                else           { a->vcmpeqps(dst(y), r(y), any(x)); }
                break;
            case Op::neq_f32:
                if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); }
                else           { a->vcmpneqps(dst(y), r(y), any(x)); }
                break;

            case Op:: gt_f32: a->vcmpltps(dst(y), r(y), any(x)); break;
            case Op::gte_f32: a->vcmpleps(dst(y), r(y), any(x)); break;

            case Op::ceil:
                if (in_reg(x)) { a->vroundps(dst(x),   r(x), Assembler::CEIL); }
                else           { a->vroundps(dst(), any(x), Assembler::CEIL); }
                break;

            case Op::floor:
                if (in_reg(x)) { a->vroundps(dst(x),   r(x), Assembler::FLOOR); }
                else           { a->vroundps(dst(), any(x), Assembler::FLOOR); }
                break;

            case Op::to_f32:
                if (in_reg(x)) { a->vcvtdq2ps(dst(x),   r(x)); }
                else           { a->vcvtdq2ps(dst(), any(x)); }
                break;

            case Op::trunc:
                if (in_reg(x)) { a->vcvttps2dq(dst(x),   r(x)); }
                else           { a->vcvttps2dq(dst(), any(x)); }
                break;

            case Op::round:
                if (in_reg(x)) { a->vcvtps2dq(dst(x),   r(x)); }
                else           { a->vcvtps2dq(dst(), any(x)); }
                break;

            case Op::to_fp16:
                a->vcvtps2ph(dst(x), r(x), A::CURRENT);  // f32 ymm -> f16 xmm
                a->vpmovzxwd(dst(), dst());              // f16 xmm -> f16 ymm
                break;

            case Op::from_fp16:
                a->vpackusdw(dst(x), r(x), r(x));  // f16 ymm -> f16 xmm
                a->vpermq   (dst(), dst(), 0xd8);  // swap middle two 64-bit lanes
                a->vcvtph2ps(dst(), dst());        // f16 xmm -> f32 ymm
                break;

    #elif defined(__aarch64__)
            case Op::assert_true: {
                a->uminv4s(dst(), r(x));  // uminv acts like an all() across the vector.
                a->movs(GP0, dst(), 0);
                A::Label all_true;
                a->cbnz(GP0, &all_true);
                a->brk(0);
                a->label(&all_true);
            } break;

            case Op::trace_line:
            case Op::trace_var:
            case Op::trace_call:
                /* Only supported in the interpreter. */
                break;

            case Op::index: {
                A::V tmp = alloc_tmp();
                a->ldrq (tmp, &iota);
                a->dup4s(dst(), N);
                a->sub4s(dst(), dst(), tmp);
                free_tmp(tmp);
            } break;

            case Op::store8: a->xtns2h(dst(x), r(x));
                             a->xtnh2b(dst(), dst());
                             if (scalar) { a->strb(dst(), arg[immA]); }
                             else        { a->strs(dst(), arg[immA]); }
                             break;

            case Op::store16: a->xtns2h(dst(x), r(x));
                              if (scalar) { a->strh(dst(), arg[immA]); }
                              else        { a->strd(dst(), arg[immA]); }
                              break;

            case Op::store32: if (scalar) { a->strs(r(x), arg[immA]); }
                              else        { a->strq(r(x), arg[immA]); }
                              break;

            case Op::store64: if (scalar) {
                                  a->strs(r(x), arg[immA], 0);
                                  a->strs(r(y), arg[immA], 1);
                              } else if (r(y) == r(x)+1) {
                                  a->st24s(r(x), arg[immA]);
                              } else {
                                  Reg tmp0 = alloc_tmp(2),
                                      tmp1 = (Reg)(tmp0+1);
                                  a->orr16b(tmp0, r(x), r(x));
                                  a->orr16b(tmp1, r(y), r(y));
                                  a-> st24s(tmp0, arg[immA]);
                                  free_tmp(tmp0);
                                  free_tmp(tmp1);
                              } break;

            case Op::store128:
                if (scalar) {
                    a->strs(r(x), arg[immA], 0);
                    a->strs(r(y), arg[immA], 1);
                    a->strs(r(z), arg[immA], 2);
                    a->strs(r(w), arg[immA], 3);
                } else if (r(y) == r(x)+1 &&
                           r(z) == r(x)+2 &&
                           r(w) == r(x)+3) {
                    a->st44s(r(x), arg[immA]);
                } else {
                    Reg tmp0 = alloc_tmp(4),
                        tmp1 = (Reg)(tmp0+1),
                        tmp2 = (Reg)(tmp0+2),
                        tmp3 = (Reg)(tmp0+3);
                    a->orr16b(tmp0, r(x), r(x));
                    a->orr16b(tmp1, r(y), r(y));
                    a->orr16b(tmp2, r(z), r(z));
                    a->orr16b(tmp3, r(w), r(w));
                    a-> st44s(tmp0, arg[immA]);
                    free_tmp(tmp0);
                    free_tmp(tmp1);
                    free_tmp(tmp2);
                    free_tmp(tmp3);
                } break;

            case Op::load8: if (scalar) { a->ldrb(dst(), arg[immA]); }
                            else        { a->ldrs(dst(), arg[immA]); }
                            a->uxtlb2h(dst(), dst());
                            a->uxtlh2s(dst(), dst());
                            break;

            case Op::load16: if (scalar) { a->ldrh(dst(), arg[immA]); }
                             else        { a->ldrd(dst(), arg[immA]); }
                             a->uxtlh2s(dst(), dst());
                             break;

            case Op::load32: if (scalar) { a->ldrs(dst(), arg[immA]); }
                             else        { a->ldrq(dst(), arg[immA]); }
                             break;

            case Op::load64: if (scalar) {
                                 a->ldrs(dst(), arg[immA], immB);
                             } else {
                                 Reg tmp0 = alloc_tmp(2),
                                     tmp1 = (Reg)(tmp0+1);
                                 a->ld24s(tmp0, arg[immA]);
                                 // TODO: return both
                                 switch (immB) {
                                     case 0: mark_tmp_as_dst(tmp0); free_tmp(tmp1); break;
                                     case 1: mark_tmp_as_dst(tmp1); free_tmp(tmp0); break;
                                 }
                             } break;

            case Op::load128: if (scalar) {
                                  a->ldrs(dst(), arg[immA], immB);
                              } else {
                                  Reg tmp0 = alloc_tmp(4),
                                      tmp1 = (Reg)(tmp0+1),
                                      tmp2 = (Reg)(tmp0+2),
                                      tmp3 = (Reg)(tmp0+3);
                                  a->ld44s(tmp0, arg[immA]);
                                  // TODO: return all four
                                  switch (immB) {
                                      case 0: mark_tmp_as_dst(tmp0); break;
                                      case 1: mark_tmp_as_dst(tmp1); break;
                                      case 2: mark_tmp_as_dst(tmp2); break;
                                      case 3: mark_tmp_as_dst(tmp3); break;
                                  }
                                  if (immB != 0) { free_tmp(tmp0); }
                                  if (immB != 1) { free_tmp(tmp1); }
                                  if (immB != 2) { free_tmp(tmp2); }
                                  if (immB != 3) { free_tmp(tmp3); }
                              } break;

            case Op::uniform32: a->add(GP0, arg[immA], immB);
                                a->ld1r4s(dst(), GP0);
                                break;

            case Op::array32: a->add(GP0, arg[immA], immB);
                              a->ldrd(GP0, GP0);
                              a->add(GP0, GP0, immC);
                              a->ld1r4s(dst(), GP0);
                              break;

            case Op::gather8: {
                // As usual, the gather base pointer is immB bytes off of uniform immA.
                a->add (GP0, arg[immA], immB);  // GP0 = &(gather base pointer)
                a->ldrd(GP0, GP0);              // GP0 = gather base pointer

                for (int i = 0; i < active_lanes; i++) {
                    a->movs(GP1, r(x), i);    // Extract index lane i into GP1.
                    a->add (GP1, GP0, GP1);   // Add the gather base pointer.
                    a->ldrb(GP1, GP1);        // Load that byte.
                    a->inss(dst(x), GP1, i);  // Insert it into dst() lane i.
                }
            } break;

            // See gather8 for general idea; comments here only where gather16 differs.
            case Op::gather16: {
                a->add (GP0, arg[immA], immB);
                a->ldrd(GP0, GP0);
                for (int i = 0; i < active_lanes; i++) {
                    a->movs(GP1, r(x), i);
                    a->add (GP1, GP0, GP1, A::LSL, 1);  // Scale index 2x into a byte offset.
                    a->ldrh(GP1, GP1);                  // 2-byte load.
                    a->inss(dst(x), GP1, i);
                }
            } break;

            // See gather8 for general idea; comments here only where gather32 differs.
            case Op::gather32: {
                a->add (GP0, arg[immA], immB);
                a->ldrd(GP0, GP0);
                for (int i = 0; i < active_lanes; i++) {
                    a->movs(GP1, r(x), i);
                    a->add (GP1, GP0, GP1, A::LSL, 2);  // Scale index 4x into a byte offset.
                    a->ldrs(GP1, GP1);                  // 4-byte load.
                    a->inss(dst(x), GP1, i);
                }
            } break;

            case Op::add_f32: a->fadd4s(dst(x,y), r(x), r(y)); break;
            case Op::sub_f32: a->fsub4s(dst(x,y), r(x), r(y)); break;
            case Op::mul_f32: a->fmul4s(dst(x,y), r(x), r(y)); break;
            case Op::div_f32: a->fdiv4s(dst(x,y), r(x), r(y)); break;

            case Op::sqrt_f32: a->fsqrt4s(dst(x), r(x)); break;

            case Op::fma_f32:  // fmla.4s is z += x*y
                if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); }
                else              { a->orr16b(dst(), r(z), r(z));
                                    a->fmla4s(dst(), r(x), r(y)); }
                break;

            case Op::fnma_f32:  // fmls.4s is z -= x*y
                if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
                else              { a->orr16b(dst(), r(z), r(z));
                                    a->fmls4s(dst(), r(x), r(y)); }
                break;

            case Op::fms_f32:  // calculate z - xy, then negate to xy - z
                if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
                else              { a->orr16b(dst(), r(z), r(z));
                                    a->fmls4s(dst(), r(x), r(y)); }
                a->fneg4s(dst(), dst());
                break;

            case Op:: gt_f32: a->fcmgt4s(dst(x,y), r(x), r(y)); break;
            case Op::gte_f32: a->fcmge4s(dst(x,y), r(x), r(y)); break;
            case Op:: eq_f32: a->fcmeq4s(dst(x,y), r(x), r(y)); break;
            case Op::neq_f32: a->fcmeq4s(dst(x,y), r(x), r(y));
                              a->not16b (dst(), dst());         break;

            case Op::add_i32: a->add4s(dst(x,y), r(x), r(y)); break;
            case Op::sub_i32: a->sub4s(dst(x,y), r(x), r(y)); break;
            case Op::mul_i32: a->mul4s(dst(x,y), r(x), r(y)); break;

            case Op::bit_and  : a->and16b(dst(x,y), r(x), r(y)); break;
            case Op::bit_or   : a->orr16b(dst(x,y), r(x), r(y)); break;
            case Op::bit_xor  : a->eor16b(dst(x,y), r(x), r(y)); break;
            case Op::bit_clear: a->bic16b(dst(x,y), r(x), r(y)); break;

            case Op::select:  // bsl16b is x = x ? y : z
                if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); }
                else              { a->orr16b(dst(), r(x), r(x));
                                    a->bsl16b(dst(), r(y), r(z)); }
                break;

            // fmin4s and fmax4s don't work the way we want with NaN,
            // so we write them the long way:
            case Op::min_f32:  // min(x,y) = y<x ? y : x
                a->fcmgt4s(dst(), r(x), r(y));
                a->bsl16b (dst(), r(y), r(x));
                break;

            case Op::max_f32:  // max(x,y) = x<y ? y : x
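                // (fcmgt4s builds the x<y mask; bsl16b then picks y or x per lane.)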
                a->fcmgt4s(dst(), r(y), r(x));
                a->bsl16b (dst(), r(y), r(x));
                break;

            case Op::shl_i32: a-> shl4s(dst(x), r(x), immA); break;
            case Op::shr_i32: a->ushr4s(dst(x), r(x), immA); break;
            case Op::sra_i32: a->sshr4s(dst(x), r(x), immA); break;

            case Op::eq_i32: a->cmeq4s(dst(x,y), r(x), r(y)); break;
            case Op::gt_i32: a->cmgt4s(dst(x,y), r(x), r(y)); break;

            case Op::to_f32: a->scvtf4s (dst(x), r(x)); break;
            case Op::trunc:  a->fcvtzs4s(dst(x), r(x)); break;
            case Op::round:  a->fcvtns4s(dst(x), r(x)); break;
            case Op::ceil:   a->frintp4s(dst(x), r(x)); break;
            case Op::floor:  a->frintm4s(dst(x), r(x)); break;

            case Op::to_fp16:
                a->fcvtn  (dst(x), r(x));  // 4x f32 -> 4x f16 in bottom four lanes
                a->uxtlh2s(dst(), dst());  // expand to 4x f16 in even 16-bit lanes
                break;

            case Op::from_fp16:
                a->xtns2h(dst(x), r(x));  // pack even 16-bit lanes into bottom four lanes
                a->fcvtl (dst(), dst());  // 4x f16 -> 4x f32
                break;
    #endif
        }

        // Proactively free the registers holding any value that dies here.
        if (rd != NA &&                   dies_here(regs[rd])) { regs[rd] = NA; }
        if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; }
        if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; }
        if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; }
        if (rw != NA && regs[rw] != NA && dies_here(regs[rw])) { regs[rw] = NA; }
        return true;
    };

#if defined(__x86_64__) || defined(_M_X64)
    auto jump_if_less = [&](A::Label* l) { a->jl (l); };
    auto jump         = [&](A::Label* l) { a->jmp(l); };

    auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
    auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
#elif defined(__aarch64__)
    auto jump_if_less = [&](A::Label* l) { a->blt(l); };
    auto jump         = [&](A::Label* l) { a->b  (l); };

    auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
    auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
#endif

    A::Label body,
             tail,
             done;

    enter();
    for (Val id = 0; id < (Val)instructions.size(); id++) {
        if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
            return false;
        }
    }

    // This point marks a kind of canonical fixed point for register contents: if loop
    // code is generated as if these registers are holding these values, the next time
    // the loop comes around we'd better find those same registers holding those same values.
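    // restore_incoming_regs() emits loads so every register again holds at runtime what
    // it held here, and rolls back the assembly-time stack-slot bookkeeping so the scalar
    // tail is emitted against this same clean slate.  (*stack_hint keeps the high-water
    // mark so the second jit() pass can size the stack.)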
    auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot,
                                  saved_next_stack_slot=next_stack_slot]{
        for (int r = 0; r < (int)regs.size(); r++) {
            if (regs[r] != incoming[r]) {
                regs[r] = incoming[r];
                if (regs[r] >= 0) {
                    load_from_memory((Reg)r, regs[r]);
                }
            }
        }
        *stack_hint = std::max(*stack_hint, next_stack_slot);
        stack_slot = saved_stack_slot;
        next_stack_slot = saved_next_stack_slot;
    };

    a->label(&body);
    {
        a->cmp(N, K);
        jump_if_less(&tail);
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
                return false;
            }
        }
        restore_incoming_regs();
        for (int i = 0; i < (int)fImpl->strides.size(); i++) {
            if (fImpl->strides[i]) {
                add(arg[i], K*fImpl->strides[i]);
            }
        }
        sub(N, K);
        jump(&body);
    }

    a->label(&tail);
    {
        a->cmp(N, 1);
        jump_if_less(&done);
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) {
                return false;
            }
        }
        restore_incoming_regs();
        for (int i = 0; i < (int)fImpl->strides.size(); i++) {
            if (fImpl->strides[i]) {
                add(arg[i], 1*fImpl->strides[i]);
            }
        }
        sub(N, 1);
        jump(&tail);
    }

    a->label(&done);
    {
        exit();
    }

    // Except for explicit aligned load and store instructions, AVX allows
    // memory operands to be unaligned.  So even though we're creating 16-byte
    // patterns on ARM or 32-byte patterns on x86, we only need to align to
    // 4 bytes, the element size and alignment requirement.

    constants.foreach([&](int imm, A::Label* label) {
        a->align(4);
        a->label(label);
        for (int i = 0; i < K; i++) {
            a->word(imm);
        }
    });

    if (!iota.references.empty()) {
        a->align(4);
        a->label(&iota);  // 0,1,2,3,4,...
        for (int i = 0; i < K; i++) {
            a->word(i);
        }
    }

    if (!load64_index.references.empty()) {
        a->align(4);
        a->label(&load64_index);  // {0,2,4,6|1,3,5,7}
        a->word(0); a->word(2); a->word(4); a->word(6);
        a->word(1); a->word(3); a->word(5); a->word(7);
    }

    return true;
}

void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
                       const char* debug_name) {
    // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble)
    // and stack_hint/registers_used to feed forward into the next jit() call.
    Assembler a{nullptr};
    int stack_hint = -1;
    uint32_t registers_used = 0xffff'ffff;  // Start conservatively with all.
    if (!this->jit(instructions, &stack_hint, &registers_used, &a)) {
        return;
    }

    fImpl->jit_size = a.size();
    void* jit_entry = alloc_jit_buffer(&fImpl->jit_size);
    fImpl->jit_entry.store(jit_entry);

    // Assemble the program for real with stack_hint/registers_used as feedback from first call.
    a = Assembler{jit_entry};
    SkAssertResult(this->jit(instructions, &stack_hint, &registers_used, &a));
    SkASSERT(a.size() <= fImpl->jit_size);

    // Remap as executable, and flush caches on platforms that need that.
    remap_as_executable(jit_entry, fImpl->jit_size);

    notify_vtune(debug_name, jit_entry, fImpl->jit_size);

#if !defined(SK_BUILD_FOR_WIN)
    // For profiling and debugging, it's helpful to have this code loaded
    // dynamically rather than just jumping into fImpl->jit_entry.
    if (gSkVMJITViaDylib) {
        // Dump the raw program binary.
        SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
        int fd = mkstemp(path.writable_str());
        ::write(fd, jit_entry, a.size());
        close(fd);

        this->dropJIT();  // (unmap and null out fImpl->jit_entry.)

        // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
        SkString cmd = SkStringPrintf(
                "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
                " | clang -x assembler -shared - -o %s",
                path.c_str(), path.c_str());
        system(cmd.c_str());

        // Load that dynamic library and look up skvm_jit().
        fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
        void* sym = nullptr;
        for (const char* name : {"skvm_jit", "_skvm_jit"}) {
            if (!sym) { sym = dlsym(fImpl->dylib, name); }
        }
        fImpl->jit_entry.store(sym);
    }
#endif
}

void Program::disassemble(SkWStream* o) const {
#if !defined(SK_BUILD_FOR_WIN)
    SkDebugfStream debug;
    if (!o) { o = &debug; }

    const void* jit_entry = fImpl->jit_entry.load();
    size_t jit_size = fImpl->jit_size;

    if (!jit_entry) {
        o->writeText("Program not JIT'd. Did you pass --jit?\n");
        return;
    }

    char path[] = "/tmp/skvm-jit.XXXXXX";
    int fd = mkstemp(path);
    ::write(fd, jit_entry, jit_size);
    close(fd);

    // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
    SkString cmd = SkStringPrintf(
            "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
            " | clang -x assembler -shared - -o %s",
            path, path);
    system(cmd.c_str());

    // Now objdump to disassemble our function:
    // TODO: We could trim this down to just our code using '--disassemble=<symbol name>',
    // but the symbol name varies with OS, and that option may be missing from objdump on some
    // machines?  There also appears to be quite a bit of junk after the end of the JIT'd code.
    // Trimming that would let us pass '--visualize-jumps' and get the loop annotated.
    // With the junk, we tend to end up with a bunch of stray jumps that pollute the ASCII art.
    cmd = SkStringPrintf("objdump -D %s", path);
#if defined(SK_BUILD_FOR_UNIX)
    cmd.append(" --section=.text");
#endif
    FILE* fp = popen(cmd.c_str(), "r");
    if (!fp) {
        o->writeText("objdump failed\n");
        return;
    }

    char line[1024];
    while (fgets(line, sizeof(line), fp)) {
        o->writeText(line);
    }

    pclose(fp);
#endif
}

#endif  // SKVM_JIT

}  // namespace skvm